Move all utf8 decoding under fx_string.h

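Put the encoder in cfx_utf8encoder.{h,cpp} to parallel the decoder.
Add tests, and fix one decoder corner case: an invalid 0xfe/0xff byte
now resets the pending-byte count (m_PendingBytes) instead of leaving
it untouched.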

Change-Id: Ib97540afdc708bcc6280a79c76734ec68ea72690
Reviewed-on: https://pdfium-review.googlesource.com/39770
Commit-Queue: Lei Zhang <thestig@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
diff --git a/BUILD.gn b/BUILD.gn
index afa83e3..f08ce43 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -858,6 +858,8 @@
     "core/fxcrt/cfx_seekablestreamproxy.h",
     "core/fxcrt/cfx_utf8decoder.cpp",
     "core/fxcrt/cfx_utf8decoder.h",
+    "core/fxcrt/cfx_utf8encoder.cpp",
+    "core/fxcrt/cfx_utf8encoder.h",
     "core/fxcrt/cfx_widetextbuf.cpp",
     "core/fxcrt/cfx_widetextbuf.h",
     "core/fxcrt/fileaccess_iface.h",
diff --git a/core/fxcrt/bytestring.cpp b/core/fxcrt/bytestring.cpp
index b6c1ce7..3ff0e35 100644
--- a/core/fxcrt/bytestring.cpp
+++ b/core/fxcrt/bytestring.cpp
@@ -669,11 +669,7 @@
 }
 
 WideString ByteString::UTF8Decode() const {
-  CFX_UTF8Decoder decoder;
-  for (size_t i = 0; i < GetLength(); i++) {
-    decoder.Input(static_cast<uint8_t>(m_pData->m_String[i]));
-  }
-  return WideString(decoder.GetResult());
+  return WideString::FromUTF8(AsStringView());
 }
 
 int ByteString::Compare(const ByteStringView& str) const {
diff --git a/core/fxcrt/cfx_utf8decoder.cpp b/core/fxcrt/cfx_utf8decoder.cpp
index bee5e16..8adab5c 100644
--- a/core/fxcrt/cfx_utf8decoder.cpp
+++ b/core/fxcrt/cfx_utf8decoder.cpp
@@ -43,5 +43,7 @@
   } else if (byte < 0xfe) {
     m_PendingBytes = 5;
     m_PendingChar = (byte & 0x01) << 30;
+  } else {
+    m_PendingBytes = 0;
   }
 }
diff --git a/core/fxcrt/cfx_utf8encoder.cpp b/core/fxcrt/cfx_utf8encoder.cpp
new file mode 100644
index 0000000..9ed149f
--- /dev/null
+++ b/core/fxcrt/cfx_utf8encoder.cpp
@@ -0,0 +1,43 @@
+// Copyright 2018 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#include "core/fxcrt/cfx_utf8encoder.h"
+
+CFX_UTF8Encoder::CFX_UTF8Encoder() = default;
+
+CFX_UTF8Encoder::~CFX_UTF8Encoder() = default;
+
+void CFX_UTF8Encoder::Input(wchar_t unicodeAsWchar) {
+  uint32_t unicode = static_cast<uint32_t>(unicodeAsWchar);
+  if (unicode < 0x80) {
+    m_Buffer.push_back(unicode);
+  } else {
+    if (unicode >= 0x80000000)
+      return;
+
+    int nbytes = 0;
+    if (unicode < 0x800)
+      nbytes = 2;
+    else if (unicode < 0x10000)
+      nbytes = 3;
+    else if (unicode < 0x200000)
+      nbytes = 4;
+    else if (unicode < 0x4000000)
+      nbytes = 5;
+    else
+      nbytes = 6;
+
+    static const uint8_t prefix[] = {0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
+    int order = 1 << ((nbytes - 1) * 6);
+    int code = unicodeAsWchar;
+    m_Buffer.push_back(prefix[nbytes - 2] | (code / order));
+    for (int i = 0; i < nbytes - 1; i++) {
+      code = code % order;
+      order >>= 6;
+      m_Buffer.push_back(0x80 | (code / order));
+    }
+  }
+}
diff --git a/core/fxcrt/cfx_utf8encoder.h b/core/fxcrt/cfx_utf8encoder.h
new file mode 100644
index 0000000..d44a829
--- /dev/null
+++ b/core/fxcrt/cfx_utf8encoder.h
@@ -0,0 +1,31 @@
+// Copyright 2018 PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
+
+#ifndef CORE_FXCRT_CFX_UTF8ENCODER_H_
+#define CORE_FXCRT_CFX_UTF8ENCODER_H_
+
+#include <vector>
+
+#include "core/fxcrt/fx_string.h"
+
+class CFX_UTF8Encoder {
+ public:
+  CFX_UTF8Encoder();
+  ~CFX_UTF8Encoder();
+
+  void Input(wchar_t unicodeAsWchar);
+
+  // The data returned by GetResult() is invalidated when this is modified by
+  // appending any data.
+  ByteStringView GetResult() const {
+    return ByteStringView(m_Buffer.data(), m_Buffer.size());
+  }
+
+ private:
+  std::vector<uint8_t> m_Buffer;
+};
+
+#endif  // CORE_FXCRT_CFX_UTF8ENCODER_H_
diff --git a/core/fxcrt/fx_string.cpp b/core/fxcrt/fx_string.cpp
index daf9955..c9993f9 100644
--- a/core/fxcrt/fx_string.cpp
+++ b/core/fxcrt/fx_string.cpp
@@ -7,61 +7,12 @@
 #include <limits>
 #include <vector>
 
+#include "core/fxcrt/cfx_utf8decoder.h"
+#include "core/fxcrt/cfx_utf8encoder.h"
 #include "core/fxcrt/fx_extension.h"
 #include "core/fxcrt/fx_string.h"
 #include "third_party/base/compiler_specific.h"
 
-namespace {
-
-class CFX_UTF8Encoder {
- public:
-  CFX_UTF8Encoder() {}
-  ~CFX_UTF8Encoder() {}
-
-  void Input(wchar_t unicodeAsWchar) {
-    uint32_t unicode = static_cast<uint32_t>(unicodeAsWchar);
-    if (unicode < 0x80) {
-      m_Buffer.push_back(unicode);
-    } else {
-      if (unicode >= 0x80000000)
-        return;
-
-      int nbytes = 0;
-      if (unicode < 0x800)
-        nbytes = 2;
-      else if (unicode < 0x10000)
-        nbytes = 3;
-      else if (unicode < 0x200000)
-        nbytes = 4;
-      else if (unicode < 0x4000000)
-        nbytes = 5;
-      else
-        nbytes = 6;
-
-      static const uint8_t prefix[] = {0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
-      int order = 1 << ((nbytes - 1) * 6);
-      int code = unicodeAsWchar;
-      m_Buffer.push_back(prefix[nbytes - 2] | (code / order));
-      for (int i = 0; i < nbytes - 1; i++) {
-        code = code % order;
-        order >>= 6;
-        m_Buffer.push_back(0x80 | (code / order));
-      }
-    }
-  }
-
-  // The data returned by GetResult() is invalidated when this is modified by
-  // appending any data.
-  ByteStringView GetResult() const {
-    return ByteStringView(m_Buffer.data(), m_Buffer.size());
-  }
-
- private:
-  std::vector<uint8_t> m_Buffer;
-};
-
-}  // namespace
-
 ByteString FX_UTF8Encode(const WideStringView& wsStr) {
   size_t len = wsStr.GetLength();
   const wchar_t* pStr = wsStr.unterminated_c_str();
@@ -72,6 +23,17 @@
   return ByteString(encoder.GetResult());
 }
 
+WideString FX_UTF8Decode(const ByteStringView& bsStr) {
+  if (bsStr.IsEmpty())
+    return WideString();
+
+  CFX_UTF8Decoder decoder;
+  for (size_t i = 0; i < bsStr.GetLength(); i++)
+    decoder.Input(bsStr[i]);
+
+  return WideString(decoder.GetResult());
+}
+
 namespace {
 
 const float fraction_scales[] = {0.1f,          0.01f,         0.001f,
diff --git a/core/fxcrt/fx_string.h b/core/fxcrt/fx_string.h
index 4c24181..2cf8237 100644
--- a/core/fxcrt/fx_string.h
+++ b/core/fxcrt/fx_string.h
@@ -15,6 +15,8 @@
    ((uint32_t)c4))
 
 ByteString FX_UTF8Encode(const WideStringView& wsStr);
+WideString FX_UTF8Decode(const ByteStringView& bsStr);
+
 float FX_atof(const ByteStringView& str);
 float FX_atof(const WideStringView& wsStr);
 bool FX_atonum(const ByteStringView& str, void* pData);
diff --git a/core/fxcrt/fx_string_unittest.cpp b/core/fxcrt/fx_string_unittest.cpp
index b311165..60e7f07 100644
--- a/core/fxcrt/fx_string_unittest.cpp
+++ b/core/fxcrt/fx_string_unittest.cpp
@@ -51,3 +51,52 @@
   EXPECT_FALSE(FX_atonum("3.24", &f));
   EXPECT_FLOAT_EQ(3.24f, f);
 }
+
+TEST(fxstring, FX_UTF8Encode) {
+  EXPECT_EQ("", FX_UTF8Encode(WideStringView()));
+  EXPECT_EQ(
+      "x"
+      "\xc2\x80"
+      "\xc3\xbf"
+      "\xef\xbc\xac"
+      "y",
+      FX_UTF8Encode(L"x"
+                    L"\u0080"
+                    L"\u00ff"
+                    L"\uff2c"
+                    L"y"));
+}
+
+TEST(fxstring, FX_UTF8Decode) {
+  EXPECT_EQ(L"", FX_UTF8Decode(ByteStringView()));
+  EXPECT_EQ(
+      L"x"
+      L"\u0080"
+      L"\u00ff"
+      L"\uff2c"
+      L"y",
+      FX_UTF8Decode("x"
+                    "\xc2\x80"
+                    "\xc3\xbf"
+                    "\xef\xbc\xac"
+                    "y"));
+  EXPECT_EQ(L"a(A) b() c() d() e().",
+            FX_UTF8Decode("a(\xc2\x41) "      // Invalid continuation.
+                          "b(\xc2\xc2) "      // Invalid continuation.
+                          "c(\xc2\xff\x80) "  // Invalid continuation.
+                          "d(\x80\x80) "      // Invalid leading.
+                          "e(\xff\x80\x80)"   // Invalid leading.
+                          "."));
+}
+
+TEST(fxstring, FX_UTF8EncodeDecodeConsistency) {
+  WideString wstr;
+  wstr.Reserve(0x10000);
+  for (int w = 0; w < 0x10000; ++w)
+    wstr += static_cast<wchar_t>(w);
+
+  ByteString bstr = FX_UTF8Encode(wstr.AsStringView());
+  WideString wstr2 = FX_UTF8Decode(bstr.AsStringView());
+  EXPECT_EQ(0x10000u, wstr2.GetLength());
+  EXPECT_EQ(wstr, wstr2);
+}
diff --git a/core/fxcrt/widestring.cpp b/core/fxcrt/widestring.cpp
index 97073f1..e3c08d7 100644
--- a/core/fxcrt/widestring.cpp
+++ b/core/fxcrt/widestring.cpp
@@ -12,7 +12,6 @@
 #include <cctype>
 #include <cwctype>
 
-#include "core/fxcrt/cfx_utf8decoder.h"
 #include "core/fxcrt/fx_codepage.h"
 #include "core/fxcrt/fx_extension.h"
 #include "core/fxcrt/fx_safe_types.h"
@@ -885,14 +884,7 @@
 
 // static
 WideString WideString::FromUTF8(const ByteStringView& str) {
-  if (str.IsEmpty())
-    return WideString();
-
-  CFX_UTF8Decoder decoder;
-  for (size_t i = 0; i < str.GetLength(); i++)
-    decoder.Input(str[i]);
-
-  return WideString(decoder.GetResult());
+  return FX_UTF8Decode(str);
 }
 
 // static