Merge UTF-8 handling into fx_string.cpp

Merges UTF-8 encoding and decoding into core/fxcrt/fx_string.cpp, as
FX_UTF8Encode() and FX_UTF8Decode() are the only clients of the
CFX_UTF8Encoder and CFX_UTF8Decoder classes.

In other words, we have no use case for supporting UTF-8 handling
outside of the specific APIs for converting between ByteString and
WideString representations.

Bug: pdfium:2029
Change-Id: I92d038ffc96ea494fc650f224555c1d262f02718
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/107791
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: K. Moon <kmoon@chromium.org>
diff --git a/core/fxcrt/BUILD.gn b/core/fxcrt/BUILD.gn
index f8b6de5..d6e5e46 100644
--- a/core/fxcrt/BUILD.gn
+++ b/core/fxcrt/BUILD.gn
@@ -45,10 +45,6 @@
     "cfx_seekablestreamproxy.h",
     "cfx_timer.cpp",
     "cfx_timer.h",
-    "cfx_utf8decoder.cpp",
-    "cfx_utf8decoder.h",
-    "cfx_utf8encoder.cpp",
-    "cfx_utf8encoder.h",
     "code_point_view.h",
     "data_vector.h",
     "fileaccess_iface.h",
diff --git a/core/fxcrt/bytestring.cpp b/core/fxcrt/bytestring.cpp
index d3e1cc0..2091bf8 100644
--- a/core/fxcrt/bytestring.cpp
+++ b/core/fxcrt/bytestring.cpp
@@ -14,7 +14,6 @@
 #include <string>
 #include <utility>
 
-#include "core/fxcrt/cfx_utf8decoder.h"
 #include "core/fxcrt/fx_codepage.h"
 #include "core/fxcrt/fx_extension.h"
 #include "core/fxcrt/fx_safe_types.h"
diff --git a/core/fxcrt/cfx_utf8decoder.cpp b/core/fxcrt/cfx_utf8decoder.cpp
deleted file mode 100644
index b66605a..0000000
--- a/core/fxcrt/cfx_utf8decoder.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright 2017 The PDFium Authors
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-
-#include "core/fxcrt/cfx_utf8decoder.h"
-
-#include <stdint.h>
-
-#include <utility>
-
-#include "build/build_config.h"
-#include "core/fxcrt/string_view_template.h"
-#include "core/fxcrt/utf16.h"
-#include "core/fxcrt/widestring.h"
-
-CFX_UTF8Decoder::CFX_UTF8Decoder(ByteStringView input) {
-  int remaining = 0;
-  char32_t code_point = 0;
-
-  for (char byte : input) {
-    uint8_t code_unit = static_cast<uint8_t>(byte);
-    if (code_unit < 0x80) {
-      remaining = 0;
-      AppendCodePoint(code_unit);
-    } else if (code_unit < 0xc0) {
-      if (remaining > 0) {
-        --remaining;
-        code_point = (code_point << 6) | (code_unit & 0x3f);
-        if (remaining == 0) {
-          AppendCodePoint(code_point);
-        }
-      }
-    } else if (code_unit < 0xe0) {
-      remaining = 1;
-      code_point = code_unit & 0x1f;
-    } else if (code_unit < 0xf0) {
-      remaining = 2;
-      code_point = code_unit & 0x0f;
-    } else if (code_unit < 0xf8) {
-      remaining = 3;
-      code_point = code_unit & 0x07;
-    } else {
-      remaining = 0;
-    }
-  }
-}
-
-CFX_UTF8Decoder::~CFX_UTF8Decoder() = default;
-
-WideString CFX_UTF8Decoder::TakeResult() {
-  return std::move(buffer_);
-}
-
-void CFX_UTF8Decoder::AppendCodePoint(char32_t code_point) {
-  if (code_point > pdfium::kMaximumSupplementaryCodePoint) {
-    // Invalid code point above U+10FFFF.
-    return;
-  }
-
-#if defined(WCHAR_T_IS_UTF16)
-  if (code_point < pdfium::kMinimumSupplementaryCodePoint) {
-    buffer_ += static_cast<wchar_t>(code_point);
-  } else {
-    // Encode as UTF-16 surrogate pair.
-    pdfium::SurrogatePair surrogate_pair(code_point);
-    buffer_ += surrogate_pair.high();
-    buffer_ += surrogate_pair.low();
-  }
-#else
-  buffer_ += static_cast<wchar_t>(code_point);
-#endif  // defined(WCHAR_T_IS_UTF16)
-}
diff --git a/core/fxcrt/cfx_utf8decoder.h b/core/fxcrt/cfx_utf8decoder.h
deleted file mode 100644
index 9d9b0c1..0000000
--- a/core/fxcrt/cfx_utf8decoder.h
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2017 The PDFium Authors
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-
-#ifndef CORE_FXCRT_CFX_UTF8DECODER_H_
-#define CORE_FXCRT_CFX_UTF8DECODER_H_
-
-#include "core/fxcrt/string_view_template.h"
-#include "core/fxcrt/widestring.h"
-
-class CFX_UTF8Decoder {
- public:
-  explicit CFX_UTF8Decoder(ByteStringView input);
-  ~CFX_UTF8Decoder();
-
-  WideString TakeResult();
-
- private:
-  void AppendCodePoint(char32_t code_point);
-
-  WideString buffer_;
-};
-
-#endif  // CORE_FXCRT_CFX_UTF8DECODER_H_
diff --git a/core/fxcrt/cfx_utf8encoder.cpp b/core/fxcrt/cfx_utf8encoder.cpp
deleted file mode 100644
index aa69686..0000000
--- a/core/fxcrt/cfx_utf8encoder.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright 2018 The PDFium Authors
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-
-#include "core/fxcrt/cfx_utf8encoder.h"
-
-#include <stdint.h>
-
-#include <utility>
-
-#include "core/fxcrt/bytestring.h"
-#include "core/fxcrt/code_point_view.h"
-#include "core/fxcrt/string_view_template.h"
-#include "core/fxcrt/utf16.h"
-
-CFX_UTF8Encoder::CFX_UTF8Encoder(WideStringView input) {
-  for (char32_t code_point : pdfium::CodePointView(input)) {
-    AppendCodePoint(code_point);
-  }
-}
-
-CFX_UTF8Encoder::~CFX_UTF8Encoder() = default;
-
-ByteString CFX_UTF8Encoder::TakeResult() {
-  return std::move(buffer_);
-}
-
-void CFX_UTF8Encoder::AppendCodePoint(char32_t code_point) {
-  if (code_point > pdfium::kMaximumSupplementaryCodePoint) {
-    // Invalid code point above U+10FFFF.
-    return;
-  }
-
-  if (code_point < 0x80) {
-    // 7-bit code points are unchanged in UTF-8.
-    buffer_ += code_point;
-    return;
-  }
-
-  int byte_size;
-  if (code_point < 0x800) {
-    byte_size = 2;
-  } else if (code_point < 0x10000) {
-    byte_size = 3;
-  } else {
-    byte_size = 4;
-  }
-
-  static constexpr uint8_t kPrefix[] = {0xc0, 0xe0, 0xf0};
-  int order = 1 << ((byte_size - 1) * 6);
-  buffer_ += kPrefix[byte_size - 2] | (code_point / order);
-  for (int i = 0; i < byte_size - 1; i++) {
-    code_point = code_point % order;
-    order >>= 6;
-    buffer_ += 0x80 | (code_point / order);
-  }
-}
diff --git a/core/fxcrt/cfx_utf8encoder.h b/core/fxcrt/cfx_utf8encoder.h
deleted file mode 100644
index bc3ddfb..0000000
--- a/core/fxcrt/cfx_utf8encoder.h
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright 2018 The PDFium Authors
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-
-#ifndef CORE_FXCRT_CFX_UTF8ENCODER_H_
-#define CORE_FXCRT_CFX_UTF8ENCODER_H_
-
-#include "core/fxcrt/bytestring.h"
-#include "core/fxcrt/string_view_template.h"
-
-class CFX_UTF8Encoder {
- public:
-  // `input` may be UTF-16 or UTF-32, depending on the platform.
-  // TODO(crbug.com/pdfium/2031): Always use UTF-16.
-  explicit CFX_UTF8Encoder(WideStringView input);
-  ~CFX_UTF8Encoder();
-
-  ByteString TakeResult();
-
- private:
-  void AppendCodePoint(char32_t code_point);
-
-  ByteString buffer_;
-};
-
-#endif  // CORE_FXCRT_CFX_UTF8ENCODER_H_
diff --git a/core/fxcrt/fx_string.cpp b/core/fxcrt/fx_string.cpp
index c1c25bd..b783ec7 100644
--- a/core/fxcrt/fx_string.cpp
+++ b/core/fxcrt/fx_string.cpp
@@ -6,24 +6,122 @@
 
 #include "core/fxcrt/fx_string.h"
 
+#include <stdint.h>
+
 #include <iterator>
 
+#include "build/build_config.h"
 #include "core/fxcrt/bytestring.h"
-#include "core/fxcrt/cfx_utf8decoder.h"
-#include "core/fxcrt/cfx_utf8encoder.h"
+#include "core/fxcrt/code_point_view.h"
 #include "core/fxcrt/fx_extension.h"
 #include "core/fxcrt/span_util.h"
 #include "core/fxcrt/string_view_template.h"
+#include "core/fxcrt/utf16.h"
 #include "core/fxcrt/widestring.h"
 #include "third_party/base/compiler_specific.h"
 #include "third_party/base/span.h"
 
+namespace {
+
+// Appends a code point to `buffer` as UTF-8. Values above U+10FFFF are dropped.
+void AppendCodePointToByteString(char32_t code_point, ByteString& buffer) {
+  if (code_point > pdfium::kMaximumSupplementaryCodePoint) {
+    // Invalid code point above U+10FFFF.
+    return;
+  }
+
+  if (code_point < 0x80) {
+    // 7-bit code points are unchanged in UTF-8.
+    buffer += code_point;
+    return;
+  }
+
+  int byte_size;
+  if (code_point < 0x800) {
+    byte_size = 2;
+  } else if (code_point < 0x10000) {
+    byte_size = 3;
+  } else {
+    byte_size = 4;
+  }
+
+  static constexpr uint8_t kPrefix[] = {0xc0, 0xe0, 0xf0};
+  int order = 1 << ((byte_size - 1) * 6);
+  buffer += kPrefix[byte_size - 2] | (code_point / order);
+  for (int i = 0; i < byte_size - 1; i++) {
+    code_point = code_point % order;
+    order >>= 6;
+    buffer += 0x80 | (code_point / order);
+  }
+}
+
+// Appends a Unicode code point to a `WideString` as UTF-16 or UTF-32 (per the
+// platform's `wchar_t`); code points above U+10FFFF are silently dropped.
+//
+// TODO(crbug.com/pdfium/2031): Always use UTF-16.
+void AppendCodePointToWideString(char32_t code_point, WideString& buffer) {
+  if (code_point > pdfium::kMaximumSupplementaryCodePoint) {
+    // Invalid code point above U+10FFFF.
+    return;
+  }
+
+#if defined(WCHAR_T_IS_UTF16)
+  if (code_point < pdfium::kMinimumSupplementaryCodePoint) {
+    buffer += static_cast<wchar_t>(code_point);
+  } else {
+    // Encode as UTF-16 surrogate pair.
+    pdfium::SurrogatePair surrogate_pair(code_point);
+    buffer += surrogate_pair.high();
+    buffer += surrogate_pair.low();
+  }
+#else
+  buffer += static_cast<wchar_t>(code_point);
+#endif  // defined(WCHAR_T_IS_UTF16)
+}
+
+}  // namespace
+
 ByteString FX_UTF8Encode(WideStringView wsStr) {
-  return CFX_UTF8Encoder(wsStr).TakeResult();
+  ByteString buffer;
+  for (char32_t code_point : pdfium::CodePointView(wsStr)) {
+    AppendCodePointToByteString(code_point, buffer);
+  }
+  return buffer;
 }
 
 WideString FX_UTF8Decode(ByteStringView bsStr) {
-  return CFX_UTF8Decoder(bsStr).TakeResult();
+  WideString buffer;
+
+  int remaining = 0;
+  char32_t code_point = 0;
+  for (char byte : bsStr) {
+    uint8_t code_unit = static_cast<uint8_t>(byte);
+    if (code_unit < 0x80) {
+      remaining = 0;
+      AppendCodePointToWideString(code_unit, buffer);
+    } else if (code_unit < 0xc0) {
+      if (remaining > 0) {
+        --remaining;
+        code_point = (code_point << 6) | (code_unit & 0x3f);
+        if (remaining == 0) {
+          AppendCodePointToWideString(code_point, buffer);
+        }
+      }
+    } else if (code_unit < 0xe0) {
+      remaining = 1;
+      code_point = code_unit & 0x1f;
+    } else if (code_unit < 0xf0) {
+      remaining = 2;
+      code_point = code_unit & 0x0f;
+    } else if (code_unit < 0xf8) {
+      remaining = 3;
+      code_point = code_unit & 0x07;
+    } else {
+      remaining = 0;
+    }
+  }
+
+  return buffer;
 }
 
 namespace {