Merge UTF-8 handling into fx_string.cpp
Merges UTF-8 encoding and decoding into core/fxcrt/fx_string.cpp, as
FX_UTF8Encode() and FX_UTF8Decode() are the only clients of the
CFX_UTF8Encoder and CFX_UTF8Decoder classes.
In other words, we have no use case for supporting UTF-8 handling
outside of the specific APIs for converting between ByteString and
WideString representations.
Bug: pdfium:2029
Change-Id: I92d038ffc96ea494fc650f224555c1d262f02718
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/107791
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: K. Moon <kmoon@chromium.org>
diff --git a/core/fxcrt/BUILD.gn b/core/fxcrt/BUILD.gn
index f8b6de5..d6e5e46 100644
--- a/core/fxcrt/BUILD.gn
+++ b/core/fxcrt/BUILD.gn
@@ -45,10 +45,6 @@
"cfx_seekablestreamproxy.h",
"cfx_timer.cpp",
"cfx_timer.h",
- "cfx_utf8decoder.cpp",
- "cfx_utf8decoder.h",
- "cfx_utf8encoder.cpp",
- "cfx_utf8encoder.h",
"code_point_view.h",
"data_vector.h",
"fileaccess_iface.h",
diff --git a/core/fxcrt/bytestring.cpp b/core/fxcrt/bytestring.cpp
index d3e1cc0..2091bf8 100644
--- a/core/fxcrt/bytestring.cpp
+++ b/core/fxcrt/bytestring.cpp
@@ -14,7 +14,6 @@
#include <string>
#include <utility>
-#include "core/fxcrt/cfx_utf8decoder.h"
#include "core/fxcrt/fx_codepage.h"
#include "core/fxcrt/fx_extension.h"
#include "core/fxcrt/fx_safe_types.h"
diff --git a/core/fxcrt/cfx_utf8decoder.cpp b/core/fxcrt/cfx_utf8decoder.cpp
deleted file mode 100644
index b66605a..0000000
--- a/core/fxcrt/cfx_utf8decoder.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright 2017 The PDFium Authors
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-
-#include "core/fxcrt/cfx_utf8decoder.h"
-
-#include <stdint.h>
-
-#include <utility>
-
-#include "build/build_config.h"
-#include "core/fxcrt/string_view_template.h"
-#include "core/fxcrt/utf16.h"
-#include "core/fxcrt/widestring.h"
-
-CFX_UTF8Decoder::CFX_UTF8Decoder(ByteStringView input) {
- int remaining = 0;
- char32_t code_point = 0;
-
- for (char byte : input) {
- uint8_t code_unit = static_cast<uint8_t>(byte);
- if (code_unit < 0x80) {
- remaining = 0;
- AppendCodePoint(code_unit);
- } else if (code_unit < 0xc0) {
- if (remaining > 0) {
- --remaining;
- code_point = (code_point << 6) | (code_unit & 0x3f);
- if (remaining == 0) {
- AppendCodePoint(code_point);
- }
- }
- } else if (code_unit < 0xe0) {
- remaining = 1;
- code_point = code_unit & 0x1f;
- } else if (code_unit < 0xf0) {
- remaining = 2;
- code_point = code_unit & 0x0f;
- } else if (code_unit < 0xf8) {
- remaining = 3;
- code_point = code_unit & 0x07;
- } else {
- remaining = 0;
- }
- }
-}
-
-CFX_UTF8Decoder::~CFX_UTF8Decoder() = default;
-
-WideString CFX_UTF8Decoder::TakeResult() {
- return std::move(buffer_);
-}
-
-void CFX_UTF8Decoder::AppendCodePoint(char32_t code_point) {
- if (code_point > pdfium::kMaximumSupplementaryCodePoint) {
- // Invalid code point above U+10FFFF.
- return;
- }
-
-#if defined(WCHAR_T_IS_UTF16)
- if (code_point < pdfium::kMinimumSupplementaryCodePoint) {
- buffer_ += static_cast<wchar_t>(code_point);
- } else {
- // Encode as UTF-16 surrogate pair.
- pdfium::SurrogatePair surrogate_pair(code_point);
- buffer_ += surrogate_pair.high();
- buffer_ += surrogate_pair.low();
- }
-#else
- buffer_ += static_cast<wchar_t>(code_point);
-#endif // defined(WCHAR_T_IS_UTF16)
-}
diff --git a/core/fxcrt/cfx_utf8decoder.h b/core/fxcrt/cfx_utf8decoder.h
deleted file mode 100644
index 9d9b0c1..0000000
--- a/core/fxcrt/cfx_utf8decoder.h
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2017 The PDFium Authors
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-
-#ifndef CORE_FXCRT_CFX_UTF8DECODER_H_
-#define CORE_FXCRT_CFX_UTF8DECODER_H_
-
-#include "core/fxcrt/string_view_template.h"
-#include "core/fxcrt/widestring.h"
-
-class CFX_UTF8Decoder {
- public:
- explicit CFX_UTF8Decoder(ByteStringView input);
- ~CFX_UTF8Decoder();
-
- WideString TakeResult();
-
- private:
- void AppendCodePoint(char32_t code_point);
-
- WideString buffer_;
-};
-
-#endif // CORE_FXCRT_CFX_UTF8DECODER_H_
diff --git a/core/fxcrt/cfx_utf8encoder.cpp b/core/fxcrt/cfx_utf8encoder.cpp
deleted file mode 100644
index aa69686..0000000
--- a/core/fxcrt/cfx_utf8encoder.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright 2018 The PDFium Authors
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-
-#include "core/fxcrt/cfx_utf8encoder.h"
-
-#include <stdint.h>
-
-#include <utility>
-
-#include "core/fxcrt/bytestring.h"
-#include "core/fxcrt/code_point_view.h"
-#include "core/fxcrt/string_view_template.h"
-#include "core/fxcrt/utf16.h"
-
-CFX_UTF8Encoder::CFX_UTF8Encoder(WideStringView input) {
- for (char32_t code_point : pdfium::CodePointView(input)) {
- AppendCodePoint(code_point);
- }
-}
-
-CFX_UTF8Encoder::~CFX_UTF8Encoder() = default;
-
-ByteString CFX_UTF8Encoder::TakeResult() {
- return std::move(buffer_);
-}
-
-void CFX_UTF8Encoder::AppendCodePoint(char32_t code_point) {
- if (code_point > pdfium::kMaximumSupplementaryCodePoint) {
- // Invalid code point above U+10FFFF.
- return;
- }
-
- if (code_point < 0x80) {
- // 7-bit code points are unchanged in UTF-8.
- buffer_ += code_point;
- return;
- }
-
- int byte_size;
- if (code_point < 0x800) {
- byte_size = 2;
- } else if (code_point < 0x10000) {
- byte_size = 3;
- } else {
- byte_size = 4;
- }
-
- static constexpr uint8_t kPrefix[] = {0xc0, 0xe0, 0xf0};
- int order = 1 << ((byte_size - 1) * 6);
- buffer_ += kPrefix[byte_size - 2] | (code_point / order);
- for (int i = 0; i < byte_size - 1; i++) {
- code_point = code_point % order;
- order >>= 6;
- buffer_ += 0x80 | (code_point / order);
- }
-}
diff --git a/core/fxcrt/cfx_utf8encoder.h b/core/fxcrt/cfx_utf8encoder.h
deleted file mode 100644
index bc3ddfb..0000000
--- a/core/fxcrt/cfx_utf8encoder.h
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright 2018 The PDFium Authors
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
-
-#ifndef CORE_FXCRT_CFX_UTF8ENCODER_H_
-#define CORE_FXCRT_CFX_UTF8ENCODER_H_
-
-#include "core/fxcrt/bytestring.h"
-#include "core/fxcrt/string_view_template.h"
-
-class CFX_UTF8Encoder {
- public:
- // `input` may be UTF-16 or UTF-32, depending on the platform.
- // TODO(crbug.com/pdfium/2031): Always use UTF-16.
- explicit CFX_UTF8Encoder(WideStringView input);
- ~CFX_UTF8Encoder();
-
- ByteString TakeResult();
-
- private:
- void AppendCodePoint(char32_t code_point);
-
- ByteString buffer_;
-};
-
-#endif // CORE_FXCRT_CFX_UTF8ENCODER_H_
diff --git a/core/fxcrt/fx_string.cpp b/core/fxcrt/fx_string.cpp
index c1c25bd..b783ec7 100644
--- a/core/fxcrt/fx_string.cpp
+++ b/core/fxcrt/fx_string.cpp
@@ -6,24 +6,122 @@
#include "core/fxcrt/fx_string.h"
+#include <stdint.h>
+
#include <iterator>
+#include "build/build_config.h"
#include "core/fxcrt/bytestring.h"
-#include "core/fxcrt/cfx_utf8decoder.h"
-#include "core/fxcrt/cfx_utf8encoder.h"
+#include "core/fxcrt/code_point_view.h"
#include "core/fxcrt/fx_extension.h"
#include "core/fxcrt/span_util.h"
#include "core/fxcrt/string_view_template.h"
+#include "core/fxcrt/utf16.h"
#include "core/fxcrt/widestring.h"
#include "third_party/base/compiler_specific.h"
#include "third_party/base/span.h"
+namespace {
+
+// Appends a Unicode code point to a `ByteString` using UTF-8 (1 to 4 bytes).
+void AppendCodePointToByteString(char32_t code_point, ByteString& buffer) {
+ if (code_point > pdfium::kMaximumSupplementaryCodePoint) {
+ // Invalid code point above U+10FFFF.
+ return;  // Nothing is appended.
+ }
+
+ if (code_point < 0x80) {
+ // 7-bit code points are unchanged in UTF-8.
+ buffer += code_point;
+ return;
+ }
+
+ int byte_size;  // Length of the encoded sequence in bytes.
+ if (code_point < 0x800) {
+ byte_size = 2;  // U+0080..U+07FF
+ } else if (code_point < 0x10000) {
+ byte_size = 3;  // U+0800..U+FFFF (surrogate values are not excluded).
+ } else {
+ byte_size = 4;  // U+10000..U+10FFFF
+ }
+
+ static constexpr uint8_t kPrefix[] = {0xc0, 0xe0, 0xf0};  // Lead-byte tags.
+ int order = 1 << ((byte_size - 1) * 6);  // Weight of the lead byte's bits.
+ buffer += kPrefix[byte_size - 2] | (code_point / order);  // Lead byte.
+ for (int i = 0; i < byte_size - 1; i++) {  // One continuation byte per pass.
+ code_point = code_point % order;  // Drop the bits already emitted.
+ order >>= 6;  // Step down to the next 6-bit group.
+ buffer += 0x80 | (code_point / order);  // Continuation byte: 0b10xxxxxx.
+ }
+}
+
+// Appends a Unicode code point to a `WideString` using either UTF-16 or UTF-32,
+// depending on the platform's definition of `wchar_t`.
+//
+// TODO(crbug.com/pdfium/2031): Always use UTF-16.
+void AppendCodePointToWideString(char32_t code_point, WideString& buffer) {
+ if (code_point > pdfium::kMaximumSupplementaryCodePoint) {
+ // Invalid code point above U+10FFFF.
+ return;  // Nothing is appended.
+ }
+
+#if defined(WCHAR_T_IS_UTF16)
+ if (code_point < pdfium::kMinimumSupplementaryCodePoint) {
+ buffer += static_cast<wchar_t>(code_point);  // Single UTF-16 code unit.
+ } else {
+ // Encode as UTF-16 surrogate pair.
+ pdfium::SurrogatePair surrogate_pair(code_point);
+ buffer += surrogate_pair.high();  // High surrogate first,
+ buffer += surrogate_pair.low();  // then the low surrogate.
+ }
+#else
+ buffer += static_cast<wchar_t>(code_point);  // One unit per code point.
+#endif // defined(WCHAR_T_IS_UTF16)
+}
+
+} // namespace
+
ByteString FX_UTF8Encode(WideStringView wsStr) {
- return CFX_UTF8Encoder(wsStr).TakeResult();
+ ByteString buffer;  // Accumulates the UTF-8 output.
+ for (char32_t code_point : pdfium::CodePointView(wsStr)) {  // Per code point.
+ AppendCodePointToByteString(code_point, buffer);  // Invalid ones are skipped.
+ }
+ return buffer;
}
WideString FX_UTF8Decode(ByteStringView bsStr) {
- return CFX_UTF8Decoder(bsStr).TakeResult();
+ WideString buffer;
+
+ int remaining = 0;  // Continuation bytes still expected.
+ char32_t code_point = 0;  // Code point being assembled.
+ for (char byte : bsStr) {
+ uint8_t code_unit = static_cast<uint8_t>(byte);
+ if (code_unit < 0x80) {  // ASCII byte.
+ remaining = 0;  // Abandons any unfinished multi-byte sequence.
+ AppendCodePointToWideString(code_unit, buffer);
+ } else if (code_unit < 0xc0) {  // Continuation byte (0x80..0xBF).
+ if (remaining > 0) {  // Stray continuation bytes are skipped.
+ --remaining;
+ code_point = (code_point << 6) | (code_unit & 0x3f);  // Add 6 payload bits.
+ if (remaining == 0) {  // Sequence complete.
+ AppendCodePointToWideString(code_point, buffer);  // No overlong check.
+ }
+ }
+ } else if (code_unit < 0xe0) {  // Lead byte of a 2-byte sequence.
+ remaining = 1;
+ code_point = code_unit & 0x1f;
+ } else if (code_unit < 0xf0) {  // Lead byte of a 3-byte sequence.
+ remaining = 2;
+ code_point = code_unit & 0x0f;
+ } else if (code_unit < 0xf8) {  // Lead byte of a 4-byte sequence.
+ remaining = 3;
+ code_point = code_unit & 0x07;
+ } else {  // 0xF8..0xFF: ignored; resets the decoder state.
+ remaining = 0;
+ }
+ }
+
+ return buffer;  // An unfinished trailing sequence is dropped.
}
namespace {