Clean up the UTF-8 encoder and decoder

Makes various minor improvements to the UTF-8 encoder and decoder:

1. Consistently uses the terms "code unit" and "code point" as defined in the Unicode standard.
2. Drops support for invalid code points above U+10FFFF.
3. Simplifies UTF-8 decoder by placing more state on the stack.
4. Consistently uses "char16_t" to represent UTF-16 code units.
5. Consistently uses "char32_t" to represent code points.
6. Uses "char" to represent ByteString elements.
7. Prepares tests to handle UTF-16 in the future.
8. Converts class member names to Google style.
9. Applies IWYU (include-what-you-use) fixes.

Bug: pdfium:2029
Change-Id: I1ca1ecf28ee34737e1df741ecfdc259ca2052833
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/107090
Reviewed-by: Nigi <nigi@chromium.org>
Commit-Queue: K. Moon <kmoon@chromium.org>

commit: c651b5492a99b48be1ed6e54a3da2b97c8bbe113 [log] [tgz]
author: K. Moon <kmoon@chromium.org> Thu May 04 18:43:36 2023 +0000
committer: Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com> Thu May 04 18:43:36 2023 +0000
tree: 955de294cae55e638a17cd9111580b95c19c7e98
parent: 09e030db27a1778fe502257985538aeb5fa67d00 [diff]
diff --git a/core/fxcrt/cfx_utf8decoder.cpp b/core/fxcrt/cfx_utf8decoder.cpp
index e834815..276e186 100644
--- a/core/fxcrt/cfx_utf8decoder.cpp
+++ b/core/fxcrt/cfx_utf8decoder.cpp

@@ -6,53 +6,53 @@
 
 #include "core/fxcrt/cfx_utf8decoder.h"
 
+#include <stdint.h>
+
 #include <utility>
 
 CFX_UTF8Decoder::CFX_UTF8Decoder(ByteStringView input) {
-  for (char c : input) {
-    ProcessByte(c);
+  int remaining = 0;
+  char32_t code_point = 0;
+
+  for (char byte : input) {
+    uint8_t code_unit = static_cast<uint8_t>(byte);
+    if (code_unit < 0x80) {
+      remaining = 0;
+      AppendCodePoint(code_unit);
+    } else if (code_unit < 0xc0) {
+      if (remaining > 0) {
+        --remaining;
+        code_point = (code_point << 6) | (code_unit & 0x3f);
+        if (remaining == 0) {
+          AppendCodePoint(code_point);
+        }
+      }
+    } else if (code_unit < 0xe0) {
+      remaining = 1;
+      code_point = code_unit & 0x1f;
+    } else if (code_unit < 0xf0) {
+      remaining = 2;
+      code_point = code_unit & 0x0f;
+    } else if (code_unit < 0xf8) {
+      remaining = 3;
+      code_point = code_unit & 0x07;
+    } else {
+      remaining = 0;
+    }
   }
 }
 
 CFX_UTF8Decoder::~CFX_UTF8Decoder() = default;
 
 WideString CFX_UTF8Decoder::TakeResult() {
-  return std::move(m_Buffer);
+  return std::move(buffer_);
 }
 
-void CFX_UTF8Decoder::AppendCodePoint(uint32_t ch) {
-  m_Buffer += static_cast<wchar_t>(ch);
-}
-
-void CFX_UTF8Decoder::ProcessByte(uint8_t byte) {
-  if (byte < 0x80) {
-    m_PendingBytes = 0;
-    AppendCodePoint(byte);
-  } else if (byte < 0xc0) {
-    if (m_PendingBytes == 0) {
-      return;
-    }
-    m_PendingBytes--;
-    m_PendingChar |= (byte & 0x3f) << (m_PendingBytes * 6);
-    if (m_PendingBytes == 0) {
-      AppendCodePoint(m_PendingChar);
-    }
-  } else if (byte < 0xe0) {
-    m_PendingBytes = 1;
-    m_PendingChar = (byte & 0x1f) << 6;
-  } else if (byte < 0xf0) {
-    m_PendingBytes = 2;
-    m_PendingChar = (byte & 0x0f) << 12;
-  } else if (byte < 0xf8) {
-    m_PendingBytes = 3;
-    m_PendingChar = (byte & 0x07) << 18;
-  } else if (byte < 0xfc) {
-    m_PendingBytes = 4;
-    m_PendingChar = (byte & 0x03) << 24;
-  } else if (byte < 0xfe) {
-    m_PendingBytes = 5;
-    m_PendingChar = (byte & 0x01) << 30;
-  } else {
-    m_PendingBytes = 0;
+void CFX_UTF8Decoder::AppendCodePoint(char32_t code_point) {
+  if (code_point > 0x10ffff) {
+    // Invalid code point above U+10FFFF.
+    return;
   }
+
+  buffer_ += static_cast<wchar_t>(code_point);
 }

diff --git a/core/fxcrt/cfx_utf8decoder.h b/core/fxcrt/cfx_utf8decoder.h
index 35b5671..9d9b0c1 100644
--- a/core/fxcrt/cfx_utf8decoder.h
+++ b/core/fxcrt/cfx_utf8decoder.h

@@ -18,12 +18,9 @@
   WideString TakeResult();
 
  private:
-  void ProcessByte(uint8_t byte);
-  void AppendCodePoint(uint32_t ch);
+  void AppendCodePoint(char32_t code_point);
 
-  int m_PendingBytes = 0;
-  uint32_t m_PendingChar = 0;
-  WideString m_Buffer;
+  WideString buffer_;
 };
 
 #endif  // CORE_FXCRT_CFX_UTF8DECODER_H_

diff --git a/core/fxcrt/cfx_utf8encoder.cpp b/core/fxcrt/cfx_utf8encoder.cpp
index 4951126..117f3f7 100644
--- a/core/fxcrt/cfx_utf8encoder.cpp
+++ b/core/fxcrt/cfx_utf8encoder.cpp

@@ -6,38 +6,40 @@
 
 #include "core/fxcrt/cfx_utf8encoder.h"
 
+#include <stdint.h>
+
 CFX_UTF8Encoder::CFX_UTF8Encoder() = default;
 
 CFX_UTF8Encoder::~CFX_UTF8Encoder() = default;
 
-void CFX_UTF8Encoder::Input(wchar_t unicodeAsWchar) {
-  uint32_t unicode = static_cast<uint32_t>(unicodeAsWchar);
-  if (unicode < 0x80) {
-    m_Buffer.push_back(unicode);
+void CFX_UTF8Encoder::Input(wchar_t code_unit) {
+  char32_t code_point = static_cast<char32_t>(code_unit);
+  if (code_point > 0x10ffff) {
+    // Invalid code point above U+10FFFF.
+    return;
+  }
+
+  if (code_point < 0x80) {
+    // 7-bit code points are unchanged in UTF-8.
+    buffer_.push_back(code_point);
+    return;
+  }
+
+  int byte_size;
+  if (code_point < 0x800) {
+    byte_size = 2;
+  } else if (code_point < 0x10000) {
+    byte_size = 3;
   } else {
-    if (unicode >= 0x80000000)
-      return;
+    byte_size = 4;
+  }
 
-    int nbytes = 0;
-    if (unicode < 0x800)
-      nbytes = 2;
-    else if (unicode < 0x10000)
-      nbytes = 3;
-    else if (unicode < 0x200000)
-      nbytes = 4;
-    else if (unicode < 0x4000000)
-      nbytes = 5;
-    else
-      nbytes = 6;
-
-    static const uint8_t prefix[] = {0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
-    int order = 1 << ((nbytes - 1) * 6);
-    int code = unicodeAsWchar;
-    m_Buffer.push_back(prefix[nbytes - 2] | (code / order));
-    for (int i = 0; i < nbytes - 1; i++) {
-      code = code % order;
-      order >>= 6;
-      m_Buffer.push_back(0x80 | (code / order));
-    }
+  static constexpr uint8_t kPrefix[] = {0xc0, 0xe0, 0xf0};
+  int order = 1 << ((byte_size - 1) * 6);
+  buffer_.push_back(kPrefix[byte_size - 2] | (code_point / order));
+  for (int i = 0; i < byte_size - 1; i++) {
+    code_point = code_point % order;
+    order >>= 6;
+    buffer_.push_back(0x80 | (code_point / order));
   }
 }

diff --git a/core/fxcrt/cfx_utf8encoder.h b/core/fxcrt/cfx_utf8encoder.h
index 13815cd..a39f0cb 100644
--- a/core/fxcrt/cfx_utf8encoder.h
+++ b/core/fxcrt/cfx_utf8encoder.h

@@ -7,26 +7,26 @@
 #ifndef CORE_FXCRT_CFX_UTF8ENCODER_H_
 #define CORE_FXCRT_CFX_UTF8ENCODER_H_
 
-#include <stdint.h>
-
-#include "core/fxcrt/bytestring.h"
 #include "core/fxcrt/data_vector.h"
+#include "core/fxcrt/string_view_template.h"
 
 class CFX_UTF8Encoder {
  public:
   CFX_UTF8Encoder();
   ~CFX_UTF8Encoder();
 
-  void Input(wchar_t unicodeAsWchar);
+  // `code_unit` may be UTF-16 or UTF-32, depending on the platform.
+  // TODO(crbug.com/pdfium/2031): Accept `char16_t` instead of `wchar_t`.
+  void Input(wchar_t code_unit);
 
-  // The data returned by GetResult() is invalidated when this is modified by
+  // The data returned by `GetResult()` is invalidated when this is modified by
   // appending any data.
   ByteStringView GetResult() const {
-    return ByteStringView(m_Buffer.data(), m_Buffer.size());
+    return ByteStringView(buffer_.data(), buffer_.size());
   }
 
  private:
-  DataVector<uint8_t> m_Buffer;
+  DataVector<char> buffer_;
 };
 
 #endif  // CORE_FXCRT_CFX_UTF8ENCODER_H_

diff --git a/core/fxcrt/fx_string_unittest.cpp b/core/fxcrt/fx_string_unittest.cpp
index 2619864..b3f2864 100644
--- a/core/fxcrt/fx_string_unittest.cpp
+++ b/core/fxcrt/fx_string_unittest.cpp

@@ -20,52 +20,75 @@
   return buf.data();
 }
 
-TEST(fxstring, FX_UTF8Encode) {
+TEST(fxstring, FXUTF8Encode) {
   EXPECT_EQ("", FX_UTF8Encode(WideStringView()));
   EXPECT_EQ(
       "x"
       "\xc2\x80"
       "\xc3\xbf"
+      "\xed\x9f\xbf"
+      "\xee\x80\x80"
       "\xef\xbc\xac"
+      "\xef\xbf\xbf"
       "y",
       FX_UTF8Encode(L"x"
                     L"\u0080"
                     L"\u00ff"
+                    L"\ud7ff"
+                    L"\ue000"
                     L"\uff2c"
+                    L"\uffff"
                     L"y"));
 }
 
-TEST(fxstring, FX_UTF8Decode) {
+TEST(fxstring, FXUTF8Decode) {
   EXPECT_EQ(L"", FX_UTF8Decode(ByteStringView()));
   EXPECT_EQ(
       L"x"
       L"\u0080"
       L"\u00ff"
+      L"\ud7ff"
+      L"\ue000"
       L"\uff2c"
+      L"\uffff"
       L"y",
       FX_UTF8Decode("x"
                     "\xc2\x80"
                     "\xc3\xbf"
+                    "\xed\x9f\xbf"
+                    "\xee\x80\x80"
                     "\xef\xbc\xac"
+                    "\xef\xbf\xbf"
                     "y"));
-  EXPECT_EQ(L"a(A) b() c() d() e().",
-            FX_UTF8Decode("a(\xc2\x41) "      // Invalid continuation.
-                          "b(\xc2\xc2) "      // Invalid continuation.
-                          "c(\xc2\xff\x80) "  // Invalid continuation.
-                          "d(\x80\x80) "      // Invalid leading.
-                          "e(\xff\x80\x80)"   // Invalid leading.
-                          "."));
 }
 
-TEST(fxstring, FX_UTF8EncodeDecodeConsistency) {
+TEST(fxstring, FXUTF8DecodeErrorRecovery) {
+  EXPECT_EQ(L"(A)", FX_UTF8Decode("(\xc2\x41)")) << "Invalid continuation";
+  EXPECT_EQ(L"()", FX_UTF8Decode("(\xc2\xc2)")) << "Invalid continuation";
+  EXPECT_EQ(L"()", FX_UTF8Decode("(\xc2\xff\x80)")) << "Invalid continuation";
+  EXPECT_EQ(L"()", FX_UTF8Decode("(\x80\x80)")) << "Invalid leading";
+  EXPECT_EQ(L"()", FX_UTF8Decode("(\xff\x80\x80)")) << "Invalid leading";
+  EXPECT_EQ(L"()", FX_UTF8Decode("(\xf8\x80\x80\x80\x80)"))
+      << "Invalid leading";
+  EXPECT_EQ(L"()", FX_UTF8Decode("(\xf8\x88\x80\x80\x80)"))
+      << "Invalid leading";
+  EXPECT_EQ(L"()", FX_UTF8Decode("(\xf4\x90\x80\x80)"))
+      << "Code point greater than U+10FFFF";
+}
+
+TEST(fxstring, FXUTF8EncodeDecodeConsistency) {
   WideString wstr;
   wstr.Reserve(0x10000);
-  for (int w = 0; w < 0x10000; ++w)
-    wstr += static_cast<wchar_t>(w);
+  for (int w = 0; w < 0x10000; ++w) {
+    // Skip UTF-16 surrogates.
+    if (w < 0xD800 || w >= 0xE000) {
+      wstr += static_cast<wchar_t>(w);
+    }
+  }
+  ASSERT_EQ(0xF800u, wstr.GetLength());
 
   ByteString bstr = FX_UTF8Encode(wstr.AsStringView());
   WideString wstr2 = FX_UTF8Decode(bstr.AsStringView());
-  EXPECT_EQ(0x10000u, wstr2.GetLength());
   EXPECT_EQ(wstr, wstr2);
 }
 

diff --git a/core/fxcrt/widestring.h b/core/fxcrt/widestring.h
index 24547dd..cf9edd3 100644
--- a/core/fxcrt/widestring.h
+++ b/core/fxcrt/widestring.h

@@ -32,6 +32,7 @@
 // avoids the cost of std::string's iterator stability guarantees.
 class WideString {
  public:
+  // TODO(crbug.com/pdfium/2031): Consider switching to `char16_t` instead.
   using CharType = wchar_t;
   using const_iterator = const CharType*;
   using const_reverse_iterator = std::reverse_iterator<const_iterator>;

diff --git a/fxjs/xfa/cfxjse_formcalc_context_embeddertest.cpp b/fxjs/xfa/cfxjse_formcalc_context_embeddertest.cpp
index 1478bf9..9aa7b8b 100644
--- a/fxjs/xfa/cfxjse_formcalc_context_embeddertest.cpp
+++ b/fxjs/xfa/cfxjse_formcalc_context_embeddertest.cpp

@@ -4,6 +4,7 @@
 
 #include <math.h>
 
+#include "build/build_config.h"
 #include "fxjs/fxv8.h"
 #include "fxjs/xfa/cfxjse_engine.h"
 #include "fxjs/xfa/cfxjse_isolatetracker.h"
@@ -717,13 +718,12 @@
   ExecuteExpectString("Encode(\"\\u0022\\u00f5\\ufed0\", \"html\")",
                       "&quot;&otilde;&#xfed0;");
 
-#if !BUILDFLAG(IS_WIN)
-  // Windows wchar_t isn't wide enough to handle these anyways.
-  // TODO(tsepez): fix surrogate encodings.
+#if defined(WCHAR_T_IS_UTF32)
+  // TODO(crbug.com/pdfium/2029): Support UTF-16.
   ExecuteExpectString("Encode(\"\\uD83D\\uDCA9\", \"url\")", "%01%f4%a9");
   ExecuteExpectString("Encode(\"\\uD83D\\uDCA9\", \"xml\")", "");
   ExecuteExpectString("Encode(\"\\uD83D\\uDCA9\", \"html\")", "");
-#endif  // !BUILDFLAG(IS_WIN)
+#endif  // defined(WCHAR_T_IS_UTF32)
 }
 
 TEST_F(CFXJSE_FormCalcContextEmbedderTest, DISABLED_Format) {
commit	c651b5492a99b48be1ed6e54a3da2b97c8bbe113	[log] [tgz]
author	K. Moon <kmoon@chromium.org>	Thu May 04 18:43:36 2023 +0000
committer	Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com>	Thu May 04 18:43:36 2023 +0000
tree	955de294cae55e638a17cd9111580b95c19c7e98
parent	09e030db27a1778fe502257985538aeb5fa67d00 [diff]