Handle non-BMP code points for UTF-16 wchar_t

Correctly encodes supplementary (non-BMP) code points in WideString as UTF-16 if wchar_t is 16 bits (as on Windows), rather than 32 bits (as on Linux).

Bug: pdfium:2029
Change-Id: I662ac7fb08bd48267e32da5f36fd1c9c2bb70717
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/107070
Reviewed-by: Nigi <nigi@chromium.org>
Commit-Queue: K. Moon <kmoon@chromium.org>

commit: 7014c95eecc2dd755ffdbb26ec79a998446099e2 [log] [tgz]
author: K. Moon <kmoon@chromium.org> Thu May 04 19:21:13 2023 +0000
committer: Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com> Thu May 04 19:21:13 2023 +0000
tree: daa3a0d7d594746a4acde2a74298c5c271378e95
parent: c651b5492a99b48be1ed6e54a3da2b97c8bbe113 [diff]
diff --git a/core/fxcrt/cfx_utf8decoder.cpp b/core/fxcrt/cfx_utf8decoder.cpp
index 276e186..9661745 100644
--- a/core/fxcrt/cfx_utf8decoder.cpp
+++ b/core/fxcrt/cfx_utf8decoder.cpp

@@ -10,6 +10,8 @@
 
 #include <utility>
 
+#include "build/build_config.h"
+
 CFX_UTF8Decoder::CFX_UTF8Decoder(ByteStringView input) {
   int remaining = 0;
   char32_t code_point = 0;
@@ -54,5 +56,16 @@
     return;
   }
 
+#if defined(WCHAR_T_IS_UTF16)
+  if (code_point < 0x10000) {
+    buffer_ += static_cast<wchar_t>(code_point);
+  } else {
+    // Encode as UTF-16 surrogate pair.
+    code_point -= 0x10000;
+    buffer_ += 0xd800 | (code_point >> 10);
+    buffer_ += 0xdc00 | (code_point & 0x3ff);
+  }
+#else
   buffer_ += static_cast<wchar_t>(code_point);
+#endif  // defined(WCHAR_T_IS_UTF16)
 }

diff --git a/core/fxcrt/cfx_utf8encoder.cpp b/core/fxcrt/cfx_utf8encoder.cpp
index 117f3f7..e0b7ee5 100644
--- a/core/fxcrt/cfx_utf8encoder.cpp
+++ b/core/fxcrt/cfx_utf8encoder.cpp

@@ -8,12 +8,36 @@
 
 #include <stdint.h>
 
+#include "build/build_config.h"
+
 CFX_UTF8Encoder::CFX_UTF8Encoder() = default;
 
 CFX_UTF8Encoder::~CFX_UTF8Encoder() = default;
 
 void CFX_UTF8Encoder::Input(wchar_t code_unit) {
-  char32_t code_point = static_cast<char32_t>(code_unit);
+#if defined(WCHAR_T_IS_UTF16)
+  if (code_unit >= 0xd800 && code_unit < 0xdc00) {
+    // High surrogate.
+    high_surrogate_ = code_unit;
+  } else if (code_unit >= 0xdc00 && code_unit <= 0xdfff) {
+    // Low surrogate.
+    if (high_surrogate_) {
+      char32_t code_point = code_unit & 0x3ff;
+      code_point |= (high_surrogate_ & 0x3ff) << 10;
+      code_point += 0x10000;
+      high_surrogate_ = 0;
+      AppendCodePoint(code_point);
+    }
+  } else {
+    high_surrogate_ = 0;
+    AppendCodePoint(code_unit);
+  }
+#else
+  AppendCodePoint(code_unit);
+#endif  // defined(WCHAR_T_IS_UTF16)
+}
+
+void CFX_UTF8Encoder::AppendCodePoint(char32_t code_point) {
   if (code_point > 0x10ffff) {
     // Invalid code point above U+10FFFF.
     return;

diff --git a/core/fxcrt/cfx_utf8encoder.h b/core/fxcrt/cfx_utf8encoder.h
index a39f0cb..71b9ac3 100644
--- a/core/fxcrt/cfx_utf8encoder.h
+++ b/core/fxcrt/cfx_utf8encoder.h

@@ -7,6 +7,7 @@
 #ifndef CORE_FXCRT_CFX_UTF8ENCODER_H_
 #define CORE_FXCRT_CFX_UTF8ENCODER_H_
 
+#include "build/build_config.h"
 #include "core/fxcrt/data_vector.h"
 #include "core/fxcrt/string_view_template.h"
 
@@ -26,7 +27,13 @@
   }
 
  private:
+  void AppendCodePoint(char32_t code_point);
+
   DataVector<char> buffer_;
+
+#if defined(WCHAR_T_IS_UTF16)
+  char16_t high_surrogate_ = 0;
+#endif  // defined(WCHAR_T_IS_UTF16)
 };
 
 #endif  // CORE_FXCRT_CFX_UTF8ENCODER_H_

diff --git a/core/fxcrt/fx_string_unittest.cpp b/core/fxcrt/fx_string_unittest.cpp
index b3f2864..34ffcee 100644
--- a/core/fxcrt/fx_string_unittest.cpp
+++ b/core/fxcrt/fx_string_unittest.cpp

@@ -4,6 +4,7 @@
 
 #include <limits>
 
+#include "build/build_config.h"
 #include "core/fxcrt/fx_string.h"
 #include "testing/gtest/include/gtest/gtest.h"
 #include "third_party/base/span.h"
@@ -41,6 +42,25 @@
                     L"y"));
 }
 
+TEST(fxstring, FXUTF8EncodeSupplementary) {
+  EXPECT_EQ(
+      "\xf0\x90\x80\x80"
+      "🎨"
+      "\xf4\x8f\xbf\xbf",
+      FX_UTF8Encode(L"\U00010000"
+                    L"\U0001f3a8"
+                    L"\U0010ffff"));
+}
+
+#if defined(WCHAR_T_IS_UTF16)
+TEST(fxstring, FXUTF8EncodeSurrogateErrorRecovery) {
+  EXPECT_EQ("()", FX_UTF8Encode(L"(\xd800)")) << "High";
+  EXPECT_EQ("()", FX_UTF8Encode(L"(\xdc00)")) << "Low";
+  EXPECT_EQ("(🎨)", FX_UTF8Encode(L"(\xd800\xd83c\xdfa8)")) << "High-high";
+  EXPECT_EQ("(🎨)", FX_UTF8Encode(L"(\xd83c\xdfa8\xdc00)")) << "Low-low";
+}
+#endif  // defined(WCHAR_T_IS_UTF16)
+
 TEST(fxstring, FXUTF8Decode) {
   EXPECT_EQ(L"", FX_UTF8Decode(ByteStringView()));
   EXPECT_EQ(
@@ -62,6 +82,16 @@
                     "y"));
 }
 
+TEST(fxstring, FXUTF8DecodeSupplementary) {
+  EXPECT_EQ(
+      L"\U00010000"
+      L"\U0001f3a8"
+      L"\U0010ffff",
+      FX_UTF8Decode("\xf0\x90\x80\x80"
+                    "🎨"
+                    "\xf4\x8f\xbf\xbf"));
+}
+
 TEST(fxstring, FXUTF8DecodeErrorRecovery) {
   EXPECT_EQ(L"(A)", FX_UTF8Decode("(\xc2\x41)")) << "Invalid continuation";
   EXPECT_EQ(L"()", FX_UTF8Decode("(\xc2\xc2)")) << "Invalid continuation";
commit	7014c95eecc2dd755ffdbb26ec79a998446099e2	[log] [tgz]
author	K. Moon <kmoon@chromium.org>	Thu May 04 19:21:13 2023 +0000
committer	Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com>	Thu May 04 19:21:13 2023 +0000
tree	daa3a0d7d594746a4acde2a74298c5c271378e95
parent	c651b5492a99b48be1ed6e54a3da2b97c8bbe113 [diff]