Make WideString's FromUTF16BE / FromUTF16LE do surrogate fusing ...and use them in PDF_DecodeText(). No behavior change for PDF_DecodeText(). Otherwise, these functions are mostly used on Windows (where this is a no-op since wchar_t is 2 bytes there) and in fuzzers, so this generally shouldn't have a big effect. This is a bit weird since WideString::ToUTF16LE() doesn't create surrogates yet, so things don't round-trip cleanly through FromUTF16LE().ToUTF16LE(). But that's not a new problem since PDF_DecodeText() created fused surrogates before this as well, and FX_UTF8Decode() also creates WideStrings with characters outside the BMP. (ToUTF16LE() adds a zero-terminator, which is also a round-trip problem.) It's also strange that fx_string.h contains UTF-8 conversion methods in addition to WideString::FromUTF8(). They operate on WideStringViews. FX_UTF8Decode() should arguably be replaced by WideString::FromUTF8() everywhere, but FX_UTF8Encode() takes a WideStringView -- we probably want to create FX_UTF16LEEncode() and possibly make WideString::ToUTF16LE() call it. Or maybe FX_UTF8Encode() should become a method on ByteString -- ByteString::UTF8FromWide() or something. But that's all for other CLs. This CL seems like a small self-contained progression. Change-Id: I13afb5f66df6f553358cc03ccfd56a78e8e624c3 Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/114030 Commit-Queue: Nico Weber <thakis@chromium.org> Reviewed-by: Lei Zhang <thestig@chromium.org> Auto-Submit: Nico Weber <thakis@chromium.org> Commit-Queue: Lei Zhang <thestig@chromium.org>

commit: d06523d84bb3152be44a07e8cf0ca7fe30c8bc8e [log] [tgz]
author: Nico Weber <thakis@chromium.org> Thu Nov 30 01:54:56 2023 +0000
committer: Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com> Thu Nov 30 01:54:56 2023 +0000
tree: 71c39e08679c7dc5c530c6f31d7780401a99c0ab
parent: e2704cba8119851a6b0d80ac94ace8d306efba45 [diff]
diff --git a/core/fpdfapi/parser/fpdf_parser_decode.cpp b/core/fpdfapi/parser/fpdf_parser_decode.cpp
index 6412a87..29b2eb6 100644
--- a/core/fpdfapi/parser/fpdf_parser_decode.cpp
+++ b/core/fpdfapi/parser/fpdf_parser_decode.cpp

@@ -466,23 +466,6 @@
   return true;
 }
 
-#if defined(WCHAR_T_IS_32_BIT)
-static size_t FuseSurrogates(pdfium::span<wchar_t> s, size_t n) {
-  size_t dest_pos = 0;
-  for (size_t i = 0; i < n; ++i) {
-    // TODO(crbug.com/pdfium/2031): Always use UTF-16.
-    if (pdfium::IsHighSurrogate(s[i]) && i + 1 < n &&
-        pdfium::IsLowSurrogate(s[i + 1])) {
-      s[dest_pos++] = pdfium::SurrogatePair(s[i], s[i + 1]).ToCodePoint();
-      ++i;
-      continue;
-    }
-    s[dest_pos++] = s[i];
-  }
-  return dest_pos;
-}
-#endif  // defined(WCHAR_T_IS_UTF32)
-
 static size_t StripLanguageCodes(pdfium::span<wchar_t> s, size_t n) {
   size_t dest_pos = 0;
   for (size_t i = 0; i < n; ++i) {
@@ -504,24 +487,16 @@
   WideString result;
   if (span.size() >= 2 && ((span[0] == 0xfe && span[1] == 0xff) ||
                            (span[0] == 0xff && span[1] == 0xfe))) {
-    pdfium::span<wchar_t> dest_buf = result.GetBuffer((span.size() - 2) / 2);
     if (span[0] == 0xfe) {
-      for (size_t i = 2; i < span.size() - 1; i += 2) {
-        dest_buf[dest_pos++] = span[i] << 8 | span[i + 1];
-      }
+      result = WideString::FromUTF16BE(span.subspan(2));
     } else {
-      for (size_t i = 2; i < span.size() - 1; i += 2) {
-        dest_buf[dest_pos++] = span[i + 1] << 8 | span[i];
-      }
+      result = WideString::FromUTF16LE(span.subspan(2));
     }
-#if defined(WCHAR_T_IS_32_BIT)
-    dest_pos = FuseSurrogates(dest_buf, dest_pos);
-#endif
-
-    dest_pos = StripLanguageCodes(dest_buf, dest_pos);
+    pdfium::span<wchar_t> dest_buf = result.GetBuffer(result.GetLength());
+    dest_pos = StripLanguageCodes(dest_buf, result.GetLength());
   } else if (span.size() >= 3 && span[0] == 0xef && span[1] == 0xbb &&
              span[2] == 0xbf) {
-    result = FX_UTF8Decode(span.subspan(3));
+    result = WideString::FromUTF8(span.subspan(3));
     pdfium::span<wchar_t> dest_buf = result.GetBuffer(result.GetLength());
     dest_pos = StripLanguageCodes(dest_buf, result.GetLength());
   } else {

diff --git a/core/fxcrt/widestring.cpp b/core/fxcrt/widestring.cpp
index 2ad7b5d..dca0a48 100644
--- a/core/fxcrt/widestring.cpp
+++ b/core/fxcrt/widestring.cpp

@@ -18,6 +18,7 @@
 #include "core/fxcrt/fx_safe_types.h"
 #include "core/fxcrt/fx_system.h"
 #include "core/fxcrt/string_pool_template.h"
+#include "core/fxcrt/utf16.h"
 #include "third_party/base/check.h"
 #include "third_party/base/check_op.h"
 #include "third_party/base/numerics/safe_math.h"
@@ -33,6 +34,23 @@
 
 namespace {
 
+#if defined(WCHAR_T_IS_32_BIT)
+size_t FuseSurrogates(pdfium::span<wchar_t> s, size_t n) {
+  size_t dest_pos = 0;
+  for (size_t i = 0; i < n; ++i) {
+    // TODO(crbug.com/pdfium/2031): Always use UTF-16.
+    if (pdfium::IsHighSurrogate(s[i]) && i + 1 < n &&
+        pdfium::IsLowSurrogate(s[i + 1])) {
+      s[dest_pos++] = pdfium::SurrogatePair(s[i], s[i + 1]).ToCodePoint();
+      ++i;
+      continue;
+    }
+    s[dest_pos++] = s[i];
+  }
+  return dest_pos;
+}
+#endif  // defined(WCHAR_T_IS_32_BIT)
+
 constexpr wchar_t kWideTrimChars[] = L"\x09\x0a\x0b\x0c\x0d\x20";
 
 const wchar_t* FX_wcsstr(const wchar_t* haystack,
@@ -970,6 +988,10 @@
     for (size_t i = 0; i < data.size() - 1; i += 2) {
       buf[length++] = data[i] | data[i + 1] << 8;
     }
+
+#if defined(WCHAR_T_IS_32_BIT)
+    length = FuseSurrogates(buf, length);
+#endif
   }
   result.ReleaseBuffer(length);
   return result;
@@ -988,6 +1010,10 @@
     for (size_t i = 0; i < data.size() - 1; i += 2) {
       buf[length++] = data[i] << 8 | data[i + 1];
     }
+
+#if defined(WCHAR_T_IS_32_BIT)
+    length = FuseSurrogates(buf, length);
+#endif
   }
   result.ReleaseBuffer(length);
   return result;

diff --git a/core/fxcrt/widestring_unittest.cpp b/core/fxcrt/widestring_unittest.cpp
index 947926c..5c555ca 100644
--- a/core/fxcrt/widestring_unittest.cpp
+++ b/core/fxcrt/widestring_unittest.cpp

@@ -1136,6 +1136,44 @@
   EXPECT_EQ(0, iter - multi_str.rbegin());
 }
 
+TEST(WideString, FromUTF16BE) {
+  struct UTF16BEDecodeCase {
+    ByteString in;
+    WideString out;
+  } const utf16be_decode_cases[] = {
+      {"", L""},
+      {ByteString("\0a\0b\0c", 6), L"abc"},
+      {ByteString("\0a\0b\0c\0\0\0d\0e\0f", 14), WideString(L"abc\0def", 7)},
+      {ByteString(" &", 2), L"…"},
+      {ByteString("\xD8\x3C\xDF\xA8", 4), L"🎨"},
+  };
+
+  for (size_t i = 0; i < std::size(utf16be_decode_cases); ++i) {
+    EXPECT_EQ(WideString::FromUTF16BE(utf16be_decode_cases[i].in.raw_span()),
+              utf16be_decode_cases[i].out)
+        << " for case number " << i;
+  }
+}
+
+TEST(WideString, FromUTF16LE) {
+  struct UTF16LEDecodeCase {
+    ByteString in;
+    WideString out;
+  } const utf16le_decode_cases[] = {
+      {"", L""},
+      {ByteString("a\0b\0c\0", 6), L"abc"},
+      {ByteString("a\0b\0c\0\0\0d\0e\0f\0", 14), WideString(L"abc\0def", 7)},
+      {ByteString("& ", 2), L"…"},
+      {ByteString("\x3C\xD8\xA8\xDF", 4), L"🎨"},
+  };
+
+  for (size_t i = 0; i < std::size(utf16le_decode_cases); ++i) {
+    EXPECT_EQ(WideString::FromUTF16LE(utf16le_decode_cases[i].in.raw_span()),
+              utf16le_decode_cases[i].out)
+        << " for case number " << i;
+  }
+}
+
 TEST(WideString, ToUTF16LE) {
   struct UTF16LEEncodeCase {
     WideString ws;
commit	d06523d84bb3152be44a07e8cf0ca7fe30c8bc8e	[log] [tgz]
author	Nico Weber <thakis@chromium.org>	Thu Nov 30 01:54:56 2023 +0000
committer	Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com>	Thu Nov 30 01:54:56 2023 +0000
tree	71c39e08679c7dc5c530c6f31d7780401a99c0ab
parent	e2704cba8119851a6b0d80ac94ace8d306efba45 [diff]