Make WideString::ToUTF16LE() do surrogate splitting if needed. This should make text strings with characters outside of the BMP show up correctly on non-Windows platforms. (Because e.g. PDFiumEngine::TraverseBookmarks() calls FPDFBookmark_GetTitle() expecting to get a std::u16string, and FPDFBookmark_GetTitle() calls Utf16EncodeMaybeCopyAndReturnLength() which calls WideString::ToUTF16LE() which currently chops off the two upper bytes of wchar_t on platforms where wchar_t is 4 bytes -- i.e. on non-Windows.) It's still a bit unfortunate that: * ToUTF16LE() is a method on WideString, since that means we can't use it when we only have a WideStringView, such as in PDF_EncodeText() (which wants UTF16BE, but we could call UTF16LE and swap bytes). We might want to move the method to be a static thing on ByteString instead: ByteString::UTF16LEFromWide(WideStringView) * ToUTF16LE() adds a zero-terminator, making it nonsymmetrical with FromUTF16LE. Some places (AsFPDFWideString()) want to convert the result to an FPDF_WIDESTRING which is nul-terminated. Maybe there should be an arg for opting in to the zero termination. But again, that's for separate CLs. Bug: pdfium:2105 Change-Id: I7aa1f1aa96f7b490c0c4435577fffe82a91809d9 Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/114130 Commit-Queue: Tom Sepez <tsepez@chromium.org> Auto-Submit: Nico Weber <thakis@chromium.org> Commit-Queue: Nico Weber <thakis@chromium.org> Reviewed-by: Tom Sepez <tsepez@chromium.org>

commit: fea01fa9e2a82abd5f6d1117725a4afd01102236 [log] [tgz]
author: Nico Weber <thakis@chromium.org> Fri Dec 01 22:25:00 2023 +0000
committer: Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com> Fri Dec 01 22:25:00 2023 +0000
tree: 94f15675b30aafa68b84670c581cb7c5de4cb3a6
parent: 7388bd02f160a35d06b58e57f6374780fe4bafd3 [diff]
diff --git a/core/fxcrt/widestring.cpp b/core/fxcrt/widestring.cpp
index bfc9555..aa01fc3 100644
--- a/core/fxcrt/widestring.cpp
+++ b/core/fxcrt/widestring.cpp

@@ -771,17 +771,36 @@
 
   ByteString result;
   size_t len = m_pData->m_nDataLength;
+  size_t output_length = 0;
   {
     // Span's lifetime must end before ReleaseBuffer() below.
-    pdfium::span<char> buffer = result.GetBuffer(len * 2 + 2);
+#if defined(WCHAR_T_IS_32_BIT)
+    // 2 or 4 bytes required per UTF-32 code unit.
+    pdfium::span<uint8_t> buffer =
+        pdfium::as_writable_bytes(result.GetBuffer(len * 4 + 2));
+#else
+    // 2 bytes required per UTF-16 code unit.
+    pdfium::span<uint8_t> buffer =
+        pdfium::as_writable_bytes(result.GetBuffer(len * 2 + 2));
+#endif
     for (size_t i = 0; i < len; i++) {
-      buffer[i * 2] = m_pData->m_String[i] & 0xff;
-      buffer[i * 2 + 1] = m_pData->m_String[i] >> 8;
+#if defined(WCHAR_T_IS_32_BIT)
+      if (pdfium::IsSupplementary(m_pData->m_String[i])) {
+        pdfium::SurrogatePair pair(m_pData->m_String[i]);
+        buffer[output_length++] = pair.high() & 0xff;
+        buffer[output_length++] = pair.high() >> 8;
+        buffer[output_length++] = pair.low() & 0xff;
+        buffer[output_length++] = pair.low() >> 8;
+        continue;
+      }
+#endif  // defined(WCHAR_T_IS_32_BIT)
+      buffer[output_length++] = m_pData->m_String[i] & 0xff;
+      buffer[output_length++] = m_pData->m_String[i] >> 8;
     }
-    buffer[len * 2] = 0;
-    buffer[len * 2 + 1] = 0;
+    buffer[output_length++] = 0;
+    buffer[output_length++] = 0;
   }
-  result.ReleaseBuffer(len * 2 + 2);
+  result.ReleaseBuffer(output_length);
   return result;
 }
 

diff --git a/core/fxcrt/widestring_unittest.cpp b/core/fxcrt/widestring_unittest.cpp
index f60e11f..39d734b 100644
--- a/core/fxcrt/widestring_unittest.cpp
+++ b/core/fxcrt/widestring_unittest.cpp

@@ -1279,6 +1279,7 @@
       {L"abc\0def", ByteString("a\0b\0c\0\0\0", 8)},
       {L"\xaabb\xccdd", ByteString("\xbb\xaa\xdd\xcc\0\0", 6)},
       {L"\x3132\x6162", ByteString("\x32\x31\x62\x61\0\0", 6)},
+      {L"🎨", ByteString("\x3C\xD8\xA8\xDF\0\0", 6)},
   };
 
   for (size_t i = 0; i < std::size(utf16le_encode_cases); ++i) {

diff --git a/fpdfsdk/fpdf_text.cpp b/fpdfsdk/fpdf_text.cpp
index f1b193d..194a602 100644
--- a/fpdfsdk/fpdf_text.cpp
+++ b/fpdfsdk/fpdf_text.cpp

@@ -338,8 +338,6 @@
   if (str.GetLength() > static_cast<size_t>(char_count))
     str = str.First(static_cast<size_t>(char_count));
 
-  // UFT16LE_Encode doesn't handle surrogate pairs properly, so it is expected
-  // the number of items to stay the same.
   ByteString byte_str = str.ToUTF16LE();
   size_t byte_str_len = byte_str.GetLength();
   size_t ret_count = byte_str_len / kBytesPerCharacter;
commit	fea01fa9e2a82abd5f6d1117725a4afd01102236	[log] [tgz]
author	Nico Weber <thakis@chromium.org>	Fri Dec 01 22:25:00 2023 +0000
committer	Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com>	Fri Dec 01 22:25:00 2023 +0000
tree	94f15675b30aafa68b84670c581cb7c5de4cb3a6
parent	7388bd02f160a35d06b58e57f6374780fe4bafd3 [diff]