Make WideString::ToUTF16LE() do surrogate splitting if needed

Should make text strings with characters outside of the BMP show up
correctly on non-Windows platforms.

(Because e.g. PDFiumEngine::TraverseBookmarks() calls
FPDFBookmark_GetTitle() expecting to get a std::u16string, and
FPDFBookmark_GetTitle() calls Utf16EncodeMaybeCopyAndReturnLength()
which calls WideString::ToUTF16LE() which currently chops off the
two upper bytes of wchar_t on platforms where wchar_t is 4 bytes --
i.e. on non-Windows.)

It's still a bit unfortunate that:
* ToUTF16LE() is a method on WideString, since that means we can't
  use it when we only have a WideStringView, such as in PDF_EncodeText()
  (which wants UTF16BE, but we could call UTF16LE and swap bytes).
  We might want to move the method to be a static thing on ByteString
  instead: ByteString::UTF16LEFromWide(WideStringView)
* ToUTF16LE() adds a zero-terminator, making it nonsymmetrical with
  FromUTF16LE. Some places (AsFPDFWideString()) want to convert the
  result to a FPDF_WIDESTRING which is nul-terminated. Maybe there
  should be an arg for opting in to the zero termination.

But again, that's for separate CLs.

Bug: pdfium:2105
Change-Id: I7aa1f1aa96f7b490c0c4435577fffe82a91809d9
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/114130
Commit-Queue: Tom Sepez <tsepez@chromium.org>
Auto-Submit: Nico Weber <thakis@chromium.org>
Commit-Queue: Nico Weber <thakis@chromium.org>
Reviewed-by: Tom Sepez <tsepez@chromium.org>
diff --git a/core/fxcrt/widestring.cpp b/core/fxcrt/widestring.cpp
index bfc9555..aa01fc3 100644
--- a/core/fxcrt/widestring.cpp
+++ b/core/fxcrt/widestring.cpp
@@ -771,17 +771,36 @@
 
   ByteString result;
   size_t len = m_pData->m_nDataLength;
+  size_t output_length = 0;
   {
     // Span's lifetime must end before ReleaseBuffer() below.
-    pdfium::span<char> buffer = result.GetBuffer(len * 2 + 2);
+#if defined(WCHAR_T_IS_32_BIT)
+    // 2 or 4 bytes required per UTF-32 code unit.
+    pdfium::span<uint8_t> buffer =
+        pdfium::as_writable_bytes(result.GetBuffer(len * 4 + 2));
+#else
+    // 2 bytes required per UTF-16 code unit.
+    pdfium::span<uint8_t> buffer =
+        pdfium::as_writable_bytes(result.GetBuffer(len * 2 + 2));
+#endif
     for (size_t i = 0; i < len; i++) {
-      buffer[i * 2] = m_pData->m_String[i] & 0xff;
-      buffer[i * 2 + 1] = m_pData->m_String[i] >> 8;
+#if defined(WCHAR_T_IS_32_BIT)
+      if (pdfium::IsSupplementary(m_pData->m_String[i])) {
+        pdfium::SurrogatePair pair(m_pData->m_String[i]);
+        buffer[output_length++] = pair.high() & 0xff;
+        buffer[output_length++] = pair.high() >> 8;
+        buffer[output_length++] = pair.low() & 0xff;
+        buffer[output_length++] = pair.low() >> 8;
+        continue;
+      }
+#endif  // defined(WCHAR_T_IS_32_BIT)
+      buffer[output_length++] = m_pData->m_String[i] & 0xff;
+      buffer[output_length++] = m_pData->m_String[i] >> 8;
     }
-    buffer[len * 2] = 0;
-    buffer[len * 2 + 1] = 0;
+    buffer[output_length++] = 0;
+    buffer[output_length++] = 0;
   }
-  result.ReleaseBuffer(len * 2 + 2);
+  result.ReleaseBuffer(output_length);
   return result;
 }
 
diff --git a/core/fxcrt/widestring_unittest.cpp b/core/fxcrt/widestring_unittest.cpp
index f60e11f..39d734b 100644
--- a/core/fxcrt/widestring_unittest.cpp
+++ b/core/fxcrt/widestring_unittest.cpp
@@ -1279,6 +1279,7 @@
       {L"abc\0def", ByteString("a\0b\0c\0\0\0", 8)},
       {L"\xaabb\xccdd", ByteString("\xbb\xaa\xdd\xcc\0\0", 6)},
       {L"\x3132\x6162", ByteString("\x32\x31\x62\x61\0\0", 6)},
+      {L"🎨", ByteString("\x3C\xD8\xA8\xDF\0\0", 6)},
   };
 
   for (size_t i = 0; i < std::size(utf16le_encode_cases); ++i) {
diff --git a/fpdfsdk/fpdf_text.cpp b/fpdfsdk/fpdf_text.cpp
index f1b193d..194a602 100644
--- a/fpdfsdk/fpdf_text.cpp
+++ b/fpdfsdk/fpdf_text.cpp
@@ -338,8 +338,6 @@
   if (str.GetLength() > static_cast<size_t>(char_count))
     str = str.First(static_cast<size_t>(char_count));
 
-  // UFT16LE_Encode doesn't handle surrogate pairs properly, so it is expected
-  // the number of items to stay the same.
   ByteString byte_str = str.ToUTF16LE();
   size_t byte_str_len = byte_str.GetLength();
   size_t ret_count = byte_str_len / kBytesPerCharacter;