Fix text string handling on UTF-32 platforms. Fixes PDF_DecodeText() and PDF_EncodeText() to handle supplementary characters correctly on UTF-32 platforms. Supplementary characters require 2 code units to represent in UTF-16, but only 1 in UTF-32. Fixed: pdfium:2029 Change-Id: I985cea3a3d5f79d7cf544904f78d6d47881f4331 Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/107831 Reviewed-by: Tom Sepez <tsepez@chromium.org> Commit-Queue: K. Moon <kmoon@chromium.org>

commit: 4a5e28a78c2dda8033481a0b351953dceb8116fb [log] [tgz]
author: K. Moon <kmoon@chromium.org> Fri May 19 00:15:16 2023 +0000
committer: Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com> Fri May 19 00:15:16 2023 +0000
tree: 16649596cb97348e2a7ef74336d58ce0c663b32f
parent: c4e6b7d3e039b9a8085b231be6efda7e6a33be31 [diff]
diff --git a/core/fpdfapi/parser/fpdf_parser_decode.cpp b/core/fpdfapi/parser/fpdf_parser_decode.cpp
index 6f29b7a..1fbee03 100644
--- a/core/fpdfapi/parser/fpdf_parser_decode.cpp
+++ b/core/fpdfapi/parser/fpdf_parser_decode.cpp

@@ -8,10 +8,12 @@
 
 #include <ctype.h>
 #include <limits.h>
+#include <stddef.h>
 
 #include <algorithm>
 #include <utility>
 
+#include "build/build_config.h"
 #include "constants/stream_dict_common.h"
 #include "core/fpdfapi/parser/cpdf_array.h"
 #include "core/fpdfapi/parser/cpdf_dictionary.h"
@@ -22,6 +24,7 @@
 #include "core/fxcrt/fx_extension.h"
 #include "core/fxcrt/fx_safe_types.h"
 #include "core/fxcrt/span_util.h"
+#include "core/fxcrt/utf16.h"
 #include "third_party/base/check.h"
 #include "third_party/base/containers/contains.h"
 
@@ -472,7 +475,7 @@
 }
 
 WideString PDF_DecodeText(pdfium::span<const uint8_t> span) {
-  int dest_pos = 0;
+  size_t dest_pos = 0;
   WideString result;
   if (span.size() >= 2 && ((span[0] == 0xfe && span[1] == 0xff) ||
                            (span[0] == 0xff && span[1] == 0xfe))) {
@@ -485,6 +488,10 @@
         span[0] == 0xfe ? GetUnicodeFromBigEndianBytes
                         : GetUnicodeFromLittleEndianBytes;
     const uint8_t* unicode_str = &span[2];
+
+#if defined(WCHAR_T_IS_UTF32)
+    char16_t high_surrogate = 0;
+#endif  // defined(WCHAR_T_IS_UTF32)
     for (size_t i = 0; i < max_chars * 2; i += 2) {
       uint16_t unicode = GetUnicodeFromBytes(unicode_str + i);
 
@@ -505,8 +512,34 @@
           break;
       }
 
+#if defined(WCHAR_T_IS_UTF32)
+      // TODO(crbug.com/pdfium/2031): Always use UTF-16.
+      if (high_surrogate) {
+        char16_t previous_high_surrogate = high_surrogate;
+        high_surrogate = 0;
+
+        if (pdfium::IsLowSurrogate(unicode)) {
+          dest_buf[dest_pos++] =
+              pdfium::SurrogatePair(previous_high_surrogate, unicode)
+                  .ToCodePoint();
+          continue;
+        }
+        dest_buf[dest_pos++] = previous_high_surrogate;
+      }
+
+      if (pdfium::IsHighSurrogate(unicode)) {
+        high_surrogate = unicode;
+        continue;
+      }
+#endif  // defined(WCHAR_T_IS_UTF32)
       dest_buf[dest_pos++] = unicode;
     }
+
+#if defined(WCHAR_T_IS_UTF32)
+    if (high_surrogate) {
+      dest_buf[dest_pos++] = high_surrogate;
+    }
+#endif  // defined(WCHAR_T_IS_UTF32)
   } else {
     pdfium::span<wchar_t> dest_buf = result.GetBuffer(span.size());
     for (size_t i = 0; i < span.size(); ++i)
@@ -545,18 +578,35 @@
   }
 
   size_t dest_index = 0;
-  size_t encLen = len * 2 + 2;
   {
+#if defined(WCHAR_T_IS_UTF32)
+    // 2 or 4 bytes required per UTF-32 code unit.
     pdfium::span<uint8_t> dest_buf =
-        pdfium::as_writable_bytes(result.GetBuffer(encLen));
+        pdfium::as_writable_bytes(result.GetBuffer(len * 4 + 2));
+#else
+    // 2 bytes required per UTF-16 code unit.
+    pdfium::span<uint8_t> dest_buf =
+        pdfium::as_writable_bytes(result.GetBuffer(len * 2 + 2));
+#endif  // defined(WCHAR_T_IS_UTF32)
+
     dest_buf[dest_index++] = 0xfe;
     dest_buf[dest_index++] = 0xff;
     for (size_t j = 0; j < len; ++j) {
+#if defined(WCHAR_T_IS_UTF32)
+      if (pdfium::IsSupplementary(str[j])) {
+        pdfium::SurrogatePair pair(str[j]);
+        dest_buf[dest_index++] = pair.high() >> 8;
+        dest_buf[dest_index++] = static_cast<uint8_t>(pair.high());
+        dest_buf[dest_index++] = pair.low() >> 8;
+        dest_buf[dest_index++] = static_cast<uint8_t>(pair.low());
+        continue;
+      }
+#endif  // defined(WCHAR_T_IS_UTF32)
       dest_buf[dest_index++] = str[j] >> 8;
       dest_buf[dest_index++] = static_cast<uint8_t>(str[j]);
     }
   }
-  result.ReleaseBuffer(encLen);
+  result.ReleaseBuffer(dest_index);
   return result;
 }
 

diff --git a/core/fpdfapi/parser/fpdf_parser_decode_unittest.cpp b/core/fpdfapi/parser/fpdf_parser_decode_unittest.cpp
index f812097..ad0416e 100644
--- a/core/fpdfapi/parser/fpdf_parser_decode_unittest.cpp
+++ b/core/fpdfapi/parser/fpdf_parser_decode_unittest.cpp

@@ -419,6 +419,9 @@
       PDF_DecodeText(
           ToSpan("\xFE\xFF\x7F\x51\x98\x75\x00\x20\x56\xFE\x72\x47\x00"
                  "\x20\x8D\x44\x8B\xAF\x66\xF4\x59\x1A\x00\x20\x00\xBB")));
+
+  // Supplementary Unicode text.
+  EXPECT_EQ(L"🎨", PDF_DecodeText(ToSpan("\xFE\xFF\xD8\x3C\xDF\xA8")));
 }
 
 // https://crbug.com/pdfium/182
@@ -450,6 +453,12 @@
 TEST(ParserDecodeTest, DecodeTextWithUnpairedSurrogates) {
   EXPECT_EQ(L"\xD800", PDF_DecodeText(ToSpan("\xFE\xFF\xD8\x00"))) << "High";
   EXPECT_EQ(L"\xDC00", PDF_DecodeText(ToSpan("\xFE\xFF\xDC\x00"))) << "Low";
+  EXPECT_EQ(L"\xD800🎨",
+            PDF_DecodeText(ToSpan("\xFE\xFF\xD8\x00\xD8\x3C\xDF\xA8")))
+      << "High-high";
+  EXPECT_EQ(L"🎨\xDC00",
+            PDF_DecodeText(ToSpan("\xFE\xFF\xD8\x3C\xDF\xA8\xDC\x00")))
+      << "Low-low";
 }
 
 TEST(ParserDecodeTest, EncodeText) {
@@ -468,6 +477,9 @@
                    "\x20\x8D\x44\x8B\xAF\x66\xF4\x59\x1A\x00\x20\x00\xBB"),
       PDF_EncodeText(L"\x7F51\x9875\x0020\x56FE\x7247\x0020"
                      L"\x8D44\x8BAF\x66F4\x591A\x0020\x00BB"));
+
+  // Supplementary Unicode text.
+  EXPECT_EQ("\xFE\xFF\xD8\x3C\xDF\xA8", PDF_EncodeText(L"🎨"));
 }
 
 TEST(ParserDecodeTest, RoundTripText) {

diff --git a/core/fpdfdoc/cpdf_annotlist_unittest.cpp b/core/fpdfdoc/cpdf_annotlist_unittest.cpp
index 5d7e1e2..653fdff 100644
--- a/core/fpdfdoc/cpdf_annotlist_unittest.cpp
+++ b/core/fpdfdoc/cpdf_annotlist_unittest.cpp

@@ -86,7 +86,7 @@
 TEST_F(CPDFAnnotListTest, CreatePopupAnnotFromUnicode) {
   const ByteString kContents =
       MakeByteString({0xFE, 0xFF, 0x00, 'A', 0x00, 'a', 0x00, 0xE4, 0x20, 0xAC,
-                      0xD8, 0x3D, 0xDC, 0xC4});
+                      0xD8, 0x3C, 0xDF, 0xA8});
   AddTextAnnotation(kContents);
 
   CPDF_AnnotList list(page_);
@@ -94,9 +94,7 @@
   ASSERT_EQ(2u, list.Count());
   EXPECT_EQ(kContents, GetRawContents(list.GetAt(1)));
 
-  // TODO(crbug.com/pdfium/2029): `WideString::FromUTF8()` mishandles '📄'.
-  EXPECT_EQ(WideString::FromUTF8("Aaä€\xED\xA0\xBD\xED\xB3\x84"),
-            GetDecodedContents(list.GetAt(1)));
+  EXPECT_EQ(WideString::FromUTF8("Aaä€🎨"), GetDecodedContents(list.GetAt(1)));
 }
 
 TEST_F(CPDFAnnotListTest, CreatePopupAnnotFromEmptyPdfEncoded) {
commit	4a5e28a78c2dda8033481a0b351953dceb8116fb	[log] [tgz]
author	K. Moon <kmoon@chromium.org>	Fri May 19 00:15:16 2023 +0000
committer	Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com>	Fri May 19 00:15:16 2023 +0000
tree	16649596cb97348e2a7ef74336d58ce0c663b32f
parent	c4e6b7d3e039b9a8085b231be6efda7e6a33be31 [diff]