Simplify PDF_DecodeText() and new helper functions

I kept the code as-is when moving it. Now that it's moved and untangled, we can simplify the code a bit.

Every modified function is supposed to have exactly the same behavior as it had before this change. The functions no longer write to a `uint16_t` temporary, so they no longer truncate characters to 16 bits, but the caller only passed in 16-bit data in both cases, so there is no effective behavior change.

(A future change will call StripLanguageCodes() with the output of a UTF-8 decoder, which can pass 32-bit characters. For that future use-case, the change is a progression.)

Bug: pdfium:2101
Change-Id: If56afbadb8282b042f4411a5d1930543c67a197b
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/113810
Reviewed-by: Lei Zhang <thestig@chromium.org>
Commit-Queue: Nico Weber <thakis@chromium.org>

commit: 9b8ac25af8da61c8a81e0162c39a85eed2cc38dc [log] [tgz]
author: Nico Weber <thakis@chromium.org> Thu Nov 23 02:05:21 2023 +0000
committer: Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com> Thu Nov 23 02:05:21 2023 +0000
tree: f0b4886b00609c2b3a9dcbe67f959f52213248cc
parent: 5adcad9d30f98aa4d53f6af087523034617b2394 [diff]
diff --git a/core/fpdfapi/parser/fpdf_parser_decode.cpp b/core/fpdfapi/parser/fpdf_parser_decode.cpp
index 78b81c1..47b5e43 100644
--- a/core/fpdfapi/parser/fpdf_parser_decode.cpp
+++ b/core/fpdfapi/parser/fpdf_parser_decode.cpp

@@ -32,14 +32,6 @@
 
 const uint32_t kMaxStreamSize = 20 * 1024 * 1024;
 
-uint16_t GetUnicodeFromBigEndianBytes(const uint8_t* bytes) {
-  return bytes[0] << 8 | bytes[1];
-}
-
-uint16_t GetUnicodeFromLittleEndianBytes(const uint8_t* bytes) {
-  return bytes[1] << 8 | bytes[0];
-}
-
 bool CheckFlateDecodeParams(int Colors, int BitsPerComponent, int Columns) {
   if (Colors < 0 || BitsPerComponent < 0 || Columns < 0)
     return false;
@@ -477,29 +469,15 @@
 #if defined(WCHAR_T_IS_32_BIT)
 static size_t FuseSurrogates(pdfium::span<wchar_t> s, size_t n) {
   size_t dest_pos = 0;
-  char16_t high_surrogate = 0;
   for (size_t i = 0; i < n; ++i) {
-    uint16_t unicode = s[i];
-
     // TODO(crbug.com/pdfium/2031): Always use UTF-16.
-    if (high_surrogate) {
-      char16_t previous_high_surrogate = high_surrogate;
-      high_surrogate = 0;
-      if (pdfium::IsLowSurrogate(unicode)) {
-        s[dest_pos++] = pdfium::SurrogatePair(previous_high_surrogate, unicode)
-                            .ToCodePoint();
-        continue;
-      }
-      s[dest_pos++] = previous_high_surrogate;
-    }
-    if (pdfium::IsHighSurrogate(unicode)) {
-      high_surrogate = unicode;
+    if (pdfium::IsHighSurrogate(s[i]) && i + 1 < n &&
+        pdfium::IsLowSurrogate(s[i + 1])) {
+      s[dest_pos++] = pdfium::SurrogatePair(s[i], s[i + 1]).ToCodePoint();
+      ++i;
       continue;
     }
-    s[dest_pos++] = unicode;
-  }
-  if (high_surrogate) {
-    s[dest_pos++] = high_surrogate;
+    s[dest_pos++] = s[i];
   }
   return dest_pos;
 }
@@ -508,28 +486,15 @@
 static size_t StripLanguageCodes(pdfium::span<wchar_t> s, size_t n) {
   size_t dest_pos = 0;
   for (size_t i = 0; i < n; ++i) {
-    uint16_t unicode = s[i];
-
     // 0x001B is a begin/end marker for language metadata region that
     // should not be in the decoded text.
-    if (unicode == 0x001B) {
-      ++i;
-      for (; i < n; ++i) {
-        unicode = s[i];
-        if (unicode == 0x001B) {
-          ++i;
-          if (i < n) {
-            unicode = s[i];
-          }
-          break;
-        }
+    if (s[i] == 0x001B) {
+      for (++i; i < n && s[i] != 0x001B; ++i) {
+        // No for-loop body. The loop searches for the terminating 0x001B.
       }
-      if (i >= n) {
-        break;
-      }
+      continue;
     }
-
-    s[dest_pos++] = unicode;
+    s[dest_pos++] = s[i];
   }
   return dest_pos;
 }
@@ -539,18 +504,15 @@
   WideString result;
   if (span.size() >= 2 && ((span[0] == 0xfe && span[1] == 0xff) ||
                            (span[0] == 0xff && span[1] == 0xfe))) {
-    size_t max_chars = (span.size() - 2) / 2;
-    if (!max_chars)
-      return result;
-
-    pdfium::span<wchar_t> dest_buf = result.GetBuffer(max_chars);
-    uint16_t (*GetUnicodeFromBytes)(const uint8_t*) =
-        span[0] == 0xfe ? GetUnicodeFromBigEndianBytes
-                        : GetUnicodeFromLittleEndianBytes;
-    const uint8_t* unicode_str = &span[2];
-
-    for (size_t i = 0; i < max_chars * 2; i += 2) {
-      dest_buf[dest_pos++] = GetUnicodeFromBytes(unicode_str + i);
+    pdfium::span<wchar_t> dest_buf = result.GetBuffer((span.size() - 2) / 2);
+    if (span[0] == 0xfe) {
+      for (size_t i = 2; i < span.size() - 1; i += 2) {
+        dest_buf[dest_pos++] = span[i] << 8 | span[i + 1];
+      }
+    } else {
+      for (size_t i = 2; i < span.size() - 1; i += 2) {
+        dest_buf[dest_pos++] = span[i + 1] << 8 | span[i];
+      }
     }
 
     dest_pos = StripLanguageCodes(dest_buf, dest_pos);
commit	9b8ac25af8da61c8a81e0162c39a85eed2cc38dc	[log] [tgz]
author	Nico Weber <thakis@chromium.org>	Thu Nov 23 02:05:21 2023 +0000
committer	Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com>	Thu Nov 23 02:05:21 2023 +0000
tree	f0b4886b00609c2b3a9dcbe67f959f52213248cc
parent	5adcad9d30f98aa4d53f6af087523034617b2394 [diff]