Extract language code stripping from PDF_DecodeText() into function For Unicode text, strings can contain 0x001b (for UTF-16) or 0x1b (for UTF-8) followed by a 2-byte BCP 47 language code (two ASCII bytes), optionally followed by a 2-byte ISO 3166 country code (another two ASCII bytes), terminated by another 0x001b / 0x1b. These can be used to put different translations of the same text into the same string. But we currently just strip out these language codes. Since the language and country codes are ASCII and they're a multiple of two, it's ok to strip them after doing UTF-16 / UTF-8 conversion. So extract this code into a separate function. (I think the function can be simplified a bit now, but in this CL I'd like to try and do a pure code move.) This modifies code added in https://pdfium-review.googlesource.com/c/pdfium/+/41070 (After this it's hopefully easy to add support for UTF-8 text strings, which is my actual goal.) No intended behavior change. Change-Id: I5e25a26a8f30308ee6ca377f17e82850f2d43274 Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/113790 Reviewed-by: Lei Zhang <thestig@chromium.org> Commit-Queue: Lei Zhang <thestig@chromium.org> Auto-Submit: Nico Weber <thakis@chromium.org>

commit: 1e9d89db3c00fd1eab2959bd063832bebe6b868d [log] [tgz]
author: Nico Weber <thakis@chromium.org> Wed Nov 22 06:11:15 2023 +0000
committer: Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com> Wed Nov 22 06:11:15 2023 +0000
tree: 2f5c96180737cd8388754f0a1b059c49ddf9c68b
parent: 445b54a7346277d3aa270c3b3c4e049dd9cf14d8 [diff]
diff --git a/core/fpdfapi/parser/fpdf_parser_decode.cpp b/core/fpdfapi/parser/fpdf_parser_decode.cpp
index cb2a48e..78b81c1 100644
--- a/core/fpdfapi/parser/fpdf_parser_decode.cpp
+++ b/core/fpdfapi/parser/fpdf_parser_decode.cpp

@@ -475,7 +475,7 @@
 }
 
 #if defined(WCHAR_T_IS_32_BIT)
-static size_t FuseSurrogates(pdfium::span<wchar_t>& s, size_t n) {
+static size_t FuseSurrogates(pdfium::span<wchar_t> s, size_t n) {
   size_t dest_pos = 0;
   char16_t high_surrogate = 0;
   for (size_t i = 0; i < n; ++i) {
@@ -505,6 +505,35 @@
 }
 #endif  // defined(WCHAR_T_IS_UTF32)
 
+static size_t StripLanguageCodes(pdfium::span<wchar_t> s, size_t n) {
+  size_t dest_pos = 0;
+  for (size_t i = 0; i < n; ++i) {
+    uint16_t unicode = s[i];
+
+    // 0x001B is a begin/end marker for language metadata region that
+    // should not be in the decoded text.
+    if (unicode == 0x001B) {
+      ++i;
+      for (; i < n; ++i) {
+        unicode = s[i];
+        if (unicode == 0x001B) {
+          ++i;
+          if (i < n) {
+            unicode = s[i];
+          }
+          break;
+        }
+      }
+      if (i >= n) {
+        break;
+      }
+    }
+
+    s[dest_pos++] = unicode;
+  }
+  return dest_pos;
+}
+
 WideString PDF_DecodeText(pdfium::span<const uint8_t> span) {
   size_t dest_pos = 0;
   WideString result;
@@ -521,28 +550,11 @@
     const uint8_t* unicode_str = &span[2];
 
     for (size_t i = 0; i < max_chars * 2; i += 2) {
-      uint16_t unicode = GetUnicodeFromBytes(unicode_str + i);
-
-      // 0x001B is a begin/end marker for language metadata region that
-      // should not be in the decoded text.
-      if (unicode == 0x001B) {
-        i += 2;
-        for (; i < max_chars * 2; i += 2) {
-          unicode = GetUnicodeFromBytes(unicode_str + i);
-          if (unicode == 0x001B) {
-            i += 2;
-            if (i < max_chars * 2)
-              unicode = GetUnicodeFromBytes(unicode_str + i);
-            break;
-          }
-        }
-        if (i >= max_chars * 2)
-          break;
-      }
-
-      dest_buf[dest_pos++] = unicode;
+      dest_buf[dest_pos++] = GetUnicodeFromBytes(unicode_str + i);
     }
 
+    dest_pos = StripLanguageCodes(dest_buf, dest_pos);
+
 #if defined(WCHAR_T_IS_32_BIT)
     dest_pos = FuseSurrogates(dest_buf, dest_pos);
 #endif
commit	1e9d89db3c00fd1eab2959bd063832bebe6b868d	[log] [tgz]
author	Nico Weber <thakis@chromium.org>	Wed Nov 22 06:11:15 2023 +0000
committer	Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com>	Wed Nov 22 06:11:15 2023 +0000
tree	2f5c96180737cd8388754f0a1b059c49ddf9c68b
parent	445b54a7346277d3aa270c3b3c4e049dd9cf14d8 [diff]