Extract language code stripping from PDF_DecodeText() into function

For unicode text, strings can contain 0x001b (for UTF-16) or 0x1b (for
UTF-8) followed by a 2-byte BCP 47 language code (two ascii bytes),
optionally followed by a 2-byte ISO 3166 country code (another two ascii
bytes), terminated by another 0x001b / 0x1b.

These can be used to put different translations of the same text into
the same string.  But we currently just strip out these language codes.

Since the language and country codes are ascii and their byte lengths
are multiples of two, it's ok to strip them after doing UTF-16 / UTF-8
conversion.

So extract this code into a separate function.
(I think the function can be simplified a bit now, but in this CL I'd
like to try and do a pure code move.)

This modifies code added in
https://pdfium-review.googlesource.com/c/pdfium/+/41070

(After this it's hopefully easy to add support for UTF-8 text strings,
which is my actual goal.)

No intended behavior change.

Change-Id: I5e25a26a8f30308ee6ca377f17e82850f2d43274
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/113790
Reviewed-by: Lei Zhang <thestig@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
Auto-Submit: Nico Weber <thakis@chromium.org>
diff --git a/core/fpdfapi/parser/fpdf_parser_decode.cpp b/core/fpdfapi/parser/fpdf_parser_decode.cpp
index cb2a48e..78b81c1 100644
--- a/core/fpdfapi/parser/fpdf_parser_decode.cpp
+++ b/core/fpdfapi/parser/fpdf_parser_decode.cpp
@@ -475,7 +475,7 @@
 }
 
 #if defined(WCHAR_T_IS_32_BIT)
-static size_t FuseSurrogates(pdfium::span<wchar_t>& s, size_t n) {
+static size_t FuseSurrogates(pdfium::span<wchar_t> s, size_t n) {
   size_t dest_pos = 0;
   char16_t high_surrogate = 0;
   for (size_t i = 0; i < n; ++i) {
@@ -505,6 +505,35 @@
 }
 #endif  // defined(WCHAR_T_IS_UTF32)
 
+static size_t StripLanguageCodes(pdfium::span<wchar_t> s, size_t n) {
+  size_t dest_pos = 0;
+  for (size_t i = 0; i < n; ++i) {
+    uint16_t unicode = s[i];
+
+    // 0x001B is a begin/end marker for language metadata region that
+    // should not be in the decoded text.
+    if (unicode == 0x001B) {
+      ++i;
+      for (; i < n; ++i) {
+        unicode = s[i];
+        if (unicode == 0x001B) {
+          ++i;
+          if (i < n) {
+            unicode = s[i];
+          }
+          break;
+        }
+      }
+      if (i >= n) {
+        break;
+      }
+    }
+
+    s[dest_pos++] = unicode;
+  }
+  return dest_pos;
+}
+
 WideString PDF_DecodeText(pdfium::span<const uint8_t> span) {
   size_t dest_pos = 0;
   WideString result;
@@ -521,28 +550,11 @@
     const uint8_t* unicode_str = &span[2];
 
     for (size_t i = 0; i < max_chars * 2; i += 2) {
-      uint16_t unicode = GetUnicodeFromBytes(unicode_str + i);
-
-      // 0x001B is a begin/end marker for language metadata region that
-      // should not be in the decoded text.
-      if (unicode == 0x001B) {
-        i += 2;
-        for (; i < max_chars * 2; i += 2) {
-          unicode = GetUnicodeFromBytes(unicode_str + i);
-          if (unicode == 0x001B) {
-            i += 2;
-            if (i < max_chars * 2)
-              unicode = GetUnicodeFromBytes(unicode_str + i);
-            break;
-          }
-        }
-        if (i >= max_chars * 2)
-          break;
-      }
-
-      dest_buf[dest_pos++] = unicode;
+      dest_buf[dest_pos++] = GetUnicodeFromBytes(unicode_str + i);
     }
 
+    dest_pos = StripLanguageCodes(dest_buf, dest_pos);
+
 #if defined(WCHAR_T_IS_32_BIT)
     dest_pos = FuseSurrogates(dest_buf, dest_pos);
 #endif