Ignore whitespace in ToUnicode codepoint strings

Whitespace characters shall be ignored in hexadecimal strings in ToUnicode CMaps. "Make CPDF_ToUnicodeMap::StringToCode() tolerate whitespaces" [0] allowed whitespace in character codes but not in code point strings. This change ignores any whitespace present in a code point string and simplifies how whitespace is ignored in character codes.

[0] https://pdfium.googlesource.com/pdfium/+/bf9170ee47d4f656964252898787152edc0345cd

Bug: 42270019
Change-Id: I13e8f2b5e495a732d11b20759bbcde99828880ea
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/133030
Reviewed-by: Ben Wagner <bungeman@google.com>
Reviewed-by: Lei Zhang <thestig@chromium.org>
Commit-Queue: Ben Wagner <bungeman@google.com>

commit: 096f61b5ce685efb4ed24af71240de24a0347e9c [log] [tgz]
author: Ben Wagner <bungeman@chromium.org> Wed Jun 11 11:29:11 2025 -0700
committer: Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com> Wed Jun 11 11:29:11 2025 -0700
tree: c20480e80144e60968411e8cdc51f9c74d946240
parent: 54b23007aecfc02c12d08479ce05ce88b8882f05 [diff]
diff --git a/core/fpdfapi/font/cpdf_tounicodemap.cpp b/core/fpdfapi/font/cpdf_tounicodemap.cpp
index a105486..f71d21c 100644
--- a/core/fpdfapi/font/cpdf_tounicodemap.cpp
+++ b/core/fpdfapi/font/cpdf_tounicodemap.cpp

@@ -91,28 +91,7 @@
 }
 
 // static
-std::optional<uint32_t> CPDF_ToUnicodeMap::StringToCode(ByteStringView input) {
-  // Ignore whitespaces within `input`. See https://crbug.com/pdfium/2065.
-  std::set<char> seen_whitespace_chars;
-  for (char c : input) {
-    if (PDFCharIsWhitespace(c)) {
-      seen_whitespace_chars.insert(c);
-    }
-  }
-  ByteString str_without_whitespace_chars;  // Must outlive `str`.
-  ByteStringView str;
-  if (seen_whitespace_chars.empty()) {
-    str = input;
-  } else {
-    str_without_whitespace_chars.Reserve(input.GetLength());
-    for (char c : input) {
-      if (!pdfium::Contains(seen_whitespace_chars, c)) {
-        str_without_whitespace_chars += c;
-      }
-    }
-    str = str_without_whitespace_chars.AsStringView();
-  }
-
+std::optional<uint32_t> CPDF_ToUnicodeMap::StringToCode(ByteStringView str) {
   size_t len = str.GetLength();
   if (len <= 2 || str[0] != '<' || str[len - 1] != '>') {
     return std::nullopt;
@@ -120,6 +99,10 @@
 
   FX_SAFE_UINT32 code = 0;
   for (char c : str.Substr(1, len - 2)) {
+    // Ignore whitespace https://crbug.com/pdfium/2065
+    if (PDFCharIsWhitespace(c)) {
+      continue;
+    }
     if (!FXSYS_IsHexDigit(c)) {
       return std::nullopt;
     }
@@ -143,6 +126,10 @@
   int byte_pos = 0;
   wchar_t ch = 0;
   for (char c : str.Substr(1, len - 2)) {
+    // Ignore whitespace https://crbug.com/pdfium/1022
+    if (PDFCharIsWhitespace(c)) {
+      continue;
+    }
     if (!FXSYS_IsHexDigit(c)) {
       break;
     }

diff --git a/core/fpdfapi/font/cpdf_tounicodemap.h b/core/fpdfapi/font/cpdf_tounicodemap.h
index 3a2c48e..ba73b82 100644
--- a/core/fpdfapi/font/cpdf_tounicodemap.h
+++ b/core/fpdfapi/font/cpdf_tounicodemap.h

@@ -34,7 +34,7 @@
   friend class CPDFToUnicodeMapTest_StringToCode_Test;
   friend class CPDFToUnicodeMapTest_StringToWideString_Test;
 
-  static std::optional<uint32_t> StringToCode(ByteStringView input);
+  static std::optional<uint32_t> StringToCode(ByteStringView str);
   static WideString StringToWideString(ByteStringView str);
 
   void Load(RetainPtr<const CPDF_Stream> pStream);

diff --git a/core/fpdfapi/font/cpdf_tounicodemap_unittest.cpp b/core/fpdfapi/font/cpdf_tounicodemap_unittest.cpp
index 3081ad4..ce04af5 100644
--- a/core/fpdfapi/font/cpdf_tounicodemap_unittest.cpp
+++ b/core/fpdfapi/font/cpdf_tounicodemap_unittest.cpp

@@ -52,11 +52,14 @@
   WideString res = L"\xc2ab";
   EXPECT_EQ(res, CPDF_ToUnicodeMap::StringToWideString("<c2ab>"));
   EXPECT_EQ(res, CPDF_ToUnicodeMap::StringToWideString("<c2abab>"));
-  EXPECT_EQ(res, CPDF_ToUnicodeMap::StringToWideString("<c2ab 1234>"));
 
   res += L"\xfaab";
   EXPECT_EQ(res, CPDF_ToUnicodeMap::StringToWideString("<c2abFaAb>"));
   EXPECT_EQ(res, CPDF_ToUnicodeMap::StringToWideString("<c2abFaAb12>"));
+  EXPECT_EQ(res, CPDF_ToUnicodeMap::StringToWideString("<c2ab FaAb>"));
+  EXPECT_EQ(res, CPDF_ToUnicodeMap::StringToWideString("<c2ab FaAb12>"));
+  EXPECT_EQ(res, CPDF_ToUnicodeMap::StringToWideString("<c2ab FaAb 12>"));
+  EXPECT_EQ(res, CPDF_ToUnicodeMap::StringToWideString("< c 2 a b  F a A b  1 2 >"));
 }
 
 TEST(CPDFToUnicodeMapTest, HandleBeginBFCharBadCount) {
commit	096f61b5ce685efb4ed24af71240de24a0347e9c	[log] [tgz]
author	Ben Wagner <bungeman@chromium.org>	Wed Jun 11 11:29:11 2025 -0700
committer	Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com>	Wed Jun 11 11:29:11 2025 -0700
tree	c20480e80144e60968411e8cdc51f9c74d946240
parent	54b23007aecfc02c12d08479ce05ce88b8882f05 [diff]