Make CPDF_ToUnicodeMap::StringToCode() tolerate whitespaces

Currently, CPDF_ToUnicodeMap::StringToCode() expects the stringified code to be something like "<0020>". Some PDF generators unintuitively insert whitespace characters inside the string, e.g. "<0020\r>". StringToCode() should tolerate cases like this and ignore the whitespace characters, so that it can correctly parse more /ToUnicode maps from the wild.

Bug: pdfium:1455,pdfium:2065
Change-Id: I1f06ade8b152bc97bd76038b6f540cc01195b8a4
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/109950
Commit-Queue: Lei Zhang <thestig@chromium.org>
Reviewed-by: Nigi <nigi@chromium.org>

commit: bf9170ee47d4f656964252898787152edc0345cd [log] [tgz]
author: Lei Zhang <thestig@chromium.org> Wed Jul 19 17:15:31 2023 +0000
committer: Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com> Wed Jul 19 17:15:31 2023 +0000
tree: d6975bab09a0eb593438ef30d892707e7d2916cb
parent: 03e309bf05ea10fedb15c24d546351a2a754e262 [diff]
diff --git a/core/fpdfapi/font/cpdf_tounicodemap.cpp b/core/fpdfapi/font/cpdf_tounicodemap.cpp
index 3c6e73e..2e354a2 100644
--- a/core/fpdfapi/font/cpdf_tounicodemap.cpp
+++ b/core/fpdfapi/font/cpdf_tounicodemap.cpp

@@ -13,6 +13,7 @@
 #include "core/fpdfapi/font/cpdf_fontglobals.h"
 #include "core/fpdfapi/parser/cpdf_simple_parser.h"
 #include "core/fpdfapi/parser/cpdf_stream.h"
+#include "core/fpdfapi/parser/fpdf_parser_utility.h"
 #include "core/fxcrt/fx_extension.h"
 #include "core/fxcrt/fx_safe_types.h"
 #include "third_party/base/containers/contains.h"
@@ -78,7 +79,28 @@
 }
 
 // static
-absl::optional<uint32_t> CPDF_ToUnicodeMap::StringToCode(ByteStringView str) {
+absl::optional<uint32_t> CPDF_ToUnicodeMap::StringToCode(ByteStringView input) {
+  // Ignore whitespaces within `input`. See https://crbug.com/pdfium/2065.
+  std::set<char> seen_whitespace_chars;
+  for (char c : input) {
+    if (PDFCharIsWhitespace(c)) {
+      seen_whitespace_chars.insert(c);
+    }
+  }
+  ByteString str_without_whitespace_chars;  // Must outlive `str`.
+  ByteStringView str;
+  if (seen_whitespace_chars.empty()) {
+    str = input;
+  } else {
+    str_without_whitespace_chars.Reserve(input.GetLength());
+    for (char c : input) {
+      if (!pdfium::Contains(seen_whitespace_chars, c)) {
+        str_without_whitespace_chars += c;
+      }
+    }
+    str = str_without_whitespace_chars.AsStringView();
+  }
+
   size_t len = str.GetLength();
   if (len <= 2 || str[0] != '<' || str[len - 1] != '>')
     return absl::nullopt;

diff --git a/core/fpdfapi/font/cpdf_tounicodemap.h b/core/fpdfapi/font/cpdf_tounicodemap.h
index a074f45..7f4ba03 100644
--- a/core/fpdfapi/font/cpdf_tounicodemap.h
+++ b/core/fpdfapi/font/cpdf_tounicodemap.h

@@ -34,7 +34,7 @@
   friend class cpdf_tounicodemap_StringToCode_Test;
   friend class cpdf_tounicodemap_StringToWideString_Test;
 
-  static absl::optional<uint32_t> StringToCode(ByteStringView str);
+  static absl::optional<uint32_t> StringToCode(ByteStringView input);
   static WideString StringToWideString(ByteStringView str);
 
   void Load(RetainPtr<const CPDF_Stream> pStream);

diff --git a/core/fpdfapi/font/cpdf_tounicodemap_unittest.cpp b/core/fpdfapi/font/cpdf_tounicodemap_unittest.cpp
index 86ac2b1..7de56f8 100644
--- a/core/fpdfapi/font/cpdf_tounicodemap_unittest.cpp
+++ b/core/fpdfapi/font/cpdf_tounicodemap_unittest.cpp

@@ -19,6 +19,14 @@
   EXPECT_THAT(CPDF_ToUnicodeMap::StringToCode("<FFFFFFFF>"),
               testing::Optional(4294967295u));
 
+  // Whitespaces within the string are ignored.
+  EXPECT_THAT(CPDF_ToUnicodeMap::StringToCode("<00\n0\r1>"),
+              testing::Optional(1u));
+  EXPECT_THAT(CPDF_ToUnicodeMap::StringToCode("<c 2>"),
+              testing::Optional(194u));
+  EXPECT_THAT(CPDF_ToUnicodeMap::StringToCode("<A2\r\n>"),
+              testing::Optional(162u));
+
   // Integer overflow
   EXPECT_FALSE(CPDF_ToUnicodeMap::StringToCode("<100000000>").has_value());
   EXPECT_FALSE(CPDF_ToUnicodeMap::StringToCode("<1abcdFFFF>").has_value());
commit	bf9170ee47d4f656964252898787152edc0345cd	[log] [tgz]
author	Lei Zhang <thestig@chromium.org>	Wed Jul 19 17:15:31 2023 +0000
committer	Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com>	Wed Jul 19 17:15:31 2023 +0000
tree	d6975bab09a0eb593438ef30d892707e7d2916cb
parent	03e309bf05ea10fedb15c24d546351a2a754e262 [diff]