Do more validation in CPDF_ToUnicodeMap

For the CPDF_ToUnicodeMap code that parses bfchar and bfrange sections,
do more validation to make sure:

- The counts are integers.
- The counts are not negatives.
- The counts are less than or equal to 100, per spec.
- The actual count matches the specified count.
- The code value array has a matching pair of brackets.

If parsing fails, consume all the words in the parser until the end, or
until an endbfchar / endbfrange keyword.

Add some test cases for inputs with bad bounds to demonstrate the above
code works. Update other test cases to resolve TODOs for incorrect test
expectations.

Then to make all other existing tests pass:

- Update testing/resources/pixel/bug_909762.in to have the right count.
- Update fpdf_edittext.cpp code that generate cmaps to do so in the
  spec-complaint way.

Bug: 41489089
Change-Id: If2de527b0e8454e7b12c7803d0ea97ffab536c9a
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/126013
Reviewed-by: Thomas Sepez <tsepez@google.com>
Commit-Queue: Lei Zhang <thestig@chromium.org>
Reviewed-by: Tom Sepez <tsepez@chromium.org>
diff --git a/core/fpdfapi/font/cpdf_tounicodemap.cpp b/core/fpdfapi/font/cpdf_tounicodemap.cpp
index 7cbf881..186f9ec 100644
--- a/core/fpdfapi/font/cpdf_tounicodemap.cpp
+++ b/core/fpdfapi/font/cpdf_tounicodemap.cpp
@@ -17,6 +17,7 @@
 #include "core/fxcrt/containers/contains.h"
 #include "core/fxcrt/fx_extension.h"
 #include "core/fxcrt/fx_safe_types.h"
+#include "third_party/abseil-cpp/absl/types/variant.h"
 
 namespace {
 
@@ -147,94 +148,216 @@
   auto pAcc = pdfium::MakeRetain<CPDF_StreamAcc>(std::move(pStream));
   pAcc->LoadAllDataFiltered();
   CPDF_SimpleParser parser(pAcc->GetSpan());
+  ByteStringView previous_word;
   while (true) {
     ByteStringView word = parser.GetWord();
-    if (word.IsEmpty())
+    if (word.IsEmpty()) {
       break;
+    }
 
-    if (word == "beginbfchar")
-      HandleBeginBFChar(&parser);
-    else if (word == "beginbfrange")
-      HandleBeginBFRange(&parser);
-    else if (word == "/Adobe-Korea1-UCS2")
+    if (word == "beginbfchar") {
+      word = HandleBeginBFChar(parser, previous_word);
+    } else if (word == "beginbfrange") {
+      word = HandleBeginBFRange(parser, previous_word);
+    } else if (word == "/Adobe-Korea1-UCS2") {
       cid_set = CIDSET_KOREA1;
-    else if (word == "/Adobe-Japan1-UCS2")
+    } else if (word == "/Adobe-Japan1-UCS2") {
       cid_set = CIDSET_JAPAN1;
-    else if (word == "/Adobe-CNS1-UCS2")
+    } else if (word == "/Adobe-CNS1-UCS2") {
       cid_set = CIDSET_CNS1;
-    else if (word == "/Adobe-GB1-UCS2")
+    } else if (word == "/Adobe-GB1-UCS2") {
       cid_set = CIDSET_GB1;
+    }
+
+    previous_word = word;
   }
   if (cid_set != CIDSET_UNKNOWN) {
     m_pBaseMap = CPDF_FontGlobals::GetInstance()->GetCID2UnicodeMap(cid_set);
   }
 }
 
-void CPDF_ToUnicodeMap::HandleBeginBFChar(CPDF_SimpleParser* pParser) {
+ByteStringView CPDF_ToUnicodeMap::HandleBeginBFChar(
+    CPDF_SimpleParser& parser,
+    ByteStringView previous_word) {
+  struct CodeWord {
+    uint32_t code;
+    ByteStringView word;
+  };
+  std::vector<CodeWord> code_words;
+
+  const int raw_count = StringToInt(previous_word);
+  bool is_valid = raw_count >= 0 && raw_count <= 100;
+  const size_t expected_count = is_valid ? static_cast<size_t>(raw_count) : 0;
+  code_words.reserve(expected_count);
+
+  ByteStringView word;
   while (true) {
-    ByteStringView word = pParser->GetWord();
-    if (word.IsEmpty() || word == "endbfchar")
-      return;
+    word = parser.GetWord();
+    if (word.IsEmpty() || word == "endbfchar") {
+      break;
+    }
+    if (!is_valid) {
+      continue;  // Keep consuming words. Do nothing else.
+    }
 
     std::optional<uint32_t> code = StringToCode(word);
     if (!code.has_value() || code.value() > kCidLimit) {
-      return;
+      is_valid = false;
+      continue;
     }
 
-    SetCode(code.value(), StringToWideString(pParser->GetWord()));
+    word = parser.GetWord();
+    code_words.emplace_back(code.value(), word);
+
+    if (code_words.size() > expected_count) {
+      is_valid = false;
+    }
   }
+
+  if (is_valid && code_words.size() == expected_count) {
+    for (const auto& entry : code_words) {
+      SetCode(entry.code, StringToWideString(entry.word));
+    }
+  }
+  return word;
 }
 
-void CPDF_ToUnicodeMap::HandleBeginBFRange(CPDF_SimpleParser* pParser) {
+ByteStringView CPDF_ToUnicodeMap::HandleBeginBFRange(
+    CPDF_SimpleParser& parser,
+    ByteStringView previous_word) {
+  struct CodeWordRange {
+    uint32_t low_code;
+    std::vector<ByteStringView> code_words;
+  };
+  struct MultimapSingleDestRange {
+    uint32_t low_code;
+    uint32_t high_code;
+    uint32_t start_value;
+  };
+  struct MultimapMultiDestRange {
+    uint32_t low_code;
+    std::vector<WideString> retcodes;
+  };
+  using Range = absl::variant<CodeWordRange, MultimapSingleDestRange,
+                              MultimapMultiDestRange>;
+  std::vector<Range> ranges;
+
+  const int raw_count = StringToInt(previous_word);
+  bool is_valid = raw_count >= 0 && raw_count <= 100;
+  const size_t expected_count = is_valid ? static_cast<size_t>(raw_count) : 0;
+  ranges.reserve(expected_count);
+
+  ByteStringView word;
   while (true) {
-    ByteStringView lowcode_str = pParser->GetWord();
-    if (lowcode_str.IsEmpty() || lowcode_str == "endbfrange")
-      return;
+    word = parser.GetWord();
+    if (word.IsEmpty() || word == "endbfrange") {
+      break;
+    }
+    if (!is_valid) {
+      continue;  // Keep consuming words. Do nothing else.
+    }
 
-    std::optional<uint32_t> lowcode_opt = StringToCode(lowcode_str);
-    if (!lowcode_opt.has_value())
-      return;
+    std::optional<uint32_t> lowcode_opt = StringToCode(word);
+    if (!lowcode_opt.has_value()) {
+      is_valid = false;
+      continue;
+    }
 
-    ByteStringView highcode_str = pParser->GetWord();
-    std::optional<uint32_t> highcode_opt = StringToCode(highcode_str);
-    if (!highcode_opt.has_value())
-      return;
+    word = parser.GetWord();
+    std::optional<uint32_t> highcode_opt = StringToCode(word);
+    if (!highcode_opt.has_value()) {
+      is_valid = false;
+      continue;
+    }
 
     uint32_t lowcode = lowcode_opt.value();
     uint32_t highcode = (lowcode & 0xffffff00) | (highcode_opt.value() & 0xff);
     if (lowcode > kCidLimit || highcode > kCidLimit || lowcode > highcode) {
-      return;
+      is_valid = false;
+      continue;
     }
 
-    ByteStringView start = pParser->GetWord();
+    word = parser.GetWord();
+    ByteStringView start = word;
     if (start == "[") {
+      CodeWordRange range;
+      range.low_code = lowcode;
+      range.code_words.reserve(1 + highcode - lowcode);
       for (uint32_t code = lowcode; code <= highcode; ++code) {
-        SetCode(code, StringToWideString(pParser->GetWord()));
+        word = parser.GetWord();
+        range.code_words.push_back(word);
       }
-      pParser->GetWord();
+      ranges.push_back(std::move(range));
+
+      if (ranges.size() > expected_count) {
+        is_valid = false;
+        continue;
+      }
+
+      word = parser.GetWord();
+      if (word != "]") {
+        is_valid = false;
+      }
       continue;
     }
 
     WideString destcode = StringToWideString(start);
     if (destcode.GetLength() == 1) {
       std::optional<uint32_t> value_or_error = StringToCode(start);
-      if (!value_or_error.has_value())
-        return;
-
-      uint32_t value = value_or_error.value();
-      for (uint32_t code = lowcode; code <= highcode; ++code) {
-        InsertIntoMultimap(code, value++);
+      if (!value_or_error.has_value()) {
+        is_valid = false;
+        continue;
       }
+
+      ranges.push_back(
+          MultimapSingleDestRange{.low_code = lowcode,
+                                  .high_code = highcode,
+                                  .start_value = value_or_error.value()});
     } else {
-      for (uint32_t code = lowcode; code <= highcode; ++code) {
-        WideString retcode =
-            code == lowcode ? destcode : StringDataAdd(destcode);
-        InsertIntoMultimap(code, GetMultiCharIndexIndicator());
-        m_MultiCharVec.push_back(retcode);
-        destcode = std::move(retcode);
+      MultimapMultiDestRange range;
+      range.low_code = lowcode;
+      range.retcodes.reserve(1 + highcode - lowcode);
+      range.retcodes.push_back(destcode);
+      for (uint32_t code = lowcode + 1; code <= highcode; ++code) {
+        WideString retcode = StringDataAdd(range.retcodes.back());
+        range.retcodes.push_back(std::move(retcode));
+      }
+      ranges.push_back(std::move(range));
+    }
+
+    if (ranges.size() > expected_count) {
+      is_valid = false;
+    }
+  }
+
+  if (is_valid && ranges.size() == expected_count) {
+    for (const auto& entry : ranges) {
+      if (absl::holds_alternative<CodeWordRange>(entry)) {
+        const auto& range = absl::get<CodeWordRange>(entry);
+        uint32_t code = range.low_code;
+        for (const auto& code_word : range.code_words) {
+          SetCode(code, StringToWideString(code_word));
+          ++code;
+        }
+      } else if (absl::holds_alternative<MultimapSingleDestRange>(entry)) {
+        const auto& range = absl::get<MultimapSingleDestRange>(entry);
+        uint32_t value = range.start_value;
+        for (uint32_t code = range.low_code; code <= range.high_code; ++code) {
+          InsertIntoMultimap(code, value++);
+        }
+      } else {
+        CHECK(absl::holds_alternative<MultimapMultiDestRange>(entry));
+        const auto& range = absl::get<MultimapMultiDestRange>(entry);
+        uint32_t code = range.low_code;
+        for (const auto& retcode : range.retcodes) {
+          InsertIntoMultimap(code, GetMultiCharIndexIndicator());
+          m_MultiCharVec.push_back(retcode);
+          ++code;
+        }
       }
     }
   }
+  return word;
 }
 
 uint32_t CPDF_ToUnicodeMap::GetMultiCharIndexIndicator() const {
diff --git a/core/fpdfapi/font/cpdf_tounicodemap.h b/core/fpdfapi/font/cpdf_tounicodemap.h
index c5feca3..abb701f 100644
--- a/core/fpdfapi/font/cpdf_tounicodemap.h
+++ b/core/fpdfapi/font/cpdf_tounicodemap.h
@@ -38,8 +38,14 @@
   static WideString StringToWideString(ByteStringView str);
 
   void Load(RetainPtr<const CPDF_Stream> pStream);
-  void HandleBeginBFChar(CPDF_SimpleParser* pParser);
-  void HandleBeginBFRange(CPDF_SimpleParser* pParser);
+
+  // `previous_word` is the most recent word that `parser` returned.
+  // Returns the last word `parser` encountered.
+  ByteStringView HandleBeginBFChar(CPDF_SimpleParser& parser,
+                                   ByteStringView previous_word);
+  ByteStringView HandleBeginBFRange(CPDF_SimpleParser& parser,
+                                    ByteStringView previous_word);
+
   uint32_t GetMultiCharIndexIndicator() const;
   void SetCode(uint32_t srccode, WideString destcode);
 
diff --git a/core/fpdfapi/font/cpdf_tounicodemap_unittest.cpp b/core/fpdfapi/font/cpdf_tounicodemap_unittest.cpp
index 4f5d41d..d516696 100644
--- a/core/fpdfapi/font/cpdf_tounicodemap_unittest.cpp
+++ b/core/fpdfapi/font/cpdf_tounicodemap_unittest.cpp
@@ -59,6 +59,29 @@
   EXPECT_EQ(res, CPDF_ToUnicodeMap::StringToWideString("<c2abFaAb12>"));
 }
 
+TEST(CPDFToUnicodeMapTest, HandleBeginBFCharBadCount) {
+  {
+    static constexpr uint8_t kInput1[] =
+        "1 beginbfchar<1><0041><2><0042>endbfchar";
+    auto stream = pdfium::MakeRetain<CPDF_Stream>(kInput1);
+    CPDF_ToUnicodeMap map(stream);
+    EXPECT_EQ(0u, map.ReverseLookup(0x0041));
+    EXPECT_EQ(0u, map.ReverseLookup(0x0042));
+    EXPECT_EQ(0u, map.GetUnicodeCountByCharcodeForTesting(1u));
+    EXPECT_EQ(0u, map.GetUnicodeCountByCharcodeForTesting(2u));
+  }
+  {
+    static constexpr uint8_t kInput2[] =
+        "3 beginbfchar<1><0041><2><0042>endbfchar";
+    auto stream = pdfium::MakeRetain<CPDF_Stream>(kInput2);
+    CPDF_ToUnicodeMap map(stream);
+    EXPECT_EQ(0u, map.ReverseLookup(0x0041));
+    EXPECT_EQ(0u, map.ReverseLookup(0x0042));
+    EXPECT_EQ(0u, map.GetUnicodeCountByCharcodeForTesting(1u));
+    EXPECT_EQ(0u, map.GetUnicodeCountByCharcodeForTesting(2u));
+  }
+}
+
 TEST(CPDFToUnicodeMapTest, HandleBeginBFRangeRejectsInvalidCidValues) {
   {
     static constexpr uint8_t kInput1[] =
@@ -114,9 +137,35 @@
   static constexpr uint8_t kInput[] = "1 beginbfrange<3><3>[<0041>}endbfrange";
   auto stream = pdfium::MakeRetain<CPDF_Stream>(kInput);
   CPDF_ToUnicodeMap map(stream);
-  // TODO(thestig): The calls below should both return 0.
-  EXPECT_EQ(3u, map.ReverseLookup(0x0041));
-  EXPECT_EQ(1u, map.GetUnicodeCountByCharcodeForTesting(3u));
+  EXPECT_EQ(0u, map.ReverseLookup(0x0041));
+  EXPECT_EQ(0u, map.GetUnicodeCountByCharcodeForTesting(3u));
+}
+
+TEST(CPDFToUnicodeMapTest, HandleBeginBFRangeBadCount) {
+  {
+    static constexpr uint8_t kInput1[] =
+        "1 beginbfrange<1><2><0040><4><5><0050>endbfrange";
+    auto stream = pdfium::MakeRetain<CPDF_Stream>(kInput1);
+    CPDF_ToUnicodeMap map(stream);
+    for (wchar_t unicode = 0x0039; unicode < 0x0053; ++unicode) {
+      EXPECT_EQ(0u, map.ReverseLookup(unicode));
+    }
+    for (uint32_t charcode = 0; charcode < 7; ++charcode) {
+      EXPECT_EQ(0u, map.GetUnicodeCountByCharcodeForTesting(charcode));
+    }
+  }
+  {
+    static constexpr uint8_t kInput2[] =
+        "3 beginbfrange<1><2><0040><4><5><0050>endbfrange";
+    auto stream = pdfium::MakeRetain<CPDF_Stream>(kInput2);
+    CPDF_ToUnicodeMap map(stream);
+    for (wchar_t unicode = 0x0039; unicode < 0x0053; ++unicode) {
+      EXPECT_EQ(0u, map.ReverseLookup(unicode));
+    }
+    for (uint32_t charcode = 0; charcode < 7; ++charcode) {
+      EXPECT_EQ(0u, map.GetUnicodeCountByCharcodeForTesting(charcode));
+    }
+  }
 }
 
 TEST(CPDFToUnicodeMapTest, HandleBeginBFRangeGoodCount) {
diff --git a/fpdfsdk/fpdf_edittext.cpp b/fpdfsdk/fpdf_edittext.cpp
index f849a7b..fa8867c 100644
--- a/fpdfsdk/fpdf_edittext.cpp
+++ b/fpdfsdk/fpdf_edittext.cpp
@@ -2,6 +2,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
+#include <algorithm>
 #include <map>
 #include <memory>
 #include <sstream>
@@ -80,6 +81,8 @@
 
 namespace {
 
+constexpr uint32_t kMaxBfCharBfRangeEntries = 100;
+
 ByteString BaseFontNameForType(const CFX_Font* font, int font_type) {
   ByteString name = font_type == FPDF_FONT_TYPE1 ? font->GetPsName()
                                                  : font->GetBaseFontName();
@@ -336,41 +339,82 @@
 
   fxcrt::ostringstream buffer;
   buffer << kToUnicodeStart;
-  // Add maps to buffer
-  buffer << static_cast<uint32_t>(char_to_uni.size()) << " beginbfchar\n";
-  for (const auto& it : char_to_uni) {
-    AddCharcode(buffer, it.first);
-    buffer << " ";
-    AddUnicode(buffer, it.second);
-    buffer << "\n";
-  }
-  buffer << "endbfchar\n"
-         << static_cast<uint32_t>(map_range_vector.size() + map_range.size())
-         << " beginbfrange\n";
-  for (const auto& it : map_range_vector) {
-    const std::pair<uint32_t, uint32_t>& charcode_range = it.first;
-    AddCharcode(buffer, charcode_range.first);
-    buffer << " ";
-    AddCharcode(buffer, charcode_range.second);
-    buffer << " [";
-    const std::vector<uint32_t>& unicodes = it.second;
-    for (size_t i = 0; i < unicodes.size(); ++i) {
-      AddUnicode(buffer, unicodes[i]);
-      if (i != unicodes.size() - 1)
+
+  {
+    // Add `char_to_uni` to `buffer`.
+    uint32_t to_process = pdfium::checked_cast<uint32_t>(char_to_uni.size());
+    auto it = char_to_uni.begin();
+    while (to_process) {
+      const uint32_t to_process_this_iteration =
+          std::min(to_process, kMaxBfCharBfRangeEntries);
+      buffer << to_process_this_iteration << " beginbfchar\n";
+      for (uint32_t i = 0; i < to_process_this_iteration; ++i) {
+        CHECK(it != char_to_uni.end());
+        AddCharcode(buffer, it->first);
         buffer << " ";
+        AddUnicode(buffer, it->second);
+        buffer << "\n";
+        ++it;
+      }
+      buffer << "endbfchar\n";
+      to_process -= to_process_this_iteration;
     }
-    buffer << "]\n";
   }
-  for (const auto& it : map_range) {
-    const std::pair<uint32_t, uint32_t>& charcode_range = it.first;
-    AddCharcode(buffer, charcode_range.first);
-    buffer << " ";
-    AddCharcode(buffer, charcode_range.second);
-    buffer << " ";
-    AddUnicode(buffer, it.second);
-    buffer << "\n";
+
+  {
+    // Add `map_range_vector` to `buffer`.
+    uint32_t to_process =
+        pdfium::checked_cast<uint32_t>(map_range_vector.size());
+    auto it = map_range_vector.begin();
+    while (to_process) {
+      const uint32_t to_process_this_iteration =
+          std::min(to_process, kMaxBfCharBfRangeEntries);
+      buffer << to_process_this_iteration << " beginbfrange\n";
+      for (uint32_t i = 0; i < to_process_this_iteration; ++i) {
+        CHECK(it != map_range_vector.end());
+        const std::pair<uint32_t, uint32_t>& charcode_range = it->first;
+        AddCharcode(buffer, charcode_range.first);
+        buffer << " ";
+        AddCharcode(buffer, charcode_range.second);
+        buffer << " [";
+        auto unicodes = pdfium::make_span(it->second);
+        AddUnicode(buffer, unicodes[0]);
+        for (uint32_t code : unicodes.subspan(1u)) {
+          buffer << " ";
+          AddUnicode(buffer, code);
+        }
+        buffer << "]\n";
+        ++it;
+      }
+      buffer << "endbfrange\n";
+      to_process -= to_process_this_iteration;
+    }
   }
-  buffer << "endbfrange\n";
+
+  {
+    // Add `map_range` to `buffer`.
+    uint32_t to_process = pdfium::checked_cast<uint32_t>(map_range.size());
+    auto it = map_range.begin();
+    while (to_process) {
+      const uint32_t to_process_this_iteration =
+          std::min(to_process, kMaxBfCharBfRangeEntries);
+      buffer << to_process_this_iteration << " beginbfrange\n";
+      for (uint32_t i = 0; i < to_process_this_iteration; ++i) {
+        CHECK(it != map_range.end());
+        const std::pair<uint32_t, uint32_t>& charcode_range = it->first;
+        AddCharcode(buffer, charcode_range.first);
+        buffer << " ";
+        AddCharcode(buffer, charcode_range.second);
+        buffer << " ";
+        AddUnicode(buffer, it->second);
+        buffer << "\n";
+        ++it;
+      }
+      buffer << "endbfrange\n";
+      to_process -= to_process_this_iteration;
+    }
+  }
+
   buffer << kToUnicodeEnd;
   auto stream = doc->NewIndirect<CPDF_Stream>(&buffer);
   return stream;
diff --git a/testing/resources/pixel/bug_909762.in b/testing/resources/pixel/bug_909762.in
index 652fe23..09ace97 100644
--- a/testing/resources/pixel/bug_909762.in
+++ b/testing/resources/pixel/bug_909762.in
@@ -91,7 +91,7 @@
 1 begincodespacerange
 <00><ff>
 endcodespacerange
-82 beginbfrange
+10 beginbfrange
 <20><20><0020>
 <21><21><0021>
 <2c><2c><002c>