M124: Fix yet another edited PDF that regressed

Rework CPDF_Parser cross reference table loading code yet again to make
the bug_335309995.pdf pixel test pass, while not causing any other tests
to regress.

For starters, get rid of the code where cross reference stream entries
do not overwrite existing entries. That logic forced cross reference
streams to be parsed in reverse order, which is awkward when the cross
reference tables are parsed in the normal order.

With that sorted out, LoadAllSecondaryCrossRefStreams() and
LoadAllCrossRefStreams() are no longer useful, as they parsed cross
reference data in reverse order. Instead, let LoadAllCrossRefTables()
take over their duties with more cross reference stream parsing logic.
As such, rename it to LoadAllCrossRefTablesAndStreams().

Then flip the ordering when reading cross reference data updates, so
tables have precedence over streams, per PDF spec.

Apply the same change to CPDF_Parser::LoadLinearizedAllCrossRefTable()
as well.

Bug: 330885080, 335309995, 336517370
Change-Id: I1f44cee17626104f78aaceca403f6bf1a3a09802
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/118571
Reviewed-by: Thomas Sepez <tsepez@google.com>
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
(cherry picked from commit 02ff1756cfb2c2b8d900d98efd54a4d726b805b6)
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/119055
diff --git a/core/fpdfapi/parser/cpdf_cross_ref_table.cpp b/core/fpdfapi/parser/cpdf_cross_ref_table.cpp
index af52d37..b46d3d5 100644
--- a/core/fpdfapi/parser/cpdf_cross_ref_table.cpp
+++ b/core/fpdfapi/parser/cpdf_cross_ref_table.cpp
@@ -67,9 +67,6 @@
   if (info.gennum > gen_num)
     return;
 
-  if (info.type == ObjectType::kCompressed && gen_num == 0)
-    return;
-
   info.type = ObjectType::kNormal;
   info.is_object_stream_flag |= is_object_stream;
   info.gennum = gen_num;
diff --git a/core/fpdfapi/parser/cpdf_data_avail.cpp b/core/fpdfapi/parser/cpdf_data_avail.cpp
index 0850f7e..d1121fb 100644
--- a/core/fpdfapi/parser/cpdf_data_avail.cpp
+++ b/core/fpdfapi/parser/cpdf_data_avail.cpp
@@ -195,9 +195,7 @@
       return false;
   }
 
-  if (!m_parser.LoadAllCrossRefTables(
-          m_pCrossRefAvail->last_crossref_offset()) &&
-      !m_parser.LoadAllCrossRefStreams(
+  if (!m_parser.LoadAllCrossRefTablesAndStreams(
           m_pCrossRefAvail->last_crossref_offset())) {
     m_internalStatus = InternalStatus::kLoadAllFile;
     return false;
diff --git a/core/fpdfapi/parser/cpdf_parser.cpp b/core/fpdfapi/parser/cpdf_parser.cpp
index 61861de..c040bdd 100644
--- a/core/fpdfapi/parser/cpdf_parser.cpp
+++ b/core/fpdfapi/parser/cpdf_parser.cpp
@@ -250,8 +250,7 @@
 
   m_LastXRefOffset = ParseStartXRef();
   if (m_LastXRefOffset >= kPDFHeaderSize) {
-    if (!LoadAllCrossRefTables(m_LastXRefOffset) &&
-        !LoadAllCrossRefStreams(m_LastXRefOffset)) {
+    if (!LoadAllCrossRefTablesAndStreams(m_LastXRefOffset)) {
       if (!RebuildCrossRef())
         return FORMAT_ERROR;
 
@@ -372,55 +371,82 @@
   return true;
 }
 
-bool CPDF_Parser::LoadAllCrossRefTables(FX_FILESIZE xref_offset) {
-  if (!LoadCrossRefTable(xref_offset, /*skip=*/true)) {
-    return false;
-  }
-
-  RetainPtr<CPDF_Dictionary> trailer = LoadTrailer();
-  if (!trailer)
-    return false;
-
-  m_CrossRefTable->SetTrailer(std::move(trailer), kNoTrailerObjectNumber);
-  const int32_t xrefsize = GetTrailer()->GetDirectIntegerFor("Size");
-  if (xrefsize > 0 && xrefsize <= kMaxXRefSize)
-    m_CrossRefTable->SetObjectMapSize(xrefsize);
-
-  FX_FILESIZE xref_stm = GetTrailer()->GetDirectIntegerFor("XRefStm");
-  if (xref_stm > 0) {
-    // In hybrid-reference files, use /Prev from the cross-reference stream and
-    // ignore the /Prev in the trailer.
-    // See ISO 32000-1:2008 spec, table 17.
-    if (!LoadAllSecondaryCrossRefStreams(xref_stm)) {
+bool CPDF_Parser::LoadAllCrossRefTablesAndStreams(FX_FILESIZE xref_offset) {
+  const bool is_xref_stream = !LoadCrossRefTable(xref_offset, /*skip=*/true);
+  if (is_xref_stream) {
+    // Use a copy of `xref_offset`, as LoadCrossRefStream() may change it.
+    FX_FILESIZE xref_offset_copy = xref_offset;
+    if (!LoadCrossRefStream(&xref_offset_copy, /*is_main_xref=*/true)) {
       return false;
     }
 
-    // Then load the reference table in the trailer.
-    return LoadCrossRefTable(xref_offset, /*skip=*/false);
+    // LoadCrossRefStream() sets the trailer when `is_main_xref` is true.
+    // Thus no SetTrailer() call like the else-block below. Similarly,
+    // LoadCrossRefStream() also calls SetObjectMapSize() itself, so no need to
+    // call it again here.
+  } else {
+    RetainPtr<CPDF_Dictionary> trailer = LoadTrailer();
+    if (!trailer) {
+      return false;
+    }
+
+    m_CrossRefTable->SetTrailer(std::move(trailer), kNoTrailerObjectNumber);
+
+    const int32_t xrefsize = GetTrailer()->GetDirectIntegerFor("Size");
+    if (xrefsize > 0 && xrefsize <= kMaxXRefSize) {
+      m_CrossRefTable->SetObjectMapSize(xrefsize);
+    }
   }
 
-  // Otherwise, follow the /Prev from the trailer.
-  std::vector<FX_FILESIZE> xref_list{xref_offset};
-  std::vector<FX_FILESIZE> xref_stream_list{xref_stm};
+  std::vector<FX_FILESIZE> xref_list;
+  std::vector<FX_FILESIZE> xref_stream_list;
+
+  if (is_xref_stream) {
+    xref_list.push_back(0);
+    xref_stream_list.push_back(xref_offset);
+  } else {
+    xref_list.push_back(xref_offset);
+    xref_stream_list.push_back(GetTrailer()->GetDirectIntegerFor("XRefStm"));
+  }
+
   if (!FindAllCrossReferenceTablesAndStream(xref_offset, xref_list,
                                             xref_stream_list)) {
     return false;
   }
 
-  for (size_t i = 0; i < xref_list.size(); ++i) {
-    if (xref_list[i] > 0 && !LoadCrossRefTable(xref_list[i], /*skip=*/false)) {
+  if (xref_list.front() > 0) {
+    if (!LoadCrossRefTable(xref_list.front(), /*skip=*/false)) {
       return false;
     }
 
+    if (!VerifyCrossRefTable()) {
+      return false;
+    }
+  }
+
+  // Cross reference table entries take precedence over cross reference stream
+  // entries. So process the stream entries first and then give the cross
+  // reference tables a chance to overwrite them.
+  //
+  // XRefStm entries should only be used in update sections, so skip
+  // `xref_stream_list.front()`.
+  //
+  // See details in ISO 32000-1:2008, section 7.5.8.4.
+  for (size_t i = 1; i < xref_list.size(); ++i) {
     if (xref_stream_list[i] > 0 &&
         !LoadCrossRefStream(&xref_stream_list[i], /*is_main_xref=*/false)) {
       return false;
     }
-
-    if (i == 0 && !VerifyCrossRefTable()) {
+    if (xref_list[i] > 0 && !LoadCrossRefTable(xref_list[i], /*skip=*/false)) {
       return false;
     }
   }
+
+  if (is_xref_stream) {
+    m_ObjectStreamMap.clear();
+    m_bXRefStream = true;
+  }
+
   return true;
 }
 
@@ -451,30 +477,29 @@
                                            kNoTrailerObjectNumber),
       std::move(m_CrossRefTable));
 
-  // Ignore /Prev for hybrid-reference files.
-  // See ISO 32000-1:2008 spec, table 17.
-  if (!xref_stm) {
-    if (!FindAllCrossReferenceTablesAndStream(main_xref_offset, xref_list,
-                                              xref_stream_list)) {
-      return false;
-    }
-  }
-
-  if (xref_stream_list[0] > 0 &&
-      !LoadCrossRefStream(&xref_stream_list[0], /*is_main_xref=*/false)) {
+  if (!FindAllCrossReferenceTablesAndStream(main_xref_offset, xref_list,
+                                            xref_stream_list)) {
     return false;
   }
 
+  // Cross reference table entries take precedence over cross reference stream
+  // entries. So process the stream entries first and then give the cross
+  // reference tables a chance to overwrite them.
+  //
+  // XRefStm entries should only be used in update sections, so skip
+  // `xref_stream_list[0]`.
+  //
+  // See details in ISO 32000-1:2008, section 7.5.8.4.
   for (size_t i = 1; i < xref_list.size(); ++i) {
-    if (xref_list[i] > 0 && !LoadCrossRefTable(xref_list[i], /*skip=*/false)) {
-      return false;
-    }
-
     if (xref_stream_list[i] > 0 &&
         !LoadCrossRefStream(&xref_stream_list[i], /*is_main_xref=*/false)) {
       return false;
     }
+    if (xref_list[i] > 0 && !LoadCrossRefTable(xref_list[i], /*skip=*/false)) {
+      return false;
+    }
   }
+
   return true;
 }
 
@@ -635,49 +660,6 @@
   }
 }
 
-bool CPDF_Parser::LoadAllCrossRefStreams(FX_FILESIZE xref_offset) {
-  if (!LoadCrossRefStream(&xref_offset, /*is_main_xref=*/true)) {
-    return false;
-  }
-
-  if (!LoadAllSecondaryCrossRefStreams(xref_offset)) {
-    return false;
-  }
-
-  m_ObjectStreamMap.clear();
-  m_bXRefStream = true;
-  return true;
-}
-
-bool CPDF_Parser::LoadAllSecondaryCrossRefStreams(FX_FILESIZE xref_offset) {
-  std::set<FX_FILESIZE> seen_xref_offset;
-  while (xref_offset > 0) {
-    seen_xref_offset.insert(xref_offset);
-    FX_FILESIZE save_xref_offset = xref_offset;
-    if (!LoadCrossRefStream(&xref_offset, /*is_main_xref=*/false)) {
-      // If a cross-reference stream failed to load at `xref_offset`, try
-      // loading a cross-reference table at the same location. Use
-      // `save_xref_offset` instead of `xref_offset`, as `xref_offset` may have
-      // changed.
-      if (!LoadCrossRefTable(save_xref_offset, /*skip=*/false)) {
-        return false;
-      }
-
-      RetainPtr<CPDF_Dictionary> trailer_dict = LoadTrailer();
-      if (!trailer_dict) {
-        return false;
-      }
-      xref_offset = trailer_dict->GetDirectIntegerFor("Prev");
-    }
-
-    // Check for circular references.
-    if (pdfium::Contains(seen_xref_offset, xref_offset)) {
-      return false;
-    }
-  }
-  return true;
-}
-
 bool CPDF_Parser::FindAllCrossReferenceTablesAndStream(
     FX_FILESIZE main_xref_offset,
     std::vector<FX_FILESIZE>& xref_list,
@@ -924,10 +906,6 @@
     return;
   }
 
-  if (existing_type != ObjectType::kFree) {
-    return;
-  }
-
   if (type == ObjectType::kFree) {
     m_CrossRefTable->SetFree(obj_num);
     return;
diff --git a/core/fpdfapi/parser/cpdf_parser.h b/core/fpdfapi/parser/cpdf_parser.h
index 26abc7d..96e1a3e 100644
--- a/core/fpdfapi/parser/cpdf_parser.h
+++ b/core/fpdfapi/parser/cpdf_parser.h
@@ -143,9 +143,7 @@
     CPDF_CrossRefTable::ObjectInfo info;
   };
 
-  bool LoadAllCrossRefTables(FX_FILESIZE xref_offset);
-  bool LoadAllCrossRefStreams(FX_FILESIZE xref_offset);
-  bool LoadAllSecondaryCrossRefStreams(FX_FILESIZE xref_offset);
+  bool LoadAllCrossRefTablesAndStreams(FX_FILESIZE xref_offset);
   bool FindAllCrossReferenceTablesAndStream(
       FX_FILESIZE main_xref_offset,
       std::vector<FX_FILESIZE>& xref_list,
diff --git a/core/fpdfapi/parser/cpdf_parser_unittest.cpp b/core/fpdfapi/parser/cpdf_parser_unittest.cpp
index b42d7f1..dbf470d 100644
--- a/core/fpdfapi/parser/cpdf_parser_unittest.cpp
+++ b/core/fpdfapi/parser/cpdf_parser_unittest.cpp
@@ -725,7 +725,7 @@
       {.type = CPDF_CrossRefTable::ObjectType::kNormal, .pos = 0},
       // Since the /Index does not follow the spec, this is one of the 2
       // possible values that a parser can come up with.
-      {.type = CPDF_CrossRefTable::ObjectType::kNormal, .pos = 15}};
+      {.type = CPDF_CrossRefTable::ObjectType::kNormal, .pos = 18}};
 
   EXPECT_THAT(objects_info, ElementsAre(Pair(2, expected_result[0]),
                                         Pair(3, expected_result[1])));