Add object tree traversal utility functions

Add GetObjectsWithMultipleReferences() to figure out which objects have
multiple references. Then code that manipulates PDFs can use this
function to determine if it is safe to edit an existing object. If it is
unsafe, the code can clone before editing instead.

Also add GetObjectsWithReference() to figure out which objects are
referenced. This can be used to determine which objects have no
references and can therefore be deleted.

Bug: chromium:1428724,pdfium:1409,pdfium:2012
Change-Id: I28817068d50c79f36d12bd50c1910937241194a5
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/105611
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
diff --git a/core/fpdfapi/parser/BUILD.gn b/core/fpdfapi/parser/BUILD.gn
index bdc2e86..e3152d8 100644
--- a/core/fpdfapi/parser/BUILD.gn
+++ b/core/fpdfapi/parser/BUILD.gn
@@ -73,6 +73,8 @@
     "fpdf_parser_decode.h",
     "fpdf_parser_utility.cpp",
     "fpdf_parser_utility.h",
+    "object_tree_traversal_util.cpp",
+    "object_tree_traversal_util.h",
   ]
   configs += [ "../../../:pdfium_strict_config" ]
   deps = [
@@ -151,6 +153,7 @@
     "cpdf_parser_embeddertest.cpp",
     "cpdf_security_handler_embeddertest.cpp",
     "fpdf_parser_decode_embeddertest.cpp",
+    "object_tree_traversal_util_embeddertest.cpp",
   ]
   deps = [
     ":parser",
diff --git a/core/fpdfapi/parser/cpdf_reference.h b/core/fpdfapi/parser/cpdf_reference.h
index df24458..241b398 100644
--- a/core/fpdfapi/parser/cpdf_reference.h
+++ b/core/fpdfapi/parser/cpdf_reference.h
@@ -32,6 +32,7 @@
       CPDF_IndirectObjectHolder* holder) const override;
 
   uint32_t GetRefObjNum() const { return m_RefObjNum; }
+  bool HasIndirectObjectHolder() const { return !!m_pObjList; }
   void SetRef(CPDF_IndirectObjectHolder* pDoc, uint32_t objnum);
 
  private:
diff --git a/core/fpdfapi/parser/object_tree_traversal_util.cpp b/core/fpdfapi/parser/object_tree_traversal_util.cpp
new file mode 100644
index 0000000..e1a0d52
--- /dev/null
+++ b/core/fpdfapi/parser/object_tree_traversal_util.cpp
@@ -0,0 +1,221 @@
+// Copyright 2023 The PDFium Authors
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "core/fpdfapi/parser/object_tree_traversal_util.h"
+
+#include <stdint.h>
+
+#include <map>
+#include <queue>
+#include <set>
+#include <utility>
+#include <vector>
+
+#include "core/fpdfapi/parser/cpdf_array.h"
+#include "core/fpdfapi/parser/cpdf_dictionary.h"
+#include "core/fpdfapi/parser/cpdf_document.h"
+#include "core/fpdfapi/parser/cpdf_reference.h"
+#include "core/fpdfapi/parser/cpdf_stream.h"
+#include "third_party/base/check.h"
+#include "third_party/base/containers/contains.h"
+
+namespace {
+
+class ObjectTreeTraverser {
+ public:
+  explicit ObjectTreeTraverser(const CPDF_Document* document)
+      : document_(document) {
+    const CPDF_Parser* parser = document_->GetParser();
+    const CPDF_Dictionary* trailer = parser ? parser->GetTrailer() : nullptr;
+    const CPDF_Dictionary* root = trailer ? trailer : document_->GetRoot();
+    const uint32_t root_object_number =
+        trailer ? parser->GetTrailerObjectNumber() : root->GetObjNum();
+    // If `root` is a trailer, then it may not have an object number, as many
+    // trailers are inlined.
+    if (root_object_number) {
+      referenced_objects_[root_object_number] = 1;
+      object_number_map_[root] = root_object_number;
+    }
+
+    object_queue_.push(pdfium::WrapRetain(root));
+    seen_objects_.insert(root);
+  }
+  ~ObjectTreeTraverser() = default;
+
+  void Traverse() { CalculateReferenceCounts(GetReferenceEntries()); }
+
+  const std::map<uint32_t, int>& referenced_objects() {
+    return referenced_objects_;
+  }
+
+ private:
+  struct ReferenceEntry {
+    uint32_t ref_object_number;
+    uint32_t referenced_object_number;
+  };
+
+  std::vector<ReferenceEntry> GetReferenceEntries() {
+    std::vector<ReferenceEntry> reference_entries;
+    while (!object_queue_.empty()) {
+      RetainPtr<const CPDF_Object> current_object = object_queue_.front();
+      object_queue_.pop();
+
+      switch (current_object->GetType()) {
+        case CPDF_Object::kArray: {
+          CPDF_ArrayLocker locker(current_object->AsArray());
+          for (const auto& it : locker) {
+            PushNewObject(current_object, it);
+          }
+          break;
+        }
+        case CPDF_Object::kDictionary: {
+          CPDF_DictionaryLocker locker(current_object->AsDictionary());
+          for (const auto& it : locker) {
+            PushNewObject(current_object, it.second);
+          }
+          break;
+        }
+        case CPDF_Object::kReference: {
+          const CPDF_Reference* ref_object = current_object->AsReference();
+          const uint32_t ref_object_number = GetObjectNumber(ref_object);
+          const uint32_t referenced_object_number = ref_object->GetRefObjNum();
+          CHECK(referenced_object_number);
+
+          RetainPtr<const CPDF_Object> referenced_object;
+          if (ref_object->HasIndirectObjectHolder()) {
+            // Calling GetIndirectObject() does not work for normal references.
+            referenced_object = ref_object->GetDirect();
+          } else {
+            // Calling GetDirect() does not work for references from trailers.
+            referenced_object =
+                document_->GetIndirectObject(referenced_object_number);
+          }
+          // Unlike the other object types, CPDF_Reference can point at nullptr.
+          if (referenced_object) {
+            reference_entries.push_back(
+                {ref_object_number, referenced_object_number});
+            PushNewObject(ref_object, referenced_object);
+          }
+          break;
+        }
+        case CPDF_Object::kStream: {
+          RetainPtr<const CPDF_Dictionary> dict =
+              current_object->AsStream()->GetDict();
+          CHECK(dict->IsInline());  // i.e. No object number.
+          CPDF_DictionaryLocker locker(dict);
+          for (const auto& it : locker) {
+            PushNewObject(current_object, it.second);
+          }
+          break;
+        }
+        default: {
+          break;
+        }
+      }
+    }
+    return reference_entries;
+  }
+
+  void CalculateReferenceCounts(
+      const std::vector<ReferenceEntry>& reference_entries) {
+    // Tracks PDF objects that referenced other PDF objects, identified by their
+    // object numbers. Never 0.
+    std::set<uint32_t> seen_ref_objects;
+
+    for (const ReferenceEntry& entry : reference_entries) {
+      // Make sure this is not a self-reference.
+      if (entry.referenced_object_number == entry.ref_object_number) {
+        continue;
+      }
+
+      // Make sure this is not a circular reference.
+      if (pdfium::Contains(seen_ref_objects, entry.ref_object_number) &&
+          pdfium::Contains(seen_ref_objects, entry.referenced_object_number)) {
+        continue;
+      }
+
+      ++referenced_objects_[entry.referenced_object_number];
+      if (entry.ref_object_number) {
+        seen_ref_objects.insert(entry.ref_object_number);
+      }
+    }
+  }
+
+  void PushNewObject(const CPDF_Object* parent_object,
+                     RetainPtr<const CPDF_Object> child_object) {
+    CHECK(parent_object);
+    CHECK(child_object);
+    const bool inserted = seen_objects_.insert(child_object).second;
+    if (!inserted) {
+      return;
+    }
+    const uint32_t child_object_number = child_object->GetObjNum();
+    if (child_object_number) {
+      object_number_map_[child_object] = child_object_number;
+    } else {
+      // This search can fail for inlined trailers.
+      auto it = object_number_map_.find(parent_object);
+      if (it != object_number_map_.end()) {
+        object_number_map_[child_object] = it->second;
+      }
+    }
+    object_queue_.push(std::move(child_object));
+  }
+
+  // Returns 0 if not found.
+  uint32_t GetObjectNumber(const CPDF_Object* object) const {
+    auto it = object_number_map_.find(object);
+    return it != object_number_map_.end() ? it->second : 0;
+  }
+
+  const CPDF_Document* const document_;
+
+  // Queue of objects to traverse.
+  // - Pointers in the queue are non-null.
+  // - The same pointer never enters the queue twice.
+  std::queue<RetainPtr<const CPDF_Object>> object_queue_;
+
+  // Map of objects to "top-level" object numbers. For inline objects, this is
+  // the ancestor object with an object number. The keys are non-null and the
+  // values are never 0.
+  // This is used to prevent self-references, as a single PDF object, with
+  // inlined objects, is represented by multiple CPDF_Objects.
+  std::map<const CPDF_Object*, uint32_t> object_number_map_;
+
+  // Tracks traversed objects to prevent duplicates from getting into
+  // `object_queue_` and `object_number_map_`.
+  std::set<const CPDF_Object*> seen_objects_;
+
+  // Tracks which PDF objects are referenced.
+  // Key: object number
+  // Value: number of times referenced
+  std::map<uint32_t, int> referenced_objects_;
+};
+
+}  // namespace
+
+std::set<uint32_t> GetObjectsWithReferences(const CPDF_Document* document) {
+  ObjectTreeTraverser traverser(document);
+  traverser.Traverse();
+
+  std::set<uint32_t> results;
+  for (const auto& it : traverser.referenced_objects()) {
+    results.insert(it.first);
+  }
+  return results;
+}
+
+std::set<uint32_t> GetObjectsWithMultipleReferences(
+    const CPDF_Document* document) {
+  ObjectTreeTraverser traverser(document);
+  traverser.Traverse();
+
+  std::set<uint32_t> results;
+  for (const auto& it : traverser.referenced_objects()) {
+    if (it.second > 1) {
+      results.insert(it.first);
+    }
+  }
+  return results;
+}
diff --git a/core/fpdfapi/parser/object_tree_traversal_util.h b/core/fpdfapi/parser/object_tree_traversal_util.h
new file mode 100644
index 0000000..e9db96d
--- /dev/null
+++ b/core/fpdfapi/parser/object_tree_traversal_util.h
@@ -0,0 +1,46 @@
+// Copyright 2023 The PDFium Authors
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef CORE_FPDFAPI_PARSER_OBJECT_TREE_TRAVERSAL_UTIL_H_
+#define CORE_FPDFAPI_PARSER_OBJECT_TREE_TRAVERSAL_UTIL_H_
+
+#include <stdint.h>
+
+#include <set>
+
+class CPDF_Document;
+
+// Traverses `document` starting with its trailer, if it has one, or starting at
+// the catalog, which always exists. The trailer should have a reference to the
+// catalog. The traversal avoids cycles.
+// Returns all the PDF objects (not CPDF_Objects) the traversal reached as a set
+// of object numbers.
+std::set<uint32_t> GetObjectsWithReferences(const CPDF_Document* document);
+
+// Same as GetObjectsWithReferences(), but only returns the objects with
+// multiple references. References that would create a cycle are ignored.
+//
+// In this example, where (A) is the root node:
+//
+//     A -> B
+//     A -> C
+//     B -> D
+//     C -> D
+//
+// GetObjectsWithMultipleReferences() returns {D}, since both (B) and (C)
+// references to (D), and there are no cycles.
+//
+// In this example, where (A) is the root node:
+//
+//     A -> B
+//     B -> C
+//     C -> B
+//
+// GetObjectsWithMultipleReferences() returns {}, even though both (A) and (C)
+// references (B). Since (B) -> (C) -> (B) creates a cycle, the (C) -> (B)
+// reference does not count.
+std::set<uint32_t> GetObjectsWithMultipleReferences(
+    const CPDF_Document* document);
+
+#endif  // CORE_FPDFAPI_PARSER_OBJECT_TREE_TRAVERSAL_UTIL_H_
diff --git a/core/fpdfapi/parser/object_tree_traversal_util_embeddertest.cpp b/core/fpdfapi/parser/object_tree_traversal_util_embeddertest.cpp
new file mode 100644
index 0000000..c90c77e
--- /dev/null
+++ b/core/fpdfapi/parser/object_tree_traversal_util_embeddertest.cpp
@@ -0,0 +1,96 @@
+// Copyright 2023 The PDFium Authors
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "core/fpdfapi/parser/object_tree_traversal_util.h"
+
+#include <stdint.h>
+
+#include <set>
+
+#include "core/fpdfapi/parser/cpdf_document.h"
+#include "testing/embedder_test.h"
+#include "testing/gmock/include/gmock/gmock.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+using testing::UnorderedElementsAreArray;
+using ObjectTreeTraversalUtilEmbedderTest = EmbedderTest;
+
+namespace {
+
+CPDF_Document* GetCPDFDocument(FPDF_DOCUMENT document) {
+  // This is cheating slightly to avoid a layering violation, since this file
+  // cannot include fpdfsdk/cpdfsdk_helpers.h to get access to
+  // CPDFDocumentFromFPDFDocument().
+  return reinterpret_cast<CPDF_Document*>((document));
+}
+
+}  // namespace
+
+TEST_F(ObjectTreeTraversalUtilEmbedderTest, GetObjectsWithReferencesBasic) {
+  ASSERT_TRUE(OpenDocument("hello_world.pdf"));
+  CPDF_Document* doc = GetCPDFDocument(document());
+  std::set<uint32_t> referenced_objects = GetObjectsWithReferences(doc);
+  EXPECT_THAT(referenced_objects,
+              UnorderedElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST_F(ObjectTreeTraversalUtilEmbedderTest, GetObjectsWithReferencesNewDoc) {
+  ScopedFPDFDocument new_doc(FPDF_CreateNewDocument());
+  CPDF_Document* doc = GetCPDFDocument(new_doc.get());
+  std::set<uint32_t> referenced_objects = GetObjectsWithReferences(doc);
+  // Empty documents have a catalog and an empty pages object.
+  EXPECT_THAT(referenced_objects, UnorderedElementsAreArray({1, 2}));
+}
+
+TEST_F(ObjectTreeTraversalUtilEmbedderTest,
+       GetObjectsWithReferencesCircularRefs) {
+  ASSERT_TRUE(OpenDocument("circular_viewer_ref.pdf"));
+  CPDF_Document* doc = GetCPDFDocument(document());
+  std::set<uint32_t> referenced_objects = GetObjectsWithReferences(doc);
+  // The trailer points at a catalog, and the catalog only references itself.
+  EXPECT_THAT(referenced_objects, UnorderedElementsAreArray({1}));
+}
+
+TEST_F(ObjectTreeTraversalUtilEmbedderTest,
+       GetObjectsWithReferencesCrossRefStream) {
+  ASSERT_TRUE(OpenDocument("bug_1399.pdf"));
+  CPDF_Document* doc = GetCPDFDocument(document());
+  std::set<uint32_t> referenced_objects = GetObjectsWithReferences(doc);
+  // The trailer is the dictionary inside /XRef object 16 0. Note that it
+  // references object 3 0, but the rest of the document does not.
+  EXPECT_THAT(referenced_objects,
+              UnorderedElementsAreArray({1, 2, 3, 4, 5, 12, 13, 14, 16}));
+}
+
+TEST_F(ObjectTreeTraversalUtilEmbedderTest,
+       GetObjectsWithMultipleReferencesBasic) {
+  ASSERT_TRUE(OpenDocument("hello_world.pdf"));
+  CPDF_Document* doc = GetCPDFDocument(document());
+  std::set<uint32_t> referenced_objects = GetObjectsWithMultipleReferences(doc);
+  EXPECT_TRUE(referenced_objects.empty());
+}
+
+TEST_F(ObjectTreeTraversalUtilEmbedderTest,
+       GetObjectsWithMultipleReferencesNewDoc) {
+  ScopedFPDFDocument new_doc(FPDF_CreateNewDocument());
+  CPDF_Document* doc = GetCPDFDocument(new_doc.get());
+  std::set<uint32_t> referenced_objects = GetObjectsWithMultipleReferences(doc);
+  EXPECT_TRUE(referenced_objects.empty());
+}
+
+TEST_F(ObjectTreeTraversalUtilEmbedderTest,
+       GetObjectsWithMultipleReferencesCircularRefs) {
+  ASSERT_TRUE(OpenDocument("circular_viewer_ref.pdf"));
+  CPDF_Document* doc = GetCPDFDocument(document());
+  std::set<uint32_t> referenced_objects = GetObjectsWithMultipleReferences(doc);
+  EXPECT_TRUE(referenced_objects.empty());
+}
+
+TEST_F(ObjectTreeTraversalUtilEmbedderTest,
+       GetObjectsWithMultipleReferencesSharedObjects) {
+  ASSERT_TRUE(OpenDocument("hello_world_2_pages.pdf"));
+  CPDF_Document* doc = GetCPDFDocument(document());
+  std::set<uint32_t> referenced_objects = GetObjectsWithMultipleReferences(doc);
+  EXPECT_THAT(referenced_objects, UnorderedElementsAreArray({5, 6, 7}));
+}