| // Copyright 2023 The PDFium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "core/fpdfapi/parser/object_tree_traversal_util.h" |
| |
| #include <stdint.h> |
| |
| #include <map> |
| #include <queue> |
| #include <set> |
| #include <utility> |
| #include <vector> |
| |
| #include "core/fpdfapi/parser/cpdf_array.h" |
| #include "core/fpdfapi/parser/cpdf_dictionary.h" |
| #include "core/fpdfapi/parser/cpdf_document.h" |
| #include "core/fpdfapi/parser/cpdf_reference.h" |
| #include "core/fpdfapi/parser/cpdf_stream.h" |
| #include "core/fxcrt/unowned_ptr.h" |
| #include "third_party/base/check.h" |
| #include "third_party/base/containers/contains.h" |
| |
| namespace { |
| |
| class ObjectTreeTraverser { |
| public: |
| explicit ObjectTreeTraverser(const CPDF_Document* document) |
| : document_(document) { |
| const CPDF_Parser* parser = document_->GetParser(); |
| const CPDF_Dictionary* trailer = parser ? parser->GetTrailer() : nullptr; |
| const CPDF_Dictionary* root = trailer ? trailer : document_->GetRoot(); |
| const uint32_t root_object_number = |
| trailer ? parser->GetTrailerObjectNumber() : root->GetObjNum(); |
| // If `root` is a trailer, then it may not have an object number, as many |
| // trailers are inlined. |
| if (root_object_number) { |
| referenced_objects_[root_object_number] = 1; |
| object_number_map_[root] = root_object_number; |
| } |
| |
| object_queue_.push(pdfium::WrapRetain(root)); |
| seen_objects_.insert(root); |
| } |
| ~ObjectTreeTraverser() = default; |
| |
| void Traverse() { CalculateReferenceCounts(GetReferenceEntries()); } |
| |
| const std::map<uint32_t, int>& referenced_objects() { |
| return referenced_objects_; |
| } |
| |
| private: |
| struct ReferenceEntry { |
| uint32_t ref_object_number; |
| uint32_t referenced_object_number; |
| }; |
| |
| std::vector<ReferenceEntry> GetReferenceEntries() { |
| std::vector<ReferenceEntry> reference_entries; |
| while (!object_queue_.empty()) { |
| RetainPtr<const CPDF_Object> current_object = object_queue_.front(); |
| object_queue_.pop(); |
| |
| switch (current_object->GetType()) { |
| case CPDF_Object::kArray: { |
| CPDF_ArrayLocker locker(current_object->AsArray()); |
| for (const auto& it : locker) { |
| PushNewObject(current_object, it); |
| } |
| break; |
| } |
| case CPDF_Object::kDictionary: { |
| CPDF_DictionaryLocker locker(current_object->AsDictionary()); |
| for (const auto& it : locker) { |
| PushNewObject(current_object, it.second); |
| } |
| break; |
| } |
| case CPDF_Object::kReference: { |
| const CPDF_Reference* ref_object = current_object->AsReference(); |
| const uint32_t ref_object_number = GetObjectNumber(ref_object); |
| const uint32_t referenced_object_number = ref_object->GetRefObjNum(); |
| |
| RetainPtr<const CPDF_Object> referenced_object; |
| if (ref_object->HasIndirectObjectHolder()) { |
| // Calling GetIndirectObject() does not work for normal references. |
| referenced_object = ref_object->GetDirect(); |
| } else { |
| // Calling GetDirect() does not work for references from trailers. |
| referenced_object = |
| document_->GetIndirectObject(referenced_object_number); |
| } |
| // Unlike the other object types, CPDF_Reference can point at nullptr. |
| if (referenced_object) { |
| CHECK(referenced_object_number); |
| reference_entries.push_back( |
| {ref_object_number, referenced_object_number}); |
| PushNewObject(ref_object, referenced_object); |
| } |
| break; |
| } |
| case CPDF_Object::kStream: { |
| RetainPtr<const CPDF_Dictionary> dict = |
| current_object->AsStream()->GetDict(); |
| CHECK(dict->IsInline()); // i.e. No object number. |
| CPDF_DictionaryLocker locker(dict); |
| for (const auto& it : locker) { |
| PushNewObject(current_object, it.second); |
| } |
| break; |
| } |
| default: { |
| break; |
| } |
| } |
| } |
| return reference_entries; |
| } |
| |
| void CalculateReferenceCounts( |
| const std::vector<ReferenceEntry>& reference_entries) { |
| // Tracks PDF objects that referenced other PDF objects, identified by their |
| // object numbers. Never 0. |
| std::set<uint32_t> seen_ref_objects; |
| |
| for (const ReferenceEntry& entry : reference_entries) { |
| // Make sure this is not a self-reference. |
| if (entry.referenced_object_number == entry.ref_object_number) { |
| continue; |
| } |
| |
| // Make sure this is not a circular reference. |
| if (pdfium::Contains(seen_ref_objects, entry.ref_object_number) && |
| pdfium::Contains(seen_ref_objects, entry.referenced_object_number)) { |
| continue; |
| } |
| |
| ++referenced_objects_[entry.referenced_object_number]; |
| if (entry.ref_object_number) { |
| seen_ref_objects.insert(entry.ref_object_number); |
| } |
| } |
| } |
| |
| void PushNewObject(const CPDF_Object* parent_object, |
| RetainPtr<const CPDF_Object> child_object) { |
| CHECK(parent_object); |
| CHECK(child_object); |
| const bool inserted = seen_objects_.insert(child_object).second; |
| if (!inserted) { |
| return; |
| } |
| const uint32_t child_object_number = child_object->GetObjNum(); |
| if (child_object_number) { |
| object_number_map_[child_object] = child_object_number; |
| } else { |
| // This search can fail for inlined trailers. |
| auto it = object_number_map_.find(parent_object); |
| if (it != object_number_map_.end()) { |
| object_number_map_[child_object] = it->second; |
| } |
| } |
| object_queue_.push(std::move(child_object)); |
| } |
| |
| // Returns 0 if not found. |
| uint32_t GetObjectNumber(const CPDF_Object* object) const { |
| auto it = object_number_map_.find(object); |
| return it != object_number_map_.end() ? it->second : 0; |
| } |
| |
| UnownedPtr<const CPDF_Document> const document_; |
| |
| // Queue of objects to traverse. |
| // - Pointers in the queue are non-null. |
| // - The same pointer never enters the queue twice. |
| std::queue<RetainPtr<const CPDF_Object>> object_queue_; |
| |
| // Map of objects to "top-level" object numbers. For inline objects, this is |
| // the ancestor object with an object number. The keys are non-null and the |
| // values are never 0. |
| // This is used to prevent self-references, as a single PDF object, with |
| // inlined objects, is represented by multiple CPDF_Objects. |
| std::map<const CPDF_Object*, uint32_t> object_number_map_; |
| |
| // Tracks traversed objects to prevent duplicates from getting into |
| // `object_queue_` and `object_number_map_`. |
| std::set<const CPDF_Object*> seen_objects_; |
| |
| // Tracks which PDF objects are referenced. |
| // Key: object number |
| // Value: number of times referenced |
| std::map<uint32_t, int> referenced_objects_; |
| }; |
| |
| } // namespace |
| |
| std::set<uint32_t> GetObjectsWithReferences(const CPDF_Document* document) { |
| ObjectTreeTraverser traverser(document); |
| traverser.Traverse(); |
| |
| std::set<uint32_t> results; |
| for (const auto& it : traverser.referenced_objects()) { |
| results.insert(it.first); |
| } |
| return results; |
| } |
| |
| std::set<uint32_t> GetObjectsWithMultipleReferences( |
| const CPDF_Document* document) { |
| ObjectTreeTraverser traverser(document); |
| traverser.Traverse(); |
| |
| std::set<uint32_t> results; |
| for (const auto& it : traverser.referenced_objects()) { |
| if (it.second > 1) { |
| results.insert(it.first); |
| } |
| } |
| return results; |
| } |