blob: e1a0d52d0f5246e7778fed056e1aef3e49e9b917 [file] [log] [blame]
// Copyright 2023 The PDFium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "core/fpdfapi/parser/object_tree_traversal_util.h"
#include <stdint.h>
#include <map>
#include <queue>
#include <set>
#include <utility>
#include <vector>
#include "core/fpdfapi/parser/cpdf_array.h"
#include "core/fpdfapi/parser/cpdf_dictionary.h"
#include "core/fpdfapi/parser/cpdf_document.h"
#include "core/fpdfapi/parser/cpdf_reference.h"
#include "core/fpdfapi/parser/cpdf_stream.h"
#include "third_party/base/check.h"
#include "third_party/base/containers/contains.h"
namespace {
class ObjectTreeTraverser {
public:
explicit ObjectTreeTraverser(const CPDF_Document* document)
: document_(document) {
const CPDF_Parser* parser = document_->GetParser();
const CPDF_Dictionary* trailer = parser ? parser->GetTrailer() : nullptr;
const CPDF_Dictionary* root = trailer ? trailer : document_->GetRoot();
const uint32_t root_object_number =
trailer ? parser->GetTrailerObjectNumber() : root->GetObjNum();
// If `root` is a trailer, then it may not have an object number, as many
// trailers are inlined.
if (root_object_number) {
referenced_objects_[root_object_number] = 1;
object_number_map_[root] = root_object_number;
}
object_queue_.push(pdfium::WrapRetain(root));
seen_objects_.insert(root);
}
~ObjectTreeTraverser() = default;
void Traverse() { CalculateReferenceCounts(GetReferenceEntries()); }
const std::map<uint32_t, int>& referenced_objects() {
return referenced_objects_;
}
private:
struct ReferenceEntry {
uint32_t ref_object_number;
uint32_t referenced_object_number;
};
std::vector<ReferenceEntry> GetReferenceEntries() {
std::vector<ReferenceEntry> reference_entries;
while (!object_queue_.empty()) {
RetainPtr<const CPDF_Object> current_object = object_queue_.front();
object_queue_.pop();
switch (current_object->GetType()) {
case CPDF_Object::kArray: {
CPDF_ArrayLocker locker(current_object->AsArray());
for (const auto& it : locker) {
PushNewObject(current_object, it);
}
break;
}
case CPDF_Object::kDictionary: {
CPDF_DictionaryLocker locker(current_object->AsDictionary());
for (const auto& it : locker) {
PushNewObject(current_object, it.second);
}
break;
}
case CPDF_Object::kReference: {
const CPDF_Reference* ref_object = current_object->AsReference();
const uint32_t ref_object_number = GetObjectNumber(ref_object);
const uint32_t referenced_object_number = ref_object->GetRefObjNum();
CHECK(referenced_object_number);
RetainPtr<const CPDF_Object> referenced_object;
if (ref_object->HasIndirectObjectHolder()) {
// Calling GetIndirectObject() does not work for normal references.
referenced_object = ref_object->GetDirect();
} else {
// Calling GetDirect() does not work for references from trailers.
referenced_object =
document_->GetIndirectObject(referenced_object_number);
}
// Unlike the other object types, CPDF_Reference can point at nullptr.
if (referenced_object) {
reference_entries.push_back(
{ref_object_number, referenced_object_number});
PushNewObject(ref_object, referenced_object);
}
break;
}
case CPDF_Object::kStream: {
RetainPtr<const CPDF_Dictionary> dict =
current_object->AsStream()->GetDict();
CHECK(dict->IsInline()); // i.e. No object number.
CPDF_DictionaryLocker locker(dict);
for (const auto& it : locker) {
PushNewObject(current_object, it.second);
}
break;
}
default: {
break;
}
}
}
return reference_entries;
}
void CalculateReferenceCounts(
const std::vector<ReferenceEntry>& reference_entries) {
// Tracks PDF objects that referenced other PDF objects, identified by their
// object numbers. Never 0.
std::set<uint32_t> seen_ref_objects;
for (const ReferenceEntry& entry : reference_entries) {
// Make sure this is not a self-reference.
if (entry.referenced_object_number == entry.ref_object_number) {
continue;
}
// Make sure this is not a circular reference.
if (pdfium::Contains(seen_ref_objects, entry.ref_object_number) &&
pdfium::Contains(seen_ref_objects, entry.referenced_object_number)) {
continue;
}
++referenced_objects_[entry.referenced_object_number];
if (entry.ref_object_number) {
seen_ref_objects.insert(entry.ref_object_number);
}
}
}
void PushNewObject(const CPDF_Object* parent_object,
RetainPtr<const CPDF_Object> child_object) {
CHECK(parent_object);
CHECK(child_object);
const bool inserted = seen_objects_.insert(child_object).second;
if (!inserted) {
return;
}
const uint32_t child_object_number = child_object->GetObjNum();
if (child_object_number) {
object_number_map_[child_object] = child_object_number;
} else {
// This search can fail for inlined trailers.
auto it = object_number_map_.find(parent_object);
if (it != object_number_map_.end()) {
object_number_map_[child_object] = it->second;
}
}
object_queue_.push(std::move(child_object));
}
// Returns 0 if not found.
uint32_t GetObjectNumber(const CPDF_Object* object) const {
auto it = object_number_map_.find(object);
return it != object_number_map_.end() ? it->second : 0;
}
const CPDF_Document* const document_;
// Queue of objects to traverse.
// - Pointers in the queue are non-null.
// - The same pointer never enters the queue twice.
std::queue<RetainPtr<const CPDF_Object>> object_queue_;
// Map of objects to "top-level" object numbers. For inline objects, this is
// the ancestor object with an object number. The keys are non-null and the
// values are never 0.
// This is used to prevent self-references, as a single PDF object, with
// inlined objects, is represented by multiple CPDF_Objects.
std::map<const CPDF_Object*, uint32_t> object_number_map_;
// Tracks traversed objects to prevent duplicates from getting into
// `object_queue_` and `object_number_map_`.
std::set<const CPDF_Object*> seen_objects_;
// Tracks which PDF objects are referenced.
// Key: object number
// Value: number of times referenced
std::map<uint32_t, int> referenced_objects_;
};
} // namespace
std::set<uint32_t> GetObjectsWithReferences(const CPDF_Document* document) {
ObjectTreeTraverser traverser(document);
traverser.Traverse();
std::set<uint32_t> results;
for (const auto& it : traverser.referenced_objects()) {
results.insert(it.first);
}
return results;
}
std::set<uint32_t> GetObjectsWithMultipleReferences(
const CPDF_Document* document) {
ObjectTreeTraverser traverser(document);
traverser.Traverse();
std::set<uint32_t> results;
for (const auto& it : traverser.referenced_objects()) {
if (it.second > 1) {
results.insert(it.first);
}
}
return results;
}