blob: 6748c3ec1e9f4e3a1ea07359653dd9c8fd277147 [file] [log] [blame] [edit]
// Copyright 2026 The PDFium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "core/fpdfapi/edit/cpdf_fontsubsetter.h"
#include <hb-subset.h>
#include <hb.h>
#include <stdint.h>
#include <algorithm>
#include <map>
#include <memory>
#include <set>
#include <utility>
#include <vector>
#include "core/fpdfapi/font/cpdf_font.h"
#include "core/fpdfapi/page/cpdf_page.h"
#include "core/fpdfapi/page/cpdf_textobject.h"
#include "core/fpdfapi/parser/cpdf_array.h"
#include "core/fpdfapi/parser/cpdf_dictionary.h"
#include "core/fpdfapi/parser/cpdf_document.h"
#include "core/fpdfapi/parser/cpdf_number.h"
#include "core/fpdfapi/parser/cpdf_stream.h"
#include "core/fpdfapi/parser/cpdf_stream_acc.h"
#include "core/fxcrt/check.h"
#include "core/fxcrt/compiler_specific.h"
#include "core/fxcrt/data_vector.h"
#include "core/fxcrt/retain_ptr.h"
#include "core/fxcrt/span.h"
namespace {
template <auto DestroyFunction>
struct HBDeleter {
template <typename T>
void operator()(T* ptr) const {
DestroyFunction(ptr);
}
};
using ScopedHBBlob = std::unique_ptr<hb_blob_t, HBDeleter<hb_blob_destroy>>;
using ScopedHBFace = std::unique_ptr<hb_face_t, HBDeleter<hb_face_destroy>>;
using ScopedHBSubsetInput =
std::unique_ptr<hb_subset_input_t, HBDeleter<hb_subset_input_destroy>>;
DataVector<uint8_t> GenerateFontSubset(CPDF_Document* doc,
pdfium::span<const uint8_t> font_data,
const std::set<uint32_t>& gids) {
// Wrap the data.
ScopedHBBlob blob(
hb_blob_create_or_fail(reinterpret_cast<const char*>(font_data.data()),
static_cast<uint32_t>(font_data.size()),
HB_MEMORY_MODE_READONLY, nullptr, nullptr));
if (!blob) {
return {};
}
ScopedHBSubsetInput input(hb_subset_input_create_or_fail());
if (!input) {
return {};
}
hb_subset_input_set_flags(input.get(), HB_SUBSET_FLAGS_RETAIN_GIDS |
HB_SUBSET_FLAGS_NOTDEF_OUTLINE);
hb_set_t* glyphs = hb_subset_input_glyph_set(input.get());
for (uint32_t gid : gids) {
hb_set_add(glyphs, gid);
}
ScopedHBFace face(hb_face_create(blob.get(), 0));
ScopedHBFace subset_face(hb_subset_or_fail(face.get(), input.get()));
if (!subset_face) {
return {};
}
ScopedHBBlob subset_blob(hb_face_reference_blob(subset_face.get()));
unsigned int out_len;
const char* out_data = hb_blob_get_data(subset_blob.get(), &out_len);
if (!out_data || out_len == 0) {
return {};
}
// SAFETY: HarfBuzz guarantees the correct length from hb_blob_get_length.
return DataVector<uint8_t>(out_data, UNSAFE_BUFFERS(out_data + out_len));
}
} // namespace
CPDF_FontSubsetter::CPDF_FontSubsetter(CPDF_Document* doc) : doc_(doc) {}
CPDF_FontSubsetter::~CPDF_FontSubsetter() = default;
std::map<uint32_t, RetainPtr<const CPDF_Object>>
CPDF_FontSubsetter::GenerateObjectOverrides(
pdfium::span<const uint32_t> new_obj_nums) {
if (new_obj_nums.empty()) {
return {};
}
candidates_.clear();
CollectSubsetCandidates(new_obj_nums);
std::map<uint32_t, RetainPtr<const CPDF_Object>> overrides;
for (auto& [obj_num, candidate] : candidates_) {
auto original_stream_acc =
pdfium::MakeRetain<CPDF_StreamAcc>(candidate.font_stream);
original_stream_acc->LoadAllDataFiltered();
auto original_stream_span = original_stream_acc->GetSpan();
DataVector<uint8_t> subsetted_font_data =
GenerateFontSubset(doc_, original_stream_span, candidate.used_gids);
if (subsetted_font_data.empty()) {
continue;
}
// Override the font file stream.
// TODO(crbug.com/476127152): Correctly support OpenType CFF.
auto subsetted_font_dict = pdfium::MakeRetain<CPDF_Dictionary>();
// TrueType fonts requires a Length1 entry.
subsetted_font_dict->SetNewFor<CPDF_Number>(
"Length1", static_cast<int>(subsetted_font_data.size()));
overrides[obj_num] = pdfium::MakeRetain<CPDF_Stream>(
std::move(subsetted_font_data), std::move(subsetted_font_dict));
}
return overrides;
}
CPDF_FontSubsetter::SubsetCandidate::SubsetCandidate() = default;
CPDF_FontSubsetter::SubsetCandidate::~SubsetCandidate() = default;
void CPDF_FontSubsetter::CollectSubsetCandidates(
pdfium::span<const uint32_t> new_obj_nums) {
for (int i = 0; i < doc_->GetPageCount(); ++i) {
RetainPtr<CPDF_Dictionary> page_dict = doc_->GetMutablePageDictionary(i);
if (!page_dict) {
continue;
}
auto page = pdfium::MakeRetain<CPDF_Page>(doc_, std::move(page_dict));
page->ParseContent();
CollectSubsetCandidatesFromPage(page, new_obj_nums);
}
}
void CPDF_FontSubsetter::CollectSubsetCandidatesFromPage(
CPDF_Page* page,
pdfium::span<const uint32_t> new_obj_nums) {
for (const auto& page_obj : *page) {
const CPDF_TextObject* text = page_obj->AsText();
if (!text) {
continue;
}
RetainPtr<CPDF_Font> font = text->GetFont();
RetainPtr<const CPDF_Dictionary> root_font = font->GetFontDict();
if (!std::ranges::binary_search(new_obj_nums, root_font->GetObjNum())) {
continue;
}
RetainPtr<const CPDF_Dictionary> descriptor;
if (font->IsCIDFont()) {
RetainPtr<const CPDF_Array> descendants =
root_font->GetArrayFor("DescendantFonts");
CHECK(descendants);
RetainPtr<const CPDF_Dictionary> cid_font = descendants->GetDictAt(0);
CHECK(cid_font);
descriptor = cid_font->GetDictFor("FontDescriptor");
} else {
descriptor = root_font->GetDictFor("FontDescriptor");
}
if (!descriptor) {
continue;
}
RetainPtr<const CPDF_Stream> font_stream =
descriptor->GetStreamFor("FontFile2");
if (!font_stream) {
continue;
}
uint32_t obj_num = font_stream->GetObjNum();
auto& candidate = candidates_[obj_num];
if (!candidate.font_stream) {
candidate.font_stream = font_stream;
}
AddUsedText(text, candidate);
}
}
void CPDF_FontSubsetter::AddUsedText(const CPDF_TextObject* text,
SubsetCandidate& candidate) {
CPDF_Font* font = text->GetFont();
const std::vector<uint32_t>& char_codes = text->GetCharCodes();
std::set<uint32_t>& used_gids = candidate.used_gids;
for (uint32_t char_code : char_codes) {
int gid = font->GlyphFromCharCode(char_code, /*pVertGlyph=*/nullptr);
if (gid != -1) {
used_gids.insert(static_cast<uint32_t>(gid));
}
}
}