blob: 500ca024394384c3fb5a059980ebf7998d51fa81 [file] [log] [blame]
// Copyright 2026 The PDFium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "core/fpdfapi/edit/cpdf_fontsubsetter.h"
#include <stdint.h>
#include <array>
#include <numeric>
#include <string>
#include <vector>
#include "core/fpdfapi/font/cpdf_font.h"
#include "core/fpdfapi/parser/cpdf_array.h"
#include "core/fpdfapi/parser/cpdf_dictionary.h"
#include "core/fpdfapi/parser/cpdf_document.h"
#include "core/fpdfapi/parser/cpdf_name.h"
#include "core/fpdfapi/parser/cpdf_number.h"
#include "core/fpdfapi/parser/cpdf_stream.h"
#include "core/fxcrt/bytestring.h"
#include "core/fxcrt/fx_extension.h"
#include "core/fxcrt/numerics/safe_conversions.h"
#include "core/fxcrt/retain_ptr.h"
#include "core/fxcrt/widestring.h"
#include "core/fxge/fx_font.h"
#include "public/fpdf_edit.h"
#include "public/fpdfview.h"
#include "testing/embedder_test.h"
#include "testing/fx_string_testhelpers.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "testing/utils/file_util.h"
#include "testing/utils/path_service.h"
using ::testing::IsEmpty;
using ::testing::Matcher;
using ::testing::UnorderedElementsAre;
namespace {
// Base font names that the matchers below expect to find after the subset tag
// and '+' separator in the generated font names.
constexpr char kArimoBaseFontName[] = "Arimo-Regular";
constexpr char kLohitTamilBaseFontName[] = "Lohit-Tamil";
constexpr char kNotoSansBaseFontName[] = "NotoSansCJKjp-Regular";
// Length of the subset tag plus the single '+' separator that follows it.
constexpr size_t kSubsettedFontPrefixWithPlusLength =
kSubsettedFontPrefixLength + 1;
// Reinterprets a public FPDF_DOCUMENT handle as a CPDF_Document pointer. This
// is cheating slightly to avoid a layering violation, since this file cannot
// include fpdfsdk/cpdfsdk_helpers.h.
CPDF_Document* CPDFDocumentFromFPDFDocument(FPDF_DOCUMENT document) {
  auto* const cpdf_document = reinterpret_cast<CPDF_Document*>(document);
  return cpdf_document;
}
// Reinterprets a public FPDF_FONT handle as a CPDF_Font pointer. Like the
// document conversion, this sidesteps including fpdfsdk helpers from here.
CPDF_Font* CPDFFontFromFPDFFont(FPDF_FONT font) {
  auto* const cpdf_font = reinterpret_cast<CPDF_Font*>(font);
  return cpdf_font;
}
// Resolves `file_name` to the path of a test font shipped in the third-party
// test_fonts directory.
std::string GetTestFontFilePath(const std::string& file_name) {
  const std::string relative_path = "test_fonts/test_fonts/" + file_name;
  return PathService::GetThirdPartyFilePath(relative_path);
}
// Builds the list of "new" object numbers handed to the subsetter in these
// tests: the consecutive values 1 through 100. The list is only consulted to
// decide whether an object number counts as new, so it may name more objects
// than the document actually holds. Picking a generously large range spares
// each test from spelling out the object numbers created while adding text.
std::vector<uint32_t> GetTestNewObjNums() {
  constexpr uint32_t kObjNumCount = 100;
  std::vector<uint32_t> obj_nums;
  obj_nums.reserve(kObjNumCount);
  for (uint32_t obj_num = 1; obj_num <= kObjNumCount; ++obj_num) {
    obj_nums.push_back(obj_num);
  }
  return obj_nums;
}
// Returns true when `actual_name` has the form "XXXXXX+BaseName": a tag of
// `kSubsettedFontPrefixLength` uppercase ASCII letters, exactly one '+', and
// then `expected_base_name`.
bool IsSubsetFontName(const ByteString& actual_name,
                      ByteStringView expected_base_name) {
  // The total length must be tag + '+' + base name, nothing more or less.
  const size_t expected_length =
      kSubsettedFontPrefixWithPlusLength + expected_base_name.GetLength();
  if (actual_name.GetLength() != expected_length) {
    return false;
  }

  // There must be exactly one '+', sitting right after the tag. Searching from
  // both ends and comparing the hits rules out any additional '+'.
  const std::optional<size_t> plus_from_front = actual_name.Find('+');
  const std::optional<size_t> plus_from_back = actual_name.ReverseFind('+');
  const bool has_single_plus_after_tag =
      plus_from_front.has_value() && plus_from_back.has_value() &&
      plus_from_front.value() == kSubsettedFontPrefixLength &&
      plus_from_back.value() == plus_from_front.value();
  if (!has_single_plus_after_tag) {
    return false;
  }

  // Every character of the tag must be an uppercase ASCII letter.
  for (char tag_char : actual_name.First(kSubsettedFontPrefixLength)) {
    if (!FXSYS_IsUpperASCII(tag_char)) {
      return false;
    }
  }

  // Whatever follows the '+' must be the expected base name.
  return actual_name.Substr(kSubsettedFontPrefixWithPlusLength) ==
         expected_base_name;
}
// See `StreamSizeIsWithinRange` and its relevant matchers.
bool IsMatchingStream(const CPDF_Object* obj,
size_t min_size,
size_t max_size) {
const CPDF_Stream* stream = ToStream(obj);
if (!stream) {
return false;
}
size_t actual_size = stream->GetRawSize();
return actual_size >= min_size && actual_size < max_size;
}
// Matches an override whose object is a stream with a raw size in
// [`min_size`, `max_size`), no "Subtype" entry, and an integer "Length1" entry
// equal to that raw size.
MATCHER_P2(StreamSizeIsWithinRange, min_size, max_size, "") {
  const auto& stream_obj = arg.second;
  if (!IsMatchingStream(stream_obj, min_size, max_size)) {
    return false;
  }

  const CPDF_Dictionary* stream_dict = stream_obj->GetDict();
  RetainPtr<const CPDF_Number> length1 = stream_dict->GetNumberFor("Length1");
  const bool has_integer_length1 = length1 && length1->IsInteger();
  if (stream_dict->KeyExist("Subtype") || !has_integer_length1) {
    return false;
  }

  // Reject negative values up front so the checked cast below cannot trip.
  const int length1_value = length1->GetInteger();
  if (length1_value < 0) {
    return false;
  }
  return pdfium::checked_cast<size_t>(length1_value) ==
         stream_obj->AsStream()->GetRawSize();
}
// Variant of `StreamSizeIsWithinRange` for OpenType CFF data: the stream must
// carry a "Subtype" of "OpenType" and must not have a "Length1" number.
MATCHER_P2(OpenTypeCFFStreamSizeIsWithinRange, min_size, max_size, "") {
  const auto& stream_obj = arg.second;
  if (!IsMatchingStream(stream_obj, min_size, max_size)) {
    return false;
  }

  const CPDF_Dictionary* stream_dict = stream_obj->GetDict();
  const ByteString subtype = stream_dict->GetNameFor("Subtype");
  if (subtype != "OpenType") {
    return false;
  }
  return !stream_dict->GetNumberFor("Length1");
}
// Matches the root font dictionary: Type "Font", Subtype "Type0", Encoding
// "Identity-H", a "DescendantFonts" entry, and a "BaseFont" that is a valid
// subset name built on `expected_base_name`.
MATCHER_P(IsRootFont, expected_base_name, "") {
  const CPDF_Dictionary* font_dict = ToDictionary(arg.second);
  if (!font_dict) {
    return false;
  }

  const bool is_type0_identity_h =
      font_dict->GetNameFor("Type") == "Font" &&
      font_dict->GetNameFor("Subtype") == "Type0" &&
      font_dict->GetNameFor("Encoding") == "Identity-H" &&
      font_dict->KeyExist("DescendantFonts");
  return is_type0_identity_h &&
         IsSubsetFontName(font_dict->GetNameFor("BaseFont"),
                          expected_base_name);
}
// See `IsCIDFont` and its relevant matchers.
bool IsMatchingCIDFont(const CPDF_Object* obj,
ByteStringView expected_base_name,
ByteStringView expected_subtype) {
const CPDF_Dictionary* dict = ToDictionary(obj);
if (!dict) {
return false;
}
if (dict->GetNameFor("Type") != "Font" || !dict->KeyExist("CIDSystemInfo")) {
return false;
}
return IsSubsetFontName(dict->GetNameFor("BaseFont"), expected_base_name) &&
dict->GetNameFor("Subtype") == expected_subtype;
}
// Matches a CID font dictionary with Subtype "CIDFontType2" whose "BaseFont"
// is a valid subset name built on `expected_base_name`.
MATCHER_P(IsCIDFont, expected_base_name, "") {
  return IsMatchingCIDFont(arg.second, expected_base_name, "CIDFontType2");
}
// Counterpart of `IsCIDFont` that expects a Subtype of "CIDFontType0".
MATCHER_P(IsOpenTypeCFFCIDFont, expected_base_name, "") {
  return IsMatchingCIDFont(arg.second, expected_base_name, "CIDFontType0");
}
// See `IsFontDescriptor` and its relevant matchers.
bool IsMatchingFontDescriptor(const CPDF_Object* obj,
ByteStringView expected_base_name) {
const CPDF_Dictionary* dict = ToDictionary(obj);
if (!dict) {
return false;
}
if (dict->GetNameFor("Type") != "FontDescriptor") {
return false;
}
return IsSubsetFontName(dict->GetNameFor("FontName"), expected_base_name);
}
// Matches a font descriptor with a valid subset "FontName" that has a
// "FontFile2" stream and no "FontFile3" entry.
MATCHER_P(IsFontDescriptor, expected_base_name, "") {
  const auto& descriptor_obj = arg.second;
  if (!IsMatchingFontDescriptor(descriptor_obj, expected_base_name)) {
    return false;
  }

  const CPDF_Dictionary* descriptor = descriptor_obj->AsDictionary();
  const bool has_font_file2 = !!descriptor->GetStreamFor("FontFile2");
  return has_font_file2 && !descriptor->KeyExist("FontFile3");
}
// Counterpart of `IsFontDescriptor` for OpenType CFF fonts: besides the subset
// "FontName", it requires integer "Flags" with bit 0x04 set and bit 0x20
// clear, a "FontFile3" stream, and no "FontFile2" entry.
MATCHER_P(IsOpenTypeCFFFontDescriptor, expected_base_name, "") {
  const auto& descriptor_obj = arg.second;
  if (!IsMatchingFontDescriptor(descriptor_obj, expected_base_name)) {
    return false;
  }

  const CPDF_Dictionary* descriptor = descriptor_obj->AsDictionary();
  RetainPtr<const CPDF_Number> flags = descriptor->GetNumberFor("Flags");
  if (!flags || !flags->IsInteger()) {
    return false;
  }

  // See ISO 32000-1:2008 section 9.8.2 "Font Descriptor Flags".
  constexpr int kSymbolicFlag = 0x04;
  constexpr int kNonsymbolicFlag = 0x20;
  const int flag_bits = flags->GetInteger();
  const bool is_symbolic_only = (flag_bits & kSymbolicFlag) != 0 &&
                                (flag_bits & kNonsymbolicFlag) == 0;
  if (!is_symbolic_only) {
    return false;
  }

  if (descriptor->KeyExist("FontFile2")) {
    return false;
  }
  return !!descriptor->GetStreamFor("FontFile3");
}
// Matches an override whose object is a non-empty array.
MATCHER(IsWidths, "") {
  const CPDF_Array* widths = ToArray(arg.second);
  if (!widths) {
    return false;
  }
  return !widths->IsEmpty();
}
// Matches an override whose object is a stream whose text contains the CIDInit
// procset lookup as well as the "begincmap" and "endcmap" markers.
MATCHER(IsToUnicode, "") {
  RetainPtr<const CPDF_Stream> cmap_stream = ToStream(arg.second);
  if (!cmap_stream) {
    return false;
  }

  const WideString cmap_text = cmap_stream->GetUnicodeText();
  if (!cmap_text.Contains(L"/CIDInit /ProcSet findresource begin")) {
    return false;
  }
  if (!cmap_text.Contains(L"begincmap")) {
    return false;
  }
  return cmap_text.Contains(L"endcmap");
}
} // namespace
// Pretty-printer for override objects, used to make failure output readable.
// Prints "nullptr" for a null object. Otherwise prints the object type
// followed by: selected name-like entries for a dictionary, the raw size for a
// stream, or the element count for an array.
void PrintTo(const RetainPtr<const CPDF_Object>& obj, std::ostream* os) {
  if (!obj) {
    *os << "nullptr";
    return;
  }

  *os << "(Obj type=" << obj->GetType();
  if (obj->IsDictionary()) {
    static constexpr std::array<const char*, 4> kKeys = {
        "Type", "Subtype", "BaseFont", "FontName"};
    const CPDF_Dictionary* dict = obj->AsDictionary();
    *os << " {";
    // Empty before the first printed entry, ", " before every later one.
    const char* separator = "";
    for (const char* key : kKeys) {
      if (!dict->KeyExist(key)) {
        continue;
      }
      *os << separator << key << "=" << dict->GetObjectFor(key)->GetString();
      separator = ", ";
    }
    *os << "}";
  } else if (obj->IsStream()) {
    *os << " size=" << obj->AsStream()->GetRawSize();
  } else if (obj->IsArray()) {
    *os << " [size=" << obj->AsArray()->size() << "]";
  }
  *os << ")";
}
// Test fixture that adds a helper for placing new text on a page of the
// fixture's document.
class CPDFFontSubsetterTest : public EmbedderTest {
 public:
  // Creates a text object in `font` at size 20 holding `text`, translates it
  // by (50, 200), inserts it into `page`, and regenerates the page content.
  //
  // Failing to create or transform the text object is a fatal failure, so
  // callers should wrap this in ASSERT_NO_FATAL_FAILURE().
  void InsertNewTextObject(const std::wstring& text,
                           FPDF_PAGE page,
                           FPDF_FONT font) {
    FPDF_PAGEOBJECT text_object =
        FPDFPageObj_CreateTextObj(document(), font, 20.0f);
    // Nothing below is meaningful without a text object. Stop here instead of
    // feeding a null handle to the remaining calls and burying the real
    // failure under follow-up ones.
    ASSERT_TRUE(text_object);

    ScopedFPDFWideString fpdf_text = GetFPDFWideString(text);
    EXPECT_TRUE(FPDFText_SetText(text_object, fpdf_text.get()));

    const FS_MATRIX matrix{1.0f, 0.0f, 0.0f, 1.0f, 50.0f, 200.0f};
    ASSERT_TRUE(FPDFPageObj_TransformF(text_object, &matrix));

    FPDFPage_InsertObject(page, text_object);
    EXPECT_TRUE(FPDFPage_GenerateContent(page));
  }
};
// A document without any new text must not produce object overrides, whether
// or not new object numbers are supplied.
TEST_F(CPDFFontSubsetterTest, NoNewText) {
  CreateEmptyDocument();
  ScopedFPDFPage page(FPDFPage_New(document(), 0, 400, 400));
  CPDF_FontSubsetter subsetter(CPDFDocumentFromFPDFDocument(document()));

  // Empty page: nothing to override for either list of object numbers.
  EXPECT_THAT(subsetter.GenerateObjectOverrides({}), IsEmpty());
  EXPECT_THAT(subsetter.GenerateObjectOverrides(GetTestNewObjNums()),
              IsEmpty());

  // A rectangle is not a text object, so it must not change the outcome.
  FPDF_PAGEOBJECT rect_object = FPDFPageObj_CreateNewRect(20, 100, 50, 50);
  FPDFPage_InsertObject(page.get(), rect_object);
  EXPECT_THAT(subsetter.GenerateObjectOverrides(GetTestNewObjNums()),
              IsEmpty());
}
// Text set in a standard font ("Helvetica") must not produce any overrides.
TEST_F(CPDFFontSubsetterTest, StandardFont) {
  CreateEmptyDocument();
  ScopedFPDFPage page(FPDFPage_New(document(), 0, 400, 400));

  ScopedFPDFFont helvetica(FPDFText_LoadStandardFont(document(), "Helvetica"));
  ASSERT_TRUE(helvetica);
  ASSERT_NO_FATAL_FAILURE(
      InsertNewTextObject(L"Hello world", page.get(), helvetica.get()));

  CPDF_FontSubsetter subsetter(CPDFDocumentFromFPDFDocument(document()));
  EXPECT_THAT(subsetter.GenerateObjectOverrides(GetTestNewObjNums()),
              IsEmpty());
}
// Subsetting the NotoSansCJKjp OpenType test font after adding one character
// must yield the six overrides matched below.
TEST_F(CPDFFontSubsetterTest, OpenType) {
  CreateEmptyDocument();
  ScopedFPDFPage page(FPDFPage_New(document(), 0, 400, 400));

  // Read the font file and pin its size, since the size bounds below are
  // expressed as fractions of it.
  const std::string otf_path =
      GetTestFontFilePath("NotoSansCJKjp-Regular.otf");
  ASSERT_FALSE(otf_path.empty());
  std::vector<uint8_t> otf_bytes = GetFileContents(otf_path.c_str());
  const size_t original_size = otf_bytes.size();
  ASSERT_EQ(16427228u, original_size);

  ScopedFPDFFont cid_font(FPDFText_LoadFont(document(), otf_bytes.data(),
                                            otf_bytes.size(),
                                            FPDF_FONT_TRUETYPE,
                                            /*cid=*/true));
  ASSERT_TRUE(cid_font);
  ASSERT_NO_FATAL_FAILURE(
      InsertNewTextObject(L"这", page.get(), cid_font.get()));

  CPDF_FontSubsetter subsetter(CPDFDocumentFromFPDFDocument(document()));
  auto overrides = subsetter.GenerateObjectOverrides(GetTestNewObjNums());
  ASSERT_EQ(6u, overrides.size());

  // Subset size is ~2.5% of the original font file, i.e. ~450 KB.
  const double min_subset_size = original_size * 0.02;
  const double max_subset_size = original_size * 0.03;
  EXPECT_THAT(overrides,
              UnorderedElementsAre(
                  OpenTypeCFFStreamSizeIsWithinRange(min_subset_size,
                                                     max_subset_size),
                  IsRootFont(kNotoSansBaseFontName),
                  IsOpenTypeCFFCIDFont(kNotoSansBaseFontName),
                  IsOpenTypeCFFFontDescriptor(kNotoSansBaseFontName),
                  IsWidths(), IsToUnicode()));
}
// Subsetting the Arimo TrueType test font after adding "Hello world" must
// yield the six overrides matched below.
TEST_F(CPDFFontSubsetterTest, TrueType) {
  CreateEmptyDocument();
  ScopedFPDFPage page(FPDFPage_New(document(), 0, 400, 400));

  // Read the font file and pin its size, since the size bounds below are
  // expressed as fractions of it.
  const std::string ttf_path = GetTestFontFilePath("Arimo-Regular.ttf");
  ASSERT_FALSE(ttf_path.empty());
  std::vector<uint8_t> ttf_bytes = GetFileContents(ttf_path.c_str());
  const size_t original_size = ttf_bytes.size();
  ASSERT_EQ(436180u, original_size);

  ScopedFPDFFont cid_font(FPDFText_LoadFont(document(), ttf_bytes.data(),
                                            ttf_bytes.size(),
                                            FPDF_FONT_TRUETYPE,
                                            /*cid=*/true));
  ASSERT_TRUE(cid_font);
  ASSERT_NO_FATAL_FAILURE(
      InsertNewTextObject(L"Hello world", page.get(), cid_font.get()));

  CPDF_FontSubsetter subsetter(CPDFDocumentFromFPDFDocument(document()));
  auto overrides = subsetter.GenerateObjectOverrides(GetTestNewObjNums());
  ASSERT_EQ(6u, overrides.size());

  // Subset size is ~3% of the original font file, i.e. ~13 KB.
  const double min_subset_size = original_size * 0.025;
  const double max_subset_size = original_size * 0.035;
  EXPECT_THAT(
      overrides,
      UnorderedElementsAre(
          StreamSizeIsWithinRange(min_subset_size, max_subset_size),
          IsRootFont(kArimoBaseFontName), IsCIDFont(kArimoBaseFontName),
          IsFontDescriptor(kArimoBaseFontName), IsWidths(), IsToUnicode()));
}
// Two text objects sharing one font must still yield a single set of six
// overrides for that font.
TEST_F(CPDFFontSubsetterTest, SingleFontMultipleTexts) {
  CreateEmptyDocument();
  ScopedFPDFPage page(FPDFPage_New(document(), 0, 400, 400));

  // Read the font file and pin its size, since the size bounds below are
  // expressed as fractions of it.
  const std::string ttf_path = GetTestFontFilePath("Arimo-Regular.ttf");
  ASSERT_FALSE(ttf_path.empty());
  std::vector<uint8_t> ttf_bytes = GetFileContents(ttf_path.c_str());
  const size_t original_size = ttf_bytes.size();
  ASSERT_EQ(436180u, original_size);

  ScopedFPDFFont cid_font(FPDFText_LoadFont(document(), ttf_bytes.data(),
                                            ttf_bytes.size(),
                                            FPDF_FONT_TRUETYPE,
                                            /*cid=*/true));
  ASSERT_TRUE(cid_font);

  // Both text objects use the same font.
  ASSERT_NO_FATAL_FAILURE(
      InsertNewTextObject(L"Abcdefg", page.get(), cid_font.get()));
  ASSERT_NO_FATAL_FAILURE(
      InsertNewTextObject(L"Hijklmnop", page.get(), cid_font.get()));

  CPDF_FontSubsetter subsetter(CPDFDocumentFromFPDFDocument(document()));
  auto overrides = subsetter.GenerateObjectOverrides(GetTestNewObjNums());
  ASSERT_EQ(6u, overrides.size());

  // Subset size is ~3.5% of the original font file, i.e. ~15 KB.
  const double min_subset_size = original_size * 0.03;
  const double max_subset_size = original_size * 0.04;
  EXPECT_THAT(
      overrides,
      UnorderedElementsAre(
          StreamSizeIsWithinRange(min_subset_size, max_subset_size),
          IsRootFont(kArimoBaseFontName), IsCIDFont(kArimoBaseFontName),
          IsFontDescriptor(kArimoBaseFontName), IsWidths(), IsToUnicode()));
}
// Two text objects in two different fonts must yield one set of six overrides
// per font, twelve in total.
TEST_F(CPDFFontSubsetterTest, MultipleFontsMultipleTexts) {
  CreateEmptyDocument();
  ScopedFPDFPage page(FPDFPage_New(document(), 0, 400, 400));

  const std::string tamil_path = GetTestFontFilePath("Lohit-Tamil.ttf");
  ASSERT_FALSE(tamil_path.empty());
  const std::string arimo_path = GetTestFontFilePath("Arimo-Regular.ttf");
  ASSERT_FALSE(arimo_path.empty());

  // Read both font files and pin their sizes, since the size bounds below are
  // expressed as fractions of them.
  std::vector<uint8_t> tamil_bytes = GetFileContents(tamil_path.c_str());
  const size_t tamil_original_size = tamil_bytes.size();
  ASSERT_EQ(48908u, tamil_original_size);
  std::vector<uint8_t> arimo_bytes = GetFileContents(arimo_path.c_str());
  const size_t arimo_original_size = arimo_bytes.size();
  ASSERT_EQ(436180u, arimo_original_size);

  ScopedFPDFFont tamil_font(FPDFText_LoadFont(document(), tamil_bytes.data(),
                                              tamil_bytes.size(),
                                              FPDF_FONT_TRUETYPE,
                                              /*cid=*/true));
  ASSERT_TRUE(tamil_font);
  ScopedFPDFFont arimo_font(FPDFText_LoadFont(document(), arimo_bytes.data(),
                                              arimo_bytes.size(),
                                              FPDF_FONT_TRUETYPE,
                                              /*cid=*/true));
  ASSERT_TRUE(arimo_font);

  ASSERT_NO_FATAL_FAILURE(
      InsertNewTextObject(L"வணக்கம்", page.get(), tamil_font.get()));
  ASSERT_NO_FATAL_FAILURE(
      InsertNewTextObject(L"Goodbye", page.get(), arimo_font.get()));

  CPDF_FontSubsetter subsetter(CPDFDocumentFromFPDFDocument(document()));
  auto overrides = subsetter.GenerateObjectOverrides(GetTestNewObjNums());
  ASSERT_EQ(12u, overrides.size());

  // Subset size for the Lohit-Tamil font is ~6% of the original file, i.e.
  // ~3 KB. Subset size for the Arimo font is ~3% of the original file, i.e.
  // ~13.3 KB.
  const double tamil_min_size = tamil_original_size * 0.055;
  const double tamil_max_size = tamil_original_size * 0.065;
  const double arimo_min_size = arimo_original_size * 0.025;
  const double arimo_max_size = arimo_original_size * 0.035;
  EXPECT_THAT(
      overrides,
      UnorderedElementsAre(
          StreamSizeIsWithinRange(tamil_min_size, tamil_max_size),
          IsRootFont(kLohitTamilBaseFontName),
          IsCIDFont(kLohitTamilBaseFontName),
          IsFontDescriptor(kLohitTamilBaseFontName), IsWidths(), IsToUnicode(),
          StreamSizeIsWithinRange(arimo_min_size, arimo_max_size),
          IsRootFont(kArimoBaseFontName), IsCIDFont(kArimoBaseFontName),
          IsFontDescriptor(kArimoBaseFontName), IsWidths(), IsToUnicode()));
}
// A font whose "BaseFont" already carries a subset prefix ("AAAAAA+") must
// still end up with exactly one prefix on the base name: the matchers below
// reject any name containing more than one '+'.
TEST_F(CPDFFontSubsetterTest, ReplaceExistingPrefix) {
  CreateEmptyDocument();
  ScopedFPDFPage page(FPDFPage_New(document(), 0, 400, 400));
  const std::string font_path = GetTestFontFilePath("Arimo-Regular.ttf");
  ASSERT_FALSE(font_path.empty());

  // The file size for `font_path` is ~436 KB.
  std::vector<uint8_t> font_data = GetFileContents(font_path.c_str());
  const size_t original_size = font_data.size();
  ASSERT_GT(original_size, 0u);

  ScopedFPDFFont font(FPDFText_LoadFont(document(), font_data.data(),
                                        font_data.size(), FPDF_FONT_TRUETYPE,
                                        /*cid=*/true));
  // The font is dereferenced right below, so a failed load must stop the test
  // here rather than turn into a null dereference.
  ASSERT_TRUE(font);

  // Manually insert an existing prefix to the font name.
  CPDF_Font* cfont = CPDFFontFromFPDFFont(font.get());
  RetainPtr<CPDF_Dictionary> font_dict = cfont->GetMutableFontDict();
  ASSERT_TRUE(font_dict);
  font_dict->SetNewFor<CPDF_Name>("BaseFont", "AAAAAA+Arimo-Regular");

  ASSERT_NO_FATAL_FAILURE(
      InsertNewTextObject(L"Hello world", page.get(), font.get()));

  CPDF_FontSubsetter subsetter(CPDFDocumentFromFPDFDocument(document()));
  auto overrides = subsetter.GenerateObjectOverrides(GetTestNewObjNums());
  ASSERT_EQ(6u, overrides.size());

  // Subset size is ~3% of the original font file, i.e. ~13 KB.
  EXPECT_THAT(
      overrides,
      UnorderedElementsAre(
          StreamSizeIsWithinRange(original_size * 0.025, original_size * 0.035),
          IsRootFont(kArimoBaseFontName), IsCIDFont(kArimoBaseFontName),
          IsFontDescriptor(kArimoBaseFontName), IsWidths(), IsToUnicode()));
}