Keep track of Font and XObject resources For CPDF_ImageObjects, CPDF_FormObjects, and CPDF_TextObjects, keep track of the Font/XObject resources associated with them. Using this data, CPDF_PageContentGenerator can detect and remove unused resources from resources dictionaries. Then CPDF_Creator can detect the unreferenced resources and remove them when saving. Bug: chromium:1428724,pdfium:1409 Change-Id: I510e6c51eda28535ed00e87b6e10971f7178122c Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/105613 Commit-Queue: Lei Zhang <thestig@chromium.org> Reviewed-by: Tom Sepez <tsepez@chromium.org>
diff --git a/core/fpdfapi/edit/cpdf_pagecontentgenerator.cpp b/core/fpdfapi/edit/cpdf_pagecontentgenerator.cpp index 914bec0..c149194 100644 --- a/core/fpdfapi/edit/cpdf_pagecontentgenerator.cpp +++ b/core/fpdfapi/edit/cpdf_pagecontentgenerator.cpp
@@ -45,6 +45,10 @@ namespace { +// Key: The resource type. +// Value: The resource names of a given type. +using ResourcesMap = std::map<ByteString, std::set<ByteString>>; + bool GetColor(const CPDF_Color* pColor, float* rgb) { int intRGB[3]; if (!pColor || !pColor->IsColorSpaceRGB() || @@ -57,6 +61,59 @@ return true; } +void RecordPageObjectResourceUsage(const CPDF_PageObject* page_object, + ResourcesMap& seen_resources) { + const ByteString& resource_name = page_object->GetResourceName(); + if (!resource_name.IsEmpty()) { + switch (page_object->GetType()) { + case CPDF_PageObject::Type::kText: + seen_resources["Font"].insert(resource_name); + break; + case CPDF_PageObject::Type::kImage: + case CPDF_PageObject::Type::kForm: + seen_resources["XObject"].insert(resource_name); + break; + case CPDF_PageObject::Type::kPath: + break; + case CPDF_PageObject::Type::kShading: + break; + } + } +} + +void RemoveUnusedResources(RetainPtr<CPDF_Dictionary> resources_dict, + const ResourcesMap& resources_in_use) { + // TODO(thestig): Remove other unused resource types, like ExtGState. + static constexpr const char* kResourceKeys[] = {"Font", "XObject"}; + for (const char* resource_key : kResourceKeys) { + RetainPtr<CPDF_Dictionary> resource_dict = + resources_dict->GetMutableDictFor(resource_key); + if (!resource_dict) { + continue; + } + + std::vector<ByteString> keys; + { + CPDF_DictionaryLocker resource_dict_locker(resource_dict); + for (auto& it : resource_dict_locker) { + keys.push_back(it.first); + } + } + + auto it = resources_in_use.find(resource_key); + const std::set<ByteString>* resource_in_use_of_current_type = + it != resources_in_use.end() ? &it->second : nullptr; + for (const ByteString& key : keys) { + if (resource_in_use_of_current_type && + pdfium::Contains(*resource_in_use_of_current_type, key)) { + continue; + } + + resource_dict->RemoveFor(key.AsStringView()); + } + } +} + } // namespace CPDF_PageContentGenerator::CPDF_PageContentGenerator( @@ -162,6 +219,18 @@ page_content_manager.UpdateStream(stream_index, buf); } + + RetainPtr<CPDF_Dictionary> resources = m_pObjHolder->GetMutableResources(); + if (!resources) { + return; + } + + ResourcesMap seen_resources; + for (auto& page_object : m_pageObjects) { + RecordPageObjectResourceUsage(page_object, seen_resources); + } + + RemoveUnusedResources(std::move(resources), seen_resources); } ByteString CPDF_PageContentGenerator::RealizeResource( @@ -313,6 +382,8 @@ pImage->ConvertStreamToIndirectObject(); ByteString name = RealizeResource(pStream, "XObject"); + pImageObj->SetResourceName(name); + if (bWasInline) { auto* pPageData = CPDF_DocPageData::FromDocument(m_pDocument); pImageObj->SetImage(pPageData->GetImage(pStream->GetObjNum())); @@ -332,10 +403,11 @@ if (!pStream) return; + ByteString name = RealizeResource(pStream.Get(), "XObject"); + pFormObj->SetResourceName(name); + *buf << "q\n"; WriteMatrix(*buf, pFormObj->form_matrix()) << " cm "; - - ByteString name = RealizeResource(pStream.Get(), "XObject"); *buf << "/" << PDF_NameEncode(name) << " Do Q\n"; } @@ -554,10 +626,10 @@ } data.baseFont = pFont->GetBaseFontName(); - ByteString dictName; + ByteString dict_name; absl::optional<ByteString> maybe_name = m_pObjHolder->FontsMapSearch(data); if (maybe_name.has_value()) { - dictName = std::move(maybe_name.value()); + dict_name = std::move(maybe_name.value()); } else { RetainPtr<const CPDF_Object> pIndirectFont = pFont->GetFontDict(); if (pIndirectFont->IsInline()) { @@ -573,10 +645,12 @@ m_pDocument->AddIndirectObject(pFontDict); pIndirectFont = std::move(pFontDict); } - dictName = RealizeResource(std::move(pIndirectFont), "Font"); - m_pObjHolder->FontsMapInsert(data, dictName); + dict_name = RealizeResource(std::move(pIndirectFont), "Font"); + m_pObjHolder->FontsMapInsert(data, dict_name); } - *buf << "/" << PDF_NameEncode(dictName) << " "; + pTextObj->SetResourceName(dict_name); + + *buf << "/" << PDF_NameEncode(dict_name) << " "; WriteFloat(*buf, pTextObj->GetFontSize()) << " Tf "; *buf << static_cast<int>(pTextObj->GetTextRenderMode()) << " Tr "; ByteString text;
diff --git a/core/fpdfapi/font/cpdf_font.h b/core/fpdfapi/font/cpdf_font.h index dee1724..a74cba9 100644 --- a/core/fpdfapi/font/cpdf_font.h +++ b/core/fpdfapi/font/cpdf_font.h
@@ -133,6 +133,9 @@ CFX_Font* GetFontFallback(int position); + const ByteString& GetResourceName() const { return m_ResourceName; } + void SetResourceName(const ByteString& name) { m_ResourceName = name; } + protected: CPDF_Font(CPDF_Document* pDocument, RetainPtr<CPDF_Dictionary> pFontDict); @@ -163,6 +166,7 @@ void CheckFontMetrics(); UnownedPtr<CPDF_Document> const m_pDocument; + ByteString m_ResourceName; // The resource name for this font. CFX_Font m_Font; std::vector<std::unique_ptr<CFX_Font>> m_FontFallbacks; RetainPtr<CPDF_StreamAcc> m_pFontFile;
diff --git a/core/fpdfapi/page/cpdf_pageobject.h b/core/fpdfapi/page/cpdf_pageobject.h index b5f5e3f..3a0b87a 100644 --- a/core/fpdfapi/page/cpdf_pageobject.h +++ b/core/fpdfapi/page/cpdf_pageobject.h
@@ -11,6 +11,7 @@ #include "core/fpdfapi/page/cpdf_contentmarks.h" #include "core/fpdfapi/page/cpdf_graphicstates.h" +#include "core/fxcrt/bytestring.h" #include "core/fxcrt/fx_coordinates.h" class CPDF_FormObject; @@ -88,6 +89,11 @@ m_ContentStream = new_content_stream; } + const ByteString& GetResourceName() const { return m_ResourceName; } + void SetResourceName(const ByteString& resource_name) { + m_ResourceName = resource_name; + } + protected: void CopyData(const CPDF_PageObject* pSrcObject); @@ -97,6 +103,7 @@ CPDF_ContentMarks m_ContentMarks; bool m_bDirty = false; int32_t m_ContentStream; + ByteString m_ResourceName; // The resource name for this object. }; #endif // CORE_FPDFAPI_PAGE_CPDF_PAGEOBJECT_H_
diff --git a/core/fpdfapi/page/cpdf_streamcontentparser.cpp b/core/fpdfapi/page/cpdf_streamcontentparser.cpp index 4226754..30c5b90 100644 --- a/core/fpdfapi/page/cpdf_streamcontentparser.cpp +++ b/core/fpdfapi/page/cpdf_streamcontentparser.cpp
@@ -649,7 +649,8 @@ if (m_pSyntax->GetWord() == "EI") break; } - CPDF_ImageObject* pObj = AddImageFromStream(std::move(pStream)); + CPDF_ImageObject* pObj = + AddImageFromStream(std::move(pStream), /*resource_name=*/""); // Record the bounding box of this image, so rendering code can draw it // properly. if (pObj && pObj->GetImage()->IsMask()) @@ -741,14 +742,15 @@ type = pXObject->GetDict()->GetByteStringFor("Subtype"); if (type == "Form") { - AddForm(std::move(pXObject)); + AddForm(std::move(pXObject), name); return; } if (type == "Image") { CPDF_ImageObject* pObj = - pXObject->IsInline() ? AddImageFromStream(ToStream(pXObject->Clone())) - : AddImageFromStreamObjNum(pXObject->GetObjNum()); + pXObject->IsInline() + ? AddImageFromStream(ToStream(pXObject->Clone()), name) + : AddImageFromStreamObjNum(pXObject->GetObjNum(), name); m_LastImageName = std::move(name); if (pObj) { @@ -759,7 +761,8 @@ } } -void CPDF_StreamContentParser::AddForm(RetainPtr<CPDF_Stream> pStream) { +void CPDF_StreamContentParser::AddForm(RetainPtr<CPDF_Stream> pStream, + const ByteString& name) { CPDF_AllStates status; status.m_GeneralState = m_pCurStates->m_GeneralState; status.m_GraphState = m_pCurStates->m_GraphState; @@ -772,6 +775,7 @@ CFX_Matrix matrix = m_pCurStates->m_CTM * m_mtContentToUser; auto pFormObj = std::make_unique<CPDF_FormObject>(GetCurrentStreamIndex(), std::move(form), matrix); + pFormObj->SetResourceName(name); if (!m_pObjectHolder->BackgroundAlphaNeeded() && pFormObj->form()->BackgroundAlphaNeeded()) { m_pObjectHolder->SetBackgroundAlphaNeeded(true); @@ -782,11 +786,13 @@ } CPDF_ImageObject* CPDF_StreamContentParser::AddImageFromStream( - RetainPtr<CPDF_Stream> pStream) { + RetainPtr<CPDF_Stream> pStream, + const ByteString& name) { if (!pStream) return nullptr; auto pImageObj = std::make_unique<CPDF_ImageObject>(GetCurrentStreamIndex()); + pImageObj->SetResourceName(name); pImageObj->SetImage( pdfium::MakeRetain<CPDF_Image>(m_pDocument, std::move(pStream))); @@ -794,8 +800,10 @@ } CPDF_ImageObject* CPDF_StreamContentParser::AddImageFromStreamObjNum( - uint32_t stream_obj_num) { + uint32_t stream_obj_num, + const ByteString& name) { auto pImageObj = std::make_unique<CPDF_ImageObject>(GetCurrentStreamIndex()); + pImageObj->SetResourceName(name); pImageObj->SetImage( CPDF_DocPageData::FromDocument(m_pDocument)->GetImage(stream_obj_num)); @@ -806,6 +814,7 @@ DCHECK(m_pLastImage); auto pImageObj = std::make_unique<CPDF_ImageObject>(GetCurrentStreamIndex()); + pImageObj->SetResourceName(m_LastImageName); pImageObj->SetImage(CPDF_DocPageData::FromDocument(m_pDocument) ->GetImage(m_pLastImage->GetStream()->GetObjNum())); @@ -1145,9 +1154,14 @@ } RetainPtr<CPDF_Font> pFont = CPDF_DocPageData::FromDocument(m_pDocument) ->GetFont(std::move(pFontDict)); - if (pFont && pFont->IsType3Font()) { - pFont->AsType3Font()->SetPageResources(m_pResources.Get()); - pFont->AsType3Font()->CheckType3FontMetrics(); + if (pFont) { + // Save `name` for later retrieval by the CPDF_TextObject that uses the + // font. + pFont->SetResourceName(name); + if (pFont->IsType3Font()) { + pFont->AsType3Font()->SetPageResources(m_pResources.Get()); + pFont->AsType3Font()->CheckType3FontMetrics(); + } } return pFont; } @@ -1222,6 +1236,7 @@ : m_pCurStates->m_TextState.GetTextMode(); { auto pText = std::make_unique<CPDF_TextObject>(GetCurrentStreamIndex()); + pText->SetResourceName(pFont->GetResourceName()); SetGraphicStates(pText.get(), true, true, true); if (TextRenderingModeIsStrokeMode(text_mode)) { pdfium::span<float> pCTM = pText->m_TextState.GetMutableCTM();
diff --git a/core/fpdfapi/page/cpdf_streamcontentparser.h b/core/fpdfapi/page/cpdf_streamcontentparser.h index 86da47d..276dc72 100644 --- a/core/fpdfapi/page/cpdf_streamcontentparser.h +++ b/core/fpdfapi/page/cpdf_streamcontentparser.h
@@ -119,11 +119,13 @@ void AddPathRect(float x, float y, float w, float h); void AddPathObject(CFX_FillRenderOptions::FillType fill_type, RenderType render_type); - CPDF_ImageObject* AddImageFromStream(RetainPtr<CPDF_Stream> pStream); - CPDF_ImageObject* AddImageFromStreamObjNum(uint32_t stream_obj_num); + CPDF_ImageObject* AddImageFromStream(RetainPtr<CPDF_Stream> pStream, + const ByteString& name); + CPDF_ImageObject* AddImageFromStreamObjNum(uint32_t stream_obj_num, + const ByteString& name); CPDF_ImageObject* AddLastImage(); - void AddForm(RetainPtr<CPDF_Stream> pStream); + void AddForm(RetainPtr<CPDF_Stream> pStream, const ByteString& name); void SetGraphicStates(CPDF_PageObject* pObj, bool bColor, bool bText,
diff --git a/fpdfsdk/fpdf_edit_embeddertest.cpp b/fpdfsdk/fpdf_edit_embeddertest.cpp index 4e7c7fd..21c3512 100644 --- a/fpdfsdk/fpdf_edit_embeddertest.cpp +++ b/fpdfsdk/fpdf_edit_embeddertest.cpp
@@ -38,6 +38,7 @@ using pdfium::HelloWorldChecksum; using testing::HasSubstr; +using testing::Not; namespace { @@ -928,10 +929,9 @@ VerifySavedDocument(200, 200, FirstRemovedChecksum()); // Verify removed/renamed resources are no longer there. - // TODO(crbug.com/1428724): Negate these checks. - EXPECT_THAT(GetString(), HasSubstr("/F1")); - EXPECT_THAT(GetString(), HasSubstr("/F2")); - EXPECT_THAT(GetString(), HasSubstr("/Times-Roman")); + EXPECT_THAT(GetString(), Not(HasSubstr("/F1"))); + EXPECT_THAT(GetString(), Not(HasSubstr("/F2"))); + EXPECT_THAT(GetString(), Not(HasSubstr("/Times-Roman"))); UnloadPage(page); } @@ -1114,10 +1114,8 @@ std::vector<std::string> split_saved_data = StringSplit(GetString(), '\n'); // Verify removed/renamed resources are in the save PDF the correct number of // times. - // TODO(crbug.com/1428724): Should only be in the PDF once. - EXPECT_THAT(split_saved_data, Contains(HasSubstr("/F1")).Times(2)); - EXPECT_THAT(split_saved_data, Contains(HasSubstr("/F2")).Times(2)); - + EXPECT_THAT(split_saved_data, Contains(HasSubstr("/F1")).Times(1)); + EXPECT_THAT(split_saved_data, Contains(HasSubstr("/F2")).Times(1)); EXPECT_THAT(split_saved_data, Contains(HasSubstr("/Times-Roman")).Times(1)); UnloadPage(page1);
diff --git a/fpdfsdk/fpdf_save_embeddertest.cpp b/fpdfsdk/fpdf_save_embeddertest.cpp index c18566a..3f5efd8 100644 --- a/fpdfsdk/fpdf_save_embeddertest.cpp +++ b/fpdfsdk/fpdf_save_embeddertest.cpp
@@ -157,10 +157,8 @@ EXPECT_THAT(GetString(), StartsWith("%PDF-1.7\r\n")); EXPECT_THAT(GetString(), HasSubstr("/Root ")); - // TODO(crbug.com/pdfium/1409): The PDF should not have any images, given it - // is rendering blank. The file size should also be a lot smaller. - EXPECT_THAT(GetString(), HasSubstr("/Image")); - EXPECT_LT(GetString().size(), 1300u); + EXPECT_THAT(GetString(), Not(HasSubstr("/Image"))); + EXPECT_LT(GetString().size(), 600u); } #ifdef PDF_ENABLE_XFA