Keep track of Font and XObject resources
For CPDF_ImageObjects, CPDF_FormObjects, and CPDF_TextObjects, keep
track of the Font/XObject resources associated with them. Using this
data, CPDF_PageContentGenerator can detect and remove unused resources
from resources dictionaries. Then CPDF_Creator can detect the
unreferenced resources and remove them when saving.
Bug: chromium:1428724,pdfium:1409
Change-Id: I510e6c51eda28535ed00e87b6e10971f7178122c
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/105613
Commit-Queue: Lei Zhang <thestig@chromium.org>
Reviewed-by: Tom Sepez <tsepez@chromium.org>
diff --git a/core/fpdfapi/edit/cpdf_pagecontentgenerator.cpp b/core/fpdfapi/edit/cpdf_pagecontentgenerator.cpp
index 914bec0..c149194 100644
--- a/core/fpdfapi/edit/cpdf_pagecontentgenerator.cpp
+++ b/core/fpdfapi/edit/cpdf_pagecontentgenerator.cpp
@@ -45,6 +45,10 @@
namespace {
+// Key: The resource type.
+// Value: The resource names of a given type.
+using ResourcesMap = std::map<ByteString, std::set<ByteString>>;
+
bool GetColor(const CPDF_Color* pColor, float* rgb) {
int intRGB[3];
if (!pColor || !pColor->IsColorSpaceRGB() ||
@@ -57,6 +61,59 @@
return true;
}
+void RecordPageObjectResourceUsage(const CPDF_PageObject* page_object,
+ ResourcesMap& seen_resources) {
+ const ByteString& resource_name = page_object->GetResourceName();
+ if (!resource_name.IsEmpty()) {
+ switch (page_object->GetType()) {
+ case CPDF_PageObject::Type::kText:
+ seen_resources["Font"].insert(resource_name);
+ break;
+ case CPDF_PageObject::Type::kImage:
+ case CPDF_PageObject::Type::kForm:
+ seen_resources["XObject"].insert(resource_name);
+ break;
+ case CPDF_PageObject::Type::kPath:
+ break;
+ case CPDF_PageObject::Type::kShading:
+ break;
+ }
+ }
+}
+
+void RemoveUnusedResources(RetainPtr<CPDF_Dictionary> resources_dict,
+ const ResourcesMap& resources_in_use) {
+ // TODO(thestig): Remove other unused resource types, like ExtGState.
+ static constexpr const char* kResourceKeys[] = {"Font", "XObject"};
+ for (const char* resource_key : kResourceKeys) {
+ RetainPtr<CPDF_Dictionary> resource_dict =
+ resources_dict->GetMutableDictFor(resource_key);
+ if (!resource_dict) {
+ continue;
+ }
+
+ std::vector<ByteString> keys;
+ {
+ CPDF_DictionaryLocker resource_dict_locker(resource_dict);
+ for (auto& it : resource_dict_locker) {
+ keys.push_back(it.first);
+ }
+ }
+
+ auto it = resources_in_use.find(resource_key);
+ const std::set<ByteString>* resource_in_use_of_current_type =
+ it != resources_in_use.end() ? &it->second : nullptr;
+ for (const ByteString& key : keys) {
+ if (resource_in_use_of_current_type &&
+ pdfium::Contains(*resource_in_use_of_current_type, key)) {
+ continue;
+ }
+
+ resource_dict->RemoveFor(key.AsStringView());
+ }
+ }
+}
+
} // namespace
CPDF_PageContentGenerator::CPDF_PageContentGenerator(
@@ -162,6 +219,18 @@
page_content_manager.UpdateStream(stream_index, buf);
}
+
+ RetainPtr<CPDF_Dictionary> resources = m_pObjHolder->GetMutableResources();
+ if (!resources) {
+ return;
+ }
+
+ ResourcesMap seen_resources;
+ for (auto& page_object : m_pageObjects) {
+ RecordPageObjectResourceUsage(page_object, seen_resources);
+ }
+
+ RemoveUnusedResources(std::move(resources), seen_resources);
}
ByteString CPDF_PageContentGenerator::RealizeResource(
@@ -313,6 +382,8 @@
pImage->ConvertStreamToIndirectObject();
ByteString name = RealizeResource(pStream, "XObject");
+ pImageObj->SetResourceName(name);
+
if (bWasInline) {
auto* pPageData = CPDF_DocPageData::FromDocument(m_pDocument);
pImageObj->SetImage(pPageData->GetImage(pStream->GetObjNum()));
@@ -332,10 +403,11 @@
if (!pStream)
return;
+ ByteString name = RealizeResource(pStream.Get(), "XObject");
+ pFormObj->SetResourceName(name);
+
*buf << "q\n";
WriteMatrix(*buf, pFormObj->form_matrix()) << " cm ";
-
- ByteString name = RealizeResource(pStream.Get(), "XObject");
*buf << "/" << PDF_NameEncode(name) << " Do Q\n";
}
@@ -554,10 +626,10 @@
}
data.baseFont = pFont->GetBaseFontName();
- ByteString dictName;
+ ByteString dict_name;
absl::optional<ByteString> maybe_name = m_pObjHolder->FontsMapSearch(data);
if (maybe_name.has_value()) {
- dictName = std::move(maybe_name.value());
+ dict_name = std::move(maybe_name.value());
} else {
RetainPtr<const CPDF_Object> pIndirectFont = pFont->GetFontDict();
if (pIndirectFont->IsInline()) {
@@ -573,10 +645,12 @@
m_pDocument->AddIndirectObject(pFontDict);
pIndirectFont = std::move(pFontDict);
}
- dictName = RealizeResource(std::move(pIndirectFont), "Font");
- m_pObjHolder->FontsMapInsert(data, dictName);
+ dict_name = RealizeResource(std::move(pIndirectFont), "Font");
+ m_pObjHolder->FontsMapInsert(data, dict_name);
}
- *buf << "/" << PDF_NameEncode(dictName) << " ";
+ pTextObj->SetResourceName(dict_name);
+
+ *buf << "/" << PDF_NameEncode(dict_name) << " ";
WriteFloat(*buf, pTextObj->GetFontSize()) << " Tf ";
*buf << static_cast<int>(pTextObj->GetTextRenderMode()) << " Tr ";
ByteString text;
diff --git a/core/fpdfapi/font/cpdf_font.h b/core/fpdfapi/font/cpdf_font.h
index dee1724..a74cba9 100644
--- a/core/fpdfapi/font/cpdf_font.h
+++ b/core/fpdfapi/font/cpdf_font.h
@@ -133,6 +133,9 @@
CFX_Font* GetFontFallback(int position);
+ const ByteString& GetResourceName() const { return m_ResourceName; }
+ void SetResourceName(const ByteString& name) { m_ResourceName = name; }
+
protected:
CPDF_Font(CPDF_Document* pDocument, RetainPtr<CPDF_Dictionary> pFontDict);
@@ -163,6 +166,7 @@
void CheckFontMetrics();
UnownedPtr<CPDF_Document> const m_pDocument;
+ ByteString m_ResourceName; // The resource name for this font.
CFX_Font m_Font;
std::vector<std::unique_ptr<CFX_Font>> m_FontFallbacks;
RetainPtr<CPDF_StreamAcc> m_pFontFile;
diff --git a/core/fpdfapi/page/cpdf_pageobject.h b/core/fpdfapi/page/cpdf_pageobject.h
index b5f5e3f..3a0b87a 100644
--- a/core/fpdfapi/page/cpdf_pageobject.h
+++ b/core/fpdfapi/page/cpdf_pageobject.h
@@ -11,6 +11,7 @@
#include "core/fpdfapi/page/cpdf_contentmarks.h"
#include "core/fpdfapi/page/cpdf_graphicstates.h"
+#include "core/fxcrt/bytestring.h"
#include "core/fxcrt/fx_coordinates.h"
class CPDF_FormObject;
@@ -88,6 +89,11 @@
m_ContentStream = new_content_stream;
}
+ const ByteString& GetResourceName() const { return m_ResourceName; }
+ void SetResourceName(const ByteString& resource_name) {
+ m_ResourceName = resource_name;
+ }
+
protected:
void CopyData(const CPDF_PageObject* pSrcObject);
@@ -97,6 +103,7 @@
CPDF_ContentMarks m_ContentMarks;
bool m_bDirty = false;
int32_t m_ContentStream;
+ ByteString m_ResourceName; // The resource name for this object.
};
#endif // CORE_FPDFAPI_PAGE_CPDF_PAGEOBJECT_H_
diff --git a/core/fpdfapi/page/cpdf_streamcontentparser.cpp b/core/fpdfapi/page/cpdf_streamcontentparser.cpp
index 4226754..30c5b90 100644
--- a/core/fpdfapi/page/cpdf_streamcontentparser.cpp
+++ b/core/fpdfapi/page/cpdf_streamcontentparser.cpp
@@ -649,7 +649,8 @@
if (m_pSyntax->GetWord() == "EI")
break;
}
- CPDF_ImageObject* pObj = AddImageFromStream(std::move(pStream));
+ CPDF_ImageObject* pObj =
+ AddImageFromStream(std::move(pStream), /*resource_name=*/"");
// Record the bounding box of this image, so rendering code can draw it
// properly.
if (pObj && pObj->GetImage()->IsMask())
@@ -741,14 +742,15 @@
type = pXObject->GetDict()->GetByteStringFor("Subtype");
if (type == "Form") {
- AddForm(std::move(pXObject));
+ AddForm(std::move(pXObject), name);
return;
}
if (type == "Image") {
CPDF_ImageObject* pObj =
- pXObject->IsInline() ? AddImageFromStream(ToStream(pXObject->Clone()))
- : AddImageFromStreamObjNum(pXObject->GetObjNum());
+ pXObject->IsInline()
+ ? AddImageFromStream(ToStream(pXObject->Clone()), name)
+ : AddImageFromStreamObjNum(pXObject->GetObjNum(), name);
m_LastImageName = std::move(name);
if (pObj) {
@@ -759,7 +761,8 @@
}
}
-void CPDF_StreamContentParser::AddForm(RetainPtr<CPDF_Stream> pStream) {
+void CPDF_StreamContentParser::AddForm(RetainPtr<CPDF_Stream> pStream,
+ const ByteString& name) {
CPDF_AllStates status;
status.m_GeneralState = m_pCurStates->m_GeneralState;
status.m_GraphState = m_pCurStates->m_GraphState;
@@ -772,6 +775,7 @@
CFX_Matrix matrix = m_pCurStates->m_CTM * m_mtContentToUser;
auto pFormObj = std::make_unique<CPDF_FormObject>(GetCurrentStreamIndex(),
std::move(form), matrix);
+ pFormObj->SetResourceName(name);
if (!m_pObjectHolder->BackgroundAlphaNeeded() &&
pFormObj->form()->BackgroundAlphaNeeded()) {
m_pObjectHolder->SetBackgroundAlphaNeeded(true);
@@ -782,11 +786,13 @@
}
CPDF_ImageObject* CPDF_StreamContentParser::AddImageFromStream(
- RetainPtr<CPDF_Stream> pStream) {
+ RetainPtr<CPDF_Stream> pStream,
+ const ByteString& name) {
if (!pStream)
return nullptr;
auto pImageObj = std::make_unique<CPDF_ImageObject>(GetCurrentStreamIndex());
+ pImageObj->SetResourceName(name);
pImageObj->SetImage(
pdfium::MakeRetain<CPDF_Image>(m_pDocument, std::move(pStream)));
@@ -794,8 +800,10 @@
}
CPDF_ImageObject* CPDF_StreamContentParser::AddImageFromStreamObjNum(
- uint32_t stream_obj_num) {
+ uint32_t stream_obj_num,
+ const ByteString& name) {
auto pImageObj = std::make_unique<CPDF_ImageObject>(GetCurrentStreamIndex());
+ pImageObj->SetResourceName(name);
pImageObj->SetImage(
CPDF_DocPageData::FromDocument(m_pDocument)->GetImage(stream_obj_num));
@@ -806,6 +814,7 @@
DCHECK(m_pLastImage);
auto pImageObj = std::make_unique<CPDF_ImageObject>(GetCurrentStreamIndex());
+ pImageObj->SetResourceName(m_LastImageName);
pImageObj->SetImage(CPDF_DocPageData::FromDocument(m_pDocument)
->GetImage(m_pLastImage->GetStream()->GetObjNum()));
@@ -1145,9 +1154,14 @@
}
RetainPtr<CPDF_Font> pFont = CPDF_DocPageData::FromDocument(m_pDocument)
->GetFont(std::move(pFontDict));
- if (pFont && pFont->IsType3Font()) {
- pFont->AsType3Font()->SetPageResources(m_pResources.Get());
- pFont->AsType3Font()->CheckType3FontMetrics();
+ if (pFont) {
+ // Save `name` for later retrieval by the CPDF_TextObject that uses the
+ // font.
+ pFont->SetResourceName(name);
+ if (pFont->IsType3Font()) {
+ pFont->AsType3Font()->SetPageResources(m_pResources.Get());
+ pFont->AsType3Font()->CheckType3FontMetrics();
+ }
}
return pFont;
}
@@ -1222,6 +1236,7 @@
: m_pCurStates->m_TextState.GetTextMode();
{
auto pText = std::make_unique<CPDF_TextObject>(GetCurrentStreamIndex());
+ pText->SetResourceName(pFont->GetResourceName());
SetGraphicStates(pText.get(), true, true, true);
if (TextRenderingModeIsStrokeMode(text_mode)) {
pdfium::span<float> pCTM = pText->m_TextState.GetMutableCTM();
diff --git a/core/fpdfapi/page/cpdf_streamcontentparser.h b/core/fpdfapi/page/cpdf_streamcontentparser.h
index 86da47d..276dc72 100644
--- a/core/fpdfapi/page/cpdf_streamcontentparser.h
+++ b/core/fpdfapi/page/cpdf_streamcontentparser.h
@@ -119,11 +119,13 @@
void AddPathRect(float x, float y, float w, float h);
void AddPathObject(CFX_FillRenderOptions::FillType fill_type,
RenderType render_type);
- CPDF_ImageObject* AddImageFromStream(RetainPtr<CPDF_Stream> pStream);
- CPDF_ImageObject* AddImageFromStreamObjNum(uint32_t stream_obj_num);
+ CPDF_ImageObject* AddImageFromStream(RetainPtr<CPDF_Stream> pStream,
+ const ByteString& name);
+ CPDF_ImageObject* AddImageFromStreamObjNum(uint32_t stream_obj_num,
+ const ByteString& name);
CPDF_ImageObject* AddLastImage();
- void AddForm(RetainPtr<CPDF_Stream> pStream);
+ void AddForm(RetainPtr<CPDF_Stream> pStream, const ByteString& name);
void SetGraphicStates(CPDF_PageObject* pObj,
bool bColor,
bool bText,
diff --git a/fpdfsdk/fpdf_edit_embeddertest.cpp b/fpdfsdk/fpdf_edit_embeddertest.cpp
index 4e7c7fd..21c3512 100644
--- a/fpdfsdk/fpdf_edit_embeddertest.cpp
+++ b/fpdfsdk/fpdf_edit_embeddertest.cpp
@@ -38,6 +38,7 @@
using pdfium::HelloWorldChecksum;
using testing::HasSubstr;
+using testing::Not;
namespace {
@@ -928,10 +929,9 @@
VerifySavedDocument(200, 200, FirstRemovedChecksum());
// Verify removed/renamed resources are no longer there.
- // TODO(crbug.com/1428724): Negate these checks.
- EXPECT_THAT(GetString(), HasSubstr("/F1"));
- EXPECT_THAT(GetString(), HasSubstr("/F2"));
- EXPECT_THAT(GetString(), HasSubstr("/Times-Roman"));
+ EXPECT_THAT(GetString(), Not(HasSubstr("/F1")));
+ EXPECT_THAT(GetString(), Not(HasSubstr("/F2")));
+ EXPECT_THAT(GetString(), Not(HasSubstr("/Times-Roman")));
UnloadPage(page);
}
@@ -1114,10 +1114,8 @@
std::vector<std::string> split_saved_data = StringSplit(GetString(), '\n');
// Verify removed/renamed resources are in the save PDF the correct number of
// times.
- // TODO(crbug.com/1428724): Should only be in the PDF once.
- EXPECT_THAT(split_saved_data, Contains(HasSubstr("/F1")).Times(2));
- EXPECT_THAT(split_saved_data, Contains(HasSubstr("/F2")).Times(2));
-
+ EXPECT_THAT(split_saved_data, Contains(HasSubstr("/F1")).Times(1));
+ EXPECT_THAT(split_saved_data, Contains(HasSubstr("/F2")).Times(1));
EXPECT_THAT(split_saved_data, Contains(HasSubstr("/Times-Roman")).Times(1));
UnloadPage(page1);
diff --git a/fpdfsdk/fpdf_save_embeddertest.cpp b/fpdfsdk/fpdf_save_embeddertest.cpp
index c18566a..3f5efd8 100644
--- a/fpdfsdk/fpdf_save_embeddertest.cpp
+++ b/fpdfsdk/fpdf_save_embeddertest.cpp
@@ -157,10 +157,8 @@
EXPECT_THAT(GetString(), StartsWith("%PDF-1.7\r\n"));
EXPECT_THAT(GetString(), HasSubstr("/Root "));
- // TODO(crbug.com/pdfium/1409): The PDF should not have any images, given it
- // is rendering blank. The file size should also be a lot smaller.
- EXPECT_THAT(GetString(), HasSubstr("/Image"));
- EXPECT_LT(GetString().size(), 1300u);
+ EXPECT_THAT(GetString(), Not(HasSubstr("/Image")));
+ EXPECT_LT(GetString().size(), 600u);
}
#ifdef PDF_ENABLE_XFA