Keep track of Font and XObject resources

For CPDF_ImageObjects, CPDF_FormObjects, and CPDF_TextObjects, keep
track of the Font/XObject resources associated with them. Using this
data, CPDF_PageContentGenerator can detect and remove unused resources
from resources dictionaries. Then CPDF_Creator can detect the
unreferenced resources and remove them when saving.

Bug: chromium:1428724,pdfium:1409
Change-Id: I510e6c51eda28535ed00e87b6e10971f7178122c
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/105613
Commit-Queue: Lei Zhang <thestig@chromium.org>
Reviewed-by: Tom Sepez <tsepez@chromium.org>
diff --git a/core/fpdfapi/edit/cpdf_pagecontentgenerator.cpp b/core/fpdfapi/edit/cpdf_pagecontentgenerator.cpp
index 914bec0..c149194 100644
--- a/core/fpdfapi/edit/cpdf_pagecontentgenerator.cpp
+++ b/core/fpdfapi/edit/cpdf_pagecontentgenerator.cpp
@@ -45,6 +45,10 @@
 
 namespace {
 
+// Key: The resource type.
+// Value: The resource names of a given type.
+using ResourcesMap = std::map<ByteString, std::set<ByteString>>;
+
 bool GetColor(const CPDF_Color* pColor, float* rgb) {
   int intRGB[3];
   if (!pColor || !pColor->IsColorSpaceRGB() ||
@@ -57,6 +61,59 @@
   return true;
 }
 
+void RecordPageObjectResourceUsage(const CPDF_PageObject* page_object,
+                                   ResourcesMap& seen_resources) {
+  const ByteString& resource_name = page_object->GetResourceName();
+  if (!resource_name.IsEmpty()) {
+    switch (page_object->GetType()) {
+      case CPDF_PageObject::Type::kText:
+        seen_resources["Font"].insert(resource_name);
+        break;
+      case CPDF_PageObject::Type::kImage:
+      case CPDF_PageObject::Type::kForm:
+        seen_resources["XObject"].insert(resource_name);
+        break;
+      case CPDF_PageObject::Type::kPath:
+        break;
+      case CPDF_PageObject::Type::kShading:
+        break;
+    }
+  }
+}
+
+void RemoveUnusedResources(RetainPtr<CPDF_Dictionary> resources_dict,
+                           const ResourcesMap& resources_in_use) {
+  // TODO(thestig): Remove other unused resource types, like ExtGState.
+  static constexpr const char* kResourceKeys[] = {"Font", "XObject"};
+  for (const char* resource_key : kResourceKeys) {
+    RetainPtr<CPDF_Dictionary> resource_dict =
+        resources_dict->GetMutableDictFor(resource_key);
+    if (!resource_dict) {
+      continue;
+    }
+
+    std::vector<ByteString> keys;
+    {
+      CPDF_DictionaryLocker resource_dict_locker(resource_dict);
+      for (auto& it : resource_dict_locker) {
+        keys.push_back(it.first);
+      }
+    }
+
+    auto it = resources_in_use.find(resource_key);
+    const std::set<ByteString>* resource_in_use_of_current_type =
+        it != resources_in_use.end() ? &it->second : nullptr;
+    for (const ByteString& key : keys) {
+      if (resource_in_use_of_current_type &&
+          pdfium::Contains(*resource_in_use_of_current_type, key)) {
+        continue;
+      }
+
+      resource_dict->RemoveFor(key.AsStringView());
+    }
+  }
+}
+
 }  // namespace
 
 CPDF_PageContentGenerator::CPDF_PageContentGenerator(
@@ -162,6 +219,18 @@
 
     page_content_manager.UpdateStream(stream_index, buf);
   }
+
+  RetainPtr<CPDF_Dictionary> resources = m_pObjHolder->GetMutableResources();
+  if (!resources) {
+    return;
+  }
+
+  ResourcesMap seen_resources;
+  for (auto& page_object : m_pageObjects) {
+    RecordPageObjectResourceUsage(page_object, seen_resources);
+  }
+
+  RemoveUnusedResources(std::move(resources), seen_resources);
 }
 
 ByteString CPDF_PageContentGenerator::RealizeResource(
@@ -313,6 +382,8 @@
     pImage->ConvertStreamToIndirectObject();
 
   ByteString name = RealizeResource(pStream, "XObject");
+  pImageObj->SetResourceName(name);
+
   if (bWasInline) {
     auto* pPageData = CPDF_DocPageData::FromDocument(m_pDocument);
     pImageObj->SetImage(pPageData->GetImage(pStream->GetObjNum()));
@@ -332,10 +403,11 @@
   if (!pStream)
     return;
 
+  ByteString name = RealizeResource(pStream.Get(), "XObject");
+  pFormObj->SetResourceName(name);
+
   *buf << "q\n";
   WriteMatrix(*buf, pFormObj->form_matrix()) << " cm ";
-
-  ByteString name = RealizeResource(pStream.Get(), "XObject");
   *buf << "/" << PDF_NameEncode(name) << " Do Q\n";
 }
 
@@ -554,10 +626,10 @@
   }
   data.baseFont = pFont->GetBaseFontName();
 
-  ByteString dictName;
+  ByteString dict_name;
   absl::optional<ByteString> maybe_name = m_pObjHolder->FontsMapSearch(data);
   if (maybe_name.has_value()) {
-    dictName = std::move(maybe_name.value());
+    dict_name = std::move(maybe_name.value());
   } else {
     RetainPtr<const CPDF_Object> pIndirectFont = pFont->GetFontDict();
     if (pIndirectFont->IsInline()) {
@@ -573,10 +645,12 @@
       m_pDocument->AddIndirectObject(pFontDict);
       pIndirectFont = std::move(pFontDict);
     }
-    dictName = RealizeResource(std::move(pIndirectFont), "Font");
-    m_pObjHolder->FontsMapInsert(data, dictName);
+    dict_name = RealizeResource(std::move(pIndirectFont), "Font");
+    m_pObjHolder->FontsMapInsert(data, dict_name);
   }
-  *buf << "/" << PDF_NameEncode(dictName) << " ";
+  pTextObj->SetResourceName(dict_name);
+
+  *buf << "/" << PDF_NameEncode(dict_name) << " ";
   WriteFloat(*buf, pTextObj->GetFontSize()) << " Tf ";
   *buf << static_cast<int>(pTextObj->GetTextRenderMode()) << " Tr ";
   ByteString text;
diff --git a/core/fpdfapi/font/cpdf_font.h b/core/fpdfapi/font/cpdf_font.h
index dee1724..a74cba9 100644
--- a/core/fpdfapi/font/cpdf_font.h
+++ b/core/fpdfapi/font/cpdf_font.h
@@ -133,6 +133,9 @@
 
   CFX_Font* GetFontFallback(int position);
 
+  const ByteString& GetResourceName() const { return m_ResourceName; }
+  void SetResourceName(const ByteString& name) { m_ResourceName = name; }
+
  protected:
   CPDF_Font(CPDF_Document* pDocument, RetainPtr<CPDF_Dictionary> pFontDict);
 
@@ -163,6 +166,7 @@
   void CheckFontMetrics();
 
   UnownedPtr<CPDF_Document> const m_pDocument;
+  ByteString m_ResourceName;  // The resource name for this font.
   CFX_Font m_Font;
   std::vector<std::unique_ptr<CFX_Font>> m_FontFallbacks;
   RetainPtr<CPDF_StreamAcc> m_pFontFile;
diff --git a/core/fpdfapi/page/cpdf_pageobject.h b/core/fpdfapi/page/cpdf_pageobject.h
index b5f5e3f..3a0b87a 100644
--- a/core/fpdfapi/page/cpdf_pageobject.h
+++ b/core/fpdfapi/page/cpdf_pageobject.h
@@ -11,6 +11,7 @@
 
 #include "core/fpdfapi/page/cpdf_contentmarks.h"
 #include "core/fpdfapi/page/cpdf_graphicstates.h"
+#include "core/fxcrt/bytestring.h"
 #include "core/fxcrt/fx_coordinates.h"
 
 class CPDF_FormObject;
@@ -88,6 +89,11 @@
     m_ContentStream = new_content_stream;
   }
 
+  const ByteString& GetResourceName() const { return m_ResourceName; }
+  void SetResourceName(const ByteString& resource_name) {
+    m_ResourceName = resource_name;
+  }
+
  protected:
   void CopyData(const CPDF_PageObject* pSrcObject);
 
@@ -97,6 +103,7 @@
   CPDF_ContentMarks m_ContentMarks;
   bool m_bDirty = false;
   int32_t m_ContentStream;
+  ByteString m_ResourceName;  // The resource name for this object.
 };
 
 #endif  // CORE_FPDFAPI_PAGE_CPDF_PAGEOBJECT_H_
diff --git a/core/fpdfapi/page/cpdf_streamcontentparser.cpp b/core/fpdfapi/page/cpdf_streamcontentparser.cpp
index 4226754..30c5b90 100644
--- a/core/fpdfapi/page/cpdf_streamcontentparser.cpp
+++ b/core/fpdfapi/page/cpdf_streamcontentparser.cpp
@@ -649,7 +649,8 @@
     if (m_pSyntax->GetWord() == "EI")
       break;
   }
-  CPDF_ImageObject* pObj = AddImageFromStream(std::move(pStream));
+  CPDF_ImageObject* pObj =
+      AddImageFromStream(std::move(pStream), /*resource_name=*/"");
   // Record the bounding box of this image, so rendering code can draw it
   // properly.
   if (pObj && pObj->GetImage()->IsMask())
@@ -741,14 +742,15 @@
     type = pXObject->GetDict()->GetByteStringFor("Subtype");
 
   if (type == "Form") {
-    AddForm(std::move(pXObject));
+    AddForm(std::move(pXObject), name);
     return;
   }
 
   if (type == "Image") {
     CPDF_ImageObject* pObj =
-        pXObject->IsInline() ? AddImageFromStream(ToStream(pXObject->Clone()))
-                             : AddImageFromStreamObjNum(pXObject->GetObjNum());
+        pXObject->IsInline()
+            ? AddImageFromStream(ToStream(pXObject->Clone()), name)
+            : AddImageFromStreamObjNum(pXObject->GetObjNum(), name);
 
     m_LastImageName = std::move(name);
     if (pObj) {
@@ -759,7 +761,8 @@
   }
 }
 
-void CPDF_StreamContentParser::AddForm(RetainPtr<CPDF_Stream> pStream) {
+void CPDF_StreamContentParser::AddForm(RetainPtr<CPDF_Stream> pStream,
+                                       const ByteString& name) {
   CPDF_AllStates status;
   status.m_GeneralState = m_pCurStates->m_GeneralState;
   status.m_GraphState = m_pCurStates->m_GraphState;
@@ -772,6 +775,7 @@
   CFX_Matrix matrix = m_pCurStates->m_CTM * m_mtContentToUser;
   auto pFormObj = std::make_unique<CPDF_FormObject>(GetCurrentStreamIndex(),
                                                     std::move(form), matrix);
+  pFormObj->SetResourceName(name);
   if (!m_pObjectHolder->BackgroundAlphaNeeded() &&
       pFormObj->form()->BackgroundAlphaNeeded()) {
     m_pObjectHolder->SetBackgroundAlphaNeeded(true);
@@ -782,11 +786,13 @@
 }
 
 CPDF_ImageObject* CPDF_StreamContentParser::AddImageFromStream(
-    RetainPtr<CPDF_Stream> pStream) {
+    RetainPtr<CPDF_Stream> pStream,
+    const ByteString& name) {
   if (!pStream)
     return nullptr;
 
   auto pImageObj = std::make_unique<CPDF_ImageObject>(GetCurrentStreamIndex());
+  pImageObj->SetResourceName(name);
   pImageObj->SetImage(
       pdfium::MakeRetain<CPDF_Image>(m_pDocument, std::move(pStream)));
 
@@ -794,8 +800,10 @@
 }
 
 CPDF_ImageObject* CPDF_StreamContentParser::AddImageFromStreamObjNum(
-    uint32_t stream_obj_num) {
+    uint32_t stream_obj_num,
+    const ByteString& name) {
   auto pImageObj = std::make_unique<CPDF_ImageObject>(GetCurrentStreamIndex());
+  pImageObj->SetResourceName(name);
   pImageObj->SetImage(
       CPDF_DocPageData::FromDocument(m_pDocument)->GetImage(stream_obj_num));
 
@@ -806,6 +814,7 @@
   DCHECK(m_pLastImage);
 
   auto pImageObj = std::make_unique<CPDF_ImageObject>(GetCurrentStreamIndex());
+  pImageObj->SetResourceName(m_LastImageName);
   pImageObj->SetImage(CPDF_DocPageData::FromDocument(m_pDocument)
                           ->GetImage(m_pLastImage->GetStream()->GetObjNum()));
 
@@ -1145,9 +1154,14 @@
   }
   RetainPtr<CPDF_Font> pFont = CPDF_DocPageData::FromDocument(m_pDocument)
                                    ->GetFont(std::move(pFontDict));
-  if (pFont && pFont->IsType3Font()) {
-    pFont->AsType3Font()->SetPageResources(m_pResources.Get());
-    pFont->AsType3Font()->CheckType3FontMetrics();
+  if (pFont) {
+    // Save `name` for later retrieval by the CPDF_TextObject that uses the
+    // font.
+    pFont->SetResourceName(name);
+    if (pFont->IsType3Font()) {
+      pFont->AsType3Font()->SetPageResources(m_pResources.Get());
+      pFont->AsType3Font()->CheckType3FontMetrics();
+    }
   }
   return pFont;
 }
@@ -1222,6 +1236,7 @@
                            : m_pCurStates->m_TextState.GetTextMode();
   {
     auto pText = std::make_unique<CPDF_TextObject>(GetCurrentStreamIndex());
+    pText->SetResourceName(pFont->GetResourceName());
     SetGraphicStates(pText.get(), true, true, true);
     if (TextRenderingModeIsStrokeMode(text_mode)) {
       pdfium::span<float> pCTM = pText->m_TextState.GetMutableCTM();
diff --git a/core/fpdfapi/page/cpdf_streamcontentparser.h b/core/fpdfapi/page/cpdf_streamcontentparser.h
index 86da47d..276dc72 100644
--- a/core/fpdfapi/page/cpdf_streamcontentparser.h
+++ b/core/fpdfapi/page/cpdf_streamcontentparser.h
@@ -119,11 +119,13 @@
   void AddPathRect(float x, float y, float w, float h);
   void AddPathObject(CFX_FillRenderOptions::FillType fill_type,
                      RenderType render_type);
-  CPDF_ImageObject* AddImageFromStream(RetainPtr<CPDF_Stream> pStream);
-  CPDF_ImageObject* AddImageFromStreamObjNum(uint32_t stream_obj_num);
+  CPDF_ImageObject* AddImageFromStream(RetainPtr<CPDF_Stream> pStream,
+                                       const ByteString& name);
+  CPDF_ImageObject* AddImageFromStreamObjNum(uint32_t stream_obj_num,
+                                             const ByteString& name);
   CPDF_ImageObject* AddLastImage();
 
-  void AddForm(RetainPtr<CPDF_Stream> pStream);
+  void AddForm(RetainPtr<CPDF_Stream> pStream, const ByteString& name);
   void SetGraphicStates(CPDF_PageObject* pObj,
                         bool bColor,
                         bool bText,
diff --git a/fpdfsdk/fpdf_edit_embeddertest.cpp b/fpdfsdk/fpdf_edit_embeddertest.cpp
index 4e7c7fd..21c3512 100644
--- a/fpdfsdk/fpdf_edit_embeddertest.cpp
+++ b/fpdfsdk/fpdf_edit_embeddertest.cpp
@@ -38,6 +38,7 @@
 
 using pdfium::HelloWorldChecksum;
 using testing::HasSubstr;
+using testing::Not;
 
 namespace {
 
@@ -928,10 +929,9 @@
   VerifySavedDocument(200, 200, FirstRemovedChecksum());
 
   // Verify removed/renamed resources are no longer there.
-  // TODO(crbug.com/1428724): Negate these checks.
-  EXPECT_THAT(GetString(), HasSubstr("/F1"));
-  EXPECT_THAT(GetString(), HasSubstr("/F2"));
-  EXPECT_THAT(GetString(), HasSubstr("/Times-Roman"));
+  EXPECT_THAT(GetString(), Not(HasSubstr("/F1")));
+  EXPECT_THAT(GetString(), Not(HasSubstr("/F2")));
+  EXPECT_THAT(GetString(), Not(HasSubstr("/Times-Roman")));
 
   UnloadPage(page);
 }
@@ -1114,10 +1114,8 @@
   std::vector<std::string> split_saved_data = StringSplit(GetString(), '\n');
   // Verify removed/renamed resources are in the save PDF the correct number of
   // times.
-  // TODO(crbug.com/1428724): Should only be in the PDF once.
-  EXPECT_THAT(split_saved_data, Contains(HasSubstr("/F1")).Times(2));
-  EXPECT_THAT(split_saved_data, Contains(HasSubstr("/F2")).Times(2));
-
+  EXPECT_THAT(split_saved_data, Contains(HasSubstr("/F1")).Times(1));
+  EXPECT_THAT(split_saved_data, Contains(HasSubstr("/F2")).Times(1));
   EXPECT_THAT(split_saved_data, Contains(HasSubstr("/Times-Roman")).Times(1));
 
   UnloadPage(page1);
diff --git a/fpdfsdk/fpdf_save_embeddertest.cpp b/fpdfsdk/fpdf_save_embeddertest.cpp
index c18566a..3f5efd8 100644
--- a/fpdfsdk/fpdf_save_embeddertest.cpp
+++ b/fpdfsdk/fpdf_save_embeddertest.cpp
@@ -157,10 +157,8 @@
 
   EXPECT_THAT(GetString(), StartsWith("%PDF-1.7\r\n"));
   EXPECT_THAT(GetString(), HasSubstr("/Root "));
-  // TODO(crbug.com/pdfium/1409): The PDF should not have any images, given it
-  // is rendering blank. The file size should also be a lot smaller.
-  EXPECT_THAT(GetString(), HasSubstr("/Image"));
-  EXPECT_LT(GetString().size(), 1300u);
+  EXPECT_THAT(GetString(), Not(HasSubstr("/Image")));
+  EXPECT_LT(GetString().size(), 600u);
 }
 
 #ifdef PDF_ENABLE_XFA