Traverse PDF page tree only once in CPDF_Document Try 3
Now, we do not start traversal from where we were at, but from the top.
This makes the code less prone to bugs, as now there is no need to call
methods to recursively fix things. This will save a lot of time when
the trees are rather flat, as in the PDF file in the bug. It can still
be slow, for instance if we have a chain of page nodes, and the last
in the chain contains all of the pages (this is artificial).
Try 2 at https://codereview.chromium.org/2442403002/
Also added test where Try 2 would have failed.
Tested the pdf from the bug on my Mac:
With this CL: load in 21 seconds
Without this CL: did not load in 4 minutes, got tired of waiting
BUG=chromium:638513
Review-Url: https://codereview.chromium.org/2470803003
diff --git a/core/fpdfapi/parser/cpdf_document.cpp b/core/fpdfapi/parser/cpdf_document.cpp
index 6457404..8e181de 100644
--- a/core/fpdfapi/parser/cpdf_document.cpp
+++ b/core/fpdfapi/parser/cpdf_document.cpp
@@ -336,6 +336,7 @@
m_pParser(std::move(pParser)),
m_pRootDict(nullptr),
m_pInfoDict(nullptr),
+ m_iNextPageToTraverse(0),
m_bLinearized(false),
m_iFirstPageNo(0),
m_dwFirstPageObjNum(0),
@@ -400,40 +401,72 @@
m_PageList.SetSize(RetrievePageCount());
}
-CPDF_Dictionary* CPDF_Document::FindPDFPage(CPDF_Dictionary* pPages,
- int iPage,
- int nPagesToGo,
- int level) {
- CPDF_Array* pKidList = pPages->GetArrayFor("Kids");
- if (!pKidList)
- return nPagesToGo == 0 ? pPages : nullptr;
-
- if (level >= FX_MAX_PAGE_LEVEL)
+CPDF_Dictionary* CPDF_Document::TraversePDFPages(int iPage,
+ int* nPagesToGo,
+ size_t level) {
+ if (*nPagesToGo < 0)
return nullptr;
+ CPDF_Dictionary* pPages = m_pTreeTraversal[level].first;
+ CPDF_Array* pKidList = pPages->GetArrayFor("Kids");
+ if (!pKidList) {
+ if (*nPagesToGo != 1)
+ return nullptr;
+ m_PageList.SetAt(iPage, pPages->GetObjNum());
+ return pPages;
+ }
- for (size_t i = 0; i < pKidList->GetCount(); i++) {
+ if (level >= FX_MAX_PAGE_LEVEL) {
+ m_pTreeTraversal.pop_back();
+ return nullptr;
+ }
+
+ CPDF_Dictionary* page = nullptr;
+ for (size_t i = m_pTreeTraversal[level].second; i < pKidList->GetCount();
+ i++) {
+ if (*nPagesToGo == 0)
+ break;
CPDF_Dictionary* pKid = pKidList->GetDictAt(i);
if (!pKid) {
- nPagesToGo--;
+ (*nPagesToGo)--;
+ m_pTreeTraversal[level].second++;
continue;
}
- if (pKid == pPages)
+ if (pKid == pPages) {
+ m_pTreeTraversal[level].second++;
continue;
+ }
if (!pKid->KeyExist("Kids")) {
- if (nPagesToGo == 0)
- return pKid;
-
- m_PageList.SetAt(iPage - nPagesToGo, pKid->GetObjNum());
- nPagesToGo--;
+ m_PageList.SetAt(iPage - (*nPagesToGo) + 1, pKid->GetObjNum());
+ (*nPagesToGo)--;
+ m_pTreeTraversal[level].second++;
+ if (*nPagesToGo == 0) {
+ page = pKid;
+ break;
+ }
} else {
- int nPages = pKid->GetIntegerFor("Count");
- if (nPagesToGo < nPages)
- return FindPDFPage(pKid, iPage, nPagesToGo, level + 1);
-
- nPagesToGo -= nPages;
+ // If the vector has size level+1, the child is not in yet
+ if (m_pTreeTraversal.size() == level + 1)
+ m_pTreeTraversal.push_back(std::make_pair(pKid, 0));
+ // Now m_pTreeTraversal[level+1] should exist and be equal to pKid.
+ CPDF_Dictionary* pageKid = TraversePDFPages(iPage, nPagesToGo, level + 1);
+ // Check if child was completely processed, i.e. it popped itself out
+ if (m_pTreeTraversal.size() == level + 1)
+ m_pTreeTraversal[level].second++;
+ // If child did not finish or if no pages to go, we are done
+ if (m_pTreeTraversal.size() != level + 1 || *nPagesToGo == 0) {
+ page = pageKid;
+ break;
+ }
}
}
- return nullptr;
+ if (m_pTreeTraversal[level].second == pKidList->GetCount())
+ m_pTreeTraversal.pop_back();
+ return page;
+}
+
+void CPDF_Document::ResetTraversal() {
+ m_iNextPageToTraverse = 0;
+ m_pTreeTraversal.clear();
}
CPDF_Dictionary* CPDF_Document::GetPagesDict() const {
@@ -460,17 +493,18 @@
if (objnum) {
if (CPDF_Dictionary* pDict = ToDictionary(GetOrParseIndirectObject(objnum)))
return pDict;
+ return nullptr;
}
CPDF_Dictionary* pPages = GetPagesDict();
if (!pPages)
return nullptr;
- CPDF_Dictionary* pPage = FindPDFPage(pPages, iPage, iPage, 0);
- if (!pPage)
- return nullptr;
-
- m_PageList.SetAt(iPage, pPage->GetObjNum());
+ if (m_pTreeTraversal.empty())
+ m_pTreeTraversal.push_back(std::make_pair(pPages, 0));
+ int nPagesToGo = iPage - m_iNextPageToTraverse + 1;
+ CPDF_Dictionary* pPage = TraversePDFPages(iPage, &nPagesToGo, 0);
+ m_iNextPageToTraverse = iPage + 1;
return pPage;
}
@@ -664,6 +698,7 @@
}
pPages->SetIntegerFor(
"Count", pPages->GetIntegerFor("Count") + (bInsert ? 1 : -1));
+ ResetTraversal();
break;
}
int nPages = pKid->GetIntegerFor("Count");
@@ -704,6 +739,7 @@
pPagesList->Add(new CPDF_Reference(this, pPageDict->GetObjNum()));
pPages->SetIntegerFor("Count", nPages + 1);
pPageDict->SetReferenceFor("Parent", this, pPages->GetObjNum());
+ ResetTraversal();
} else {
std::set<CPDF_Dictionary*> stack = {pPages};
if (!InsertDeletePDFPage(pPages, iPage, pPageDict, true, &stack))
diff --git a/core/fpdfapi/parser/cpdf_document.h b/core/fpdfapi/parser/cpdf_document.h
index e113526..0a99e42 100644
--- a/core/fpdfapi/parser/cpdf_document.h
+++ b/core/fpdfapi/parser/cpdf_document.h
@@ -105,10 +105,8 @@
protected:
// Retrieve page count information by getting count value from the tree nodes
int RetrievePageCount() const;
- CPDF_Dictionary* FindPDFPage(CPDF_Dictionary* pPages,
- int iPage,
- int nPagesToGo,
- int level);
+ // When this method is called, m_pTreeTraversal[level] exists.
+ CPDF_Dictionary* TraversePDFPages(int iPage, int* nPagesToGo, size_t level);
int FindPageIndex(CPDF_Dictionary* pNode,
uint32_t& skip_count,
uint32_t objnum,
@@ -130,10 +128,18 @@
bool bInsert,
std::set<CPDF_Dictionary*>* pVisited);
bool InsertNewPage(int iPage, CPDF_Dictionary* pPageDict);
+ void ResetTraversal();
std::unique_ptr<CPDF_Parser> m_pParser;
CPDF_Dictionary* m_pRootDict;
CPDF_Dictionary* m_pInfoDict;
+ // Vector of pairs to know current position in the page tree. The index in the
+ // vector corresponds to the level being described. The pair contains a
+ // pointer to the dictionary being processed at the level, and an index of the
+ // of the child being processed within the dictionary's /Kids array.
+ std::vector<std::pair<CPDF_Dictionary*, size_t>> m_pTreeTraversal;
+ // Index of the next page that will be traversed from the page tree.
+ int m_iNextPageToTraverse;
bool m_bLinearized;
int m_iFirstPageNo;
uint32_t m_dwFirstPageObjNum;
diff --git a/core/fpdfapi/parser/cpdf_document_unittest.cpp b/core/fpdfapi/parser/cpdf_document_unittest.cpp
index c09665b..71716a6 100644
--- a/core/fpdfapi/parser/cpdf_document_unittest.cpp
+++ b/core/fpdfapi/parser/cpdf_document_unittest.cpp
@@ -95,7 +95,7 @@
for (int i = 0; i < 7; i++) {
CPDF_Dictionary* page = document->GetPage(i);
ASSERT_TRUE(page);
- ASSERT_TRUE(page->GetObjectFor("PageNumbering"));
+ ASSERT_TRUE(page->KeyExist("PageNumbering"));
EXPECT_EQ(i, page->GetIntegerFor("PageNumbering"));
}
CPDF_Dictionary* page = document->GetPage(7);
@@ -108,13 +108,36 @@
for (int i = 6; i >= 0; i--) {
CPDF_Dictionary* page = document->GetPage(i);
ASSERT_TRUE(page);
- ASSERT_TRUE(page->GetObjectFor("PageNumbering"));
+ ASSERT_TRUE(page->KeyExist("PageNumbering"));
EXPECT_EQ(i, page->GetIntegerFor("PageNumbering"));
}
CPDF_Dictionary* page = document->GetPage(7);
EXPECT_FALSE(page);
}
+TEST(cpdf_document, GetPagesInDisorder) {
+ std::unique_ptr<CPDF_TestDocumentForPages> document =
+ pdfium::MakeUnique<CPDF_TestDocumentForPages>();
+
+ CPDF_Dictionary* page = document->GetPage(1);
+ ASSERT_TRUE(page);
+ ASSERT_TRUE(page->KeyExist("PageNumbering"));
+ EXPECT_EQ(1, page->GetIntegerFor("PageNumbering"));
+
+ page = document->GetPage(3);
+ ASSERT_TRUE(page);
+ ASSERT_TRUE(page->KeyExist("PageNumbering"));
+ EXPECT_EQ(3, page->GetIntegerFor("PageNumbering"));
+
+ page = document->GetPage(7);
+ EXPECT_FALSE(page);
+
+ page = document->GetPage(6);
+ ASSERT_TRUE(page);
+ ASSERT_TRUE(page->KeyExist("PageNumbering"));
+ EXPECT_EQ(6, page->GetIntegerFor("PageNumbering"));
+}
+
TEST_F(cpdf_document_test, UseCachedPageObjNumIfHaveNotPagesDict) {
// ObjNum can be added in CPDF_DataAvail::IsPageAvail, and PagesDict
// can be not exists in this case.