Add an FPDFStructTreeEmbedderTest for MCR and OBJR element types.
Provide test coverage and illustrate an issue with the existing StructTree
code: it cannot retrieve inline objects.
The test PDF is derived from an HTML file with the following content:
<body>
<a name="top">
<p>hello world</p>
<p><a href="#top">link to top</a>
</body>
The HTML is converted to a tagged PDF using Chromium's headless mode
with the following flags, with the assumption that Chromium's tagged PDF
generator code works properly:
--print-to-pdf --export-tagged-pdf --print-to-pdf-no-header
The output PDF from Chromium is then minimized to produce
tagged_mcr_objr.in and tagged_mcr_objr.pdf.
Bug: pdfium:672,pdfium:1297
Change-Id: I0ba6e9a9a1bba57acf1ed052d4bec631e67d7677
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/68990
Commit-Queue: Lei Zhang <thestig@chromium.org>
Reviewed-by: Tom Sepez <tsepez@chromium.org>
diff --git a/fpdfsdk/fpdf_structtree_embeddertest.cpp b/fpdfsdk/fpdf_structtree_embeddertest.cpp
index 21cbffd..f527877 100644
--- a/fpdfsdk/fpdf_structtree_embeddertest.cpp
+++ b/fpdfsdk/fpdf_structtree_embeddertest.cpp
@@ -21,33 +21,33 @@
FPDF_STRUCTELEMENT element =
FPDF_StructTree_GetChildAtIndex(struct_tree.get(), -1);
- EXPECT_EQ(nullptr, element);
+ EXPECT_FALSE(element);
element = FPDF_StructTree_GetChildAtIndex(struct_tree.get(), 1);
- EXPECT_EQ(nullptr, element);
+ EXPECT_FALSE(element);
element = FPDF_StructTree_GetChildAtIndex(struct_tree.get(), 0);
- ASSERT_NE(nullptr, element);
+ ASSERT_TRUE(element);
EXPECT_EQ(-1, FPDF_StructElement_GetMarkedContentID(element));
EXPECT_EQ(0U, FPDF_StructElement_GetAltText(element, nullptr, 0));
ASSERT_EQ(1, FPDF_StructElement_CountChildren(element));
FPDF_STRUCTELEMENT child_element =
FPDF_StructElement_GetChildAtIndex(element, -1);
- EXPECT_EQ(nullptr, child_element);
+ EXPECT_FALSE(child_element);
child_element = FPDF_StructElement_GetChildAtIndex(element, 1);
- EXPECT_EQ(nullptr, child_element);
+ EXPECT_FALSE(child_element);
child_element = FPDF_StructElement_GetChildAtIndex(element, 0);
- ASSERT_NE(nullptr, child_element);
+ ASSERT_TRUE(child_element);
EXPECT_EQ(-1, FPDF_StructElement_GetMarkedContentID(child_element));
EXPECT_EQ(0U, FPDF_StructElement_GetAltText(child_element, nullptr, 0));
ASSERT_EQ(1, FPDF_StructElement_CountChildren(child_element));
FPDF_STRUCTELEMENT gchild_element =
FPDF_StructElement_GetChildAtIndex(child_element, -1);
- EXPECT_EQ(nullptr, gchild_element);
+ EXPECT_FALSE(gchild_element);
gchild_element = FPDF_StructElement_GetChildAtIndex(child_element, 1);
- EXPECT_EQ(nullptr, gchild_element);
+ EXPECT_FALSE(gchild_element);
gchild_element = FPDF_StructElement_GetChildAtIndex(child_element, 0);
- ASSERT_NE(nullptr, gchild_element);
+ ASSERT_TRUE(gchild_element);
EXPECT_EQ(-1, FPDF_StructElement_GetMarkedContentID(gchild_element));
ASSERT_EQ(24U, FPDF_StructElement_GetAltText(gchild_element, nullptr, 0));
@@ -69,7 +69,7 @@
ASSERT_EQ(1, FPDF_StructElement_CountChildren(gchild_element));
FPDF_STRUCTELEMENT ggchild_element =
FPDF_StructElement_GetChildAtIndex(gchild_element, 0);
- EXPECT_EQ(nullptr, ggchild_element);
+ EXPECT_FALSE(ggchild_element);
}
UnloadPage(page);
@@ -105,7 +105,7 @@
FPDF_STRUCTELEMENT element =
FPDF_StructTree_GetChildAtIndex(struct_tree.get(), 0);
- ASSERT_NE(nullptr, element);
+ ASSERT_TRUE(element);
// test nullptr inputs
unsigned short buffer[12];
@@ -141,7 +141,7 @@
FPDF_STRUCTELEMENT element =
FPDF_StructTree_GetChildAtIndex(struct_tree.get(), 0);
- ASSERT_NE(nullptr, element);
+ ASSERT_TRUE(element);
// test nullptr inputs
unsigned short buffer[13];
@@ -166,7 +166,7 @@
ASSERT_EQ(1, FPDF_StructElement_CountChildren(element));
FPDF_STRUCTELEMENT child_element =
FPDF_StructElement_GetChildAtIndex(element, 0);
- ASSERT_NE(nullptr, element);
+ ASSERT_TRUE(child_element);  // Check the child just fetched, not the parent.
ASSERT_EQ(26U, FPDF_StructElement_GetTitle(child_element, buffer,
sizeof(buffer)));
@@ -190,3 +190,109 @@
}
UnloadPage(page);
}
+
+TEST_F(FPDFStructTreeEmbedderTest, MarkedContentReferenceAndObjectReference) {
+ ASSERT_TRUE(OpenDocument("tagged_mcr_objr.pdf"));
+ FPDF_PAGE page = LoadPage(0);
+ ASSERT_TRUE(page);
+
+ {  // Scope |struct_tree| so its lifetime ends before UnloadPage() below.
+ ScopedFPDFStructTree struct_tree(FPDF_StructTree_GetForPage(page));
+ ASSERT_TRUE(struct_tree);
+ ASSERT_EQ(1, FPDF_StructTree_CountChildren(struct_tree.get()));
+
+ FPDF_STRUCTELEMENT object8 =
+ FPDF_StructTree_GetChildAtIndex(struct_tree.get(), 0);
+ ASSERT_TRUE(object8);
+ unsigned short buffer[12];  // Reused for every GetType() call below.
+ ASSERT_EQ(18U, FPDF_StructElement_GetType(object8, buffer, sizeof(buffer)));
+ const wchar_t kExpectedObject8Type[] = L"Document";
+ EXPECT_EQ(WideString(kExpectedObject8Type),
+ WideString::FromUTF16LE(buffer, FXSYS_len(kExpectedObject8Type)));
+ EXPECT_EQ(-1, FPDF_StructElement_GetMarkedContentID(object8));
+ ASSERT_EQ(2, FPDF_StructElement_CountChildren(object8));
+
+ // First branch: objects 10 -> 12 -> 13 -> inline MCR dict (MCID 0).
+ FPDF_STRUCTELEMENT object10 =
+ FPDF_StructElement_GetChildAtIndex(object8, 0);
+ ASSERT_TRUE(object10);
+ ASSERT_EQ(20U,
+ FPDF_StructElement_GetType(object10, buffer, sizeof(buffer)));
+ const wchar_t kExpectedObject10Type[] = L"NonStruct";
+ EXPECT_EQ(
+ WideString(kExpectedObject10Type),
+ WideString::FromUTF16LE(buffer, FXSYS_len(kExpectedObject10Type)));
+ EXPECT_EQ(-1, FPDF_StructElement_GetMarkedContentID(object10));
+ ASSERT_EQ(1, FPDF_StructElement_CountChildren(object10));
+
+ FPDF_STRUCTELEMENT object12 =
+ FPDF_StructElement_GetChildAtIndex(object10, 0);
+ ASSERT_TRUE(object12);
+ ASSERT_EQ(4U, FPDF_StructElement_GetType(object12, buffer, sizeof(buffer)));
+ const wchar_t kExpectedObject12Type[] = L"P";
+ EXPECT_EQ(
+ WideString(kExpectedObject12Type),
+ WideString::FromUTF16LE(buffer, FXSYS_len(kExpectedObject12Type)));
+ EXPECT_EQ(-1, FPDF_StructElement_GetMarkedContentID(object12));
+ ASSERT_EQ(1, FPDF_StructElement_CountChildren(object12));
+
+ FPDF_STRUCTELEMENT object13 =
+ FPDF_StructElement_GetChildAtIndex(object12, 0);
+ ASSERT_TRUE(object13);
+ ASSERT_EQ(20U,
+ FPDF_StructElement_GetType(object13, buffer, sizeof(buffer)));
+ const wchar_t kExpectedObject13Type[] = L"NonStruct";
+ EXPECT_EQ(
+ WideString(kExpectedObject13Type),
+ WideString::FromUTF16LE(buffer, FXSYS_len(kExpectedObject13Type)));
+ EXPECT_EQ(-1, FPDF_StructElement_GetMarkedContentID(object13));
+ ASSERT_EQ(1, FPDF_StructElement_CountChildren(object13));
+
+ // TODO(crbug.com/pdfium/672): Fetch this inline MCR child; expected null for now.
+ EXPECT_FALSE(FPDF_StructElement_GetChildAtIndex(object13, 0));
+
+ // Second branch: objects 11 -> 14 -> 15 -> inline MCR dict (MCID 1).
+ // Object 14's /K also holds an inline OBJR dict referencing object 4.
+ FPDF_STRUCTELEMENT object11 =
+ FPDF_StructElement_GetChildAtIndex(object8, 1);
+ ASSERT_TRUE(object11);
+ ASSERT_EQ(4U, FPDF_StructElement_GetType(object11, buffer, sizeof(buffer)));
+ const wchar_t kExpectedObject11Type[] = L"P";
+ EXPECT_EQ(
+ WideString(kExpectedObject11Type),
+ WideString::FromUTF16LE(buffer, FXSYS_len(kExpectedObject11Type)));
+ EXPECT_EQ(-1, FPDF_StructElement_GetMarkedContentID(object11));
+ ASSERT_EQ(1, FPDF_StructElement_CountChildren(object11));
+
+ FPDF_STRUCTELEMENT object14 =
+ FPDF_StructElement_GetChildAtIndex(object11, 0);
+ ASSERT_TRUE(object14);
+ ASSERT_EQ(20U,
+ FPDF_StructElement_GetType(object14, buffer, sizeof(buffer)));
+ const wchar_t kExpectedObject14Type[] = L"NonStruct";
+ EXPECT_EQ(
+ WideString(kExpectedObject14Type),
+ WideString::FromUTF16LE(buffer, FXSYS_len(kExpectedObject14Type)));
+ EXPECT_EQ(-1, FPDF_StructElement_GetMarkedContentID(object14));
+ ASSERT_EQ(2, FPDF_StructElement_CountChildren(object14));
+
+ // TODO(crbug.com/pdfium/672): Object 15 should be at index 1. NOTE(review): object 14's /K lists 15 first, then the inline OBJR dict -- confirm intent.
+ EXPECT_FALSE(FPDF_StructElement_GetChildAtIndex(object14, 1));
+ FPDF_STRUCTELEMENT object15 =
+ FPDF_StructElement_GetChildAtIndex(object14, 0);
+ ASSERT_TRUE(object15);
+ ASSERT_EQ(20U,
+ FPDF_StructElement_GetType(object15, buffer, sizeof(buffer)));
+ const wchar_t kExpectedObject15Type[] = L"NonStruct";
+ EXPECT_EQ(
+ WideString(kExpectedObject15Type),
+ WideString::FromUTF16LE(buffer, FXSYS_len(kExpectedObject15Type)));
+ EXPECT_EQ(-1, FPDF_StructElement_GetMarkedContentID(object15));
+ ASSERT_EQ(1, FPDF_StructElement_CountChildren(object15));
+
+ // TODO(crbug.com/pdfium/672): Fetch this inline MCR child; expected null for now.
+ EXPECT_FALSE(FPDF_StructElement_GetChildAtIndex(object15, 0));
+ }
+
+ UnloadPage(page);
+}
diff --git a/testing/resources/tagged_mcr_objr.in b/testing/resources/tagged_mcr_objr.in
new file mode 100644
index 0000000..3394633
--- /dev/null
+++ b/testing/resources/tagged_mcr_objr.in
@@ -0,0 +1,160 @@
+{{header}}
+{{object 1 0}} <<
+ /Type /Catalog
+ /Pages 2 0 R
+ /StructTreeRoot 7 0 R
+ /MarkInfo <<
+ /Type /MarkInfo
+ /Marked true
+ >>
+>>
+endobj
+{{object 2 0}} <<
+ /Type /Pages
+ /Count 1
+ /Kids [3 0 R]
+>>
+endobj
+{{object 3 0}} <<
+ /Type /Page
+ /Parent 2 0 R
+ /StructParents 0
+ /Annots [4 0 R]
+ /Contents 5 0 R
+ /MediaBox [0 0 612 792]
+ /Resources <<
+ /ProcSet [/PDF /Text]
+ /Font <<
+ /F4 6 0 R
+ >>
+ >>
+>>
+endobj
+{{object 4 0}} <<
+ /Type /Annot
+ /Subtype /Link
+ /Border [0 0 0]
+ /Dest /top
+ /F 4
+ /Rect [20 46 68 61]
+>>
+endobj
+{{object 5 0}} <<
+ {{streamlen}}
+>>
+stream
+q
+BT
+/P <</MCID 0 >>BDC
+/F4 16 Tf
+20 650 Td
+(Hello, world!) Tj
+EMC
+ET
+BT
+/P <</MCID 1 >>BDC
+/F4 16 Tf
+20 50 Td
+(Link to top) Tj
+EMC
+ET
+Q
+endstream
+endobj
+{{object 6 0}} <<
+ /Type /Font
+ /Subtype /Type1
+ /BaseFont /Times-Roman
+>>
+endobj
+{{object 7 0}} <<
+ /Type /StructTreeRoot
+ /K 8 0 R
+ /ParentTree 9 0 R
+ /ParentTreeNextKey 1
+>>
+endobj
+{{object 8 0}} <<
+ /Type /StructElem
+ /S /Document
+ /P 7 0 R
+ /K [10 0 R 11 0 R]
+ /ID /2
+ /Lang (en-US)
+>>
+endobj
+{{object 9 0}} <<
+ /Type /ParentTree
+ /Nums [0 [13 0 R 15 0 R]]
+>>
+endobj
+{{object 10 0}} <<
+ /Type /StructElem
+ /S /NonStruct
+ /P 8 0 R
+ /K [12 0 R]
+ /ID /6
+>>
+endobj
+{{object 11 0}} <<
+ /Type /StructElem
+ /S /P
+ /P 8 0 R
+ /K [14 0 R]
+ /ID /4
+>>
+endobj
+{{object 12 0}} <<
+ /Type /StructElem
+ /S /P
+ /P 10 0 R
+ /K [13 0 R]
+ /ID /3
+>>
+endobj
+{{object 13 0}} <<
+ /Type /StructElem
+ /S /NonStruct
+ /P 12 0 R
+ /K [
+ <<
+ /Type /MCR
+ /MCID 0
+ /Pg 3 0 R
+ >>
+ ]
+ /ID /7
+>>
+endobj
+{{object 14 0}} <<
+ /Type /StructElem
+ /S /NonStruct
+ /P 11 0 R
+ /K [
+ 15 0 R
+ <<
+ /Type /OBJR
+ /Obj 4 0 R
+ >>
+ ]
+ /ID /9
+>>
+endobj
+{{object 15 0}} <<
+ /Type /StructElem
+ /S /NonStruct
+ /P 14 0 R
+ /K [
+ <<
+ /Type /MCR
+ /Pg 3 0 R
+ /MCID 1
+ >>
+ ]
+ /ID /10
+>>
+endobj
+{{xref}}
+{{trailer}}
+{{startxref}}
+%%EOF
diff --git a/testing/resources/tagged_mcr_objr.pdf b/testing/resources/tagged_mcr_objr.pdf
new file mode 100644
index 0000000..a86faa7
--- /dev/null
+++ b/testing/resources/tagged_mcr_objr.pdf
@@ -0,0 +1,182 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+ /Type /Catalog
+ /Pages 2 0 R
+ /StructTreeRoot 7 0 R
+ /MarkInfo <<
+ /Type /MarkInfo
+ /Marked true
+ >>
+>>
+endobj
+2 0 obj <<
+ /Type /Pages
+ /Count 1
+ /Kids [3 0 R]
+>>
+endobj
+3 0 obj <<
+ /Type /Page
+ /Parent 2 0 R
+ /StructParents 0
+ /Annots [4 0 R]
+ /Contents 5 0 R
+ /MediaBox [0 0 612 792]
+ /Resources <<
+ /ProcSet [/PDF /Text]
+ /Font <<
+ /F4 6 0 R
+ >>
+ >>
+>>
+endobj
+4 0 obj <<
+ /Type /Annot
+ /Subtype /Link
+ /Border [0 0 0]
+ /Dest /top
+ /F 4
+ /Rect [20 46 68 61]
+>>
+endobj
+5 0 obj <<
+ /Length 137
+>>
+stream
+q
+BT
+/P <</MCID 0 >>BDC
+/F4 16 Tf
+20 650 Td
+(Hello, world!) Tj
+EMC
+ET
+BT
+/P <</MCID 1 >>BDC
+/F4 16 Tf
+20 50 Td
+(Link to top) Tj
+EMC
+ET
+Q
+endstream
+endobj
+6 0 obj <<
+ /Type /Font
+ /Subtype /Type1
+ /BaseFont /Times-Roman
+>>
+endobj
+7 0 obj <<
+ /Type /StructTreeRoot
+ /K 8 0 R
+ /ParentTree 9 0 R
+ /ParentTreeNextKey 1
+>>
+endobj
+8 0 obj <<
+ /Type /StructElem
+ /S /Document
+ /P 7 0 R
+ /K [10 0 R 11 0 R]
+ /ID /2
+ /Lang (en-US)
+>>
+endobj
+9 0 obj <<
+ /Type /ParentTree
+ /Nums [0 [13 0 R 15 0 R]]
+>>
+endobj
+10 0 obj <<
+ /Type /StructElem
+ /S /NonStruct
+ /P 8 0 R
+ /K [12 0 R]
+ /ID /6
+>>
+endobj
+11 0 obj <<
+ /Type /StructElem
+ /S /P
+ /P 8 0 R
+ /K [14 0 R]
+ /ID /4
+>>
+endobj
+12 0 obj <<
+ /Type /StructElem
+ /S /P
+ /P 10 0 R
+ /K [13 0 R]
+ /ID /3
+>>
+endobj
+13 0 obj <<
+ /Type /StructElem
+ /S /NonStruct
+ /P 12 0 R
+ /K [
+ <<
+ /Type /MCR
+ /MCID 0
+ /Pg 3 0 R
+ >>
+ ]
+ /ID /7
+>>
+endobj
+14 0 obj <<
+ /Type /StructElem
+ /S /NonStruct
+ /P 11 0 R
+ /K [
+ 15 0 R
+ <<
+ /Type /OBJR
+ /Obj 4 0 R
+ >>
+ ]
+ /ID /9
+>>
+endobj
+15 0 obj <<
+ /Type /StructElem
+ /S /NonStruct
+ /P 14 0 R
+ /K [
+ <<
+ /Type /MCR
+ /Pg 3 0 R
+ /MCID 1
+ >>
+ ]
+ /ID /10
+>>
+endobj
+xref
+0 16
+0000000000 65535 f
+0000000015 00000 n
+0000000149 00000 n
+0000000212 00000 n
+0000000427 00000 n
+0000000540 00000 n
+0000000729 00000 n
+0000000807 00000 n
+0000000906 00000 n
+0000001019 00000 n
+0000001088 00000 n
+0000001180 00000 n
+0000001264 00000 n
+0000001349 00000 n
+0000001500 00000 n
+0000001650 00000 n
+trailer <<
+ /Root 1 0 R
+ /Size 16
+>>
+startxref
+1802
+%%EOF