Add FPDF_StructElement_GetStringAttribute
Provides a way to retrieve a string attribute from a struct element
in a tagged PDF.
Bug: pdfium:1564
Change-Id: I9283d458df5560b03a28e68d568eb7c36e6c3a81
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/71530
Auto-Submit: Dominic Mazzoni <dmazzoni@chromium.org>
Commit-Queue: Dominic Mazzoni <dmazzoni@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
diff --git a/fpdfsdk/fpdf_structtree.cpp b/fpdfsdk/fpdf_structtree.cpp
index 2c5c6e3..56d1762 100644
--- a/fpdfsdk/fpdf_structtree.cpp
+++ b/fpdfsdk/fpdf_structtree.cpp
@@ -7,6 +7,7 @@
#include <memory>
#include "core/fpdfapi/page/cpdf_page.h"
+#include "core/fpdfapi/parser/cpdf_array.h"
#include "core/fpdfapi/parser/cpdf_dictionary.h"
#include "core/fpdfdoc/cpdf_structelement.h"
#include "core/fpdfdoc/cpdf_structtree.h"
@@ -78,6 +79,28 @@
return elem ? WideStringToBuffer(elem->GetAltText(), buffer, buflen) : 0;
}
+// Searches the attribute objects in the "A" array of |struct_element| for an
+// entry named |attr_name| and hands its text to WideStringToBuffer() together
+// with |buffer| and |buflen|.
+//
+// Only values that are PDF strings or names are reported, as promised by the
+// declaration in public/fpdf_structtree.h. Entries of any other type are
+// skipped instead of being reported as an empty string.
+//
+// Returns whatever WideStringToBuffer() returns for the matching value, or 0
+// when |struct_element| is null, it has no "A" array, or no attribute object
+// holds a string/name value under |attr_name|.
+FPDF_EXPORT unsigned long FPDF_CALLCONV
+FPDF_StructElement_GetStringAttribute(FPDF_STRUCTELEMENT struct_element,
+                                      FPDF_BYTESTRING attr_name,
+                                      void* buffer,
+                                      unsigned long buflen) {
+  CPDF_StructElement* elem =
+      CPDFStructElementFromFPDFStructElement(struct_element);
+  const CPDF_Dictionary* dict = elem ? elem->GetDict() : nullptr;
+  const CPDF_Array* array = dict ? dict->GetArrayFor("A") : nullptr;
+  if (!array)
+    return 0;
+
+  CPDF_ArrayLocker locker(array);
+  for (const RetainPtr<CPDF_Object>& obj : locker) {
+    // Array entries that are not dictionaries cannot carry named attributes.
+    const CPDF_Dictionary* obj_dict = obj->AsDictionary();
+    if (!obj_dict)
+      continue;
+
+    // Resolve indirect references first so the type check below inspects the
+    // actual value rather than the reference object.
+    const CPDF_Object* attr = obj_dict->GetDirectObjectFor(attr_name);
+    if (!attr || !(attr->IsString() || attr->IsName()))
+      continue;
+
+    return WideStringToBuffer(attr->GetUnicodeText(), buffer, buflen);
+  }
+  return 0;
+}
+
FPDF_EXPORT int FPDF_CALLCONV
FPDF_StructElement_GetMarkedContentID(FPDF_STRUCTELEMENT struct_element) {
CPDF_StructElement* elem =
diff --git a/fpdfsdk/fpdf_structtree_embeddertest.cpp b/fpdfsdk/fpdf_structtree_embeddertest.cpp
index 8a8c65f..5a3ba0b 100644
--- a/fpdfsdk/fpdf_structtree_embeddertest.cpp
+++ b/fpdfsdk/fpdf_structtree_embeddertest.cpp
@@ -5,6 +5,7 @@
#include "core/fxcrt/fx_string.h"
#include "public/fpdf_structtree.h"
#include "testing/embedder_test.h"
+#include "testing/fx_string_testhelpers.h"
#include "third_party/base/optional.h"
#include "third_party/base/stl_util.h"
@@ -76,6 +77,60 @@
UnloadPage(page);
}
+// Walks Document -> Table -> TR -> TH in tagged_table.pdf and verifies that
+// FPDF_StructElement_GetStringAttribute() reports the header cell's "Scope"
+// attribute, and returns 0 for a missing attribute or a null element.
+TEST_F(FPDFStructTreeEmbedderTest, GetStringAttribute) {
+  ASSERT_TRUE(OpenDocument("tagged_table.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);
+
+  {
+    // Scoped so the struct tree is released before the page is unloaded.
+    ScopedFPDFStructTree struct_tree(FPDF_StructTree_GetForPage(page));
+    ASSERT_TRUE(struct_tree);
+    ASSERT_EQ(1, FPDF_StructTree_CountChildren(struct_tree.get()));
+
+    FPDF_STRUCTELEMENT document =
+        FPDF_StructTree_GetChildAtIndex(struct_tree.get(), 0);
+    ASSERT_TRUE(document);
+
+    // Expected sizes below are byte counts of the UTF-16 text plus its
+    // terminating NUL, e.g. "Document" -> (8 + 1) * 2 = 18.
+    constexpr int kBufLen = 100;
+    uint16_t buffer[kBufLen] = {0};
+    EXPECT_EQ(18U, FPDF_StructElement_GetType(document, buffer, kBufLen));
+    EXPECT_EQ("Document", GetPlatformString(buffer));
+
+    ASSERT_EQ(1, FPDF_StructElement_CountChildren(document));
+    FPDF_STRUCTELEMENT table = FPDF_StructElement_GetChildAtIndex(document, 0);
+    ASSERT_TRUE(table);
+
+    EXPECT_EQ(12U, FPDF_StructElement_GetType(table, buffer, kBufLen));
+    EXPECT_EQ("Table", GetPlatformString(buffer));
+
+    ASSERT_EQ(2, FPDF_StructElement_CountChildren(table));
+    FPDF_STRUCTELEMENT row = FPDF_StructElement_GetChildAtIndex(table, 0);
+    ASSERT_TRUE(row);
+
+    ASSERT_EQ(2, FPDF_StructElement_CountChildren(row));
+    FPDF_STRUCTELEMENT header_cell = FPDF_StructElement_GetChildAtIndex(row, 0);
+    ASSERT_TRUE(header_cell);
+
+    EXPECT_EQ(6U, FPDF_StructElement_GetType(header_cell, buffer, kBufLen));
+    EXPECT_EQ("TH", GetPlatformString(buffer));
+
+    // The header should have an attribute "Scope" with a scope of "Row".
+    EXPECT_EQ(8U, FPDF_StructElement_GetStringAttribute(header_cell, "Scope",
+                                                        buffer, kBufLen));
+    EXPECT_EQ("Row", GetPlatformString(buffer));
+
+    // An unsupported attribute should return 0.
+    EXPECT_EQ(0U, FPDF_StructElement_GetStringAttribute(header_cell, "Other",
+                                                        buffer, kBufLen));
+
+    // A null struct element should not crash.
+    EXPECT_EQ(0U, FPDF_StructElement_GetStringAttribute(nullptr, "Other",
+                                                        buffer, kBufLen));
+  }
+
+  UnloadPage(page);
+}
+
TEST_F(FPDFStructTreeEmbedderTest, GetMarkedContentID) {
ASSERT_TRUE(OpenDocument("marked_content_id.pdf"));
FPDF_PAGE page = LoadPage(0);
diff --git a/fpdfsdk/fpdf_view_c_api_test.c b/fpdfsdk/fpdf_view_c_api_test.c
index b0e4606..214fb733 100644
--- a/fpdfsdk/fpdf_view_c_api_test.c
+++ b/fpdfsdk/fpdf_view_c_api_test.c
@@ -324,6 +324,7 @@
CHK(FPDF_StructElement_GetAltText);
CHK(FPDF_StructElement_GetChildAtIndex);
CHK(FPDF_StructElement_GetMarkedContentID);
+ CHK(FPDF_StructElement_GetStringAttribute);
CHK(FPDF_StructElement_GetTitle);
CHK(FPDF_StructElement_GetType);
CHK(FPDF_StructTree_Close);
diff --git a/public/fpdf_structtree.h b/public/fpdf_structtree.h
index a8083d5..6a14023 100644
--- a/public/fpdf_structtree.h
+++ b/public/fpdf_structtree.h
@@ -75,6 +75,29 @@
void* buffer,
unsigned long buflen);
+// Function: FPDF_StructElement_GetStringAttribute
+// Get a struct element attribute of type "name" or "string".
+// Experimental API.
+// Parameters:
+// struct_element - Handle to the struct element.
+// attr_name - The name of the attribute to retrieve.
+// buffer - A buffer for output. May be NULL.
+// buflen - The length of the buffer, in bytes. May be 0.
+// Return value:
+// The number of bytes in the attribute value, including the
+// terminating NUL character. The number of bytes is returned
+// regardless of the |buffer| and |buflen| parameters.
+// Returns 0 if |struct_element| is NULL, if the element has no
+// attribute array, or if none of its attribute objects contains
+// an entry named |attr_name|.
+// Comments:
+// Regardless of the platform, the |buffer| is always in UTF-16LE
+// encoding. The string is terminated by a UTF16 NUL character. If
+// |buflen| is less than the required length, or |buffer| is NULL,
+// |buffer| will not be modified.
+// A successful call never returns less than 2 (the size of the
+// terminating NUL alone), so a return value of 0 always means that
+// no attribute value was found.
+FPDF_EXPORT unsigned long FPDF_CALLCONV
+FPDF_StructElement_GetStringAttribute(FPDF_STRUCTELEMENT struct_element,
+ FPDF_BYTESTRING attr_name,
+ void* buffer,
+ unsigned long buflen);
+
// Function: FPDF_StructElement_GetMarkedContentID
// Get the marked content ID for a given element.
// Parameters:
diff --git a/testing/resources/tagged_table.in b/testing/resources/tagged_table.in
new file mode 100644
index 0000000..d5a9877
--- /dev/null
+++ b/testing/resources/tagged_table.in
@@ -0,0 +1,196 @@
+{{header}}
+{{object 1 0}} <<
+ /Type /Catalog
+ /Pages 2 0 R
+ /StructTreeRoot 8 0 R
+ /Lang (en-US)
+ /MarkInfo <<
+ /Marked true
+ >>
+>>
+endobj
+{{object 2 0}} <<
+ /Type /Pages
+ /Count 1
+ /Kids [3 0 R]
+>>
+endobj
+{{object 3 0}} <<
+ /Type /Page
+ /Parent 2 0 R
+ /Contents 4 0 R
+ /MediaBox [0 0 612 792]
+ /Group <<
+ /CS /DeviceRGB
+ /I true
+ /S /Transparency
+ >>
+ /Resources <<
+ /ProcSet [/PDF /ImageC /ImageI /ImageB]
+ /XObject <<
+ /Tr8 5 0 R
+ /Im7 6 0 R
+ >>
+ /ExtGState <<
+ /EGS9 7 0 R
+ >>
+ >>
+ /StructParents 0
+>>
+endobj
+{{object 4 0}} <<
+ {{streamlen}}
+>>
+stream
+0.1 w
+/Artifact
+BMC
+q
+0 0 612 792 re
+W* n
+EMC
+/Figure<</MCID 0>>
+BDC
+Q
+q
+281 685.3 50 50 re
+W* n
+q
+49.9 0 0 50 281.1 685.4 cm
+/Im7 Do
+Q
+EMC
+Q
+q
+EGS9 gs /Tr8 Do
+Q
+endstream
+endobj
+{{object 5 0}} <<
+ /Type /XObject
+ /Subtype /Form
+ /BBox [-140 395 753 395.1]
+ /Group <<
+ /CS /DeviceRGB
+ /K true
+ /S /Transparency
+ >>
+ {{streamlen}}
+>>
+stream
+endstream
+endobj
+{{object 6 0}} <<
+ /Type /XObject
+ /Subtype /Image
+ /Width 50
+ /Height 50
+ /BitsPerComponent 8
+ /ColorSpace /DeviceRGB
+ /Filter [/ASCIIHexDecode /FlateDecode]
+ {{streamlen}}
+>>
+stream
+789cedc13101000000c2a0f54fed6f06a00000000000000078031d4c0001
+endstream
+endobj
+{{object 7 0}} <<
+ /ca 0.5
+ /CA 0.5
+>>
+endobj
+{{object 8 0}} <<
+ /Type /StructTreeRoot
+ /ParentTree 9 0 R
+ /K [10 0 R]
+ /RoleMap <<
+ /Document /Document
+ /Standard /P
+ /Figure /Figure
+ >>
+>>
+endobj
+{{object 9 0}} <<
+ /Nums [
+ 0
+ [10 0 R 11 0 R 12 0 R 13 0 R 14 0 R 15 0 R 16 0 R 17 0 R]
+ ]
+>>
+endobj
+{{object 10 0}} <<
+ /Type /StructElem
+ /S /Document
+ /K [11 0 R]
+ /P 8 0 R
+ /T (TitleText)
+ /Pg 3 0 R
+>>
+endobj
+{{object 11 0}} <<
+ /Type /StructElem
+ /S /Table
+ /K [12 0 R 13 0 R]
+ /P 10 0 R
+ /Pg 3 0 R
+>>
+endobj
+{{object 12 0}} <<
+ /Type /StructElem
+ /S /TR
+ /K [14 0 R 15 0 R]
+ /P 11 0 R
+ /Pg 3 0 R
+>>
+endobj
+{{object 13 0}} <<
+ /Type /StructElem
+ /S /TR
+ /K [16 0 R 17 0 R]
+ /P 11 0 R
+ /Pg 3 0 R
+>>
+endobj
+{{object 14 0}} <<
+ /Type /StructElem
+ /S /TH
+ /P 12 0 R
+ /Pg 3 0 R
+ /A [
+ <<
+ /O /Table
+ /Scope /Row
+ >>
+ ]
+>>
+endobj
+{{object 15 0}} <<
+ /Type /StructElem
+ /S /TD
+ /P 12 0 R
+ /Pg 3 0 R
+>>
+endobj
+{{object 16 0}} <<
+ /Type /StructElem
+ /S /TH
+ /P 13 0 R
+ /Pg 3 0 R
+ /A [
+ <<
+ /O /Table
+ /Scope /Row
+ >>
+ ]
+>>
+endobj
+{{object 17 0}} <<
+ /Type /StructElem
+ /S /TD
+ /P 13 0 R
+ /Pg 3 0 R
+>>
+endobj
+{{xref}}
+{{trailer}}
+{{startxref}}
+%%EOF
diff --git a/testing/resources/tagged_table.pdf b/testing/resources/tagged_table.pdf
new file mode 100644
index 0000000..b4469fd
--- /dev/null
+++ b/testing/resources/tagged_table.pdf
@@ -0,0 +1,220 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+ /Type /Catalog
+ /Pages 2 0 R
+ /StructTreeRoot 8 0 R
+ /Lang (en-US)
+ /MarkInfo <<
+ /Marked true
+ >>
+>>
+endobj
+2 0 obj <<
+ /Type /Pages
+ /Count 1
+ /Kids [3 0 R]
+>>
+endobj
+3 0 obj <<
+ /Type /Page
+ /Parent 2 0 R
+ /Contents 4 0 R
+ /MediaBox [0 0 612 792]
+ /Group <<
+ /CS /DeviceRGB
+ /I true
+ /S /Transparency
+ >>
+ /Resources <<
+ /ProcSet [/PDF /ImageC /ImageI /ImageB]
+ /XObject <<
+ /Tr8 5 0 R
+ /Im7 6 0 R
+ >>
+ /ExtGState <<
+ /EGS9 7 0 R
+ >>
+ >>
+ /StructParents 0
+>>
+endobj
+4 0 obj <<
+ /Length 162
+>>
+stream
+0.1 w
+/Artifact
+BMC
+q
+0 0 612 792 re
+W* n
+EMC
+/Figure<</MCID 0>>
+BDC
+Q
+q
+281 685.3 50 50 re
+W* n
+q
+49.9 0 0 50 281.1 685.4 cm
+/Im7 Do
+Q
+EMC
+Q
+q
+EGS9 gs /Tr8 Do
+Q
+endstream
+endobj
+5 0 obj <<
+ /Type /XObject
+ /Subtype /Form
+ /BBox [-140 395 753 395.1]
+ /Group <<
+ /CS /DeviceRGB
+ /K true
+ /S /Transparency
+ >>
+ /Length 0
+>>
+stream
+endstream
+endobj
+6 0 obj <<
+ /Type /XObject
+ /Subtype /Image
+ /Width 50
+ /Height 50
+ /BitsPerComponent 8
+ /ColorSpace /DeviceRGB
+ /Filter [/ASCIIHexDecode /FlateDecode]
+ /Length 61
+>>
+stream
+789cedc13101000000c2a0f54fed6f06a00000000000000078031d4c0001
+endstream
+endobj
+7 0 obj <<
+ /ca 0.5
+ /CA 0.5
+>>
+endobj
+8 0 obj <<
+ /Type /StructTreeRoot
+ /ParentTree 9 0 R
+ /K [10 0 R]
+ /RoleMap <<
+ /Document /Document
+ /Standard /P
+ /Figure /Figure
+ >>
+>>
+endobj
+9 0 obj <<
+ /Nums [
+ 0
+ [10 0 R 11 0 R 12 0 R 13 0 R 14 0 R 15 0 R 16 0 R 17 0 R]
+ ]
+>>
+endobj
+10 0 obj <<
+ /Type /StructElem
+ /S /Document
+ /K [11 0 R]
+ /P 8 0 R
+ /T (TitleText)
+ /Pg 3 0 R
+>>
+endobj
+11 0 obj <<
+ /Type /StructElem
+ /S /Table
+ /K [12 0 R 13 0 R]
+ /P 10 0 R
+ /Pg 3 0 R
+>>
+endobj
+12 0 obj <<
+ /Type /StructElem
+ /S /TR
+ /K [14 0 R 15 0 R]
+ /P 11 0 R
+ /Pg 3 0 R
+>>
+endobj
+13 0 obj <<
+ /Type /StructElem
+ /S /TR
+ /K [16 0 R 17 0 R]
+ /P 11 0 R
+ /Pg 3 0 R
+>>
+endobj
+14 0 obj <<
+ /Type /StructElem
+ /S /TH
+ /P 12 0 R
+ /Pg 3 0 R
+ /A [
+ <<
+ /O /Table
+ /Scope /Row
+ >>
+ ]
+>>
+endobj
+15 0 obj <<
+ /Type /StructElem
+ /S /TD
+ /P 12 0 R
+ /Pg 3 0 R
+>>
+endobj
+16 0 obj <<
+ /Type /StructElem
+ /S /TH
+ /P 13 0 R
+ /Pg 3 0 R
+ /A [
+ <<
+ /O /Table
+ /Scope /Row
+ >>
+ ]
+>>
+endobj
+17 0 obj <<
+ /Type /StructElem
+ /S /TD
+ /P 13 0 R
+ /Pg 3 0 R
+>>
+endobj
+xref
+0 18
+0000000000 65535 f
+0000000015 00000 n
+0000000145 00000 n
+0000000208 00000 n
+0000000556 00000 n
+0000000770 00000 n
+0000000952 00000 n
+0000001212 00000 n
+0000001253 00000 n
+0000001412 00000 n
+0000001515 00000 n
+0000001626 00000 n
+0000001725 00000 n
+0000001821 00000 n
+0000001917 00000 n
+0000002051 00000 n
+0000002126 00000 n
+0000002260 00000 n
+trailer <<
+ /Root 1 0 R
+ /Size 18
+>>
+startxref
+2335
+%%EOF