Add FPDF_StructElement_GetID and FPDF_StructElement_GetLang
Provides a way to retrieve the ID or language from a struct element
in a tagged PDF.
Bug: pdfium:1564
Change-Id: I1499863cd5896466d70de409c0e09c8c4088d8bf
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/71570
Reviewed-by: Lei Zhang <thestig@chromium.org>
Commit-Queue: Dominic Mazzoni <dmazzoni@chromium.org>
diff --git a/fpdfsdk/fpdf_structtree.cpp b/fpdfsdk/fpdf_structtree.cpp
index 56d1762..e611647 100644
--- a/fpdfsdk/fpdf_structtree.cpp
+++ b/fpdfsdk/fpdf_structtree.cpp
@@ -80,6 +80,36 @@
}
FPDF_EXPORT unsigned long FPDF_CALLCONV
+FPDF_StructElement_GetID(FPDF_STRUCTELEMENT struct_element,
+                         void* buffer,
+                         unsigned long buflen) {
+  // Report 0 when the element or its dictionary is null, or when "ID" is
+  // absent or not a string; otherwise hand the text to the UTF-16 encoder.
+  CPDF_StructElement* elem =
+      CPDFStructElementFromFPDFStructElement(struct_element);
+  const CPDF_Dictionary* dict = elem ? elem->GetDict() : nullptr;
+  const CPDF_Object* id = dict ? dict->GetObjectFor("ID") : nullptr;
+  if (!id || !id->IsString())
+    return 0;
+  return Utf16EncodeMaybeCopyAndReturnLength(id->GetUnicodeText(), buffer,
+                                             buflen);
+}
+
+FPDF_EXPORT unsigned long FPDF_CALLCONV
+FPDF_StructElement_GetLang(FPDF_STRUCTELEMENT struct_element,
+                           void* buffer,
+                           unsigned long buflen) {
+  CPDF_StructElement* elem =
+      CPDFStructElementFromFPDFStructElement(struct_element);
+  const CPDF_Dictionary* dict = elem ? elem->GetDict() : nullptr;
+  const CPDF_Object* obj = dict ? dict->GetObjectFor("Lang") : nullptr;
+  if (!obj || !obj->IsString())
+    return 0;  // Null element/dictionary, or no string-valued "Lang" entry.
+  return Utf16EncodeMaybeCopyAndReturnLength(obj->GetUnicodeText(), buffer,
+                                             buflen);
+}
+
+FPDF_EXPORT unsigned long FPDF_CALLCONV
FPDF_StructElement_GetStringAttribute(FPDF_STRUCTELEMENT struct_element,
FPDF_BYTESTRING attr_name,
void* buffer,
@@ -93,10 +123,14 @@
CPDF_ArrayLocker locker(array);
for (const RetainPtr<CPDF_Object>& obj : locker) {
const CPDF_Dictionary* obj_dict = obj->AsDictionary();
- if (obj_dict && obj_dict->KeyExist(attr_name)) {
- return WideStringToBuffer(obj_dict->GetUnicodeTextFor(attr_name), buffer,
- buflen);
- }
+    // Array entries that are not dictionaries cannot carry the attribute.
+    if (!obj_dict)
+      continue;
+    const CPDF_Object* attr = obj_dict->GetObjectFor(attr_name);
+    if (!attr || !(attr->IsString() || attr->IsName()))
+      continue;
+    return Utf16EncodeMaybeCopyAndReturnLength(attr->GetUnicodeText(), buffer,
+                                               buflen);
}
return 0;
}
diff --git a/fpdfsdk/fpdf_structtree_embeddertest.cpp b/fpdfsdk/fpdf_structtree_embeddertest.cpp
index 5a3ba0b..aff903a 100644
--- a/fpdfsdk/fpdf_structtree_embeddertest.cpp
+++ b/fpdfsdk/fpdf_structtree_embeddertest.cpp
@@ -103,6 +103,10 @@
EXPECT_EQ(12U, FPDF_StructElement_GetType(table, buffer, kBufLen));
EXPECT_EQ("Table", GetPlatformString(buffer));
+ // The table should have an attribute "Summary" set to the empty string.
+ EXPECT_EQ(2U, FPDF_StructElement_GetStringAttribute(table, "Summary",
+ buffer, kBufLen));
+
ASSERT_EQ(2, FPDF_StructElement_CountChildren(table));
FPDF_STRUCTELEMENT row = FPDF_StructElement_GetChildAtIndex(table, 0);
ASSERT_TRUE(row);
@@ -119,6 +123,11 @@
buffer, kBufLen));
EXPECT_EQ("Row", GetPlatformString(buffer));
+    // The header has an attribute "ColSpan", but it's not a string or name,
+    // so 0 is returned.
+ EXPECT_EQ(0U, FPDF_StructElement_GetStringAttribute(header_cell, "ColSpan",
+ buffer, kBufLen));
+
// An unsupported attribute should return 0.
EXPECT_EQ(0U, FPDF_StructElement_GetStringAttribute(header_cell, "Other",
buffer, kBufLen));
@@ -131,6 +140,96 @@
UnloadPage(page);
}
+TEST_F(FPDFStructTreeEmbedderTest, GetID) {
+  ASSERT_TRUE(OpenDocument("tagged_table.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);
+
+  {
+    ScopedFPDFStructTree struct_tree(FPDF_StructTree_GetForPage(page));
+    ASSERT_TRUE(struct_tree);
+    ASSERT_EQ(1, FPDF_StructTree_CountChildren(struct_tree.get()));
+
+    FPDF_STRUCTELEMENT document =
+        FPDF_StructTree_GetChildAtIndex(struct_tree.get(), 0);
+    ASSERT_TRUE(document);
+
+    constexpr int kBufLen = 100;
+    uint16_t buffer[kBufLen] = {0};
+    EXPECT_EQ(18U, FPDF_StructElement_GetType(document, buffer, kBufLen));
+    EXPECT_EQ("Document", GetPlatformString(buffer));
+
+    // The document has no ID, so 0 is returned.
+    EXPECT_EQ(0U, FPDF_StructElement_GetID(document, buffer, kBufLen));
+
+    ASSERT_EQ(1, FPDF_StructElement_CountChildren(document));
+    FPDF_STRUCTELEMENT table = FPDF_StructElement_GetChildAtIndex(document, 0);
+    ASSERT_TRUE(table);
+
+    EXPECT_EQ(12U, FPDF_StructElement_GetType(table, buffer, kBufLen));
+    EXPECT_EQ("Table", GetPlatformString(buffer));
+
+    // The table has an ID; 14 bytes is "node12" plus NUL in UTF-16.
+    EXPECT_EQ(14U, FPDF_StructElement_GetID(table, buffer, kBufLen));
+    EXPECT_EQ("node12", GetPlatformString(buffer));
+
+    // The first child of the table is a row, which has an empty ID.
+    // It returns 2U, the length of an empty string, instead of 0U,
+    // representing null.
+    ASSERT_EQ(2, FPDF_StructElement_CountChildren(table));
+    FPDF_STRUCTELEMENT row = FPDF_StructElement_GetChildAtIndex(table, 0);
+    ASSERT_TRUE(row);
+    EXPECT_EQ(2U, FPDF_StructElement_GetID(row, buffer, kBufLen));
+  }
+
+  UnloadPage(page);
+}
+
+TEST_F(FPDFStructTreeEmbedderTest, GetLang) {
+  ASSERT_TRUE(OpenDocument("tagged_table.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);
+
+  {
+    ScopedFPDFStructTree struct_tree(FPDF_StructTree_GetForPage(page));
+    ASSERT_TRUE(struct_tree);
+    ASSERT_EQ(1, FPDF_StructTree_CountChildren(struct_tree.get()));
+
+    FPDF_STRUCTELEMENT document =
+        FPDF_StructTree_GetChildAtIndex(struct_tree.get(), 0);
+    ASSERT_TRUE(document);
+
+    constexpr int kBufLen = 100;
+    uint16_t buffer[kBufLen] = {0};
+    EXPECT_EQ(18U, FPDF_StructElement_GetType(document, buffer, kBufLen));
+    EXPECT_EQ("Document", GetPlatformString(buffer));
+
+    // The document has a language; 12 bytes is "en-US" plus NUL in UTF-16.
+    EXPECT_EQ(12U, FPDF_StructElement_GetLang(document, buffer, kBufLen));
+    EXPECT_EQ("en-US", GetPlatformString(buffer));
+
+    ASSERT_EQ(1, FPDF_StructElement_CountChildren(document));
+    FPDF_STRUCTELEMENT table = FPDF_StructElement_GetChildAtIndex(document, 0);
+    ASSERT_TRUE(table);
+
+    // The first child is a table, with its own language.
+    EXPECT_EQ(12U, FPDF_StructElement_GetType(table, buffer, kBufLen));
+    EXPECT_EQ("Table", GetPlatformString(buffer));
+
+    EXPECT_EQ(6U, FPDF_StructElement_GetLang(table, buffer, kBufLen));
+    EXPECT_EQ("hu", GetPlatformString(buffer));
+
+    // The first child of the table is a row, which doesn't have a
+    // language explicitly set on it, so 0 is returned.
+    ASSERT_EQ(2, FPDF_StructElement_CountChildren(table));
+    FPDF_STRUCTELEMENT row = FPDF_StructElement_GetChildAtIndex(table, 0);
+    ASSERT_TRUE(row);
+    EXPECT_EQ(0U, FPDF_StructElement_GetLang(row, buffer, kBufLen));
+  }
+
+  UnloadPage(page);
+}
+
TEST_F(FPDFStructTreeEmbedderTest, GetMarkedContentID) {
ASSERT_TRUE(OpenDocument("marked_content_id.pdf"));
FPDF_PAGE page = LoadPage(0);
diff --git a/fpdfsdk/fpdf_view_c_api_test.c b/fpdfsdk/fpdf_view_c_api_test.c
index 825691d..a7310b9 100644
--- a/fpdfsdk/fpdf_view_c_api_test.c
+++ b/fpdfsdk/fpdf_view_c_api_test.c
@@ -325,6 +325,8 @@
CHK(FPDF_StructElement_CountChildren);
CHK(FPDF_StructElement_GetAltText);
CHK(FPDF_StructElement_GetChildAtIndex);
+ CHK(FPDF_StructElement_GetID);
+ CHK(FPDF_StructElement_GetLang);
CHK(FPDF_StructElement_GetMarkedContentID);
CHK(FPDF_StructElement_GetStringAttribute);
CHK(FPDF_StructElement_GetTitle);
diff --git a/public/fpdf_structtree.h b/public/fpdf_structtree.h
index 6a14023..5e1f22e 100644
--- a/public/fpdf_structtree.h
+++ b/public/fpdf_structtree.h
@@ -75,6 +75,47 @@
void* buffer,
unsigned long buflen);
+// Function: FPDF_StructElement_GetID
+// Get the ID for a given element. Experimental API.
+// Parameters:
+//          struct_element -   Handle to the struct element.
+//          buffer         -   A buffer to receive the ID string. May be NULL.
+//          buflen         -   The length of the buffer, in bytes. May be 0.
+// Return value:
+// The number of bytes in the ID string, including the terminating NUL
+// character. The number of bytes is returned regardless of the
+// |buffer| and |buflen| parameters.
+// Comments:
+// Regardless of the platform, the |buffer| is always in UTF-16LE
+// encoding. The string is terminated by a UTF16 NUL character. If
+// |buflen| is less than the required length, or |buffer| is NULL,
+// |buffer| will not be modified.
+FPDF_EXPORT unsigned long FPDF_CALLCONV
+FPDF_StructElement_GetID(FPDF_STRUCTELEMENT struct_element,
+ void* buffer,
+ unsigned long buflen);
+
+// Function: FPDF_StructElement_GetLang
+// Get the case-insensitive IETF BCP 47 language code for an element.
+// Experimental API.
+// Parameters:
+//          struct_element -   Handle to the struct element.
+//          buffer         -   A buffer to receive the lang string. May be NULL.
+//          buflen         -   The length of the buffer, in bytes. May be 0.
+// Return value:
+//          The number of bytes in the lang string, including the terminating
+//          NUL character, regardless of |buffer| and |buflen|. Returns 0 if the
+//          element has no string-valued "Lang" entry.
+// Comments:
+// Regardless of the platform, the |buffer| is always in UTF-16LE
+// encoding. The string is terminated by a UTF16 NUL character. If
+// |buflen| is less than the required length, or |buffer| is NULL,
+// |buffer| will not be modified.
+FPDF_EXPORT unsigned long FPDF_CALLCONV
+FPDF_StructElement_GetLang(FPDF_STRUCTELEMENT struct_element,
+ void* buffer,
+ unsigned long buflen);
+
// Function: FPDF_StructElement_GetStringAttribute
// Get a struct element attribute of type "name" or "string".
// Experimental API.
diff --git a/testing/resources/tagged_table.in b/testing/resources/tagged_table.in
index d5a9877..70df01c 100644
--- a/testing/resources/tagged_table.in
+++ b/testing/resources/tagged_table.in
@@ -124,6 +124,7 @@
/P 8 0 R
/T (TitleText)
/Pg 3 0 R
+ /Lang (en-US)
>>
endobj
{{object 11 0}} <<
@@ -132,6 +133,12 @@
/K [12 0 R 13 0 R]
/P 10 0 R
/Pg 3 0 R
+ /A [<<
+ /O /Table
+ /Summary ()
+ >>]
+ /ID (node12)
+ /Lang (hu)
>>
endobj
{{object 12 0}} <<
@@ -140,6 +147,7 @@
/K [14 0 R 15 0 R]
/P 11 0 R
/Pg 3 0 R
+ /ID ()
>>
endobj
{{object 13 0}} <<
@@ -148,6 +156,7 @@
/K [16 0 R 17 0 R]
/P 11 0 R
/Pg 3 0 R
+ /ID (node14)
>>
endobj
{{object 14 0}} <<
@@ -155,12 +164,15 @@
/S /TH
/P 12 0 R
/Pg 3 0 R
- /A [
- <<
- /O /Table
- /Scope /Row
- >>
- ]
+ /A [<<
+ /O /Table
+ /Scope /Row
+ >>
+ <<
+ /O /Table
+ /ColSpan 2
+ >>]
+ /ID (node15)
>>
endobj
{{object 15 0}} <<
@@ -168,6 +180,7 @@
/S /TD
/P 12 0 R
/Pg 3 0 R
+ /ID (node16)
>>
endobj
{{object 16 0}} <<
@@ -175,12 +188,11 @@
/S /TH
/P 13 0 R
/Pg 3 0 R
- /A [
- <<
- /O /Table
- /Scope /Row
- >>
- ]
+ /A [<<
+ /O /Table
+ /Scope /Row
+ >>]
+ /ID (node17)
>>
endobj
{{object 17 0}} <<
@@ -188,6 +200,7 @@
/S /TD
/P 13 0 R
/Pg 3 0 R
+ /ID (node18)
>>
endobj
{{xref}}
diff --git a/testing/resources/tagged_table.pdf b/testing/resources/tagged_table.pdf
index b4469fd..b11437c 100644
--- a/testing/resources/tagged_table.pdf
+++ b/testing/resources/tagged_table.pdf
@@ -125,6 +125,7 @@
/P 8 0 R
/T (TitleText)
/Pg 3 0 R
+ /Lang (en-US)
>>
endobj
11 0 obj <<
@@ -133,6 +134,12 @@
/K [12 0 R 13 0 R]
/P 10 0 R
/Pg 3 0 R
+ /A [<<
+ /O /Table
+ /Summary ()
+ >>]
+ /ID (node12)
+ /Lang (hu)
>>
endobj
12 0 obj <<
@@ -141,6 +148,7 @@
/K [14 0 R 15 0 R]
/P 11 0 R
/Pg 3 0 R
+ /ID ()
>>
endobj
13 0 obj <<
@@ -149,6 +157,7 @@
/K [16 0 R 17 0 R]
/P 11 0 R
/Pg 3 0 R
+ /ID (node14)
>>
endobj
14 0 obj <<
@@ -156,12 +165,15 @@
/S /TH
/P 12 0 R
/Pg 3 0 R
- /A [
- <<
- /O /Table
- /Scope /Row
- >>
- ]
+ /A [<<
+ /O /Table
+ /Scope /Row
+ >>
+ <<
+ /O /Table
+ /ColSpan 2
+ >>]
+ /ID (node15)
>>
endobj
15 0 obj <<
@@ -169,6 +181,7 @@
/S /TD
/P 12 0 R
/Pg 3 0 R
+ /ID (node16)
>>
endobj
16 0 obj <<
@@ -176,12 +189,11 @@
/S /TH
/P 13 0 R
/Pg 3 0 R
- /A [
- <<
- /O /Table
- /Scope /Row
- >>
- ]
+ /A [<<
+ /O /Table
+ /Scope /Row
+ >>]
+ /ID (node17)
>>
endobj
17 0 obj <<
@@ -189,6 +201,7 @@
/S /TD
/P 13 0 R
/Pg 3 0 R
+ /ID (node18)
>>
endobj
xref
@@ -204,17 +217,17 @@
0000001253 00000 n
0000001412 00000 n
0000001515 00000 n
-0000001626 00000 n
-0000001725 00000 n
-0000001821 00000 n
-0000001917 00000 n
-0000002051 00000 n
-0000002126 00000 n
-0000002260 00000 n
+0000001642 00000 n
+0000001826 00000 n
+0000001931 00000 n
+0000002042 00000 n
+0000002244 00000 n
+0000002334 00000 n
+0000002481 00000 n
trailer <<
/Root 1 0 R
/Size 18
>>
startxref
-2335
+2571
%%EOF