Add FPDF_StructElement_GetID and FPDF_StructElement_GetLang

Provides a way to retrieve the ID or language from a struct element
in a tagged PDF.

Bug: pdfium:1564
Change-Id: I1499863cd5896466d70de409c0e09c8c4088d8bf
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/71570
Reviewed-by: Lei Zhang <thestig@chromium.org>
Commit-Queue: Dominic Mazzoni <dmazzoni@chromium.org>
diff --git a/fpdfsdk/fpdf_structtree.cpp b/fpdfsdk/fpdf_structtree.cpp
index 56d1762..e611647 100644
--- a/fpdfsdk/fpdf_structtree.cpp
+++ b/fpdfsdk/fpdf_structtree.cpp
@@ -80,6 +80,36 @@
 }
 
 FPDF_EXPORT unsigned long FPDF_CALLCONV
+FPDF_StructElement_GetID(FPDF_STRUCTELEMENT struct_element,
+                         void* buffer,
+                         unsigned long buflen) {
+  CPDF_StructElement* elem =
+      CPDFStructElementFromFPDFStructElement(struct_element);
+  const CPDF_Dictionary* dict = elem ? elem->GetDict() : nullptr;
+  if (!dict)
+    return 0;  // No element, or the element has no dictionary.
+  const CPDF_Object* obj = dict->GetObjectFor("ID");
+  if (!obj || !obj->IsString())
+    return 0;  // The "ID" entry is missing or is not a string.
+  return Utf16EncodeMaybeCopyAndReturnLength(obj->GetUnicodeText(), buffer,
+                                             buflen);
+}
+
+FPDF_EXPORT unsigned long FPDF_CALLCONV
+FPDF_StructElement_GetLang(FPDF_STRUCTELEMENT struct_element,
+                           void* buffer,
+                           unsigned long buflen) {
+  CPDF_StructElement* elem =
+      CPDFStructElementFromFPDFStructElement(struct_element);
+  const CPDF_Dictionary* dict = elem ? elem->GetDict() : nullptr;
+  const CPDF_Object* obj = dict ? dict->GetObjectFor("Lang") : nullptr;
+  if (!obj || !obj->IsString())
+    return 0;  // No dictionary, or "Lang" is missing or not a string.
+  return Utf16EncodeMaybeCopyAndReturnLength(obj->GetUnicodeText(), buffer,
+                                             buflen);
+}
+
+FPDF_EXPORT unsigned long FPDF_CALLCONV
 FPDF_StructElement_GetStringAttribute(FPDF_STRUCTELEMENT struct_element,
                                       FPDF_BYTESTRING attr_name,
                                       void* buffer,
@@ -93,10 +123,14 @@
   CPDF_ArrayLocker locker(array);
   for (const RetainPtr<CPDF_Object>& obj : locker) {
     const CPDF_Dictionary* obj_dict = obj->AsDictionary();
-    if (obj_dict && obj_dict->KeyExist(attr_name)) {
-      return WideStringToBuffer(obj_dict->GetUnicodeTextFor(attr_name), buffer,
-                                buflen);
-    }
+    // AsDictionary() yields null for non-dictionary entries; skip those.
+    if (!obj_dict)
+      continue;
+    const CPDF_Object* attr = obj_dict->GetObjectFor(attr_name);
+    if (!attr || !(attr->IsString() || attr->IsName()))
+      continue;
+    return Utf16EncodeMaybeCopyAndReturnLength(attr->GetUnicodeText(), buffer,
+                                               buflen);
   }
   return 0;
 }
diff --git a/fpdfsdk/fpdf_structtree_embeddertest.cpp b/fpdfsdk/fpdf_structtree_embeddertest.cpp
index 5a3ba0b..aff903a 100644
--- a/fpdfsdk/fpdf_structtree_embeddertest.cpp
+++ b/fpdfsdk/fpdf_structtree_embeddertest.cpp
@@ -103,6 +103,10 @@
     EXPECT_EQ(12U, FPDF_StructElement_GetType(table, buffer, kBufLen));
     EXPECT_EQ("Table", GetPlatformString(buffer));
 
+    // The table should have an attribute "Summary" set to the empty string.
+    EXPECT_EQ(2U, FPDF_StructElement_GetStringAttribute(table, "Summary",
+                                                        buffer, kBufLen));
+
     ASSERT_EQ(2, FPDF_StructElement_CountChildren(table));
     FPDF_STRUCTELEMENT row = FPDF_StructElement_GetChildAtIndex(table, 0);
     ASSERT_TRUE(row);
@@ -119,6 +123,11 @@
                                                         buffer, kBufLen));
     EXPECT_EQ("Row", GetPlatformString(buffer));
 
+    // The header has an attribute "ColSpan", but it's not a string so it
+    // returns 0.
+    EXPECT_EQ(0U, FPDF_StructElement_GetStringAttribute(header_cell, "ColSpan",
+                                                        buffer, kBufLen));
+
     // An unsupported attribute should return 0.
     EXPECT_EQ(0U, FPDF_StructElement_GetStringAttribute(header_cell, "Other",
                                                         buffer, kBufLen));
@@ -131,6 +140,96 @@
   UnloadPage(page);
 }
 
+TEST_F(FPDFStructTreeEmbedderTest, GetID) {
+  ASSERT_TRUE(OpenDocument("tagged_table.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);
+
+  {
+    ScopedFPDFStructTree struct_tree(FPDF_StructTree_GetForPage(page));
+    ASSERT_TRUE(struct_tree);
+    ASSERT_EQ(1, FPDF_StructTree_CountChildren(struct_tree.get()));
+
+    FPDF_STRUCTELEMENT document =
+        FPDF_StructTree_GetChildAtIndex(struct_tree.get(), 0);
+    ASSERT_TRUE(document);
+
+    constexpr int kBufLen = 100;
+    uint16_t buffer[kBufLen] = {0};
+    EXPECT_EQ(18U, FPDF_StructElement_GetType(document, buffer, kBufLen));
+    EXPECT_EQ("Document", GetPlatformString(buffer));
+
+    // The document has no ID.
+    EXPECT_EQ(0U, FPDF_StructElement_GetID(document, buffer, kBufLen));
+
+    ASSERT_EQ(1, FPDF_StructElement_CountChildren(document));
+    FPDF_STRUCTELEMENT table = FPDF_StructElement_GetChildAtIndex(document, 0);
+    ASSERT_TRUE(table);
+
+    EXPECT_EQ(12U, FPDF_StructElement_GetType(table, buffer, kBufLen));
+    EXPECT_EQ("Table", GetPlatformString(buffer));
+
+    // The table has an ID.
+    EXPECT_EQ(14U, FPDF_StructElement_GetID(table, buffer, kBufLen));
+    EXPECT_EQ("node12", GetPlatformString(buffer));
+
+    // The first child of the table is a row, which has an empty ID.
+    // It returns 2U, the length of an empty string, instead of 0U,
+    // representing null.
+    ASSERT_EQ(2, FPDF_StructElement_CountChildren(table));
+    FPDF_STRUCTELEMENT row = FPDF_StructElement_GetChildAtIndex(table, 0);
+    ASSERT_TRUE(row);
+    EXPECT_EQ(2U, FPDF_StructElement_GetID(row, buffer, kBufLen));
+  }
+
+  UnloadPage(page);
+}
+
+TEST_F(FPDFStructTreeEmbedderTest, GetLang) {
+  ASSERT_TRUE(OpenDocument("tagged_table.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);
+
+  {
+    ScopedFPDFStructTree struct_tree(FPDF_StructTree_GetForPage(page));
+    ASSERT_TRUE(struct_tree);
+    ASSERT_EQ(1, FPDF_StructTree_CountChildren(struct_tree.get()));
+
+    FPDF_STRUCTELEMENT document =
+        FPDF_StructTree_GetChildAtIndex(struct_tree.get(), 0);
+    ASSERT_TRUE(document);
+
+    constexpr int kBufLen = 100;
+    uint16_t buffer[kBufLen] = {0};
+    EXPECT_EQ(18U, FPDF_StructElement_GetType(document, buffer, kBufLen));
+    EXPECT_EQ("Document", GetPlatformString(buffer));
+
+    // The document has a language.
+    EXPECT_EQ(12U, FPDF_StructElement_GetLang(document, buffer, kBufLen));
+    EXPECT_EQ("en-US", GetPlatformString(buffer));
+
+    ASSERT_EQ(1, FPDF_StructElement_CountChildren(document));
+    FPDF_STRUCTELEMENT table = FPDF_StructElement_GetChildAtIndex(document, 0);
+    ASSERT_TRUE(table);
+
+    // The first child is a table, with a language.
+    EXPECT_EQ(12U, FPDF_StructElement_GetType(table, buffer, kBufLen));
+    EXPECT_EQ("Table", GetPlatformString(buffer));
+
+    EXPECT_EQ(6U, FPDF_StructElement_GetLang(table, buffer, kBufLen));
+    EXPECT_EQ("hu", GetPlatformString(buffer));
+
+    // The first child of the table is a row, which doesn't have a
+    // language explicitly set on it.
+    ASSERT_EQ(2, FPDF_StructElement_CountChildren(table));
+    FPDF_STRUCTELEMENT row = FPDF_StructElement_GetChildAtIndex(table, 0);
+    ASSERT_TRUE(row);
+    EXPECT_EQ(0U, FPDF_StructElement_GetLang(row, buffer, kBufLen));
+  }
+
+  UnloadPage(page);
+}
+
 TEST_F(FPDFStructTreeEmbedderTest, GetMarkedContentID) {
   ASSERT_TRUE(OpenDocument("marked_content_id.pdf"));
   FPDF_PAGE page = LoadPage(0);
diff --git a/fpdfsdk/fpdf_view_c_api_test.c b/fpdfsdk/fpdf_view_c_api_test.c
index 825691d..a7310b9 100644
--- a/fpdfsdk/fpdf_view_c_api_test.c
+++ b/fpdfsdk/fpdf_view_c_api_test.c
@@ -325,6 +325,8 @@
     CHK(FPDF_StructElement_CountChildren);
     CHK(FPDF_StructElement_GetAltText);
     CHK(FPDF_StructElement_GetChildAtIndex);
+    CHK(FPDF_StructElement_GetID);
+    CHK(FPDF_StructElement_GetLang);
     CHK(FPDF_StructElement_GetMarkedContentID);
     CHK(FPDF_StructElement_GetStringAttribute);
     CHK(FPDF_StructElement_GetTitle);
diff --git a/public/fpdf_structtree.h b/public/fpdf_structtree.h
index 6a14023..5e1f22e 100644
--- a/public/fpdf_structtree.h
+++ b/public/fpdf_structtree.h
@@ -75,6 +75,47 @@
                               void* buffer,
                               unsigned long buflen);
 
+// Function: FPDF_StructElement_GetID
+//          Get the ID for a given element. Experimental API.
+// Parameters:
+//          struct_element -   Handle to the struct element.
+//          buffer         -   Output buffer for the ID string. May be NULL.
+//          buflen         -   The length of the buffer, in bytes. May be 0.
+// Return value:
+//          The number of bytes in the ID string, including the terminating NUL
+//          character, regardless of the |buffer| and |buflen| parameters.
+//          Returns 0 if the element has no ID entry of string type.
+// Comments:
+//          Regardless of the platform, the |buffer| is always in UTF-16LE
+//          encoding. The string is terminated by a UTF-16 NUL character. If
+//          |buflen| is less than the required length, or |buffer| is NULL,
+//          |buffer| will not be modified.
+FPDF_EXPORT unsigned long FPDF_CALLCONV
+FPDF_StructElement_GetID(FPDF_STRUCTELEMENT struct_element,
+                         void* buffer,
+                         unsigned long buflen);
+
+// Function: FPDF_StructElement_GetLang
+//          Get the case-insensitive IETF BCP 47 language code for an element.
+//          Experimental API.
+// Parameters:
+//          struct_element -   Handle to the struct element.
+//          buffer         -   Output buffer for the lang string. May be NULL.
+//          buflen         -   The length of the buffer, in bytes. May be 0.
+// Return value:
+//          The number of bytes in the lang string, including the terminating
+//          NUL character, regardless of the |buffer| and |buflen| parameters.
+//          Returns 0 if the element has no Lang entry of string type.
+// Comments:
+//          Regardless of the platform, the |buffer| is always in UTF-16LE
+//          encoding. The string is terminated by a UTF-16 NUL character. If
+//          |buflen| is less than the required length, or |buffer| is NULL,
+//          |buffer| will not be modified.
+FPDF_EXPORT unsigned long FPDF_CALLCONV
+FPDF_StructElement_GetLang(FPDF_STRUCTELEMENT struct_element,
+                           void* buffer,
+                           unsigned long buflen);
+
 // Function: FPDF_StructElement_GetStringAttribute
 //          Get a struct element attribute of type "name" or "string".
 //          Experimental API.
diff --git a/testing/resources/tagged_table.in b/testing/resources/tagged_table.in
index d5a9877..70df01c 100644
--- a/testing/resources/tagged_table.in
+++ b/testing/resources/tagged_table.in
@@ -124,6 +124,7 @@
   /P 8 0 R
   /T (TitleText)
   /Pg 3 0 R
+  /Lang (en-US)
 >>
 endobj
 {{object 11 0}} <<
@@ -132,6 +133,12 @@
   /K [12 0 R 13 0 R]
   /P 10 0 R
   /Pg 3 0 R
+  /A [<<
+        /O /Table
+        /Summary ()
+      >>]
+  /ID (node12)
+  /Lang (hu)
 >>
 endobj
 {{object 12 0}} <<
@@ -140,6 +147,7 @@
   /K [14 0 R 15 0 R]
   /P 11 0 R
   /Pg 3 0 R
+  /ID ()
 >>
 endobj
 {{object 13 0}} <<
@@ -148,6 +156,7 @@
   /K [16 0 R 17 0 R]
   /P 11 0 R
   /Pg 3 0 R
+  /ID (node14)
 >>
 endobj
 {{object 14 0}} <<
@@ -155,12 +164,15 @@
   /S /TH
   /P 12 0 R
   /Pg 3 0 R
-  /A [
-    <<
-      /O /Table
-      /Scope /Row
-    >>
-  ]
+  /A [<<
+        /O /Table
+        /Scope /Row
+      >>
+      <<
+        /O /Table
+        /ColSpan 2
+      >>]
+  /ID (node15)
 >>
 endobj
 {{object 15 0}} <<
@@ -168,6 +180,7 @@
   /S /TD
   /P 12 0 R
   /Pg 3 0 R
+  /ID (node16)
 >>
 endobj
 {{object 16 0}} <<
@@ -175,12 +188,11 @@
   /S /TH
   /P 13 0 R
   /Pg 3 0 R
-  /A [
-    <<
-      /O /Table
-      /Scope /Row
-    >>
-  ]
+  /A [<<
+        /O /Table
+        /Scope /Row
+      >>]
+  /ID (node17)
 >>
 endobj
 {{object 17 0}} <<
@@ -188,6 +200,7 @@
   /S /TD
   /P 13 0 R
   /Pg 3 0 R
+  /ID (node18)
 >>
 endobj
 {{xref}}
diff --git a/testing/resources/tagged_table.pdf b/testing/resources/tagged_table.pdf
index b4469fd..b11437c 100644
--- a/testing/resources/tagged_table.pdf
+++ b/testing/resources/tagged_table.pdf
@@ -125,6 +125,7 @@
   /P 8 0 R
   /T (TitleText)
   /Pg 3 0 R
+  /Lang (en-US)
 >>
 endobj
 11 0 obj <<
@@ -133,6 +134,12 @@
   /K [12 0 R 13 0 R]
   /P 10 0 R
   /Pg 3 0 R
+  /A [<<
+        /O /Table
+        /Summary ()
+      >>]
+  /ID (node12)
+  /Lang (hu)
 >>
 endobj
 12 0 obj <<
@@ -141,6 +148,7 @@
   /K [14 0 R 15 0 R]
   /P 11 0 R
   /Pg 3 0 R
+  /ID ()
 >>
 endobj
 13 0 obj <<
@@ -149,6 +157,7 @@
   /K [16 0 R 17 0 R]
   /P 11 0 R
   /Pg 3 0 R
+  /ID (node14)
 >>
 endobj
 14 0 obj <<
@@ -156,12 +165,15 @@
   /S /TH
   /P 12 0 R
   /Pg 3 0 R
-  /A [
-    <<
-      /O /Table
-      /Scope /Row
-    >>
-  ]
+  /A [<<
+        /O /Table
+        /Scope /Row
+      >>
+      <<
+        /O /Table
+        /ColSpan 2
+      >>]
+  /ID (node15)
 >>
 endobj
 15 0 obj <<
@@ -169,6 +181,7 @@
   /S /TD
   /P 12 0 R
   /Pg 3 0 R
+  /ID (node16)
 >>
 endobj
 16 0 obj <<
@@ -176,12 +189,11 @@
   /S /TH
   /P 13 0 R
   /Pg 3 0 R
-  /A [
-    <<
-      /O /Table
-      /Scope /Row
-    >>
-  ]
+  /A [<<
+        /O /Table
+        /Scope /Row
+      >>]
+  /ID (node17)
 >>
 endobj
 17 0 obj <<
@@ -189,6 +201,7 @@
   /S /TD
   /P 13 0 R
   /Pg 3 0 R
+  /ID (node18)
 >>
 endobj
 xref
@@ -204,17 +217,17 @@
 0000001253 00000 n 
 0000001412 00000 n 
 0000001515 00000 n 
-0000001626 00000 n 
-0000001725 00000 n 
-0000001821 00000 n 
-0000001917 00000 n 
-0000002051 00000 n 
-0000002126 00000 n 
-0000002260 00000 n 
+0000001642 00000 n 
+0000001826 00000 n 
+0000001931 00000 n 
+0000002042 00000 n 
+0000002244 00000 n 
+0000002334 00000 n 
+0000002481 00000 n 
 trailer <<
   /Root 1 0 R
   /Size 18
 >>
 startxref
-2335
+2571
 %%EOF