Add FPDF_StructElement_GetActualText

Get the actual text for a given structure element.

Change-Id: If2767b12099a21fbe7c0abcb89c307cb4fe59b02
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/90431
Reviewed-by: Lei Zhang <thestig@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
diff --git a/core/fpdfdoc/cpdf_structelement.cpp b/core/fpdfdoc/cpdf_structelement.cpp
index 344b4fe..56cd410 100644
--- a/core/fpdfdoc/cpdf_structelement.cpp
+++ b/core/fpdfdoc/cpdf_structelement.cpp
@@ -53,6 +53,10 @@
   return GetDict()->GetUnicodeTextFor("Alt");
 }
 
+WideString CPDF_StructElement::GetActualText() const {
+  return GetDict()->GetUnicodeTextFor("ActualText");
+}
+
 WideString CPDF_StructElement::GetTitle() const {
   return GetDict()->GetUnicodeTextFor("T");
 }
diff --git a/core/fpdfdoc/cpdf_structelement.h b/core/fpdfdoc/cpdf_structelement.h
index 3b7a375..5aac005 100644
--- a/core/fpdfdoc/cpdf_structelement.h
+++ b/core/fpdfdoc/cpdf_structelement.h
@@ -23,6 +23,7 @@
 
   ByteString GetType() const { return m_Type; }
   WideString GetAltText() const;
+  WideString GetActualText() const;
   WideString GetTitle() const;
 
   // Never returns nullptr.
diff --git a/fpdfsdk/fpdf_structtree.cpp b/fpdfsdk/fpdf_structtree.cpp
index f06a2c7..a162d7c 100644
--- a/fpdfsdk/fpdf_structtree.cpp
+++ b/fpdfsdk/fpdf_structtree.cpp
@@ -80,6 +80,15 @@
 }
 
 FPDF_EXPORT unsigned long FPDF_CALLCONV
+FPDF_StructElement_GetActualText(FPDF_STRUCTELEMENT struct_element,
+                                 void* buffer,
+                                 unsigned long buflen) {
+  CPDF_StructElement* elem =
+      CPDFStructElementFromFPDFStructElement(struct_element);
+  return elem ? WideStringToBuffer(elem->GetActualText(), buffer, buflen) : 0;
+}
+
+FPDF_EXPORT unsigned long FPDF_CALLCONV
 FPDF_StructElement_GetID(FPDF_STRUCTELEMENT struct_element,
                          void* buffer,
                          unsigned long buflen) {
diff --git a/fpdfsdk/fpdf_structtree_embeddertest.cpp b/fpdfsdk/fpdf_structtree_embeddertest.cpp
index 6781a48..280b520 100644
--- a/fpdfsdk/fpdf_structtree_embeddertest.cpp
+++ b/fpdfsdk/fpdf_structtree_embeddertest.cpp
@@ -74,6 +74,50 @@
   UnloadPage(page);
 }
 
+TEST_F(FPDFStructTreeEmbedderTest, GetActualText) {
+  ASSERT_TRUE(OpenDocument("tagged_actual_text.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);
+
+  {
+    ScopedFPDFStructTree struct_tree(FPDF_StructTree_GetForPage(page));
+    ASSERT_TRUE(struct_tree);
+    ASSERT_EQ(1, FPDF_StructTree_CountChildren(struct_tree.get()));
+
+    EXPECT_EQ(0U, FPDF_StructElement_GetActualText(nullptr, nullptr, 0));
+
+    FPDF_STRUCTELEMENT element =
+        FPDF_StructTree_GetChildAtIndex(struct_tree.get(), 0);
+    ASSERT_TRUE(element);
+    EXPECT_EQ(0U, FPDF_StructElement_GetActualText(element, nullptr, 0));
+
+    ASSERT_EQ(1, FPDF_StructElement_CountChildren(element));
+    FPDF_STRUCTELEMENT child_element =
+        FPDF_StructElement_GetChildAtIndex(element, 0);
+    ASSERT_TRUE(child_element);
+    EXPECT_EQ(0U, FPDF_StructElement_GetActualText(child_element, nullptr, 0));
+
+    ASSERT_EQ(1, FPDF_StructElement_CountChildren(child_element));
+    FPDF_STRUCTELEMENT gchild_element =
+        FPDF_StructElement_GetChildAtIndex(child_element, 0);
+    ASSERT_TRUE(gchild_element);
+    ASSERT_EQ(24U,
+              FPDF_StructElement_GetActualText(gchild_element, nullptr, 0));
+
+    unsigned short buffer[12] = {};
+    // Deliberately pass in a small buffer size to make sure |buffer| remains
+    // untouched.
+    ASSERT_EQ(24U, FPDF_StructElement_GetActualText(gchild_element, buffer, 1));
+    for (size_t i = 0; i < pdfium::size(buffer); ++i)
+      EXPECT_EQ(0U, buffer[i]);
+    ASSERT_EQ(24U, FPDF_StructElement_GetActualText(gchild_element, buffer,
+                                                    sizeof(buffer)));
+    EXPECT_EQ(L"Actual Text", GetPlatformWString(buffer));
+  }
+
+  UnloadPage(page);
+}
+
 TEST_F(FPDFStructTreeEmbedderTest, GetStringAttribute) {
   ASSERT_TRUE(OpenDocument("tagged_table.pdf"));
   FPDF_PAGE page = LoadPage(0);
diff --git a/fpdfsdk/fpdf_view_c_api_test.c b/fpdfsdk/fpdf_view_c_api_test.c
index d32e2d2..99c53a3 100644
--- a/fpdfsdk/fpdf_view_c_api_test.c
+++ b/fpdfsdk/fpdf_view_c_api_test.c
@@ -353,6 +353,7 @@
 
     // fpdf_structtree.h
     CHK(FPDF_StructElement_CountChildren);
+    CHK(FPDF_StructElement_GetActualText);
     CHK(FPDF_StructElement_GetAltText);
     CHK(FPDF_StructElement_GetChildAtIndex);
     CHK(FPDF_StructElement_GetID);
diff --git a/public/fpdf_structtree.h b/public/fpdf_structtree.h
index 211abdf..4784bd4 100644
--- a/public/fpdf_structtree.h
+++ b/public/fpdf_structtree.h
@@ -76,6 +76,28 @@
                               unsigned long buflen);
 
 // Experimental API.
+// Function: FPDF_StructElement_GetActualText
+//          Get the actual text for a given element.
+// Parameters:
+//          struct_element -   Handle to the struct element.
+//          buffer         -   A buffer to receive the actual text. May be NULL.
+//          buflen         -   The length of the buffer, in bytes. May be 0.
+// Return value:
+//          The number of bytes in the actual text, including the terminating
+//          NUL character. The number of bytes is returned regardless of the
+//          |buffer| and |buflen| parameters.
+//          Returns 0 if |struct_element| is NULL or has no actual text.
+// Comments:
+//          Regardless of the platform, the |buffer| is always in UTF-16LE
+//          encoding. The string is terminated by a UTF-16 NUL character. If
+//          |buflen| is less than the required length, or |buffer| is NULL,
+//          |buffer| will not be modified.
+FPDF_EXPORT unsigned long FPDF_CALLCONV
+FPDF_StructElement_GetActualText(FPDF_STRUCTELEMENT struct_element,
+                                 void* buffer,
+                                 unsigned long buflen);
+
+// Experimental API.
 // Function: FPDF_StructElement_GetID
 //          Get the ID for a given element.
 // Parameters:
diff --git a/testing/resources/tagged_actual_text.in b/testing/resources/tagged_actual_text.in
new file mode 100644
index 0000000..9bb917b
--- /dev/null
+++ b/testing/resources/tagged_actual_text.in
@@ -0,0 +1,162 @@
+{{header}}
+{{object 1 0}} <<
+  /Type /Catalog
+  /Pages 2 0 R
+  /StructTreeRoot 8 0 R
+  /Lang (en-US)
+  /MarkInfo <<
+    /Marked true
+  >>
+>>
+endobj
+{{object 2 0}} <<
+  /Type /Pages
+  /Count 1
+  /Kids [3 0 R]
+>>
+endobj
+{{object 3 0}} <<
+  /Type /Page
+  /Parent 2 0 R
+  /Contents 4 0 R
+  /MediaBox [0 0 612 792]
+  /Group <<
+    /CS /DeviceRGB
+    /I true
+    /S /Transparency
+  >>
+  /Resources <<
+    /ProcSet [/PDF /ImageC /ImageI /ImageB]
+    /XObject <<
+      /Tr8 5 0 R
+      /Im7 6 0 R
+    >>
+    /ExtGState <<
+      /EGS9 7 0 R
+    >>
+  >>
+  /StructParents 0
+>>
+endobj
+{{object 4 0}} <<
+  {{streamlen}}
+>>
+stream
+0.1 w
+/Artifact
+BMC
+q
+0 0 612 792 re
+W* n
+EMC
+/Figure<</MCID 0>>
+BDC
+Q
+q
+281 685.3 50 50 re
+W* n
+q
+49.9 0 0 50 281.1 685.4 cm
+/Im7 Do
+Q
+EMC
+Q
+q
+EGS9 gs /Tr8 Do
+Q
+endstream
+endobj
+{{object 5 0}} <<
+  /Type /XObject
+  /Subtype /Form
+  /BBox [-140 395 753 395.1]
+  /Group <<
+    /CS /DeviceRGB
+    /K true
+    /S /Transparency
+  >>
+  {{streamlen}}
+>>
+stream
+endstream
+endobj
+{{object 6 0}} <<
+  /Type /XObject
+  /Subtype /Image
+  /Width 50
+  /Height 50
+  /BitsPerComponent 8
+  /ColorSpace /DeviceRGB
+  /Filter [/ASCIIHexDecode /FlateDecode]
+  {{streamlen}}
+>>
+stream
+789cedc13101000000c2a0f54fed6f06a00000000000000078031d4c0001
+endstream
+endobj
+{{object 7 0}} <<
+  /ca 0.5
+  /CA 0.5
+>>
+endobj
+{{object 8 0}} <<
+  /Type /StructTreeRoot
+  /ParentTree 9 0 R
+  /K [11 0 R]
+  /RoleMap <<
+    /Document /Document
+    /Standard /P
+    /Figure /Figure
+  >>
+>>
+endobj
+{{object 9 0}} <<
+  /Nums [0 [10 0 R]]
+>>
+endobj
+{{object 10 0}} <<
+  /Type /StructElem
+  /S /Figure
+  /A 13 0 R
+  /K [0]
+  /P 12 0 R
+  /ActualText <feff00410063007400750061006c00200054006500780074>
+  /Pg 3 0 R
+>>
+endobj
+{{object 11 0}} <<
+  /Type /StructElem
+  /S /Document
+  /K [12 0 R]
+  /P 8 0 R
+  /T (TitleText)
+  /Pg 3 0 R
+>>
+endobj
+{{object 12 0}} <<
+  /Type /StructElem
+  /S /Standard
+  /A 14 0 R
+  /K [10 0 R]
+  /P 11 0 R
+  /T <feff00730079006d0062006f006c003a0020003100300030006b>
+  /Pg 3 0 R
+>>
+endobj
+{{object 13 0}} <<
+  /O /Layout
+  /Placement /Block
+  /BBox [281.1 685.3 331.1 735.3]
+  /Width 99.9
+  /Height 99.9
+>>
+endobj
+{{object 14 0}} <<
+  /O /Layout
+  /Placement /Block
+>>
+endobj
+{{xref}}
+{{trailer}}
+{{startxref}}
+%%EOF
diff --git a/testing/resources/tagged_actual_text.pdf b/testing/resources/tagged_actual_text.pdf
new file mode 100644
index 0000000..634e333
--- /dev/null
+++ b/testing/resources/tagged_actual_text.pdf
@@ -0,0 +1,183 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+  /Type /Catalog
+  /Pages 2 0 R
+  /StructTreeRoot 8 0 R
+  /Lang (en-US)
+  /MarkInfo <<
+    /Marked true
+  >>
+>>
+endobj
+2 0 obj <<
+  /Type /Pages
+  /Count 1
+  /Kids [3 0 R]
+>>
+endobj
+3 0 obj <<
+  /Type /Page
+  /Parent 2 0 R
+  /Contents 4 0 R
+  /MediaBox [0 0 612 792]
+  /Group <<
+    /CS /DeviceRGB
+    /I true
+    /S /Transparency
+  >>
+  /Resources <<
+    /ProcSet [/PDF /ImageC /ImageI /ImageB]
+    /XObject <<
+      /Tr8 5 0 R
+      /Im7 6 0 R
+    >>
+    /ExtGState <<
+      /EGS9 7 0 R
+    >>
+  >>
+  /StructParents 0
+>>
+endobj
+4 0 obj <<
+  /Length 162
+>>
+stream
+0.1 w
+/Artifact
+BMC
+q
+0 0 612 792 re
+W* n
+EMC
+/Figure<</MCID 0>>
+BDC
+Q
+q
+281 685.3 50 50 re
+W* n
+q
+49.9 0 0 50 281.1 685.4 cm
+/Im7 Do
+Q
+EMC
+Q
+q
+EGS9 gs /Tr8 Do
+Q
+endstream
+endobj
+5 0 obj <<
+  /Type /XObject
+  /Subtype /Form
+  /BBox [-140 395 753 395.1]
+  /Group <<
+    /CS /DeviceRGB
+    /K true
+    /S /Transparency
+  >>
+  /Length 0
+>>
+stream
+endstream
+endobj
+6 0 obj <<
+  /Type /XObject
+  /Subtype /Image
+  /Width 50
+  /Height 50
+  /BitsPerComponent 8
+  /ColorSpace /DeviceRGB
+  /Filter [/ASCIIHexDecode /FlateDecode]
+  /Length 61
+>>
+stream
+789cedc13101000000c2a0f54fed6f06a00000000000000078031d4c0001
+endstream
+endobj
+7 0 obj <<
+  /ca 0.5
+  /CA 0.5
+>>
+endobj
+8 0 obj <<
+  /Type /StructTreeRoot
+  /ParentTree 9 0 R
+  /K [11 0 R]
+  /RoleMap <<
+    /Document /Document
+    /Standard /P
+    /Figure /Figure
+  >>
+>>
+endobj
+9 0 obj <<
+  /Nums [0 [10 0 R]]
+>>
+endobj
+10 0 obj <<
+  /Type /StructElem
+  /S /Figure
+  /A 13 0 R
+  /K [0]
+  /P 12 0 R
+  /ActualText <feff00410063007400750061006c00200054006500780074>
+  /Pg 3 0 R
+>>
+endobj
+11 0 obj <<
+  /Type /StructElem
+  /S /Document
+  /K [12 0 R]
+  /P 8 0 R
+  /T (TitleText)
+  /Pg 3 0 R
+>>
+endobj
+12 0 obj <<
+  /Type /StructElem
+  /S /Standard
+  /A 14 0 R
+  /K [10 0 R]
+  /P 11 0 R
+  /T <feff00730079006d0062006f006c003a0020003100300030006b>
+  /Pg 3 0 R
+>>
+endobj
+13 0 obj <<
+  /O /Layout
+  /Placement /Block
+  /BBox [281.1 685.3 331.1 735.3]
+  /Width 99.9
+  /Height 99.9
+>>
+endobj
+14 0 obj <<
+  /O /Layout
+  /Placement /Block
+>>
+endobj
+xref
+0 15
+0000000000 65535 f 
+0000000015 00000 n 
+0000000145 00000 n 
+0000000208 00000 n 
+0000000556 00000 n 
+0000000770 00000 n 
+0000000952 00000 n 
+0000001212 00000 n 
+0000001253 00000 n 
+0000001412 00000 n 
+0000001454 00000 n 
+0000001619 00000 n 
+0000001730 00000 n 
+0000001897 00000 n 
+0000002015 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 15
+>>
+startxref
+2070
+%%EOF