Add new APIs to get marked content id

FPDF_StructElement_GetMarkedContentID() provides access to the MCID
in a struct element. It assumes /K in a struct element dictionary
is a number. This is sometimes the case, but /K can also be an
array with mixed types. See ISO 32000-1:2008, section 14.7.6 for
an example. Add FPDF_StructElement_GetMarkedContentIdCount() and
FPDF_StructElement_GetMarkedContentIdAtIndex() to address this
issue.

This change supersedes FPDF_StructElement_GetMarkedContentID(). It
provides a mechanism to access the MCID at an index while still
supporting the case where /K in the struct element dictionary is a
number. After this change, FPDF_StructElement_GetMarkedContentID()
becomes redundant and should be deprecated.

Change-Id: Icecd39fe3ea788e7f85851c2b7eb89453d083b2f
Bug: pdfium:1773
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/90611
Reviewed-by: Lei Zhang <thestig@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
diff --git a/fpdfsdk/fpdf_structtree.cpp b/fpdfsdk/fpdf_structtree.cpp
index 76fd601..89f5f30 100644
--- a/fpdfsdk/fpdf_structtree.cpp
+++ b/fpdfsdk/fpdf_structtree.cpp
@@ -30,6 +30,15 @@
   return len;
 }
 
+// Returns the numeric /MCID of a /Type /MCR dictionary, or -1 otherwise.
+int GetMcidFromDict(const CPDF_Dictionary* dict) {
+  const bool is_mcr = dict && dict->GetNameFor("Type") == "MCR";
+  if (!is_mcr)
+    return -1;
+  const CPDF_Object* mcid = dict->GetObjectFor("MCID");
+  return mcid && mcid->IsNumber() ? mcid->GetInteger() : -1;
+}
+
 }  // namespace
 
 FPDF_EXPORT FPDF_STRUCTTREE FPDF_CALLCONV
@@ -397,3 +406,48 @@
   *out_buflen = len;
   return true;
 }
+
+FPDF_EXPORT int FPDF_CALLCONV
+FPDF_StructElement_GetMarkedContentIdCount(FPDF_STRUCTELEMENT struct_element) {
+  CPDF_StructElement* elem =
+      CPDFStructElementFromFPDFStructElement(struct_element);
+  const CPDF_Dictionary* elem_dict = elem ? elem->GetDict() : nullptr;
+  const CPDF_Object* kids = elem_dict ? elem_dict->GetObjectFor("K") : nullptr;
+  if (!kids)
+    return -1;  // No element, no dictionary, or no /K entry.
+
+  // Array: one slot per entry. Lone number or dictionary: one slot. Else -1.
+  if (kids->IsArray())
+    return fxcrt::CollectionSize<int>(*kids->AsArray());
+  return kids->IsNumber() || kids->IsDictionary() ? 1 : -1;
+}
+
+FPDF_EXPORT int FPDF_CALLCONV
+FPDF_StructElement_GetMarkedContentIdAtIndex(FPDF_STRUCTELEMENT struct_element,
+                                             int index) {
+  CPDF_StructElement* elem =
+      CPDFStructElementFromFPDFStructElement(struct_element);
+  const CPDF_Dictionary* dict = elem ? elem->GetDict() : nullptr;
+  const CPDF_Object* p = dict ? dict->GetObjectFor("K") : nullptr;
+  if (!p)
+    return -1;
+
+  // A number or dictionary /K is a single entry (the count API reports 1 for
+  // both), so only index 0 is valid; any other index yields -1.
+  if (p->IsNumber())
+    return index == 0 ? p->GetInteger() : -1;
+  if (p->IsDictionary())
+    return index == 0 ? GetMcidFromDict(p->GetDict()) : -1;
+
+  if (p->IsArray()) {
+    const CPDF_Array* array = p->AsArray();
+    if (index < 0 || static_cast<size_t>(index) >= array->size())
+      return -1;
+    const CPDF_Object* array_elem = array->GetObjectAt(index);
+    if (array_elem->IsNumber())
+      return array_elem->GetInteger();
+    if (array_elem->IsDictionary())
+      return GetMcidFromDict(array_elem->GetDict());
+  }
+  return -1;
+}
diff --git a/fpdfsdk/fpdf_structtree_embeddertest.cpp b/fpdfsdk/fpdf_structtree_embeddertest.cpp
index fd2f25a..3417654 100644
--- a/fpdfsdk/fpdf_structtree_embeddertest.cpp
+++ b/fpdfsdk/fpdf_structtree_embeddertest.cpp
@@ -334,6 +334,60 @@
   UnloadPage(page);
 }
 
+TEST_F(FPDFStructTreeEmbedderTest, GetMarkedContentIdAtIndex) {
+  ASSERT_TRUE(OpenDocument("tagged_marked_content.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);
+
+  {
+    ScopedFPDFStructTree tree(FPDF_StructTree_GetForPage(page));
+    ASSERT_TRUE(tree);
+    ASSERT_EQ(4, FPDF_StructTree_CountChildren(tree.get()));
+
+    // Child 0: /K is a bare integer MCID, which the legacy API also handles.
+    FPDF_STRUCTELEMENT int_k = FPDF_StructTree_GetChildAtIndex(tree.get(), 0);
+    ASSERT_TRUE(int_k);
+    EXPECT_EQ(0, FPDF_StructElement_GetMarkedContentID(int_k));
+
+    // Child 1: /K is a single marked-content reference (MCR) dictionary.
+    FPDF_STRUCTELEMENT dict_k = FPDF_StructTree_GetChildAtIndex(tree.get(), 1);
+    ASSERT_TRUE(dict_k);
+
+    // Child 2: /K is an array holding an MCR dictionary and an integer MCID.
+    FPDF_STRUCTELEMENT arr_k = FPDF_StructTree_GetChildAtIndex(tree.get(), 2);
+    ASSERT_TRUE(arr_k);
+
+    // Child 3: the element has no /K entry at all.
+    FPDF_STRUCTELEMENT no_k = FPDF_StructTree_GetChildAtIndex(tree.get(), 3);
+    ASSERT_TRUE(no_k);
+
+    // Both new APIs return -1 for a null element handle.
+    EXPECT_EQ(-1, FPDF_StructElement_GetMarkedContentIdCount(nullptr));
+    EXPECT_EQ(-1, FPDF_StructElement_GetMarkedContentIdAtIndex(nullptr, 0));
+
+    // Integer /K: exactly one id, reachable only at index 0.
+    EXPECT_EQ(1, FPDF_StructElement_GetMarkedContentIdCount(int_k));
+    EXPECT_EQ(0, FPDF_StructElement_GetMarkedContentIdAtIndex(int_k, 0));
+    EXPECT_EQ(-1, FPDF_StructElement_GetMarkedContentIdAtIndex(int_k, -1));
+    EXPECT_EQ(-1, FPDF_StructElement_GetMarkedContentIdAtIndex(int_k, 1));
+
+    // Dictionary /K: the id comes from the MCR dictionary's /MCID.
+    EXPECT_EQ(1, FPDF_StructElement_GetMarkedContentIdCount(dict_k));
+    EXPECT_EQ(1, FPDF_StructElement_GetMarkedContentIdAtIndex(dict_k, 0));
+
+    // Array /K: one id per entry, in array order.
+    EXPECT_EQ(2, FPDF_StructElement_GetMarkedContentIdCount(arr_k));
+    EXPECT_EQ(2, FPDF_StructElement_GetMarkedContentIdAtIndex(arr_k, 0));
+    EXPECT_EQ(3, FPDF_StructElement_GetMarkedContentIdAtIndex(arr_k, 1));
+
+    // Missing /K: both APIs report -1.
+    EXPECT_EQ(-1, FPDF_StructElement_GetMarkedContentIdCount(no_k));
+    EXPECT_EQ(-1, FPDF_StructElement_GetMarkedContentIdAtIndex(no_k, 0));
+  }
+
+  UnloadPage(page);
+}
+
 TEST_F(FPDFStructTreeEmbedderTest, GetType) {
   ASSERT_TRUE(OpenDocument("tagged_alt_text.pdf"));
   FPDF_PAGE page = LoadPage(0);
diff --git a/fpdfsdk/fpdf_view_c_api_test.c b/fpdfsdk/fpdf_view_c_api_test.c
index 38fc525..9f0e2da 100644
--- a/fpdfsdk/fpdf_view_c_api_test.c
+++ b/fpdfsdk/fpdf_view_c_api_test.c
@@ -368,6 +368,8 @@
     CHK(FPDF_StructElement_GetID);
     CHK(FPDF_StructElement_GetLang);
     CHK(FPDF_StructElement_GetMarkedContentID);
+    CHK(FPDF_StructElement_GetMarkedContentIdAtIndex);
+    CHK(FPDF_StructElement_GetMarkedContentIdCount);
     CHK(FPDF_StructElement_GetObjType);
     CHK(FPDF_StructElement_GetParent);
     CHK(FPDF_StructElement_GetStringAttribute);
diff --git a/public/fpdf_structtree.h b/public/fpdf_structtree.h
index 2de41af..2bc2dc6 100644
--- a/public/fpdf_structtree.h
+++ b/public/fpdf_structtree.h
@@ -424,6 +424,29 @@
                                      unsigned long buflen,
                                      unsigned long* out_buflen);
 
+// Experimental API.
+// Function: FPDF_StructElement_GetMarkedContentIdCount
+//          Get the count of marked content ids for a given element.
+// Parameters:
+//          struct_element -   Handle to the struct element.
+// Return value:
+//          The count of marked content ids or -1 if none exists.
+FPDF_EXPORT int FPDF_CALLCONV
+FPDF_StructElement_GetMarkedContentIdCount(FPDF_STRUCTELEMENT struct_element);
+
+// Experimental API.
+// Function: FPDF_StructElement_GetMarkedContentIdAtIndex
+//          Get the marked content id at a given index for a given element.
+// Parameters:
+//          struct_element -   Handle to the struct element.
+//          index          -   The index of the marked content id, 0-based.
+// Return value:
+//          The marked content ID of the element. If no ID exists, returns
+//          -1.
+FPDF_EXPORT int FPDF_CALLCONV
+FPDF_StructElement_GetMarkedContentIdAtIndex(FPDF_STRUCTELEMENT struct_element,
+                                             int index);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/testing/resources/tagged_marked_content.in b/testing/resources/tagged_marked_content.in
new file mode 100644
index 0000000..a8ea64b
--- /dev/null
+++ b/testing/resources/tagged_marked_content.in
@@ -0,0 +1,140 @@
+{{header}}
+{{object 1 0}} <<
+  /Type /Catalog
+  /Pages 2 0 R
+  /StructTreeRoot 7 0 R
+  /MarkInfo <<
+    /Type /MarkInfo
+    /Marked true
+  >>
+>>
+endobj
+{{object 2 0}} <<
+  /Type /Pages
+  /Count 1
+  /Kids [3 0 R]
+>>
+endobj
+{{object 3 0}} <<
+  /Type /Page
+  /Parent 2 0 R
+  /StructParents 0
+  /Annots [4 0 R]
+  /Contents 5 0 R
+  /MediaBox [0 0 612 792]
+  /Resources <<
+    /ProcSet [/PDF /Text]
+    /Font <<
+      /F4 6 0 R
+    >>
+  >>
+>>
+endobj
+{{object 4 0}} <<
+  /Type /Annot
+  /Subtype /Link
+  /Border [0 0 0]
+  /Dest /top
+  /F 4
+  /Rect [20 46 68 61]
+>>
+endobj
+{{object 5 0}} <<
+  {{streamlen}}
+>>
+stream
+q
+BT
+/P <</MCID 0 >>BDC
+/F4 16 Tf
+20 650 Td
+(Top Left) Tj
+EMC
+ET
+BT
+/P <</MCID 1 >>BDC
+/F4 16 Tf
+20 50 Td
+(Bottom Left) Tj
+EMC
+ET
+BT
+/P <</MCID 2 >>BDC
+/F4 16 Tf
+400 50 Td
+(Bottom Right) Tj
+EMC
+ET
+BT
+/P <</MCID 3 >>BDC
+/F4 16 Tf
+400 650 Td
+(Top Right) Tj
+EMC
+ET
+Q
+endstream
+endobj
+{{object 6 0}} <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Times-Roman
+>>
+endobj
+{{object 7 0}} <<
+  /Type /StructTreeRoot
+  /K [9 0 R 10 0 R 11 0 R 12 0 R]
+  /ParentTree 8 0 R
+  /ParentTreeNextKey 1
+>>
+endobj
+{{object 8 0}} <<
+  /Type /ParentTree
+  /Nums [0 [9 0 R 10 0 R 11 0 R 12 0 R]]
+>>
+endobj
+{{object 9 0}} <<
+  /Type /StructElem
+  /S /NonStruct
+  /P 7 0 R
+  /K 0
+  /ID /3
+>>
+endobj
+{{object 10 0}} <<
+  /Type /StructElem
+  /S /NonStruct
+  /P 7 0 R
+  /K <<
+      /Type /MCR
+      /MCID 1
+      /Pg 3 0 R
+     >>
+  /ID /4
+>>
+endobj
+{{object 11 0}} <<
+  /Type /StructElem
+  /S /NonStruct
+  /P 7 0 R
+  /K [
+      <<
+       /Type /MCR
+       /MCID 2
+       /Pg 3 0 R
+      >>
+      3]
+  /ID /5
+>>
+endobj
+{{object 12 0}} <<
+  /Type /StructElem
+  /S /NonStruct
+  /P 7 0 R
+  /ID /6
+>>
+endobj
+{{xref}}
+{{trailer}}
+{{startxref}}
+%%EOF
diff --git a/testing/resources/tagged_marked_content.pdf b/testing/resources/tagged_marked_content.pdf
new file mode 100644
index 0000000..92f731d
--- /dev/null
+++ b/testing/resources/tagged_marked_content.pdf
@@ -0,0 +1,159 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+  /Type /Catalog
+  /Pages 2 0 R
+  /StructTreeRoot 7 0 R
+  /MarkInfo <<
+    /Type /MarkInfo
+    /Marked true
+  >>
+>>
+endobj
+2 0 obj <<
+  /Type /Pages
+  /Count 1
+  /Kids [3 0 R]
+>>
+endobj
+3 0 obj <<
+  /Type /Page
+  /Parent 2 0 R
+  /StructParents 0
+  /Annots [4 0 R]
+  /Contents 5 0 R
+  /MediaBox [0 0 612 792]
+  /Resources <<
+    /ProcSet [/PDF /Text]
+    /Font <<
+      /F4 6 0 R
+    >>
+  >>
+>>
+endobj
+4 0 obj <<
+  /Type /Annot
+  /Subtype /Link
+  /Border [0 0 0]
+  /Dest /top
+  /F 4
+  /Rect [20 46 68 61]
+>>
+endobj
+5 0 obj <<
+  /Length 264
+>>
+stream
+q
+BT
+/P <</MCID 0 >>BDC
+/F4 16 Tf
+20 650 Td
+(Top Left) Tj
+EMC
+ET
+BT
+/P <</MCID 1 >>BDC
+/F4 16 Tf
+20 50 Td
+(Bottom Left) Tj
+EMC
+ET
+BT
+/P <</MCID 2 >>BDC
+/F4 16 Tf
+400 50 Td
+(Bottom Right) Tj
+EMC
+ET
+BT
+/P <</MCID 3 >>BDC
+/F4 16 Tf
+400 650 Td
+(Top Right) Tj
+EMC
+ET
+Q
+endstream
+endobj
+6 0 obj <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Times-Roman
+>>
+endobj
+7 0 obj <<
+  /Type /StructTreeRoot
+  /K [9 0 R 10 0 R 11 0 R 12 0 R]
+  /ParentTree 8 0 R
+  /ParentTreeNextKey 1
+>>
+endobj
+8 0 obj <<
+  /Type /ParentTree
+  /Nums [0 [9 0 R 10 0 R 11 0 R 12 0 R]]
+>>
+endobj
+9 0 obj <<
+  /Type /StructElem
+  /S /NonStruct
+  /P 7 0 R
+  /K 0
+  /ID /3
+>>
+endobj
+10 0 obj <<
+  /Type /StructElem
+  /S /NonStruct
+  /P 7 0 R
+  /K <<
+      /Type /MCR
+      /MCID 1
+      /Pg 3 0 R
+     >>
+  /ID /4
+>>
+endobj
+11 0 obj <<
+  /Type /StructElem
+  /S /NonStruct
+  /P 7 0 R
+  /K [
+      <<
+       /Type /MCR
+       /MCID 2
+       /Pg 3 0 R
+      >>
+      3]
+  /ID /5
+>>
+endobj
+12 0 obj <<
+  /Type /StructElem
+  /S /NonStruct
+  /P 7 0 R
+  /ID /6
+>>
+endobj
+xref
+0 13
+0000000000 65535 f 
+0000000015 00000 n 
+0000000149 00000 n 
+0000000212 00000 n 
+0000000427 00000 n 
+0000000540 00000 n 
+0000000856 00000 n 
+0000000934 00000 n 
+0000001056 00000 n 
+0000001138 00000 n 
+0000001222 00000 n 
+0000001363 00000 n 
+0000001525 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 13
+>>
+startxref
+1603
+%%EOF