Add experimental APIs to get XFA data.

Add a family of APIs to get the uncompressed data in the /XFA field.
FPDF_GetXFAPacketCount() returns how many valid XFA packets exist. For
each packet, FPDF_GetXFAPacketName() and FPDF_GetXFAPacketContent() get
the name and content of the packet, respectively.

Bug: pdfium:1568
Change-Id: Iec4ffdf80945161aba9584fe5b1ddf1d435008bd
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/71790
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
diff --git a/fpdfsdk/fpdf_view.cpp b/fpdfsdk/fpdf_view.cpp
index c145839..22ef51a 100644
--- a/fpdfsdk/fpdf_view.cpp
+++ b/fpdfsdk/fpdf_view.cpp
@@ -21,6 +21,8 @@
 #include "core/fpdfapi/parser/cpdf_document.h"
 #include "core/fpdfapi/parser/cpdf_name.h"
 #include "core/fpdfapi/parser/cpdf_parser.h"
+#include "core/fpdfapi/parser/cpdf_stream.h"
+#include "core/fpdfapi/parser/cpdf_string.h"
 #include "core/fpdfapi/parser/fpdf_parser_decode.h"
 #include "core/fpdfapi/render/cpdf_docrenderdata.h"
 #include "core/fpdfapi/render/cpdf_pagerendercache.h"
@@ -46,6 +48,7 @@
 #include "public/fpdf_formfill.h"
 #include "third_party/base/ptr_util.h"
 #include "third_party/base/span.h"
+#include "third_party/base/stl_util.h"
 
 #ifdef PDF_ENABLE_V8
 #include "fxjs/cfx_v8.h"
@@ -87,6 +90,58 @@
 
 bool g_bLibraryInitialized = false;
 
+const CPDF_Object* GetXFAEntryFromDocument(FPDF_DOCUMENT document) {
+  const CPDF_Document* doc = CPDFDocumentFromFPDFDocument(document);
+  if (!doc)
+    return nullptr;
+
+  const CPDF_Dictionary* root = doc->GetRoot();
+  if (!root)
+    return nullptr;
+
+  const CPDF_Dictionary* acro_form = root->GetDictFor("AcroForm");
+  return acro_form ? acro_form->GetObjectFor("XFA") : nullptr;
+}
+
+struct XFAPacket {
+  ByteString name;
+  const CPDF_Stream* data;
+};
+
+std::vector<XFAPacket> GetXFAPackets(const CPDF_Object* xfa_object) {
+  std::vector<XFAPacket> packets;
+
+  if (!xfa_object)
+    return packets;
+
+  const CPDF_Stream* xfa_stream = ToStream(xfa_object->GetDirect());
+  if (xfa_stream) {
+    packets.push_back({"", xfa_stream});
+    return packets;
+  }
+
+  const CPDF_Array* xfa_array = ToArray(xfa_object->GetDirect());
+  if (!xfa_array)
+    return packets;
+
+  packets.reserve(1 + (xfa_array->size() / 2));
+  for (size_t i = 0; i < xfa_array->size(); i += 2) {
+    if (i + 1 == xfa_array->size())
+      break;
+
+    const CPDF_String* name = ToString(xfa_array->GetObjectAt(i));
+    if (!name)
+      continue;
+
+    const CPDF_Stream* data = xfa_array->GetStreamAt(i + 1);
+    if (!data)
+      continue;
+
+    packets.push_back({name->GetString(), data});
+  }
+  return packets;
+}
+
 FPDF_DOCUMENT LoadDocumentImpl(
     const RetainPtr<IFX_SeekableReadStream>& pFileAccess,
     FPDF_BYTESTRING password) {
@@ -1124,3 +1179,47 @@
   }
   return FPDFDestFromCPDFArray(pDestObj->AsArray());
 }
+
+FPDF_EXPORT int FPDF_CALLCONV FPDF_GetXFAPacketCount(FPDF_DOCUMENT document) {
+  if (!document)
+    return -1;
+
+  return pdfium::CollectionSize<int>(
+      GetXFAPackets(GetXFAEntryFromDocument(document)));
+}
+
+FPDF_EXPORT unsigned long FPDF_CALLCONV
+FPDF_GetXFAPacketName(FPDF_DOCUMENT document,
+                      int index,
+                      void* buffer,
+                      unsigned long buflen) {
+  if (!document || index < 0)
+    return 0;
+
+  std::vector<XFAPacket> xfa_packets =
+      GetXFAPackets(GetXFAEntryFromDocument(document));
+  if (static_cast<size_t>(index) >= xfa_packets.size())
+    return 0;
+
+  return NulTerminateMaybeCopyAndReturnLength(xfa_packets[index].name, buffer,
+                                              buflen);
+}
+
+FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
+FPDF_GetXFAPacketContent(FPDF_DOCUMENT document,
+                         int index,
+                         void* buffer,
+                         unsigned long buflen,
+                         unsigned long* out_buflen) {
+  if (!document || index < 0 || !out_buflen)
+    return false;
+
+  std::vector<XFAPacket> xfa_packets =
+      GetXFAPackets(GetXFAEntryFromDocument(document));
+  if (static_cast<size_t>(index) >= xfa_packets.size())
+    return false;
+
+  *out_buflen = DecodeStreamMaybeCopyAndReturnLength(xfa_packets[index].data,
+                                                     buffer, buflen);
+  return true;
+}
diff --git a/fpdfsdk/fpdf_view_c_api_test.c b/fpdfsdk/fpdf_view_c_api_test.c
index c82e22e..67617b5 100644
--- a/fpdfsdk/fpdf_view_c_api_test.c
+++ b/fpdfsdk/fpdf_view_c_api_test.c
@@ -446,6 +446,9 @@
     CHK(FPDF_GetRecommendedV8Flags);
 #endif
     CHK(FPDF_GetSecurityHandlerRevision);
+    CHK(FPDF_GetXFAPacketContent);
+    CHK(FPDF_GetXFAPacketCount);
+    CHK(FPDF_GetXFAPacketName);
     CHK(FPDF_InitLibrary);
     CHK(FPDF_InitLibraryWithConfig);
     CHK(FPDF_LoadCustomDocument);
diff --git a/fpdfsdk/fpdf_view_embeddertest.cpp b/fpdfsdk/fpdf_view_embeddertest.cpp
index 4abe282..292837d 100644
--- a/fpdfsdk/fpdf_view_embeddertest.cpp
+++ b/fpdfsdk/fpdf_view_embeddertest.cpp
@@ -19,7 +19,9 @@
 #include "testing/fx_string_testhelpers.h"
 #include "testing/gtest/include/gtest/gtest.h"
 #include "testing/utils/file_util.h"
+#include "testing/utils/hash.h"
 #include "testing/utils/path_service.h"
+#include "third_party/base/stl_util.h"
 
 using pdfium::kManyRectanglesChecksum;
 
@@ -1063,6 +1065,98 @@
   UnloadPage(page);
 }
 
+TEST_F(FPDFViewEmbedderTest, GetXFAArrayData) {
+  ASSERT_TRUE(OpenDocument("simple_xfa.pdf"));
+
+  static constexpr struct {
+    const char* name;
+    size_t content_length;
+    const char* content_checksum;
+  } kExpectedResults[]{
+      {"preamble", 124u, "71be364e53292596412242bfcdb46eab"},
+      {"config", 642u, "bcd1ca1d420ee31a561273a54a06435f"},
+      {"template", 541u, "0f48cb2fa1bb9cbf9eee802d66e81bf4"},
+      {"localeSet", 3455u, "bb1f253d3e5c719ac0da87d055bc164e"},
+      {"postamble", 11u, "6b79e25da35d86634ea27c38f64cf243"},
+  };
+
+  ASSERT_EQ(static_cast<int>(pdfium::size(kExpectedResults)),
+            FPDF_GetXFAPacketCount(document()));
+  for (size_t i = 0; i < pdfium::size(kExpectedResults); ++i) {
+    char name_buffer[20] = {};
+    ASSERT_EQ(strlen(kExpectedResults[i].name) + 1,
+              FPDF_GetXFAPacketName(document(), i, nullptr, 0));
+    EXPECT_EQ(
+        strlen(kExpectedResults[i].name) + 1,
+        FPDF_GetXFAPacketName(document(), i, name_buffer, sizeof(name_buffer)));
+    EXPECT_STREQ(kExpectedResults[i].name, name_buffer);
+
+    unsigned long buflen;
+    ASSERT_TRUE(FPDF_GetXFAPacketContent(document(), i, nullptr, 0, &buflen));
+    ASSERT_EQ(kExpectedResults[i].content_length, buflen);
+    std::vector<uint8_t> data_buffer(buflen);
+    EXPECT_TRUE(FPDF_GetXFAPacketContent(document(), i, data_buffer.data(),
+                                         data_buffer.size(), &buflen));
+    EXPECT_EQ(kExpectedResults[i].content_length, buflen);
+    EXPECT_STREQ(
+        kExpectedResults[i].content_checksum,
+        GenerateMD5Base16(data_buffer.data(), data_buffer.size()).c_str());
+  }
+
+  // Test bad parameters.
+  EXPECT_EQ(-1, FPDF_GetXFAPacketCount(nullptr));
+
+  EXPECT_EQ(0u, FPDF_GetXFAPacketName(nullptr, 0, nullptr, 0));
+  EXPECT_EQ(0u, FPDF_GetXFAPacketName(document(), -1, nullptr, 0));
+  EXPECT_EQ(0u, FPDF_GetXFAPacketName(
+                    document(), pdfium::size(kExpectedResults), nullptr, 0));
+
+  unsigned long buflen = 123;
+  EXPECT_FALSE(FPDF_GetXFAPacketContent(nullptr, 0, nullptr, 0, &buflen));
+  EXPECT_EQ(123u, buflen);
+  EXPECT_FALSE(FPDF_GetXFAPacketContent(document(), -1, nullptr, 0, &buflen));
+  EXPECT_EQ(123u, buflen);
+  EXPECT_FALSE(FPDF_GetXFAPacketContent(
+      document(), pdfium::size(kExpectedResults), nullptr, 0, &buflen));
+  EXPECT_EQ(123u, buflen);
+  EXPECT_FALSE(FPDF_GetXFAPacketContent(document(), 0, nullptr, 0, nullptr));
+}
+
+TEST_F(FPDFViewEmbedderTest, GetXFAStreamData) {
+  ASSERT_TRUE(OpenDocument("bug_1265.pdf"));
+
+  ASSERT_EQ(1, FPDF_GetXFAPacketCount(document()));
+
+  char name_buffer[20] = {};
+  ASSERT_EQ(1u, FPDF_GetXFAPacketName(document(), 0, nullptr, 0));
+  EXPECT_EQ(1u, FPDF_GetXFAPacketName(document(), 0, name_buffer,
+                                      sizeof(name_buffer)));
+  EXPECT_STREQ("", name_buffer);
+
+  unsigned long buflen;
+  ASSERT_TRUE(FPDF_GetXFAPacketContent(document(), 0, nullptr, 0, &buflen));
+  ASSERT_EQ(121u, buflen);
+  std::vector<uint8_t> data_buffer(buflen);
+  EXPECT_TRUE(FPDF_GetXFAPacketContent(document(), 0, data_buffer.data(),
+                                       data_buffer.size(), &buflen));
+  EXPECT_EQ(121u, buflen);
+  EXPECT_STREQ(
+      "8f912eaa1e66c9341cb3032ede71e147",
+      GenerateMD5Base16(data_buffer.data(), data_buffer.size()).c_str());
+}
+
+TEST_F(FPDFViewEmbedderTest, GetXFADataForNoForm) {
+  ASSERT_TRUE(OpenDocument("hello_world.pdf"));
+
+  EXPECT_EQ(0, FPDF_GetXFAPacketCount(document()));
+}
+
+TEST_F(FPDFViewEmbedderTest, GetXFADataForAcroForm) {
+  ASSERT_TRUE(OpenDocument("text_form.pdf"));
+
+  EXPECT_EQ(0, FPDF_GetXFAPacketCount(document()));
+}
+
 class RecordUnsupportedErrorDelegate final : public EmbedderTest::Delegate {
  public:
   RecordUnsupportedErrorDelegate() = default;
diff --git a/public/fpdfview.h b/public/fpdfview.h
index 3181df3..94bf776 100644
--- a/public/fpdfview.h
+++ b/public/fpdfview.h
@@ -1291,6 +1291,65 @@
                                                       void* buffer,
                                                       long* buflen);
 
+// Experimental API.
+// Function: FPDF_GetXFAPacketCount
+//          Get the number of valid packets in the XFA entry.
+// Parameters:
+//          document - Handle to the document.
+// Return value:
+//          The number of valid packets, or -1 on error.
+FPDF_EXPORT int FPDF_CALLCONV FPDF_GetXFAPacketCount(FPDF_DOCUMENT document);
+
+// Experimental API.
+// Function: FPDF_GetXFAPacketName
+//          Get the name of a packet in the XFA array.
+// Parameters:
+//          document - Handle to the document.
+//          index    - Index number of the packet. 0 for the first packet.
+//          buffer   - Buffer for holding the name of the XFA packet.
+//          buflen   - Length of |buffer| in bytes.
+// Return value:
+//          The length of the packet name in bytes, or 0 on error.
+//
+// |document| must be valid and |index| must be in the range [0, N), where N is
+// the value returned by FPDF_GetXFAPacketCount().
+// |buffer| is only modified if it is non-NULL and |buflen| is greater than or
+// equal to the length of the packet name. The packet name includes a
+// terminating NUL character. |buffer| is unmodified on error.
+FPDF_EXPORT unsigned long FPDF_CALLCONV FPDF_GetXFAPacketName(
+    FPDF_DOCUMENT document,
+    int index,
+    void* buffer,
+    unsigned long buflen);
+
+// Experimental API.
+// Function: FPDF_GetXFAPacketContent
+//          Get the content of a packet in the XFA array.
+// Parameters:
+//          document   - Handle to the document.
+//          index      - Index number of the packet. 0 for the first packet.
+//          buffer     - Buffer for holding the content of the XFA packet.
+//          buflen     - Length of |buffer| in bytes.
+//          out_buflen - Pointer to the variable that will receive the minimum
+//                       buffer size needed to contain the content of the XFA
+//                       packet.
+// Return value:
+//          Whether the operation succeeded or not.
+//
+// |document| must be valid and |index| must be in the range [0, N), where N is
+// the value returned by FPDF_GetXFAPacketCount(). |out_buflen| must not be
+// NULL. When the aforementioned arguments are valid, the operation succeeds,
+// and |out_buflen| receives the content size. |buffer| is only modified if
+// |buffer| is non-null and long enough to contain the content. Callers must
+// check both the return value and the input |buflen| is no less than the
+// returned |out_buflen| before using the data in |buffer|.
+FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDF_GetXFAPacketContent(
+    FPDF_DOCUMENT document,
+    int index,
+    void* buffer,
+    unsigned long buflen,
+    unsigned long* out_buflen);
+
 #ifdef PDF_ENABLE_V8
 // Function: FPDF_GetRecommendedV8Flags
 //          Returns a space-separated string of command line flags that are