Add targeted XFA fuzzer with a few preconditions

Add an XFA fuzzer that filters on the fuzz input. The filtering is done
to increase the chances of exploring XFA-related logic and to keep the
fuzzer from exploring unrelated code, e.g. v8.

Bug: chromium: 1276950
Change-Id: I5d9776a16784f0970143daa396096b30d74964e6
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/87630
Reviewed-by: Lei Zhang <thestig@chromium.org>
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
diff --git a/testing/fuzzers/BUILD.gn b/testing/fuzzers/BUILD.gn
index f3e3e05..f9c1c9f 100644
--- a/testing/fuzzers/BUILD.gn
+++ b/testing/fuzzers/BUILD.gn
@@ -73,7 +73,10 @@
     "pdf_nametree_fuzzer",
   ]
   if (pdf_enable_xfa) {
-    fuzzer_list += [ "pdf_xfa_fdp_fuzzer" ]
+    fuzzer_list += [
+      "pdf_xfa_fdp_fuzzer",
+      "pdf_xfa_raw_fuzzer",
+    ]
   }
 }
 
@@ -91,6 +94,10 @@
   }
 }
 
+source_set("fuzzer_pdf_templates") {
+  sources = [ "pdf_fuzzer_templates.h" ]
+}
+
 source_set("fuzzer_init") {
   testonly = true
   sources = [ "pdf_fuzzer_init.cc" ]
@@ -465,6 +472,16 @@
       sources = [ "pdf_xfa_fdp_fuzzer.cc" ]
       deps = [
         ":fuzzer_helper",
+        ":fuzzer_pdf_templates",
+        "../../third_party:pdfium_base",
+      ]
+      public_fuzzer = true
+    }
+    pdfium_fuzzer("pdf_xfa_raw_fuzzer") {
+      sources = [ "pdf_xfa_raw_fuzzer.cc" ]
+      deps = [
+        ":fuzzer_helper",
+        ":fuzzer_pdf_templates",
         "../../third_party:pdfium_base",
       ]
       public_fuzzer = true
diff --git a/testing/fuzzers/pdf_fuzzer_templates.h b/testing/fuzzers/pdf_fuzzer_templates.h
new file mode 100644
index 0000000..d5cdfd9
--- /dev/null
+++ b/testing/fuzzers/pdf_fuzzer_templates.h
@@ -0,0 +1,30 @@
+// Copyright 2021 The PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// File for holding strings representing PDF templates that are used by fuzzers.
+
+#ifndef TESTING_FUZZERS_PDF_FUZZER_TEMPLATES_H_
+#define TESTING_FUZZERS_PDF_FUZZER_TEMPLATES_H_
+
+constexpr char kSimplePdfTemplate[] = R"(%PDF-1.7
+1 0 obj
+<</Type /Catalog /Pages 2 0 R /AcroForm <</XFA 30 0 R>> /NeedsRendering true>>
+endobj
+2 0 obj
+<</Type /Pages /Kids [3 0 R] /Count 1>>
+endobj
+3 0 obj
+<</Type /Page /Parent 2 0 R /MediaBox [0 0 3 3]>>
+endobj
+30 0 obj
+<</Length $1>>
+stream
+$2
+endstream
+endobj
+trailer
+<</Root 1 0 R /Size 31>>
+%%EOF)";
+
+#endif  // TESTING_FUZZERS_PDF_FUZZER_TEMPLATES_H_
diff --git a/testing/fuzzers/pdf_xfa_fdp_fuzzer.cc b/testing/fuzzers/pdf_xfa_fdp_fuzzer.cc
index d346e54..7f2707c 100644
--- a/testing/fuzzers/pdf_xfa_fdp_fuzzer.cc
+++ b/testing/fuzzers/pdf_xfa_fdp_fuzzer.cc
@@ -8,6 +8,7 @@
 #include <vector>
 
 #include "public/fpdf_formfill.h"
+#include "testing/fuzzers/pdf_fuzzer_templates.h"
 #include "testing/fuzzers/pdfium_fuzzer_helper.h"
 #include "third_party/base/containers/adapters.h"
 #include "third_party/base/cxx17_backports.h"
@@ -592,26 +593,6 @@
   return xfa_string;
 }
 
-const char kSimplePdfTemplate[] = R"(%PDF-1.7
-1 0 obj
-<</Type /Catalog /Pages 2 0 R /AcroForm <</XFA 30 0 R>> /NeedsRendering true>>
-endobj
-2 0 obj
-<</Type /Pages /Kids [3 0 R] /Count 1>>
-endobj
-3 0 obj
-<</Type /Page /Parent 2 0 R /MediaBox [0 0 3 3]>>
-endobj
-30 0 obj
-<</Length $1>>
-stream
-$2
-endstream
-endobj
-trailer
-<</Root 1 0 R /Size 31>>
-%%EOF)";
-
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
   FuzzedDataProvider data_provider(data, size);
   std::string xfa_string = GenXfaTree(&data_provider);
diff --git a/testing/fuzzers/pdf_xfa_raw_fuzzer.cc b/testing/fuzzers/pdf_xfa_raw_fuzzer.cc
new file mode 100644
index 0000000..5bf096d
--- /dev/null
+++ b/testing/fuzzers/pdf_xfa_raw_fuzzer.cc
@@ -0,0 +1,101 @@
+// Copyright 2021 The PDFium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <fuzzer/FuzzedDataProvider.h>
+
+#include <cctype>
+#include <string>
+
+#include "public/fpdf_formfill.h"
+#include "testing/fuzzers/pdf_fuzzer_templates.h"
+#include "testing/fuzzers/pdfium_fuzzer_helper.h"
+
+class PDFiumXFAFuzzer : public PDFiumFuzzerHelper {
+ public:
+  PDFiumXFAFuzzer() = default;
+  ~PDFiumXFAFuzzer() override = default;
+
+  int GetFormCallbackVersion() const override { return 2; }
+
+  // Returns false unless |doc| is a full or foreground XFA form that loads via
+  // FPDF_LoadXFA(); otherwise we'd duplicate the work of the non-XFA fuzzer.
+  bool OnFormFillEnvLoaded(FPDF_DOCUMENT doc) override {
+    int form_type = FPDF_GetFormType(doc);
+    if (form_type != FORMTYPE_XFA_FULL && form_type != FORMTYPE_XFA_FOREGROUND)
+      return false;
+    return FPDF_LoadXFA(doc);
+  }
+};
+
+// Returns true if |data| is small (<= 2048 bytes), printable/whitespace-only,
+// free of the substring "script", and has balanced, non-empty, non-nested
+// angle-bracket pairs. Anything else is rejected to keep fuzzing on XFA.
+bool IsValidForFuzzing(const uint8_t* data, size_t size) {
+  if (size > 2048) {
+    return false;
+  }
+
+  const char* ptr = reinterpret_cast<const char*>(data);
+  bool is_open = false;
+  size_t tag_size = 0;
+  for (size_t i = 0; i < size; i++) {
+    // Classify the unsigned byte: passing a negative char to <cctype> is UB.
+    if (!std::isspace(data[i]) && !std::isprint(data[i])) {
+      return false;
+    }
+
+    // Reject any input containing "script" so this fuzzer stays out of v8.
+    // This deliberately over-approximates: matching only "<script" could be
+    // defeated by whitespace or by other tags such as "Javascript".
+    // Use <= so that a "script" ending exactly at the end of input is caught.
+    if (i + 6 <= size && memcmp(ptr + i, "script", 6) == 0) {
+      return false;
+    }
+
+    if (ptr[i] == '<') {
+      if (is_open) {
+        return false;
+      }
+      is_open = true;
+      tag_size = 0;
+    } else if (ptr[i] == '>') {
+      if (!is_open || tag_size == 0) {
+        return false;
+      }
+      is_open = false;
+    } else if (is_open) {
+      tag_size++;
+    }
+  }
+  // The last bracket must be closed.
+  return !is_open;
+}
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+  // Skip inputs rejected by the filter to reduce state space exploration.
+  if (!IsValidForFuzzing(data, size)) {
+    return 0;
+  }
+  std::string xfa_string = "<xdp xmlns=\"http://ns.adobe.com/xdp/\">";
+  xfa_string += std::string(reinterpret_cast<const char*>(data), size);
+  xfa_string += "</xdp>";
+
+  // Add 1 for newline before endstream.
+  std::string xfa_stream_len = std::to_string(xfa_string.size() + 1);
+
+  // Build the PDF: "$1" takes the stream length, "$2" the XFA stream body.
+  std::string xfa_final_str = std::string(kSimplePdfTemplate);
+  xfa_final_str.replace(xfa_final_str.find("$1"), 2, xfa_stream_len);
+  xfa_final_str.replace(xfa_final_str.find("$2"), 2, xfa_string);
+
+#ifdef PDFIUM_FUZZER_DUMP
+  for (size_t i = 0; i < xfa_final_str.size(); i++) {
+    putc(xfa_final_str[i], stdout);
+  }
+#endif
+
+  PDFiumXFAFuzzer fuzzer;
+  fuzzer.RenderPdf(xfa_final_str.c_str(), xfa_final_str.size());
+  return 0;
+}