Add embedder test for FPDFText_GetText() behavior with invalid chars

Take the relevant part of the PDF from the bug and combine it with
hello_world.pdf into a minimal test case. Demonstrate that
FPDFText_GetText() currently cannot return the valid characters, and
that FPDFText_FindStart() cannot find them either.

Bug: 425244539
Change-Id: Ic90d64566e45e9c3ea39ca5132b0533f678a5af8
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/133550
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index f863e82..dd0c819 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp
@@ -2248,3 +2248,37 @@
                 ElementsAreArray(kHelloGoodbyeText));
   }
 }
+
+TEST_F(FPDFTextEmbedderTest, Bug425244539) {
+  // TODO(crbug.com/425244539): This should contain the characters in "hello".
+  static constexpr std::array<unsigned short, 6> kExpectedChars = {
+      0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0};
+  // The test PDF shows a 22-byte all-zero string, then "hello".
+  ASSERT_TRUE(OpenDocument("bug_425244539.pdf"));
+  ScopedPage page = LoadScopedPage(0);
+  ASSERT_TRUE(page);
+
+  ScopedFPDFTextPage textpage(FPDFText_LoadPage(page.get()));
+  ASSERT_TRUE(textpage);
+  // Both checks below cover all 6 entries of kExpectedChars, trailing 0 too.
+  std::array<unsigned short, 128> buffer = {};
+  int num_chars =
+      FPDFText_GetText(textpage.get(), 0, buffer.size(), buffer.data());
+  ASSERT_EQ(static_cast<int>(kExpectedChars.size()), num_chars);
+  EXPECT_THAT(pdfium::span(buffer).first<kExpectedChars.size()>(),
+              ElementsAreArray(kExpectedChars));
+
+  ScopedFPDFWideString hello = GetFPDFWideString(L"hello");
+
+  // TODO(crbug.com/425244539): This should be able to find "hello".
+  ScopedFPDFTextFind search(
+      FPDFText_FindStart(textpage.get(), hello.get(), 0, 0));
+  EXPECT_TRUE(search);
+  EXPECT_EQ(22, FPDFText_GetSchResultIndex(search.get()));
+  EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
+
+  // Advancing finds nothing, and the reported index and count are unchanged.
+  EXPECT_FALSE(FPDFText_FindNext(search.get()));
+  EXPECT_EQ(22, FPDFText_GetSchResultIndex(search.get()));
+  EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
+}
diff --git a/testing/resources/bug_425244539.in b/testing/resources/bug_425244539.in
new file mode 100644
index 0000000..e943200
--- /dev/null
+++ b/testing/resources/bug_425244539.in
@@ -0,0 +1,47 @@
+{{header}}
+{{object 1 0}} <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+endobj
+{{object 2 0}} <<
+  /Type /Pages
+  /MediaBox [0 0 200 200]
+  /Count 1
+  /Kids [3 0 R]
+>>
+endobj
+{{object 3 0}} <<
+  /Type /Page
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 4 0 R
+    >>
+  >>
+  /Contents 5 0 R
+>>
+endobj
+{{object 4 0}} <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Times-Roman
+>>
+endobj
+{{object 5 0}} <<
+  {{streamlen}}
+>>
+stream
+q 1 0 0 1 20 100 cm
+BT
+/F1 12 Tf
+[<00000000000000000000000000000000000000000000>] TJ
+[(hello)] TJ
+ET
+Q
+endstream
+endobj
+{{xref}}
+{{trailer}}
+{{startxref}}
+%%EOF
diff --git a/testing/resources/bug_425244539.pdf b/testing/resources/bug_425244539.pdf
new file mode 100644
index 0000000..bb34bf5
--- /dev/null
+++ b/testing/resources/bug_425244539.pdf
@@ -0,0 +1,59 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+endobj
+2 0 obj <<
+  /Type /Pages
+  /MediaBox [0 0 200 200]
+  /Count 1
+  /Kids [3 0 R]
+>>
+endobj
+3 0 obj <<
+  /Type /Page
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 4 0 R
+    >>
+  >>
+  /Contents 5 0 R
+>>
+endobj
+4 0 obj <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Times-Roman
+>>
+endobj
+5 0 obj <<
+  /Length 102
+>>
+stream
+q 1 0 0 1 20 100 cm
+BT
+/F1 12 Tf
+[<00000000000000000000000000000000000000000000>] TJ
+[(hello)] TJ
+ET
+Q
+endstream
+endobj
+xref
+0 6
+0000000000 65535 f 
+0000000015 00000 n 
+0000000068 00000 n 
+0000000157 00000 n 
+0000000283 00000 n 
+0000000361 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 6
+>>
+startxref
+516
+%%EOF