Add embedder test for FPDFText_GetText() behavior with invalid chars

Take the relevant part of the PDF from the bug and combine it with
hello_world.pdf into a minimal test case. Demonstrate that
FPDFText_GetText() currently cannot return the valid characters, and
that FPDFText_FindStart() cannot find them either.

Bug: 425244539
Change-Id: Ic90d64566e45e9c3ea39ca5132b0533f678a5af8
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/133550
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>

commit: 429bf9f4c7ba92ccca3a3ec3ea3db9615b8f6a59 [log] [tgz]
author: Lei Zhang <thestig@chromium.org> Tue Jun 24 12:26:38 2025 -0700
committer: Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com> Tue Jun 24 12:26:38 2025 -0700
tree: a078201de97a7c6267b2aa020a7845182e2a7e05
parent: c8d8bdcb981994b311ee85fb77014b7d07b7ba89 [diff]
diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index f863e82..dd0c819 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp

@@ -2248,3 +2248,37 @@
                 ElementsAreArray(kHelloGoodbyeText));
   }
 }
+
+TEST_F(FPDFTextEmbedderTest, Bug425244539) {
+  // TODO(crbug.com/425244539): kExpectedChars should hold "hello", not 0xfffe.
+  static constexpr std::array<unsigned short, 6> kExpectedChars = {
+      0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0};
+
+  ASSERT_TRUE(OpenDocument("bug_425244539.pdf"));
+  ScopedPage page = LoadScopedPage(0);
+  ASSERT_TRUE(page);
+
+  ScopedFPDFTextPage textpage(FPDFText_LoadPage(page.get()));
+  ASSERT_TRUE(textpage);
+
+  std::array<unsigned short, 128> buffer = {};
+  int num_chars =
+      FPDFText_GetText(textpage.get(), 0, buffer.size(), buffer.data());
+  ASSERT_EQ(static_cast<int>(kExpectedChars.size()), num_chars);
+  EXPECT_THAT(pdfium::span(buffer).first<kExpectedChars.size()>(),
+              ElementsAreArray(kExpectedChars));
+
+  ScopedFPDFWideString hello = GetFPDFWideString(L"hello");
+
+  // TODO(crbug.com/425244539): Should find "hello"; for now the count stays 0.
+  ScopedFPDFTextFind search(
+      FPDFText_FindStart(textpage.get(), hello.get(), 0, 0));
+  EXPECT_TRUE(search);
+  EXPECT_EQ(22, FPDFText_GetSchResultIndex(search.get()));
+  EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
+
+  // FPDFText_FindNext() finds nothing, so the index and count do not change.
+  EXPECT_FALSE(FPDFText_FindNext(search.get()));
+  EXPECT_EQ(22, FPDFText_GetSchResultIndex(search.get()));
+  EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
+}

diff --git a/testing/resources/bug_425244539.in b/testing/resources/bug_425244539.in
new file mode 100644
index 0000000..e943200
--- /dev/null
+++ b/testing/resources/bug_425244539.in

@@ -0,0 +1,47 @@
+{{header}}
+{{object 1 0}} <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+endobj
+{{object 2 0}} <<
+  /Type /Pages
+  /MediaBox [0 0 200 200]
+  /Count 1
+  /Kids [3 0 R]
+>>
+endobj
+{{object 3 0}} <<
+  /Type /Page
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 4 0 R
+    >>
+  >>
+  /Contents 5 0 R
+>>
+endobj
+{{object 4 0}} <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Times-Roman
+>>
+endobj
+{{object 5 0}} <<
+  {{streamlen}}
+>>
+stream
+q 1 0 0 1 20 100 cm
+BT
+/F1 12 Tf
+[<00000000000000000000000000000000000000000000>] TJ
+[(hello)] TJ
+ET
+Q
+endstream
+endobj
+{{xref}}
+{{trailer}}
+{{startxref}}
+%%EOF

diff --git a/testing/resources/bug_425244539.pdf b/testing/resources/bug_425244539.pdf
new file mode 100644
index 0000000..bb34bf5
--- /dev/null
+++ b/testing/resources/bug_425244539.pdf

@@ -0,0 +1,59 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+endobj
+2 0 obj <<
+  /Type /Pages
+  /MediaBox [0 0 200 200]
+  /Count 1
+  /Kids [3 0 R]
+>>
+endobj
+3 0 obj <<
+  /Type /Page
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 4 0 R
+    >>
+  >>
+  /Contents 5 0 R
+>>
+endobj
+4 0 obj <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Times-Roman
+>>
+endobj
+5 0 obj <<
+  /Length 102
+>>
+stream
+q 1 0 0 1 20 100 cm
+BT
+/F1 12 Tf
+[<00000000000000000000000000000000000000000000>] TJ
+[(hello)] TJ
+ET
+Q
+endstream
+endobj
+xref
+0 6
+0000000000 65535 f 
+0000000015 00000 n 
+0000000068 00000 n 
+0000000157 00000 n 
+0000000283 00000 n 
+0000000361 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 6
+>>
+startxref
+516
+%%EOF
commit	429bf9f4c7ba92ccca3a3ec3ea3db9615b8f6a59	[log] [tgz]
author	Lei Zhang <thestig@chromium.org>	Tue Jun 24 12:26:38 2025 -0700
committer	Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com>	Tue Jun 24 12:26:38 2025 -0700
tree	a078201de97a7c6267b2aa020a7845182e2a7e05
parent	c8d8bdcb981994b311ee85fb77014b7d07b7ba89 [diff]