Add embedder test for FPDFText_GetText() behavior with invalid chars
Take the relevant part of the PDF from the bug and combine it with
hello_world.pdf into a minimal test case. Demonstrate that
FPDFText_GetText() currently cannot return the valid characters, and that
FPDFText_FindStart() cannot find them either.
Bug: 425244539
Change-Id: Ic90d64566e45e9c3ea39ca5132b0533f678a5af8
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/133550
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index f863e82..dd0c819 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp
@@ -2248,3 +2248,37 @@
ElementsAreArray(kHelloGoodbyeText));
}
}
+
+TEST_F(FPDFTextEmbedderTest, Bug425244539) {
+ // TODO(crbug.com/425244539): This should contain the characters in "hello". Until then, the expectation pins five 0xfffe units plus a trailing 0.
+ static constexpr std::array<unsigned short, 6> kExpectedChars = {
+ 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0};
+
+ ASSERT_TRUE(OpenDocument("bug_425244539.pdf"));
+ ScopedPage page = LoadScopedPage(0);
+ ASSERT_TRUE(page);
+
+ ScopedFPDFTextPage textpage(FPDFText_LoadPage(page.get()));
+ ASSERT_TRUE(textpage);
+
+ std::array<unsigned short, 128> buffer = {};
+ int num_chars =
+ FPDFText_GetText(textpage.get(), 0, buffer.size(), buffer.data());
+ ASSERT_EQ(static_cast<int>(kExpectedChars.size()), num_chars);
+ EXPECT_THAT(pdfium::span(buffer).first<kExpectedChars.size()>(),
+ ElementsAreArray(kExpectedChars));
+
+ ScopedFPDFWideString hello = GetFPDFWideString(L"hello");
+
+ // TODO(crbug.com/425244539): This should be able to find "hello". Until then, a search handle is returned but the expected match count is 0.
+ ScopedFPDFTextFind search(
+ FPDFText_FindStart(textpage.get(), hello.get(), 0, 0));
+ EXPECT_TRUE(search);
+ EXPECT_EQ(22, FPDFText_GetSchResultIndex(search.get()));  // NOTE(review): 22 presumably matches the 22 zero bytes preceding (hello) in the test PDF's content stream - confirm.
+ EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
+
+ // Advancing finds nothing; the expected result index and count are the same as before the call.
+ EXPECT_FALSE(FPDFText_FindNext(search.get()));
+ EXPECT_EQ(22, FPDFText_GetSchResultIndex(search.get()));
+ EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
+}
diff --git a/testing/resources/bug_425244539.in b/testing/resources/bug_425244539.in
new file mode 100644
index 0000000..e943200
--- /dev/null
+++ b/testing/resources/bug_425244539.in
@@ -0,0 +1,47 @@
+{{header}}
+{{object 1 0}} <<
+ /Type /Catalog
+ /Pages 2 0 R
+>>
+endobj
+{{object 2 0}} <<
+ /Type /Pages
+ /MediaBox [0 0 200 200]
+ /Count 1
+ /Kids [3 0 R]
+>>
+endobj
+{{object 3 0}} <<
+ /Type /Page
+ /Parent 2 0 R
+ /Resources <<
+ /Font <<
+ /F1 4 0 R
+ >>
+ >>
+ /Contents 5 0 R
+>>
+endobj
+{{object 4 0}} <<
+ /Type /Font
+ /Subtype /Type1
+ /BaseFont /Times-Roman
+>>
+endobj
+{{object 5 0}} <<
+ {{streamlen}}
+>>
+stream
+q 1 0 0 1 20 100 cm
+BT
+/F1 12 Tf
+[<00000000000000000000000000000000000000000000>] TJ
+[(hello)] TJ
+ET
+Q
+endstream
+endobj
+{{xref}}
+{{trailer}}
+{{startxref}}
+%%EOF
diff --git a/testing/resources/bug_425244539.pdf b/testing/resources/bug_425244539.pdf
new file mode 100644
index 0000000..bb34bf5
--- /dev/null
+++ b/testing/resources/bug_425244539.pdf
@@ -0,0 +1,59 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+ /Type /Catalog
+ /Pages 2 0 R
+>>
+endobj
+2 0 obj <<
+ /Type /Pages
+ /MediaBox [0 0 200 200]
+ /Count 1
+ /Kids [3 0 R]
+>>
+endobj
+3 0 obj <<
+ /Type /Page
+ /Parent 2 0 R
+ /Resources <<
+ /Font <<
+ /F1 4 0 R
+ >>
+ >>
+ /Contents 5 0 R
+>>
+endobj
+4 0 obj <<
+ /Type /Font
+ /Subtype /Type1
+ /BaseFont /Times-Roman
+>>
+endobj
+5 0 obj <<
+ /Length 102
+>>
+stream
+q 1 0 0 1 20 100 cm
+BT
+/F1 12 Tf
+[<00000000000000000000000000000000000000000000>] TJ
+[(hello)] TJ
+ET
+Q
+endstream
+endobj
+xref
+0 6
+0000000000 65535 f
+0000000015 00000 n
+0000000068 00000 n
+0000000157 00000 n
+0000000283 00000 n
+0000000361 00000 n
+trailer <<
+ /Root 1 0 R
+ /Size 6
+>>
+startxref
+516
+%%EOF