Add a test case for another find text failure
Distill the PDF attached to the bug into a minimal test case. Add an
embedder test case that extracts the text. Show that the extracted text is
missing a dash.
Bug: 431824298
Change-Id: I67ce45a17dca191e28a7aa31e9b27cbe339b37b2
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/134450
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index 34dc9c8..fa20e81 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp
@@ -2279,3 +2279,38 @@
EXPECT_EQ(22, FPDFText_GetSchResultIndex(search.get()));
EXPECT_EQ(5, FPDFText_GetSchCount(search.get()));
}
+
+TEST_F(FPDFTextEmbedderTest, Bug431824298) {
+ // TODO(crbug.com/431824298): 0xfffe should be a dash. It sits where the trailing '-' of "-world-" is drawn.
+ static constexpr std::array<unsigned short, 19> kExpectedChars = {
+ '-', 'h', 'e', 'l', 'l', 'o', '-', '\r', '\n', '-',  // "-hello-", "\r\n", then the start of "-world".
+ 'w', 'o', 'r', 'l', 'd', 0xfffe, 0x501f, 0x6b3e, 0};  // 0x501f/0x6b3e match the /F2 ToUnicode map; last entry is 0.
+
+ ASSERT_TRUE(OpenDocument("bug_431824298.pdf"));
+ ScopedPage page = LoadScopedPage(0);
+ ASSERT_TRUE(page);
+
+ ScopedFPDFTextPage textpage(FPDFText_LoadPage(page.get()));
+ ASSERT_TRUE(textpage);
+
+ std::array<unsigned short, 128> buffer = {};  // Zero-filled and larger than kExpectedChars.
+ int num_chars =
+ FPDFText_GetText(textpage.get(), 0, buffer.size(), buffer.data());
+ ASSERT_EQ(static_cast<int>(kExpectedChars.size()), num_chars);  // Expected count includes the trailing 0 entry.
+ EXPECT_THAT(pdfium::span(buffer).first<kExpectedChars.size()>(),  // Compare only the first 19 buffer entries.
+ ElementsAreArray(kExpectedChars));
+
+ ScopedFPDFWideString world = GetFPDFWideString(L"-world-");  // Same string as the second text line in the PDF.
+
+ ScopedFPDFTextFind search(
+ FPDFText_FindStart(textpage.get(), world.get(), 0, 0));
+ EXPECT_TRUE(search);
+ EXPECT_EQ(0, FPDFText_GetSchResultIndex(search.get()));  // Before FindNext(), index and count are expected to be 0.
+ EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
+
+ // TODO(crbug.com/431824298): Once 0xfffe in `kExpectedChars` is a dash, this
+ // search should succeed. Until then, FindNext() is expected to fail and leave the index and count at 0.
+ EXPECT_FALSE(FPDFText_FindNext(search.get()));
+ EXPECT_EQ(0, FPDFText_GetSchResultIndex(search.get()));
+ EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
+}
diff --git a/testing/resources/bug_431824298.in b/testing/resources/bug_431824298.in
new file mode 100644
index 0000000..a23b36f
--- /dev/null
+++ b/testing/resources/bug_431824298.in
@@ -0,0 +1,118 @@
+{{header}}
+{{object 1 0}} <<
+ /Type /Catalog
+ /Pages 2 0 R
+>>
+endobj
+{{object 2 0}} <<
+ /Type /Pages
+ /Count 1
+ /Kids [3 0 R]
+>>
+endobj
+{{object 3 0}} <<
+ /Type /Page
+ /Parent 2 0 R
+ /Contents 4 0 R
+ /MediaBox [0 0 200 300]
+ /Resources <<
+ /Font <<
+ /F1 5 0 R
+ /F2 6 0 R
+ >>
+ >>
+>>
+endobj
+{{object 4 0}} <<
+ {{streamlen}}
+>>
+stream
+BT
+/F1 12 Tf
+20 80 Td
+[(-hello-)] TJ
+ET
+BT
+/F1 12 Tf
+20 50 Td
+[(-world-)] TJ
+ET
+BT
+/F2 12 Tf
+20 30 Td
+[<064F216E>] TJ
+ET
+endstream
+endobj
+{{object 5 0}} <<
+ /Type /Font
+ /Subtype /Type1
+ /BaseFont /Helvetica
+>>
+endobj
+{{object 6 0}} <<
+ /Type /Font
+ /Subtype /Type0
+ /BaseFont /BCDEEE+SimSun
+ /DescendantFonts [7 0 R]
+ /Encoding /Identity-H
+ /ToUnicode 9 0 R
+>>
+endobj
+{{object 7 0}} <<
+ /Type /Font
+ /Subtype /CIDFontType2
+ /BaseFont /BCDEEE+SimSun
+ /CIDToGIDMap /Identity
+ /DW 1000
+ /FontDescriptor 8 0 R
+>>
+endobj
+{{object 8 0}} <<
+ /Type /FontDescriptor
+ /Ascent 859
+ /AvgWidth 500
+ /CapHeight 859
+ /Descent -141
+ /Flags 32
+ /FontBBox [-8 -141 1000 859]
+ /FontName /BCDEEE+SimSun
+ /FontWeight 400
+ /ItalicAngle 0
+ /Leading 141
+ /MaxWidth 1008
+ /StemV 50
+ /XHeight 250
+>>
+endobj
+{{object 9 0}} <<
+ {{streamlen}}
+>>
+stream
+/CIDInit /ProcSet findresource begin
+28 dict begin
+begincmap
+/CIDSystemInfo <<
+/Registry (Adobe)
+/Ordering (UCS)
+/Supplement 0
+>> def
+/CMapName /Adobe-Identity-UCS def
+/CMapType 2 def
+1 begincodespacerange
+<0000> <FFFF>
+endcodespacerange
+2 beginbfchar
+<064F> <501F>
+<216E> <6B3E>
+endbfchar
+endcmap
+CMapName currentdict /CMap defineresource pop
+end
+end
+endstream
+endobj
+{{xref}}
+{{trailer}}
+{{startxref}}
+%%EOF
diff --git a/testing/resources/bug_431824298.pdf b/testing/resources/bug_431824298.pdf
new file mode 100644
index 0000000..1ede830
--- /dev/null
+++ b/testing/resources/bug_431824298.pdf
@@ -0,0 +1,134 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+ /Type /Catalog
+ /Pages 2 0 R
+>>
+endobj
+2 0 obj <<
+ /Type /Pages
+ /Count 1
+ /Kids [3 0 R]
+>>
+endobj
+3 0 obj <<
+ /Type /Page
+ /Parent 2 0 R
+ /Contents 4 0 R
+ /MediaBox [0 0 200 300]
+ /Resources <<
+ /Font <<
+ /F1 5 0 R
+ /F2 6 0 R
+ >>
+ >>
+>>
+endobj
+4 0 obj <<
+ /Length 120
+>>
+stream
+BT
+/F1 12 Tf
+20 80 Td
+[(-hello-)] TJ
+ET
+BT
+/F1 12 Tf
+20 50 Td
+[(-world-)] TJ
+ET
+BT
+/F2 12 Tf
+20 30 Td
+[<064F216E>] TJ
+ET
+endstream
+endobj
+5 0 obj <<
+ /Type /Font
+ /Subtype /Type1
+ /BaseFont /Helvetica
+>>
+endobj
+6 0 obj <<
+ /Type /Font
+ /Subtype /Type0
+ /BaseFont /BCDEEE+SimSun
+ /DescendantFonts [7 0 R]
+ /Encoding /Identity-H
+ /ToUnicode 9 0 R
+>>
+endobj
+7 0 obj <<
+ /Type /Font
+ /Subtype /CIDFontType2
+ /BaseFont /BCDEEE+SimSun
+ /CIDToGIDMap /Identity
+ /DW 1000
+ /FontDescriptor 8 0 R
+>>
+endobj
+8 0 obj <<
+ /Type /FontDescriptor
+ /Ascent 859
+ /AvgWidth 500
+ /CapHeight 859
+ /Descent -141
+ /Flags 32
+ /FontBBox [-8 -141 1000 859]
+ /FontName /BCDEEE+SimSun
+ /FontWeight 400
+ /ItalicAngle 0
+ /Leading 141
+ /MaxWidth 1008
+ /StemV 50
+ /XHeight 250
+>>
+endobj
+9 0 obj <<
+ /Length 351
+>>
+stream
+/CIDInit /ProcSet findresource begin
+28 dict begin
+begincmap
+/CIDSystemInfo <<
+/Registry (Adobe)
+/Ordering (UCS)
+/Supplement 0
+>> def
+/CMapName /Adobe-Identity-UCS def
+/CMapType 2 def
+1 begincodespacerange
+<0000> <FFFF>
+endcodespacerange
+2 beginbfchar
+<064F> <501F>
+<216E> <6B3E>
+endbfchar
+endcmap
+CMapName currentdict /CMap defineresource pop
+end
+end
+endstream
+endobj
+xref
+0 10
+0000000000 65535 f
+0000000015 00000 n
+0000000068 00000 n
+0000000131 00000 n
+0000000299 00000 n
+0000000472 00000 n
+0000000548 00000 n
+0000000698 00000 n
+0000000845 00000 n
+0000001117 00000 n
+trailer <<
+ /Root 1 0 R
+ /Size 10
+>>
+startxref
+1521
+%%EOF