Add a test case for another find text failure

Distill the PDF attached to the bug into a minimal test case. Add an
embedder test case to extract the text. Show that the text is missing a
dash.

Bug: 431824298
Change-Id: I67ce45a17dca191e28a7aa31e9b27cbe339b37b2
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/134450
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index 34dc9c8..fa20e81 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp
@@ -2279,3 +2279,38 @@
   EXPECT_EQ(22, FPDFText_GetSchResultIndex(search.get()));
   EXPECT_EQ(5, FPDFText_GetSchCount(search.get()));
 }
+
+TEST_F(FPDFTextEmbedderTest, Bug431824298) {
+  // TODO(crbug.com/431824298): 0xfffe should be the dash ending "-world-".
+  static constexpr std::array<unsigned short, 19> kExpectedChars = {
+      '-', 'h', 'e', 'l', 'l', 'o',    '-',    '\r',   '\n', '-',
+      'w', 'o', 'r', 'l', 'd', 0xfffe, 0x501f, 0x6b3e, 0};
+
+  ASSERT_TRUE(OpenDocument("bug_431824298.pdf"));
+  ScopedPage page = LoadScopedPage(0);
+  ASSERT_TRUE(page);
+
+  ScopedFPDFTextPage textpage(FPDFText_LoadPage(page.get()));
+  ASSERT_TRUE(textpage);
+
+  std::array<unsigned short, 128> buffer = {};
+  int num_chars =
+      FPDFText_GetText(textpage.get(), 0, buffer.size(), buffer.data());
+  ASSERT_EQ(static_cast<int>(kExpectedChars.size()), num_chars);
+  EXPECT_THAT(pdfium::span(buffer).first<kExpectedChars.size()>(),
+              ElementsAreArray(kExpectedChars));
+
+  ScopedFPDFWideString world = GetFPDFWideString(L"-world-");
+
+  ScopedFPDFTextFind search(
+      FPDFText_FindStart(textpage.get(), world.get(), 0, 0));
+  EXPECT_TRUE(search);
+  EXPECT_EQ(0, FPDFText_GetSchResultIndex(search.get()));
+  EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
+
+  // TODO(crbug.com/431824298): Once 0xfffe in `kExpectedChars` is a dash, this
+  // search for "-world-" should succeed instead of returning false.
+  EXPECT_FALSE(FPDFText_FindNext(search.get()));
+  EXPECT_EQ(0, FPDFText_GetSchResultIndex(search.get()));
+  EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
+}
diff --git a/testing/resources/bug_431824298.in b/testing/resources/bug_431824298.in
new file mode 100644
index 0000000..a23b36f
--- /dev/null
+++ b/testing/resources/bug_431824298.in
@@ -0,0 +1,118 @@
+{{header}}
+{{object 1 0}} <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+endobj
+{{object 2 0}} <<
+  /Type /Pages
+  /Count 1
+  /Kids [3 0 R]
+>>
+endobj
+{{object 3 0}} <<
+  /Type /Page
+  /Parent 2 0 R
+  /Contents 4 0 R
+  /MediaBox [0 0 200 300]
+  /Resources <<
+    /Font <<
+      /F1 5 0 R
+      /F2 6 0 R
+    >>
+  >>
+>>
+endobj
+{{object 4 0}} <<
+  {{streamlen}}
+>>
+stream
+BT
+/F1 12 Tf
+20 80 Td
+[(-hello-)] TJ
+ET
+BT
+/F1 12 Tf
+20 50 Td
+[(-world-)] TJ
+ET
+BT
+/F2 12 Tf
+20 30 Td
+[<064F216E>] TJ
+ET
+endstream
+endobj
+{{object 5 0}} <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Helvetica
+>>
+endobj
+{{object 6 0}} <<
+  /Type /Font
+  /Subtype /Type0
+  /BaseFont /BCDEEE+SimSun
+  /DescendantFonts [7 0 R]
+  /Encoding /Identity-H
+  /ToUnicode 9 0 R
+>>
+endobj
+{{object 7 0}} <<
+  /Type /Font
+  /Subtype /CIDFontType2
+  /BaseFont /BCDEEE+SimSun
+  /CIDToGIDMap /Identity
+  /DW 1000
+  /FontDescriptor 8 0 R
+>>
+endobj
+{{object 8 0}} <<
+  /Type /FontDescriptor
+  /Ascent 859
+  /AvgWidth 500
+  /CapHeight 859
+  /Descent -141
+  /Flags 32
+  /FontBBox [-8 -141 1000 859]
+  /FontName /BCDEEE+SimSun
+  /FontWeight 400
+  /ItalicAngle 0
+  /Leading 141
+  /MaxWidth 1008
+  /StemV 50
+  /XHeight 250
+>>
+endobj
+{{object 9 0}} <<
+  {{streamlen}}
+>>
+stream
+/CIDInit /ProcSet findresource begin
+28 dict begin
+begincmap
+/CIDSystemInfo <<
+/Registry (Adobe)
+/Ordering (UCS)
+/Supplement 0
+>> def
+/CMapName /Adobe-Identity-UCS def
+/CMapType 2 def
+1 begincodespacerange
+<0000> <FFFF>
+endcodespacerange
+2 beginbfchar
+<064F> <501F>
+<216E> <6B3E>
+endbfchar
+endcmap
+CMapName currentdict /CMap defineresource pop
+end
+end
+endstream
+endobj
+{{xref}}
+{{trailer}}
+{{startxref}}
+%%EOF
diff --git a/testing/resources/bug_431824298.pdf b/testing/resources/bug_431824298.pdf
new file mode 100644
index 0000000..1ede830
--- /dev/null
+++ b/testing/resources/bug_431824298.pdf
@@ -0,0 +1,134 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+endobj
+2 0 obj <<
+  /Type /Pages
+  /Count 1
+  /Kids [3 0 R]
+>>
+endobj
+3 0 obj <<
+  /Type /Page
+  /Parent 2 0 R
+  /Contents 4 0 R
+  /MediaBox [0 0 200 300]
+  /Resources <<
+    /Font <<
+      /F1 5 0 R
+      /F2 6 0 R
+    >>
+  >>
+>>
+endobj
+4 0 obj <<
+  /Length 120
+>>
+stream
+BT
+/F1 12 Tf
+20 80 Td
+[(-hello-)] TJ
+ET
+BT
+/F1 12 Tf
+20 50 Td
+[(-world-)] TJ
+ET
+BT
+/F2 12 Tf
+20 30 Td
+[<064F216E>] TJ
+ET
+endstream
+endobj
+5 0 obj <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Helvetica
+>>
+endobj
+6 0 obj <<
+  /Type /Font
+  /Subtype /Type0
+  /BaseFont /BCDEEE+SimSun
+  /DescendantFonts [7 0 R]
+  /Encoding /Identity-H
+  /ToUnicode 9 0 R
+>>
+endobj
+7 0 obj <<
+  /Type /Font
+  /Subtype /CIDFontType2
+  /BaseFont /BCDEEE+SimSun
+  /CIDToGIDMap /Identity
+  /DW 1000
+  /FontDescriptor 8 0 R
+>>
+endobj
+8 0 obj <<
+  /Type /FontDescriptor
+  /Ascent 859
+  /AvgWidth 500
+  /CapHeight 859
+  /Descent -141
+  /Flags 32
+  /FontBBox [-8 -141 1000 859]
+  /FontName /BCDEEE+SimSun
+  /FontWeight 400
+  /ItalicAngle 0
+  /Leading 141
+  /MaxWidth 1008
+  /StemV 50
+  /XHeight 250
+>>
+endobj
+9 0 obj <<
+  /Length 351
+>>
+stream
+/CIDInit /ProcSet findresource begin
+28 dict begin
+begincmap
+/CIDSystemInfo <<
+/Registry (Adobe)
+/Ordering (UCS)
+/Supplement 0
+>> def
+/CMapName /Adobe-Identity-UCS def
+/CMapType 2 def
+1 begincodespacerange
+<0000> <FFFF>
+endcodespacerange
+2 beginbfchar
+<064F> <501F>
+<216E> <6B3E>
+endbfchar
+endcmap
+CMapName currentdict /CMap defineresource pop
+end
+end
+endstream
+endobj
+xref
+0 10
+0000000000 65535 f 
+0000000015 00000 n 
+0000000068 00000 n 
+0000000131 00000 n 
+0000000299 00000 n 
+0000000472 00000 n 
+0000000548 00000 n 
+0000000698 00000 n 
+0000000845 00000 n 
+0000001117 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 10
+>>
+startxref
+1521
+%%EOF