Add a test case for an FPDFText_GetUnicode() regression.
https://pdfium-review.googlesource.com/80610 regressed text extraction
for bigtable-osdi06.pdf, so that CL was reverted in
https://pdfium-review.googlesource.com/83070. This CL adds a regression
test that uses a minimized version of bigtable-osdi06.pdf.
Change-Id: Ieff1cabfd0afffc3ad5d538427a766c55099b7d1
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/83072
Commit-Queue: Lei Zhang <thestig@chromium.org>
Reviewed-by: Hui Yingst <nigi@chromium.org>
diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index cc541cc..8db0969 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp
@@ -1660,3 +1660,28 @@
UnloadPage(page);
}
+
+TEST_F(FPDFTextEmbedderTest, BigtableTextExtraction) {  // Regression test: checks every value FPDFText_GetUnicode() returns for page 0 of bigtable_mini.pdf.
+ constexpr char kExpectedText[] =  // Expected output has a 0x02 character before "k" and a space after it; neither is spelled out in the fixture's content-stream strings.
+ "{fay,jeff,sanjay,wilsonh,kerr,m3b,tushar,\x02k es,gruber}@google.com";
+ constexpr int kExpectedTextCount = pdfium::size(kExpectedText) - 1;  // -1 drops the literal's terminating NUL (assumes pdfium::size() counts array elements -- confirm).
+
+ ASSERT_TRUE(OpenDocument("bigtable_mini.pdf"));  // Fixture added to testing/resources by this same change.
+ FPDF_PAGE page = LoadPage(0);
+ ASSERT_TRUE(page);
+
+ {  // Inner scope: text_page goes out of scope before UnloadPage(page) below.
+ ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
+ ASSERT_TRUE(text_page);
+ int char_count = FPDFText_CountChars(text_page.get());
+ ASSERT_GE(char_count, 0);  // Reject a negative count before comparing lengths.
+ ASSERT_EQ(kExpectedTextCount, char_count);  // Fatal assertion, so the per-character loop only runs when the lengths match.
+
+ for (int i = 0; i < kExpectedTextCount; ++i) {
+ EXPECT_EQ(static_cast<uint32_t>(kExpectedText[i]),  // Expected char converted to uint32_t for comparison with the FPDFText_GetUnicode() result.
+ FPDFText_GetUnicode(text_page.get(), i));
+ }
+ }
+
+ UnloadPage(page);
+}
diff --git a/testing/resources/bigtable_mini.in b/testing/resources/bigtable_mini.in
new file mode 100644
index 0000000..7e80992
--- /dev/null
+++ b/testing/resources/bigtable_mini.in
@@ -0,0 +1,113 @@
+{{header}}
+{{object 1 0}} <<
+ /Type /Catalog
+ /Pages 2 0 R
+>>
+endobj
+{{object 2 0}} <<
+ /Type /Pages
+ /Count 1
+ /Kids [3 0 R]
+>>
+endobj
+{{object 3 0}} <<
+ /Type /Page
+ /Parent 2 0 R
+ /Contents 4 0 R
+ /MediaBox [0 0 612 792]
+ /Resources <<
+ /ProcSet [/PDF /ImageB /Text]
+ /Font <<
+ /F1 5 0 R
+ /F2 6 0 R
+ >>
+ >>
+>>
+endobj
+{{object 4 0}} <<
+ {{streamlen}}
+>>
+stream
+q BT
+1 0 0 1 250 667 Tm
+/F2 8.96638 Tf
+-243.635 -15.84 Td
+(f)Tj
+/F1 8.96638 Tf
+4.55491 0 Td
+(f)Tj
+2.87476 0 Td
+(ay)Tj
+7.79253 0 Td
+(,jef)Tj
+11.506 0 Td
+(f,sanjay)Tj
+27.4558 0 Td
+(,wilsonh,k)Tj
+37.1801 0 Td
+(err)Tj
+9.58372 0 Td
+(,m3b,tushar)Tj
+41.8537 0 Td
+(,k)Tj
+11.6349 0 Td
+(es,gruber)Tj
+/F2 8.96638 Tf
+32.9694 0 Td
+(g)Tj
+/F1 8.96638 Tf
+4.55491 0 Td
+(@google.com)Tj
+ET Q
+endstream
+endobj
+{{object 5 0}} <<
+ /Type /Font
+ /Subtype /Type1
+ /BaseFont /Times-Roman
+>>
+endobj
+{{object 6 0}} <<
+ /Type /Font
+ /Subtype /Type1
+ /FirstChar 102
+ /BaseFont /RFSQHQ+CMSY9
+ /FontDescriptor 7 0 R
+ /LastChar 103
+ /Widths [508 508]
+>>
+endobj
+{{object 7 0}} <<
+ /Type /FontDescriptor
+ /Ascent 750
+ /CapHeight 750
+ /CharSet (/braceleft/braceright)
+ /Descent -250
+ /Flags 4
+ /FontBBox [0 -250 440 750]
+ /FontFile3 8 0 R
+ /FontName /RFSQHQ+CMSY9
+ /ItalicAngle 0
+ /StemV 65
+>>
+endobj
+{{object 8 0}} <<
+ /Subtype /Type1C
+ /Filter [/ASCII85Decode /FlateDecode]
+ {{streamlen}}
+>>
+stream
+GhQY<?t!MPA7Xa1EXNBL#h4%k%?@du#q5_5B)EgioL"p)pKs!j9@_A!X@n"A#^s/nr,0aY?!i,5R?>nA
+43BRuTX#t%4ekD2_c]pSd*g?I_(dmV-o3k<:Veup,U50*Ylr.i;$bHCc:fghe5L>1a\`<FkpTR<8hEdi
+.SEJ:>O%`N>>SLd>,:)GT9<BBa2#L+Pa9V1&Ao("^^P</!u&Ql7PhdY9T5#6IePGiOgOaNdLsRg)OHi+
+n+a.f[/4_7lnp:OXP)%6XER"WK`:C2*&F%pl[L]/LH0S#rJk:S[coEjaA0p=l`BI4BP[$LCn09862jKs
+Y)F7W^!5B(h.[kDUY*1W\e@l8mE%N?^8RN2qNUCce!bWPjLtF8=4Zg,R,1!:HR6^V/b\TIh0Z`11`hZZ
+*BO'Za1C[ph)dSm>aE#>k-A'_h4)!bqJD$jjj@/_bu*BN?.Ud@HV0Y&l1Vg$1F)e^]:6ER3IV(Tbj1;S
+BposQ0Df/MDiTcoYO4BSY\&*%<aJs1;J/:.>?>$UG7-pseF)R.T'a;@,PN5BX]sWQf<*o6H:RGZ*f^Pj
+qkd,*mVA#1lAJeH\%d!)GAm3[%_gAF5OB8mr>(pE^5_Fj[i?3Bq0X7/D)6E](`9a_gaUoK~>
+endstream
+endobj
+{{xref}}
+{{trailer}}
+{{startxref}}
+%%EOF
diff --git a/testing/resources/bigtable_mini.pdf b/testing/resources/bigtable_mini.pdf
new file mode 100644
index 0000000..adb3e02
--- /dev/null
+++ b/testing/resources/bigtable_mini.pdf
@@ -0,0 +1,128 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+ /Type /Catalog
+ /Pages 2 0 R
+>>
+endobj
+2 0 obj <<
+ /Type /Pages
+ /Count 1
+ /Kids [3 0 R]
+>>
+endobj
+3 0 obj <<
+ /Type /Page
+ /Parent 2 0 R
+ /Contents 4 0 R
+ /MediaBox [0 0 612 792]
+ /Resources <<
+ /ProcSet [/PDF /ImageB /Text]
+ /Font <<
+ /F1 5 0 R
+ /F2 6 0 R
+ >>
+ >>
+>>
+endobj
+4 0 obj <<
+ /Length 374
+>>
+stream
+q BT
+1 0 0 1 250 667 Tm
+/F2 8.96638 Tf
+-243.635 -15.84 Td
+(f)Tj
+/F1 8.96638 Tf
+4.55491 0 Td
+(f)Tj
+2.87476 0 Td
+(ay)Tj
+7.79253 0 Td
+(,jef)Tj
+11.506 0 Td
+(f,sanjay)Tj
+27.4558 0 Td
+(,wilsonh,k)Tj
+37.1801 0 Td
+(err)Tj
+9.58372 0 Td
+(,m3b,tushar)Tj
+41.8537 0 Td
+(,k)Tj
+11.6349 0 Td
+(es,gruber)Tj
+/F2 8.96638 Tf
+32.9694 0 Td
+(g)Tj
+/F1 8.96638 Tf
+4.55491 0 Td
+(@google.com)Tj
+ET Q
+endstream
+endobj
+5 0 obj <<
+ /Type /Font
+ /Subtype /Type1
+ /BaseFont /Times-Roman
+>>
+endobj
+6 0 obj <<
+ /Type /Font
+ /Subtype /Type1
+ /FirstChar 102
+ /BaseFont /RFSQHQ+CMSY9
+ /FontDescriptor 7 0 R
+ /LastChar 103
+ /Widths [508 508]
+>>
+endobj
+7 0 obj <<
+ /Type /FontDescriptor
+ /Ascent 750
+ /CapHeight 750
+ /CharSet (/braceleft/braceright)
+ /Descent -250
+ /Flags 4
+ /FontBBox [0 -250 440 750]
+ /FontFile3 8 0 R
+ /FontName /RFSQHQ+CMSY9
+ /ItalicAngle 0
+ /StemV 65
+>>
+endobj
+8 0 obj <<
+ /Subtype /Type1C
+ /Filter [/ASCII85Decode /FlateDecode]
+ /Length 640
+>>
+stream
+GhQY<?t!MPA7Xa1EXNBL#h4%k%?@du#q5_5B)EgioL"p)pKs!j9@_A!X@n"A#^s/nr,0aY?!i,5R?>nA
+43BRuTX#t%4ekD2_c]pSd*g?I_(dmV-o3k<:Veup,U50*Ylr.i;$bHCc:fghe5L>1a\`<FkpTR<8hEdi
+.SEJ:>O%`N>>SLd>,:)GT9<BBa2#L+Pa9V1&Ao("^^P</!u&Ql7PhdY9T5#6IePGiOgOaNdLsRg)OHi+
+n+a.f[/4_7lnp:OXP)%6XER"WK`:C2*&F%pl[L]/LH0S#rJk:S[coEjaA0p=l`BI4BP[$LCn09862jKs
+Y)F7W^!5B(h.[kDUY*1W\e@l8mE%N?^8RN2qNUCce!bWPjLtF8=4Zg,R,1!:HR6^V/b\TIh0Z`11`hZZ
+*BO'Za1C[ph)dSm>aE#>k-A'_h4)!bqJD$jjj@/_bu*BN?.Ud@HV0Y&l1Vg$1F)e^]:6ER3IV(Tbj1;S
+BposQ0Df/MDiTcoYO4BSY\&*%<aJs1;J/:.>?>$UG7-pseF)R.T'a;@,PN5BX]sWQf<*o6H:RGZ*f^Pj
+qkd,*mVA#1lAJeH\%d!)GAm3[%_gAF5OB8mr>(pE^5_Fj[i?3Bq0X7/D)6E](`9a_gaUoK~>
+endstream
+endobj
+xref
+0 9
+0000000000 65535 f
+0000000015 00000 n
+0000000068 00000 n
+0000000131 00000 n
+0000000333 00000 n
+0000000759 00000 n
+0000000837 00000 n
+0000000993 00000 n
+0000001234 00000 n
+trailer <<
+ /Root 1 0 R
+ /Size 9
+>>
+startxref
+1985
+%%EOF