Add FPDFText_HasUnicodeMapError() API.
PDFium already detects characters with invalid ToUnicode mappings. Plumb
this to a public API so embedders can use this info.
The newly added testing/resources/bug_1388_2.pdf file is generated from
testing/resources/pixel/bug_1388_2.in.
Bug: pdfium:1926
Change-Id: I9f2f8975b33e80cdbdb302821156f64bdc000f12
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/100211
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
diff --git a/fpdfsdk/fpdf_text.cpp b/fpdfsdk/fpdf_text.cpp
index bdd9fd3..e2466a7 100644
--- a/fpdfsdk/fpdf_text.cpp
+++ b/fpdfsdk/fpdf_text.cpp
@@ -81,6 +81,16 @@
return charinfo.m_CharType == CPDF_TextPage::CharType::kGenerated ? 1 : 0;
}
+FPDF_EXPORT int FPDF_CALLCONV
+FPDFText_HasUnicodeMapError(FPDF_TEXTPAGE text_page, int index) {
+  CPDF_TextPage* textpage = GetTextPageForValidIndex(text_page, index);
+  if (!textpage)
+    return -1;  // Error: no text page is usable for this handle/index.
+
+  const CPDF_TextPage::CharInfo& charinfo = textpage->GetCharInfo(index);
+  return charinfo.m_CharType == CPDF_TextPage::CharType::kNotUnicode ? 1 : 0;
+}
+
FPDF_EXPORT double FPDF_CALLCONV FPDFText_GetFontSize(FPDF_TEXTPAGE text_page,
int index) {
CPDF_TextPage* textpage = GetTextPageForValidIndex(text_page, index);
diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index 0fe0a99..2e2c3fd 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp
@@ -983,6 +983,36 @@
UnloadPage(page);
}
+TEST_F(FPDFTextEmbedderTest, IsInvalidUnicode) {
+  ASSERT_TRUE(OpenDocument("bug_1388_2.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);
+
+  {
+    ScopedFPDFTextPage scoped_text(FPDFText_LoadPage(page));
+    ASSERT_TRUE(scoped_text);
+    FPDF_TEXTPAGE text = scoped_text.get();
+    constexpr int kCharCount = 5;
+    EXPECT_EQ(kCharCount, FPDFText_CountChars(text));
+    // Characters 0 and 1 are a plain 'X' and a space with no mapping error.
+    EXPECT_EQ(static_cast<unsigned int>('X'), FPDFText_GetUnicode(text, 0));
+    EXPECT_EQ(0, FPDFText_HasUnicodeMapError(text, 0));
+    EXPECT_EQ(static_cast<unsigned int>(' '), FPDFText_GetUnicode(text, 1));
+    EXPECT_EQ(0, FPDFText_HasUnicodeMapError(text, 1));
+
+    // Character 2 comes back as code 31 and is flagged as a mapping error.
+    EXPECT_EQ(31u, FPDFText_GetUnicode(text, 2));
+    EXPECT_EQ(1, FPDFText_HasUnicodeMapError(text, 2));
+
+    // Out-of-range indices and a null handle must all report -1.
+    EXPECT_EQ(-1, FPDFText_HasUnicodeMapError(text, -1));
+    EXPECT_EQ(-1, FPDFText_HasUnicodeMapError(text, kCharCount));
+    EXPECT_EQ(-1, FPDFText_HasUnicodeMapError(nullptr, 0));
+  }
+
+  UnloadPage(page);
+}
+
TEST_F(FPDFTextEmbedderTest, Bug_921) {
ASSERT_TRUE(OpenDocument("bug_921.pdf"));
FPDF_PAGE page = LoadPage(0);
diff --git a/fpdfsdk/fpdf_view_c_api_test.c b/fpdfsdk/fpdf_view_c_api_test.c
index 23586c4..3947bd3 100644
--- a/fpdfsdk/fpdf_view_c_api_test.c
+++ b/fpdfsdk/fpdf_view_c_api_test.c
@@ -427,6 +427,7 @@
CHK(FPDFText_GetText);
CHK(FPDFText_GetTextRenderMode);
CHK(FPDFText_GetUnicode);
+ CHK(FPDFText_HasUnicodeMapError);
CHK(FPDFText_IsGenerated);
CHK(FPDFText_LoadPage);
diff --git a/public/fpdf_text.h b/public/fpdf_text.h
index 22f25e9..fb5f342 100644
--- a/public/fpdf_text.h
+++ b/public/fpdf_text.h
@@ -89,6 +89,21 @@
FPDF_EXPORT int FPDF_CALLCONV
FPDFText_IsGenerated(FPDF_TEXTPAGE text_page, int index);
+// Experimental API.
+// Function: FPDFText_HasUnicodeMapError
+// Check whether a character in a page has an invalid unicode mapping.
+// Parameters:
+// text_page - Handle to a text page information structure.
+// Returned by FPDFText_LoadPage function.
+// index - Zero-based index of the character.
+// Return value:
+// 1 if the character has an invalid unicode mapping.
+// 0 if the character has no known unicode mapping issues.
+// -1 on error, e.g. a null text_page or an out-of-range index.
+//
+FPDF_EXPORT int FPDF_CALLCONV
+FPDFText_HasUnicodeMapError(FPDF_TEXTPAGE text_page, int index);
+
// Function: FPDFText_GetFontSize
// Get the font size of a particular character.
// Parameters:
diff --git a/testing/resources/bug_1388_2.pdf b/testing/resources/bug_1388_2.pdf
new file mode 100644
index 0000000..1fee088
--- /dev/null
+++ b/testing/resources/bug_1388_2.pdf
@@ -0,0 +1,80 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+ /Type /Catalog
+ /Pages 2 0 R
+>>
+endobj
+2 0 obj <<
+ /Type /Pages
+ /Count 1
+ /Kids [3 0 R]
+>>
+endobj
+3 0 obj <<
+ /Type /Page
+ /Parent 2 0 R
+ /Contents 4 0 R
+ /MediaBox [0 0 200 200]
+ /Resources <<
+ /ProcSet [/PDF /Text]
+ /Font <<
+ /TT2 5 0 R
+ >>
+ >>
+>>
+endobj
+4 0 obj <<
+ /Length 40
+>>
+stream
+BT
+/TT2 12 Tf
+40 100 Td
+[(X X)] TJ
+ET
+endstream
+endobj
+5 0 obj <<
+ /Type /Font
+ /Subtype /TrueType
+ /BaseFont /TimesNewRomanPSMT
+ /Encoding /WinAnsiEncoding
+ /FirstChar 31
+ /FontDescriptor 6 0 R
+ /LastChar 252
+>>
+endobj
+6 0 obj <<
+ /Type /FontDescriptor
+ /Ascent 891
+ /CapHeight 656
+ /Descent -216
+ /Flags 34
+ /FontBBox [-568 -307 2000 1007]
+ /FontFamily (Times New Roman)
+ /FontName /TimesNewRomanPSMT
+ /FontStretch /Normal
+ /FontWeight 400
+ /ItalicAngle 0
+ /MissingWidth 778
+ /StemV 82
+ /XHeight -546
+>>
+endobj
+xref
+0 7
+0000000000 65535 f
+0000000015 00000 n
+0000000068 00000 n
+0000000131 00000 n
+0000000310 00000 n
+0000000401 00000 n
+0000000573 00000 n
+trailer <<
+ /Root 1 0 R
+ /Size 7
+>>
+startxref
+880
+%%EOF