Add FPDFText_HasUnicodeMapError() API.

PDFium already detects characters with invalid ToUnicode mappings. Plumb
this to a public API so embedders can use this info.

The newly added testing/resources/bug_1388_2.pdf file is generated from
testing/resources/pixel/bug_1388_2.in.

Bug: pdfium:1926
Change-Id: I9f2f8975b33e80cdbdb302821156f64bdc000f12
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/100211
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>

commit: 5b59f6d896126e83be428613588971d72af5fc88 [log] [tgz]
author: Lei Zhang <thestig@chromium.org> Fri Oct 28 23:08:29 2022 +0000
committer: Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com> Fri Oct 28 23:08:29 2022 +0000
tree: 6912139adf55b25ba4007495fb72bb98f57467d7
parent: b899fba4eada47b80058e9dff4811403a6f19744 [diff]
diff --git a/fpdfsdk/fpdf_text.cpp b/fpdfsdk/fpdf_text.cpp
index bdd9fd3..e2466a7 100644
--- a/fpdfsdk/fpdf_text.cpp
+++ b/fpdfsdk/fpdf_text.cpp

@@ -81,6 +81,16 @@
   return charinfo.m_CharType == CPDF_TextPage::CharType::kGenerated ? 1 : 0;
 }
 
+FPDF_EXPORT int FPDF_CALLCONV
+FPDFText_HasUnicodeMapError(FPDF_TEXTPAGE text_page, int index) {
+  CPDF_TextPage* textpage = GetTextPageForValidIndex(text_page, index);
+  if (!textpage)
+    return -1;
+
+  const CPDF_TextPage::CharInfo& charinfo = textpage->GetCharInfo(index);
+  return charinfo.m_CharType == CPDF_TextPage::CharType::kNotUnicode;
+}
+
 FPDF_EXPORT double FPDF_CALLCONV FPDFText_GetFontSize(FPDF_TEXTPAGE text_page,
                                                       int index) {
   CPDF_TextPage* textpage = GetTextPageForValidIndex(text_page, index);

diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index 0fe0a99..2e2c3fd 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp

@@ -983,6 +983,36 @@
   UnloadPage(page);
 }
 
+TEST_F(FPDFTextEmbedderTest, IsInvalidUnicode) {
+  ASSERT_TRUE(OpenDocument("bug_1388_2.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);
+
+  {
+    constexpr int kExpectedCharCount = 5;
+    ScopedFPDFTextPage textpage(FPDFText_LoadPage(page));
+    ASSERT_TRUE(textpage);
+    EXPECT_EQ(kExpectedCharCount, FPDFText_CountChars(textpage.get()));
+
+    EXPECT_EQ(static_cast<unsigned int>('X'),
+              FPDFText_GetUnicode(textpage.get(), 0));
+    EXPECT_EQ(0, FPDFText_HasUnicodeMapError(textpage.get(), 0));
+    EXPECT_EQ(static_cast<unsigned int>(' '),
+              FPDFText_GetUnicode(textpage.get(), 1));
+    EXPECT_EQ(0, FPDFText_HasUnicodeMapError(textpage.get(), 1));
+
+    EXPECT_EQ(31u, FPDFText_GetUnicode(textpage.get(), 2));
+    EXPECT_EQ(1, FPDFText_HasUnicodeMapError(textpage.get(), 2));
+
+    EXPECT_EQ(-1, FPDFText_HasUnicodeMapError(textpage.get(), -1));
+    EXPECT_EQ(-1,
+              FPDFText_HasUnicodeMapError(textpage.get(), kExpectedCharCount));
+    EXPECT_EQ(-1, FPDFText_HasUnicodeMapError(nullptr, 0));
+  }
+
+  UnloadPage(page);
+}
+
 TEST_F(FPDFTextEmbedderTest, Bug_921) {
   ASSERT_TRUE(OpenDocument("bug_921.pdf"));
   FPDF_PAGE page = LoadPage(0);

diff --git a/fpdfsdk/fpdf_view_c_api_test.c b/fpdfsdk/fpdf_view_c_api_test.c
index 23586c4..3947bd3 100644
--- a/fpdfsdk/fpdf_view_c_api_test.c
+++ b/fpdfsdk/fpdf_view_c_api_test.c

@@ -427,6 +427,7 @@
     CHK(FPDFText_GetText);
     CHK(FPDFText_GetTextRenderMode);
     CHK(FPDFText_GetUnicode);
+    CHK(FPDFText_HasUnicodeMapError);
     CHK(FPDFText_IsGenerated);
     CHK(FPDFText_LoadPage);
 

diff --git a/public/fpdf_text.h b/public/fpdf_text.h
index 22f25e9..fb5f342 100644
--- a/public/fpdf_text.h
+++ b/public/fpdf_text.h

@@ -89,6 +89,21 @@
 FPDF_EXPORT int FPDF_CALLCONV
 FPDFText_IsGenerated(FPDF_TEXTPAGE text_page, int index);
 
+// Experimental API.
+// Function: FPDFText_HasUnicodeMapError
+//          Check whether a character in a page has an invalid Unicode mapping.
+// Parameters:
+//          text_page   -   Handle to a text page information structure.
+//                          Returned by FPDFText_LoadPage function.
+//          index       -   Zero-based index of the character.
+// Return value:
+//          1 if the character has an invalid unicode mapping.
+//          0 if the character has no known unicode mapping issues.
+//          -1 if there was an error.
+//
+FPDF_EXPORT int FPDF_CALLCONV
+FPDFText_HasUnicodeMapError(FPDF_TEXTPAGE text_page, int index);
+
 // Function: FPDFText_GetFontSize
 //          Get the font size of a particular character.
 // Parameters:

diff --git a/testing/resources/bug_1388_2.pdf b/testing/resources/bug_1388_2.pdf
new file mode 100644
index 0000000..1fee088
--- /dev/null
+++ b/testing/resources/bug_1388_2.pdf

@@ -0,0 +1,80 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+endobj
+2 0 obj <<
+  /Type /Pages
+  /Count 1
+  /Kids [3 0 R]
+>>
+endobj
+3 0 obj <<
+  /Type /Page
+  /Parent 2 0 R
+  /Contents 4 0 R
+  /MediaBox [0 0 200 200]
+  /Resources <<
+    /ProcSet [/PDF /Text]
+    /Font <<
+      /TT2 5 0 R
+    >>
+  >>
+>>
+endobj
+4 0 obj <<
+  /Length 40
+>>
+stream
+BT
+/TT2 12 Tf
+40 100 Td
+[(X  X)] TJ
+ET
+endstream
+endobj
+5 0 obj <<
+  /Type /Font
+  /Subtype /TrueType
+  /BaseFont /TimesNewRomanPSMT
+  /Encoding /WinAnsiEncoding
+  /FirstChar 31
+  /FontDescriptor 6 0 R
+  /LastChar 252
+>>
+endobj
+6 0 obj <<
+  /Type /FontDescriptor
+  /Ascent 891
+  /CapHeight 656
+  /Descent -216
+  /Flags 34
+  /FontBBox [-568 -307 2000 1007]
+  /FontFamily (Times New Roman)
+  /FontName /TimesNewRomanPSMT
+  /FontStretch /Normal
+  /FontWeight 400
+  /ItalicAngle 0
+  /MissingWidth 778
+  /StemV 82
+  /XHeight -546
+>>
+endobj
+xref
+0 7
+0000000000 65535 f 
+0000000015 00000 n 
+0000000068 00000 n 
+0000000131 00000 n 
+0000000310 00000 n 
+0000000401 00000 n 
+0000000573 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 7
+>>
+startxref
+880
+%%EOF
commit	5b59f6d896126e83be428613588971d72af5fc88	[log] [tgz]
author	Lei Zhang <thestig@chromium.org>	Fri Oct 28 23:08:29 2022 +0000
committer	Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com>	Fri Oct 28 23:08:29 2022 +0000
tree	6912139adf55b25ba4007495fb72bb98f57467d7
parent	b899fba4eada47b80058e9dff4811403a6f19744 [diff]