Add experimental FPDFText_IsGenerated() API.

Some characters inside an FPDF_TEXTPAGE are generated by PDFium and are
not part of the actual content stream. These are usually carriage
returns and line feeds. Add an API to expose whether a character is
generated or not.

Bug: pdfium:1719
Change-Id: Ibb6ec198aed87a74a5d128edad7f671aeae86a8e
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/99830
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
diff --git a/fpdfsdk/fpdf_text.cpp b/fpdfsdk/fpdf_text.cpp
index da00c47..bdd9fd3 100644
--- a/fpdfsdk/fpdf_text.cpp
+++ b/fpdfsdk/fpdf_text.cpp
@@ -71,6 +71,16 @@
   return charinfo.m_Unicode;
 }
 
+FPDF_EXPORT int FPDF_CALLCONV FPDFText_IsGenerated(FPDF_TEXTPAGE text_page,
+                                                   int index) {
+  CPDF_TextPage* textpage = GetTextPageForValidIndex(text_page, index);
+  if (!textpage)
+    return -1;  // Error: e.g. null text page or out-of-range index.
+
+  const CPDF_TextPage::CharInfo& charinfo = textpage->GetCharInfo(index);
+  return charinfo.m_CharType == CPDF_TextPage::CharType::kGenerated ? 1 : 0;
+}
+
 FPDF_EXPORT double FPDF_CALLCONV FPDFText_GetFontSize(FPDF_TEXTPAGE text_page,
                                                       int index) {
   CPDF_TextPage* textpage = GetTextPageForValidIndex(text_page, index);
diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index 54e5d26..0fe0a99 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp
@@ -952,6 +952,37 @@
   UnloadPage(page);
 }
 
+TEST_F(FPDFTextEmbedderTest, IsGenerated) {
+  ASSERT_TRUE(OpenDocument("hello_world.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);
+
+  {  // Scope |textpage| so it is destroyed before UnloadPage().
+    ScopedFPDFTextPage textpage(FPDFText_LoadPage(page));
+    ASSERT_TRUE(textpage);
+
+    EXPECT_EQ(static_cast<unsigned int>('H'),
+              FPDFText_GetUnicode(textpage.get(), 0));
+    EXPECT_EQ(0, FPDFText_IsGenerated(textpage.get(), 0));  // Not generated.
+    EXPECT_EQ(static_cast<unsigned int>(' '),
+              FPDFText_GetUnicode(textpage.get(), 6));
+    EXPECT_EQ(0, FPDFText_IsGenerated(textpage.get(), 6));  // Not generated.
+
+    EXPECT_EQ(static_cast<unsigned int>('\r'),
+              FPDFText_GetUnicode(textpage.get(), 13));
+    EXPECT_EQ(1, FPDFText_IsGenerated(textpage.get(), 13));  // Generated.
+    EXPECT_EQ(static_cast<unsigned int>('\n'),
+              FPDFText_GetUnicode(textpage.get(), 14));
+    EXPECT_EQ(1, FPDFText_IsGenerated(textpage.get(), 14));  // Generated.
+
+    EXPECT_EQ(-1, FPDFText_IsGenerated(textpage.get(), -1));  // Negative index.
+    EXPECT_EQ(-1, FPDFText_IsGenerated(textpage.get(), kHelloGoodbyeTextSize));
+    EXPECT_EQ(-1, FPDFText_IsGenerated(nullptr, 6));  // Null text page.
+  }
+
+  UnloadPage(page);
+}
+
 TEST_F(FPDFTextEmbedderTest, Bug_921) {
   ASSERT_TRUE(OpenDocument("bug_921.pdf"));
   FPDF_PAGE page = LoadPage(0);
diff --git a/fpdfsdk/fpdf_view_c_api_test.c b/fpdfsdk/fpdf_view_c_api_test.c
index 0f656ae..23586c4 100644
--- a/fpdfsdk/fpdf_view_c_api_test.c
+++ b/fpdfsdk/fpdf_view_c_api_test.c
@@ -427,6 +427,7 @@
     CHK(FPDFText_GetText);
     CHK(FPDFText_GetTextRenderMode);
     CHK(FPDFText_GetUnicode);
+    CHK(FPDFText_IsGenerated);
     CHK(FPDFText_LoadPage);
 
     // fpdf_thumbnail.h
diff --git a/public/fpdf_text.h b/public/fpdf_text.h
index 65604d8..22f25e9 100644
--- a/public/fpdf_text.h
+++ b/public/fpdf_text.h
@@ -74,6 +74,21 @@
 FPDF_EXPORT unsigned int FPDF_CALLCONV
 FPDFText_GetUnicode(FPDF_TEXTPAGE text_page, int index);
 
+// Experimental API.
+// Function: FPDFText_IsGenerated
+//          Determine whether a character in a page is generated by PDFium.
+// Parameters:
+//          text_page   -   Handle to a text page information structure.
+//                          Returned by FPDFText_LoadPage function.
+//          index       -   Zero-based index of the character.
+// Return value:
+//          1 if the character is generated by PDFium.
+//          0 if the character is not generated by PDFium.
+//          -1 on error, e.g. a NULL text_page or an out-of-range index.
+//
+FPDF_EXPORT int FPDF_CALLCONV
+FPDFText_IsGenerated(FPDF_TEXTPAGE text_page, int index);
+
 // Function: FPDFText_GetFontSize
 //          Get the font size of a particular character.
 // Parameters: