Fix FPDFText_GetText() handling of invalid characters

Due to the changes in [1], what is now CPDF_TextPage::Init() has a
slightly different notion of what is "normal" compared to
AddCharInfoByLRDirection() and AddCharInfoByRLDirection(): Init() also
treats a character whose Unicode value is 0 as normal when its char
code is non-zero, whereas the other two only check IsControlChar().
Fix this discrepancy by adding IsNormalCharacter() and using it from
all 3 call sites. Then update tests to reflect this change.

[1] https://pdfium-review.googlesource.com/27051

Bug: 425244539
Change-Id: Iedcba804ff3a210b5295f27daaee319c8725c159
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/133551
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp
index dbc3053..1051c9f 100644
--- a/core/fpdftext/cpdf_textpage.cpp
+++ b/core/fpdftext/cpdf_textpage.cpp
@@ -151,6 +151,11 @@
   return c == 0x2D || c == 0xAD;
 }
 
+bool IsNormalCharacter(const CPDF_TextPage::CharInfo& char_info) {
+  return char_info.unicode() != 0 ? !IsControlChar(char_info)
+                                  : char_info.char_code() != 0;
+}
+
 bool IsRectIntersect(const CFX_FloatRect& rect1, const CFX_FloatRect& rect2) {
   CFX_FloatRect rect = rect1;
   rect.Intersect(rect2);
@@ -383,8 +388,7 @@
   for (int i = 0; i < nCount; ++i) {
     const CharInfo& charinfo = char_list_[i];
     if (charinfo.char_type() == CharType::kGenerated ||
-        (charinfo.unicode() != 0 && !IsControlChar(charinfo)) ||
-        (charinfo.unicode() == 0 && charinfo.char_code() != 0)) {
+        IsNormalCharacter(charinfo)) {
       char_indices_.back().count++;
       skipped = true;
     } else {
@@ -781,7 +785,7 @@
 
 void CPDF_TextPage::AddCharInfoByLRDirection(wchar_t wChar,
                                              const CharInfo& info) {
-  if (IsControlChar(info)) {
+  if (!IsNormalCharacter(info)) {
     char_list_.push_back(info);
     return;
   }
@@ -806,7 +810,7 @@
 
 void CPDF_TextPage::AddCharInfoByRLDirection(wchar_t wChar,
                                              const CharInfo& info) {
-  if (IsControlChar(info)) {
+  if (!IsNormalCharacter(info)) {
     char_list_.push_back(info);
     return;
   }
diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index dd0c819..34dc9c8 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp
@@ -2250,9 +2250,8 @@
 }
 
 TEST_F(FPDFTextEmbedderTest, Bug425244539) {
-  // TODO(crbug.com/425244539): This should contain the characters in "hello".
   static constexpr std::array<unsigned short, 6> kExpectedChars = {
-      0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0};
+      'h', 'e', 'l', 'l', 'o', 0};
 
   ASSERT_TRUE(OpenDocument("bug_425244539.pdf"));
   ScopedPage page = LoadScopedPage(0);
@@ -2270,15 +2269,13 @@
 
   ScopedFPDFWideString hello = GetFPDFWideString(L"hello");
 
-  // TODO(crbug.com/425244539): This should be able to find "hello".
   ScopedFPDFTextFind search(
       FPDFText_FindStart(textpage.get(), hello.get(), 0, 0));
   EXPECT_TRUE(search);
   EXPECT_EQ(22, FPDFText_GetSchResultIndex(search.get()));
   EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
 
-  // Advancing finds nothing.
-  EXPECT_FALSE(FPDFText_FindNext(search.get()));
+  EXPECT_TRUE(FPDFText_FindNext(search.get()));
   EXPECT_EQ(22, FPDFText_GetSchResultIndex(search.get()));
-  EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
+  EXPECT_EQ(5, FPDFText_GetSchCount(search.get()));
 }