Fix FPDFText_GetText() handling with invalid characters

Due to the changes in [1], what is now CPDF_TextPage::Init() has a
slightly different notion of what is "normal" compared to
AddCharInfoByLRDirection() and AddCharInfoByRLDirection(). Fix this
discrepancy by adding IsNormalCharacter() and using it from all 3 call
sites. Then update tests to reflect this change.

[1] https://pdfium-review.googlesource.com/27051

Bug: 425244539
Change-Id: Iedcba804ff3a210b5295f27daaee319c8725c159
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/133551
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>

commit: f2d1266fe25e9f544556c7d07b994295c299aad2 [log] [tgz]
author: Lei Zhang <thestig@chromium.org> Tue Jun 24 13:21:13 2025 -0700
committer: Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com> Tue Jun 24 13:21:13 2025 -0700
tree: 4fe1164f20504fc51db0facd7130181dfdbb70e4
parent: e50e167c9258d2c015163003134c9f21ef0b665d [diff]
diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp
index dbc3053..1051c9f 100644
--- a/core/fpdftext/cpdf_textpage.cpp
+++ b/core/fpdftext/cpdf_textpage.cpp

@@ -151,6 +151,11 @@
   return c == 0x2D || c == 0xAD;
 }
 
+bool IsNormalCharacter(const CPDF_TextPage::CharInfo& char_info) {
+  return char_info.unicode() != 0 ? !IsControlChar(char_info)
+                                  : char_info.char_code() != 0;
+}
+
 bool IsRectIntersect(const CFX_FloatRect& rect1, const CFX_FloatRect& rect2) {
   CFX_FloatRect rect = rect1;
   rect.Intersect(rect2);
@@ -383,8 +388,7 @@
   for (int i = 0; i < nCount; ++i) {
     const CharInfo& charinfo = char_list_[i];
     if (charinfo.char_type() == CharType::kGenerated ||
-        (charinfo.unicode() != 0 && !IsControlChar(charinfo)) ||
-        (charinfo.unicode() == 0 && charinfo.char_code() != 0)) {
+        IsNormalCharacter(charinfo)) {
       char_indices_.back().count++;
       skipped = true;
     } else {
@@ -781,7 +785,7 @@
 
 void CPDF_TextPage::AddCharInfoByLRDirection(wchar_t wChar,
                                              const CharInfo& info) {
-  if (IsControlChar(info)) {
+  if (!IsNormalCharacter(info)) {
     char_list_.push_back(info);
     return;
   }
@@ -806,7 +810,7 @@
 
 void CPDF_TextPage::AddCharInfoByRLDirection(wchar_t wChar,
                                              const CharInfo& info) {
-  if (IsControlChar(info)) {
+  if (!IsNormalCharacter(info)) {
     char_list_.push_back(info);
     return;
   }

diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index dd0c819..34dc9c8 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp

@@ -2250,9 +2250,8 @@
 }
 
 TEST_F(FPDFTextEmbedderTest, Bug425244539) {
-  // TODO(crbug.com/425244539): This should contain the characters in "hello".
   static constexpr std::array<unsigned short, 6> kExpectedChars = {
-      0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0};
+      'h', 'e', 'l', 'l', 'o', 0};
 
   ASSERT_TRUE(OpenDocument("bug_425244539.pdf"));
   ScopedPage page = LoadScopedPage(0);
@@ -2270,15 +2269,13 @@
 
   ScopedFPDFWideString hello = GetFPDFWideString(L"hello");
 
-  // TODO(crbug.com/425244539): This should be able to find "hello".
   ScopedFPDFTextFind search(
       FPDFText_FindStart(textpage.get(), hello.get(), 0, 0));
   EXPECT_TRUE(search);
   EXPECT_EQ(22, FPDFText_GetSchResultIndex(search.get()));
   EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
 
-  // Advancing finds nothing.
-  EXPECT_FALSE(FPDFText_FindNext(search.get()));
+  EXPECT_TRUE(FPDFText_FindNext(search.get()));
   EXPECT_EQ(22, FPDFText_GetSchResultIndex(search.get()));
-  EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
+  EXPECT_EQ(5, FPDFText_GetSchCount(search.get()));
 }
commit	f2d1266fe25e9f544556c7d07b994295c299aad2	[log] [tgz]
author	Lei Zhang <thestig@chromium.org>	Tue Jun 24 13:21:13 2025 -0700
committer	Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com>	Tue Jun 24 13:21:13 2025 -0700
tree	4fe1164f20504fc51db0facd7130181dfdbb70e4
parent	e50e167c9258d2c015163003134c9f21ef0b665d [diff]