Fix FPDFText_GetText() handling with invalid characters
Due to the changes in [1], what is now CPDF_TextPage::Init() has a
slightly different notion of what is "normal" compared to
AddCharInfoByLRDirection() and AddCharInfoByRLDirection(): Init() also
looks at whether the Unicode value and char code are zero, whereas the
other two only check IsControlChar(). Fix this discrepancy by adding
IsNormalCharacter() and using it from all 3 call sites. Then update
tests to reflect this change.
[1] https://pdfium-review.googlesource.com/27051
Bug: 425244539
Change-Id: Iedcba804ff3a210b5295f27daaee319c8725c159
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/133551
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp
index dbc3053..1051c9f 100644
--- a/core/fpdftext/cpdf_textpage.cpp
+++ b/core/fpdftext/cpdf_textpage.cpp
@@ -151,6 +151,11 @@
return c == 0x2D || c == 0xAD;
}
+bool IsNormalCharacter(const CPDF_TextPage::CharInfo& char_info) {  // Single shared test for whether a character counts as normal.
+ return char_info.unicode() != 0 ? !IsControlChar(char_info)  // Non-zero Unicode: normal unless IsControlChar() flags it.
+ : char_info.char_code() != 0;  // Zero Unicode: normal only when the char code is non-zero.
+}
+
bool IsRectIntersect(const CFX_FloatRect& rect1, const CFX_FloatRect& rect2) {
CFX_FloatRect rect = rect1;
rect.Intersect(rect2);
@@ -383,8 +388,7 @@
for (int i = 0; i < nCount; ++i) {
const CharInfo& charinfo = char_list_[i];
if (charinfo.char_type() == CharType::kGenerated ||
- (charinfo.unicode() != 0 && !IsControlChar(charinfo)) ||
- (charinfo.unicode() == 0 && charinfo.char_code() != 0)) {
+ IsNormalCharacter(charinfo)) {
char_indices_.back().count++;
skipped = true;
} else {
@@ -781,7 +785,7 @@
void CPDF_TextPage::AddCharInfoByLRDirection(wchar_t wChar,
const CharInfo& info) {
- if (IsControlChar(info)) {
+ if (!IsNormalCharacter(info)) {
char_list_.push_back(info);
return;
}
@@ -806,7 +810,7 @@
void CPDF_TextPage::AddCharInfoByRLDirection(wchar_t wChar,
const CharInfo& info) {
- if (IsControlChar(info)) {
+ if (!IsNormalCharacter(info)) {
char_list_.push_back(info);
return;
}
diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index dd0c819..34dc9c8 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp
@@ -2250,9 +2250,8 @@
}
TEST_F(FPDFTextEmbedderTest, Bug425244539) {
- // TODO(crbug.com/425244539): This should contain the characters in "hello".
static constexpr std::array<unsigned short, 6> kExpectedChars = {
- 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0xfffe, 0};
+ 'h', 'e', 'l', 'l', 'o', 0};
ASSERT_TRUE(OpenDocument("bug_425244539.pdf"));
ScopedPage page = LoadScopedPage(0);
@@ -2270,15 +2269,13 @@
ScopedFPDFWideString hello = GetFPDFWideString(L"hello");
- // TODO(crbug.com/425244539): This should be able to find "hello".
ScopedFPDFTextFind search(
FPDFText_FindStart(textpage.get(), hello.get(), 0, 0));
EXPECT_TRUE(search);
EXPECT_EQ(22, FPDFText_GetSchResultIndex(search.get()));
EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
- // Advancing finds nothing.
- EXPECT_FALSE(FPDFText_FindNext(search.get()));
+ EXPECT_TRUE(FPDFText_FindNext(search.get()));
EXPECT_EQ(22, FPDFText_GetSchResultIndex(search.get()));
- EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
+ EXPECT_EQ(5, FPDFText_GetSchCount(search.get()));
}