Current font need not contain part of ActualText When handling marked content there is currently a check that if the current font does not have a mapping for any of the code points in an ActualText then the ActualText is ignored. This check has been in the code since the initial code drop. This undermines part of the purpose of ActualText, such as providing the original code points even if there was no font which represented them. In addition, it is not clear why this behavior would be wanted. One possibility is an attempt to reduce phishing where the copied (or accessible) content is intentionally different from what is drawn on the page (especially with respect to Unicode confusables). However, this is generally malicious content which could easily work around such a check by simply adding mappings in its fonts. Any benefit of ignoring such ActualText has the downside of incorrectly handling real and intentional uses like when multiple code points map to a single glyph in a font. Remove this check and add a test which replaces English words with Chinese replacements. This test fails before this change as the default test font does not contain both sets of code points. Bug: 420508260 Change-Id: I4d82b603e5bda8ed13614e6ba3a4ce0c8f8f0d62 Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/132710 Reviewed-by: Lei Zhang <thestig@chromium.org> Reviewed-by: Ben Wagner <bungeman@google.com> Commit-Queue: Ben Wagner <bungeman@google.com>
diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp index 2385df3..cbb2b40 100644 --- a/core/fpdftext/cpdf_textpage.cpp +++ b/core/fpdftext/cpdf_textpage.cpp
@@ -980,19 +980,6 @@ return MarkedContentState::kPass; } - RetainPtr<CPDF_Font> font = pTextObj->GetFont(); - bExist = false; - for (size_t i = 0; i < actual_text.GetLength(); ++i) { - if (font->CharCodeFromUnicode(actual_text[i]) != - CPDF_Font::kInvalidCharCode) { - bExist = true; - break; - } - } - if (!bExist) { - return MarkedContentState::kPass; - } - bExist = false; for (size_t i = 0; i < actual_text.GetLength(); ++i) { wchar_t wChar = actual_text[i];
diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp index ac17ade..f863e82 100644 --- a/fpdfsdk/fpdf_text_embeddertest.cpp +++ b/fpdfsdk/fpdf_text_embeddertest.cpp
@@ -2175,6 +2175,24 @@ ElementsAreArray(kExpected)); } +TEST_F(FPDFTextEmbedderTest, Bug420508260) { + ASSERT_TRUE(OpenDocument("bug_420508260.pdf")); + ScopedPage page = LoadScopedPage(0); + ASSERT_TRUE(page); + + ScopedFPDFTextPage textpage(FPDFText_LoadPage(page.get())); + ASSERT_TRUE(textpage); + + static constexpr wchar_t kExpected[] = L"What is 我的 favorite 食物?"; + // Includes trailing NUL character. + static constexpr int kExpectedSize = std::size(kExpected); + unsigned short buffer[256] = {}; + EXPECT_EQ(kExpectedSize, + FPDFText_GetText(textpage.get(), 0, std::size(buffer), buffer)); + EXPECT_THAT(pdfium::span(buffer).first<kExpectedSize>(), + ElementsAreArray(kExpected)); +} + TEST_F(FPDFTextEmbedderTest, TextObjectSetIsActive) { ASSERT_TRUE(OpenDocument("hello_world.pdf")); ScopedPage page = LoadScopedPage(0);
diff --git a/testing/resources/bug_420508260.in b/testing/resources/bug_420508260.in new file mode 100644 index 0000000..83e77e9 --- /dev/null +++ b/testing/resources/bug_420508260.in
@@ -0,0 +1,53 @@ +{{header}} +{{object 1 0}} << + /Type /Catalog + /Pages 2 0 R +>> +endobj +{{object 2 0}} << + /Type /Pages + /Count 1 + /Kids [3 0 R] +>> +endobj +{{object 3 0}} << + /Type /Page + /Contents 4 0 R + /Parent 2 0 R + /MediaBox [0 0 200 200] + /Resources << + /Font << + /F1 5 0 R + >> + >> +>> +endobj +{{object 4 0}} << + {{streamlen}} +>> +stream +BT +/F1 12 Tf +1 0 0 1 20 100 Tm +(What is) Tj +/Span<</ActualText <feff62117684> >> BDC +50 0 Td (your) Tj +EMC +30 0 Td (favorite) Tj +/Span<</ActualText <feff98df7269> >> BDC +50 0 Td (color) Tj +EMC +(?) Tj +ET +endstream +endobj +{{object 5 0}} << + /Type /Font + /Subtype /Type1 + /BaseFont /Helvetica +>> +endobj +{{xref}} +{{trailer}} +{{startxref}} +%%EOF
diff --git a/testing/resources/bug_420508260.pdf b/testing/resources/bug_420508260.pdf new file mode 100644 index 0000000..3c48a00 --- /dev/null +++ b/testing/resources/bug_420508260.pdf
@@ -0,0 +1,65 @@ +%PDF-1.7 +% ò¤ô +1 0 obj << + /Type /Catalog + /Pages 2 0 R +>> +endobj +2 0 obj << + /Type /Pages + /Count 1 + /Kids [3 0 R] +>> +endobj +3 0 obj << + /Type /Page + /Contents 4 0 R + /Parent 2 0 R + /MediaBox [0 0 200 200] + /Resources << + /Font << + /F1 5 0 R + >> + >> +>> +endobj +4 0 obj << + /Length 202 +>> +stream +BT +/F1 12 Tf +1 0 0 1 20 100 Tm +(What is) Tj +/Span<</ActualText <feff62117684> >> BDC +50 0 Td (your) Tj +EMC +30 0 Td (favorite) Tj +/Span<</ActualText <feff98df7269> >> BDC +50 0 Td (color) Tj +EMC +(?) Tj +ET +endstream +endobj +5 0 obj << + /Type /Font + /Subtype /Type1 + /BaseFont /Helvetica +>> +endobj +xref +0 6 +0000000000 65535 f +0000000015 00000 n +0000000068 00000 n +0000000131 00000 n +0000000283 00000 n +0000000538 00000 n +trailer << + /Root 1 0 R + /Size 6 +>> +startxref +614 +%%EOF