Current font need not contain part of ActualText
When handling marked content there is currently a check that if the
current font does not have a mapping for any of the code points in an
ActualText then the ActualText is ignored. This check has been in the
code since the initial code drop. This undermines part of the purpose of
ActualText, such as providing the original code points even if there was
no font which represented them.
In addition, it is not clear why this behavior would be wanted. One
possibility is an attempt to reduce phishing where the copied (or
accessible) content is intentionally different from what is drawn on the
page (especially with respect to Unicode confusables). However, this is
generally malicious content which could easily work around such a check
by simply adding mappings in its fonts. Any benefit of ignoring such
ActualText has the downside of incorrectly handling real and intentional
uses like when multiple code points map to a single glyph in a font.
Remove this check and add a test which replaces English words with
Chinese replacements. This test fails before this change as the default
test font does not contain both sets of code points.
Bug: 420508260
Change-Id: I4d82b603e5bda8ed13614e6ba3a4ce0c8f8f0d62
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/132710
Reviewed-by: Lei Zhang <thestig@chromium.org>
Reviewed-by: Ben Wagner <bungeman@google.com>
Commit-Queue: Ben Wagner <bungeman@google.com>
diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp
index 2385df3..cbb2b40 100644
--- a/core/fpdftext/cpdf_textpage.cpp
+++ b/core/fpdftext/cpdf_textpage.cpp
@@ -980,19 +980,6 @@
return MarkedContentState::kPass;
}
- RetainPtr<CPDF_Font> font = pTextObj->GetFont();
- bExist = false;
- for (size_t i = 0; i < actual_text.GetLength(); ++i) {
- if (font->CharCodeFromUnicode(actual_text[i]) !=
- CPDF_Font::kInvalidCharCode) {
- bExist = true;
- break;
- }
- }
- if (!bExist) {
- return MarkedContentState::kPass;
- }
-
bExist = false;
for (size_t i = 0; i < actual_text.GetLength(); ++i) {
wchar_t wChar = actual_text[i];
diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index ac17ade..f863e82 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp
@@ -2175,6 +2175,24 @@
ElementsAreArray(kExpected));
}
+TEST_F(FPDFTextEmbedderTest, Bug420508260) {
+ ASSERT_TRUE(OpenDocument("bug_420508260.pdf"));
+ ScopedPage page = LoadScopedPage(0);
+ ASSERT_TRUE(page);
+
+ ScopedFPDFTextPage textpage(FPDFText_LoadPage(page.get()));
+ ASSERT_TRUE(textpage);
+
+ static constexpr wchar_t kExpected[] = L"What is 我的 favorite 食物?";
+ // Includes trailing NUL character.
+ static constexpr int kExpectedSize = std::size(kExpected);
+ unsigned short buffer[256] = {};
+ EXPECT_EQ(kExpectedSize,
+ FPDFText_GetText(textpage.get(), 0, std::size(buffer), buffer));
+ EXPECT_THAT(pdfium::span(buffer).first<kExpectedSize>(),
+ ElementsAreArray(kExpected));
+}
+
TEST_F(FPDFTextEmbedderTest, TextObjectSetIsActive) {
ASSERT_TRUE(OpenDocument("hello_world.pdf"));
ScopedPage page = LoadScopedPage(0);
diff --git a/testing/resources/bug_420508260.in b/testing/resources/bug_420508260.in
new file mode 100644
index 0000000..83e77e9
--- /dev/null
+++ b/testing/resources/bug_420508260.in
@@ -0,0 +1,53 @@
+{{header}}
+{{object 1 0}} <<
+ /Type /Catalog
+ /Pages 2 0 R
+>>
+endobj
+{{object 2 0}} <<
+ /Type /Pages
+ /Count 1
+ /Kids [3 0 R]
+>>
+endobj
+{{object 3 0}} <<
+ /Type /Page
+ /Contents 4 0 R
+ /Parent 2 0 R
+ /MediaBox [0 0 200 200]
+ /Resources <<
+ /Font <<
+ /F1 5 0 R
+ >>
+ >>
+>>
+endobj
+{{object 4 0}} <<
+ {{streamlen}}
+>>
+stream
+BT
+/F1 12 Tf
+1 0 0 1 20 100 Tm
+(What is) Tj
+/Span<</ActualText <feff62117684> >> BDC
+50 0 Td (your) Tj
+EMC
+30 0 Td (favorite) Tj
+/Span<</ActualText <feff98df7269> >> BDC
+50 0 Td (color) Tj
+EMC
+(?) Tj
+ET
+endstream
+endobj
+{{object 5 0}} <<
+ /Type /Font
+ /Subtype /Type1
+ /BaseFont /Helvetica
+>>
+endobj
+{{xref}}
+{{trailer}}
+{{startxref}}
+%%EOF
diff --git a/testing/resources/bug_420508260.pdf b/testing/resources/bug_420508260.pdf
new file mode 100644
index 0000000..3c48a00
--- /dev/null
+++ b/testing/resources/bug_420508260.pdf
@@ -0,0 +1,65 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+ /Type /Catalog
+ /Pages 2 0 R
+>>
+endobj
+2 0 obj <<
+ /Type /Pages
+ /Count 1
+ /Kids [3 0 R]
+>>
+endobj
+3 0 obj <<
+ /Type /Page
+ /Contents 4 0 R
+ /Parent 2 0 R
+ /MediaBox [0 0 200 200]
+ /Resources <<
+ /Font <<
+ /F1 5 0 R
+ >>
+ >>
+>>
+endobj
+4 0 obj <<
+ /Length 202
+>>
+stream
+BT
+/F1 12 Tf
+1 0 0 1 20 100 Tm
+(What is) Tj
+/Span<</ActualText <feff62117684> >> BDC
+50 0 Td (your) Tj
+EMC
+30 0 Td (favorite) Tj
+/Span<</ActualText <feff98df7269> >> BDC
+50 0 Td (color) Tj
+EMC
+(?) Tj
+ET
+endstream
+endobj
+5 0 obj <<
+ /Type /Font
+ /Subtype /Type1
+ /BaseFont /Helvetica
+>>
+endobj
+xref
+0 6
+0000000000 65535 f
+0000000015 00000 n
+0000000068 00000 n
+0000000131 00000 n
+0000000283 00000 n
+0000000538 00000 n
+trailer <<
+ /Root 1 0 R
+ /Size 6
+>>
+startxref
+614
+%%EOF