Handle non-printing characters at beginning of extraction region
Currently, if a text extraction region begins on a non-printing
character, then "" is returned. This is incorrect behaviour;
instead, the call should scan ahead until a printing character is
found and start extracting from there. This change also proactively
adds a similar check and scan for the end of the extraction region.
BUG=pdfium:1139
Change-Id: Ia2001ac89740f3d31d2bb69e8000773f8b01091b
Reviewed-on: https://pdfium-review.googlesource.com/41532
Reviewed-by: Henrique Nakashima <hnakashima@chromium.org>
Commit-Queue: Ryan Harrison <rharrison@chromium.org>
diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index 83b43d9..1f29589 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp
@@ -944,3 +944,26 @@
UnloadPage(page);
}
}
+
+TEST_F(FPDFTextEmbeddertest, Bug_1139) {
+ ASSERT_TRUE(OpenDocument("bug_1139.pdf"));
+ FPDF_PAGE page = LoadPage(0);
+ ASSERT_TRUE(page);
+
+ FPDF_TEXTPAGE text_page = FPDFText_LoadPage(page);
+ ASSERT_TRUE(text_page);
+
+ // Relative to kHelloGoodbyeTextSize: -1 because CountChars does not count
+ // the \0, +1 for the extra control character, so the two are equal.
+ EXPECT_EQ(kHelloGoodbyeTextSize, FPDFText_CountChars(text_page));
+
+ // The string begins with an extra control character; it must neither
+ // appear in the extracted output nor prevent the text being extracted.
+ unsigned short buffer[128];
+ int num_chars = FPDFText_GetText(text_page, 0, 128, buffer);
+ ASSERT_EQ(kHelloGoodbyeTextSize, num_chars);
+ EXPECT_TRUE(
+ check_unsigned_shorts(kHelloGoodbyeText, buffer, kHelloGoodbyeTextSize));
+ FPDFText_ClosePage(text_page);
+ UnloadPage(page);
+}