Handle non-printing characters at beginning of extraction region
Currently if a text extraction region begins on a non-printing
character then "" will be returned. This is the incorrect behaviour,
instead the call should scan ahead until a printing character is
found and start extracting from there. Also proactively adds a
similar check and scan for the end of the extraction region.
BUG=pdfium:1139
Change-Id: Ia2001ac89740f3d31d2bb69e8000773f8b01091b
Reviewed-on: https://pdfium-review.googlesource.com/41532
Reviewed-by: Henrique Nakashima <hnakashima@chromium.org>
Commit-Queue: Ryan Harrison <rharrison@chromium.org>
diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp
index 5465297..8c110b6 100644
--- a/core/fpdftext/cpdf_textpage.cpp
+++ b/core/fpdftext/cpdf_textpage.cpp
@@ -510,15 +510,36 @@
     return L"";
   }
 
+  const int count_chars = CountChars();
   int text_start = TextIndexFromCharIndex(start);
-  if (text_start < 0)
-    return L"";
 
-  count = std::min(count, CountChars() - start);
+  // If the character at |start| is a non-printing character, then
+  // TextIndexFromCharIndex will return -1, so scan ahead to the first printing
+  // character.
+  while (text_start < 0) {
+    if (start >= count_chars)
+      return L"";
+    start++;
+    text_start = TextIndexFromCharIndex(start);
+  }
+
+  count = std::min(count, count_chars - start);
 
   int last = start + count - 1;
   int text_last = TextIndexFromCharIndex(last);
-  if (text_last < 0 || text_last < text_start)
+
+  // If the character at |last| is a non-printing character, then
+  // TextIndexFromCharIndex will return -1, so scan back to the last printing
+  // character.
+  while (text_last < 0) {
+    if (last < text_start)
+      return L"";
+
+    last--;
+    text_last = TextIndexFromCharIndex(last);
+  }
+
+  if (text_last < text_start)
     return L"";
 
   int text_count = text_last - text_start + 1;
diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index 83b43d9..1f29589 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp
@@ -944,3 +944,26 @@
     UnloadPage(page);
   }
 }
+
+TEST_F(FPDFTextEmbeddertest, Bug_1139) {
+  ASSERT_TRUE(OpenDocument("bug_1139.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);
+
+  FPDF_TEXTPAGE text_page = FPDFText_LoadPage(page);
+  ASSERT_TRUE(text_page);
+
+  // -1 for CountChars not including the \0, but +1 for the extra control
+  // character.
+  EXPECT_EQ(kHelloGoodbyeTextSize, FPDFText_CountChars(text_page));
+
+  // There is an extra control character at the beginning of the string, but it
+  // should not appear in the output nor prevent extracting the text.
+  unsigned short buffer[128];
+  int num_chars = FPDFText_GetText(text_page, 0, 128, buffer);
+  ASSERT_EQ(kHelloGoodbyeTextSize, num_chars);
+  EXPECT_TRUE(
+      check_unsigned_shorts(kHelloGoodbyeText, buffer, kHelloGoodbyeTextSize));
+  FPDFText_ClosePage(text_page);
+  UnloadPage(page);
+}
diff --git a/testing/resources/bug_1139.in b/testing/resources/bug_1139.in
new file mode 100644
index 0000000..d5603f0
--- /dev/null
+++ b/testing/resources/bug_1139.in
@@ -0,0 +1,55 @@
+{{header}}
+{{object 1 0}} <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+endobj
+{{object 2 0}} <<
+  /Type /Pages
+  /MediaBox [ 0 0 200 200 ]
+  /Count 1
+  /Kids [ 3 0 R ]
+>>
+endobj
+{{object 3 0}} <<
+  /Type /Page
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 4 0 R
+      /F2 5 0 R
+    >>
+  >>
+  /Contents 6 0 R
+>>
+endobj
+{{object 4 0}} <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Times-Roman
+>>
+endobj
+{{object 5 0}} <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Helvetica
+>>
+endobj
+{{object 6 0}} <<
+>>
+stream
+stream
+BT
+20 50 Td
+/F1 12 Tf
+(\003Hello, world!) Tj
+0 50 Td
+/F2 16 Tf
+(Goodbye, world!) Tj
+ET
+endstream
+endobj
+{{xref}}
+{{trailer}}
+{{startxref}}
+%%EOF
diff --git a/testing/resources/bug_1139.pdf b/testing/resources/bug_1139.pdf
new file mode 100644
index 0000000..d89695e
--- /dev/null
+++ b/testing/resources/bug_1139.pdf
@@ -0,0 +1,68 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+endobj
+2 0 obj <<
+  /Type /Pages
+  /MediaBox [ 0 0 200 200 ]
+  /Count 1
+  /Kids [ 3 0 R ]
+>>
+endobj
+3 0 obj <<
+  /Type /Page
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 4 0 R
+      /F2 5 0 R
+    >>
+  >>
+  /Contents 6 0 R
+>>
+endobj
+4 0 obj <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Times-Roman
+>>
+endobj
+5 0 obj <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Helvetica
+>>
+endobj
+6 0 obj <<
+>>
+stream
+stream
+BT
+20 50 Td
+/F1 12 Tf
+(\003Hello, world!) Tj
+0 50 Td
+/F2 16 Tf
+(Goodbye, world!) Tj
+ET
+endstream
+endobj
+xref
+0 7
+0000000000 65535 f 
+0000000015 00000 n 
+0000000068 00000 n 
+0000000161 00000 n 
+0000000303 00000 n 
+0000000381 00000 n 
+0000000457 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 7
+>>
+startxref
+589
+%%EOF