Rewrite lower level details of extracting text from page

The current implementation of text extraction was difficult to
understand, duplicated logic that existed in other methods, and wasn't
clear about the units the inputs were in. It also didn't handle
control characters correctly.

The new implementation leans on the methods for converting indices
between the text buffer index and character list index spaces to avoid
duplication of code. It also makes it clear to the reader that inputs
are in the character list index space. Finally, it fixes issues being
seen in Chrome with respect of ranges being slightly off.

This CL also adds a test for extracting text that has control
characters.

BUG=pdfium:942,chromium:654578

Change-Id: Id9d1f360c2d7492c7b5a48d6c9ae29f530892742
Reviewed-on: https://pdfium-review.googlesource.com/20014
Commit-Queue: Ryan Harrison <rharrison@chromium.org>
Reviewed-by: dsinclair <dsinclair@chromium.org>
Reviewed-by: Henrique Nakashima <hnakashima@chromium.org>
diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp
index 3a38343..05cbdfb 100644
--- a/core/fpdftext/cpdf_linkextract.cpp
+++ b/core/fpdftext/cpdf_linkextract.cpp
@@ -114,7 +114,7 @@
   if (!m_pTextPage->IsParsed())
     return;
 
-  m_strPageText = m_pTextPage->GetPageText(0, -1);
+  m_strPageText = m_pTextPage->GetAllPageText();
   if (m_strPageText.IsEmpty())
     return;
 
diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp
index 7ea2061..8ef5522 100644
--- a/core/fpdftext/cpdf_textpage.cpp
+++ b/core/fpdftext/cpdf_textpage.cpp
@@ -436,49 +436,27 @@
   }
 }
 
-WideString CPDF_TextPage::GetPageText(int start, int nCount) const {
-  if (!m_bIsParsed || nCount == 0)
+WideString CPDF_TextPage::GetPageText(int start, int count) const {
+  if (start < 0 || start >= CountChars() || count <= 0 || !m_bIsParsed ||
+      m_CharList.empty() || m_TextBuf.GetLength() == 0) {
+    return L"";
+  }
+
+  int text_start = TextIndexFromCharIndex(start);
+  if (text_start < 0)
     return L"";
 
-  if (start < 0)
-    start = 0;
+  count = std::min(count, CountChars() - start);
 
-  if (nCount == -1) {
-    nCount = pdfium::CollectionSize<int>(m_CharList) - start;
-    WideStringView wsTextBuf = m_TextBuf.AsStringView();
-    return WideString(wsTextBuf.Right(wsTextBuf.GetLength() - start));
-  }
-  if (nCount <= 0 || m_CharList.empty())
+  int last = start + count - 1;
+  int text_last = TextIndexFromCharIndex(last);
+  if (text_last < 0 || text_last < text_start)
     return L"";
-  if (nCount + start > pdfium::CollectionSize<int>(m_CharList) - 1)
-    nCount = pdfium::CollectionSize<int>(m_CharList) - start;
-  if (nCount <= 0)
-    return L"";
-  CheckMarkedContentObject(start, nCount);
-  int startindex = 0;
-  PAGECHAR_INFO charinfo = m_CharList[start];
-  int startOffset = 0;
-  while (charinfo.m_Index == -1) {
-    startOffset++;
-    if (startOffset > nCount ||
-        start + startOffset >= pdfium::CollectionSize<int>(m_CharList)) {
-      return L"";
-    }
-    charinfo = m_CharList[start + startOffset];
-  }
-  startindex = charinfo.m_Index;
-  charinfo = m_CharList[start + nCount - 1];
-  int nCountOffset = 0;
-  while (charinfo.m_Index == -1) {
-    nCountOffset++;
-    if (nCountOffset >= nCount)
-      return L"";
-    charinfo = m_CharList[start + nCount - nCountOffset - 1];
-  }
-  nCount = start + nCount - nCountOffset - startindex;
-  if (nCount <= 0)
-    return L"";
-  return WideString(m_TextBuf.AsStringView().Mid(startindex, nCount));
+
+  int text_count = text_last - text_start + 1;
+
+  return WideString(m_TextBuf.AsStringView().Mid(
+      static_cast<size_t>(text_start), static_cast<size_t>(text_count)));
 }
 
 int CPDF_TextPage::CountRects(int start, int nCount) {
diff --git a/core/fpdftext/cpdf_textpage.h b/core/fpdftext/cpdf_textpage.h
index e8ab82a..cd30ace 100644
--- a/core/fpdftext/cpdf_textpage.h
+++ b/core/fpdftext/cpdf_textpage.h
@@ -103,7 +103,13 @@
   std::vector<CFX_FloatRect> GetRectArray(int start, int nCount) const;
   int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const;
   WideString GetTextByRect(const CFX_FloatRect& rect) const;
-  WideString GetPageText(int start = 0, int nCount = -1) const;
+
+  // Returns string with the text from |m_TextBuf| that are covered by the input
+  // range. |start| and |count| are in terms of the m_CharIndex, so the range
+  // will be converted into appropriate indices.
+  WideString GetPageText(int start, int count) const;
+  WideString GetAllPageText() const { return GetPageText(0, CountChars()); }
+
   int CountRects(int start, int nCount);
   void GetRect(int rectIndex,
                float& left,
diff --git a/core/fpdftext/cpdf_textpagefind.cpp b/core/fpdftext/cpdf_textpagefind.cpp
index a874521..9f243a0 100644
--- a/core/fpdftext/cpdf_textpagefind.cpp
+++ b/core/fpdftext/cpdf_textpagefind.cpp
@@ -41,7 +41,7 @@
       m_resStart(0),
       m_resEnd(-1),
       m_IsFind(false) {
-  m_strText = m_pTextPage->GetPageText();
+  m_strText = m_pTextPage->GetAllPageText();
   int nCount = pTextPage->CountChars();
   if (nCount)
     m_CharIndex.push_back(0);
@@ -85,7 +85,7 @@
   if (!m_pTextPage)
     return false;
   if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE))
-    m_strText = m_pTextPage->GetPageText();
+    m_strText = m_pTextPage->GetAllPageText();
   WideString findwhatStr = findwhat;
   m_findWhat = findwhatStr;
   m_flags = flags;
diff --git a/fpdfsdk/fpdftext.cpp b/fpdfsdk/fpdftext.cpp
index 5a2deb9..d9f7d57 100644
--- a/fpdfsdk/fpdftext.cpp
+++ b/fpdfsdk/fpdftext.cpp
@@ -179,25 +179,10 @@
     return 1;
   }
 
-  // char_* values are for a data structure that includes non-printing unicode
-  // characters, where the text_* values are from a data structure that doesn't
-  // include these characters, so translation is needed.
-  int text_start = textpage->TextIndexFromCharIndex(char_start);
-  if (text_start == -1)
-    return 0;
+  WideString str = textpage->GetPageText(char_start, char_count);
 
-  int char_last = char_start + char_count - 1;
-  int text_last = textpage->TextIndexFromCharIndex(char_last);
-  if (text_last == -1)
-    return 0;
-
-  int text_count = text_last - text_start + 1;
-  if (text_count < 1)
-    return 0;
-
-  WideString str = textpage->GetPageText(text_start, text_count);
-  if (str.GetLength() > static_cast<size_t>(text_count))
-    str = str.Left(static_cast<size_t>(text_count));
+  if (str.GetLength() > static_cast<size_t>(char_count))
+    str = str.Left(static_cast<size_t>(char_count));
 
   // UFT16LE_Encode doesn't handle surrogate pairs properly, so it is expected
   // the number of items to stay the same.
diff --git a/fpdfsdk/fpdftext_embeddertest.cpp b/fpdfsdk/fpdftext_embeddertest.cpp
index 51216b9..6065405 100644
--- a/fpdfsdk/fpdftext_embeddertest.cpp
+++ b/fpdfsdk/fpdftext_embeddertest.cpp
@@ -590,3 +590,38 @@
   FPDFText_ClosePage(textpage);
   UnloadPage(page);
 }
+
+TEST_F(FPDFTextEmbeddertest, ControlCharacters) {
+  EXPECT_TRUE(OpenDocument("control_characters.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  EXPECT_TRUE(page);
+
+  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
+  EXPECT_TRUE(textpage);
+
+  // Should not include the control characters in the output
+  static const char expected[] = "Hello, world!\r\nGoodbye, world!";
+  unsigned short fixed_buffer[128];
+  memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
+  int num_chars = FPDFText_GetText(textpage, 0, 128, fixed_buffer);
+
+  ASSERT_GE(num_chars, 0);
+  EXPECT_EQ(sizeof(expected), static_cast<size_t>(num_chars));
+  EXPECT_TRUE(check_unsigned_shorts(expected, fixed_buffer, sizeof(expected)));
+
+  // Attempting to get a chunk of text after the control characters
+  static const char expected_substring[] = "Goodbye, world!";
+  // Offset is the length of 'Hello, world!\r\n' + 2 control characters in the
+  // original stream
+  static const int offset = 17;
+  memset(fixed_buffer, 0xbd, sizeof(fixed_buffer));
+  num_chars = FPDFText_GetText(textpage, offset, 128, fixed_buffer);
+
+  ASSERT_GE(num_chars, 0);
+  EXPECT_EQ(sizeof(expected_substring), static_cast<size_t>(num_chars));
+  EXPECT_TRUE(check_unsigned_shorts(expected_substring, fixed_buffer,
+                                    sizeof(expected_substring)));
+
+  FPDFText_ClosePage(textpage);
+  UnloadPage(page);
+}
diff --git a/testing/resources/control_characters.in b/testing/resources/control_characters.in
new file mode 100644
index 0000000..ca7827f
--- /dev/null
+++ b/testing/resources/control_characters.in
@@ -0,0 +1,54 @@
+{{header}}
+{{object 1 0}} <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+endobj
+{{object 2 0}} <<
+  /Type /Pages
+  /MediaBox [ 0 0 200 200 ]
+  /Count 1
+  /Kids [ 3 0 R ]
+>>
+endobj
+{{object 3 0}} <<
+  /Type /Page
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 4 0 R
+      /F2 5 0 R
+    >>
+  >>
+  /Contents 6 0 R
+>>
+endobj
+{{object 4 0}} <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Times-Roman
+>>
+endobj
+{{object 5 0}} <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Helvetica
+>>
+endobj
+{{object 6 0}} <<
+>>
+stream
+BT
+20 50 Td
+/F1 12 Tf
+(Hello\2\3, world!) Tj
+0 50 Td
+/F2 16 Tf
+(Goodbye, world!) Tj
+ET
+endstream
+endobj
+{{xref}}
+{{trailer}}
+{{startxref}}
+%%EOF
diff --git a/testing/resources/control_characters.pdf b/testing/resources/control_characters.pdf
new file mode 100644
index 0000000..5350097
--- /dev/null
+++ b/testing/resources/control_characters.pdf
@@ -0,0 +1,64 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+endobj
+2 0 obj <<
+  /Type /Pages
+  /MediaBox [ 0 0 200 200 ]
+  /Count 1
+  /Kids [ 3 0 R ]
+>>
+endobj
+3 0 obj <<
+  /Type /Page
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 4 0 R
+      /F2 5 0 R
+    >>
+  >>
+  /Contents 6 0 R
+>>
+endobj
+4 0 obj <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Times-Roman
+>>
+endobj
+5 0 obj <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Helvetica
+>>
+endobj
+6 0 obj <<
+>>
+stream
+BT
+20 50 Td
+/F1 12 Tf
+(Hello\2\3, world!) Tj
+0 50 Td
+/F2 16 Tf
+(Goodbye, world!) Tj
+ET
+endstream
+endobj
+xref
+0 7
+0000000000 65535 f 
+0000000015 00000 n 
+0000000068 00000 n 
+0000000161 00000 n 
+0000000303 00000 n 
+0000000381 00000 n 
+0000000457 00000 n 
+trailer<< /Root 1 0 R /Size 7 >>
+startxref
+582
+%%EOF