Use FPDFText_GetBoundedText() to get the visible text in a test.

Add a test PDF with multiple pages, each with a different media box and
crop box. Demonstrate how FPDFText_GetText() gets all the text on the
page, and how FPDFText_GetBoundedText() with the right bounding boxes
gets only the visible text on the page.

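Also fix a small nit in CPDF_TextPage::GetTextByRect() found while
writing this CL: compare against L' ' instead of the magic number 32
and drop a redundant non-zero check on the same character.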

BUG=pdfium:387

Change-Id: I9ce4bb181e2ba5b454ea1341bbccef9ba94c9cd8
Reviewed-on: https://pdfium-review.googlesource.com/34550
Commit-Queue: Ryan Harrison <rharrison@chromium.org>
Reviewed-by: Ryan Harrison <rharrison@chromium.org>
diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp
index dae973b..60e5745 100644
--- a/core/fpdftext/cpdf_textpage.cpp
+++ b/core/fpdftext/cpdf_textpage.cpp
@@ -446,9 +446,9 @@
       IsAddLineFeed = false;
       if (charinfo.m_Unicode)
         strText += charinfo.m_Unicode;
-    } else if (charinfo.m_Unicode == 32) {
-      if (IsContainPreChar && charinfo.m_Unicode) {
-        strText += charinfo.m_Unicode;
+    } else if (charinfo.m_Unicode == L' ') {
+      if (IsContainPreChar) {
+        strText += L' ';
         IsContainPreChar = false;
         IsAddLineFeed = false;
       }
diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index c7ad825..112991f 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp
@@ -2,10 +2,13 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
+#include <algorithm>
 #include <memory>
+#include <utility>
 
 #include "core/fxcrt/fx_memory.h"
 #include "public/fpdf_text.h"
+#include "public/fpdf_transformpage.h"
 #include "public/fpdfview.h"
 #include "testing/embedder_test.h"
 #include "testing/gtest/include/gtest/gtest.h"
@@ -159,7 +162,8 @@
   EXPECT_EQ(0.0, bottom);
   EXPECT_EQ(0.0, top);
 
-  EXPECT_EQ(9, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0, 0, 0));
+  EXPECT_EQ(
+      9, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0, nullptr, 0));
 
   // Extract starting at character 4 as above.
   memset(buffer, 0xbd, sizeof(buffer));
@@ -751,3 +755,58 @@
   FPDFText_ClosePage(textpage);
   UnloadPage(page);
 }
+
+TEST_F(FPDFTextEmbeddertest, CroppedText) {  // Full vs. bounded text, per page.
+  static constexpr int kPageCount = 4;
+  static constexpr FS_RECTF kBoxes[kPageCount] = {  // Expected page boxes.
+      {50.0f, 150.0f, 150.0f, 50.0f},
+      {50.0f, 150.0f, 150.0f, 50.0f},
+      {60.0f, 150.0f, 150.0f, 60.0f},
+      {60.0f, 150.0f, 150.0f, 60.0f},
+  };
+  static constexpr const char* kExpectedText[kPageCount] = {  // Bounded text.
+      " world!\r\ndbye, world!", " world!\r\ndbye, world!", "bye, world!",
+      "bye, world!",
+  };
+
+  ASSERT_TRUE(OpenDocument("cropped_text.pdf"));
+  ASSERT_EQ(kPageCount, FPDF_GetPageCount(document()));
+
+  for (int i = 0; i < kPageCount; ++i) {
+    FPDF_PAGE page = LoadPage(i);
+    ASSERT_TRUE(page);
+
+    FS_RECTF box;
+    EXPECT_TRUE(FPDF_GetPageBoundingBox(page, &box));
+    EXPECT_EQ(kBoxes[i].left, box.left);
+    EXPECT_EQ(kBoxes[i].top, box.top);
+    EXPECT_EQ(kBoxes[i].right, box.right);
+    EXPECT_EQ(kBoxes[i].bottom, box.bottom);
+
+    {  // NOTE(review): presumably closes textpage before UnloadPage().
+      ScopedFPDFTextPage textpage(FPDFText_LoadPage(page));
+      ASSERT_TRUE(textpage);
+
+      unsigned short buffer[128];
+      memset(buffer, 0xbd, sizeof(buffer));  // Non-zero fill pattern.
+      int num_chars = FPDFText_GetText(textpage.get(), 0, 128, buffer);
+      ASSERT_EQ(kHelloGoodbyeTextSize, num_chars);  // Same for every page.
+      EXPECT_TRUE(check_unsigned_shorts(kHelloGoodbyeText, buffer,
+                                        kHelloGoodbyeTextSize));
+
+      int expected_char_count = strlen(kExpectedText[i]);
+      ASSERT_EQ(expected_char_count,  // Count-only query (null buffer).
+                FPDFText_GetBoundedText(textpage.get(), box.left, box.top,
+                                        box.right, box.bottom, nullptr, 0));
+
+      memset(buffer, 0xbd, sizeof(buffer));
+      ASSERT_EQ(expected_char_count + 1,  // One more than count-only query.
+                FPDFText_GetBoundedText(textpage.get(), box.left, box.top,
+                                        box.right, box.bottom, buffer, 128));
+      EXPECT_TRUE(
+          check_unsigned_shorts(kExpectedText[i], buffer, expected_char_count));
+    }
+
+    UnloadPage(page);
+  }
+}
diff --git a/testing/resources/cropped_text.in b/testing/resources/cropped_text.in
new file mode 100644
index 0000000..c8632de
--- /dev/null
+++ b/testing/resources/cropped_text.in
@@ -0,0 +1,98 @@
+{{header}}
+{{object 1 0}} <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+endobj
+{{object 2 0}} <<
+  /Type /Pages
+  /Count 4
+  /Kids [ 6 0 R 7 0 R 8 0 R 9 0 R ]
+>>
+endobj
+{{object 3 0}} <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Times-Roman
+>>
+endobj
+{{object 4 0}} <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Helvetica
+>>
+endobj
+{{object 5 0}} <<
+  {{streamlen}}
+>>
+stream
+BT
+20 50 Td
+/F1 12 Tf
+(Hello, world!) Tj
+0 50 Td
+/F2 16 Tf
+(Goodbye, world!) Tj
+ET
+endstream
+endobj
+{{object 6 0}} <<
+  /Type /Page
+  /Parent 2 0 R
+  /MediaBox [ 0 0 200 200 ]
+  /CropBox [ 50 50 150 150 ]
+  /Resources <<
+    /Font <<
+      /F1 3 0 R
+      /F2 4 0 R
+    >>
+  >>
+  /Contents 5 0 R
+>>
+endobj
+{{object 7 0}} <<
+  /Type /Page
+  /Parent 2 0 R
+  /MediaBox [ -50 -50 200 200 ]
+  /CropBox [ 50 50 150 150 ]
+  /Resources <<
+    /Font <<
+      /F1 3 0 R
+      /F2 4 0 R
+    >>
+  >>
+  /Contents 5 0 R
+>>
+endobj
+{{object 8 0}} <<
+  /Type /Page
+  /Parent 2 0 R
+  /MediaBox [ 0 0 200 200 ]
+  /CropBox [ 60 60 150 150 ]
+  /Resources <<
+    /Font <<
+      /F1 3 0 R
+      /F2 4 0 R
+    >>
+  >>
+  /Contents 5 0 R
+>>
+endobj
+{{object 9 0}} <<
+  /Type /Page
+  /Parent 2 0 R
+  /MediaBox [ 0 0 200 200 ]
+  /CropBox [ 150 150 60 60 ]
+  /Resources <<
+    /Font <<
+      /F1 3 0 R
+      /F2 4 0 R
+    >>
+  >>
+  /Contents 5 0 R
+>>
+endobj
+{{xref}}
+{{trailer}}
+{{startxref}}
+%%EOF
diff --git a/testing/resources/cropped_text.pdf b/testing/resources/cropped_text.pdf
new file mode 100644
index 0000000..02d50c6
--- /dev/null
+++ b/testing/resources/cropped_text.pdf
@@ -0,0 +1,114 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+endobj
+2 0 obj <<
+  /Type /Pages
+  /Count 4
+  /Kids [ 6 0 R 7 0 R 8 0 R 9 0 R ]
+>>
+endobj
+3 0 obj <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Times-Roman
+>>
+endobj
+4 0 obj <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Helvetica
+>>
+endobj
+5 0 obj <<
+  /Length 83
+>>
+stream
+BT
+20 50 Td
+/F1 12 Tf
+(Hello, world!) Tj
+0 50 Td
+/F2 16 Tf
+(Goodbye, world!) Tj
+ET
+endstream
+endobj
+6 0 obj <<
+  /Type /Page
+  /Parent 2 0 R
+  /MediaBox [ 0 0 200 200 ]
+  /CropBox [ 50 50 150 150 ]
+  /Resources <<
+    /Font <<
+      /F1 3 0 R
+      /F2 4 0 R
+    >>
+  >>
+  /Contents 5 0 R
+>>
+endobj
+7 0 obj <<
+  /Type /Page
+  /Parent 2 0 R
+  /MediaBox [ -50 -50 200 200 ]
+  /CropBox [ 50 50 150 150 ]
+  /Resources <<
+    /Font <<
+      /F1 3 0 R
+      /F2 4 0 R
+    >>
+  >>
+  /Contents 5 0 R
+>>
+endobj
+8 0 obj <<
+  /Type /Page
+  /Parent 2 0 R
+  /MediaBox [ 0 0 200 200 ]
+  /CropBox [ 60 60 150 150 ]
+  /Resources <<
+    /Font <<
+      /F1 3 0 R
+      /F2 4 0 R
+    >>
+  >>
+  /Contents 5 0 R
+>>
+endobj
+9 0 obj <<
+  /Type /Page
+  /Parent 2 0 R
+  /MediaBox [ 0 0 200 200 ]
+  /CropBox [ 150 150 60 60 ]
+  /Resources <<
+    /Font <<
+      /F1 3 0 R
+      /F2 4 0 R
+    >>
+  >>
+  /Contents 5 0 R
+>>
+endobj
+xref
+0 10
+0000000000 65535 f 
+0000000015 00000 n 
+0000000068 00000 n 
+0000000151 00000 n 
+0000000229 00000 n 
+0000000305 00000 n 
+0000000439 00000 n 
+0000000638 00000 n 
+0000000841 00000 n 
+0000001040 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 10
+>>
+startxref
+1239
+%%EOF