Add FPDFTextObj_GetText() API

Generalize CPDF_TextPage::GetTextByRect() so that text can be extracted
from a text page using a predicate. That way we can also easily get the
text that belongs to a single text object.

Change-Id: Ia457af0f41184694dc1481709be72b35685bce7f
Reviewed-on: https://pdfium-review.googlesource.com/39530
Reviewed-by: Henrique Nakashima <hnakashima@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp
index 2894160..ed7f36f 100644
--- a/core/fpdftext/cpdf_textpage.cpp
+++ b/core/fpdftext/cpdf_textpage.cpp
@@ -426,7 +426,8 @@
   return pos < nCount ? pos : NearPos;
 }
 
-WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
+WideString CPDF_TextPage::GetTextByPredicate(
+    const std::function<bool(const PAGECHAR_INFO&)>& predicate) const {
   if (!m_bIsParsed)
     return WideString();
 
@@ -435,7 +436,7 @@
   bool IsAddLineFeed = false;
   WideString strText;
   for (const auto& charinfo : m_CharList) {
-    if (IsRectIntersect(rect, charinfo.m_CharBox)) {
+    if (predicate(charinfo)) {
       if (fabs(posy - charinfo.m_Origin.y) > 0 && !IsContainPreChar &&
           IsAddLineFeed) {
         posy = charinfo.m_Origin.y;
@@ -460,6 +461,19 @@
   return strText;
 }
 
+WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
+  return GetTextByPredicate([&rect](const PAGECHAR_INFO& charinfo) {
+    return IsRectIntersect(rect, charinfo.m_CharBox);
+  });
+}
+
+WideString CPDF_TextPage::GetTextByObject(
+    const CPDF_TextObject* pTextObj) const {
+  return GetTextByPredicate([pTextObj](const PAGECHAR_INFO& charinfo) {
+    return charinfo.m_pTextObj == pTextObj;
+  });
+}
+
 void CPDF_TextPage::GetCharInfo(int index, FPDF_CHAR_INFO* info) const {
   if (!m_bIsParsed || !pdfium::IndexInBounds(m_CharList, index))
     return;
diff --git a/core/fpdftext/cpdf_textpage.h b/core/fpdftext/cpdf_textpage.h
index 36d0185..90b45bd 100644
--- a/core/fpdftext/cpdf_textpage.h
+++ b/core/fpdftext/cpdf_textpage.h
@@ -8,6 +8,7 @@
 #define CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
 
 #include <deque>
+#include <functional>
 #include <vector>
 
 #include "core/fpdfapi/page/cpdf_pageobjectlist.h"
@@ -97,6 +98,7 @@
   std::vector<CFX_FloatRect> GetRectArray(int start, int nCount) const;
   int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const;
   WideString GetTextByRect(const CFX_FloatRect& rect) const;
+  WideString GetTextByObject(const CPDF_TextObject* pTextObj) const;
 
   // Returns string with the text from |m_TextBuf| that are covered by the input
   // range. |start| and |count| are in terms of the |m_CharIndex|, so the range
@@ -151,6 +153,8 @@
   TextOrientation FindTextlineFlowOrientation() const;
   void AppendGeneratedCharacter(wchar_t unicode, const CFX_Matrix& formMatrix);
   void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend);
+  WideString GetTextByPredicate(
+      const std::function<bool(const PAGECHAR_INFO&)>& predicate) const;
 
   UnownedPtr<const CPDF_Page> const m_pPage;
   std::vector<uint16_t> m_CharIndex;
diff --git a/fpdfsdk/fpdf_edittext.cpp b/fpdfsdk/fpdf_edittext.cpp
index 6aa44b3..2773763 100644
--- a/fpdfsdk/fpdf_edittext.cpp
+++ b/fpdfsdk/fpdf_edittext.cpp
@@ -22,6 +22,7 @@
 #include "core/fpdfapi/parser/cpdf_number.h"
 #include "core/fpdfapi/parser/cpdf_reference.h"
 #include "core/fpdfapi/parser/cpdf_stream.h"
+#include "core/fpdftext/cpdf_textpage.h"
 #include "core/fxcrt/fx_extension.h"
 #include "core/fxge/cfx_fontmgr.h"
 #include "core/fxge/fx_font.h"
@@ -564,6 +565,23 @@
   return dwStringLen;
 }
 
+FPDF_EXPORT unsigned long FPDF_CALLCONV
+FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object,
+                    FPDF_TEXTPAGE text_page,
+                    void* buffer,
+                    unsigned long length) {
+  CPDF_TextObject* pTextObj = CPDFTextObjectFromFPDFPageObject(text_object);
+  if (!pTextObj)
+    return 0;
+
+  CPDF_TextPage* pTextPage = CPDFTextPageFromFPDFTextPage(text_page);
+  if (!pTextPage)
+    return 0;
+
+  WideString text = pTextPage->GetTextByObject(pTextObj);
+  return Utf16EncodeMaybeCopyAndReturnLength(text, buffer, length);
+}
+
 FPDF_EXPORT void FPDF_CALLCONV FPDFFont_Close(FPDF_FONT font) {
   CPDF_Font* pFont = CPDFFontFromFPDFFont(font);
   if (!pFont)
diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index eafe1a2..bf064d6 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp
@@ -5,6 +5,7 @@
 #include <algorithm>
 #include <memory>
 #include <utility>
+#include <vector>
 
 #include "core/fxcrt/fx_memory.h"
 #include "public/cpp/fpdf_scopers.h"
@@ -762,6 +763,50 @@
   UnloadPage(page);
 }
 
+TEST_F(FPDFTextEmbeddertest, GetText) {
+  ASSERT_TRUE(OpenDocument("hello_world.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);
+
+  FPDF_TEXTPAGE text_page = FPDFText_LoadPage(page);
+  ASSERT_TRUE(text_page);
+
+  EXPECT_EQ(2, FPDFPage_CountObjects(page));
+  FPDF_PAGEOBJECT text_object = FPDFPage_GetObject(page, 0);
+  ASSERT_TRUE(text_object);
+
+  // Positive testing.
+  constexpr char kHelloText[] = "Hello, world!";
+  // The return value includes the terminating NUL, as does FX_ArraySize().
+  constexpr unsigned long kHelloUTF16Size = FX_ArraySize(kHelloText) * 2;
+  constexpr wchar_t kHelloWideText[] = L"Hello, world!";
+  unsigned long size = FPDFTextObj_GetText(text_object, text_page, nullptr, 0);
+  ASSERT_EQ(kHelloUTF16Size, size);
+
+  std::vector<unsigned short> buffer(size);
+  ASSERT_EQ(size,
+            FPDFTextObj_GetText(text_object, text_page, buffer.data(), size));
+  ASSERT_EQ(kHelloWideText, GetPlatformWString(buffer.data()));
+
+  // Negative testing.
+  ASSERT_EQ(0U, FPDFTextObj_GetText(nullptr, text_page, nullptr, 0));
+  ASSERT_EQ(0U, FPDFTextObj_GetText(text_object, nullptr, nullptr, 0));
+  ASSERT_EQ(0U, FPDFTextObj_GetText(nullptr, nullptr, nullptr, 0));
+
+  // The buffer is too small; ensure that it is not modified.
+  buffer.resize(2);
+  buffer[0] = 'x';
+  buffer[1] = '\0';
+  size =
+      FPDFTextObj_GetText(text_object, text_page, buffer.data(), buffer.size());
+  ASSERT_EQ(kHelloUTF16Size, size);
+  ASSERT_EQ('x', buffer[0]);
+  ASSERT_EQ('\0', buffer[1]);
+
+  FPDFText_ClosePage(text_page);
+  UnloadPage(page);
+}
+
 TEST_F(FPDFTextEmbeddertest, CroppedText) {
   static constexpr int kPageCount = 4;
   static constexpr FS_RECTF kBoxes[kPageCount] = {
diff --git a/fpdfsdk/fpdf_view_c_api_test.c b/fpdfsdk/fpdf_view_c_api_test.c
index 91d24fd..56e9d7a 100644
--- a/fpdfsdk/fpdf_view_c_api_test.c
+++ b/fpdfsdk/fpdf_view_c_api_test.c
@@ -206,6 +206,7 @@
     CHK(FPDFPath_SetStrokeWidth);
     CHK(FPDFTextObj_GetFontName);
     CHK(FPDFTextObj_GetFontSize);
+    CHK(FPDFTextObj_GetText);
     CHK(FPDFText_GetMatrix);
     CHK(FPDFText_GetTextRenderMode);
     CHK(FPDFText_LoadFont);
diff --git a/public/fpdf_edit.h b/public/fpdf_edit.h
index 4d5aa9c..83fedba 100644
--- a/public/fpdf_edit.h
+++ b/public/fpdf_edit.h
@@ -1275,6 +1275,26 @@
                         unsigned long length);
 
 // Experimental API.
+// Get the text of a text object.
+//
+// text_object      - the handle to the text object.
+// text_page        - the handle to the text page.
+// buffer           - the address of a buffer that receives the text.
+// length           - the size, in bytes, of |buffer|.
+//
+// Returns the number of bytes in the text (including the trailing NUL
+// character) on success, 0 on error.
+//
+// Regardless of the platform, the |buffer| is always in UTF-16LE encoding.
+// If |length| is less than the returned length, or |buffer| is NULL, |buffer|
+// will not be modified.
+FPDF_EXPORT unsigned long FPDF_CALLCONV
+FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object,
+                    FPDF_TEXTPAGE text_page,
+                    void* buffer,
+                    unsigned long length);
+
+// Experimental API.
 // Get number of page objects inside |form_object|.
 //
 //   form_object - handle to a form object.