M124: Spanify FPDFText_GetText() and FPDF_GetBoundedText().

Restore some old behavior around the result of GetText() in that it
formerly would not do surrogate conversion.

-- restrict FPDFText_GetText() to UCS-2 per new API doc.
-- properly describe what FPDF_GetBoundedText() does in API doc.

Bug: 333414305
Change-Id: I9687dc39cd1fcc5f7d2b961528a7556a1643c9a9
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/118292
Commit-Queue: Tom Sepez <tsepez@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
Reviewed-by: Thomas Sepez <tsepez@google.com>
(cherry picked from commit d6a4b27d80214c0c06d5d8925b21fa26b567851a)
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/118450
Bot-Commit: rubber-stamper@appspot.gserviceaccount.com <rubber-stamper@appspot.gserviceaccount.com>
Commit-Queue: Lei Zhang <thestig@chromium.org>

commit: 7b90b15a21cfdd3d74525616b5de04df94fa0583 [log] [tgz]
author: Tom Sepez <tsepez@chromium.org> Wed Apr 17 20:05:47 2024 +0000
committer: Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com> Wed Apr 17 20:05:47 2024 +0000
tree: 94cbba9300971d280e94e6efc529a7a1e25ca69d
parent: e15b92e443568dac2dde59fe3af1ff6afb4e5662 [diff]
diff --git a/core/fxcrt/widestring.cpp b/core/fxcrt/widestring.cpp
index 84676d5..8ac9328 100644
--- a/core/fxcrt/widestring.cpp
+++ b/core/fxcrt/widestring.cpp

@@ -616,6 +616,30 @@
   return result;
 }
 
+ByteString WideString::ToUCS2LE() const {
+  ByteString result;
+  size_t output_length = 0;
+  {
+    // Scoped so that the span is gone before ReleaseBuffer() below.
+    // Worst case is 2 bytes per wchar_t, plus 2 for the terminator.
+    pdfium::span<uint8_t> buffer =
+        pdfium::as_writable_bytes(result.GetBuffer(GetLength() * 2 + 2));
+    for (wchar_t wc : AsStringView()) {
+#if defined(WCHAR_T_IS_32_BIT)
+      if (pdfium::IsSupplementary(wc)) {
+        continue;  // No single 16-bit form; dropped rather than split.
+      }
+#endif
+      buffer[output_length++] = wc & 0xff;  // Low byte first (little-endian).
+      buffer[output_length++] = wc >> 8;    // Then the high byte.
+    }
+    buffer[output_length++] = 0;  // Two-byte terminator, stored as data.
+    buffer[output_length++] = 0;
+  }
+  result.ReleaseBuffer(output_length);  // Count includes the terminator.
+  return result;
+}
+
 WideString WideString::EncodeEntities() const {
   WideString ret = *this;
   ret.Replace(L"&", L"&amp;");

diff --git a/core/fxcrt/widestring.h b/core/fxcrt/widestring.h
index 4a950ba..f60f105 100644
--- a/core/fxcrt/widestring.h
+++ b/core/fxcrt/widestring.h

@@ -137,10 +137,11 @@
   ByteString ToDefANSI() const;
   ByteString ToUTF8() const;
 
-  // This method will add \0\0 to the end of the string to represent the
-  // wide string terminator. These values are in the string, not just the data,
-  // so GetLength() will include them.
+  // These methods will add \0\0 to the end of the string to represent the
+  // two-byte terminator. These values are part of the string itself, so
+  // GetLength() will include them.
   ByteString ToUTF16LE() const;
+  ByteString ToUCS2LE() const;
 
   // Replace the characters &<>'" with HTML entities.
   WideString EncodeEntities() const;

diff --git a/core/fxcrt/widestring_unittest.cpp b/core/fxcrt/widestring_unittest.cpp
index 7803b53..e93d6c5 100644
--- a/core/fxcrt/widestring_unittest.cpp
+++ b/core/fxcrt/widestring_unittest.cpp

@@ -1291,6 +1291,31 @@
   }
 }
 
+TEST(WideString, ToUCS2LE) {
+  struct UCS2LEEncodeCase {
+    WideString ws;  // Input.
+    ByteString bs;  // Expected bytes, including the two-byte terminator.
+  } const ucs2le_encode_cases[] = {
+      {L"", ByteString("\0\0", 2)},  // Empty input: terminator only.
+      {L"abc", ByteString("a\0b\0c\0\0\0", 8)},
+      {L"abcdef", ByteString("a\0b\0c\0d\0e\0f\0\0\0", 14)},
+      {L"abc\0def", ByteString("a\0b\0c\0\0\0", 8)},  // Ends at embedded NUL.
+      {L"\xaabb\xccdd", ByteString("\xbb\xaa\xdd\xcc\0\0", 6)},  // Low, high.
+      {L"\x3132\x6162", ByteString("\x32\x31\x62\x61\0\0", 6)},
+#if defined(WCHAR_T_IS_32_BIT)
+      {L"🎨", ByteString("\0\0", 2)},  // Non-BMP character is dropped.
+#endif
+  };
+
+  // TODO(tsepez): make safe.
+  UNSAFE_BUFFERS({
+    for (size_t i = 0; i < std::size(ucs2le_encode_cases); ++i) {
+      EXPECT_EQ(ucs2le_encode_cases[i].bs, ucs2le_encode_cases[i].ws.ToUCS2LE())
+          << " for case number " << i;
+    }
+  });
+}
+
 TEST(WideString, EncodeEntities) {
   EXPECT_EQ(WideString(L"Symbols &<>'\".").EncodeEntities(),
             L"Symbols &amp;&lt;&gt;&apos;&quot;.");

diff --git a/fpdfsdk/fpdf_text.cpp b/fpdfsdk/fpdf_text.cpp
index 11663e8..55674c9 100644
--- a/fpdfsdk/fpdf_text.cpp
+++ b/fpdfsdk/fpdf_text.cpp

@@ -7,6 +7,7 @@
 #include "public/fpdf_text.h"
 
 #include <algorithm>
+#include <limits>
 #include <memory>
 #include <vector>
 
@@ -19,14 +20,15 @@
 #include "core/fpdftext/cpdf_textpage.h"
 #include "core/fpdftext/cpdf_textpagefind.h"
 #include "core/fxcrt/check_op.h"
+#include "core/fxcrt/compiler_specific.h"
 #include "core/fxcrt/numerics/safe_conversions.h"
+#include "core/fxcrt/span.h"
+#include "core/fxcrt/span_util.h"
 #include "core/fxcrt/stl_util.h"
 #include "fpdfsdk/cpdfsdk_helpers.h"
 
 namespace {
 
-constexpr size_t kBytesPerCharacter = sizeof(unsigned short);
-
 CPDF_TextPage* GetTextPageForValidIndex(FPDF_TEXTPAGE text_page, int index) {
   if (!text_page || index < 0)
     return nullptr;
@@ -319,34 +321,32 @@
                                                int char_count,
                                                unsigned short* result) {
   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(page);
-  if (!textpage || start_index < 0 || char_count < 0 || !result)
+  if (!textpage || start_index < 0 || char_count < 0 || !result) {
     return 0;
-
+  }
   int char_available = textpage->CountChars() - start_index;
-  if (char_available <= 0)
+  if (char_available <= 0) {
     return 0;
-
+  }
   char_count = std::min(char_count, char_available);
   if (char_count == 0) {
     // Writing out "", which has a character count of 1 due to the NUL.
     *result = '\0';
     return 1;
   }
+  // SAFETY: Required from caller. Public API description states that
+  // `result` must be able to hold `char_count` characters plus a
+  // terminator.
+  CHECK_LT(char_count, std::numeric_limits<int>::max());
+  pdfium::span<unsigned short> result_span =
+      UNSAFE_BUFFERS(pdfium::make_span(result, char_count + 1));
 
-  WideString str = textpage->GetPageText(start_index, char_count);
-
-  if (str.GetLength() > static_cast<size_t>(char_count))
-    str = str.First(static_cast<size_t>(char_count));
-
-  ByteString byte_str = str.ToUTF16LE();
-  size_t byte_str_len = byte_str.GetLength();
-  size_t ret_count = byte_str_len / kBytesPerCharacter;
-
-  // +1 to account for the NUL terminator.
-  DCHECK_LE(ret_count, static_cast<size_t>(char_count) + 1);
-
-  memcpy(result, byte_str.c_str(), byte_str_len);
-  return pdfium::checked_cast<int>(ret_count);
+  // Includes two-byte terminator in string data itself.
+  ByteString str = textpage->GetPageText(start_index, char_count).ToUCS2LE();
+  pdfium::span<const char> str_span = str.AsStringView().span();
+  auto copy_span = fxcrt::reinterpret_span<const unsigned short>(str_span);
+  fxcrt::spancpy(result_span, copy_span);
+  return static_cast<int>(copy_span.size());
 }
 
 FPDF_EXPORT int FPDF_CALLCONV FPDFText_CountRects(FPDF_TEXTPAGE text_page,
@@ -384,22 +384,27 @@
                                                       unsigned short* buffer,
                                                       int buflen) {
   CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
-  if (!textpage)
+  if (!textpage) {
     return 0;
-
+  }
   CFX_FloatRect rect((float)left, (float)bottom, (float)right, (float)top);
-  WideString str = textpage->GetTextByRect(rect);
+  WideString wstr = textpage->GetTextByRect(rect);
+  if (buflen <= 0 || !buffer) {
+    return pdfium::checked_cast<int>(wstr.GetLength());
+  }
 
-  if (buflen <= 0 || !buffer)
-    return pdfium::checked_cast<int>(str.GetLength());
+  // SAFETY: Required from caller. Public API states that buflen
+  // describes the number of values buffer can hold.
+  const auto buffer_span = UNSAFE_BUFFERS(pdfium::make_span(buffer, buflen));
 
-  ByteString cbUTF16Str = str.ToUTF16LE();
-  int len = pdfium::checked_cast<int>(cbUTF16Str.GetLength()) /
-            sizeof(unsigned short);
-  int size = buflen > len ? len : buflen;
-  memcpy(buffer, cbUTF16Str.c_str(), size * sizeof(unsigned short));
-  cbUTF16Str.ReleaseBuffer(size * sizeof(unsigned short));
-  return size;
+  ByteString str = wstr.ToUTF16LE();
+  pdfium::span<const char> str_span = str.span();
+  auto copy_span = fxcrt::reinterpret_span<const unsigned short>(str_span);
+  if (copy_span.size() > buffer_span.size()) {
+    copy_span = copy_span.first(buffer_span.size());
+  }
+  fxcrt::spancpy(buffer_span, copy_span);
+  return pdfium::checked_cast<int>(copy_span.size());
 }
 
 FPDF_EXPORT FPDF_SCHHANDLE FPDF_CALLCONV

diff --git a/public/fpdf_text.h b/public/fpdf_text.h
index bd25ba4..a3d666b 100644
--- a/public/fpdf_text.h
+++ b/public/fpdf_text.h

@@ -376,17 +376,16 @@
 //          text_page   -   Handle to a text page information structure.
 //                          Returned by FPDFText_LoadPage function.
 //          start_index -   Index for the start characters.
-//          count       -   Number of characters to be extracted.
+//          count       -   Number of UCS-2 values to be extracted.
 //          result      -   A buffer (allocated by application) receiving the
-//                          extracted unicodes. The size of the buffer must be
-//                          able to hold the number of characters plus a
-//                          terminator.
+//                          extracted UCS-2 values. The buffer must be able to
+//                          hold `count` UCS-2 values plus a terminator.
 // Return Value:
 //          Number of characters written into the result buffer, including the
 //          trailing terminator.
 // Comments:
-//          This function ignores characters without unicode information.
-//          It returns all characters on the page, even those that are not
+//          This function ignores characters without UCS-2 representations.
+//          It considers all characters on the page, even those that are not
 //          visible when the page has a cropbox. To filter out the characters
 //          outside of the cropbox, use FPDF_GetPageBoundingBox() and
 //          FPDFText_GetCharBox().
@@ -456,20 +455,20 @@
 //          top         -   Top boundary.
 //          right       -   Right boundary.
 //          bottom      -   Bottom boundary.
-//          buffer      -   A unicode buffer.
-//          buflen      -   Number of characters (not bytes) for the buffer,
-//                          excluding an additional terminator.
+//          buffer      -   Caller-allocated buffer to receive UTF-16 values.
+//          buflen      -   Number of UTF-16 values (not bytes) that `buffer`
+//                          is capable of holding.
 // Return Value:
-//          If buffer is NULL or buflen is zero, return number of characters
-//          (not bytes) of text present within the rectangle, excluding a
-//          terminating NUL. Generally you should pass a buffer at least one
+//          If buffer is NULL or buflen is zero, return number of UTF-16
+//          values (not bytes) of text present within the rectangle, excluding
+//          a terminating NUL. Generally you should pass a buffer at least one
 //          larger than this if you want a terminating NUL, which will be
-//          provided if space is available. Otherwise, return number of
-//          characters copied into the buffer, including the terminating NUL
-//          when space for it is available.
+//          provided if space is available. Otherwise, return number of UTF-16
+//          values copied into the buffer, including the terminating NUL when
+//          space for it is available.
 // Comment:
 //          If the buffer is too small, as much text as will fit is copied into
-//          it.
+//          it. May return a split surrogate in that case.
 //
 FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetBoundedText(FPDF_TEXTPAGE text_page,
                                                       double left,
commit	7b90b15a21cfdd3d74525616b5de04df94fa0583	[log] [tgz]
author	Tom Sepez <tsepez@chromium.org>	Wed Apr 17 20:05:47 2024 +0000
committer	Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com>	Wed Apr 17 20:05:47 2024 +0000
tree	94cbba9300971d280e94e6efc529a7a1e25ca69d
parent	e15b92e443568dac2dde59fe3af1ff6afb4e5662 [diff]