M124: Spanify FPDFText_GetText() and FPDFText_GetBoundedText().
Restore the old behavior of FPDFText_GetText(), whose result formerly
did not undergo surrogate conversion.
-- restrict FPDFText_GetText() to UCS-2 per new API doc.
-- properly describe what FPDFText_GetBoundedText() does in API doc.
Bug: 333414305
Change-Id: I9687dc39cd1fcc5f7d2b961528a7556a1643c9a9
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/118292
Commit-Queue: Tom Sepez <tsepez@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
Reviewed-by: Thomas Sepez <tsepez@google.com>
(cherry picked from commit d6a4b27d80214c0c06d5d8925b21fa26b567851a)
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/118450
Bot-Commit: rubber-stamper@appspot.gserviceaccount.com <rubber-stamper@appspot.gserviceaccount.com>
Commit-Queue: Lei Zhang <thestig@chromium.org>
diff --git a/core/fxcrt/widestring.cpp b/core/fxcrt/widestring.cpp
index 84676d5..8ac9328 100644
--- a/core/fxcrt/widestring.cpp
+++ b/core/fxcrt/widestring.cpp
@@ -616,6 +616,30 @@
return result;
}
+ByteString WideString::ToUCS2LE() const {  // UCS-2 little-endian bytes, \0\0-terminated.
+ ByteString result;
+ size_t output_length = 0;  // Number of bytes written into `buffer` so far.
+ {
+ // Span's lifetime must end before ReleaseBuffer() below.
+ // 2 bytes required per UCS-2 code unit, plus 2 bytes for the terminator.
+ pdfium::span<uint8_t> buffer =
+ pdfium::as_writable_bytes(result.GetBuffer(GetLength() * 2 + 2));
+ for (wchar_t wc : AsStringView()) {
+#if defined(WCHAR_T_IS_32_BIT)
+ if (pdfium::IsSupplementary(wc)) {
+ continue;  // Supplementary values are dropped; nothing is written for them.
+ }
+#endif
+ buffer[output_length++] = wc & 0xff;  // Low byte first (little-endian).
+ buffer[output_length++] = wc >> 8;  // Then the high byte.
+ }
+ buffer[output_length++] = 0;  // Two-byte terminator, stored as part of the data.
+ buffer[output_length++] = 0;
+ }
+ result.ReleaseBuffer(output_length);  // May be smaller than the size requested above if values were skipped.
+ return result;
+}
+
WideString WideString::EncodeEntities() const {
WideString ret = *this;
ret.Replace(L"&", L"&amp;");
diff --git a/core/fxcrt/widestring.h b/core/fxcrt/widestring.h
index 4a950ba..f60f105 100644
--- a/core/fxcrt/widestring.h
+++ b/core/fxcrt/widestring.h
@@ -137,10 +137,11 @@
ByteString ToDefANSI() const;
ByteString ToUTF8() const;
- // This method will add \0\0 to the end of the string to represent the
- // wide string terminator. These values are in the string, not just the data,
- // so GetLength() will include them.
+ // These methods will add \0\0 to the end of the string to represent the
+ // two-byte terminator. These values are part of the string itself, so
+ // GetLength() will include them.
ByteString ToUTF16LE() const;
+ ByteString ToUCS2LE() const;
// Replace the characters &<>'" with HTML entities.
WideString EncodeEntities() const;
diff --git a/core/fxcrt/widestring_unittest.cpp b/core/fxcrt/widestring_unittest.cpp
index 7803b53..e93d6c5 100644
--- a/core/fxcrt/widestring_unittest.cpp
+++ b/core/fxcrt/widestring_unittest.cpp
@@ -1291,6 +1291,31 @@
}
}
+TEST(WideString, ToUCS2LE) {  // Table-driven check of the bytes produced by ToUCS2LE().
+ struct UCS2LEEncodeCase {
+ WideString ws;  // Input wide string.
+ ByteString bs;  // Expected output bytes, two-byte terminator included.
+ } const ucs2le_encode_cases[] = {
+ {L"", ByteString("\0\0", 2)},  // Empty input yields only the terminator.
+ {L"abc", ByteString("a\0b\0c\0\0\0", 8)},
+ {L"abcdef", ByteString("a\0b\0c\0d\0e\0f\0\0\0", 14)},
+ {L"abc\0def", ByteString("a\0b\0c\0\0\0", 8)},  // Expected output has nothing after the embedded NUL.
+ {L"\xaabb\xccdd", ByteString("\xbb\xaa\xdd\xcc\0\0", 6)},  // Low byte precedes high byte.
+ {L"\x3132\x6162", ByteString("\x32\x31\x62\x61\0\0", 6)},
+#if defined(WCHAR_T_IS_32_BIT)
+ {L"🎨", ByteString("\0\0", 2)},  // With 32-bit wchar_t, only the terminator is expected.
+#endif
+ };
+
+ // TODO(tsepez): make safe.
+ UNSAFE_BUFFERS({
+ for (size_t i = 0; i < std::size(ucs2le_encode_cases); ++i) {
+ EXPECT_EQ(ucs2le_encode_cases[i].bs, ucs2le_encode_cases[i].ws.ToUCS2LE())
+ << " for case number " << i;  // Identifies the failing table row.
+ }
+ });
+}
+
TEST(WideString, EncodeEntities) {
EXPECT_EQ(WideString(L"Symbols &<>'\".").EncodeEntities(),
L"Symbols &amp;&lt;&gt;&apos;&quot;.");
diff --git a/fpdfsdk/fpdf_text.cpp b/fpdfsdk/fpdf_text.cpp
index 11663e8..55674c9 100644
--- a/fpdfsdk/fpdf_text.cpp
+++ b/fpdfsdk/fpdf_text.cpp
@@ -7,6 +7,7 @@
#include "public/fpdf_text.h"
#include <algorithm>
+#include <limits>
#include <memory>
#include <vector>
@@ -19,14 +20,15 @@
#include "core/fpdftext/cpdf_textpage.h"
#include "core/fpdftext/cpdf_textpagefind.h"
#include "core/fxcrt/check_op.h"
+#include "core/fxcrt/compiler_specific.h"
#include "core/fxcrt/numerics/safe_conversions.h"
+#include "core/fxcrt/span.h"
+#include "core/fxcrt/span_util.h"
#include "core/fxcrt/stl_util.h"
#include "fpdfsdk/cpdfsdk_helpers.h"
namespace {
-constexpr size_t kBytesPerCharacter = sizeof(unsigned short);
-
CPDF_TextPage* GetTextPageForValidIndex(FPDF_TEXTPAGE text_page, int index) {
if (!text_page || index < 0)
return nullptr;
@@ -319,34 +321,32 @@
int char_count,
unsigned short* result) {
CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(page);
- if (!textpage || start_index < 0 || char_count < 0 || !result)
+ if (!textpage || start_index < 0 || char_count < 0 || !result) {
return 0;
-
+ }
int char_available = textpage->CountChars() - start_index;
- if (char_available <= 0)
+ if (char_available <= 0) {
return 0;
-
+ }
char_count = std::min(char_count, char_available);
if (char_count == 0) {
// Writing out "", which has a character count of 1 due to the NUL.
*result = '\0';
return 1;
}
+ // SAFETY: Required from caller. Public API description states that
+ // `result` must be able to hold `char_count` characters plus a
+ // terminator.
+ CHECK_LT(char_count, std::numeric_limits<int>::max());
+ pdfium::span<unsigned short> result_span =
+ UNSAFE_BUFFERS(pdfium::make_span(result, char_count + 1));
- WideString str = textpage->GetPageText(start_index, char_count);
-
- if (str.GetLength() > static_cast<size_t>(char_count))
- str = str.First(static_cast<size_t>(char_count));
-
- ByteString byte_str = str.ToUTF16LE();
- size_t byte_str_len = byte_str.GetLength();
- size_t ret_count = byte_str_len / kBytesPerCharacter;
-
- // +1 to account for the NUL terminator.
- DCHECK_LE(ret_count, static_cast<size_t>(char_count) + 1);
-
- memcpy(result, byte_str.c_str(), byte_str_len);
- return pdfium::checked_cast<int>(ret_count);
+ // Includes two-byte terminator in string data itself.
+ ByteString str = textpage->GetPageText(start_index, char_count).ToUCS2LE();
+ pdfium::span<const char> str_span = str.AsStringView().span();
+ auto copy_span = fxcrt::reinterpret_span<const unsigned short>(str_span);
+ fxcrt::spancpy(result_span, copy_span);
+ return static_cast<int>(copy_span.size());
}
FPDF_EXPORT int FPDF_CALLCONV FPDFText_CountRects(FPDF_TEXTPAGE text_page,
@@ -384,22 +384,27 @@
unsigned short* buffer,
int buflen) {
CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(text_page);
- if (!textpage)
+ if (!textpage) {
return 0;
-
+ }
CFX_FloatRect rect((float)left, (float)bottom, (float)right, (float)top);
- WideString str = textpage->GetTextByRect(rect);
+ WideString wstr = textpage->GetTextByRect(rect);
+ if (buflen <= 0 || !buffer) {
+ return pdfium::checked_cast<int>(wstr.GetLength());
+ }
- if (buflen <= 0 || !buffer)
- return pdfium::checked_cast<int>(str.GetLength());
+ // SAFETY: Required from caller. Public API states that buflen
+ // describes the number of values buffer can hold.
+ const auto buffer_span = UNSAFE_BUFFERS(pdfium::make_span(buffer, buflen));
- ByteString cbUTF16Str = str.ToUTF16LE();
- int len = pdfium::checked_cast<int>(cbUTF16Str.GetLength()) /
- sizeof(unsigned short);
- int size = buflen > len ? len : buflen;
- memcpy(buffer, cbUTF16Str.c_str(), size * sizeof(unsigned short));
- cbUTF16Str.ReleaseBuffer(size * sizeof(unsigned short));
- return size;
+ ByteString str = wstr.ToUTF16LE();
+ pdfium::span<const char> str_span = str.span();
+ auto copy_span = fxcrt::reinterpret_span<const unsigned short>(str_span);
+ if (copy_span.size() > buffer_span.size()) {
+ copy_span = copy_span.first(buffer_span.size());
+ }
+ fxcrt::spancpy(buffer_span, copy_span);
+ return pdfium::checked_cast<int>(copy_span.size());
}
FPDF_EXPORT FPDF_SCHHANDLE FPDF_CALLCONV
diff --git a/public/fpdf_text.h b/public/fpdf_text.h
index bd25ba4..a3d666b 100644
--- a/public/fpdf_text.h
+++ b/public/fpdf_text.h
@@ -376,17 +376,16 @@
// text_page - Handle to a text page information structure.
// Returned by FPDFText_LoadPage function.
// start_index - Index for the start characters.
-// count - Number of characters to be extracted.
+// count - Number of UCS-2 values to be extracted.
// result - A buffer (allocated by application) receiving the
-// extracted unicodes. The size of the buffer must be
-// able to hold the number of characters plus a
-// terminator.
+// extracted UCS-2 values. The buffer must be able to
+// hold `count` UCS-2 values plus a terminator.
// Return Value:
// Number of characters written into the result buffer, including the
// trailing terminator.
// Comments:
-// This function ignores characters without unicode information.
-// It returns all characters on the page, even those that are not
+// This function ignores characters without UCS-2 representations.
+// It considers all characters on the page, even those that are not
// visible when the page has a cropbox. To filter out the characters
// outside of the cropbox, use FPDF_GetPageBoundingBox() and
// FPDFText_GetCharBox().
@@ -456,20 +455,20 @@
// top - Top boundary.
// right - Right boundary.
// bottom - Bottom boundary.
-// buffer - A unicode buffer.
-// buflen - Number of characters (not bytes) for the buffer,
-// excluding an additional terminator.
+// buffer - Caller-allocated buffer to receive UTF-16 values.
+// buflen - Number of UTF-16 values (not bytes) that `buffer`
+// is capable of holding.
// Return Value:
-// If buffer is NULL or buflen is zero, return number of characters
-// (not bytes) of text present within the rectangle, excluding a
-// terminating NUL. Generally you should pass a buffer at least one
+// If buffer is NULL or buflen is zero, return number of UTF-16
+// values (not bytes) of text present within the rectangle, excluding
+// a terminating NUL. Generally you should pass a buffer at least one
// larger than this if you want a terminating NUL, which will be
-// provided if space is available. Otherwise, return number of
-// characters copied into the buffer, including the terminating NUL
-// when space for it is available.
+// provided if space is available. Otherwise, return number of UTF-16
+// values copied into the buffer, including the terminating NUL when
+// space for it is available.
// Comment:
// If the buffer is too small, as much text as will fit is copied into
-// it.
+// it. May return a split surrogate in that case.
//
FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetBoundedText(FPDF_TEXTPAGE text_page,
double left,