Make WideString::ToUTF16LE() do surrogate splitting if needed
Should make text strings with characters outside of the BMP show up
correctly on non-Windows platforms.
(Because e.g. PDFiumEngine::TraverseBookmarks() calls
FPDFBookmark_GetTitle() expecting to get a std::u16string, and
FPDFBookmark_GetTitle() calls Utf16EncodeMaybeCopyAndReturnLength()
which calls WideString::ToUTF16LE() which currently chops off the
two upper bytes of wchar_t on platforms where wchar_t is 4 bytes --
i.e. on non-Windows.)
It's still a bit unfortunate that:
* ToUTF16LE() is a method on WideString, since that means we can't
use it when we only have a WideStringView, such as in PDF_EncodeText()
(which wants UTF16BE, but we could call UTF16LE and swap bytes).
We might want to make the method a static function on ByteString
instead: ByteString::UTF16LEFromWide(WideStringView)
* ToUTF16LE() adds a zero-terminator, making it nonsymmetrical with
FromUTF16LE(). Some places (AsFPDFWideString()) want to convert the
result to an FPDF_WIDESTRING, which is nul-terminated. Maybe there
should be an arg for opting in to the zero termination.
But again, that's for separate CLs.
Bug: pdfium:2105
Change-Id: I7aa1f1aa96f7b490c0c4435577fffe82a91809d9
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/114130
Commit-Queue: Tom Sepez <tsepez@chromium.org>
Auto-Submit: Nico Weber <thakis@chromium.org>
Commit-Queue: Nico Weber <thakis@chromium.org>
Reviewed-by: Tom Sepez <tsepez@chromium.org>
diff --git a/core/fxcrt/widestring.cpp b/core/fxcrt/widestring.cpp
index bfc9555..aa01fc3 100644
--- a/core/fxcrt/widestring.cpp
+++ b/core/fxcrt/widestring.cpp
@@ -771,17 +771,36 @@
ByteString result;
size_t len = m_pData->m_nDataLength;
+ size_t output_length = 0;
{
// Span's lifetime must end before ReleaseBuffer() below.
- pdfium::span<char> buffer = result.GetBuffer(len * 2 + 2);
+#if defined(WCHAR_T_IS_32_BIT)
+ // 2 or 4 bytes required per UTF-32 code unit.
+ pdfium::span<uint8_t> buffer =
+ pdfium::as_writable_bytes(result.GetBuffer(len * 4 + 2));
+#else
+ // 2 bytes required per UTF-16 code unit.
+ pdfium::span<uint8_t> buffer =
+ pdfium::as_writable_bytes(result.GetBuffer(len * 2 + 2));
+#endif
for (size_t i = 0; i < len; i++) {
- buffer[i * 2] = m_pData->m_String[i] & 0xff;
- buffer[i * 2 + 1] = m_pData->m_String[i] >> 8;
+#if defined(WCHAR_T_IS_32_BIT)
+ if (pdfium::IsSupplementary(m_pData->m_String[i])) {
+ pdfium::SurrogatePair pair(m_pData->m_String[i]);
+ buffer[output_length++] = pair.high() & 0xff;
+ buffer[output_length++] = pair.high() >> 8;
+ buffer[output_length++] = pair.low() & 0xff;
+ buffer[output_length++] = pair.low() >> 8;
+ continue;
+ }
+#endif // defined(WCHAR_T_IS_32_BIT)
+ buffer[output_length++] = m_pData->m_String[i] & 0xff;
+ buffer[output_length++] = m_pData->m_String[i] >> 8;
}
- buffer[len * 2] = 0;
- buffer[len * 2 + 1] = 0;
+ buffer[output_length++] = 0;
+ buffer[output_length++] = 0;
}
- result.ReleaseBuffer(len * 2 + 2);
+ result.ReleaseBuffer(output_length);
return result;
}
diff --git a/core/fxcrt/widestring_unittest.cpp b/core/fxcrt/widestring_unittest.cpp
index f60e11f..39d734b 100644
--- a/core/fxcrt/widestring_unittest.cpp
+++ b/core/fxcrt/widestring_unittest.cpp
@@ -1279,6 +1279,7 @@
{L"abc\0def", ByteString("a\0b\0c\0\0\0", 8)},
{L"\xaabb\xccdd", ByteString("\xbb\xaa\xdd\xcc\0\0", 6)},
{L"\x3132\x6162", ByteString("\x32\x31\x62\x61\0\0", 6)},
+ {L"🎨", ByteString("\x3C\xD8\xA8\xDF\0\0", 6)},
};
for (size_t i = 0; i < std::size(utf16le_encode_cases); ++i) {
diff --git a/fpdfsdk/fpdf_text.cpp b/fpdfsdk/fpdf_text.cpp
index f1b193d..194a602 100644
--- a/fpdfsdk/fpdf_text.cpp
+++ b/fpdfsdk/fpdf_text.cpp
@@ -338,8 +338,6 @@
if (str.GetLength() > static_cast<size_t>(char_count))
str = str.First(static_cast<size_t>(char_count));
- // UFT16LE_Encode doesn't handle surrogate pairs properly, so it is expected
- // the number of items to stay the same.
ByteString byte_str = str.ToUTF16LE();
size_t byte_str_len = byte_str.GetLength();
size_t ret_count = byte_str_len / kBytesPerCharacter;