Make WideString's FromUTF16BE / FromUTF16LE do surrogate fusing
...and use them in PDF_DecodeText().
No behavior change for PDF_DecodeText().
Outside of PDF_DecodeText(), these functions are mostly used on Windows
(where this change is a no-op since wchar_t is 2 bytes there) and in
fuzzers, so this generally shouldn't have a big effect.
This is a bit weird since WideString::ToUTF16LE() doesn't create
surrogates yet, so things don't round-trip cleanly through
FromUTF16LE().ToUTF16LE(). But that's not a new problem since
PDF_DecodeText() created fused surrogates before this as well,
and FX_UTF8Decode() also creates WideStrings with characters outside
the BMP. (ToUTF16LE() also appends a zero terminator, which is another
obstacle to a clean round trip.)
It's also strange that fx_string.h contains UTF-8 conversion functions
in addition to WideString::FromUTF8(). They operate on WideStringViews.
FX_UTF8Decode() should arguably be replaced by WideString::FromUTF8()
everywhere, but FX_UTF8Encode() takes a WideStringView -- so we may
want to create FX_UTF16LEEncode() and have WideString::ToUTF16LE() call
it. Or maybe FX_UTF8Encode() should become a method on
ByteString -- ByteString::UTF8FromWide() or something.
But that's all for other CLs. This one seems like a small,
self-contained step forward.
Change-Id: I13afb5f66df6f553358cc03ccfd56a78e8e624c3
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/114030
Commit-Queue: Nico Weber <thakis@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
Auto-Submit: Nico Weber <thakis@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
diff --git a/core/fpdfapi/parser/fpdf_parser_decode.cpp b/core/fpdfapi/parser/fpdf_parser_decode.cpp
index 6412a87..29b2eb6 100644
--- a/core/fpdfapi/parser/fpdf_parser_decode.cpp
+++ b/core/fpdfapi/parser/fpdf_parser_decode.cpp
@@ -466,23 +466,6 @@
return true;
}
-#if defined(WCHAR_T_IS_32_BIT)
-static size_t FuseSurrogates(pdfium::span<wchar_t> s, size_t n) {
- size_t dest_pos = 0;
- for (size_t i = 0; i < n; ++i) {
- // TODO(crbug.com/pdfium/2031): Always use UTF-16.
- if (pdfium::IsHighSurrogate(s[i]) && i + 1 < n &&
- pdfium::IsLowSurrogate(s[i + 1])) {
- s[dest_pos++] = pdfium::SurrogatePair(s[i], s[i + 1]).ToCodePoint();
- ++i;
- continue;
- }
- s[dest_pos++] = s[i];
- }
- return dest_pos;
-}
-#endif // defined(WCHAR_T_IS_UTF32)
-
static size_t StripLanguageCodes(pdfium::span<wchar_t> s, size_t n) {
size_t dest_pos = 0;
for (size_t i = 0; i < n; ++i) {
@@ -504,24 +487,16 @@
WideString result;
if (span.size() >= 2 && ((span[0] == 0xfe && span[1] == 0xff) ||
(span[0] == 0xff && span[1] == 0xfe))) {
- pdfium::span<wchar_t> dest_buf = result.GetBuffer((span.size() - 2) / 2);
if (span[0] == 0xfe) {
- for (size_t i = 2; i < span.size() - 1; i += 2) {
- dest_buf[dest_pos++] = span[i] << 8 | span[i + 1];
- }
+ result = WideString::FromUTF16BE(span.subspan(2));
} else {
- for (size_t i = 2; i < span.size() - 1; i += 2) {
- dest_buf[dest_pos++] = span[i + 1] << 8 | span[i];
- }
+ result = WideString::FromUTF16LE(span.subspan(2));
}
-#if defined(WCHAR_T_IS_32_BIT)
- dest_pos = FuseSurrogates(dest_buf, dest_pos);
-#endif
-
- dest_pos = StripLanguageCodes(dest_buf, dest_pos);
+ pdfium::span<wchar_t> dest_buf = result.GetBuffer(result.GetLength());
+ dest_pos = StripLanguageCodes(dest_buf, result.GetLength());
} else if (span.size() >= 3 && span[0] == 0xef && span[1] == 0xbb &&
span[2] == 0xbf) {
- result = FX_UTF8Decode(span.subspan(3));
+ result = WideString::FromUTF8(span.subspan(3));
pdfium::span<wchar_t> dest_buf = result.GetBuffer(result.GetLength());
dest_pos = StripLanguageCodes(dest_buf, result.GetLength());
} else {
diff --git a/core/fxcrt/widestring.cpp b/core/fxcrt/widestring.cpp
index 2ad7b5d..dca0a48 100644
--- a/core/fxcrt/widestring.cpp
+++ b/core/fxcrt/widestring.cpp
@@ -18,6 +18,7 @@
#include "core/fxcrt/fx_safe_types.h"
#include "core/fxcrt/fx_system.h"
#include "core/fxcrt/string_pool_template.h"
+#include "core/fxcrt/utf16.h"
#include "third_party/base/check.h"
#include "third_party/base/check_op.h"
#include "third_party/base/numerics/safe_math.h"
@@ -33,6 +34,23 @@
namespace {
+#if defined(WCHAR_T_IS_32_BIT)
+size_t FuseSurrogates(pdfium::span<wchar_t> s, size_t n) {
+ size_t dest_pos = 0;
+ for (size_t i = 0; i < n; ++i) {
+ // TODO(crbug.com/pdfium/2031): Always use UTF-16.
+ if (pdfium::IsHighSurrogate(s[i]) && i + 1 < n &&
+ pdfium::IsLowSurrogate(s[i + 1])) {
+ s[dest_pos++] = pdfium::SurrogatePair(s[i], s[i + 1]).ToCodePoint();
+ ++i;
+ continue;
+ }
+ s[dest_pos++] = s[i];
+ }
+ return dest_pos;
+}
+#endif // defined(WCHAR_T_IS_32_BIT)
+
constexpr wchar_t kWideTrimChars[] = L"\x09\x0a\x0b\x0c\x0d\x20";
const wchar_t* FX_wcsstr(const wchar_t* haystack,
@@ -970,6 +988,10 @@
for (size_t i = 0; i < data.size() - 1; i += 2) {
buf[length++] = data[i] | data[i + 1] << 8;
}
+
+#if defined(WCHAR_T_IS_32_BIT)
+ length = FuseSurrogates(buf, length);
+#endif
}
result.ReleaseBuffer(length);
return result;
@@ -988,6 +1010,10 @@
for (size_t i = 0; i < data.size() - 1; i += 2) {
buf[length++] = data[i] << 8 | data[i + 1];
}
+
+#if defined(WCHAR_T_IS_32_BIT)
+ length = FuseSurrogates(buf, length);
+#endif
}
result.ReleaseBuffer(length);
return result;
diff --git a/core/fxcrt/widestring_unittest.cpp b/core/fxcrt/widestring_unittest.cpp
index 947926c..5c555ca 100644
--- a/core/fxcrt/widestring_unittest.cpp
+++ b/core/fxcrt/widestring_unittest.cpp
@@ -1136,6 +1136,44 @@
EXPECT_EQ(0, iter - multi_str.rbegin());
}
+TEST(WideString, FromUTF16BE) {
+ struct UTF16BEDecodeCase {
+ ByteString in;
+ WideString out;
+ } const utf16be_decode_cases[] = {
+ {"", L""},
+ {ByteString("\0a\0b\0c", 6), L"abc"},
+ {ByteString("\0a\0b\0c\0\0\0d\0e\0f", 14), WideString(L"abc\0def", 7)},
+ {ByteString(" &", 2), L"…"},
+ {ByteString("\xD8\x3C\xDF\xA8", 4), L"🎨"},
+ };
+
+ for (size_t i = 0; i < std::size(utf16be_decode_cases); ++i) {
+ EXPECT_EQ(WideString::FromUTF16BE(utf16be_decode_cases[i].in.raw_span()),
+ utf16be_decode_cases[i].out)
+ << " for case number " << i;
+ }
+}
+
+TEST(WideString, FromUTF16LE) {
+ struct UTF16LEDecodeCase {
+ ByteString in;
+ WideString out;
+ } const utf16le_decode_cases[] = {
+ {"", L""},
+ {ByteString("a\0b\0c\0", 6), L"abc"},
+ {ByteString("a\0b\0c\0\0\0d\0e\0f\0", 14), WideString(L"abc\0def", 7)},
+ {ByteString("& ", 2), L"…"},
+ {ByteString("\x3C\xD8\xA8\xDF", 4), L"🎨"},
+ };
+
+ for (size_t i = 0; i < std::size(utf16le_decode_cases); ++i) {
+ EXPECT_EQ(WideString::FromUTF16LE(utf16le_decode_cases[i].in.raw_span()),
+ utf16le_decode_cases[i].out)
+ << " for case number " << i;
+ }
+}
+
TEST(WideString, ToUTF16LE) {
struct UTF16LEEncodeCase {
WideString ws;