Handle non-BMP code points for UTF-16 wchar_t
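Correctly represents supplementary (non-BMP) code points in WideString as
UTF-16 surrogate pairs when wchar_t is 16 bits (as on Windows) rather than
32 bits (as on Linux): the UTF-8 decoder now emits a surrogate pair for such
code points, and the UTF-8 encoder recombines surrogate pairs before encoding.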
Bug: pdfium:2029
Change-Id: I662ac7fb08bd48267e32da5f36fd1c9c2bb70717
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/107070
Reviewed-by: Nigi <nigi@chromium.org>
Commit-Queue: K. Moon <kmoon@chromium.org>
diff --git a/core/fxcrt/cfx_utf8decoder.cpp b/core/fxcrt/cfx_utf8decoder.cpp
index 276e186..9661745 100644
--- a/core/fxcrt/cfx_utf8decoder.cpp
+++ b/core/fxcrt/cfx_utf8decoder.cpp
@@ -10,6 +10,8 @@
#include <utility>
+#include "build/build_config.h"
+
CFX_UTF8Decoder::CFX_UTF8Decoder(ByteStringView input) {
int remaining = 0;
char32_t code_point = 0;
@@ -54,5 +56,16 @@
return;
}
+#if defined(WCHAR_T_IS_UTF16)
+ if (code_point < 0x10000) {
+ buffer_ += static_cast<wchar_t>(code_point);
+ } else {
+ // Encode as UTF-16 surrogate pair.
+ code_point -= 0x10000;
+ buffer_ += 0xd800 | (code_point >> 10);
+ buffer_ += 0xdc00 | (code_point & 0x3ff);
+ }
+#else
buffer_ += static_cast<wchar_t>(code_point);
+#endif // defined(WCHAR_T_IS_UTF16)
}
diff --git a/core/fxcrt/cfx_utf8encoder.cpp b/core/fxcrt/cfx_utf8encoder.cpp
index 117f3f7..e0b7ee5 100644
--- a/core/fxcrt/cfx_utf8encoder.cpp
+++ b/core/fxcrt/cfx_utf8encoder.cpp
@@ -8,12 +8,36 @@
#include <stdint.h>
+#include "build/build_config.h"
+
CFX_UTF8Encoder::CFX_UTF8Encoder() = default;
CFX_UTF8Encoder::~CFX_UTF8Encoder() = default;
void CFX_UTF8Encoder::Input(wchar_t code_unit) {
- char32_t code_point = static_cast<char32_t>(code_unit);
+#if defined(WCHAR_T_IS_UTF16)  // Surrogate pairs are reassembled before being appended.
+ if (code_unit >= 0xd800 && code_unit < 0xdc00) {
+ // High surrogate: stash it (replacing any pending one) until the next unit arrives.
+ high_surrogate_ = code_unit;
+ } else if (code_unit >= 0xdc00 && code_unit <= 0xdfff) {
+ // Low surrogate: combine with the pending high surrogate; ignored if none is pending.
+ if (high_surrogate_) {
+ char32_t code_point = code_unit & 0x3ff;  // Low 10 bits come from the low surrogate.
+ code_point |= (high_surrogate_ & 0x3ff) << 10;  // High 10 bits come from the high surrogate.
+ code_point += 0x10000;  // Offset of the first supplementary code point.
+ high_surrogate_ = 0;  // The pair has been consumed.
+ AppendCodePoint(code_point);
+ }
+ } else {
+ high_surrogate_ = 0;  // A non-surrogate unit discards any unpaired high surrogate.
+ AppendCodePoint(code_unit);
+ }
+#else
+ AppendCodePoint(code_unit);  // WCHAR_T_IS_UTF16 not defined: the unit is passed on as a code point.
+#endif // defined(WCHAR_T_IS_UTF16)
+}
+
+void CFX_UTF8Encoder::AppendCodePoint(char32_t code_point) {
if (code_point > 0x10ffff) {
// Invalid code point above U+10FFFF.
return;
diff --git a/core/fxcrt/cfx_utf8encoder.h b/core/fxcrt/cfx_utf8encoder.h
index a39f0cb..71b9ac3 100644
--- a/core/fxcrt/cfx_utf8encoder.h
+++ b/core/fxcrt/cfx_utf8encoder.h
@@ -7,6 +7,7 @@
#ifndef CORE_FXCRT_CFX_UTF8ENCODER_H_
#define CORE_FXCRT_CFX_UTF8ENCODER_H_
+#include "build/build_config.h"
#include "core/fxcrt/data_vector.h"
#include "core/fxcrt/string_view_template.h"
@@ -26,7 +27,13 @@
}
private:
+ void AppendCodePoint(char32_t code_point);
+
DataVector<char> buffer_;
+
+#if defined(WCHAR_T_IS_UTF16)
+ char16_t high_surrogate_ = 0;
+#endif // defined(WCHAR_T_IS_UTF16)
};
#endif // CORE_FXCRT_CFX_UTF8ENCODER_H_
diff --git a/core/fxcrt/fx_string_unittest.cpp b/core/fxcrt/fx_string_unittest.cpp
index b3f2864..34ffcee 100644
--- a/core/fxcrt/fx_string_unittest.cpp
+++ b/core/fxcrt/fx_string_unittest.cpp
@@ -4,6 +4,7 @@
#include <limits>
+#include "build/build_config.h"
#include "core/fxcrt/fx_string.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "third_party/base/span.h"
@@ -41,6 +42,25 @@
L"y"));
}
+TEST(fxstring, FXUTF8EncodeSupplementary) {  // Code points above U+FFFF must encode as 4-byte UTF-8.
+ EXPECT_EQ(
+ "\xf0\x90\x80\x80"  // U+10000, the lowest supplementary code point.
+ "🎨"  // U+1F3A8.
+ "\xf4\x8f\xbf\xbf",  // U+10FFFF, the highest code point.
+ FX_UTF8Encode(L"\U00010000"
+ L"\U0001f3a8"
+ L"\U0010ffff"));
+}
+
+#if defined(WCHAR_T_IS_UTF16)
+TEST(fxstring, FXUTF8EncodeSurrogateErrorRecovery) {  // Unpaired surrogates are expected to yield no output.
+ EXPECT_EQ("()", FX_UTF8Encode(L"(\xd800)")) << "High";  // Lone high surrogate.
+ EXPECT_EQ("()", FX_UTF8Encode(L"(\xdc00)")) << "Low";  // Lone low surrogate.
+ EXPECT_EQ("(🎨)", FX_UTF8Encode(L"(\xd800\xd83c\xdfa8)")) << "High-high";  // Stray high, then the pair D83C DFA8 (U+1F3A8).
+ EXPECT_EQ("(🎨)", FX_UTF8Encode(L"(\xd83c\xdfa8\xdc00)")) << "Low-low";  // The pair D83C DFA8 (U+1F3A8), then a stray low.
+}
+#endif // defined(WCHAR_T_IS_UTF16)
+
TEST(fxstring, FXUTF8Decode) {
EXPECT_EQ(L"", FX_UTF8Decode(ByteStringView()));
EXPECT_EQ(
@@ -62,6 +82,16 @@
"y"));
}
+TEST(fxstring, FXUTF8DecodeSupplementary) {  // 4-byte UTF-8 sequences must decode to code points above U+FFFF.
+ EXPECT_EQ(
+ L"\U00010000"
+ L"\U0001f3a8"
+ L"\U0010ffff",
+ FX_UTF8Decode("\xf0\x90\x80\x80"  // U+10000, the lowest supplementary code point.
+ "🎨"  // U+1F3A8.
+ "\xf4\x8f\xbf\xbf"));  // U+10FFFF, the highest code point.
+}
+
TEST(fxstring, FXUTF8DecodeErrorRecovery) {
EXPECT_EQ(L"(A)", FX_UTF8Decode("(\xc2\x41)")) << "Invalid continuation";
EXPECT_EQ(L"()", FX_UTF8Decode("(\xc2\xc2)")) << "Invalid continuation";