Fix text string handling on UTF-32 platforms

Fixes PDF_DecodeText() and PDF_EncodeText() to handle supplementary
characters correctly on UTF-32 platforms. Supplementary characters
require 2 code units to represent in UTF-16, but only 1 in UTF-32.

Fixed: pdfium:2029
Change-Id: I985cea3a3d5f79d7cf544904f78d6d47881f4331
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/107831
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: K. Moon <kmoon@chromium.org>
diff --git a/core/fpdfapi/parser/fpdf_parser_decode.cpp b/core/fpdfapi/parser/fpdf_parser_decode.cpp
index 6f29b7a..1fbee03 100644
--- a/core/fpdfapi/parser/fpdf_parser_decode.cpp
+++ b/core/fpdfapi/parser/fpdf_parser_decode.cpp
@@ -8,10 +8,12 @@
#include <ctype.h>
#include <limits.h>
+#include <stddef.h>
#include <algorithm>
#include <utility>
+#include "build/build_config.h"
#include "constants/stream_dict_common.h"
#include "core/fpdfapi/parser/cpdf_array.h"
#include "core/fpdfapi/parser/cpdf_dictionary.h"
@@ -22,6 +24,7 @@
#include "core/fxcrt/fx_extension.h"
#include "core/fxcrt/fx_safe_types.h"
#include "core/fxcrt/span_util.h"
+#include "core/fxcrt/utf16.h"
#include "third_party/base/check.h"
#include "third_party/base/containers/contains.h"
@@ -472,7 +475,7 @@
}
WideString PDF_DecodeText(pdfium::span<const uint8_t> span) {
- int dest_pos = 0;
+ size_t dest_pos = 0;
WideString result;
if (span.size() >= 2 && ((span[0] == 0xfe && span[1] == 0xff) ||
(span[0] == 0xff && span[1] == 0xfe))) {
@@ -485,6 +488,10 @@
span[0] == 0xfe ? GetUnicodeFromBigEndianBytes
: GetUnicodeFromLittleEndianBytes;
const uint8_t* unicode_str = &span[2];
+
+#if defined(WCHAR_T_IS_UTF32)
+ char16_t high_surrogate = 0;
+#endif // defined(WCHAR_T_IS_UTF32)
for (size_t i = 0; i < max_chars * 2; i += 2) {
uint16_t unicode = GetUnicodeFromBytes(unicode_str + i);
@@ -505,8 +512,34 @@
break;
}
+#if defined(WCHAR_T_IS_UTF32)
+ // TODO(crbug.com/pdfium/2031): Always use UTF-16.
+ if (high_surrogate) {
+ char16_t previous_high_surrogate = high_surrogate;
+ high_surrogate = 0;
+
+ if (pdfium::IsLowSurrogate(unicode)) {
+ dest_buf[dest_pos++] =
+ pdfium::SurrogatePair(previous_high_surrogate, unicode)
+ .ToCodePoint();
+ continue;
+ }
+ dest_buf[dest_pos++] = previous_high_surrogate;
+ }
+
+ if (pdfium::IsHighSurrogate(unicode)) {
+ high_surrogate = unicode;
+ continue;
+ }
+#endif // defined(WCHAR_T_IS_UTF32)
dest_buf[dest_pos++] = unicode;
}
+
+#if defined(WCHAR_T_IS_UTF32)
+ if (high_surrogate) {
+ dest_buf[dest_pos++] = high_surrogate;
+ }
+#endif // defined(WCHAR_T_IS_UTF32)
} else {
pdfium::span<wchar_t> dest_buf = result.GetBuffer(span.size());
for (size_t i = 0; i < span.size(); ++i)
@@ -545,18 +578,35 @@
}
size_t dest_index = 0;
- size_t encLen = len * 2 + 2;
{
+#if defined(WCHAR_T_IS_UTF32)
+ // 2 or 4 bytes required per UTF-32 code unit.
pdfium::span<uint8_t> dest_buf =
- pdfium::as_writable_bytes(result.GetBuffer(encLen));
+ pdfium::as_writable_bytes(result.GetBuffer(len * 4 + 2));
+#else
+ // 2 bytes required per UTF-16 code unit.
+ pdfium::span<uint8_t> dest_buf =
+ pdfium::as_writable_bytes(result.GetBuffer(len * 2 + 2));
+#endif // defined(WCHAR_T_IS_UTF32)
+
dest_buf[dest_index++] = 0xfe;
dest_buf[dest_index++] = 0xff;
for (size_t j = 0; j < len; ++j) {
+#if defined(WCHAR_T_IS_UTF32)
+ if (pdfium::IsSupplementary(str[j])) {
+ pdfium::SurrogatePair pair(str[j]);
+ dest_buf[dest_index++] = pair.high() >> 8;
+ dest_buf[dest_index++] = static_cast<uint8_t>(pair.high());
+ dest_buf[dest_index++] = pair.low() >> 8;
+ dest_buf[dest_index++] = static_cast<uint8_t>(pair.low());
+ continue;
+ }
+#endif // defined(WCHAR_T_IS_UTF32)
dest_buf[dest_index++] = str[j] >> 8;
dest_buf[dest_index++] = static_cast<uint8_t>(str[j]);
}
}
- result.ReleaseBuffer(encLen);
+ result.ReleaseBuffer(dest_index);
return result;
}
diff --git a/core/fpdfapi/parser/fpdf_parser_decode_unittest.cpp b/core/fpdfapi/parser/fpdf_parser_decode_unittest.cpp
index f812097..ad0416e 100644
--- a/core/fpdfapi/parser/fpdf_parser_decode_unittest.cpp
+++ b/core/fpdfapi/parser/fpdf_parser_decode_unittest.cpp
@@ -419,6 +419,9 @@
PDF_DecodeText(
ToSpan("\xFE\xFF\x7F\x51\x98\x75\x00\x20\x56\xFE\x72\x47\x00"
"\x20\x8D\x44\x8B\xAF\x66\xF4\x59\x1A\x00\x20\x00\xBB")));
+
+ // Supplementary Unicode text.
+ EXPECT_EQ(L"🎨", PDF_DecodeText(ToSpan("\xFE\xFF\xD8\x3C\xDF\xA8")));
}
// https://crbug.com/pdfium/182
@@ -450,6 +453,12 @@
TEST(ParserDecodeTest, DecodeTextWithUnpairedSurrogates) {
EXPECT_EQ(L"\xD800", PDF_DecodeText(ToSpan("\xFE\xFF\xD8\x00"))) << "High";
EXPECT_EQ(L"\xDC00", PDF_DecodeText(ToSpan("\xFE\xFF\xDC\x00"))) << "Low";
+ EXPECT_EQ(L"\xD800🎨",
+ PDF_DecodeText(ToSpan("\xFE\xFF\xD8\x00\xD8\x3C\xDF\xA8")))
+ << "High-high";
+ EXPECT_EQ(L"🎨\xDC00",
+ PDF_DecodeText(ToSpan("\xFE\xFF\xD8\x3C\xDF\xA8\xDC\x00")))
+ << "Low-low";
}
TEST(ParserDecodeTest, EncodeText) {
@@ -468,6 +477,9 @@
"\x20\x8D\x44\x8B\xAF\x66\xF4\x59\x1A\x00\x20\x00\xBB"),
PDF_EncodeText(L"\x7F51\x9875\x0020\x56FE\x7247\x0020"
L"\x8D44\x8BAF\x66F4\x591A\x0020\x00BB"));
+
+ // Supplementary Unicode text.
+ EXPECT_EQ("\xFE\xFF\xD8\x3C\xDF\xA8", PDF_EncodeText(L"🎨"));
}
TEST(ParserDecodeTest, RoundTripText) {
diff --git a/core/fpdfdoc/cpdf_annotlist_unittest.cpp b/core/fpdfdoc/cpdf_annotlist_unittest.cpp
index 5d7e1e2..653fdff 100644
--- a/core/fpdfdoc/cpdf_annotlist_unittest.cpp
+++ b/core/fpdfdoc/cpdf_annotlist_unittest.cpp
@@ -86,7 +86,7 @@
TEST_F(CPDFAnnotListTest, CreatePopupAnnotFromUnicode) {
const ByteString kContents =
MakeByteString({0xFE, 0xFF, 0x00, 'A', 0x00, 'a', 0x00, 0xE4, 0x20, 0xAC,
- 0xD8, 0x3D, 0xDC, 0xC4});
+ 0xD8, 0x3C, 0xDF, 0xA8});
AddTextAnnotation(kContents);
CPDF_AnnotList list(page_);
@@ -94,9 +94,7 @@
ASSERT_EQ(2u, list.Count());
EXPECT_EQ(kContents, GetRawContents(list.GetAt(1)));
- // TODO(crbug.com/pdfium/2029): `WideString::FromUTF8()` mishandles '📄'.
- EXPECT_EQ(WideString::FromUTF8("Aaä€\xED\xA0\xBD\xED\xB3\x84"),
- GetDecodedContents(list.GetAt(1)));
+ EXPECT_EQ(WideString::FromUTF8("Aaä€🎨"), GetDecodedContents(list.GetAt(1)));
}
TEST_F(CPDFAnnotListTest, CreatePopupAnnotFromEmptyPdfEncoded) {