Consolidate UTF-16 logic
Consolidates common UTF-16 logic in a single header, core/fxcrt/utf16.h.
This header defines useful constants related to the UTF-16 encoding,
predicates for classifying code points as supplementary or surrogate, and
a SurrogatePair class to encode and decode UTF-16 surrogate pairs.
Bug: pdfium:2029
Change-Id: Iaee483be113ac7f15c137b3c3366697460c16b1c
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/107750
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: K. Moon <kmoon@chromium.org>
diff --git a/core/fxcrt/BUILD.gn b/core/fxcrt/BUILD.gn
index b6dc8de..f8b6de5 100644
--- a/core/fxcrt/BUILD.gn
+++ b/core/fxcrt/BUILD.gn
@@ -101,6 +101,7 @@
"string_pool_template.h",
"string_view_template.h",
"tree_node.h",
+ "utf16.h",
"weak_ptr.h",
"widestring.cpp",
"widestring.h",
@@ -242,6 +243,7 @@
"string_pool_template_unittest.cpp",
"tree_node_unittest.cpp",
"unowned_ptr_unittest.cpp",
+ "utf16_unittest.cpp",
"weak_ptr_unittest.cpp",
"widestring_unittest.cpp",
"widetext_buffer_unittest.cpp",
diff --git a/core/fxcrt/cfx_utf8decoder.cpp b/core/fxcrt/cfx_utf8decoder.cpp
index a36f072..b66605a 100644
--- a/core/fxcrt/cfx_utf8decoder.cpp
+++ b/core/fxcrt/cfx_utf8decoder.cpp
@@ -12,6 +12,7 @@
#include "build/build_config.h"
#include "core/fxcrt/string_view_template.h"
+#include "core/fxcrt/utf16.h"
#include "core/fxcrt/widestring.h"
CFX_UTF8Decoder::CFX_UTF8Decoder(ByteStringView input) {
@@ -53,19 +54,19 @@
}
void CFX_UTF8Decoder::AppendCodePoint(char32_t code_point) {
- if (code_point > 0x10ffff) {
+ if (code_point > pdfium::kMaximumSupplementaryCodePoint) {
// Invalid code point above U+10FFFF.
return;
}
#if defined(WCHAR_T_IS_UTF16)
- if (code_point < 0x10000) {
+ if (code_point < pdfium::kMinimumSupplementaryCodePoint) {
buffer_ += static_cast<wchar_t>(code_point);
} else {
// Encode as UTF-16 surrogate pair.
- code_point -= 0x10000;
- buffer_ += 0xd800 | (code_point >> 10);
- buffer_ += 0xdc00 | (code_point & 0x3ff);
+ pdfium::SurrogatePair surrogate_pair(code_point);
+ buffer_ += surrogate_pair.high();
+ buffer_ += surrogate_pair.low();
}
#else
buffer_ += static_cast<wchar_t>(code_point);
diff --git a/core/fxcrt/cfx_utf8encoder.cpp b/core/fxcrt/cfx_utf8encoder.cpp
index 85b5512..aa69686 100644
--- a/core/fxcrt/cfx_utf8encoder.cpp
+++ b/core/fxcrt/cfx_utf8encoder.cpp
@@ -13,6 +13,7 @@
#include "core/fxcrt/bytestring.h"
#include "core/fxcrt/code_point_view.h"
#include "core/fxcrt/string_view_template.h"
+#include "core/fxcrt/utf16.h"
CFX_UTF8Encoder::CFX_UTF8Encoder(WideStringView input) {
for (char32_t code_point : pdfium::CodePointView(input)) {
@@ -27,7 +28,7 @@
}
void CFX_UTF8Encoder::AppendCodePoint(char32_t code_point) {
- if (code_point > 0x10ffff) {
+ if (code_point > pdfium::kMaximumSupplementaryCodePoint) {
// Invalid code point above U+10FFFF.
return;
}
diff --git a/core/fxcrt/code_point_view.h b/core/fxcrt/code_point_view.h
index bd60ab0..032d4a1 100644
--- a/core/fxcrt/code_point_view.h
+++ b/core/fxcrt/code_point_view.h
@@ -7,6 +7,7 @@
#include "build/build_config.h"
#include "core/fxcrt/string_view_template.h"
+#include "core/fxcrt/utf16.h"
#include "third_party/base/check_op.h"
namespace pdfium {
@@ -28,7 +29,7 @@
Iterator& operator++() {
DCHECK_LT(current_, end_);
- current_ += code_point_ < kFirstSupplementary ? 1 : 2;
+ current_ += IsSupplementary(code_point_) ? 2 : 1;
code_point_ = Decode();
return *this;
}
@@ -42,26 +43,20 @@
friend class CodePointView;
static constexpr char32_t kSentinel = -1;
- static constexpr char32_t kFirstSupplementary = 0x10000;
Iterator(const wchar_t* begin, const wchar_t* end)
: current_(begin), end_(end), code_point_(Decode()) {}
char32_t Decode() {
if (current_ >= end_) {
- // No remaining code units.
return kSentinel;
}
char32_t code_point = *current_;
- if (code_point >= 0xd800 && code_point < 0xdc00) {
- // First code unit is a high surrogate.
+ if (IsHighSurrogate(code_point)) {
const wchar_t* next = current_ + 1;
- if (next < end_ && *next >= 0xdc00 && *next < 0xe000) {
- // Second code unit is a low surrogate.
- code_point = (code_point & 0x3ff) << 10;
- code_point |= *next & 0x3ff;
- code_point += kFirstSupplementary;
+ if (next < end_ && IsLowSurrogate(*next)) {
+ code_point = SurrogatePair(code_point, *next).ToCodePoint();
}
}
diff --git a/core/fxcrt/fx_extension.cpp b/core/fxcrt/fx_extension.cpp
index c682259..dc312f0 100644
--- a/core/fxcrt/fx_extension.cpp
+++ b/core/fxcrt/fx_extension.cpp
@@ -10,6 +10,7 @@
#include <limits>
#include "core/fxcrt/fx_system.h"
+#include "core/fxcrt/utf16.h"
#include "third_party/base/check.h"
namespace {
@@ -150,16 +151,17 @@
}
size_t FXSYS_ToUTF16BE(uint32_t unicode, char* buf) {
- DCHECK(unicode <= 0xD7FF || (unicode > 0xDFFF && unicode <= 0x10FFFF));
+ DCHECK(unicode <= pdfium::kMaximumSupplementaryCodePoint);
+ DCHECK(!pdfium::IsHighSurrogate(unicode));
+ DCHECK(!pdfium::IsLowSurrogate(unicode));
+
if (unicode <= 0xFFFF) {
FXSYS_IntToFourHexChars(unicode, buf);
return 4;
}
- unicode -= 0x010000;
- // High ten bits plus 0xD800
- FXSYS_IntToFourHexChars(0xD800 + unicode / 0x400, buf);
- // Low ten bits plus 0xDC00
- FXSYS_IntToFourHexChars(0xDC00 + unicode % 0x400, buf + 4);
+ pdfium::SurrogatePair surrogate_pair(unicode);
+ FXSYS_IntToFourHexChars(surrogate_pair.high(), buf);
+ FXSYS_IntToFourHexChars(surrogate_pair.low(), buf + 4);
return 8;
}
diff --git a/core/fxcrt/fx_string_unittest.cpp b/core/fxcrt/fx_string_unittest.cpp
index f877aa7..825f65f 100644
--- a/core/fxcrt/fx_string_unittest.cpp
+++ b/core/fxcrt/fx_string_unittest.cpp
@@ -6,6 +6,7 @@
#include "build/build_config.h"
#include "core/fxcrt/fx_string.h"
+#include "core/fxcrt/utf16.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "third_party/base/span.h"
@@ -111,11 +112,12 @@
TEST(fxstring, FXUTF8EncodeDecodeConsistency) {
WideString wstr;
wstr.Reserve(0x10000);
- for (int w = 0; w < 0x10000; ++w) {
- // Skip UTF-16 surrogates.
- if (w < 0xd800 || w >= 0xe000) {
- wstr += static_cast<wchar_t>(w);
+ for (char32_t w = 0; w < pdfium::kMinimumSupplementaryCodePoint; ++w) {
+ if (pdfium::IsHighSurrogate(w) || pdfium::IsLowSurrogate(w)) {
+ // Skip UTF-16 surrogates.
+ continue;
}
+ wstr += static_cast<wchar_t>(w);
}
ASSERT_EQ(0xf800u, wstr.GetLength());
@@ -127,7 +129,8 @@
TEST(fxstring, FXUTF8EncodeDecodeConsistencyUnpairedHighSurrogates) {
WideString wstr;
wstr.Reserve(0x400);
- for (wchar_t w = 0xd800; w < 0xdc00; ++w) {
+ for (wchar_t w = pdfium::kMinimumHighSurrogateCodeUnit;
+ w <= pdfium::kMaximumHighSurrogateCodeUnit; ++w) {
wstr += w;
}
ASSERT_EQ(0x400u, wstr.GetLength());
@@ -140,7 +143,8 @@
TEST(fxstring, FXUTF8EncodeDecodeConsistencyUnpairedLowSurrogates) {
WideString wstr;
wstr.Reserve(0x400);
- for (wchar_t w = 0xdc00; w < 0xe000; ++w) {
+ for (wchar_t w = pdfium::kMinimumLowSurrogateCodeUnit;
+ w <= pdfium::kMaximumLowSurrogateCodeUnit; ++w) {
wstr += w;
}
ASSERT_EQ(0x400u, wstr.GetLength());
diff --git a/core/fxcrt/utf16.h b/core/fxcrt/utf16.h
new file mode 100644
index 0000000..f42f190
--- /dev/null
+++ b/core/fxcrt/utf16.h
@@ -0,0 +1,107 @@
+// Copyright 2023 The PDFium Authors
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef CORE_FXCRT_UTF16_H_
+#define CORE_FXCRT_UTF16_H_
+
+#include "third_party/base/check.h"
+
+namespace pdfium {
+
+// The number of suffix bits in a UTF-16 surrogate.
+inline constexpr int kSurrogateBits = 10;
+
+// A bitmask for the suffix of a UTF-16 surrogate.
+inline constexpr char16_t kSurrogateMask = (1 << kSurrogateBits) - 1;
+
+// The first supplementary code point, `U+10000`.
+inline constexpr char32_t kMinimumSupplementaryCodePoint = 0x10000;
+
+// The last supplementary code point, `U+10FFFF`.
+inline constexpr char32_t kMaximumSupplementaryCodePoint =
+ kMinimumSupplementaryCodePoint +
+ (kSurrogateMask << kSurrogateBits | kSurrogateMask);
+
+// The first UTF-16 high surrogate code unit, `U+D800`.
+inline constexpr char16_t kMinimumHighSurrogateCodeUnit = 0xd800;
+
+// The last UTF-16 high surrogate code unit, `U+DBFF`.
+inline constexpr char16_t kMaximumHighSurrogateCodeUnit =
+ kMinimumHighSurrogateCodeUnit | kSurrogateMask;
+
+// The first UTF-16 low surrogate code unit, `U+DC00`.
+inline constexpr char16_t kMinimumLowSurrogateCodeUnit =
+ kMaximumHighSurrogateCodeUnit + 1;
+
+// The last UTF-16 low surrogate code unit, `U+DFFF`.
+inline constexpr char16_t kMaximumLowSurrogateCodeUnit =
+ kMinimumLowSurrogateCodeUnit | kSurrogateMask;
+
+// Returns `true` if `code_point` is in a supplementary plane, and therefore
+// requires encoding as a UTF-16 surrogate pair.
+constexpr bool IsSupplementary(char32_t code_point) {
+ return code_point >= kMinimumSupplementaryCodePoint &&
+ code_point <= kMaximumSupplementaryCodePoint;
+}
+
+// Returns `true` if `code_point` is a UTF-16 high surrogate.
+constexpr bool IsHighSurrogate(char32_t code_point) {
+ return code_point >= kMinimumHighSurrogateCodeUnit &&
+ code_point <= kMaximumHighSurrogateCodeUnit;
+}
+
+// Returns `true` if `code_point` is a UTF-16 low surrogate.
+constexpr bool IsLowSurrogate(char32_t code_point) {
+ return code_point >= kMinimumLowSurrogateCodeUnit &&
+ code_point <= kMaximumLowSurrogateCodeUnit;
+}
+
+// A UTF-16 surrogate pair.
+class SurrogatePair final {
+ public:
+ // Constructs a surrogate pair from a high and a low surrogate.
+ constexpr SurrogatePair(char16_t high, char16_t low)
+ : high_(high), low_(low) {
+ DCHECK(IsHighSurrogate(high_));
+ DCHECK(IsLowSurrogate(low_));
+ }
+
+ // Constructs a surrogate pair from a code point.
+ explicit constexpr SurrogatePair(char32_t code_point)
+ : high_(GetHighSurrogate(code_point)), low_(GetLowSurrogate(code_point)) {
+    // `high_` and `low_` are initialized via helper functions because C++17
+    // forbids a `constexpr` constructor from leaving members uninitialized.
+ DCHECK(IsSupplementary(code_point));
+ }
+
+ constexpr char16_t high() const { return high_; }
+ constexpr char16_t low() const { return low_; }
+
+ // Decodes this surrogate pair to a code point.
+ constexpr char32_t ToCodePoint() const {
+ char32_t code_point = low_ & kSurrogateMask;
+ code_point |= (high_ & kSurrogateMask) << kSurrogateBits;
+ return kMinimumSupplementaryCodePoint + code_point;
+ }
+
+ private:
+ static constexpr char16_t GetHighSurrogate(char32_t code_point) {
+ code_point -= kMinimumSupplementaryCodePoint;
+ char16_t code_unit = (code_point >> kSurrogateBits) & kSurrogateMask;
+ return kMinimumHighSurrogateCodeUnit | code_unit;
+ }
+
+ static constexpr char16_t GetLowSurrogate(char32_t code_point) {
+ code_point -= kMinimumSupplementaryCodePoint;
+ char16_t code_unit = code_point & kSurrogateMask;
+ return kMinimumLowSurrogateCodeUnit | code_unit;
+ }
+
+ char16_t high_;
+ char16_t low_;
+};
+
+} // namespace pdfium
+
+#endif // CORE_FXCRT_UTF16_H_
diff --git a/core/fxcrt/utf16_unittest.cpp b/core/fxcrt/utf16_unittest.cpp
new file mode 100644
index 0000000..bab1bdd
--- /dev/null
+++ b/core/fxcrt/utf16_unittest.cpp
@@ -0,0 +1,61 @@
+// Copyright 2023 The PDFium Authors
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "core/fxcrt/utf16.h"
+
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace pdfium {
+
+static_assert(kSurrogateMask == 0x3ff);
+static_assert(kMaximumSupplementaryCodePoint == 0x10ffff);
+static_assert(kMaximumHighSurrogateCodeUnit == 0xdbff);
+static_assert(kMinimumLowSurrogateCodeUnit == 0xdc00);
+static_assert(kMaximumLowSurrogateCodeUnit == 0xdfff);
+
+static_assert(!IsSupplementary(0xffff));
+static_assert(IsSupplementary(0x10000));
+static_assert(IsSupplementary(0x10ffff));
+static_assert(!IsSupplementary(0x110000));
+
+static_assert(!IsHighSurrogate(0xd7ff));
+static_assert(IsHighSurrogate(0xd800));
+static_assert(IsHighSurrogate(0xdbff));
+static_assert(!IsHighSurrogate(0xdc00));
+
+static_assert(!IsLowSurrogate(0xdbff));
+static_assert(IsLowSurrogate(0xdc00));
+static_assert(IsLowSurrogate(0xdfff));
+static_assert(!IsLowSurrogate(0xe000));
+
+static_assert(SurrogatePair(0xd800, 0xdc00).high() == 0xd800);
+static_assert(SurrogatePair(0xd800, 0xdc00).low() == 0xdc00);
+static_assert(SurrogatePair(0xd800, 0xdc00).ToCodePoint() == 0x10000);
+
+static_assert(SurrogatePair(0xdbff, 0xdfff).high() == 0xdbff);
+static_assert(SurrogatePair(0xdbff, 0xdfff).low() == 0xdfff);
+static_assert(SurrogatePair(0xdbff, 0xdfff).ToCodePoint() == 0x10ffff);
+
+static_assert(SurrogatePair(0x10000).high() == 0xd800);
+static_assert(SurrogatePair(0x10000).low() == 0xdc00);
+static_assert(SurrogatePair(0x10000).ToCodePoint() == 0x10000);
+
+static_assert(SurrogatePair(0x10ffff).high() == 0xdbff);
+static_assert(SurrogatePair(0x10ffff).low() == 0xdfff);
+static_assert(SurrogatePair(0x10ffff).ToCodePoint() == 0x10ffff);
+
+TEST(SurrogatePairTest, RoundTrip) {
+ for (char32_t code_point = kMinimumSupplementaryCodePoint;
+ code_point <= kMaximumSupplementaryCodePoint; ++code_point) {
+ SurrogatePair from_code_point(code_point);
+ EXPECT_EQ(code_point, from_code_point.ToCodePoint());
+
+ SurrogatePair from_pair(from_code_point.high(), from_code_point.low());
+ EXPECT_EQ(from_code_point.high(), from_pair.high());
+ EXPECT_EQ(from_code_point.low(), from_pair.low());
+ EXPECT_EQ(code_point, from_pair.ToCodePoint());
+ }
+}
+
+} // namespace pdfium
diff --git a/fpdfsdk/fpdf_edittext.cpp b/fpdfsdk/fpdf_edittext.cpp
index 467a8f3..d54ae2f 100644
--- a/fpdfsdk/fpdf_edittext.cpp
+++ b/fpdfsdk/fpdf_edittext.cpp
@@ -31,6 +31,7 @@
#include "core/fxcrt/fx_string_wrappers.h"
#include "core/fxcrt/span_util.h"
#include "core/fxcrt/stl_util.h"
+#include "core/fxcrt/utf16.h"
#include "core/fxge/cfx_defaultrenderdevice.h"
#include "core/fxge/cfx_fontmgr.h"
#include "core/fxge/fx_font.h"
@@ -166,8 +167,9 @@
// PDF spec 1.7 Section 5.9.2: "Unicode character sequences as expressed in
// UTF-16BE encoding." See https://en.wikipedia.org/wiki/UTF-16#Description
void AddUnicode(fxcrt::ostringstream* pBuffer, uint32_t unicode) {
- if (unicode >= 0xD800 && unicode <= 0xDFFF)
+ if (pdfium::IsHighSurrogate(unicode) || pdfium::IsLowSurrogate(unicode)) {
unicode = 0;
+ }
char ans[8];
*pBuffer << "<";
@@ -369,16 +371,18 @@
uint32_t dwGlyphIndex;
uint32_t dwCurrentChar = static_cast<uint32_t>(
FT_Get_First_Char(pFont->GetFaceRec(), &dwGlyphIndex));
- static constexpr uint32_t kMaxUnicode = 0x10FFFF;
// If it doesn't have a single char, just fail
- if (dwGlyphIndex == 0 || dwCurrentChar > kMaxUnicode)
+ if (dwGlyphIndex == 0 ||
+ dwCurrentChar > pdfium::kMaximumSupplementaryCodePoint) {
return nullptr;
+ }
std::multimap<uint32_t, uint32_t> to_unicode;
std::map<uint32_t, uint32_t> widths;
while (true) {
- if (dwCurrentChar > kMaxUnicode)
+ if (dwCurrentChar > pdfium::kMaximumSupplementaryCodePoint) {
break;
+ }
if (!pdfium::Contains(widths, dwGlyphIndex))
widths[dwGlyphIndex] = pFont->GetGlyphWidth(dwGlyphIndex);