Consolidate UTF-16 logic

Consolidates common UTF-16 logic in a single header, core/fxcrt/utf16.h. This header defines useful constants related to the UTF-16 encoding, as well as functions to encode and decode UTF-16 surrogate pairs.

Bug: pdfium:2029
Change-Id: Iaee483be113ac7f15c137b3c3366697460c16b1c
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/107750
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: K. Moon <kmoon@chromium.org>

commit: a248b67f7cf67d0430ea245cf5f8017777baae4b [log] [tgz]
author: K. Moon <kmoon@chromium.org> Wed May 17 16:34:35 2023 +0000
committer: Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com> Wed May 17 16:34:35 2023 +0000
tree: dee60e28c5a535a432c1551c758e08a5572b3509
parent: b6c2e82d2da323cc7d1522389ba3ca36ee5a3983 [diff]
diff --git a/core/fxcrt/BUILD.gn b/core/fxcrt/BUILD.gn
index b6dc8de..f8b6de5 100644
--- a/core/fxcrt/BUILD.gn
+++ b/core/fxcrt/BUILD.gn

@@ -101,6 +101,7 @@
     "string_pool_template.h",
     "string_view_template.h",
     "tree_node.h",
+    "utf16.h",
     "weak_ptr.h",
     "widestring.cpp",
     "widestring.h",
@@ -242,6 +243,7 @@
     "string_pool_template_unittest.cpp",
     "tree_node_unittest.cpp",
     "unowned_ptr_unittest.cpp",
+    "utf16_unittest.cpp",
     "weak_ptr_unittest.cpp",
     "widestring_unittest.cpp",
     "widetext_buffer_unittest.cpp",

diff --git a/core/fxcrt/cfx_utf8decoder.cpp b/core/fxcrt/cfx_utf8decoder.cpp
index a36f072..b66605a 100644
--- a/core/fxcrt/cfx_utf8decoder.cpp
+++ b/core/fxcrt/cfx_utf8decoder.cpp

@@ -12,6 +12,7 @@
 
 #include "build/build_config.h"
 #include "core/fxcrt/string_view_template.h"
+#include "core/fxcrt/utf16.h"
 #include "core/fxcrt/widestring.h"
 
 CFX_UTF8Decoder::CFX_UTF8Decoder(ByteStringView input) {
@@ -53,19 +54,19 @@
 }
 
 void CFX_UTF8Decoder::AppendCodePoint(char32_t code_point) {
-  if (code_point > 0x10ffff) {
+  if (code_point > pdfium::kMaximumSupplementaryCodePoint) {
     // Invalid code point above U+10FFFF.
     return;
   }
 
 #if defined(WCHAR_T_IS_UTF16)
-  if (code_point < 0x10000) {
+  if (code_point < pdfium::kMinimumSupplementaryCodePoint) {
     buffer_ += static_cast<wchar_t>(code_point);
   } else {
     // Encode as UTF-16 surrogate pair.
-    code_point -= 0x10000;
-    buffer_ += 0xd800 | (code_point >> 10);
-    buffer_ += 0xdc00 | (code_point & 0x3ff);
+    pdfium::SurrogatePair surrogate_pair(code_point);
+    buffer_ += surrogate_pair.high();
+    buffer_ += surrogate_pair.low();
   }
 #else
   buffer_ += static_cast<wchar_t>(code_point);

diff --git a/core/fxcrt/cfx_utf8encoder.cpp b/core/fxcrt/cfx_utf8encoder.cpp
index 85b5512..aa69686 100644
--- a/core/fxcrt/cfx_utf8encoder.cpp
+++ b/core/fxcrt/cfx_utf8encoder.cpp

@@ -13,6 +13,7 @@
 #include "core/fxcrt/bytestring.h"
 #include "core/fxcrt/code_point_view.h"
 #include "core/fxcrt/string_view_template.h"
+#include "core/fxcrt/utf16.h"
 
 CFX_UTF8Encoder::CFX_UTF8Encoder(WideStringView input) {
   for (char32_t code_point : pdfium::CodePointView(input)) {
@@ -27,7 +28,7 @@
 }
 
 void CFX_UTF8Encoder::AppendCodePoint(char32_t code_point) {
-  if (code_point > 0x10ffff) {
+  if (code_point > pdfium::kMaximumSupplementaryCodePoint) {
     // Invalid code point above U+10FFFF.
     return;
   }

diff --git a/core/fxcrt/code_point_view.h b/core/fxcrt/code_point_view.h
index bd60ab0..032d4a1 100644
--- a/core/fxcrt/code_point_view.h
+++ b/core/fxcrt/code_point_view.h

@@ -7,6 +7,7 @@
 
 #include "build/build_config.h"
 #include "core/fxcrt/string_view_template.h"
+#include "core/fxcrt/utf16.h"
 #include "third_party/base/check_op.h"
 
 namespace pdfium {
@@ -28,7 +29,7 @@
 
     Iterator& operator++() {
       DCHECK_LT(current_, end_);
-      current_ += code_point_ < kFirstSupplementary ? 1 : 2;
+      current_ += IsSupplementary(code_point_) ? 2 : 1;
       code_point_ = Decode();
       return *this;
     }
@@ -42,26 +43,20 @@
     friend class CodePointView;
 
     static constexpr char32_t kSentinel = -1;
-    static constexpr char32_t kFirstSupplementary = 0x10000;
 
     Iterator(const wchar_t* begin, const wchar_t* end)
         : current_(begin), end_(end), code_point_(Decode()) {}
 
     char32_t Decode() {
       if (current_ >= end_) {
-        // No remaining code units.
         return kSentinel;
       }
 
       char32_t code_point = *current_;
-      if (code_point >= 0xd800 && code_point < 0xdc00) {
-        // First code unit is a high surrogate.
+      if (IsHighSurrogate(code_point)) {
         const wchar_t* next = current_ + 1;
-        if (next < end_ && *next >= 0xdc00 && *next < 0xe000) {
-          // Second code unit is a low surrogate.
-          code_point = (code_point & 0x3ff) << 10;
-          code_point |= *next & 0x3ff;
-          code_point += kFirstSupplementary;
+        if (next < end_ && IsLowSurrogate(*next)) {
+          code_point = SurrogatePair(code_point, *next).ToCodePoint();
         }
       }
 

diff --git a/core/fxcrt/fx_extension.cpp b/core/fxcrt/fx_extension.cpp
index c682259..dc312f0 100644
--- a/core/fxcrt/fx_extension.cpp
+++ b/core/fxcrt/fx_extension.cpp

@@ -10,6 +10,7 @@
 #include <limits>
 
 #include "core/fxcrt/fx_system.h"
+#include "core/fxcrt/utf16.h"
 #include "third_party/base/check.h"
 
 namespace {
@@ -150,16 +151,17 @@
 }
 
 size_t FXSYS_ToUTF16BE(uint32_t unicode, char* buf) {
-  DCHECK(unicode <= 0xD7FF || (unicode > 0xDFFF && unicode <= 0x10FFFF));
+  DCHECK(unicode <= pdfium::kMaximumSupplementaryCodePoint);
+  DCHECK(!pdfium::IsHighSurrogate(unicode));
+  DCHECK(!pdfium::IsLowSurrogate(unicode));
+
   if (unicode <= 0xFFFF) {
     FXSYS_IntToFourHexChars(unicode, buf);
     return 4;
   }
-  unicode -= 0x010000;
-  // High ten bits plus 0xD800
-  FXSYS_IntToFourHexChars(0xD800 + unicode / 0x400, buf);
-  // Low ten bits plus 0xDC00
-  FXSYS_IntToFourHexChars(0xDC00 + unicode % 0x400, buf + 4);
+  pdfium::SurrogatePair surrogate_pair(unicode);
+  FXSYS_IntToFourHexChars(surrogate_pair.high(), buf);
+  FXSYS_IntToFourHexChars(surrogate_pair.low(), buf + 4);
   return 8;
 }
 

diff --git a/core/fxcrt/fx_string_unittest.cpp b/core/fxcrt/fx_string_unittest.cpp
index f877aa7..825f65f 100644
--- a/core/fxcrt/fx_string_unittest.cpp
+++ b/core/fxcrt/fx_string_unittest.cpp

@@ -6,6 +6,7 @@
 
 #include "build/build_config.h"
 #include "core/fxcrt/fx_string.h"
+#include "core/fxcrt/utf16.h"
 #include "testing/gtest/include/gtest/gtest.h"
 #include "third_party/base/span.h"
 
@@ -111,11 +112,12 @@
 TEST(fxstring, FXUTF8EncodeDecodeConsistency) {
   WideString wstr;
   wstr.Reserve(0x10000);
-  for (int w = 0; w < 0x10000; ++w) {
-    // Skip UTF-16 surrogates.
-    if (w < 0xd800 || w >= 0xe000) {
-      wstr += static_cast<wchar_t>(w);
+  for (char32_t w = 0; w < pdfium::kMinimumSupplementaryCodePoint; ++w) {
+    if (pdfium::IsHighSurrogate(w) || pdfium::IsLowSurrogate(w)) {
+      // Skip UTF-16 surrogates.
+      continue;
     }
+    wstr += static_cast<wchar_t>(w);
   }
   ASSERT_EQ(0xf800u, wstr.GetLength());
 
@@ -127,7 +129,8 @@
 TEST(fxstring, FXUTF8EncodeDecodeConsistencyUnpairedHighSurrogates) {
   WideString wstr;
   wstr.Reserve(0x400);
-  for (wchar_t w = 0xd800; w < 0xdc00; ++w) {
+  for (wchar_t w = pdfium::kMinimumHighSurrogateCodeUnit;
+       w <= pdfium::kMaximumHighSurrogateCodeUnit; ++w) {
     wstr += w;
   }
   ASSERT_EQ(0x400u, wstr.GetLength());
@@ -140,7 +143,8 @@
 TEST(fxstring, FXUTF8EncodeDecodeConsistencyUnpairedLowSurrogates) {
   WideString wstr;
   wstr.Reserve(0x400);
-  for (wchar_t w = 0xdc00; w < 0xe000; ++w) {
+  for (wchar_t w = pdfium::kMinimumLowSurrogateCodeUnit;
+       w <= pdfium::kMaximumLowSurrogateCodeUnit; ++w) {
     wstr += w;
   }
   ASSERT_EQ(0x400u, wstr.GetLength());

diff --git a/core/fxcrt/utf16.h b/core/fxcrt/utf16.h
new file mode 100644
index 0000000..f42f190
--- /dev/null
+++ b/core/fxcrt/utf16.h

@@ -0,0 +1,107 @@
+// Copyright 2023 The PDFium Authors
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef CORE_FXCRT_UTF16_H_
+#define CORE_FXCRT_UTF16_H_
+
+#include "third_party/base/check.h"
+
+namespace pdfium {
+
+// The number of suffix bits in a UTF-16 surrogate.
+inline constexpr int kSurrogateBits = 10;
+
+// A bitmask for the suffix of a UTF-16 surrogate.
+inline constexpr char16_t kSurrogateMask = (1 << kSurrogateBits) - 1;
+
+// The first supplementary code point, `U+10000`.
+inline constexpr char32_t kMinimumSupplementaryCodePoint = 0x10000;
+
+// The last supplementary code point, `U+10FFFF`.
+inline constexpr char32_t kMaximumSupplementaryCodePoint =
+    kMinimumSupplementaryCodePoint +
+    (kSurrogateMask << kSurrogateBits | kSurrogateMask);
+
+// The first UTF-16 high surrogate code unit, `U+D800`.
+inline constexpr char16_t kMinimumHighSurrogateCodeUnit = 0xd800;
+
+// The last UTF-16 high surrogate code unit, `U+DBFF`.
+inline constexpr char16_t kMaximumHighSurrogateCodeUnit =
+    kMinimumHighSurrogateCodeUnit | kSurrogateMask;
+
+// The first UTF-16 low surrogate code unit, `U+DC00`.
+inline constexpr char16_t kMinimumLowSurrogateCodeUnit =
+    kMaximumHighSurrogateCodeUnit + 1;
+
+// The last UTF-16 low surrogate code unit, `U+DFFF`.
+inline constexpr char16_t kMaximumLowSurrogateCodeUnit =
+    kMinimumLowSurrogateCodeUnit | kSurrogateMask;
+
+// Returns `true` if `code_point` is in a supplementary plane, and therefore
+// requires encoding as a UTF-16 surrogate pair.
+constexpr bool IsSupplementary(char32_t code_point) {
+  return code_point >= kMinimumSupplementaryCodePoint &&
+         code_point <= kMaximumSupplementaryCodePoint;
+}
+
+// Returns `true` if `code_point` is a UTF-16 high surrogate.
+constexpr bool IsHighSurrogate(char32_t code_point) {
+  return code_point >= kMinimumHighSurrogateCodeUnit &&
+         code_point <= kMaximumHighSurrogateCodeUnit;
+}
+
+// Returns `true` if `code_point` is a UTF-16 low surrogate.
+constexpr bool IsLowSurrogate(char32_t code_point) {
+  return code_point >= kMinimumLowSurrogateCodeUnit &&
+         code_point <= kMaximumLowSurrogateCodeUnit;
+}
+
+// A UTF-16 surrogate pair.
+class SurrogatePair final {
+ public:
+  // Constructs a surrogate pair from a high and a low surrogate.
+  constexpr SurrogatePair(char16_t high, char16_t low)
+      : high_(high), low_(low) {
+    DCHECK(IsHighSurrogate(high_));
+    DCHECK(IsLowSurrogate(low_));
+  }
+
+  // Constructs a surrogate pair from a code point.
+  explicit constexpr SurrogatePair(char32_t code_point)
+      : high_(GetHighSurrogate(code_point)), low_(GetLowSurrogate(code_point)) {
+    // This constructor initializes `high_` and `low_` using helper functions
+    // because C++17 requires it for `constexpr` constructors.
+    DCHECK(IsSupplementary(code_point));
+  }
+
+  constexpr char16_t high() const { return high_; }
+  constexpr char16_t low() const { return low_; }
+
+  // Decodes this surrogate pair to a code point.
+  constexpr char32_t ToCodePoint() const {
+    char32_t code_point = low_ & kSurrogateMask;
+    code_point |= (high_ & kSurrogateMask) << kSurrogateBits;
+    return kMinimumSupplementaryCodePoint + code_point;
+  }
+
+ private:
+  static constexpr char16_t GetHighSurrogate(char32_t code_point) {
+    code_point -= kMinimumSupplementaryCodePoint;
+    char16_t code_unit = (code_point >> kSurrogateBits) & kSurrogateMask;
+    return kMinimumHighSurrogateCodeUnit | code_unit;
+  }
+
+  static constexpr char16_t GetLowSurrogate(char32_t code_point) {
+    code_point -= kMinimumSupplementaryCodePoint;
+    char16_t code_unit = code_point & kSurrogateMask;
+    return kMinimumLowSurrogateCodeUnit | code_unit;
+  }
+
+  char16_t high_;
+  char16_t low_;
+};
+
+}  // namespace pdfium
+
+#endif  // CORE_FXCRT_UTF16_H_

diff --git a/core/fxcrt/utf16_unittest.cpp b/core/fxcrt/utf16_unittest.cpp
new file mode 100644
index 0000000..bab1bdd
--- /dev/null
+++ b/core/fxcrt/utf16_unittest.cpp

@@ -0,0 +1,61 @@
+// Copyright 2023 The PDFium Authors
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "core/fxcrt/utf16.h"
+
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace pdfium {
+
+static_assert(kSurrogateMask == 0x3ff);
+static_assert(kMaximumSupplementaryCodePoint == 0x10ffff);
+static_assert(kMaximumHighSurrogateCodeUnit == 0xdbff);
+static_assert(kMinimumLowSurrogateCodeUnit == 0xdc00);
+static_assert(kMaximumLowSurrogateCodeUnit == 0xdfff);
+
+static_assert(!IsSupplementary(0xffff));
+static_assert(IsSupplementary(0x10000));
+static_assert(IsSupplementary(0x10ffff));
+static_assert(!IsSupplementary(0x110000));
+
+static_assert(!IsHighSurrogate(0xd7ff));
+static_assert(IsHighSurrogate(0xd800));
+static_assert(IsHighSurrogate(0xdbff));
+static_assert(!IsHighSurrogate(0xdc00));
+
+static_assert(!IsLowSurrogate(0xdbff));
+static_assert(IsLowSurrogate(0xdc00));
+static_assert(IsLowSurrogate(0xdfff));
+static_assert(!IsLowSurrogate(0xe000));
+
+static_assert(SurrogatePair(0xd800, 0xdc00).high() == 0xd800);
+static_assert(SurrogatePair(0xd800, 0xdc00).low() == 0xdc00);
+static_assert(SurrogatePair(0xd800, 0xdc00).ToCodePoint() == 0x10000);
+
+static_assert(SurrogatePair(0xdbff, 0xdfff).high() == 0xdbff);
+static_assert(SurrogatePair(0xdbff, 0xdfff).low() == 0xdfff);
+static_assert(SurrogatePair(0xdbff, 0xdfff).ToCodePoint() == 0x10ffff);
+
+static_assert(SurrogatePair(0x10000).high() == 0xd800);
+static_assert(SurrogatePair(0x10000).low() == 0xdc00);
+static_assert(SurrogatePair(0x10000).ToCodePoint() == 0x10000);
+
+static_assert(SurrogatePair(0x10ffff).high() == 0xdbff);
+static_assert(SurrogatePair(0x10ffff).low() == 0xdfff);
+static_assert(SurrogatePair(0x10ffff).ToCodePoint() == 0x10ffff);
+
+TEST(SurrogatePairTest, RoundTrip) {
+  for (char32_t code_point = kMinimumSupplementaryCodePoint;
+       code_point <= kMaximumSupplementaryCodePoint; ++code_point) {
+    SurrogatePair from_code_point(code_point);
+    EXPECT_EQ(code_point, from_code_point.ToCodePoint());
+
+    SurrogatePair from_pair(from_code_point.high(), from_code_point.low());
+    EXPECT_EQ(from_code_point.high(), from_pair.high());
+    EXPECT_EQ(from_code_point.low(), from_pair.low());
+    EXPECT_EQ(code_point, from_pair.ToCodePoint());
+  }
+}
+
+}  // namespace pdfium

diff --git a/fpdfsdk/fpdf_edittext.cpp b/fpdfsdk/fpdf_edittext.cpp
index 467a8f3..d54ae2f 100644
--- a/fpdfsdk/fpdf_edittext.cpp
+++ b/fpdfsdk/fpdf_edittext.cpp

@@ -31,6 +31,7 @@
 #include "core/fxcrt/fx_string_wrappers.h"
 #include "core/fxcrt/span_util.h"
 #include "core/fxcrt/stl_util.h"
+#include "core/fxcrt/utf16.h"
 #include "core/fxge/cfx_defaultrenderdevice.h"
 #include "core/fxge/cfx_fontmgr.h"
 #include "core/fxge/fx_font.h"
@@ -166,8 +167,9 @@
 // PDF spec 1.7 Section 5.9.2: "Unicode character sequences as expressed in
 // UTF-16BE encoding." See https://en.wikipedia.org/wiki/UTF-16#Description
 void AddUnicode(fxcrt::ostringstream* pBuffer, uint32_t unicode) {
-  if (unicode >= 0xD800 && unicode <= 0xDFFF)
+  if (pdfium::IsHighSurrogate(unicode) || pdfium::IsLowSurrogate(unicode)) {
     unicode = 0;
+  }
 
   char ans[8];
   *pBuffer << "<";
@@ -369,16 +371,18 @@
   uint32_t dwGlyphIndex;
   uint32_t dwCurrentChar = static_cast<uint32_t>(
       FT_Get_First_Char(pFont->GetFaceRec(), &dwGlyphIndex));
-  static constexpr uint32_t kMaxUnicode = 0x10FFFF;
   // If it doesn't have a single char, just fail
-  if (dwGlyphIndex == 0 || dwCurrentChar > kMaxUnicode)
+  if (dwGlyphIndex == 0 ||
+      dwCurrentChar > pdfium::kMaximumSupplementaryCodePoint) {
     return nullptr;
+  }
 
   std::multimap<uint32_t, uint32_t> to_unicode;
   std::map<uint32_t, uint32_t> widths;
   while (true) {
-    if (dwCurrentChar > kMaxUnicode)
+    if (dwCurrentChar > pdfium::kMaximumSupplementaryCodePoint) {
       break;
+    }
 
     if (!pdfium::Contains(widths, dwGlyphIndex))
       widths[dwGlyphIndex] = pFont->GetGlyphWidth(dwGlyphIndex);
commit	a248b67f7cf67d0430ea245cf5f8017777baae4b	[log] [tgz]
author	K. Moon <kmoon@chromium.org>	Wed May 17 16:34:35 2023 +0000
committer	Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com>	Wed May 17 16:34:35 2023 +0000
tree	dee60e28c5a535a432c1551c758e08a5572b3509
parent	b6c2e82d2da323cc7d1522389ba3ca36ee5a3983 [diff]