Clean up the UTF-8 encoder and decoder
Makes various minor improvements to the UTF-8 encoder and decoder:
1. Consistently uses the terms "code unit" and "code point" as defined
in the Unicode standard.
2. Drops support for invalid code points above U+10FFFF.
3. Simplifies UTF-8 decoder by placing more state on the stack.
4. Consistently uses "char16_t" to represent UTF-16 code units.
5. Consistently uses "char32_t" to represent code points.
6. Uses "char" to represent ByteString elements.
7. Prepares tests to handle UTF-16 in the future.
8. Converts class member names to Google style.
9. Applies include-what-you-use (IWYU) fixes to the affected files.
Bug: pdfium:2029
Change-Id: I1ca1ecf28ee34737e1df741ecfdc259ca2052833
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/107090
Reviewed-by: Nigi <nigi@chromium.org>
Commit-Queue: K. Moon <kmoon@chromium.org>
diff --git a/core/fxcrt/cfx_utf8decoder.cpp b/core/fxcrt/cfx_utf8decoder.cpp
index e834815..276e186 100644
--- a/core/fxcrt/cfx_utf8decoder.cpp
+++ b/core/fxcrt/cfx_utf8decoder.cpp
@@ -6,53 +6,53 @@
#include "core/fxcrt/cfx_utf8decoder.h"
+#include <stdint.h>
+
#include <utility>
CFX_UTF8Decoder::CFX_UTF8Decoder(ByteStringView input) {
- for (char c : input) {
- ProcessByte(c);
+ int remaining = 0;
+ char32_t code_point = 0;
+
+ for (char byte : input) {
+ uint8_t code_unit = static_cast<uint8_t>(byte);
+ if (code_unit < 0x80) {
+ remaining = 0;
+ AppendCodePoint(code_unit);
+ } else if (code_unit < 0xc0) {
+ if (remaining > 0) {
+ --remaining;
+ code_point = (code_point << 6) | (code_unit & 0x3f);
+ if (remaining == 0) {
+ AppendCodePoint(code_point);
+ }
+ }
+ } else if (code_unit < 0xe0) {
+ remaining = 1;
+ code_point = code_unit & 0x1f;
+ } else if (code_unit < 0xf0) {
+ remaining = 2;
+ code_point = code_unit & 0x0f;
+ } else if (code_unit < 0xf8) {
+ remaining = 3;
+ code_point = code_unit & 0x07;
+ } else {
+ remaining = 0;
+ }
}
}
CFX_UTF8Decoder::~CFX_UTF8Decoder() = default;
WideString CFX_UTF8Decoder::TakeResult() {
- return std::move(m_Buffer);
+ return std::move(buffer_);
}
-void CFX_UTF8Decoder::AppendCodePoint(uint32_t ch) {
- m_Buffer += static_cast<wchar_t>(ch);
-}
-
-void CFX_UTF8Decoder::ProcessByte(uint8_t byte) {
- if (byte < 0x80) {
- m_PendingBytes = 0;
- AppendCodePoint(byte);
- } else if (byte < 0xc0) {
- if (m_PendingBytes == 0) {
- return;
- }
- m_PendingBytes--;
- m_PendingChar |= (byte & 0x3f) << (m_PendingBytes * 6);
- if (m_PendingBytes == 0) {
- AppendCodePoint(m_PendingChar);
- }
- } else if (byte < 0xe0) {
- m_PendingBytes = 1;
- m_PendingChar = (byte & 0x1f) << 6;
- } else if (byte < 0xf0) {
- m_PendingBytes = 2;
- m_PendingChar = (byte & 0x0f) << 12;
- } else if (byte < 0xf8) {
- m_PendingBytes = 3;
- m_PendingChar = (byte & 0x07) << 18;
- } else if (byte < 0xfc) {
- m_PendingBytes = 4;
- m_PendingChar = (byte & 0x03) << 24;
- } else if (byte < 0xfe) {
- m_PendingBytes = 5;
- m_PendingChar = (byte & 0x01) << 30;
- } else {
- m_PendingBytes = 0;
+void CFX_UTF8Decoder::AppendCodePoint(char32_t code_point) {
+ if (code_point > 0x10ffff) {
+ // Invalid code point above U+10FFFF.
+ return;
}
+
+ buffer_ += static_cast<wchar_t>(code_point);
}
diff --git a/core/fxcrt/cfx_utf8decoder.h b/core/fxcrt/cfx_utf8decoder.h
index 35b5671..9d9b0c1 100644
--- a/core/fxcrt/cfx_utf8decoder.h
+++ b/core/fxcrt/cfx_utf8decoder.h
@@ -18,12 +18,9 @@
WideString TakeResult();
private:
- void ProcessByte(uint8_t byte);
- void AppendCodePoint(uint32_t ch);
+ void AppendCodePoint(char32_t code_point);
- int m_PendingBytes = 0;
- uint32_t m_PendingChar = 0;
- WideString m_Buffer;
+ WideString buffer_;
};
#endif // CORE_FXCRT_CFX_UTF8DECODER_H_
diff --git a/core/fxcrt/cfx_utf8encoder.cpp b/core/fxcrt/cfx_utf8encoder.cpp
index 4951126..117f3f7 100644
--- a/core/fxcrt/cfx_utf8encoder.cpp
+++ b/core/fxcrt/cfx_utf8encoder.cpp
@@ -6,38 +6,40 @@
#include "core/fxcrt/cfx_utf8encoder.h"
+#include <stdint.h>
+
CFX_UTF8Encoder::CFX_UTF8Encoder() = default;
CFX_UTF8Encoder::~CFX_UTF8Encoder() = default;
-void CFX_UTF8Encoder::Input(wchar_t unicodeAsWchar) {
- uint32_t unicode = static_cast<uint32_t>(unicodeAsWchar);
- if (unicode < 0x80) {
- m_Buffer.push_back(unicode);
+void CFX_UTF8Encoder::Input(wchar_t code_unit) {
+ char32_t code_point = static_cast<char32_t>(code_unit);
+ if (code_point > 0x10ffff) {
+ // Invalid code point above U+10FFFF.
+ return;
+ }
+
+ if (code_point < 0x80) {
+ // 7-bit code points are unchanged in UTF-8.
+ buffer_.push_back(code_point);
+ return;
+ }
+
+ int byte_size;
+ if (code_point < 0x800) {
+ byte_size = 2;
+ } else if (code_point < 0x10000) {
+ byte_size = 3;
} else {
- if (unicode >= 0x80000000)
- return;
+ byte_size = 4;
+ }
- int nbytes = 0;
- if (unicode < 0x800)
- nbytes = 2;
- else if (unicode < 0x10000)
- nbytes = 3;
- else if (unicode < 0x200000)
- nbytes = 4;
- else if (unicode < 0x4000000)
- nbytes = 5;
- else
- nbytes = 6;
-
- static const uint8_t prefix[] = {0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
- int order = 1 << ((nbytes - 1) * 6);
- int code = unicodeAsWchar;
- m_Buffer.push_back(prefix[nbytes - 2] | (code / order));
- for (int i = 0; i < nbytes - 1; i++) {
- code = code % order;
- order >>= 6;
- m_Buffer.push_back(0x80 | (code / order));
- }
+ static constexpr uint8_t kPrefix[] = {0xc0, 0xe0, 0xf0};
+ int order = 1 << ((byte_size - 1) * 6);
+ buffer_.push_back(kPrefix[byte_size - 2] | (code_point / order));
+ for (int i = 0; i < byte_size - 1; i++) {
+ code_point = code_point % order;
+ order >>= 6;
+ buffer_.push_back(0x80 | (code_point / order));
}
}
diff --git a/core/fxcrt/cfx_utf8encoder.h b/core/fxcrt/cfx_utf8encoder.h
index 13815cd..a39f0cb 100644
--- a/core/fxcrt/cfx_utf8encoder.h
+++ b/core/fxcrt/cfx_utf8encoder.h
@@ -7,26 +7,26 @@
#ifndef CORE_FXCRT_CFX_UTF8ENCODER_H_
#define CORE_FXCRT_CFX_UTF8ENCODER_H_
-#include <stdint.h>
-
-#include "core/fxcrt/bytestring.h"
#include "core/fxcrt/data_vector.h"
+#include "core/fxcrt/string_view_template.h"
class CFX_UTF8Encoder {
public:
CFX_UTF8Encoder();
~CFX_UTF8Encoder();
- void Input(wchar_t unicodeAsWchar);
+ // `code_unit` may be UTF-16 or UTF-32, depending on the platform.
+ // TODO(crbug.com/pdfium/2031): Accept `char16_t` instead of `wchar_t`.
+ void Input(wchar_t code_unit);
- // The data returned by GetResult() is invalidated when this is modified by
+ // The data returned by `GetResult()` is invalidated when this is modified by
// appending any data.
ByteStringView GetResult() const {
- return ByteStringView(m_Buffer.data(), m_Buffer.size());
+ return ByteStringView(buffer_.data(), buffer_.size());
}
private:
- DataVector<uint8_t> m_Buffer;
+ DataVector<char> buffer_;
};
#endif // CORE_FXCRT_CFX_UTF8ENCODER_H_
diff --git a/core/fxcrt/fx_string_unittest.cpp b/core/fxcrt/fx_string_unittest.cpp
index 2619864..b3f2864 100644
--- a/core/fxcrt/fx_string_unittest.cpp
+++ b/core/fxcrt/fx_string_unittest.cpp
@@ -20,52 +20,75 @@
return buf.data();
}
-TEST(fxstring, FX_UTF8Encode) {
+TEST(fxstring, FXUTF8Encode) {
EXPECT_EQ("", FX_UTF8Encode(WideStringView()));
EXPECT_EQ(
"x"
"\xc2\x80"
"\xc3\xbf"
+ "\xed\x9f\xbf"
+ "\xee\x80\x80"
"\xef\xbc\xac"
+ "\xef\xbf\xbf"
"y",
FX_UTF8Encode(L"x"
L"\u0080"
L"\u00ff"
+ L"\ud7ff"
+ L"\ue000"
L"\uff2c"
+ L"\uffff"
L"y"));
}
-TEST(fxstring, FX_UTF8Decode) {
+TEST(fxstring, FXUTF8Decode) {
EXPECT_EQ(L"", FX_UTF8Decode(ByteStringView()));
EXPECT_EQ(
L"x"
L"\u0080"
L"\u00ff"
+ L"\ud7ff"
+ L"\ue000"
L"\uff2c"
+ L"\uffff"
L"y",
FX_UTF8Decode("x"
"\xc2\x80"
"\xc3\xbf"
+ "\xed\x9f\xbf"
+ "\xee\x80\x80"
"\xef\xbc\xac"
+ "\xef\xbf\xbf"
"y"));
- EXPECT_EQ(L"a(A) b() c() d() e().",
- FX_UTF8Decode("a(\xc2\x41) " // Invalid continuation.
- "b(\xc2\xc2) " // Invalid continuation.
- "c(\xc2\xff\x80) " // Invalid continuation.
- "d(\x80\x80) " // Invalid leading.
- "e(\xff\x80\x80)" // Invalid leading.
- "."));
}
-TEST(fxstring, FX_UTF8EncodeDecodeConsistency) {
+TEST(fxstring, FXUTF8DecodeErrorRecovery) {
+ EXPECT_EQ(L"(A)", FX_UTF8Decode("(\xc2\x41)")) << "Invalid continuation";
+ EXPECT_EQ(L"()", FX_UTF8Decode("(\xc2\xc2)")) << "Invalid continuation";
+ EXPECT_EQ(L"()", FX_UTF8Decode("(\xc2\xff\x80)")) << "Invalid continuation";
+ EXPECT_EQ(L"()", FX_UTF8Decode("(\x80\x80)")) << "Invalid leading";
+ EXPECT_EQ(L"()", FX_UTF8Decode("(\xff\x80\x80)")) << "Invalid leading";
+ EXPECT_EQ(L"()", FX_UTF8Decode("(\xf8\x80\x80\x80\x80)"))
+ << "Invalid leading";
+ EXPECT_EQ(L"()", FX_UTF8Decode("(\xf8\x88\x80\x80\x80)"))
+ << "Invalid leading";
+ EXPECT_EQ(L"()", FX_UTF8Decode("(\xf4\x90\x80\x80)"))
+ << "Code point greater than U+10FFFF";
+}
+
+TEST(fxstring, FXUTF8EncodeDecodeConsistency) {
WideString wstr;
wstr.Reserve(0x10000);
- for (int w = 0; w < 0x10000; ++w)
- wstr += static_cast<wchar_t>(w);
+ for (int w = 0; w < 0x10000; ++w) {
+ // Skip UTF-16 surrogates.
+ if (w < 0xD800 || w >= 0xE000) {
+ wstr += static_cast<wchar_t>(w);
+ }
+ }
+ ASSERT_EQ(0xF800u, wstr.GetLength());
ByteString bstr = FX_UTF8Encode(wstr.AsStringView());
WideString wstr2 = FX_UTF8Decode(bstr.AsStringView());
- EXPECT_EQ(0x10000u, wstr2.GetLength());
EXPECT_EQ(wstr, wstr2);
}
diff --git a/core/fxcrt/widestring.h b/core/fxcrt/widestring.h
index 24547dd..cf9edd3 100644
--- a/core/fxcrt/widestring.h
+++ b/core/fxcrt/widestring.h
@@ -32,6 +32,7 @@
// avoids the cost of std::string's iterator stability guarantees.
class WideString {
public:
+ // TODO(crbug.com/pdfium/2031): Consider switching to `char16_t` instead.
using CharType = wchar_t;
using const_iterator = const CharType*;
using const_reverse_iterator = std::reverse_iterator<const_iterator>;
diff --git a/fxjs/xfa/cfxjse_formcalc_context_embeddertest.cpp b/fxjs/xfa/cfxjse_formcalc_context_embeddertest.cpp
index 1478bf9..9aa7b8b 100644
--- a/fxjs/xfa/cfxjse_formcalc_context_embeddertest.cpp
+++ b/fxjs/xfa/cfxjse_formcalc_context_embeddertest.cpp
@@ -4,6 +4,7 @@
#include <math.h>
+#include "build/build_config.h"
#include "fxjs/fxv8.h"
#include "fxjs/xfa/cfxjse_engine.h"
#include "fxjs/xfa/cfxjse_isolatetracker.h"
@@ -717,13 +718,12 @@
ExecuteExpectString("Encode(\"\\u0022\\u00f5\\ufed0\", \"html\")",
""õﻐ");
-#if !BUILDFLAG(IS_WIN)
- // Windows wchar_t isn't wide enough to handle these anyways.
- // TODO(tsepez): fix surrogate encodings.
+#if defined(WCHAR_T_IS_UTF32)
+ // TODO(crbug.com/pdfium/2029): Support UTF-16.
ExecuteExpectString("Encode(\"\\uD83D\\uDCA9\", \"url\")", "%01%f4%a9");
ExecuteExpectString("Encode(\"\\uD83D\\uDCA9\", \"xml\")", "");
ExecuteExpectString("Encode(\"\\uD83D\\uDCA9\", \"html\")", "");
-#endif // !BUILDFLAG(IS_WIN)
+#endif // defined(WCHAR_T_IS_UTF32)
}
TEST_F(CFXJSE_FormCalcContextEmbedderTest, DISABLED_Format) {