Remove FX_UTF8Decode() in favor of WideString::FromUTF8()
One UTF-8 to WideString API is enough, and most places use WideString::FromUTF8().
Pure code move (and caller updates), no behavior change.
Change-Id: Iaa78f09ff7a87f72e1ae4b0747262a906dcfcc4f
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/114090
Commit-Queue: Lei Zhang <thestig@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
Auto-Submit: Nico Weber <thakis@chromium.org>
diff --git a/core/fxcrt/cfx_fileaccess_windows.cpp b/core/fxcrt/cfx_fileaccess_windows.cpp
index bef9f58..303fcd1 100644
--- a/core/fxcrt/cfx_fileaccess_windows.cpp
+++ b/core/fxcrt/cfx_fileaccess_windows.cpp
@@ -26,7 +26,7 @@
if (m_hFile)
return false;
- WideString wname = FX_UTF8Decode(fileName);
+ WideString wname = WideString::FromUTF8(fileName);
m_hFile = ::CreateFileW(wname.c_str(), GENERIC_READ,
FILE_SHARE_READ | FILE_SHARE_WRITE, nullptr,
OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr);
diff --git a/core/fxcrt/fx_string.cpp b/core/fxcrt/fx_string.cpp
index 37b6692..2f13775 100644
--- a/core/fxcrt/fx_string.cpp
+++ b/core/fxcrt/fx_string.cpp
@@ -64,31 +64,6 @@
}
}
-// Appends a Unicode code point to a `WideString` using either UTF-16 or UTF-32,
-// depending on the platform's definition of `wchar_t`.
-//
-// TODO(crbug.com/pdfium/2031): Always use UTF-16.
-// TODO(crbug.com/pdfium/2041): Migrate to `WideString`.
-void AppendCodePointToWideString(char32_t code_point, WideString& buffer) {
- if (code_point > pdfium::kMaximumSupplementaryCodePoint) {
- // Invalid code point above U+10FFFF.
- return;
- }
-
-#if defined(WCHAR_T_IS_16_BIT)
- if (code_point < pdfium::kMinimumSupplementaryCodePoint) {
- buffer += static_cast<wchar_t>(code_point);
- } else {
- // Encode as UTF-16 surrogate pair.
- pdfium::SurrogatePair surrogate_pair(code_point);
- buffer += surrogate_pair.high();
- buffer += surrogate_pair.low();
- }
-#else
- buffer += static_cast<wchar_t>(code_point);
-#endif // defined(WCHAR_T_IS_16_BIT)
-}
-
} // namespace
ByteString FX_UTF8Encode(WideStringView wsStr) {
@@ -99,41 +74,6 @@
return buffer;
}
-WideString FX_UTF8Decode(ByteStringView bsStr) {
- WideString buffer;
-
- int remaining = 0;
- char32_t code_point = 0;
- for (char byte : bsStr) {
- uint8_t code_unit = static_cast<uint8_t>(byte);
- if (code_unit < 0x80) {
- remaining = 0;
- AppendCodePointToWideString(code_unit, buffer);
- } else if (code_unit < 0xc0) {
- if (remaining > 0) {
- --remaining;
- code_point = (code_point << 6) | (code_unit & 0x3f);
- if (remaining == 0) {
- AppendCodePointToWideString(code_point, buffer);
- }
- }
- } else if (code_unit < 0xe0) {
- remaining = 1;
- code_point = code_unit & 0x1f;
- } else if (code_unit < 0xf0) {
- remaining = 2;
- code_point = code_unit & 0x0f;
- } else if (code_unit < 0xf8) {
- remaining = 3;
- code_point = code_unit & 0x07;
- } else {
- remaining = 0;
- }
- }
-
- return buffer;
-}
-
namespace {
constexpr float kFractionScalesFloat[] = {
diff --git a/core/fxcrt/fx_string.h b/core/fxcrt/fx_string.h
index 49351c1..c99e423 100644
--- a/core/fxcrt/fx_string.h
+++ b/core/fxcrt/fx_string.h
@@ -21,7 +21,6 @@
}
ByteString FX_UTF8Encode(WideStringView wsStr);
-WideString FX_UTF8Decode(ByteStringView bsStr);
float StringToFloat(ByteStringView str);
float StringToFloat(WideStringView wsStr);
diff --git a/core/fxcrt/fx_string_unittest.cpp b/core/fxcrt/fx_string_unittest.cpp
index ab7046e..2998658 100644
--- a/core/fxcrt/fx_string_unittest.cpp
+++ b/core/fxcrt/fx_string_unittest.cpp
@@ -64,96 +64,6 @@
}
#endif // defined(WCHAR_T_IS_16_BIT)
-TEST(fxstring, FXUTF8Decode) {
- EXPECT_EQ(L"", FX_UTF8Decode(ByteStringView()));
- EXPECT_EQ(
- L"x"
- L"\u0080"
- L"\u00ff"
- L"\ud7ff"
- L"\ue000"
- L"\uff2c"
- L"\uffff"
- L"y",
- FX_UTF8Decode("x"
- "\u0080"
- "\u00ff"
- "\ud7ff"
- "\ue000"
- "\uff2c"
- "\uffff"
- "y"));
-}
-
-TEST(fxstring, FXUTF8DecodeSupplementary) {
- EXPECT_EQ(
- L"\U00010000"
- L"\U0001f3a8"
- L"\U0010ffff",
- FX_UTF8Decode("\U00010000"
- "🎨"
- "\U0010ffff"));
-}
-
-TEST(fxstring, FXUTF8DecodeErrorRecovery) {
- EXPECT_EQ(L"(A)", FX_UTF8Decode("(\xc2\x41)")) << "Invalid continuation";
- EXPECT_EQ(L"()", FX_UTF8Decode("(\xc2\xc2)")) << "Invalid continuation";
- EXPECT_EQ(L"()", FX_UTF8Decode("(\xc2\xff\x80)")) << "Invalid continuation";
- EXPECT_EQ(L"()", FX_UTF8Decode("(\x80\x80)")) << "Invalid leading";
- EXPECT_EQ(L"()", FX_UTF8Decode("(\xff\x80\x80)")) << "Invalid leading";
- EXPECT_EQ(L"()", FX_UTF8Decode("(\xf8\x80\x80\x80\x80)"))
- << "Invalid leading";
- EXPECT_EQ(L"()", FX_UTF8Decode("(\xf8\x88\x80\x80\x80)"))
- << "Invalid leading";
- EXPECT_EQ(L"()", FX_UTF8Decode("(\xf4\x90\x80\x80)"))
- << "Code point greater than U+10FFFF";
-}
-
-TEST(fxstring, FXUTF8EncodeDecodeConsistency) {
- WideString wstr;
- wstr.Reserve(0x10000);
- for (char32_t w = 0; w < pdfium::kMinimumSupplementaryCodePoint; ++w) {
- if (pdfium::IsHighSurrogate(w) || pdfium::IsLowSurrogate(w)) {
- // Skip UTF-16 surrogates.
- continue;
- }
- wstr += static_cast<wchar_t>(w);
- }
- ASSERT_EQ(0xf800u, wstr.GetLength());
-
- ByteString bstr = FX_UTF8Encode(wstr.AsStringView());
- WideString wstr2 = FX_UTF8Decode(bstr.AsStringView());
- EXPECT_EQ(wstr, wstr2);
-}
-
-TEST(fxstring, FXUTF8EncodeDecodeConsistencyUnpairedHighSurrogates) {
- WideString wstr;
- wstr.Reserve(0x400);
- for (wchar_t w = pdfium::kMinimumHighSurrogateCodeUnit;
- w <= pdfium::kMaximumHighSurrogateCodeUnit; ++w) {
- wstr += w;
- }
- ASSERT_EQ(0x400u, wstr.GetLength());
-
- ByteString bstr = FX_UTF8Encode(wstr.AsStringView());
- WideString wstr2 = FX_UTF8Decode(bstr.AsStringView());
- EXPECT_EQ(wstr, wstr2);
-}
-
-TEST(fxstring, FXUTF8EncodeDecodeConsistencyUnpairedLowSurrogates) {
- WideString wstr;
- wstr.Reserve(0x400);
- for (wchar_t w = pdfium::kMinimumLowSurrogateCodeUnit;
- w <= pdfium::kMaximumLowSurrogateCodeUnit; ++w) {
- wstr += w;
- }
- ASSERT_EQ(0x400u, wstr.GetLength());
-
- ByteString bstr = FX_UTF8Encode(wstr.AsStringView());
- WideString wstr2 = FX_UTF8Decode(bstr.AsStringView());
- EXPECT_EQ(wstr, wstr2);
-}
-
TEST(fxstring, ByteStringToFloat) {
EXPECT_FLOAT_EQ(0.0f, StringToFloat(""));
EXPECT_FLOAT_EQ(0.0f, StringToFloat("0"));
diff --git a/core/fxcrt/widestring.cpp b/core/fxcrt/widestring.cpp
index dca0a48..11261f1 100644
--- a/core/fxcrt/widestring.cpp
+++ b/core/fxcrt/widestring.cpp
@@ -298,6 +298,66 @@
return str;
}
+// Appends a Unicode code point to a `WideString` using either UTF-16 or UTF-32,
+// depending on the platform's definition of `wchar_t`.
+//
+// TODO(crbug.com/pdfium/2031): Always use UTF-16.
+// TODO(crbug.com/pdfium/2041): Migrate to `WideString`.
+void AppendCodePointToWideString(char32_t code_point, WideString& buffer) {
+ if (code_point > pdfium::kMaximumSupplementaryCodePoint) {
+ // Invalid code point above U+10FFFF.
+ return;
+ }
+
+#if defined(WCHAR_T_IS_16_BIT)
+ if (code_point < pdfium::kMinimumSupplementaryCodePoint) {
+ buffer += static_cast<wchar_t>(code_point);
+ } else {
+ // Encode as UTF-16 surrogate pair.
+ pdfium::SurrogatePair surrogate_pair(code_point);
+ buffer += surrogate_pair.high();
+ buffer += surrogate_pair.low();
+ }
+#else
+ buffer += static_cast<wchar_t>(code_point);
+#endif // defined(WCHAR_T_IS_16_BIT)
+}
+
+WideString UTF8Decode(ByteStringView bsStr) {
+ WideString buffer;
+
+ int remaining = 0;
+ char32_t code_point = 0;
+ for (char byte : bsStr) {
+ uint8_t code_unit = static_cast<uint8_t>(byte);
+ if (code_unit < 0x80) {
+ remaining = 0;
+ AppendCodePointToWideString(code_unit, buffer);
+ } else if (code_unit < 0xc0) {
+ if (remaining > 0) {
+ --remaining;
+ code_point = (code_point << 6) | (code_unit & 0x3f);
+ if (remaining == 0) {
+ AppendCodePointToWideString(code_point, buffer);
+ }
+ }
+ } else if (code_unit < 0xe0) {
+ remaining = 1;
+ code_point = code_unit & 0x1f;
+ } else if (code_unit < 0xf0) {
+ remaining = 2;
+ code_point = code_unit & 0x0f;
+ } else if (code_unit < 0xf8) {
+ remaining = 3;
+ code_point = code_unit & 0x07;
+ } else {
+ remaining = 0;
+ }
+ }
+
+ return buffer;
+}
+
} // namespace
namespace fxcrt {
@@ -971,7 +1031,7 @@
// static
WideString WideString::FromUTF8(ByteStringView str) {
- return FX_UTF8Decode(str);
+ return UTF8Decode(str);
}
// static
diff --git a/core/fxcrt/widestring_unittest.cpp b/core/fxcrt/widestring_unittest.cpp
index 5c555ca..f60e11f 100644
--- a/core/fxcrt/widestring_unittest.cpp
+++ b/core/fxcrt/widestring_unittest.cpp
@@ -10,6 +10,7 @@
#include "build/build_config.h"
#include "core/fxcrt/fx_string.h"
+#include "core/fxcrt/utf16.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "third_party/base/containers/contains.h"
#include "third_party/base/containers/span.h"
@@ -1136,6 +1137,99 @@
EXPECT_EQ(0, iter - multi_str.rbegin());
}
+TEST(WideString, FromUTF8) {
+ EXPECT_EQ(L"", WideString::FromUTF8(ByteStringView()));
+ EXPECT_EQ(
+ L"x"
+ L"\u0080"
+ L"\u00ff"
+ L"\ud7ff"
+ L"\ue000"
+ L"\uff2c"
+ L"\uffff"
+ L"y",
+ WideString::FromUTF8("x"
+ "\u0080"
+ "\u00ff"
+ "\ud7ff"
+ "\ue000"
+ "\uff2c"
+ "\uffff"
+ "y"));
+}
+
+TEST(WideString, FromUTF8Supplementary) {
+ EXPECT_EQ(
+ L"\U00010000"
+ L"\U0001f3a8"
+ L"\U0010ffff",
+ WideString::FromUTF8("\U00010000"
+ "🎨"
+ "\U0010ffff"));
+}
+
+TEST(WideString, FromUTF8ErrorRecovery) {
+ EXPECT_EQ(L"(A)", WideString::FromUTF8("(\xc2\x41)"))
+ << "Invalid continuation";
+ EXPECT_EQ(L"()", WideString::FromUTF8("(\xc2\xc2)"))
+ << "Invalid continuation";
+ EXPECT_EQ(L"()", WideString::FromUTF8("(\xc2\xff\x80)"))
+ << "Invalid continuation";
+ EXPECT_EQ(L"()", WideString::FromUTF8("(\x80\x80)")) << "Invalid leading";
+ EXPECT_EQ(L"()", WideString::FromUTF8("(\xff\x80\x80)")) << "Invalid leading";
+ EXPECT_EQ(L"()", WideString::FromUTF8("(\xf8\x80\x80\x80\x80)"))
+ << "Invalid leading";
+ EXPECT_EQ(L"()", WideString::FromUTF8("(\xf8\x88\x80\x80\x80)"))
+ << "Invalid leading";
+ EXPECT_EQ(L"()", WideString::FromUTF8("(\xf4\x90\x80\x80)"))
+ << "Code point greater than U+10FFFF";
+}
+
+TEST(WideString, UTF8EncodeDecodeConsistency) {
+ WideString wstr;
+ wstr.Reserve(0x10000);
+ for (char32_t w = 0; w < pdfium::kMinimumSupplementaryCodePoint; ++w) {
+ if (pdfium::IsHighSurrogate(w) || pdfium::IsLowSurrogate(w)) {
+ // Skip UTF-16 surrogates.
+ continue;
+ }
+ wstr += static_cast<wchar_t>(w);
+ }
+ ASSERT_EQ(0xf800u, wstr.GetLength());
+
+ ByteString bstr = FX_UTF8Encode(wstr.AsStringView());
+ WideString wstr2 = WideString::FromUTF8(bstr.AsStringView());
+ EXPECT_EQ(wstr, wstr2);
+}
+
+TEST(WideString, UTF8EncodeDecodeConsistencyUnpairedHighSurrogates) {
+ WideString wstr;
+ wstr.Reserve(0x400);
+ for (wchar_t w = pdfium::kMinimumHighSurrogateCodeUnit;
+ w <= pdfium::kMaximumHighSurrogateCodeUnit; ++w) {
+ wstr += w;
+ }
+ ASSERT_EQ(0x400u, wstr.GetLength());
+
+ ByteString bstr = FX_UTF8Encode(wstr.AsStringView());
+ WideString wstr2 = WideString::FromUTF8(bstr.AsStringView());
+ EXPECT_EQ(wstr, wstr2);
+}
+
+TEST(WideString, UTF8EncodeDecodeConsistencyUnpairedLowSurrogates) {
+ WideString wstr;
+ wstr.Reserve(0x400);
+ for (wchar_t w = pdfium::kMinimumLowSurrogateCodeUnit;
+ w <= pdfium::kMaximumLowSurrogateCodeUnit; ++w) {
+ wstr += w;
+ }
+ ASSERT_EQ(0x400u, wstr.GetLength());
+
+ ByteString bstr = FX_UTF8Encode(wstr.AsStringView());
+ WideString wstr2 = WideString::FromUTF8(bstr.AsStringView());
+ EXPECT_EQ(wstr, wstr2);
+}
+
TEST(WideString, FromUTF16BE) {
struct UTF16BEDecodeCase {
ByteString in;