Make WideString's FromUTF16LE(), FromUTF16BE() take bytes, not wchar_t

This allows callers to pass in UTF-16 data that may not be
2-byte aligned, and it seems to be what most callers want anyway.

With this change, these functions can be used in PDF_DecodeText.

No intended behavior change.

Change-Id: I2095c49b98646a33b21342fe4289c0436f8447b9
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/113950
Reviewed-by: Lei Zhang <thestig@chromium.org>
Commit-Queue: Nico Weber <thakis@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
Auto-Submit: Nico Weber <thakis@chromium.org>
diff --git a/core/fxcrt/widestring.cpp b/core/fxcrt/widestring.cpp
index 9d0b956..2ad7b5d 100644
--- a/core/fxcrt/widestring.cpp
+++ b/core/fxcrt/widestring.cpp
@@ -957,36 +957,39 @@
 }
 
 // static
-WideString WideString::FromUTF16LE(const unsigned short* wstr, size_t wlen) {
-  if (!wstr || wlen == 0)
+WideString WideString::FromUTF16LE(pdfium::span<const uint8_t> data) {
+  if (data.empty()) {
     return WideString();
+  }
 
   WideString result;
+  size_t length = 0;
   {
     // Span's lifetime must end before ReleaseBuffer() below.
-    pdfium::span<wchar_t> buf = result.GetBuffer(wlen);
-    for (size_t i = 0; i < wlen; i++)
-      buf[i] = wstr[i];
+    pdfium::span<wchar_t> buf = result.GetBuffer(data.size() / 2);
+    for (size_t i = 0; i < data.size() - 1; i += 2) {
+      buf[length++] = data[i] | data[i + 1] << 8;
+    }
   }
-  result.ReleaseBuffer(wlen);
+  result.ReleaseBuffer(length);
   return result;
 }
 
-WideString WideString::FromUTF16BE(const unsigned short* wstr, size_t wlen) {
-  if (!wstr || wlen == 0)
+WideString WideString::FromUTF16BE(pdfium::span<const uint8_t> data) {
+  if (data.empty()) {
     return WideString();
+  }
 
   WideString result;
+  size_t length = 0;
   {
     // Span's lifetime must end before ReleaseBuffer() below.
-    pdfium::span<wchar_t> buf = result.GetBuffer(wlen);
-    for (size_t i = 0; i < wlen; i++) {
-      auto wch = wstr[i];
-      wch = (wch >> 8) | (wch << 8);
-      buf[i] = wch;
+    pdfium::span<wchar_t> buf = result.GetBuffer(data.size() / 2);
+    for (size_t i = 0; i < data.size() - 1; i += 2) {
+      buf[length++] = data[i] << 8 | data[i + 1];
     }
   }
-  result.ReleaseBuffer(wlen);
+  result.ReleaseBuffer(length);
   return result;
 }
 
diff --git a/core/fxcrt/widestring.h b/core/fxcrt/widestring.h
index a01e960..0b8d7aa 100644
--- a/core/fxcrt/widestring.h
+++ b/core/fxcrt/widestring.h
@@ -71,10 +71,8 @@
   [[nodiscard]] static WideString FromLatin1(ByteStringView str);
   [[nodiscard]] static WideString FromDefANSI(ByteStringView str);
   [[nodiscard]] static WideString FromUTF8(ByteStringView str);
-  [[nodiscard]] static WideString FromUTF16LE(const unsigned short* str,
-                                              size_t len);
-  [[nodiscard]] static WideString FromUTF16BE(const unsigned short* wstr,
-                                              size_t wlen);
+  [[nodiscard]] static WideString FromUTF16LE(pdfium::span<const uint8_t> data);
+  [[nodiscard]] static WideString FromUTF16BE(pdfium::span<const uint8_t> data);
 
   [[nodiscard]] static size_t WStringLength(const unsigned short* str);
 
diff --git a/core/fxge/fx_font.cpp b/core/fxge/fx_font.cpp
index a3bf344..f0f3576 100644
--- a/core/fxge/fx_font.cpp
+++ b/core/fxge/fx_font.cpp
@@ -117,11 +117,7 @@
           return ByteString();
         }
 
-        pdfium::span<const uint8_t> raw_span = utf16_be.raw_span();
-        return WideString::FromUTF16BE(
-                   reinterpret_cast<const uint16_t*>(raw_span.data()),
-                   raw_span.size() / 2)
-            .ToUTF8();
+        return WideString::FromUTF16BE(utf16_be.raw_span()).ToUTF8();
       }
     }
   }
diff --git a/core/fxge/win32/cwin32_platform.cpp b/core/fxge/win32/cwin32_platform.cpp
index 0d08822..461a915 100644
--- a/core/fxge/win32/cwin32_platform.cpp
+++ b/core/fxge/win32/cwin32_platform.cpp
@@ -25,11 +25,11 @@
 
 struct Variant {
   const char* m_pFaceName;
-  const char* m_pVariantName;  // Note: UTF-16LE terminator required.
+  pdfium::span<const char> m_pVariantName;
 };
 
 constexpr Variant kVariantNames[] = {
-    {"DFKai-SB", "\x19\x6A\x77\x69\xD4\x9A\x00\x00"},
+    {"DFKai-SB", pdfium::make_span("\x19\x6A\x77\x69\xD4\x9A")},
 };
 
 struct Substs {
@@ -375,10 +375,8 @@
     if (new_face != variant.m_pFaceName)
       continue;
 
-    const auto* pName =
-        reinterpret_cast<const unsigned short*>(variant.m_pVariantName);
-    size_t len = WideString::WStringLength(pName);
-    WideString wsName = WideString::FromUTF16LE(pName, len);
+    WideString wsName =
+        WideString::FromUTF16LE(pdfium::as_bytes(variant.m_pVariantName));
     if (wsFace == wsName)
       return hFont;
   }
diff --git a/fpdfsdk/cpdfsdk_formfillenvironment.cpp b/fpdfsdk/cpdfsdk_formfillenvironment.cpp
index 075e43e..a063f16 100644
--- a/fpdfsdk/cpdfsdk_formfillenvironment.cpp
+++ b/fpdfsdk/cpdfsdk_formfillenvironment.cpp
@@ -154,8 +154,8 @@
   if (nActualLen <= 0 || nActualLen > nRequiredLen)
     return WideString();
 
-  return WideString::FromUTF16LE(reinterpret_cast<uint16_t*>(pBuff.data()),
-                                 nActualLen / sizeof(uint16_t));
+  return WideString::FromUTF16LE(
+      {pBuff.data(), static_cast<size_t>(nActualLen)});
 #else   // PDF_ENABLE_XFA
   return WideString();
 #endif  // PDF_ENABLE_XFA
@@ -176,8 +176,8 @@
   if (nActualLen <= 0 || nActualLen > nRequiredLen)
     return WideString();
 
-  return WideString::FromUTF16LE(reinterpret_cast<uint16_t*>(pBuff.data()),
-                                 nActualLen / sizeof(uint16_t));
+  return WideString::FromUTF16LE(
+      {pBuff.data(), static_cast<size_t>(nActualLen)});
 #else   // PDF_ENABLE_XFA
   return WideString();
 #endif  // PDF_ENABLE_XFA
@@ -570,8 +570,8 @@
       AsFPDFWideString(&bsHeader), &response);
 
   WideString wsRet =
-      WideString::FromUTF16LE(reinterpret_cast<FPDF_WIDESTRING>(response.str),
-                              response.len / sizeof(FPDF_WCHAR));
+      WideString::FromUTF16LE({reinterpret_cast<const uint8_t*>(response.str),
+                               static_cast<size_t>(response.len)});
 
   FPDF_BStr_Clear(&response);
   return wsRet;
diff --git a/fpdfsdk/cpdfsdk_helpers.cpp b/fpdfsdk/cpdfsdk_helpers.cpp
index c548420..4d5020f 100644
--- a/fpdfsdk/cpdfsdk_helpers.cpp
+++ b/fpdfsdk/cpdfsdk_helpers.cpp
@@ -210,8 +210,8 @@
 }
 
 WideString WideStringFromFPDFWideString(FPDF_WIDESTRING wide_string) {
-  return WideString::FromUTF16LE(wide_string,
-                                 WideString::WStringLength(wide_string));
+  return WideString::FromUTF16LE({reinterpret_cast<const uint8_t*>(wide_string),
+                                  WideString::WStringLength(wide_string) * 2});
 }
 
 #ifdef PDF_ENABLE_XFA
diff --git a/fpdfsdk/fpdf_formfill_embeddertest.cpp b/fpdfsdk/fpdf_formfill_embeddertest.cpp
index 71bede8..c35c785 100644
--- a/fpdfsdk/fpdf_formfill_embeddertest.cpp
+++ b/fpdfsdk/fpdf_formfill_embeddertest.cpp
@@ -133,12 +133,12 @@
     ASSERT_NE(actual_len, 0U);
     ASSERT_LT(actual_len, 1000U);
 
-    std::vector<unsigned short> buf(actual_len);
+    std::vector<uint8_t> buf(actual_len);
     ASSERT_EQ(actual_len, FORM_GetSelectedText(form_handle(), page_, buf.data(),
                                                actual_len));
 
-    int num_chars = (actual_len / sizeof(unsigned short)) - 1;
-    EXPECT_EQ(expected_string, WideString::FromUTF16LE(buf.data(), num_chars));
+    EXPECT_EQ(expected_string,
+              WideString::FromUTF16LE({buf.data(), actual_len - 2}));
   }
 
   void FocusOnPoint(const CFX_PointF& point) {
@@ -151,12 +151,12 @@
     ASSERT_NE(actual_len, 0U);
     ASSERT_LT(actual_len, 1000U);
 
-    std::vector<unsigned short> buf(actual_len);
+    std::vector<uint8_t> buf(actual_len);
     ASSERT_EQ(actual_len, FORM_GetFocusedText(form_handle(), page_, buf.data(),
                                               actual_len));
 
-    int num_chars = (actual_len / sizeof(unsigned short)) - 1;
-    EXPECT_EQ(expected_string, WideString::FromUTF16LE(buf.data(), num_chars));
+    EXPECT_EQ(expected_string,
+              WideString::FromUTF16LE({buf.data(), actual_len - 2}));
   }
 
   void CheckCanUndo(bool expected_result) {
diff --git a/fpdfsdk/fpdfxfa/cpdfxfa_context.cpp b/fpdfsdk/fpdfxfa/cpdfxfa_context.cpp
index 1271fbd..75f9e06 100644
--- a/fpdfsdk/fpdfxfa/cpdfxfa_context.cpp
+++ b/fpdfsdk/fpdfxfa/cpdfxfa_context.cpp
@@ -364,17 +364,16 @@
     return WideString();
 
   constexpr int kMaxWideChars = 1024;
-  FixedZeroedDataVector<uint16_t> buffer(kMaxWideChars);
-  pdfium::span<uint16_t> buffer_span = buffer.writable_span();
+  constexpr int kMaxBytes = kMaxWideChars * sizeof(uint16_t);
+  FixedZeroedDataVector<uint8_t> buffer(kMaxBytes);
+  pdfium::span<uint8_t> buffer_span = buffer.writable_span();
   int byte_length = m_pFormFillEnv->JS_appResponse(
-      wsQuestion, wsTitle, wsDefaultAnswer, WideString(), bMark,
-      pdfium::as_writable_bytes(buffer_span));
+      wsQuestion, wsTitle, wsDefaultAnswer, WideString(), bMark, buffer_span);
   if (byte_length <= 0)
     return WideString();
 
-  buffer_span = buffer_span.first(
-      std::min<size_t>(kMaxWideChars, byte_length / sizeof(uint16_t)));
-  return WideString::FromUTF16LE(buffer_span.data(), buffer_span.size());
+  buffer_span = buffer_span.first(std::min<size_t>(kMaxBytes, byte_length));
+  return WideString::FromUTF16LE(buffer_span);
 }
 
 RetainPtr<IFX_SeekableReadStream> CPDFXFA_Context::DownloadURL(
diff --git a/fpdfsdk/fpdfxfa/cpdfxfa_docenvironment.cpp b/fpdfsdk/fpdfxfa/cpdfxfa_docenvironment.cpp
index ff93008..8800a0c 100644
--- a/fpdfsdk/fpdfxfa/cpdfxfa_docenvironment.cpp
+++ b/fpdfsdk/fpdfxfa/cpdfxfa_docenvironment.cpp
@@ -503,9 +503,7 @@
         continue;
       }
       if (i == pArray->size() - 1) {
-        WideString wPath = WideString::FromUTF16LE(
-            reinterpret_cast<const unsigned short*>(bs.c_str()),
-            bs.GetLength() / sizeof(unsigned short));
+        WideString wPath = WideString::FromUTF16LE(bs.raw_span());
         ByteString bPath = wPath.ToUTF8();
         static const char kFormat[] =
             "\n<pdf href=\"%s\" xmlns=\"http://ns.adobe.com/xdp/pdf/\"/>";
diff --git a/fxjs/cjs_app.cpp b/fxjs/cjs_app.cpp
index d00f8d0..d3dd81d 100644
--- a/fxjs/cjs_app.cpp
+++ b/fxjs/cjs_app.cpp
@@ -545,19 +545,16 @@
 
   constexpr int kMaxWideChars = 1024;
   constexpr int kMaxBytes = kMaxWideChars * sizeof(uint16_t);
-  FixedZeroedDataVector<uint16_t> buffer(kMaxWideChars);
-  pdfium::span<uint16_t> buffer_span = buffer.writable_span();
+  FixedZeroedDataVector<uint8_t> buffer(kMaxBytes);
+  pdfium::span<uint8_t> buffer_span = buffer.writable_span();
   int byte_length = pRuntime->GetFormFillEnv()->JS_appResponse(
-      swQuestion, swTitle, swDefault, swLabel, bPassword,
-      pdfium::as_writable_bytes(buffer_span));
+      swQuestion, swTitle, swDefault, swLabel, bPassword, buffer_span);
   if (byte_length < 0 || byte_length > kMaxBytes)
     return CJS_Result::Failure(JSMessage::kParamTooLongError);
 
-  buffer_span = buffer_span.first(
-      std::min<size_t>(kMaxWideChars, byte_length / sizeof(uint16_t)));
-  return CJS_Result::Success(pRuntime->NewString(
-      WideString::FromUTF16LE(buffer_span.data(), buffer_span.size())
-          .AsStringView()));
+  buffer_span = buffer_span.first(std::min<size_t>(kMaxBytes, byte_length));
+  return CJS_Result::Success(
+      pRuntime->NewString(WideString::FromUTF16LE(buffer_span).AsStringView()));
 }
 
 CJS_Result CJS_App::get_media(CJS_Runtime* pRuntime) {
diff --git a/testing/BUILD.gn b/testing/BUILD.gn
index dbb11a9..d61f8f3 100644
--- a/testing/BUILD.gn
+++ b/testing/BUILD.gn
@@ -44,6 +44,7 @@
     "../core/fdrm",
     "../core/fxcrt",
     "../core/fxge",
+    "../fpdfsdk",
     "image_diff",
   ]
   configs += [
diff --git a/testing/fuzzers/pdf_bidi_fuzzer.cc b/testing/fuzzers/pdf_bidi_fuzzer.cc
index ffa6558..2871961 100644
--- a/testing/fuzzers/pdf_bidi_fuzzer.cc
+++ b/testing/fuzzers/pdf_bidi_fuzzer.cc
@@ -27,9 +27,7 @@
   rtf_break.SetFont(CFGAS_GEFont::LoadFont(std::move(font)));
   rtf_break.SetFontSize(12);
 
-  WideString input =
-      WideString::FromUTF16LE(reinterpret_cast<const unsigned short*>(data),
-                              size / sizeof(unsigned short));
+  WideString input = WideString::FromUTF16LE({data, size});
   for (wchar_t ch : input)
     rtf_break.AppendChar(ch);
 
diff --git a/testing/fuzzers/pdf_cfx_barcode_fuzzer.cc b/testing/fuzzers/pdf_cfx_barcode_fuzzer.cc
index 35afec9..4df0130 100644
--- a/testing/fuzzers/pdf_cfx_barcode_fuzzer.cc
+++ b/testing/fuzzers/pdf_cfx_barcode_fuzzer.cc
@@ -24,8 +24,7 @@
   barcode->SetHeight(298);
   barcode->SetWidth(418);
 
-  WideString content = WideString::FromUTF16LE(
-      reinterpret_cast<const uint16_t*>(data), size / sizeof(uint16_t));
+  WideString content = WideString::FromUTF16LE({data, size});
 
   if (!barcode->Encode(content.AsStringView()))
     return 0;
diff --git a/testing/fuzzers/pdf_cjs_util_fuzzer.cc b/testing/fuzzers/pdf_cjs_util_fuzzer.cc
index 3885c7b..51d6b68 100644
--- a/testing/fuzzers/pdf_cjs_util_fuzzer.cc
+++ b/testing/fuzzers/pdf_cjs_util_fuzzer.cc
@@ -6,18 +6,15 @@
 #include "fxjs/cjs_util.h"
 
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
-  auto* short_data = reinterpret_cast<const unsigned short*>(data);
-  size_t short_size = size / sizeof(unsigned short);
-  if (short_size > 1) {
-    WideString input = WideString::FromUTF16LE(short_data, short_size);
+  if (size > 2) {
+    WideString input = WideString::FromUTF16LE({data, size});
     CJS_Util::ParseDataType(&input);
   }
-  if (short_size > 2) {
-    size_t short_len1 = short_size / 2;
-    size_t short_len2 = short_size - short_len1;
-    WideString input1 = WideString::FromUTF16LE(short_data, short_len1);
-    WideString input2 =
-        WideString::FromUTF16LE(short_data + short_len1, short_len2);
+  if (size > 4) {
+    size_t len1 = size / 2;
+    size_t len2 = size - len1;
+    WideString input1 = WideString::FromUTF16LE({data, len1});
+    WideString input2 = WideString::FromUTF16LE({data + len1, len2});
     CJS_Util::StringPrintx(input1, input2);
   }
   return 0;
diff --git a/testing/fuzzers/pdf_fx_date_helpers_fuzzer.cc b/testing/fuzzers/pdf_fx_date_helpers_fuzzer.cc
index d98fffd..0e198c7 100644
--- a/testing/fuzzers/pdf_fx_date_helpers_fuzzer.cc
+++ b/testing/fuzzers/pdf_fx_date_helpers_fuzzer.cc
@@ -6,15 +6,12 @@
 #include "fxjs/fx_date_helpers.h"
 
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
-  auto* short_data = reinterpret_cast<const unsigned short*>(data);
-  size_t short_size = size / sizeof(unsigned short);
-  if (short_size > 2 && short_size < 8192) {
+  if (size > 4 && size < 16384) {
     double ignore;
-    size_t short_len1 = short_size / 2;
-    size_t short_len2 = short_size - short_len1;
-    WideString input1 = WideString::FromUTF16LE(short_data, short_len1);
-    WideString input2 =
-        WideString::FromUTF16LE(short_data + short_len1, short_len2);
+    size_t len1 = size / 2;
+    size_t len2 = size - len1;
+    WideString input1 = WideString::FromUTF16LE({data, len1});
+    WideString input2 = WideString::FromUTF16LE({data + len1, len2});
     FX_ParseDateUsingFormat(input1, input2, &ignore);
   }
   return 0;
diff --git a/testing/fuzzers/pdf_nametree_fuzzer.cc b/testing/fuzzers/pdf_nametree_fuzzer.cc
index 43c10c7..1058eec 100644
--- a/testing/fuzzers/pdf_nametree_fuzzer.cc
+++ b/testing/fuzzers/pdf_nametree_fuzzer.cc
@@ -28,8 +28,7 @@
     constexpr size_t kMaxNameLen = 10;
     std::string str = data_provider->ConsumeRandomLengthString(kMaxNameLen);
     names.push_back(WideString::FromUTF16LE(
-        reinterpret_cast<const unsigned short*>(str.data()),
-        str.size() / sizeof(unsigned short)));
+        {reinterpret_cast<const uint8_t*>(str.data()), str.size()}));
   }
   return names;
 }
diff --git a/testing/fx_string_testhelpers.cpp b/testing/fx_string_testhelpers.cpp
index 09f7653..399335c 100644
--- a/testing/fx_string_testhelpers.cpp
+++ b/testing/fx_string_testhelpers.cpp
@@ -10,6 +10,7 @@
 
 #include "core/fxcrt/cfx_datetime.h"
 #include "core/fxcrt/fx_string.h"
+#include "fpdfsdk/cpdfsdk_helpers.h"
 #include "third_party/base/check_op.h"
 #include "third_party/base/containers/span.h"
 
@@ -38,8 +39,7 @@
 }
 
 std::string GetPlatformString(FPDF_WIDESTRING wstr) {
-  WideString wide_string =
-      WideString::FromUTF16LE(wstr, WideString::WStringLength(wstr));
+  WideString wide_string = WideStringFromFPDFWideString(wstr);
   return std::string(wide_string.ToUTF8().c_str());
 }
 
diff --git a/xfa/fwl/cfwl_edit_embeddertest.cpp b/xfa/fwl/cfwl_edit_embeddertest.cpp
index 7496a24..25f6dd2 100644
--- a/xfa/fwl/cfwl_edit_embeddertest.cpp
+++ b/xfa/fwl/cfwl_edit_embeddertest.cpp
@@ -78,9 +78,9 @@
   // 12 == (2 * strlen(defgh)) + 2 (for \0\0)
   EXPECT_EQ(12UL, FORM_GetSelectedText(form_handle(), page(), nullptr, 0));
 
-  unsigned short buf[128];
+  uint8_t buf[128];
   unsigned long len = FORM_GetSelectedText(form_handle(), page(), &buf, 128);
-  EXPECT_STREQ(L"defgh", WideString::FromUTF16LE(buf, len).c_str());
+  EXPECT_STREQ(L"defgh", WideString::FromUTF16LE({buf, len}).c_str());
 }
 
 TEST_F(CFWLEditEmbedderTest, DragMouseSelection) {
@@ -101,9 +101,9 @@
   // 12 == (2 * strlen(defgh)) + 2 (for \0\0)
   EXPECT_EQ(12UL, FORM_GetSelectedText(form_handle(), page(), nullptr, 0));
 
-  unsigned short buf[128];
+  uint8_t buf[128];
   unsigned long len = FORM_GetSelectedText(form_handle(), page(), &buf, 128);
-  EXPECT_STREQ(L"defgh", WideString::FromUTF16LE(buf, len).c_str());
+  EXPECT_STREQ(L"defgh", WideString::FromUTF16LE({buf, len}).c_str());
 
   // TODO(hnakashima): This is incorrect. Visually 'abcdefgh' are selected.
   const char kDraggedMD5[] = "f131526c8edd04e44de17b2647ec54c8";