Improve ParseDecodeTest cases for test strings Improves the ParserDecodeTest cases for text strings in several ways: 1. Replaces loops with simple linear series of expectations. 2. Adds helpers to avoid manually specifying string literal lengths. 3. Adds extra tests for unpaired surrogates. 4. Adds a test verifying that PDFDocEncoding round-trips. Also adds TODOs omitted earlier from core/fxcrt/fx_string.cpp. Bug: pdfium:2029 Change-Id: Ia1572686f5570999da7281aeee1c6f6b0a0a25e5 Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/107830 Reviewed-by: Tom Sepez <tsepez@chromium.org> Commit-Queue: K. Moon <kmoon@chromium.org>

commit: c4e6b7d3e039b9a8085b231be6efda7e6a33be31 [log] [tgz]
author: K. Moon <kmoon@chromium.org> Thu May 18 22:50:28 2023 +0000
committer: Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com> Thu May 18 22:50:28 2023 +0000
tree: 48ca0404f5197fa48f6907b932916d66eed7c7ed
parent: 448f3994dc496591d1ba8bf8e75bcd225ba1aab5 [diff]
diff --git a/core/fpdfapi/parser/fpdf_parser_decode_unittest.cpp b/core/fpdfapi/parser/fpdf_parser_decode_unittest.cpp
index 2c2074b..f812097 100644
--- a/core/fpdfapi/parser/fpdf_parser_decode_unittest.cpp
+++ b/core/fpdfapi/parser/fpdf_parser_decode_unittest.cpp

@@ -4,6 +4,9 @@
 
 #include "core/fpdfapi/parser/fpdf_parser_decode.h"
 
+#include <stddef.h>
+#include <stdint.h>
+
 #include <iterator>
 
 #include "core/fpdfapi/parser/cpdf_array.h"
@@ -12,9 +15,29 @@
 #include "core/fpdfapi/parser/cpdf_name.h"
 #include "core/fpdfapi/parser/cpdf_reference.h"
 #include "core/fpdfapi/parser/cpdf_string.h"
+#include "core/fxcrt/bytestring.h"
 #include "core/fxcrt/fx_memory_wrappers.h"
+#include "core/fxcrt/string_view_template.h"
+#include "core/fxcrt/widestring.h"
 #include "testing/gtest/include/gtest/gtest.h"
 #include "testing/test_support.h"
+#include "third_party/base/span.h"
+
+namespace {
+
+// Converts a string literal into a `uint8_t` span.
+template <size_t N>
+pdfium::span<const uint8_t> ToSpan(const char (&array)[N]) {
+  return pdfium::span(reinterpret_cast<const uint8_t*>(array), N - 1);
+}
+
+// Converts a string literal into a `ByteString`.
+template <size_t N>
+ByteString ToByteString(const char (&array)[N]) {
+  return ByteString(array, N - 1);
+}
+
+}  // namespace
 
 TEST(ParserDecodeTest, ValidateDecoderPipeline) {
   {
@@ -379,82 +402,90 @@
 }
 
 TEST(ParserDecodeTest, DecodeText) {
-  const struct DecodeTestData {
-    const char* input;
-    size_t input_length;
-    const wchar_t* expected_output;
-    size_t expected_length;
-  } kTestData[] = {
-      // Empty src string.
-      {"", 0, L"", 0},
-      // ASCII text.
-      {"the quick\tfox", 13, L"the quick\tfox", 13},
-      // Unicode text.
-      {"\xFE\xFF\x03\x30\x03\x31", 6, L"\x0330\x0331", 2},
-      // More Unicode text.
-      {"\xFE\xFF\x7F\x51\x98\x75\x00\x20\x56\xFE\x72\x47\x00"
-       "\x20\x8D\x44\x8B\xAF\x66\xF4\x59\x1A\x00\x20\x00\xBB",
-       26,
-       L"\x7F51\x9875\x0020\x56FE\x7247\x0020"
-       L"\x8D44\x8BAF\x66F4\x591A\x0020\x00BB",
-       12},
-      // Unicode escape sequence. https://crbug.com/pdfium/182
-      {"\xFE\xFF\x00\x1B\x6A\x61\x00\x1B\x00\x20\x53\x70\x52\x37", 14,
-       L"\x0020\x5370\x5237", 3},
-      {"\xFE\xFF\x00\x1B\x6A\x61\x00\x1B\x00\x20\x53\x70\x52\x37\x29", 15,
-       L"\x0020\x5370\x5237", 3},
-      {"\xFE\xFF\x00\x1B\x6A\x61\x4A\x50\x00\x1B\x00\x20\x53\x70\x52\x37", 16,
-       L"\x0020\x5370\x5237", 3},
-      {"\xFE\xFF\x00\x20\x00\x1B\x6A\x61\x4A\x50\x00\x1B\x52\x37", 14,
-       L"\x0020\x5237", 2},
-      // https://crbug.com/1001159
-      {"\xFE\xFF\x00\x1B\x00\x1B", 6, L"", 0},
-      {"\xFE\xFF\x00\x1B\x00\x1B\x20", 7, L"", 0},
-      {"\xFE\xFF\x00\x1B\x00\x1B\x00\x20", 8, L"\x0020", 1},
-  };
+  // Empty src string.
+  EXPECT_EQ(L"", PDF_DecodeText(ToSpan("")));
 
-  for (const auto& test_case : kTestData) {
-    WideString output = PDF_DecodeText(
-        pdfium::make_span(reinterpret_cast<const uint8_t*>(test_case.input),
-                          test_case.input_length));
-    ASSERT_EQ(test_case.expected_length, output.GetLength())
-        << "for case " << test_case.input;
-    const wchar_t* str_ptr = output.c_str();
-    for (size_t i = 0; i < test_case.expected_length; ++i) {
-      EXPECT_EQ(test_case.expected_output[i], str_ptr[i])
-          << "for case " << test_case.input << " char " << i;
-    }
-  }
+  // ASCII text.
+  EXPECT_EQ(L"the quick\tfox", PDF_DecodeText(ToSpan("the quick\tfox")));
+
+  // Unicode text.
+  EXPECT_EQ(L"\x0330\x0331",
+            PDF_DecodeText(ToSpan("\xFE\xFF\x03\x30\x03\x31")));
+
+  // More Unicode text.
+  EXPECT_EQ(
+      L"\x7F51\x9875\x0020\x56FE\x7247\x0020"
+      L"\x8D44\x8BAF\x66F4\x591A\x0020\x00BB",
+      PDF_DecodeText(
+          ToSpan("\xFE\xFF\x7F\x51\x98\x75\x00\x20\x56\xFE\x72\x47\x00"
+                 "\x20\x8D\x44\x8B\xAF\x66\xF4\x59\x1A\x00\x20\x00\xBB")));
+}
+
+// https://crbug.com/pdfium/182
+TEST(ParserDecodeTest, DecodeTextWithUnicodeEscapes) {
+  EXPECT_EQ(L"\x0020\x5370\x5237",
+            PDF_DecodeText(ToSpan(
+                "\xFE\xFF\x00\x1B\x6A\x61\x00\x1B\x00\x20\x53\x70\x52\x37")));
+  EXPECT_EQ(
+      L"\x0020\x5370\x5237",
+      PDF_DecodeText(ToSpan(
+          "\xFE\xFF\x00\x1B\x6A\x61\x00\x1B\x00\x20\x53\x70\x52\x37\x29")));
+  EXPECT_EQ(
+      L"\x0020\x5370\x5237",
+      PDF_DecodeText(ToSpan(
+          "\xFE\xFF\x00\x1B\x6A\x61\x4A\x50\x00\x1B\x00\x20\x53\x70\x52\x37")));
+  EXPECT_EQ(L"\x0020\x5237",
+            PDF_DecodeText(ToSpan(
+                "\xFE\xFF\x00\x20\x00\x1B\x6A\x61\x4A\x50\x00\x1B\x52\x37")));
+}
+
+// https://crbug.com/1001159
+TEST(ParserDecodeTest, DecodeTextWithInvalidUnicodeEscapes) {
+  EXPECT_EQ(L"", PDF_DecodeText(ToSpan("\xFE\xFF\x00\x1B\x00\x1B")));
+  EXPECT_EQ(L"", PDF_DecodeText(ToSpan("\xFE\xFF\x00\x1B\x00\x1B\x20")));
+  EXPECT_EQ(L"\x0020",
+            PDF_DecodeText(ToSpan("\xFE\xFF\x00\x1B\x00\x1B\x00\x20")));
+}
+
+TEST(ParserDecodeTest, DecodeTextWithUnpairedSurrogates) {
+  EXPECT_EQ(L"\xD800", PDF_DecodeText(ToSpan("\xFE\xFF\xD8\x00"))) << "High";
+  EXPECT_EQ(L"\xDC00", PDF_DecodeText(ToSpan("\xFE\xFF\xDC\x00"))) << "Low";
 }
 
 TEST(ParserDecodeTest, EncodeText) {
-  const struct EncodeTestData {
-    const wchar_t* input;
-    const char* expected_output;
-    size_t expected_length;
-  } kTestData[] = {
-      // Empty src string.
-      {L"", "", 0},
-      // ASCII text.
-      {L"the quick\tfox", "the quick\tfox", 13},
-      // Unicode text.
-      {L"\x0330\x0331", "\xFE\xFF\x03\x30\x03\x31", 6},
-      // More Unicode text.
-      {L"\x7F51\x9875\x0020\x56FE\x7247\x0020"
-       L"\x8D44\x8BAF\x66F4\x591A\x0020\x00BB",
-       "\xFE\xFF\x7F\x51\x98\x75\x00\x20\x56\xFE\x72\x47\x00"
-       "\x20\x8D\x44\x8B\xAF\x66\xF4\x59\x1A\x00\x20\x00\xBB",
-       26},
-  };
+  // Empty src string.
+  EXPECT_EQ("", PDF_EncodeText(L""));
 
-  for (const auto& test_case : kTestData) {
-    ByteString output = PDF_EncodeText(test_case.input);
-    ASSERT_EQ(test_case.expected_length, output.GetLength())
-        << "for case " << test_case.input;
-    const char* str_ptr = output.c_str();
-    for (size_t j = 0; j < test_case.expected_length; ++j) {
-      EXPECT_EQ(test_case.expected_output[j], str_ptr[j])
-          << "for case " << test_case.input << " char " << j;
+  // ASCII text.
+  EXPECT_EQ("the quick\tfox", PDF_EncodeText(L"the quick\tfox"));
+
+  // Unicode text.
+  EXPECT_EQ("\xFE\xFF\x03\x30\x03\x31", PDF_EncodeText(L"\x0330\x0331"));
+
+  // More Unicode text.
+  EXPECT_EQ(
+      ToByteString("\xFE\xFF\x7F\x51\x98\x75\x00\x20\x56\xFE\x72\x47\x00"
+                   "\x20\x8D\x44\x8B\xAF\x66\xF4\x59\x1A\x00\x20\x00\xBB"),
+      PDF_EncodeText(L"\x7F51\x9875\x0020\x56FE\x7247\x0020"
+                     L"\x8D44\x8BAF\x66F4\x591A\x0020\x00BB"));
+}
+
+TEST(ParserDecodeTest, RoundTripText) {
+  for (int pdf_code_point = 0; pdf_code_point < 256; ++pdf_code_point) {
+    ByteString original(static_cast<char>(pdf_code_point));
+    ByteString reencoded =
+        PDF_EncodeText(PDF_DecodeText(original.raw_span()).AsStringView());
+
+    switch (pdf_code_point) {
+      case 0x7F:
+      case 0x9F:
+      case 0xAD:
+        EXPECT_EQ(ByteString('\0'), reencoded) << "PDFDocEncoding undefined";
+        break;
+
+      default:
+        EXPECT_EQ(original, reencoded) << "PDFDocEncoding: " << pdf_code_point;
+        break;
     }
   }
 }

diff --git a/core/fxcrt/fx_string.cpp b/core/fxcrt/fx_string.cpp
index b783ec7..e7d6e1b 100644
--- a/core/fxcrt/fx_string.cpp
+++ b/core/fxcrt/fx_string.cpp

@@ -24,6 +24,8 @@
 namespace {
 
 // Appends a Unicode code point to a `ByteString` using UTF-8.
+//
+// TODO(crbug.com/pdfium/2041): Migrate to `ByteString`.
 void AppendCodePointToByteString(char32_t code_point, ByteString& buffer) {
   if (code_point > pdfium::kMaximumSupplementaryCodePoint) {
     // Invalid code point above U+10FFFF.
@@ -59,6 +61,7 @@
 // depending on the platform's definition of `wchar_t`.
 //
 // TODO(crbug.com/pdfium/2031): Always use UTF-16.
+// TODO(crbug.com/pdfium/2041): Migrate to `WideString`.
 void AppendCodePointToWideString(char32_t code_point, WideString& buffer) {
   if (code_point > pdfium::kMaximumSupplementaryCodePoint) {
     // Invalid code point above U+10FFFF.
commit	c4e6b7d3e039b9a8085b231be6efda7e6a33be31	[log] [tgz]
author	K. Moon <kmoon@chromium.org>	Thu May 18 22:50:28 2023 +0000
committer	Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com>	Thu May 18 22:50:28 2023 +0000
tree	48ca0404f5197fa48f6907b932916d66eed7c7ed
parent	448f3994dc496591d1ba8bf8e75bcd225ba1aab5 [diff]