Don't re-encode annotation /Contents

Copies the /Contents string of an annotation unmodified.

Bug: pdfium:1098
Change-Id: I33a20e012fefb8b8061885d224ae059dab0d8bf0
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/106550
Commit-Queue: K. Moon <kmoon@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
diff --git a/core/fpdfdoc/BUILD.gn b/core/fpdfdoc/BUILD.gn
index 2e7a150..5e11adb 100644
--- a/core/fpdfdoc/BUILD.gn
+++ b/core/fpdfdoc/BUILD.gn
@@ -97,6 +97,7 @@
   sources = [
     "cpdf_action_unittest.cpp",
     "cpdf_annot_unittest.cpp",
+    "cpdf_annotlist_unittest.cpp",
     "cpdf_bafontmap_unittest.cpp",
     "cpdf_defaultappearance_unittest.cpp",
     "cpdf_dest_unittest.cpp",
diff --git a/core/fpdfdoc/cpdf_annotlist.cpp b/core/fpdfdoc/cpdf_annotlist.cpp
index c421e95..ff3ed81 100644
--- a/core/fpdfdoc/cpdf_annotlist.cpp
+++ b/core/fpdfdoc/cpdf_annotlist.cpp
@@ -23,6 +23,7 @@
 #include "core/fpdfapi/parser/cpdf_number.h"
 #include "core/fpdfapi/parser/cpdf_reference.h"
 #include "core/fpdfapi/parser/cpdf_string.h"
+#include "core/fpdfapi/parser/fpdf_parser_decode.h"
 #include "core/fpdfapi/render/cpdf_renderoptions.h"
 #include "core/fpdfdoc/cpdf_annot.h"
 #include "core/fpdfdoc/cpdf_formfield.h"
@@ -78,12 +79,13 @@
   if (!pParentDict)
     return nullptr;
 
-  // TODO(jaepark): We shouldn't strip BOM for some strings and not for others.
-  // See pdfium:593.
-  WideString sContents =
-      pParentDict->GetUnicodeTextFor(pdfium::annotation::kContents);
-  if (sContents.IsEmpty())
+  // TODO(crbug.com/pdfium/1098): Determine if we really need to check if
+  // /Contents is empty or not. If so, optimize decoding for empty check.
+  ByteString contents =
+      pParentDict->GetByteStringFor(pdfium::annotation::kContents);
+  if (PDF_DecodeText(contents.raw_span()).IsEmpty()) {
     return nullptr;
+  }
 
   auto pAnnotDict = pDocument->New<CPDF_Dictionary>();
   pAnnotDict->SetNewFor<CPDF_Name>(pdfium::annotation::kType, "Annot");
@@ -91,8 +93,8 @@
   pAnnotDict->SetNewFor<CPDF_String>(
       pdfium::form_fields::kT,
       pParentDict->GetByteStringFor(pdfium::form_fields::kT), false);
-  pAnnotDict->SetNewFor<CPDF_String>(pdfium::annotation::kContents,
-                                     sContents.ToUTF8(), false);
+  pAnnotDict->SetNewFor<CPDF_String>(pdfium::annotation::kContents, contents,
+                                     /*bHex=*/false);
 
   CFX_FloatRect rect = pParentDict->GetRectFor(pdfium::annotation::kRect);
   rect.Normalize();
diff --git a/core/fpdfdoc/cpdf_annotlist_unittest.cpp b/core/fpdfdoc/cpdf_annotlist_unittest.cpp
new file mode 100644
index 0000000..5d7e1e2
--- /dev/null
+++ b/core/fpdfdoc/cpdf_annotlist_unittest.cpp
@@ -0,0 +1,127 @@
+// Copyright 2023 The PDFium Authors
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "core/fpdfdoc/cpdf_annotlist.h"
+
+#include <stdint.h>
+
+#include <initializer_list>
+#include <memory>
+
+#include "constants/annotation_common.h"
+#include "core/fpdfapi/page/cpdf_page.h"
+#include "core/fpdfapi/page/test_with_page_module.h"
+#include "core/fpdfapi/parser/cpdf_array.h"
+#include "core/fpdfapi/parser/cpdf_dictionary.h"
+#include "core/fpdfapi/parser/cpdf_name.h"
+#include "core/fpdfapi/parser/cpdf_string.h"
+#include "core/fpdfapi/parser/cpdf_test_document.h"
+#include "core/fpdfdoc/cpdf_annot.h"
+#include "core/fxcrt/bytestring.h"
+#include "core/fxcrt/retain_ptr.h"
+#include "core/fxcrt/widestring.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace {
+
+class CPDFAnnotListTest : public TestWithPageModule {
+ public:
+  void SetUp() override {
+    TestWithPageModule::SetUp();
+
+    document_ = std::make_unique<CPDF_TestDocument>();
+    document_->SetRoot(pdfium::MakeRetain<CPDF_Dictionary>());
+    page_ = pdfium::MakeRetain<CPDF_Page>(
+        document_.get(), pdfium::MakeRetain<CPDF_Dictionary>());
+  }
+
+  void TearDown() override {
+    page_.Reset();
+    document_.reset();
+
+    TestWithPageModule::TearDown();
+  }
+
+ protected:
+  void AddTextAnnotation(const ByteString& contents) {
+    RetainPtr<CPDF_Dictionary> annotation =
+        page_->GetOrCreateAnnotsArray()->AppendNew<CPDF_Dictionary>();
+    annotation->SetNewFor<CPDF_Name>(pdfium::annotation::kSubtype, "Text");
+    annotation->SetNewFor<CPDF_String>(pdfium::annotation::kContents, contents,
+                                       /*bHex=*/false);
+  }
+
+  std::unique_ptr<CPDF_TestDocument> document_;
+  RetainPtr<CPDF_Page> page_;
+};
+
+ByteString MakeByteString(std::initializer_list<uint8_t> bytes) {
+  return ByteString(std::data(bytes), std::size(bytes));
+}
+
+ByteString GetRawContents(const CPDF_Annot* annotation) {
+  return annotation->GetAnnotDict()->GetByteStringFor(
+      pdfium::annotation::kContents);
+}
+
+WideString GetDecodedContents(const CPDF_Annot* annotation) {
+  return annotation->GetAnnotDict()->GetUnicodeTextFor(
+      pdfium::annotation::kContents);
+}
+
+}  // namespace
+
+TEST_F(CPDFAnnotListTest, CreatePopupAnnotFromPdfEncoded) {
+  const ByteString kContents = MakeByteString({'A', 'a', 0xE4, 0xA0});
+  AddTextAnnotation(kContents);
+
+  CPDF_AnnotList list(page_);
+
+  ASSERT_EQ(2u, list.Count());
+  EXPECT_EQ(kContents, GetRawContents(list.GetAt(1)));
+  EXPECT_EQ(WideString::FromUTF8("Aaä€"), GetDecodedContents(list.GetAt(1)));
+}
+
+TEST_F(CPDFAnnotListTest, CreatePopupAnnotFromUnicode) {
+  const ByteString kContents =
+      MakeByteString({0xFE, 0xFF, 0x00, 'A', 0x00, 'a', 0x00, 0xE4, 0x20, 0xAC,
+                      0xD8, 0x3D, 0xDC, 0xC4});
+  AddTextAnnotation(kContents);
+
+  CPDF_AnnotList list(page_);
+
+  ASSERT_EQ(2u, list.Count());
+  EXPECT_EQ(kContents, GetRawContents(list.GetAt(1)));
+
+  // TODO(crbug.com/pdfium/2029): `WideString::FromUTF8()` mishandles '📄'.
+  EXPECT_EQ(WideString::FromUTF8("Aaä€\xED\xA0\xBD\xED\xB3\x84"),
+            GetDecodedContents(list.GetAt(1)));
+}
+
+TEST_F(CPDFAnnotListTest, CreatePopupAnnotFromEmptyPdfEncoded) {
+  AddTextAnnotation("");
+
+  CPDF_AnnotList list(page_);
+
+  EXPECT_EQ(1u, list.Count());
+}
+
+TEST_F(CPDFAnnotListTest, CreatePopupAnnotFromEmptyUnicode) {
+  const ByteString kContents = MakeByteString({0xFE, 0xFF});
+  AddTextAnnotation(kContents);
+
+  CPDF_AnnotList list(page_);
+
+  EXPECT_EQ(1u, list.Count());
+}
+
+TEST_F(CPDFAnnotListTest, CreatePopupAnnotFromEmptyUnicodedWithEscape) {
+  const ByteString kContents =
+      MakeByteString({0xFE, 0xFF, 0x00, 0x1B, 'j', 'a', 0x00, 0x1B});
+  AddTextAnnotation(kContents);
+
+  CPDF_AnnotList list(page_);
+
+  EXPECT_EQ(1u, list.Count());
+}