Add support for UTF-8 text strings

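Text strings that start with the UTF-8 byte order mark (EF BB BF) are
now decoded as UTF-8, in addition to the existing UTF-16BE (FE FF)
handling. Language-code escapes are stripped from the decoded text,
as they already are for UTF-16BE.

https://pdfa.org/understanding-utf-8-in-pdf-2-0/ has a good overview
of the background and motivation. The post also has a screenshot of
Chromium's PDF viewer not supporting UTF-8 text strings.
This patch rectifies that :)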

Fixed: pdfium:2101
Change-Id: Idabe8fe523c13c6879f83413806690490d3242c9
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/113830
Reviewed-by: Lei Zhang <thestig@chromium.org>
Auto-Submit: Nico Weber <thakis@chromium.org>
Commit-Queue: Nico Weber <thakis@chromium.org>
diff --git a/core/fpdfapi/parser/fpdf_parser_decode.cpp b/core/fpdfapi/parser/fpdf_parser_decode.cpp
index 47b5e43..459e4d9 100644
--- a/core/fpdfapi/parser/fpdf_parser_decode.cpp
+++ b/core/fpdfapi/parser/fpdf_parser_decode.cpp
@@ -520,6 +520,11 @@
 #if defined(WCHAR_T_IS_32_BIT)
     dest_pos = FuseSurrogates(dest_buf, dest_pos);
 #endif
+  } else if (span.size() >= 3 && span[0] == 0xef && span[1] == 0xbb &&
+             span[2] == 0xbf) {
+    result = FX_UTF8Decode(span.subspan(3));
+    pdfium::span<wchar_t> dest_buf = result.GetBuffer(result.GetLength());
+    dest_pos = StripLanguageCodes(dest_buf, result.GetLength());
   } else {
     pdfium::span<wchar_t> dest_buf = result.GetBuffer(span.size());
     for (size_t i = 0; i < span.size(); ++i)
diff --git a/core/fpdfapi/parser/fpdf_parser_decode_unittest.cpp b/core/fpdfapi/parser/fpdf_parser_decode_unittest.cpp
index 544f317..a67d7e2 100644
--- a/core/fpdfapi/parser/fpdf_parser_decode_unittest.cpp
+++ b/core/fpdfapi/parser/fpdf_parser_decode_unittest.cpp
@@ -409,11 +409,15 @@
   // ASCII text.
   EXPECT_EQ(L"the quick\tfox", PDF_DecodeText(ToSpan("the quick\tfox")));
 
-  // Unicode text.
+  // UTF-8 text.
+  EXPECT_EQ(L"\x0330\x0331",
+            PDF_DecodeText(ToSpan("\xEF\xBB\xBF\xCC\xB0\xCC\xB1")));
+
+  // UTF-16BE text.
   EXPECT_EQ(L"\x0330\x0331",
             PDF_DecodeText(ToSpan("\xFE\xFF\x03\x30\x03\x31")));
 
-  // More Unicode text.
+  // More UTF-16BE text.
   EXPECT_EQ(
       L"\x7F51\x9875\x0020\x56FE\x7247\x0020"
       L"\x8D44\x8BAF\x66F4\x591A\x0020\x00BB",
@@ -421,7 +425,10 @@
           ToSpan("\xFE\xFF\x7F\x51\x98\x75\x00\x20\x56\xFE\x72\x47\x00"
                  "\x20\x8D\x44\x8B\xAF\x66\xF4\x59\x1A\x00\x20\x00\xBB")));
 
-  // Supplementary Unicode text.
+  // Supplementary UTF-8 text.
+  EXPECT_EQ(L"🎨", PDF_DecodeText(ToSpan("\xEF\xBB\xBF\xF0\x9F\x8E\xA8")));
+
+  // Supplementary UTF-16BE text.
   EXPECT_EQ(L"🎨", PDF_DecodeText(ToSpan("\xFE\xFF\xD8\x3C\xDF\xA8")));
 }
 
@@ -429,6 +436,9 @@
 TEST(ParserDecodeTest, DecodeTextWithUnicodeEscapes) {
   EXPECT_EQ(L"\x0020\x5370\x5237",
             PDF_DecodeText(ToSpan(
+                "\xEF\xBB\xBF\x1B\x6A\x61\x1B\x20\xE5\x8D\xB0\xE5\x88\xB7")));
+  EXPECT_EQ(L"\x0020\x5370\x5237",
+            PDF_DecodeText(ToSpan(
                 "\xFE\xFF\x00\x1B\x6A\x61\x00\x1B\x00\x20\x53\x70\x52\x37")));
   EXPECT_EQ(
       L"\x0020\x5370\x5237",
@@ -445,8 +455,10 @@
 
 // https://crbug.com/1001159
 TEST(ParserDecodeTest, DecodeTextWithInvalidUnicodeEscapes) {
+  EXPECT_EQ(L"", PDF_DecodeText(ToSpan("\xEF\xBB\xBF\x1B\x1B")));
   EXPECT_EQ(L"", PDF_DecodeText(ToSpan("\xFE\xFF\x00\x1B\x00\x1B")));
   EXPECT_EQ(L"", PDF_DecodeText(ToSpan("\xFE\xFF\x00\x1B\x00\x1B\x20")));
+  EXPECT_EQ(L"\x0020", PDF_DecodeText(ToSpan("\xEF\xBB\xBF\x1B\x1B\x20")));
   EXPECT_EQ(L"\x0020",
             PDF_DecodeText(ToSpan("\xFE\xFF\x00\x1B\x00\x1B\x00\x20")));
 }
diff --git a/fpdfsdk/fpdf_doc_embeddertest.cpp b/fpdfsdk/fpdf_doc_embeddertest.cpp
index 11a37df..0dbc754 100644
--- a/fpdfsdk/fpdf_doc_embeddertest.cpp
+++ b/fpdfsdk/fpdf_doc_embeddertest.cpp
@@ -762,6 +762,20 @@
   EXPECT_EQ(L"D:20160411190039+00'00'", GetPlatformWString(buf));
 }
 
+TEST_F(FPDFDocEmbedderTest, Utf8Metadata) {
+  ASSERT_TRUE(OpenDocument("utf-8.pdf"));
+
+  unsigned short buf[128];
+
+  ASSERT_EQ(34u, FPDF_GetMetaText(document(), "Producer", buf, sizeof(buf)));
+  EXPECT_EQ(L"Manüally Created", GetPlatformWString(buf));
+
+  FPDF_BOOKMARK child = FPDFBookmark_GetFirstChild(document(), nullptr);
+  EXPECT_TRUE(child);
+  EXPECT_EQ(16u, FPDFBookmark_GetTitle(child, buf, sizeof(buf)));
+  EXPECT_EQ(L"Titlè 1", GetPlatformWString(buf));
+}
+
 TEST_F(FPDFDocEmbedderTest, Bug_182) {
   ASSERT_TRUE(OpenDocument("bug_182.pdf"));
 
diff --git a/testing/resources/utf-8.in b/testing/resources/utf-8.in
new file mode 100644
index 0000000..00ac83c
--- /dev/null
+++ b/testing/resources/utf-8.in
@@ -0,0 +1,58 @@
+{{header}}
+{{object 1 0}} <<
+  /Type /Catalog
+  /Pages 2 0 R
+  /Outlines 6 0 R
+>>
+endobj
+
+{{object 2 0}} <<
+  /Type /Pages
+  /Kids [3 0 R]
+  /Count 1
+>>
+endobj
+
+{{object 3 0}} <<
+  /Type /Page
+  /Parent 2 0 R
+  /MediaBox [0 0 525 250]
+  /Contents 4 0 R
+>>
+endobj
+
+{{object 4 0}} <<
+  {{streamlen}}
+>>
+stream
+endstream
+endobj
+
+{{object 5 0}} <<
+  /Producer (\357\273\277Man\303\274ally Created)
+>>
+endobj
+
+{{object 6 0}} <<
+  /Count 1
+  /First 7 0 R
+  /Last 7 0 R
+>>
+endobj
+
+{{object 7 0}} <<
+  /Title <EFBBBF5469746CC3A82031>
+  /Parent 6 0 R
+>>
+endobj
+
+{{xref}}
+
+trailer <<
+  {{trailersize}}
+  /Info 5 0 R
+  /Root 1 0 R
+>>
+
+{{startxref}}
+%%EOF
diff --git a/testing/resources/utf-8.pdf b/testing/resources/utf-8.pdf
new file mode 100644
index 0000000..6bfb3ef
--- /dev/null
+++ b/testing/resources/utf-8.pdf
@@ -0,0 +1,69 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+  /Type /Catalog
+  /Pages 2 0 R
+  /Outlines 6 0 R
+>>
+endobj
+
+2 0 obj <<
+  /Type /Pages
+  /Kids [3 0 R]
+  /Count 1
+>>
+endobj
+
+3 0 obj <<
+  /Type /Page
+  /Parent 2 0 R
+  /MediaBox [0 0 525 250]
+  /Contents 4 0 R
+>>
+endobj
+
+4 0 obj <<
+  /Length 0
+>>
+stream
+endstream
+endobj
+
+5 0 obj <<
+  /Producer (\357\273\277Man\303\274ally Created)
+>>
+endobj
+
+6 0 obj <<
+  /Count 1
+  /First 7 0 R
+  /Last 7 0 R
+>>
+endobj
+
+7 0 obj <<
+  /Title <EFBBBF5469746CC3A82031>
+  /Parent 6 0 R
+>>
+endobj
+
+xref
+0 8
+0000000000 65535 f 
+0000000015 00000 n 
+0000000087 00000 n 
+0000000151 00000 n 
+0000000247 00000 n 
+0000000298 00000 n 
+0000000370 00000 n 
+0000000432 00000 n 
+
+trailer <<
+  /Size 8
+  /Info 5 0 R
+  /Root 1 0 R
+>>
+
+startxref
+504
+%%EOF