Fix issue that some characters from the embedded font don't render.

Due to character composition, different unicodes could represent the
same character, which means they are mapped to the same CID. However,
the current implementation for CID to unicode mapping is always 1:1,
which leads to the situation that certain characters cannot be
recognized by one of their valid unicodes during rendering.

- To fix the embedding process, this CL changes |to_unicode| in
  LoadCompositeFont() to std::multimap so that when embedding a font to
  a PDF file, all valid unicodes for the same CID won't overwrite each
  other.

- To fix the rendering process, this CL replaces |m_Map| in
  CPDF_ToUnicodeMap with |m_Multimap| of type std::multimap so that it
  can store multiple entries with the same CID as keys.

- This CL also adds a matching embedder test which embeds a subset of
  the NotoSansSC-Regular font into a PDF and tests the text rendering
  result.

Bug: pdfium:1608
Change-Id: Ifadc2aa4df0de14e9d5a7c38da209f81769c0b3b
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/75830
Reviewed-by: Daniel Hosseinian <dhoss@chromium.org>
Commit-Queue: Hui Yingst <nigi@chromium.org>
diff --git a/core/fpdfapi/font/cpdf_tounicodemap.cpp b/core/fpdfapi/font/cpdf_tounicodemap.cpp
index 1872d5d..96649e4 100644
--- a/core/fpdfapi/font/cpdf_tounicodemap.cpp
+++ b/core/fpdfapi/font/cpdf_tounicodemap.cpp
@@ -44,8 +44,8 @@
 CPDF_ToUnicodeMap::~CPDF_ToUnicodeMap() = default;
 
 WideString CPDF_ToUnicodeMap::Lookup(uint32_t charcode) const {
-  auto it = m_Map.find(charcode);
-  if (it == m_Map.end()) {
+  auto it = m_Multimap.find(charcode);
+  if (it == m_Multimap.end()) {
     if (!m_pBaseMap)
       return WideString();
     return m_pBaseMap->UnicodeFromCID(static_cast<uint16_t>(charcode));
@@ -64,7 +64,7 @@
 }
 
 uint32_t CPDF_ToUnicodeMap::ReverseLookup(wchar_t unicode) const {
-  for (const auto& pair : m_Map) {
+  for (const auto& pair : m_Multimap) {
     if (pair.second == static_cast<uint32_t>(unicode))
       return pair.first;
   }
@@ -190,12 +190,12 @@
 
       uint32_t value = value_or_error.value();
       for (uint32_t code = lowcode; code <= highcode; code++)
-        m_Map[code] = value++;
+        m_Multimap.emplace(code, value++);
     } else {
       for (uint32_t code = lowcode; code <= highcode; code++) {
         WideString retcode =
             code == lowcode ? destcode : StringDataAdd(destcode);
-        m_Map[code] = GetUnicode();
+        m_Multimap.emplace(code, GetUnicode());
         m_MultiCharBuf.AppendChar(retcode.GetLength());
         m_MultiCharBuf << retcode;
         destcode = std::move(retcode);
@@ -216,9 +216,9 @@
     return;
 
   if (len == 1) {
-    m_Map[srccode] = destcode[0];
+    m_Multimap.emplace(srccode, destcode[0]);
   } else {
-    m_Map[srccode] = GetUnicode();
+    m_Multimap.emplace(srccode, GetUnicode());
     m_MultiCharBuf.AppendChar(len);
     m_MultiCharBuf << destcode;
   }
diff --git a/core/fpdfapi/font/cpdf_tounicodemap.h b/core/fpdfapi/font/cpdf_tounicodemap.h
index 9eaf625..2872192 100644
--- a/core/fpdfapi/font/cpdf_tounicodemap.h
+++ b/core/fpdfapi/font/cpdf_tounicodemap.h
@@ -37,7 +37,7 @@
   uint32_t GetUnicode() const;
   void SetCode(uint32_t srccode, WideString destcode);
 
-  std::map<uint32_t, uint32_t> m_Map;
+  std::multimap<uint32_t, uint32_t> m_Multimap;
   UnownedPtr<const CPDF_CID2UnicodeMap> m_pBaseMap;
   CFX_WideTextBuf m_MultiCharBuf;
 };
diff --git a/fpdfsdk/fpdf_edit_embeddertest.cpp b/fpdfsdk/fpdf_edit_embeddertest.cpp
index 3155191..98f85ce 100644
--- a/fpdfsdk/fpdf_edit_embeddertest.cpp
+++ b/fpdfsdk/fpdf_edit_embeddertest.cpp
@@ -29,7 +29,9 @@
 #include "testing/fx_string_testhelpers.h"
 #include "testing/gmock/include/gmock/gmock-matchers.h"
 #include "testing/gtest/include/gtest/gtest.h"
+#include "testing/utils/file_util.h"
 #include "testing/utils/hash.h"
+#include "testing/utils/path_service.h"
 
 using pdfium::kHelloWorldChecksum;
 
@@ -234,6 +236,63 @@
 
 }  // namespace
 
+// Embeds a subset of the Noto Sans SC font as a CID font, draws text with it,
+// and compares the rendering of the page and of its saved copy against a
+// platform-specific checksum. See crbug.com/pdfium/1608.
+TEST_F(FPDFEditEmbedderTest, EmbedNotoSansSCFont) {
+  // Everything below uses the document, so failing to create it is fatal.
+  ASSERT_TRUE(CreateEmptyDocument());
+  ScopedFPDFPage page(FPDFPage_New(document(), 0, 400, 400));
+  ASSERT_TRUE(page.get());
+  std::string font_path;
+  ASSERT_TRUE(PathService::GetTestFilePath(
+      "fonts/third_party/NotoSansSC/NotoSansSC-Regular.subset.otf",
+      &font_path));
+
+  size_t file_length = 0;
+  std::unique_ptr<char, pdfium::FreeDeleter> font_data =
+      GetFileContents(font_path.c_str(), &file_length);
+  // Fatal gtest check: the buffer is handed to FPDFText_LoadFont() below, so
+  // stop the test here if no font data was loaded.
+  ASSERT_TRUE(font_data.get());
+
+  ScopedFPDFFont font(FPDFText_LoadFont(
+      document(), reinterpret_cast<const uint8_t*>(font_data.get()),
+      file_length, FPDF_FONT_TRUETYPE, /*cid=*/true));
+  ASSERT_TRUE(font.get());
+  FPDF_PAGEOBJECT text_object =
+      FPDFPageObj_CreateTextObj(document(), font.get(), 20.0f);
+  ASSERT_TRUE(text_object);
+
+  // Test the characters which are either mapped to one single unicode or
+  // multiple unicodes in the embedded font.
+  ScopedFPDFWideString text = GetFPDFWideString(L"这是第一句。 这是第二行。");
+  EXPECT_TRUE(FPDFText_SetText(text_object, text.get()));
+
+  FPDFPageObj_Transform(text_object, 1, 0, 0, 1, 50, 200);
+  FPDFPage_InsertObject(page.get(), text_object);
+  EXPECT_TRUE(FPDFPage_GenerateContent(page.get()));
+
+#if defined(_SKIA_SUPPORT_) || defined(_SKIA_SUPPORT_PATHS_)
+#if defined(OS_APPLE)
+  const char kChecksum[] = "9a31fb87d1c6d2346bba22d1196041cd";
+#else
+  const char kChecksum[] = "5bb65e15fc0a685934cd5006dec08a76";
+#endif  // defined(OS_APPLE)
+#else
+#if defined(OS_WIN)
+  const char kChecksum[] = "89e8eef5d6ad18c542a92a0519954d0f";
+#else
+  const char kChecksum[] = "9a31fb87d1c6d2346bba22d1196041cd";
+#endif  // defined(OS_WIN)
+#endif  // defined(_SKIA_SUPPORT_) || defined(_SKIA_SUPPORT_PATHS_)
+  ScopedFPDFBitmap page_bitmap = RenderPage(page.get());
+  CompareBitmap(page_bitmap.get(), 400, 400, kChecksum);
+
+  EXPECT_TRUE(FPDF_SaveAsCopy(document(), this, 0));
+  VerifySavedDocument(400, 400, kChecksum);
+}
+
 TEST_F(FPDFEditEmbedderTest, EmptyCreation) {
   EXPECT_TRUE(CreateEmptyDocument());
   FPDF_PAGE page = FPDFPage_New(document(), 0, 640.0, 480.0);
diff --git a/fpdfsdk/fpdf_edittext.cpp b/fpdfsdk/fpdf_edittext.cpp
index b3815f8..4a4630e 100644
--- a/fpdfsdk/fpdf_edittext.cpp
+++ b/fpdfsdk/fpdf_edittext.cpp
@@ -157,7 +157,7 @@
 
 // Loads the charcode to unicode mapping into a stream
 CPDF_Stream* LoadUnicode(CPDF_Document* pDoc,
-                         const std::map<uint32_t, uint32_t>& to_unicode) {
+                         const std::multimap<uint32_t, uint32_t>& to_unicode) {
   // A map charcode->unicode
   std::map<uint32_t, uint32_t> char_to_uni;
   // A map <char_start, char_end> to vector v of unicode characters of size (end
@@ -355,7 +355,7 @@
   if (dwGlyphIndex == 0 || dwCurrentChar > kMaxUnicode)
     return nullptr;
 
-  std::map<uint32_t, uint32_t> to_unicode;
+  std::multimap<uint32_t, uint32_t> to_unicode;
   std::map<uint32_t, uint32_t> widths;
   while (true) {
     if (dwCurrentChar > kMaxUnicode)
@@ -363,7 +363,7 @@
 
     if (!pdfium::Contains(widths, dwGlyphIndex))
       widths[dwGlyphIndex] = pFont->GetGlyphWidth(dwGlyphIndex);
-    to_unicode[dwGlyphIndex] = dwCurrentChar;
+    to_unicode.emplace(dwGlyphIndex, dwCurrentChar);
     dwCurrentChar =
         FT_Get_Next_Char(pFont->GetFaceRec(), dwCurrentChar, &dwGlyphIndex);
     if (dwGlyphIndex == 0)
diff --git a/testing/resources/fonts/third_party/NotoSansSC/LICENSE.txt b/testing/resources/fonts/third_party/NotoSansSC/LICENSE.txt
new file mode 100644
index 0000000..70fbc7f
--- /dev/null
+++ b/testing/resources/fonts/third_party/NotoSansSC/LICENSE.txt
@@ -0,0 +1,78 @@
+Copyright 2018 The Noto Project Authors (github.com/googlei18n/noto-fonts)
+
+This Font Software is licensed under the SIL Open Font License,
+Version 1.1.
+
+This license is copied below, and is also available with a FAQ at:
+http://scripts.sil.org/OFL
+-----------------------------------------------------------
+SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
+-----------------------------------------------------------
+PREAMBLE
+The goals of the Open Font License (OFL) are to stimulate worldwide
+development of collaborative font projects, to support the font
+creation efforts of academic and linguistic communities, and to
+provide a free and open framework in which fonts may be shared and
+improved in partnership with others.
+The OFL allows the licensed fonts to be used, studied, modified and
+redistributed freely as long as they are not sold by themselves. The
+fonts, including any derivative works, can be bundled, embedded,
+redistributed and/or sold with any software provided that any reserved
+names are not used by derivative works. The fonts and derivatives,
+however, cannot be released under any other type of license. The
+requirement for fonts to remain under this license does not apply to
+any document created using the fonts or their derivatives.
+DEFINITIONS
+"Font Software" refers to the set of files released by the Copyright
+Holder(s) under this license and clearly marked as such. This may
+include source files, build scripts and documentation.
+"Reserved Font Name" refers to any names specified as such after the
+copyright statement(s).
+"Original Version" refers to the collection of Font Software
+components as distributed by the Copyright Holder(s).
+"Modified Version" refers to any derivative made by adding to,
+deleting, or substituting -- in part or in whole -- any of the
+components of the Original Version, by changing formats or by porting
+the Font Software to a new environment.
+"Author" refers to any designer, engineer, programmer, technical
+writer or other person who contributed to the Font Software.
+PERMISSION & CONDITIONS
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of the Font Software, to use, study, copy, merge, embed,
+modify, redistribute, and sell modified and unmodified copies of the
+Font Software, subject to the following conditions:
+1) Neither the Font Software nor any of its individual components, in
+Original or Modified Versions, may be sold by itself.
+2) Original or Modified Versions of the Font Software may be bundled,
+redistributed and/or sold with any software, provided that each copy
+contains the above copyright notice and this license. These can be
+included either as stand-alone text files, human-readable headers or
+in the appropriate machine-readable metadata fields within text or
+binary files as long as those fields can be easily viewed by the user.
+3) No Modified Version of the Font Software may use the Reserved Font
+Name(s) unless explicit written permission is granted by the
+corresponding Copyright Holder. This restriction only applies to the
+primary font name as presented to the users.
+4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
+Software shall not be used to promote, endorse or advertise any
+Modified Version, except to acknowledge the contribution(s) of the
+Copyright Holder(s) and the Author(s) or with their explicit written
+permission.
+5) The Font Software, modified or unmodified, in part or in whole,
+must be distributed entirely under this license, and must not be
+distributed under any other license. The requirement for fonts to
+remain under this license does not apply to any document created using
+the Font Software.
+TERMINATION
+This license becomes null and void if any of the above conditions are
+not met.
+DISCLAIMER
+THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
+COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
+DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
+OTHER DEALINGS IN THE FONT SOFTWARE.
diff --git a/testing/resources/fonts/third_party/NotoSansSC/NotoSansSC-Regular.subset.otf b/testing/resources/fonts/third_party/NotoSansSC/NotoSansSC-Regular.subset.otf
new file mode 100644
index 0000000..050c8b2
--- /dev/null
+++ b/testing/resources/fonts/third_party/NotoSansSC/NotoSansSC-Regular.subset.otf
Binary files differ
diff --git a/testing/resources/fonts/third_party/NotoSansSC/README.pdfium b/testing/resources/fonts/third_party/NotoSansSC/README.pdfium
new file mode 100644
index 0000000..8e07d2a
--- /dev/null
+++ b/testing/resources/fonts/third_party/NotoSansSC/README.pdfium
@@ -0,0 +1,19 @@
+Noto Sans SC Font:
+
+1. Source Origin
+Google's Internationalisation team:
+https://github.com/googlei18n/noto-cjk
+
+2. LICENSE AND OPENSOURCE
+For license information, see the LICENSE.txt file
+
+3. Reason:
+This font file is used for the test FPDFEditEmbedderTest.EmbedNotoSansSCFont.
+It is included as a real-world example for testing PDFium's API to embed a CJK
+font into a PDF file: the font contains multiple unicodes mapped to the same
+CID. See crbug.com/pdfium/1608 for details.
+
+Command to generate the font subset:
+$ pyftsubset NotoSansSC-Regular.otf --unicodes="U+0000884C,U+0000FA08,U+0000F906,U+000053E5,U+00008FD9,U+0000662F,U+00007B2C,U+00004E00,U+00003002,U+00004E8C,U+00002F00,U+00002F06"
+
+where pyftsubset comes from https://github.com/behdad/fonttools