Encode Unicode code points as UTF-16BE in the ToUnicode map

Bug: pdfium:667
Change-Id: I811571c334ff28162905a65781ca14f03caf2966
Reviewed-on: https://pdfium-review.googlesource.com/4910
Commit-Queue: Nicolás Peña <npm@chromium.org>
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
diff --git a/core/fxcrt/fx_extension.cpp b/core/fxcrt/fx_extension.cpp
index 209584b..2b290ed 100644
--- a/core/fxcrt/fx_extension.cpp
+++ b/core/fxcrt/fx_extension.cpp
@@ -137,6 +137,31 @@
   return dwHashCode;
 }
 
+void FXSYS_IntToTwoHexChars(uint8_t n, char* buf) {
+  static const char kHex[] = "0123456789ABCDEF";
+  buf[0] = kHex[n / 16];
+  buf[1] = kHex[n % 16];
+}
+
+void FXSYS_IntToFourHexChars(uint16_t n, char* buf) {
+  FXSYS_IntToTwoHexChars(n / 256, buf);
+  FXSYS_IntToTwoHexChars(n % 256, buf + 2);
+}
+
+size_t FXSYS_ToUTF16BE(uint32_t unicode, char* buf) {
+  ASSERT(unicode <= 0xD7FF || (unicode > 0xDFFF && unicode <= 0x10FFFF));
+  if (unicode <= 0xFFFF) {
+    FXSYS_IntToFourHexChars(unicode, buf);
+    return 4;
+  }
+  unicode -= 0x010000;
+  // High surrogate: upper ten bits of the 20-bit offset, plus 0xD800.
+  FXSYS_IntToFourHexChars(0xD800 + unicode / 0x400, buf);
+  // Low surrogate: lower ten bits of the 20-bit offset, plus 0xDC00.
+  FXSYS_IntToFourHexChars(0xDC00 + unicode % 0x400, buf + 4);
+  return 8;
+}
+
 void* FX_Random_MT_Start(uint32_t dwSeed) {
   FX_MTRANDOMCONTEXT* pContext = FX_Alloc(FX_MTRANDOMCONTEXT, 1);
   pContext->mt[0] = dwSeed;
diff --git a/core/fxcrt/fx_extension.h b/core/fxcrt/fx_extension.h
index f55153c..255ee2e 100644
--- a/core/fxcrt/fx_extension.h
+++ b/core/fxcrt/fx_extension.h
@@ -76,6 +76,12 @@
   return std::iswdigit(c) ? c - L'0' : 0;
 }
 
+void FXSYS_IntToTwoHexChars(uint8_t c, char* buf);
+
+void FXSYS_IntToFourHexChars(uint16_t c, char* buf);
+
+size_t FXSYS_ToUTF16BE(uint32_t unicode, char* buf);
+
 float FXSYS_FractionalScale(size_t scale_factor, int value);
 int FXSYS_FractionalScaleCount();
 
diff --git a/core/fxcrt/fx_extension_unittest.cpp b/core/fxcrt/fx_extension_unittest.cpp
index 1bc3ec6..38b66ba 100644
--- a/core/fxcrt/fx_extension_unittest.cpp
+++ b/core/fxcrt/fx_extension_unittest.cpp
@@ -39,3 +39,53 @@
   EXPECT_EQ(97u, FX_HashCode_GetW(L"A", true));
   EXPECT_EQ(1313 * 65u + 66u, FX_HashCode_GetW(L"AB", false));
 }
+
+TEST(fxcrt, FXSYS_IntToTwoHexChars) {
+  char buf[3] = {0};
+  FXSYS_IntToTwoHexChars(0x0, buf);
+  EXPECT_STREQ("00", buf);
+  FXSYS_IntToTwoHexChars(0x9, buf);
+  EXPECT_STREQ("09", buf);
+  FXSYS_IntToTwoHexChars(0xA, buf);
+  EXPECT_STREQ("0A", buf);
+  FXSYS_IntToTwoHexChars(0x8C, buf);
+  EXPECT_STREQ("8C", buf);
+  FXSYS_IntToTwoHexChars(0xBE, buf);
+  EXPECT_STREQ("BE", buf);
+  FXSYS_IntToTwoHexChars(0xD0, buf);
+  EXPECT_STREQ("D0", buf);
+  FXSYS_IntToTwoHexChars(0xFF, buf);
+  EXPECT_STREQ("FF", buf);
+}
+
+TEST(fxcrt, FXSYS_IntToFourHexChars) {
+  char buf[5] = {0};
+  FXSYS_IntToFourHexChars(0x0, buf);
+  EXPECT_STREQ("0000", buf);
+  FXSYS_IntToFourHexChars(0xA23, buf);
+  EXPECT_STREQ("0A23", buf);
+  FXSYS_IntToFourHexChars(0xB701, buf);
+  EXPECT_STREQ("B701", buf);
+  FXSYS_IntToFourHexChars(0xFFFF, buf);
+  EXPECT_STREQ("FFFF", buf);
+}
+
+TEST(fxcrt, FXSYS_ToUTF16BE) {
+  char buf[9] = {0};
+  // U+0000-U+D7FF and U+E000-U+FFFF encode as a single unit (4 hex digits).
+  EXPECT_EQ(4U, FXSYS_ToUTF16BE(0x0, buf));
+  EXPECT_STREQ("0000", buf);
+  EXPECT_EQ(4U, FXSYS_ToUTF16BE(0xD7FF, buf));
+  EXPECT_STREQ("D7FF", buf);
+  EXPECT_EQ(4U, FXSYS_ToUTF16BE(0xE000, buf));
+  EXPECT_STREQ("E000", buf);
+  EXPECT_EQ(4U, FXSYS_ToUTF16BE(0xFFFF, buf));
+  EXPECT_STREQ("FFFF", buf);
+  // U+10000-U+10FFFF encode as a surrogate pair (8 hex digits).
+  EXPECT_EQ(8U, FXSYS_ToUTF16BE(0x10000, buf));
+  EXPECT_STREQ("D800DC00", buf);
+  EXPECT_EQ(8U, FXSYS_ToUTF16BE(0x10FFFF, buf));
+  EXPECT_STREQ("DBFFDFFF", buf);
+  EXPECT_EQ(8U, FXSYS_ToUTF16BE(0x2003E, buf));
+  EXPECT_STREQ("D840DC3E", buf);
+}
diff --git a/fpdfsdk/fpdfedittext.cpp b/fpdfsdk/fpdfedittext.cpp
index 9b01775..54388ef 100644
--- a/fpdfsdk/fpdfedittext.cpp
+++ b/fpdfsdk/fpdfedittext.cpp
@@ -19,6 +19,7 @@
 #include "core/fpdfapi/parser/cpdf_number.h"
 #include "core/fpdfapi/parser/cpdf_reference.h"
 #include "core/fpdfapi/parser/cpdf_stream.h"
+#include "core/fxcrt/fx_extension.h"
 #include "core/fxge/cfx_fontmgr.h"
 #include "core/fxge/fx_font.h"
 #include "fpdfsdk/fsdk_define.h"
@@ -90,20 +91,27 @@
     "1 begincodespacerange\n"
     "<0000> <FFFFF>\n";
 
-const char hex[] = "0123456789ABCDEF";
-
-void AddNum(CFX_ByteTextBuf* pBuffer, uint32_t number) {
+void AddCharcode(CFX_ByteTextBuf* pBuffer, uint32_t number) {
+  ASSERT(number <= 0xFFFF);
   *pBuffer << "<";
   char ans[4];
-  for (size_t i = 0; i < 4; ++i) {
-    ans[3 - i] = hex[number % 16];
-    number /= 16;
-  }
+  FXSYS_IntToFourHexChars(number, ans);
   for (size_t i = 0; i < 4; ++i)
     pBuffer->AppendChar(ans[i]);
   *pBuffer << ">";
 }
 
+// PDF spec 1.7 Section 5.9.2: "Unicode character sequences as expressed in
+// UTF-16BE encoding." See https://en.wikipedia.org/wiki/UTF-16#Description
+void AddUnicode(CFX_ByteTextBuf* pBuffer, uint32_t unicode) {
+  char ans[8];
+  *pBuffer << "<";
+  size_t numChars = FXSYS_ToUTF16BE(unicode, ans);
+  for (size_t i = 0; i < numChars; ++i)
+    pBuffer->AppendChar(ans[i]);
+  *pBuffer << ">";
+}
+
 // Loads the charcode to unicode mapping into a stream
 CPDF_Stream* LoadUnicode(CPDF_Document* pDoc,
                          const std::map<uint32_t, uint32_t>& to_unicode) {
@@ -173,37 +181,37 @@
   }
   // Add maps to buffer
   buffer << static_cast<uint32_t>(char_to_uni.size()) << " beginbfchar\n";
-  for (auto iter : char_to_uni) {
-    AddNum(&buffer, iter.first);
+  for (const auto& iter : char_to_uni) {
+    AddCharcode(&buffer, iter.first);
     buffer << " ";
-    AddNum(&buffer, iter.second);
+    AddUnicode(&buffer, iter.second);
     buffer << "\n";
   }
   buffer << "endbfchar\n"
          << static_cast<uint32_t>(map_range_vector.size() + map_range.size())
          << " beginbfrange\n";
-  for (auto iter : map_range_vector) {
+  for (const auto& iter : map_range_vector) {
     const std::pair<uint32_t, uint32_t>& charcodeRange = iter.first;
-    AddNum(&buffer, charcodeRange.first);
+    AddCharcode(&buffer, charcodeRange.first);
     buffer << " ";
-    AddNum(&buffer, charcodeRange.second);
+    AddCharcode(&buffer, charcodeRange.second);
     buffer << " [";
     const std::vector<uint32_t>& unicodes = iter.second;
     for (size_t i = 0; i < unicodes.size(); ++i) {
       uint32_t uni = unicodes[i];
-      AddNum(&buffer, uni);
+      AddUnicode(&buffer, uni);
       if (i != unicodes.size() - 1)
         buffer << " ";
     }
     buffer << "]\n";
   }
-  for (auto iter : map_range) {
+  for (const auto& iter : map_range) {
     const std::pair<uint32_t, uint32_t>& charcodeRange = iter.first;
-    AddNum(&buffer, charcodeRange.first);
+    AddCharcode(&buffer, charcodeRange.first);
     buffer << " ";
-    AddNum(&buffer, charcodeRange.second);
+    AddCharcode(&buffer, charcodeRange.second);
     buffer << " ";
-    AddNum(&buffer, iter.second);
+    AddUnicode(&buffer, iter.second);
     buffer << "\n";
   }
   // TODO(npm): Encrypt / Compress?
@@ -389,10 +397,10 @@
 
 DLLEXPORT FPDF_BOOL STDCALL FPDFText_SetText(FPDF_PAGEOBJECT text_object,
                                              FPDF_WIDESTRING text) {
-  if (!text_object)
+  auto* pTextObj = static_cast<CPDF_TextObject*>(text_object);
+  if (!pTextObj)
     return false;
 
-  auto* pTextObj = reinterpret_cast<CPDF_TextObject*>(text_object);
   FX_STRSIZE len = CFX_WideString::WStringLength(text);
   CFX_WideString encodedText = CFX_WideString::FromUTF16LE(text, len);
   CFX_ByteString byteText;
@@ -428,10 +436,10 @@
 }
 
 DLLEXPORT void STDCALL FPDFFont_Close(FPDF_FONT font) {
-  if (!font)
+  CPDF_Font* pFont = static_cast<CPDF_Font*>(font);
+  if (!pFont)
     return;
 
-  CPDF_Font* pFont = reinterpret_cast<CPDF_Font*>(font);
   CPDF_Document* pDoc = pFont->GetDocument();
   if (!pDoc)
     return;
@@ -445,14 +453,11 @@
 FPDFPageObj_CreateTextObj(FPDF_DOCUMENT document,
                           FPDF_FONT font,
                           float font_size) {
-  if (!font)
-    return nullptr;
-
   CPDF_Document* pDoc = CPDFDocumentFromFPDFDocument(document);
-  if (!pDoc)
+  CPDF_Font* pFont = static_cast<CPDF_Font*>(font);
+  if (!pDoc || !pFont)
     return nullptr;
 
-  CPDF_Font* pFont = reinterpret_cast<CPDF_Font*>(font);
   auto pTextObj = pdfium::MakeUnique<CPDF_TextObject>();
   pTextObj->m_TextState.SetFont(pDoc->LoadFont(pFont->GetFontDict()));
   pTextObj->m_TextState.SetFontSize(font_size);