core/fpdfapi/font/cpdf_cmap.cpp - pdfium - Git at Google

 // Copyright 2017 PDFium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com

 #include "core/fpdfapi/font/cpdf_cmap.h"

 #include <memory>
 #include <utility>
 #include <vector>

 #include "core/fpdfapi/cmaps/cmap_int.h"
 #include "core/fpdfapi/font/cpdf_cmapparser.h"
 #include "core/fpdfapi/font/cpdf_fontglobals.h"
 #include "core/fpdfapi/parser/cpdf_simple_parser.h"

 namespace {

 // A contiguous range of byte values with both endpoints included.
 struct ByteRange {
   uint8_t m_First;  // Lowest byte value in the range.
   uint8_t m_Last;  // Inclusive.
 };

 // One row of the predefined-CMap table: a CMap name together with the
 // charset, coding, coding scheme and leading-byte segments that go with it.
 struct PredefinedCMap {
   const char* m_pName;  // Raw, POD struct.
   CIDSet m_Charset;
   CIDCoding m_Coding;
   CPDF_CMap::CodingScheme m_CodingScheme;
   uint8_t m_LeadingSegCount;  // Number of entries used in |m_LeadingSegs|.
   ByteRange m_LeadingSegs[2];
 };

 // Table of the predefined CMaps known to this file. GetPredefinedCMap()
 // drops the last two characters of the requested name (when it is longer
 // than two characters) before comparing it against |m_pName|, so the names
 // here are stored without that two-character tail.
 constexpr PredefinedCMap kPredefinedCMaps[] = {
     // CIDSET_GB1 entries.
     {"GB-EUC",
      CIDSET_GB1,
      CIDCODING_GB,
      CPDF_CMap::MixedTwoBytes,
      1,
      {{0xa1, 0xfe}}},
     {"GBpc-EUC",
      CIDSET_GB1,
      CIDCODING_GB,
      CPDF_CMap::MixedTwoBytes,
      1,
      {{0xa1, 0xfc}}},
     {"GBK-EUC",
      CIDSET_GB1,
      CIDCODING_GB,
      CPDF_CMap::MixedTwoBytes,
      1,
      {{0x81, 0xfe}}},
     {"GBKp-EUC",
      CIDSET_GB1,
      CIDCODING_GB,
      CPDF_CMap::MixedTwoBytes,
      1,
      {{0x81, 0xfe}}},
     {"GBK2K-EUC",
      CIDSET_GB1,
      CIDCODING_GB,
      CPDF_CMap::MixedTwoBytes,
      1,
      {{0x81, 0xfe}}},
     {"GBK2K",
      CIDSET_GB1,
      CIDCODING_GB,
      CPDF_CMap::MixedTwoBytes,
      1,
      {{0x81, 0xfe}}},
     {"UniGB-UCS2", CIDSET_GB1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
     {"UniGB-UTF16", CIDSET_GB1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
     // CIDSET_CNS1 entries.
     {"B5pc",
      CIDSET_CNS1,
      CIDCODING_BIG5,
      CPDF_CMap::MixedTwoBytes,
      1,
      {{0xa1, 0xfc}}},
     {"HKscs-B5",
      CIDSET_CNS1,
      CIDCODING_BIG5,
      CPDF_CMap::MixedTwoBytes,
      1,
      {{0x88, 0xfe}}},
     {"ETen-B5",
      CIDSET_CNS1,
      CIDCODING_BIG5,
      CPDF_CMap::MixedTwoBytes,
      1,
      {{0xa1, 0xfe}}},
     {"ETenms-B5",
      CIDSET_CNS1,
      CIDCODING_BIG5,
      CPDF_CMap::MixedTwoBytes,
      1,
      {{0xa1, 0xfe}}},
     {"UniCNS-UCS2", CIDSET_CNS1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
     {"UniCNS-UTF16", CIDSET_CNS1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
     // CIDSET_JAPAN1 entries.
     {"83pv-RKSJ",
      CIDSET_JAPAN1,
      CIDCODING_JIS,
      CPDF_CMap::MixedTwoBytes,
      2,
      {{0x81, 0x9f}, {0xe0, 0xfc}}},
     {"90ms-RKSJ",
      CIDSET_JAPAN1,
      CIDCODING_JIS,
      CPDF_CMap::MixedTwoBytes,
      2,
      {{0x81, 0x9f}, {0xe0, 0xfc}}},
     {"90msp-RKSJ",
      CIDSET_JAPAN1,
      CIDCODING_JIS,
      CPDF_CMap::MixedTwoBytes,
      2,
      {{0x81, 0x9f}, {0xe0, 0xfc}}},
     {"90pv-RKSJ",
      CIDSET_JAPAN1,
      CIDCODING_JIS,
      CPDF_CMap::MixedTwoBytes,
      2,
      {{0x81, 0x9f}, {0xe0, 0xfc}}},
     {"Add-RKSJ",
      CIDSET_JAPAN1,
      CIDCODING_JIS,
      CPDF_CMap::MixedTwoBytes,
      2,
      {{0x81, 0x9f}, {0xe0, 0xfc}}},
     {"EUC",
      CIDSET_JAPAN1,
      CIDCODING_JIS,
      CPDF_CMap::MixedTwoBytes,
      2,
      {{0x8e, 0x8e}, {0xa1, 0xfe}}},
     {"H", CIDSET_JAPAN1, CIDCODING_JIS, CPDF_CMap::TwoBytes, 1, {{0x21, 0x7e}}},
     {"V", CIDSET_JAPAN1, CIDCODING_JIS, CPDF_CMap::TwoBytes, 1, {{0x21, 0x7e}}},
     {"Ext-RKSJ",
      CIDSET_JAPAN1,
      CIDCODING_JIS,
      CPDF_CMap::MixedTwoBytes,
      2,
      {{0x81, 0x9f}, {0xe0, 0xfc}}},
     {"UniJIS-UCS2", CIDSET_JAPAN1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
     {"UniJIS-UCS2-HW",
      CIDSET_JAPAN1,
      CIDCODING_UCS2,
      CPDF_CMap::TwoBytes,
      0,
      {}},
     {"UniJIS-UTF16",
      CIDSET_JAPAN1,
      CIDCODING_UTF16,
      CPDF_CMap::TwoBytes,
      0,
      {}},
     // CIDSET_KOREA1 entries.
     {"KSC-EUC",
      CIDSET_KOREA1,
      CIDCODING_KOREA,
      CPDF_CMap::MixedTwoBytes,
      1,
      {{0xa1, 0xfe}}},
     {"KSCms-UHC",
      CIDSET_KOREA1,
      CIDCODING_KOREA,
      CPDF_CMap::MixedTwoBytes,
      1,
      {{0x81, 0xfe}}},
     {"KSCms-UHC-HW",
      CIDSET_KOREA1,
      CIDCODING_KOREA,
      CPDF_CMap::MixedTwoBytes,
      1,
      {{0x81, 0xfe}}},
     {"KSCpc-EUC",
      CIDSET_KOREA1,
      CIDCODING_KOREA,
      CPDF_CMap::MixedTwoBytes,
      1,
      {{0xa1, 0xfd}}},
     {"UniKS-UCS2", CIDSET_KOREA1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
     {"UniKS-UTF16", CIDSET_KOREA1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
 };

 // Looks up |bsPredefinedName| in kPredefinedCMaps and returns the matching
 // row, or nullptr when there is none. Names longer than two characters are
 // compared without their last two characters (presumably the "-H"/"-V"
 // suffix); shorter names are compared as given.
 const PredefinedCMap* GetPredefinedCMap(const ByteString& bsPredefinedName) {
   const size_t name_length = bsPredefinedName.GetLength();
   const ByteString bsKey = name_length > 2
                                ? bsPredefinedName.Left(name_length - 2)
                                : bsPredefinedName;
   for (const PredefinedCMap& entry : kPredefinedCMaps) {
     if (bsKey == ByteStringView(entry.m_pName))
       return &entry;
   }
   return nullptr;
 }

 std::vector<bool> LoadLeadingSegments(const PredefinedCMap& map) {
   std::vector<bool> segments(256);
   for (uint32_t i = 0; i < map.m_LeadingSegCount; ++i) {
     const ByteRange& seg = map.m_LeadingSegs[i];
     for (int b = seg.m_First; b <= seg.m_Last; ++b)
       segments[b] = true;
   }
   return segments;
 }

 int CheckFourByteCodeRange(uint8_t* codes,
                            size_t size,
                            const std::vector<CPDF_CMap::CodeRange>& ranges) {
   for (size_t i = ranges.size(); i > 0; i--) {
     size_t seg = i - 1;
     if (ranges[seg].m_CharSize < size)
       continue;
     size_t iChar = 0;
     while (iChar < size) {
       if (codes[iChar] < ranges[seg].m_Lower[iChar] ||
           codes[iChar] > ranges[seg].m_Upper[iChar]) {
         break;
       }
       ++iChar;
     }
     if (iChar == ranges[seg].m_CharSize)
       return 2;
     if (iChar)
       return (size == ranges[seg].m_CharSize) ? 2 : 1;
   }
   return 0;
 }

 // Returns the byte width (4 down to 1, widest first) of the first range in
 // |ranges| that has exactly that width and fully contains the zero-padded
 // big-endian encoding of the low 16 bits of |charcode|. Ranges are scanned
 // from last to first. Returns 1 when |ranges| is empty or nothing matches.
 size_t GetFourByteCharSizeImpl(
     uint32_t charcode,
     const std::vector<CPDF_CMap::CodeRange>& ranges) {
   if (ranges.empty())
     return 1;

   // Only the low two bytes of |charcode| are used; the top two stay zero.
   const uint8_t codes[4] = {0x00, 0x00,
                             static_cast<uint8_t>((charcode >> 8) & 0xFF),
                             static_cast<uint8_t>(charcode)};
   for (size_t size = 4; size > 0; --size) {
     const size_t offset = 4 - size;
     for (auto it = ranges.rbegin(); it != ranges.rend(); ++it) {
       if (it->m_CharSize < size)
         continue;

       size_t matched = 0;
       while (matched < size &&
              codes[offset + matched] >= it->m_Lower[matched] &&
              codes[offset + matched] <= it->m_Upper[matched]) {
         ++matched;
       }
       if (matched == it->m_CharSize)
         return size;
     }
   }
   return 1;
 }

 }  // namespace

 // Constructs a CMap from a predefined CMap name. The CMap is vertical when
 // the name ends in 'V'. "Identity-H"/"Identity-V" are handled directly; any
 // other name must be found in the predefined table, and the CMap is marked
 // loaded only when a matching embedded CMap is also found.
 CPDF_CMap::CPDF_CMap(const ByteString& bsPredefinedName)
     : m_bVertical(bsPredefinedName.Last() == 'V') {
   const bool is_identity =
       bsPredefinedName == "Identity-H" || bsPredefinedName == "Identity-V";
   if (is_identity) {
     m_Coding = CIDCODING_CID;
     m_bLoaded = true;
     return;
   }

   const PredefinedCMap* entry = GetPredefinedCMap(bsPredefinedName);
   if (!entry)
     return;

   m_Charset = entry->m_Charset;
   m_Coding = entry->m_Coding;
   m_CodingScheme = entry->m_CodingScheme;
   // The leading-byte table is only needed for the mixed 1/2-byte scheme.
   if (entry->m_CodingScheme == MixedTwoBytes)
     m_MixedTwoByteLeadingBytes = LoadLeadingSegments(*entry);

   m_pEmbedMap = FindEmbeddedCMap(
       CPDF_FontGlobals::GetInstance()->GetEmbeddedCharset(m_Charset),
       bsPredefinedName);
   if (m_pEmbedMap)
     m_bLoaded = true;
 }

 // Constructs a CMap by parsing embedded CMap data. The direct lookup table
 // is created with 65536 entries, then every word of |spEmbeddedData| is fed
 // to a CPDF_CMapParser until an empty word is returned.
 CPDF_CMap::CPDF_CMap(pdfium::span<const uint8_t> spEmbeddedData)
     : m_DirectCharcodeToCIDTable(65536) {
   CPDF_CMapParser parser(this);
   CPDF_SimpleParser syntax(spEmbeddedData);
   for (ByteStringView word = syntax.GetWord(); !word.IsEmpty();
        word = syntax.GetWord()) {
     parser.ParseWord(word);
   }

   if (m_CodingScheme != MixedFourBytes || !parser.HasAdditionalMappings())
     return;

   // Keep the extra mappings ordered by end code; CIDFromCharCode() searches
   // them with std::lower_bound on m_EndCode.
   m_AdditionalCharcodeToCIDMappings = parser.TakeAdditionalMappings();
   std::sort(m_AdditionalCharcodeToCIDMappings.begin(),
             m_AdditionalCharcodeToCIDMappings.end(),
             [](const CPDF_CMap::CIDRange& lhs, const CPDF_CMap::CIDRange& rhs) {
               return lhs.m_EndCode < rhs.m_EndCode;
             });
 }

 // Defaulted destructor; members release whatever they hold.
 CPDF_CMap::~CPDF_CMap() = default;

 // Maps |charcode| to a CID. Sources are tried in order: identity coding
 // (the code truncated to 16 bits), the embedded CMap, the direct table for
 // codes below 0x10000, and finally the sorted additional ranges. A code is
 // also returned truncated when the direct table is empty. Returns 0 when no
 // additional range contains |charcode|.
 uint16_t CPDF_CMap::CIDFromCharCode(uint32_t charcode) const {
   if (m_Coding == CIDCODING_CID)
     return static_cast<uint16_t>(charcode);

   if (m_pEmbedMap)
     return ::CIDFromCharCode(m_pEmbedMap.Get(), charcode);

   if (m_DirectCharcodeToCIDTable.empty())
     return static_cast<uint16_t>(charcode);

   if (charcode < 0x10000)
     return m_DirectCharcodeToCIDTable[charcode];

   // First range whose end code is not below |charcode|.
   const auto ranges_end = m_AdditionalCharcodeToCIDMappings.end();
   const auto found = std::lower_bound(
       m_AdditionalCharcodeToCIDMappings.begin(), ranges_end, charcode,
       [](const CPDF_CMap::CIDRange& range, uint32_t code) {
         return range.m_EndCode < code;
       });
   if (found != ranges_end && found->m_StartCode <= charcode)
     return found->m_StartCID + charcode - found->m_StartCode;
   return 0;
 }

 // Decodes the character code that starts at |*pOffset| in |pString| and
 // advances |*pOffset| past the bytes consumed. In the one- and two-byte
 // schemes a byte that lies past the end of the string is read as 0 without
 // advancing. In the MixedFourBytes scheme 0 is returned when no code range
 // accepts the bytes read so far, when four bytes did not form a complete
 // code, or when the string ends before the code is complete.
 uint32_t CPDF_CMap::GetNextChar(ByteStringView pString, size_t* pOffset) const {
   size_t& offset = *pOffset;
   auto pBytes = pString.raw_span();
   switch (m_CodingScheme) {
     case OneByte: {
       return offset < pBytes.size() ? pBytes[offset++] : 0;
     }
     case TwoBytes: {
       uint8_t byte1 = offset < pBytes.size() ? pBytes[offset++] : 0;
       uint8_t byte2 = offset < pBytes.size() ? pBytes[offset++] : 0;
       return 256 * byte1 + byte2;
     }
     case MixedTwoBytes: {
       uint8_t byte1 = offset < pBytes.size() ? pBytes[offset++] : 0;
       // A non-leading byte is a complete one-byte code.
       if (!m_MixedTwoByteLeadingBytes[byte1])
         return byte1;
       uint8_t byte2 = offset < pBytes.size() ? pBytes[offset++] : 0;
       return 256 * byte1 + byte2;
     }
     case MixedFourBytes: {
       uint8_t codes[4];
       int char_size = 1;
       codes[0] = offset < pBytes.size() ? pBytes[offset++] : 0;
       while (true) {
         int ret = CheckFourByteCodeRange(codes, char_size,
                                          m_MixedFourByteLeadingRanges);
         if (ret == 0)
           return 0;
         if (ret == 2) {
           // Complete code: assemble the bytes big-endian.
           uint32_t charcode = 0;
           for (int i = 0; i < char_size; i++)
             charcode = (charcode << 8) + codes[i];
           return charcode;
         }
         // Use >= rather than == so that an offset already past the end of
         // the string can never be used to index |pBytes|.
         if (char_size == 4 || offset >= pBytes.size())
           return 0;
         codes[char_size++] = pBytes[offset++];
       }
     }
   }
   return 0;
 }

 // Returns the number of bytes |charcode| occupies under the current coding
 // scheme: fixed at 1 or 2 for the single-width schemes, 1 or 2 depending on
 // whether the code is below 0x100 for MixedTwoBytes, and the number of
 // significant bytes (1 to 4) for MixedFourBytes.
 int CPDF_CMap::GetCharSize(uint32_t charcode) const {
   switch (m_CodingScheme) {
     case OneByte:
       return 1;
     case TwoBytes:
       return 2;
     case MixedTwoBytes:
       return charcode < 0x100 ? 1 : 2;
     case MixedFourBytes: {
       // Count one byte, plus one more for every non-zero higher byte group.
       int byte_count = 1;
       for (uint32_t rest = charcode >> 8; rest != 0; rest >>= 8)
         ++byte_count;
       return byte_count;
     }
   }
   return 1;
 }

 // Returns how many character codes |pString| contains under the current
 // coding scheme. A trailing odd byte in the TwoBytes scheme, or a leading
 // byte at the very end in MixedTwoBytes, still counts as one character.
 size_t CPDF_CMap::CountChar(ByteStringView pString) const {
   const size_t length = pString.GetLength();
   switch (m_CodingScheme) {
     case OneByte:
       return length;
     case TwoBytes:
       return (length + 1) / 2;
     case MixedTwoBytes: {
       size_t char_count = 0;
       size_t pos = 0;
       while (pos < length) {
         // A leading byte consumes the following byte as well.
         pos += m_MixedTwoByteLeadingBytes[pString[pos]] ? 2 : 1;
         ++char_count;
       }
       return char_count;
     }
     case MixedFourBytes: {
       size_t char_count = 0;
       for (size_t pos = 0; pos < length; ++char_count)
         GetNextChar(pString, &pos);
       return char_count;
     }
   }
   return length;
 }

 // Writes the big-endian byte encoding of |charcode| under the current coding
 // scheme to |str| and returns the number of bytes written (0 for an
 // unrecognised scheme). |str| must have room for up to four bytes.
 int CPDF_CMap::AppendChar(char* str, uint32_t charcode) const {
   switch (m_CodingScheme) {
     case OneByte:
       str[0] = static_cast<char>(charcode);
       return 1;
     case TwoBytes:
       str[0] = static_cast<char>(charcode / 256);
       str[1] = static_cast<char>(charcode % 256);
       return 2;
     case MixedTwoBytes: {
       const bool is_single_byte =
           charcode < 0x100 && !m_MixedTwoByteLeadingBytes[charcode];
       if (is_single_byte) {
         str[0] = static_cast<char>(charcode);
         return 1;
       }
       str[0] = static_cast<char>(charcode >> 8);
       str[1] = static_cast<char>(charcode);
       return 2;
     }
     case MixedFourBytes: {
       if (charcode < 0x100) {
         // The width of a small code comes from the code space ranges; the
         // value goes in the last byte and the bytes before it are zeroed.
         int width = static_cast<int>(
             GetFourByteCharSizeImpl(charcode, m_MixedFourByteLeadingRanges));
         if (width == 0)
           width = 1;
         const int padding = width - 1;
         if (padding > 0)
           memset(str, 0, padding);
         str[padding] = static_cast<char>(charcode);
         return width;
       }
       // Larger codes use exactly as many bytes as they have significant
       // bytes, most significant first.
       const int width = charcode < 0x10000 ? 2 : (charcode < 0x1000000 ? 3 : 4);
       for (int i = 0; i < width; ++i)
         str[i] = static_cast<char>(charcode >> (8 * (width - 1 - i)));
       return width;
     }
   }
   return 0;
 }
	// Copyright 2017 PDFium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com

	#include "core/fpdfapi/font/cpdf_cmap.h"

	#include <memory>
	#include <utility>
	#include <vector>

	#include "core/fpdfapi/cmaps/cmap_int.h"
	#include "core/fpdfapi/font/cpdf_cmapparser.h"
	#include "core/fpdfapi/font/cpdf_fontglobals.h"
	#include "core/fpdfapi/parser/cpdf_simple_parser.h"

	namespace {

	// A contiguous range of byte values with both endpoints included.
	struct ByteRange {
	uint8_t m_First; // Lowest byte value in the range.
	uint8_t m_Last; // Inclusive.
	};

	// One row of the predefined-CMap table: a CMap name together with the
	// charset, coding, coding scheme and leading-byte segments that go with it.
	struct PredefinedCMap {
	const char* m_pName; // Raw, POD struct.
	CIDSet m_Charset;
	CIDCoding m_Coding;
	CPDF_CMap::CodingScheme m_CodingScheme;
	uint8_t m_LeadingSegCount; // Number of entries used in |m_LeadingSegs|.
	ByteRange m_LeadingSegs[2];
	};

	// Table of the predefined CMaps known to this file. GetPredefinedCMap()
	// drops the last two characters of the requested name (when it is longer
	// than two characters) before comparing it against |m_pName|, so the names
	// here are stored without that two-character tail.
	constexpr PredefinedCMap kPredefinedCMaps[] = {
	// CIDSET_GB1 entries.
	{"GB-EUC",
	CIDSET_GB1,
	CIDCODING_GB,
	CPDF_CMap::MixedTwoBytes,
	1,
	{{0xa1, 0xfe}}},
	{"GBpc-EUC",
	CIDSET_GB1,
	CIDCODING_GB,
	CPDF_CMap::MixedTwoBytes,
	1,
	{{0xa1, 0xfc}}},
	{"GBK-EUC",
	CIDSET_GB1,
	CIDCODING_GB,
	CPDF_CMap::MixedTwoBytes,
	1,
	{{0x81, 0xfe}}},
	{"GBKp-EUC",
	CIDSET_GB1,
	CIDCODING_GB,
	CPDF_CMap::MixedTwoBytes,
	1,
	{{0x81, 0xfe}}},
	{"GBK2K-EUC",
	CIDSET_GB1,
	CIDCODING_GB,
	CPDF_CMap::MixedTwoBytes,
	1,
	{{0x81, 0xfe}}},
	{"GBK2K",
	CIDSET_GB1,
	CIDCODING_GB,
	CPDF_CMap::MixedTwoBytes,
	1,
	{{0x81, 0xfe}}},
	{"UniGB-UCS2", CIDSET_GB1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
	{"UniGB-UTF16", CIDSET_GB1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
	// CIDSET_CNS1 entries.
	{"B5pc",
	CIDSET_CNS1,
	CIDCODING_BIG5,
	CPDF_CMap::MixedTwoBytes,
	1,
	{{0xa1, 0xfc}}},
	{"HKscs-B5",
	CIDSET_CNS1,
	CIDCODING_BIG5,
	CPDF_CMap::MixedTwoBytes,
	1,
	{{0x88, 0xfe}}},
	{"ETen-B5",
	CIDSET_CNS1,
	CIDCODING_BIG5,
	CPDF_CMap::MixedTwoBytes,
	1,
	{{0xa1, 0xfe}}},
	{"ETenms-B5",
	CIDSET_CNS1,
	CIDCODING_BIG5,
	CPDF_CMap::MixedTwoBytes,
	1,
	{{0xa1, 0xfe}}},
	{"UniCNS-UCS2", CIDSET_CNS1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
	{"UniCNS-UTF16", CIDSET_CNS1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
	// CIDSET_JAPAN1 entries.
	{"83pv-RKSJ",
	CIDSET_JAPAN1,
	CIDCODING_JIS,
	CPDF_CMap::MixedTwoBytes,
	2,
	{{0x81, 0x9f}, {0xe0, 0xfc}}},
	{"90ms-RKSJ",
	CIDSET_JAPAN1,
	CIDCODING_JIS,
	CPDF_CMap::MixedTwoBytes,
	2,
	{{0x81, 0x9f}, {0xe0, 0xfc}}},
	{"90msp-RKSJ",
	CIDSET_JAPAN1,
	CIDCODING_JIS,
	CPDF_CMap::MixedTwoBytes,
	2,
	{{0x81, 0x9f}, {0xe0, 0xfc}}},
	{"90pv-RKSJ",
	CIDSET_JAPAN1,
	CIDCODING_JIS,
	CPDF_CMap::MixedTwoBytes,
	2,
	{{0x81, 0x9f}, {0xe0, 0xfc}}},
	{"Add-RKSJ",
	CIDSET_JAPAN1,
	CIDCODING_JIS,
	CPDF_CMap::MixedTwoBytes,
	2,
	{{0x81, 0x9f}, {0xe0, 0xfc}}},
	{"EUC",
	CIDSET_JAPAN1,
	CIDCODING_JIS,
	CPDF_CMap::MixedTwoBytes,
	2,
	{{0x8e, 0x8e}, {0xa1, 0xfe}}},
	{"H", CIDSET_JAPAN1, CIDCODING_JIS, CPDF_CMap::TwoBytes, 1, {{0x21, 0x7e}}},
	{"V", CIDSET_JAPAN1, CIDCODING_JIS, CPDF_CMap::TwoBytes, 1, {{0x21, 0x7e}}},
	{"Ext-RKSJ",
	CIDSET_JAPAN1,
	CIDCODING_JIS,
	CPDF_CMap::MixedTwoBytes,
	2,
	{{0x81, 0x9f}, {0xe0, 0xfc}}},
	{"UniJIS-UCS2", CIDSET_JAPAN1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
	{"UniJIS-UCS2-HW",
	CIDSET_JAPAN1,
	CIDCODING_UCS2,
	CPDF_CMap::TwoBytes,
	0,
	{}},
	{"UniJIS-UTF16",
	CIDSET_JAPAN1,
	CIDCODING_UTF16,
	CPDF_CMap::TwoBytes,
	0,
	{}},
	// CIDSET_KOREA1 entries.
	{"KSC-EUC",
	CIDSET_KOREA1,
	CIDCODING_KOREA,
	CPDF_CMap::MixedTwoBytes,
	1,
	{{0xa1, 0xfe}}},
	{"KSCms-UHC",
	CIDSET_KOREA1,
	CIDCODING_KOREA,
	CPDF_CMap::MixedTwoBytes,
	1,
	{{0x81, 0xfe}}},
	{"KSCms-UHC-HW",
	CIDSET_KOREA1,
	CIDCODING_KOREA,
	CPDF_CMap::MixedTwoBytes,
	1,
	{{0x81, 0xfe}}},
	{"KSCpc-EUC",
	CIDSET_KOREA1,
	CIDCODING_KOREA,
	CPDF_CMap::MixedTwoBytes,
	1,
	{{0xa1, 0xfd}}},
	{"UniKS-UCS2", CIDSET_KOREA1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
	{"UniKS-UTF16", CIDSET_KOREA1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
	};

	// Looks up |bsPredefinedName| in kPredefinedCMaps and returns the matching
	// row, or nullptr when there is none. Names longer than two characters are
	// compared without their last two characters (presumably the "-H"/"-V"
	// suffix); shorter names are compared as given.
	const PredefinedCMap* GetPredefinedCMap(const ByteString& bsPredefinedName) {
	  const size_t name_length = bsPredefinedName.GetLength();
	  const ByteString bsKey = name_length > 2
	                               ? bsPredefinedName.Left(name_length - 2)
	                               : bsPredefinedName;
	  for (const PredefinedCMap& entry : kPredefinedCMaps) {
	    if (bsKey == ByteStringView(entry.m_pName))
	      return &entry;
	  }
	  return nullptr;
	}

	std::vector<bool> LoadLeadingSegments(const PredefinedCMap& map) {
	std::vector<bool> segments(256);
	for (uint32_t i = 0; i < map.m_LeadingSegCount; ++i) {
	const ByteRange& seg = map.m_LeadingSegs[i];
	for (int b = seg.m_First; b <= seg.m_Last; ++b)
	segments[b] = true;
	}
	return segments;
	}

	// Tests the first |size| bytes of |codes| against |ranges|, scanning from the
	// last range to the first and skipping ranges whose m_CharSize is smaller
	// than |size|. The first range whose leading byte accepts codes[0] decides
	// the result:
	//   2 - every byte of the range matched, or |size| equals the range's
	//       m_CharSize;
	//   1 - only a prefix matched and the range is longer than |size|.
	// Returns 0 when no range accepts the first byte.
	int CheckFourByteCodeRange(uint8_t* codes,
	                           size_t size,
	                           const std::vector<CPDF_CMap::CodeRange>& ranges) {
	  for (size_t i = ranges.size(); i > 0; i--) {
	    size_t seg = i - 1;
	    if (ranges[seg].m_CharSize < size)
	      continue;
	    // Count how many leading bytes lie within [m_Lower, m_Upper].
	    size_t iChar = 0;
	    while (iChar < size) {
	      if (codes[iChar] < ranges[seg].m_Lower[iChar] ||
	          codes[iChar] > ranges[seg].m_Upper[iChar]) {
	        break;
	      }
	      ++iChar;
	    }
	    if (iChar == ranges[seg].m_CharSize)
	      return 2;
	    if (iChar)
	      return (size == ranges[seg].m_CharSize) ? 2 : 1;
	  }
	  return 0;
	}

	// Returns the byte width (4 down to 1, widest first) of the first range in
	// |ranges| that has exactly that width and fully contains the zero-padded
	// big-endian encoding of the low 16 bits of |charcode|. Ranges are scanned
	// from last to first. Returns 1 when |ranges| is empty or nothing matches.
	size_t GetFourByteCharSizeImpl(
	    uint32_t charcode,
	    const std::vector<CPDF_CMap::CodeRange>& ranges) {
	  if (ranges.empty())
	    return 1;

	  // Only the low two bytes of |charcode| are used; the top two stay zero.
	  uint8_t codes[4];
	  codes[0] = codes[1] = 0x00;
	  codes[2] = static_cast<uint8_t>(charcode >> 8 & 0xFF);
	  codes[3] = static_cast<uint8_t>(charcode);
	  for (size_t offset = 0; offset < 4; offset++) {
	    size_t size = 4 - offset;
	    for (size_t j = 0; j < ranges.size(); j++) {
	      size_t iSeg = (ranges.size() - 1) - j;
	      if (ranges[iSeg].m_CharSize < size)
	        continue;
	      size_t iChar = 0;
	      while (iChar < size) {
	        if (codes[offset + iChar] < ranges[iSeg].m_Lower[iChar] ||
	            codes[offset + iChar] > ranges[iSeg].m_Upper[iChar]) {
	          break;
	        }
	        ++iChar;
	      }
	      if (iChar == ranges[iSeg].m_CharSize)
	        return size;
	    }
	  }
	  return 1;
	}

	} // namespace

	// Constructs a CMap from a predefined CMap name. The CMap is vertical when
	// the name ends in 'V'. "Identity-H"/"Identity-V" are handled directly; any
	// other name must be found in the predefined table, and the CMap is marked
	// loaded only when a matching embedded CMap is also found.
	CPDF_CMap::CPDF_CMap(const ByteString& bsPredefinedName)
	    : m_bVertical(bsPredefinedName.Last() == 'V') {
	  if (bsPredefinedName == "Identity-H" || bsPredefinedName == "Identity-V") {
	    m_Coding = CIDCODING_CID;
	    m_bLoaded = true;
	    return;
	  }

	  const PredefinedCMap* map = GetPredefinedCMap(bsPredefinedName);
	  if (!map)
	    return;

	  m_Charset = map->m_Charset;
	  m_Coding = map->m_Coding;
	  m_CodingScheme = map->m_CodingScheme;
	  // The leading-byte table is only needed for the mixed 1/2-byte scheme.
	  if (m_CodingScheme == MixedTwoBytes)
	    m_MixedTwoByteLeadingBytes = LoadLeadingSegments(*map);
	  m_pEmbedMap = FindEmbeddedCMap(
	      CPDF_FontGlobals::GetInstance()->GetEmbeddedCharset(m_Charset),
	      bsPredefinedName);
	  if (!m_pEmbedMap)
	    return;

	  m_bLoaded = true;
	}

	// Constructs a CMap by parsing embedded CMap data. The direct lookup table
	// is created with 65536 entries, then every word of |spEmbeddedData| is fed
	// to a CPDF_CMapParser until an empty word is returned.
	CPDF_CMap::CPDF_CMap(pdfium::span<const uint8_t> spEmbeddedData)
	    : m_DirectCharcodeToCIDTable(65536) {
	  CPDF_CMapParser parser(this);
	  CPDF_SimpleParser syntax(spEmbeddedData);
	  for (ByteStringView word = syntax.GetWord(); !word.IsEmpty();
	       word = syntax.GetWord()) {
	    parser.ParseWord(word);
	  }

	  if (m_CodingScheme != MixedFourBytes || !parser.HasAdditionalMappings())
	    return;

	  // Keep the extra mappings ordered by end code; CIDFromCharCode() searches
	  // them with std::lower_bound on m_EndCode.
	  m_AdditionalCharcodeToCIDMappings = parser.TakeAdditionalMappings();
	  std::sort(m_AdditionalCharcodeToCIDMappings.begin(),
	            m_AdditionalCharcodeToCIDMappings.end(),
	            [](const CPDF_CMap::CIDRange& lhs, const CPDF_CMap::CIDRange& rhs) {
	              return lhs.m_EndCode < rhs.m_EndCode;
	            });
	}

	// Defaulted destructor; members release whatever they hold.
	CPDF_CMap::~CPDF_CMap() = default;

	// Maps |charcode| to a CID. Sources are tried in order: identity coding
	// (the code truncated to 16 bits), the embedded CMap, the direct table for
	// codes below 0x10000, and finally the sorted additional ranges. A code is
	// also returned truncated when the direct table is empty. Returns 0 when no
	// additional range contains |charcode|.
	uint16_t CPDF_CMap::CIDFromCharCode(uint32_t charcode) const {
	  if (m_Coding == CIDCODING_CID)
	    return static_cast<uint16_t>(charcode);

	  if (m_pEmbedMap)
	    return ::CIDFromCharCode(m_pEmbedMap.Get(), charcode);

	  if (m_DirectCharcodeToCIDTable.empty())
	    return static_cast<uint16_t>(charcode);

	  if (charcode < 0x10000)
	    return m_DirectCharcodeToCIDTable[charcode];

	  // First range whose end code is not below |charcode|.
	  auto it = std::lower_bound(m_AdditionalCharcodeToCIDMappings.begin(),
	                             m_AdditionalCharcodeToCIDMappings.end(), charcode,
	                             [](const CPDF_CMap::CIDRange& arg, uint32_t val) {
	                               return arg.m_EndCode < val;
	                             });
	  if (it == m_AdditionalCharcodeToCIDMappings.end() ||
	      it->m_StartCode > charcode) {
	    return 0;
	  }
	  return it->m_StartCID + charcode - it->m_StartCode;
	}

	// Decodes the character code that starts at |*pOffset| in |pString| and
	// advances |*pOffset| past the bytes consumed. In the one- and two-byte
	// schemes a byte that lies past the end of the string is read as 0 without
	// advancing. In the MixedFourBytes scheme 0 is returned when no code range
	// accepts the bytes read so far, when four bytes did not form a complete
	// code, or when the string ends before the code is complete.
	uint32_t CPDF_CMap::GetNextChar(ByteStringView pString, size_t* pOffset) const {
	  size_t& offset = *pOffset;
	  auto pBytes = pString.raw_span();
	  switch (m_CodingScheme) {
	    case OneByte: {
	      return offset < pBytes.size() ? pBytes[offset++] : 0;
	    }
	    case TwoBytes: {
	      uint8_t byte1 = offset < pBytes.size() ? pBytes[offset++] : 0;
	      uint8_t byte2 = offset < pBytes.size() ? pBytes[offset++] : 0;
	      return 256 * byte1 + byte2;
	    }
	    case MixedTwoBytes: {
	      uint8_t byte1 = offset < pBytes.size() ? pBytes[offset++] : 0;
	      // A non-leading byte is a complete one-byte code.
	      if (!m_MixedTwoByteLeadingBytes[byte1])
	        return byte1;
	      uint8_t byte2 = offset < pBytes.size() ? pBytes[offset++] : 0;
	      return 256 * byte1 + byte2;
	    }
	    case MixedFourBytes: {
	      uint8_t codes[4];
	      int char_size = 1;
	      codes[0] = offset < pBytes.size() ? pBytes[offset++] : 0;
	      while (true) {
	        int ret = CheckFourByteCodeRange(codes, char_size,
	                                         m_MixedFourByteLeadingRanges);
	        if (ret == 0)
	          return 0;
	        if (ret == 2) {
	          // Complete code: assemble the bytes big-endian.
	          uint32_t charcode = 0;
	          for (int i = 0; i < char_size; i++)
	            charcode = (charcode << 8) + codes[i];
	          return charcode;
	        }
	        // Use >= rather than == so that an offset already past the end of
	        // the string can never be used to index |pBytes|.
	        if (char_size == 4 || offset >= pBytes.size())
	          return 0;
	        codes[char_size++] = pBytes[offset++];
	      }
	    }
	  }
	  return 0;
	}

	// Returns the number of bytes |charcode| occupies under the current coding
	// scheme: fixed at 1 or 2 for the single-width schemes, 1 or 2 depending on
	// whether the code is below 0x100 for MixedTwoBytes, and the number of
	// significant bytes (1 to 4) for MixedFourBytes.
	int CPDF_CMap::GetCharSize(uint32_t charcode) const {
	  switch (m_CodingScheme) {
	    case OneByte:
	      return 1;
	    case TwoBytes:
	      return 2;
	    case MixedTwoBytes:
	      return charcode < 0x100 ? 1 : 2;
	    case MixedFourBytes: {
	      // Count one byte, plus one more for every non-zero higher byte group.
	      int byte_count = 1;
	      for (uint32_t rest = charcode >> 8; rest != 0; rest >>= 8)
	        ++byte_count;
	      return byte_count;
	    }
	  }
	  return 1;
	}

	// Returns how many character codes |pString| contains under the current
	// coding scheme. A trailing odd byte in the TwoBytes scheme, or a leading
	// byte at the very end in MixedTwoBytes, still counts as one character.
	size_t CPDF_CMap::CountChar(ByteStringView pString) const {
	  const size_t length = pString.GetLength();
	  switch (m_CodingScheme) {
	    case OneByte:
	      return length;
	    case TwoBytes:
	      return (length + 1) / 2;
	    case MixedTwoBytes: {
	      size_t char_count = 0;
	      size_t pos = 0;
	      while (pos < length) {
	        // A leading byte consumes the following byte as well.
	        pos += m_MixedTwoByteLeadingBytes[pString[pos]] ? 2 : 1;
	        ++char_count;
	      }
	      return char_count;
	    }
	    case MixedFourBytes: {
	      size_t char_count = 0;
	      for (size_t pos = 0; pos < length; ++char_count)
	        GetNextChar(pString, &pos);
	      return char_count;
	    }
	  }
	  return length;
	}

	// Writes the big-endian byte encoding of |charcode| under the current coding
	// scheme to |str| and returns the number of bytes written (0 for an
	// unrecognised scheme). |str| must have room for up to four bytes.
	int CPDF_CMap::AppendChar(char* str, uint32_t charcode) const {
	  switch (m_CodingScheme) {
	    case OneByte:
	      str[0] = static_cast<char>(charcode);
	      return 1;
	    case TwoBytes:
	      str[0] = static_cast<char>(charcode / 256);
	      str[1] = static_cast<char>(charcode % 256);
	      return 2;
	    case MixedTwoBytes: {
	      const bool is_single_byte =
	          charcode < 0x100 && !m_MixedTwoByteLeadingBytes[charcode];
	      if (is_single_byte) {
	        str[0] = static_cast<char>(charcode);
	        return 1;
	      }
	      str[0] = static_cast<char>(charcode >> 8);
	      str[1] = static_cast<char>(charcode);
	      return 2;
	    }
	    case MixedFourBytes: {
	      if (charcode < 0x100) {
	        // The width of a small code comes from the code space ranges; the
	        // value goes in the last byte and the bytes before it are zeroed.
	        int width = static_cast<int>(
	            GetFourByteCharSizeImpl(charcode, m_MixedFourByteLeadingRanges));
	        if (width == 0)
	          width = 1;
	        const int padding = width - 1;
	        if (padding > 0)
	          memset(str, 0, padding);
	        str[padding] = static_cast<char>(charcode);
	        return width;
	      }
	      // Larger codes use exactly as many bytes as they have significant
	      // bytes, most significant first.
	      const int width = charcode < 0x10000 ? 2 : (charcode < 0x1000000 ? 3 : 4);
	      for (int i = 0; i < width; ++i)
	        str[i] = static_cast<char>(charcode >> (8 * (width - 1 - i)));
	      return width;
	    }
	  }
	  return 0;
	}