| // Copyright 2017 PDFium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
| |
| #include "core/fpdfapi/font/cpdf_cmap.h" |
| |
| #include <memory> |
| #include <utility> |
| #include <vector> |
| |
| #include "core/fpdfapi/cmaps/cmap_int.h" |
| #include "core/fpdfapi/font/cpdf_cmapmanager.h" |
| #include "core/fpdfapi/font/cpdf_cmapparser.h" |
| #include "core/fpdfapi/parser/cpdf_simple_parser.h" |
| |
| namespace { |
| |
// Closed interval [m_First, m_Last] of byte values. The predefined-CMap table
// uses it to list the lead bytes that introduce a two-byte character code.
struct ByteRange {
  // Lowest byte value in the range.
  uint8_t m_First;
  // Highest byte value in the range; the bound itself is included.
  uint8_t m_Last;
};
| |
| struct PredefinedCMap { |
| const char* m_pName; |
| CIDSet m_Charset; |
| CIDCoding m_Coding; |
| CPDF_CMap::CodingScheme m_CodingScheme; |
| uint8_t m_LeadingSegCount; |
| ByteRange m_LeadingSegs[2]; |
| }; |
| |
| const PredefinedCMap g_PredefinedCMaps[] = { |
| {"GB-EUC", |
| CIDSET_GB1, |
| CIDCODING_GB, |
| CPDF_CMap::MixedTwoBytes, |
| 1, |
| {{0xa1, 0xfe}}}, |
| {"GBpc-EUC", |
| CIDSET_GB1, |
| CIDCODING_GB, |
| CPDF_CMap::MixedTwoBytes, |
| 1, |
| {{0xa1, 0xfc}}}, |
| {"GBK-EUC", |
| CIDSET_GB1, |
| CIDCODING_GB, |
| CPDF_CMap::MixedTwoBytes, |
| 1, |
| {{0x81, 0xfe}}}, |
| {"GBKp-EUC", |
| CIDSET_GB1, |
| CIDCODING_GB, |
| CPDF_CMap::MixedTwoBytes, |
| 1, |
| {{0x81, 0xfe}}}, |
| {"GBK2K-EUC", |
| CIDSET_GB1, |
| CIDCODING_GB, |
| CPDF_CMap::MixedTwoBytes, |
| 1, |
| {{0x81, 0xfe}}}, |
| {"GBK2K", |
| CIDSET_GB1, |
| CIDCODING_GB, |
| CPDF_CMap::MixedTwoBytes, |
| 1, |
| {{0x81, 0xfe}}}, |
| {"UniGB-UCS2", CIDSET_GB1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}}, |
| {"UniGB-UTF16", CIDSET_GB1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}}, |
| {"B5pc", |
| CIDSET_CNS1, |
| CIDCODING_BIG5, |
| CPDF_CMap::MixedTwoBytes, |
| 1, |
| {{0xa1, 0xfc}}}, |
| {"HKscs-B5", |
| CIDSET_CNS1, |
| CIDCODING_BIG5, |
| CPDF_CMap::MixedTwoBytes, |
| 1, |
| {{0x88, 0xfe}}}, |
| {"ETen-B5", |
| CIDSET_CNS1, |
| CIDCODING_BIG5, |
| CPDF_CMap::MixedTwoBytes, |
| 1, |
| {{0xa1, 0xfe}}}, |
| {"ETenms-B5", |
| CIDSET_CNS1, |
| CIDCODING_BIG5, |
| CPDF_CMap::MixedTwoBytes, |
| 1, |
| {{0xa1, 0xfe}}}, |
| {"UniCNS-UCS2", CIDSET_CNS1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}}, |
| {"UniCNS-UTF16", CIDSET_CNS1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}}, |
| {"83pv-RKSJ", |
| CIDSET_JAPAN1, |
| CIDCODING_JIS, |
| CPDF_CMap::MixedTwoBytes, |
| 2, |
| {{0x81, 0x9f}, {0xe0, 0xfc}}}, |
| {"90ms-RKSJ", |
| CIDSET_JAPAN1, |
| CIDCODING_JIS, |
| CPDF_CMap::MixedTwoBytes, |
| 2, |
| {{0x81, 0x9f}, {0xe0, 0xfc}}}, |
| {"90msp-RKSJ", |
| CIDSET_JAPAN1, |
| CIDCODING_JIS, |
| CPDF_CMap::MixedTwoBytes, |
| 2, |
| {{0x81, 0x9f}, {0xe0, 0xfc}}}, |
| {"90pv-RKSJ", |
| CIDSET_JAPAN1, |
| CIDCODING_JIS, |
| CPDF_CMap::MixedTwoBytes, |
| 2, |
| {{0x81, 0x9f}, {0xe0, 0xfc}}}, |
| {"Add-RKSJ", |
| CIDSET_JAPAN1, |
| CIDCODING_JIS, |
| CPDF_CMap::MixedTwoBytes, |
| 2, |
| {{0x81, 0x9f}, {0xe0, 0xfc}}}, |
| {"EUC", |
| CIDSET_JAPAN1, |
| CIDCODING_JIS, |
| CPDF_CMap::MixedTwoBytes, |
| 2, |
| {{0x8e, 0x8e}, {0xa1, 0xfe}}}, |
| {"H", CIDSET_JAPAN1, CIDCODING_JIS, CPDF_CMap::TwoBytes, 1, {{0x21, 0x7e}}}, |
| {"V", CIDSET_JAPAN1, CIDCODING_JIS, CPDF_CMap::TwoBytes, 1, {{0x21, 0x7e}}}, |
| {"Ext-RKSJ", |
| CIDSET_JAPAN1, |
| CIDCODING_JIS, |
| CPDF_CMap::MixedTwoBytes, |
| 2, |
| {{0x81, 0x9f}, {0xe0, 0xfc}}}, |
| {"UniJIS-UCS2", CIDSET_JAPAN1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}}, |
| {"UniJIS-UCS2-HW", |
| CIDSET_JAPAN1, |
| CIDCODING_UCS2, |
| CPDF_CMap::TwoBytes, |
| 0, |
| {}}, |
| {"UniJIS-UTF16", |
| CIDSET_JAPAN1, |
| CIDCODING_UTF16, |
| CPDF_CMap::TwoBytes, |
| 0, |
| {}}, |
| {"KSC-EUC", |
| CIDSET_KOREA1, |
| CIDCODING_KOREA, |
| CPDF_CMap::MixedTwoBytes, |
| 1, |
| {{0xa1, 0xfe}}}, |
| {"KSCms-UHC", |
| CIDSET_KOREA1, |
| CIDCODING_KOREA, |
| CPDF_CMap::MixedTwoBytes, |
| 1, |
| {{0x81, 0xfe}}}, |
| {"KSCms-UHC-HW", |
| CIDSET_KOREA1, |
| CIDCODING_KOREA, |
| CPDF_CMap::MixedTwoBytes, |
| 1, |
| {{0x81, 0xfe}}}, |
| {"KSCpc-EUC", |
| CIDSET_KOREA1, |
| CIDCODING_KOREA, |
| CPDF_CMap::MixedTwoBytes, |
| 1, |
| {{0xa1, 0xfd}}}, |
| {"UniKS-UCS2", CIDSET_KOREA1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}}, |
| {"UniKS-UTF16", CIDSET_KOREA1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}}, |
| }; |
| |
| int CheckFourByteCodeRange(uint8_t* codes, |
| int size, |
| const std::vector<CPDF_CMap::CodeRange>& ranges) { |
| int iSeg = pdfium::CollectionSize<int>(ranges) - 1; |
| while (iSeg >= 0) { |
| if (ranges[iSeg].m_CharSize < size) { |
| --iSeg; |
| continue; |
| } |
| int iChar = 0; |
| while (iChar < size) { |
| if (codes[iChar] < ranges[iSeg].m_Lower[iChar] || |
| codes[iChar] > ranges[iSeg].m_Upper[iChar]) { |
| break; |
| } |
| ++iChar; |
| } |
| if (iChar == ranges[iSeg].m_CharSize) |
| return 2; |
| if (iChar) |
| return (size == ranges[iSeg].m_CharSize) ? 2 : 1; |
| iSeg--; |
| } |
| return 0; |
| } |
| |
| int GetFourByteCharSizeImpl(uint32_t charcode, |
| const std::vector<CPDF_CMap::CodeRange>& ranges) { |
| if (ranges.empty()) |
| return 1; |
| |
| uint8_t codes[4]; |
| codes[0] = codes[1] = 0x00; |
| codes[2] = (uint8_t)(charcode >> 8 & 0xFF); |
| codes[3] = (uint8_t)charcode; |
| int offset = 0; |
| int size = 4; |
| for (int i = 0; i < 4; ++i) { |
| int iSeg = pdfium::CollectionSize<int>(ranges) - 1; |
| while (iSeg >= 0) { |
| if (ranges[iSeg].m_CharSize < size) { |
| --iSeg; |
| continue; |
| } |
| int iChar = 0; |
| while (iChar < size) { |
| if (codes[offset + iChar] < ranges[iSeg].m_Lower[iChar] || |
| codes[offset + iChar] > ranges[iSeg].m_Upper[iChar]) { |
| break; |
| } |
| ++iChar; |
| } |
| if (iChar == ranges[iSeg].m_CharSize) |
| return size; |
| --iSeg; |
| } |
| --size; |
| ++offset; |
| } |
| return 1; |
| } |
| |
| } // namespace |
| |
| CPDF_CMap::CPDF_CMap() |
| : m_bLoaded(false), |
| m_bVertical(false), |
| m_Charset(CIDSET_UNKNOWN), |
| m_CodingScheme(TwoBytes), |
| m_Coding(CIDCODING_UNKNOWN), |
| m_pEmbedMap(nullptr) {} |
| |
| CPDF_CMap::~CPDF_CMap() {} |
| |
| void CPDF_CMap::LoadPredefined(CPDF_CMapManager* pMgr, |
| const CFX_ByteString& bsName, |
| bool bPromptCJK) { |
| m_PredefinedCMap = bsName; |
| if (m_PredefinedCMap == "Identity-H" || m_PredefinedCMap == "Identity-V") { |
| m_Coding = CIDCODING_CID; |
| m_bVertical = bsName[9] == 'V'; |
| m_bLoaded = true; |
| return; |
| } |
| CFX_ByteString cmapid = m_PredefinedCMap; |
| m_bVertical = cmapid.Right(1) == "V"; |
| if (cmapid.GetLength() > 2) { |
| cmapid = cmapid.Left(cmapid.GetLength() - 2); |
| } |
| const PredefinedCMap* map = nullptr; |
| for (size_t i = 0; i < FX_ArraySize(g_PredefinedCMaps); ++i) { |
| if (cmapid == CFX_ByteStringC(g_PredefinedCMaps[i].m_pName)) { |
| map = &g_PredefinedCMaps[i]; |
| break; |
| } |
| } |
| if (!map) |
| return; |
| |
| m_Charset = map->m_Charset; |
| m_Coding = map->m_Coding; |
| m_CodingScheme = map->m_CodingScheme; |
| if (m_CodingScheme == MixedTwoBytes) { |
| m_MixedTwoByteLeadingBytes = std::vector<bool>(256); |
| for (uint32_t i = 0; i < map->m_LeadingSegCount; ++i) { |
| const ByteRange& seg = map->m_LeadingSegs[i]; |
| for (int b = seg.m_First; b <= seg.m_Last; ++b) |
| m_MixedTwoByteLeadingBytes[b] = true; |
| } |
| } |
| FPDFAPI_FindEmbeddedCMap(bsName, m_Charset, m_Coding, m_pEmbedMap); |
| if (!m_pEmbedMap) |
| return; |
| |
| m_bLoaded = true; |
| } |
| |
| void CPDF_CMap::LoadEmbedded(const uint8_t* pData, uint32_t size) { |
| m_DirectCharcodeToCIDTable = std::vector<uint16_t>(65536); |
| CPDF_CMapParser parser(this); |
| CPDF_SimpleParser syntax(pData, size); |
| while (1) { |
| CFX_ByteStringC word = syntax.GetWord(); |
| if (word.IsEmpty()) { |
| break; |
| } |
| parser.ParseWord(word); |
| } |
| if (m_CodingScheme == MixedFourBytes && parser.HasAdditionalMappings()) { |
| m_AdditionalCharcodeToCIDMappings = parser.TakeAdditionalMappings(); |
| std::sort( |
| m_AdditionalCharcodeToCIDMappings.begin(), |
| m_AdditionalCharcodeToCIDMappings.end(), |
| [](const CPDF_CMap::CIDRange& arg1, const CPDF_CMap::CIDRange& arg2) { |
| return arg1.m_EndCode < arg2.m_EndCode; |
| }); |
| } |
| } |
| |
| uint16_t CPDF_CMap::CIDFromCharCode(uint32_t charcode) const { |
| if (m_Coding == CIDCODING_CID) |
| return static_cast<uint16_t>(charcode); |
| |
| if (m_pEmbedMap) |
| return FPDFAPI_CIDFromCharCode(m_pEmbedMap, charcode); |
| |
| if (m_DirectCharcodeToCIDTable.empty()) |
| return static_cast<uint16_t>(charcode); |
| |
| if (charcode < 0x10000) |
| return m_DirectCharcodeToCIDTable[charcode]; |
| |
| auto it = std::lower_bound(m_AdditionalCharcodeToCIDMappings.begin(), |
| m_AdditionalCharcodeToCIDMappings.end(), charcode, |
| [](const CPDF_CMap::CIDRange& arg, uint32_t val) { |
| return arg.m_EndCode < val; |
| }); |
| if (it == m_AdditionalCharcodeToCIDMappings.end() || |
| it->m_StartCode > charcode) { |
| return 0; |
| } |
| return it->m_StartCID + charcode - it->m_StartCode; |
| } |
| |
| uint32_t CPDF_CMap::GetNextChar(const char* pString, |
| int nStrLen, |
| int& offset) const { |
| auto* pBytes = reinterpret_cast<const uint8_t*>(pString); |
| switch (m_CodingScheme) { |
| case OneByte: { |
| return pBytes[offset++]; |
| } |
| case TwoBytes: { |
| uint8_t byte1 = pBytes[offset++]; |
| return 256 * byte1 + pBytes[offset++]; |
| } |
| case MixedTwoBytes: { |
| uint8_t byte1 = pBytes[offset++]; |
| if (!m_MixedTwoByteLeadingBytes[byte1]) |
| return byte1; |
| return 256 * byte1 + pBytes[offset++]; |
| } |
| case MixedFourBytes: { |
| uint8_t codes[4]; |
| int char_size = 1; |
| codes[0] = pBytes[offset++]; |
| while (1) { |
| int ret = CheckFourByteCodeRange(codes, char_size, |
| m_MixedFourByteLeadingRanges); |
| if (ret == 0) |
| return 0; |
| if (ret == 2) { |
| uint32_t charcode = 0; |
| for (int i = 0; i < char_size; i++) |
| charcode = (charcode << 8) + codes[i]; |
| return charcode; |
| } |
| if (char_size == 4 || offset == nStrLen) |
| return 0; |
| codes[char_size++] = pBytes[offset++]; |
| } |
| break; |
| } |
| } |
| return 0; |
| } |
| |
| int CPDF_CMap::GetCharSize(uint32_t charcode) const { |
| switch (m_CodingScheme) { |
| case OneByte: |
| return 1; |
| case TwoBytes: |
| return 2; |
| case MixedTwoBytes: |
| if (charcode < 0x100) |
| return 1; |
| return 2; |
| case MixedFourBytes: |
| if (charcode < 0x100) |
| return 1; |
| if (charcode < 0x10000) |
| return 2; |
| if (charcode < 0x1000000) |
| return 3; |
| return 4; |
| } |
| return 1; |
| } |
| |
| int CPDF_CMap::CountChar(const char* pString, int size) const { |
| switch (m_CodingScheme) { |
| case OneByte: |
| return size; |
| case TwoBytes: |
| return (size + 1) / 2; |
| case MixedTwoBytes: { |
| int count = 0; |
| for (int i = 0; i < size; i++) { |
| count++; |
| if (m_MixedTwoByteLeadingBytes[reinterpret_cast<const uint8_t*>( |
| pString)[i]]) { |
| i++; |
| } |
| } |
| return count; |
| } |
| case MixedFourBytes: { |
| int count = 0, offset = 0; |
| while (offset < size) { |
| GetNextChar(pString, size, offset); |
| count++; |
| } |
| return count; |
| } |
| } |
| return size; |
| } |
| |
| int CPDF_CMap::AppendChar(char* str, uint32_t charcode) const { |
| switch (m_CodingScheme) { |
| case OneByte: |
| str[0] = (uint8_t)charcode; |
| return 1; |
| case TwoBytes: |
| str[0] = (uint8_t)(charcode / 256); |
| str[1] = (uint8_t)(charcode % 256); |
| return 2; |
| case MixedTwoBytes: |
| if (charcode < 0x100 && !m_MixedTwoByteLeadingBytes[(uint8_t)charcode]) { |
| str[0] = (uint8_t)charcode; |
| return 1; |
| } |
| str[0] = (uint8_t)(charcode >> 8); |
| str[1] = (uint8_t)charcode; |
| return 2; |
| case MixedFourBytes: |
| if (charcode < 0x100) { |
| int iSize = |
| GetFourByteCharSizeImpl(charcode, m_MixedFourByteLeadingRanges); |
| if (iSize == 0) |
| iSize = 1; |
| str[iSize - 1] = (uint8_t)charcode; |
| if (iSize > 1) |
| memset(str, 0, iSize - 1); |
| return iSize; |
| } |
| if (charcode < 0x10000) { |
| str[0] = (uint8_t)(charcode >> 8); |
| str[1] = (uint8_t)charcode; |
| return 2; |
| } |
| if (charcode < 0x1000000) { |
| str[0] = (uint8_t)(charcode >> 16); |
| str[1] = (uint8_t)(charcode >> 8); |
| str[2] = (uint8_t)charcode; |
| return 3; |
| } |
| str[0] = (uint8_t)(charcode >> 24); |
| str[1] = (uint8_t)(charcode >> 16); |
| str[2] = (uint8_t)(charcode >> 8); |
| str[3] = (uint8_t)charcode; |
| return 4; |
| } |
| return 0; |
| } |