| // Copyright 2017 The PDFium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
| |
| #include "core/fpdfapi/font/cpdf_tounicodemap.h" |
| |
| #include <set> |
| #include <utility> |
| |
| #include "core/fpdfapi/font/cpdf_cid2unicodemap.h" |
| #include "core/fpdfapi/font/cpdf_fontglobals.h" |
| #include "core/fpdfapi/parser/cpdf_simple_parser.h" |
| #include "core/fpdfapi/parser/cpdf_stream.h" |
| #include "core/fpdfapi/parser/fpdf_parser_utility.h" |
| #include "core/fxcrt/containers/contains.h" |
| #include "core/fxcrt/fx_extension.h" |
| #include "core/fxcrt/fx_safe_types.h" |
| #include "third_party/abseil-cpp/absl/types/variant.h" |
| |
| namespace { |
| |
| constexpr uint32_t kCidLimit = 0xffff; |
| |
| WideString StringDataAdd(WideString str) { |
| WideString ret; |
| wchar_t value = 1; |
| for (size_t i = str.GetLength(); i > 0; --i) { |
| wchar_t ch = str[i - 1] + value; |
| if (ch < str[i - 1]) { |
| ret.InsertAtFront(0); |
| } else { |
| ret.InsertAtFront(ch); |
| value = 0; |
| } |
| } |
| if (value) |
| ret.InsertAtFront(value); |
| return ret; |
| } |
| |
| } // namespace |
| |
| CPDF_ToUnicodeMap::CPDF_ToUnicodeMap(RetainPtr<const CPDF_Stream> pStream) { |
| Load(std::move(pStream)); |
| } |
| |
| CPDF_ToUnicodeMap::~CPDF_ToUnicodeMap() = default; |
| |
| WideString CPDF_ToUnicodeMap::Lookup(uint32_t charcode) const { |
| auto it = m_Multimap.find(charcode); |
| if (it == m_Multimap.end()) { |
| if (!m_pBaseMap) |
| return WideString(); |
| return WideString( |
| m_pBaseMap->UnicodeFromCID(static_cast<uint16_t>(charcode))); |
| } |
| |
| uint32_t value = *it->second.begin(); |
| wchar_t unicode = static_cast<wchar_t>(value & 0xffff); |
| if (unicode != 0xffff) |
| return WideString(unicode); |
| |
| size_t index = value >> 16; |
| return index < m_MultiCharVec.size() ? m_MultiCharVec[index] : WideString(); |
| } |
| |
| uint32_t CPDF_ToUnicodeMap::ReverseLookup(wchar_t unicode) const { |
| for (const auto& pair : m_Multimap) { |
| if (pdfium::Contains(pair.second, static_cast<uint32_t>(unicode))) |
| return pair.first; |
| } |
| return 0; |
| } |
| |
| size_t CPDF_ToUnicodeMap::GetUnicodeCountByCharcodeForTesting( |
| uint32_t charcode) const { |
| auto it = m_Multimap.find(charcode); |
| return it != m_Multimap.end() ? it->second.size() : 0u; |
| } |
| |
| // static |
| std::optional<uint32_t> CPDF_ToUnicodeMap::StringToCode(ByteStringView input) { |
| // Ignore whitespaces within `input`. See https://crbug.com/pdfium/2065. |
| std::set<char> seen_whitespace_chars; |
| for (char c : input) { |
| if (PDFCharIsWhitespace(c)) { |
| seen_whitespace_chars.insert(c); |
| } |
| } |
| ByteString str_without_whitespace_chars; // Must outlive `str`. |
| ByteStringView str; |
| if (seen_whitespace_chars.empty()) { |
| str = input; |
| } else { |
| str_without_whitespace_chars.Reserve(input.GetLength()); |
| for (char c : input) { |
| if (!pdfium::Contains(seen_whitespace_chars, c)) { |
| str_without_whitespace_chars += c; |
| } |
| } |
| str = str_without_whitespace_chars.AsStringView(); |
| } |
| |
| size_t len = str.GetLength(); |
| if (len <= 2 || str[0] != '<' || str[len - 1] != '>') |
| return std::nullopt; |
| |
| FX_SAFE_UINT32 code = 0; |
| for (char c : str.Substr(1, len - 2)) { |
| if (!FXSYS_IsHexDigit(c)) |
| return std::nullopt; |
| |
| code = code * 16 + FXSYS_HexCharToInt(c); |
| if (!code.IsValid()) |
| return std::nullopt; |
| } |
| return std::optional<uint32_t>(code.ValueOrDie()); |
| } |
| |
| // static |
| WideString CPDF_ToUnicodeMap::StringToWideString(ByteStringView str) { |
| size_t len = str.GetLength(); |
| if (len <= 2 || str[0] != '<' || str[len - 1] != '>') |
| return WideString(); |
| |
| WideString result; |
| int byte_pos = 0; |
| wchar_t ch = 0; |
| for (char c : str.Substr(1, len - 2)) { |
| if (!FXSYS_IsHexDigit(c)) |
| break; |
| |
| ch = ch * 16 + FXSYS_HexCharToInt(c); |
| byte_pos++; |
| if (byte_pos == 4) { |
| result += ch; |
| byte_pos = 0; |
| ch = 0; |
| } |
| } |
| return result; |
| } |
| |
| void CPDF_ToUnicodeMap::Load(RetainPtr<const CPDF_Stream> pStream) { |
| CIDSet cid_set = CIDSET_UNKNOWN; |
| auto pAcc = pdfium::MakeRetain<CPDF_StreamAcc>(std::move(pStream)); |
| pAcc->LoadAllDataFiltered(); |
| CPDF_SimpleParser parser(pAcc->GetSpan()); |
| ByteStringView previous_word; |
| while (true) { |
| ByteStringView word = parser.GetWord(); |
| if (word.IsEmpty()) { |
| break; |
| } |
| |
| if (word == "beginbfchar") { |
| word = HandleBeginBFChar(parser, previous_word); |
| } else if (word == "beginbfrange") { |
| word = HandleBeginBFRange(parser, previous_word); |
| } else if (word == "/Adobe-Korea1-UCS2") { |
| cid_set = CIDSET_KOREA1; |
| } else if (word == "/Adobe-Japan1-UCS2") { |
| cid_set = CIDSET_JAPAN1; |
| } else if (word == "/Adobe-CNS1-UCS2") { |
| cid_set = CIDSET_CNS1; |
| } else if (word == "/Adobe-GB1-UCS2") { |
| cid_set = CIDSET_GB1; |
| } |
| |
| previous_word = word; |
| } |
| if (cid_set != CIDSET_UNKNOWN) { |
| m_pBaseMap = CPDF_FontGlobals::GetInstance()->GetCID2UnicodeMap(cid_set); |
| } |
| } |
| |
| ByteStringView CPDF_ToUnicodeMap::HandleBeginBFChar( |
| CPDF_SimpleParser& parser, |
| ByteStringView previous_word) { |
| struct CodeWord { |
| uint32_t code; |
| ByteStringView word; |
| }; |
| std::vector<CodeWord> code_words; |
| |
| const int raw_count = StringToInt(previous_word); |
| bool is_valid = raw_count >= 0 && raw_count <= 100; |
| const size_t expected_count = is_valid ? static_cast<size_t>(raw_count) : 0; |
| code_words.reserve(expected_count); |
| |
| ByteStringView word; |
| while (true) { |
| word = parser.GetWord(); |
| if (word.IsEmpty() || word == "endbfchar") { |
| break; |
| } |
| if (!is_valid) { |
| continue; // Keep consuming words. Do nothing else. |
| } |
| |
| std::optional<uint32_t> code = StringToCode(word); |
| if (!code.has_value() || code.value() > kCidLimit) { |
| is_valid = false; |
| continue; |
| } |
| |
| word = parser.GetWord(); |
| code_words.emplace_back(CodeWord{code.value(), word}); |
| |
| if (code_words.size() > expected_count) { |
| is_valid = false; |
| } |
| } |
| |
| if (is_valid && code_words.size() == expected_count) { |
| for (const auto& entry : code_words) { |
| SetCode(entry.code, StringToWideString(entry.word)); |
| } |
| } |
| return word; |
| } |
| |
| ByteStringView CPDF_ToUnicodeMap::HandleBeginBFRange( |
| CPDF_SimpleParser& parser, |
| ByteStringView previous_word) { |
| struct CodeWordRange { |
| uint32_t low_code; |
| std::vector<ByteStringView> code_words; |
| }; |
| struct MultimapSingleDestRange { |
| uint32_t low_code; |
| uint32_t high_code; |
| uint32_t start_value; |
| }; |
| struct MultimapMultiDestRange { |
| uint32_t low_code; |
| std::vector<WideString> retcodes; |
| }; |
| using Range = absl::variant<CodeWordRange, MultimapSingleDestRange, |
| MultimapMultiDestRange>; |
| std::vector<Range> ranges; |
| |
| const int raw_count = StringToInt(previous_word); |
| bool is_valid = raw_count >= 0 && raw_count <= 100; |
| const size_t expected_count = is_valid ? static_cast<size_t>(raw_count) : 0; |
| ranges.reserve(expected_count); |
| |
| ByteStringView word; |
| while (true) { |
| word = parser.GetWord(); |
| if (word.IsEmpty() || word == "endbfrange") { |
| break; |
| } |
| if (!is_valid) { |
| continue; // Keep consuming words. Do nothing else. |
| } |
| |
| std::optional<uint32_t> lowcode_opt = StringToCode(word); |
| if (!lowcode_opt.has_value()) { |
| is_valid = false; |
| continue; |
| } |
| |
| word = parser.GetWord(); |
| std::optional<uint32_t> highcode_opt = StringToCode(word); |
| if (!highcode_opt.has_value()) { |
| is_valid = false; |
| continue; |
| } |
| |
| uint32_t lowcode = lowcode_opt.value(); |
| uint32_t highcode = (lowcode & 0xffffff00) | (highcode_opt.value() & 0xff); |
| if (lowcode > kCidLimit || highcode > kCidLimit || lowcode > highcode) { |
| is_valid = false; |
| continue; |
| } |
| |
| word = parser.GetWord(); |
| ByteStringView start = word; |
| if (start == "[") { |
| CodeWordRange range; |
| range.low_code = lowcode; |
| range.code_words.reserve(1 + highcode - lowcode); |
| for (uint32_t code = lowcode; code <= highcode; ++code) { |
| word = parser.GetWord(); |
| range.code_words.push_back(word); |
| } |
| ranges.push_back(std::move(range)); |
| |
| if (ranges.size() > expected_count) { |
| is_valid = false; |
| continue; |
| } |
| |
| word = parser.GetWord(); |
| if (word != "]") { |
| is_valid = false; |
| } |
| continue; |
| } |
| |
| WideString destcode = StringToWideString(start); |
| if (destcode.GetLength() == 1) { |
| std::optional<uint32_t> value_or_error = StringToCode(start); |
| if (!value_or_error.has_value()) { |
| is_valid = false; |
| continue; |
| } |
| |
| ranges.push_back( |
| MultimapSingleDestRange{.low_code = lowcode, |
| .high_code = highcode, |
| .start_value = value_or_error.value()}); |
| } else { |
| MultimapMultiDestRange range; |
| range.low_code = lowcode; |
| range.retcodes.reserve(1 + highcode - lowcode); |
| range.retcodes.push_back(destcode); |
| for (uint32_t code = lowcode + 1; code <= highcode; ++code) { |
| WideString retcode = StringDataAdd(range.retcodes.back()); |
| range.retcodes.push_back(std::move(retcode)); |
| } |
| ranges.push_back(std::move(range)); |
| } |
| |
| if (ranges.size() > expected_count) { |
| is_valid = false; |
| } |
| } |
| |
| if (is_valid && ranges.size() == expected_count) { |
| for (const auto& entry : ranges) { |
| if (absl::holds_alternative<CodeWordRange>(entry)) { |
| const auto& range = absl::get<CodeWordRange>(entry); |
| uint32_t code = range.low_code; |
| for (const auto& code_word : range.code_words) { |
| SetCode(code, StringToWideString(code_word)); |
| ++code; |
| } |
| } else if (absl::holds_alternative<MultimapSingleDestRange>(entry)) { |
| const auto& range = absl::get<MultimapSingleDestRange>(entry); |
| uint32_t value = range.start_value; |
| for (uint32_t code = range.low_code; code <= range.high_code; ++code) { |
| InsertIntoMultimap(code, value++); |
| } |
| } else { |
| CHECK(absl::holds_alternative<MultimapMultiDestRange>(entry)); |
| const auto& range = absl::get<MultimapMultiDestRange>(entry); |
| uint32_t code = range.low_code; |
| for (const auto& retcode : range.retcodes) { |
| InsertIntoMultimap(code, GetMultiCharIndexIndicator()); |
| m_MultiCharVec.push_back(retcode); |
| ++code; |
| } |
| } |
| } |
| } |
| return word; |
| } |
| |
| uint32_t CPDF_ToUnicodeMap::GetMultiCharIndexIndicator() const { |
| FX_SAFE_UINT32 uni = m_MultiCharVec.size(); |
| uni = uni * 0x10000 + 0xffff; |
| return uni.ValueOrDefault(0); |
| } |
| |
| void CPDF_ToUnicodeMap::SetCode(uint32_t srccode, WideString destcode) { |
| size_t len = destcode.GetLength(); |
| if (len == 0) |
| return; |
| |
| if (len == 1) { |
| InsertIntoMultimap(srccode, destcode[0]); |
| } else { |
| InsertIntoMultimap(srccode, GetMultiCharIndexIndicator()); |
| m_MultiCharVec.push_back(destcode); |
| } |
| } |
| |
| void CPDF_ToUnicodeMap::InsertIntoMultimap(uint32_t code, uint32_t destcode) { |
| auto it = m_Multimap.find(code); |
| if (it == m_Multimap.end()) { |
| m_Multimap.emplace(code, std::set<uint32_t>{destcode}); |
| return; |
| } |
| |
| it->second.emplace(destcode); |
| } |