blob: 0e686e94ebd10f6ba85571f9eefd925b64f18fda [file] [log] [blame]
// Copyright 2017 The PDFium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
#include "core/fpdfapi/font/cpdf_tounicodemap.h"
#include <set>
#include <utility>
#include "core/fpdfapi/font/cpdf_cid2unicodemap.h"
#include "core/fpdfapi/font/cpdf_fontglobals.h"
#include "core/fpdfapi/parser/cpdf_simple_parser.h"
#include "core/fpdfapi/parser/cpdf_stream.h"
#include "core/fpdfapi/parser/fpdf_parser_utility.h"
#include "core/fxcrt/containers/contains.h"
#include "core/fxcrt/fx_extension.h"
#include "core/fxcrt/fx_safe_types.h"
#include "third_party/abseil-cpp/absl/types/variant.h"
namespace {
constexpr uint32_t kCidLimit = 0xffff;
WideString StringDataAdd(WideString str) {
WideString ret;
wchar_t value = 1;
for (size_t i = str.GetLength(); i > 0; --i) {
wchar_t ch = str[i - 1] + value;
if (ch < str[i - 1]) {
ret.InsertAtFront(0);
} else {
ret.InsertAtFront(ch);
value = 0;
}
}
if (value)
ret.InsertAtFront(value);
return ret;
}
} // namespace
CPDF_ToUnicodeMap::CPDF_ToUnicodeMap(RetainPtr<const CPDF_Stream> pStream) {
Load(std::move(pStream));
}
CPDF_ToUnicodeMap::~CPDF_ToUnicodeMap() = default;
WideString CPDF_ToUnicodeMap::Lookup(uint32_t charcode) const {
auto it = m_Multimap.find(charcode);
if (it == m_Multimap.end()) {
if (!m_pBaseMap)
return WideString();
return WideString(
m_pBaseMap->UnicodeFromCID(static_cast<uint16_t>(charcode)));
}
uint32_t value = *it->second.begin();
wchar_t unicode = static_cast<wchar_t>(value & 0xffff);
if (unicode != 0xffff)
return WideString(unicode);
size_t index = value >> 16;
return index < m_MultiCharVec.size() ? m_MultiCharVec[index] : WideString();
}
uint32_t CPDF_ToUnicodeMap::ReverseLookup(wchar_t unicode) const {
for (const auto& pair : m_Multimap) {
if (pdfium::Contains(pair.second, static_cast<uint32_t>(unicode)))
return pair.first;
}
return 0;
}
size_t CPDF_ToUnicodeMap::GetUnicodeCountByCharcodeForTesting(
uint32_t charcode) const {
auto it = m_Multimap.find(charcode);
return it != m_Multimap.end() ? it->second.size() : 0u;
}
// static
std::optional<uint32_t> CPDF_ToUnicodeMap::StringToCode(ByteStringView input) {
// Ignore whitespaces within `input`. See https://crbug.com/pdfium/2065.
std::set<char> seen_whitespace_chars;
for (char c : input) {
if (PDFCharIsWhitespace(c)) {
seen_whitespace_chars.insert(c);
}
}
ByteString str_without_whitespace_chars; // Must outlive `str`.
ByteStringView str;
if (seen_whitespace_chars.empty()) {
str = input;
} else {
str_without_whitespace_chars.Reserve(input.GetLength());
for (char c : input) {
if (!pdfium::Contains(seen_whitespace_chars, c)) {
str_without_whitespace_chars += c;
}
}
str = str_without_whitespace_chars.AsStringView();
}
size_t len = str.GetLength();
if (len <= 2 || str[0] != '<' || str[len - 1] != '>')
return std::nullopt;
FX_SAFE_UINT32 code = 0;
for (char c : str.Substr(1, len - 2)) {
if (!FXSYS_IsHexDigit(c))
return std::nullopt;
code = code * 16 + FXSYS_HexCharToInt(c);
if (!code.IsValid())
return std::nullopt;
}
return std::optional<uint32_t>(code.ValueOrDie());
}
// static
WideString CPDF_ToUnicodeMap::StringToWideString(ByteStringView str) {
size_t len = str.GetLength();
if (len <= 2 || str[0] != '<' || str[len - 1] != '>')
return WideString();
WideString result;
int byte_pos = 0;
wchar_t ch = 0;
for (char c : str.Substr(1, len - 2)) {
if (!FXSYS_IsHexDigit(c))
break;
ch = ch * 16 + FXSYS_HexCharToInt(c);
byte_pos++;
if (byte_pos == 4) {
result += ch;
byte_pos = 0;
ch = 0;
}
}
return result;
}
void CPDF_ToUnicodeMap::Load(RetainPtr<const CPDF_Stream> pStream) {
CIDSet cid_set = CIDSET_UNKNOWN;
auto pAcc = pdfium::MakeRetain<CPDF_StreamAcc>(std::move(pStream));
pAcc->LoadAllDataFiltered();
CPDF_SimpleParser parser(pAcc->GetSpan());
ByteStringView previous_word;
while (true) {
ByteStringView word = parser.GetWord();
if (word.IsEmpty()) {
break;
}
if (word == "beginbfchar") {
word = HandleBeginBFChar(parser, previous_word);
} else if (word == "beginbfrange") {
word = HandleBeginBFRange(parser, previous_word);
} else if (word == "/Adobe-Korea1-UCS2") {
cid_set = CIDSET_KOREA1;
} else if (word == "/Adobe-Japan1-UCS2") {
cid_set = CIDSET_JAPAN1;
} else if (word == "/Adobe-CNS1-UCS2") {
cid_set = CIDSET_CNS1;
} else if (word == "/Adobe-GB1-UCS2") {
cid_set = CIDSET_GB1;
}
previous_word = word;
}
if (cid_set != CIDSET_UNKNOWN) {
m_pBaseMap = CPDF_FontGlobals::GetInstance()->GetCID2UnicodeMap(cid_set);
}
}
ByteStringView CPDF_ToUnicodeMap::HandleBeginBFChar(
CPDF_SimpleParser& parser,
ByteStringView previous_word) {
struct CodeWord {
uint32_t code;
ByteStringView word;
};
std::vector<CodeWord> code_words;
const int raw_count = StringToInt(previous_word);
bool is_valid = raw_count >= 0 && raw_count <= 100;
const size_t expected_count = is_valid ? static_cast<size_t>(raw_count) : 0;
code_words.reserve(expected_count);
ByteStringView word;
while (true) {
word = parser.GetWord();
if (word.IsEmpty() || word == "endbfchar") {
break;
}
if (!is_valid) {
continue; // Keep consuming words. Do nothing else.
}
std::optional<uint32_t> code = StringToCode(word);
if (!code.has_value() || code.value() > kCidLimit) {
is_valid = false;
continue;
}
word = parser.GetWord();
code_words.emplace_back(CodeWord{code.value(), word});
if (code_words.size() > expected_count) {
is_valid = false;
}
}
if (is_valid && code_words.size() == expected_count) {
for (const auto& entry : code_words) {
SetCode(entry.code, StringToWideString(entry.word));
}
}
return word;
}
ByteStringView CPDF_ToUnicodeMap::HandleBeginBFRange(
CPDF_SimpleParser& parser,
ByteStringView previous_word) {
struct CodeWordRange {
uint32_t low_code;
std::vector<ByteStringView> code_words;
};
struct MultimapSingleDestRange {
uint32_t low_code;
uint32_t high_code;
uint32_t start_value;
};
struct MultimapMultiDestRange {
uint32_t low_code;
std::vector<WideString> retcodes;
};
using Range = absl::variant<CodeWordRange, MultimapSingleDestRange,
MultimapMultiDestRange>;
std::vector<Range> ranges;
const int raw_count = StringToInt(previous_word);
bool is_valid = raw_count >= 0 && raw_count <= 100;
const size_t expected_count = is_valid ? static_cast<size_t>(raw_count) : 0;
ranges.reserve(expected_count);
ByteStringView word;
while (true) {
word = parser.GetWord();
if (word.IsEmpty() || word == "endbfrange") {
break;
}
if (!is_valid) {
continue; // Keep consuming words. Do nothing else.
}
std::optional<uint32_t> lowcode_opt = StringToCode(word);
if (!lowcode_opt.has_value()) {
is_valid = false;
continue;
}
word = parser.GetWord();
std::optional<uint32_t> highcode_opt = StringToCode(word);
if (!highcode_opt.has_value()) {
is_valid = false;
continue;
}
uint32_t lowcode = lowcode_opt.value();
uint32_t highcode = (lowcode & 0xffffff00) | (highcode_opt.value() & 0xff);
if (lowcode > kCidLimit || highcode > kCidLimit || lowcode > highcode) {
is_valid = false;
continue;
}
word = parser.GetWord();
ByteStringView start = word;
if (start == "[") {
CodeWordRange range;
range.low_code = lowcode;
range.code_words.reserve(1 + highcode - lowcode);
for (uint32_t code = lowcode; code <= highcode; ++code) {
word = parser.GetWord();
range.code_words.push_back(word);
}
ranges.push_back(std::move(range));
if (ranges.size() > expected_count) {
is_valid = false;
continue;
}
word = parser.GetWord();
if (word != "]") {
is_valid = false;
}
continue;
}
WideString destcode = StringToWideString(start);
if (destcode.GetLength() == 1) {
std::optional<uint32_t> value_or_error = StringToCode(start);
if (!value_or_error.has_value()) {
is_valid = false;
continue;
}
ranges.push_back(
MultimapSingleDestRange{.low_code = lowcode,
.high_code = highcode,
.start_value = value_or_error.value()});
} else {
MultimapMultiDestRange range;
range.low_code = lowcode;
range.retcodes.reserve(1 + highcode - lowcode);
range.retcodes.push_back(destcode);
for (uint32_t code = lowcode + 1; code <= highcode; ++code) {
WideString retcode = StringDataAdd(range.retcodes.back());
range.retcodes.push_back(std::move(retcode));
}
ranges.push_back(std::move(range));
}
if (ranges.size() > expected_count) {
is_valid = false;
}
}
if (is_valid && ranges.size() == expected_count) {
for (const auto& entry : ranges) {
if (absl::holds_alternative<CodeWordRange>(entry)) {
const auto& range = absl::get<CodeWordRange>(entry);
uint32_t code = range.low_code;
for (const auto& code_word : range.code_words) {
SetCode(code, StringToWideString(code_word));
++code;
}
} else if (absl::holds_alternative<MultimapSingleDestRange>(entry)) {
const auto& range = absl::get<MultimapSingleDestRange>(entry);
uint32_t value = range.start_value;
for (uint32_t code = range.low_code; code <= range.high_code; ++code) {
InsertIntoMultimap(code, value++);
}
} else {
CHECK(absl::holds_alternative<MultimapMultiDestRange>(entry));
const auto& range = absl::get<MultimapMultiDestRange>(entry);
uint32_t code = range.low_code;
for (const auto& retcode : range.retcodes) {
InsertIntoMultimap(code, GetMultiCharIndexIndicator());
m_MultiCharVec.push_back(retcode);
++code;
}
}
}
}
return word;
}
uint32_t CPDF_ToUnicodeMap::GetMultiCharIndexIndicator() const {
FX_SAFE_UINT32 uni = m_MultiCharVec.size();
uni = uni * 0x10000 + 0xffff;
return uni.ValueOrDefault(0);
}
void CPDF_ToUnicodeMap::SetCode(uint32_t srccode, WideString destcode) {
size_t len = destcode.GetLength();
if (len == 0)
return;
if (len == 1) {
InsertIntoMultimap(srccode, destcode[0]);
} else {
InsertIntoMultimap(srccode, GetMultiCharIndexIndicator());
m_MultiCharVec.push_back(destcode);
}
}
void CPDF_ToUnicodeMap::InsertIntoMultimap(uint32_t code, uint32_t destcode) {
auto it = m_Multimap.find(code);
if (it == m_Multimap.end()) {
m_Multimap.emplace(code, std::set<uint32_t>{destcode});
return;
}
it->second.emplace(destcode);
}