blob: 0619ee302e916e3f424ff64faa8d683ede1f1acd [file] [log] [blame]
// Copyright 2026 The PDFium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "core/fpdfapi/edit/cpdf_font_util.h"
#include <stdint.h>
#include <algorithm>
#include <map>
#include <sstream>
#include <utility>
#include <vector>
#include "core/fpdfapi/parser/cpdf_array.h"
#include "core/fpdfapi/parser/cpdf_document.h"
#include "core/fpdfapi/parser/cpdf_number.h"
#include "core/fpdfapi/parser/cpdf_stream.h"
#include "core/fxcrt/check_op.h"
#include "core/fxcrt/fx_extension.h"
#include "core/fxcrt/fx_string_wrappers.h"
#include "core/fxcrt/numerics/safe_conversions.h"
#include "core/fxcrt/retain_ptr.h"
#include "core/fxcrt/span.h"
#include "core/fxcrt/utf16.h"
namespace {
// Upper bound on the number of entries written into a single
// beginbfchar/endbfchar or beginbfrange/endbfrange section; larger maps are
// split across several sections.
constexpr uint32_t kMaxBfCharBfRangeEntries = 100;

// Fixed prologue of the generated ToUnicode CMap: CMap bookkeeping plus one
// codespace range covering <0000>..<FFFF>. Kept as a raw string so the text
// reads exactly as it is emitted (each line ends in '\n').
constexpr char kToUnicodeStart[] = R"cmap(/CIDInit /ProcSet findresource begin
12 dict begin
begincmap
/CIDSystemInfo
<</Registry (Adobe)
/Ordering (Identity)
/Supplement 0
>> def
/CMapName /Adobe-Identity-H def
/CMapType 2 def
1 begincodespacerange
<0000> <FFFF>
endcodespacerange
)cmap";

// Fixed epilogue of the generated ToUnicode CMap, written after all
// bfchar/bfrange sections.
constexpr char kToUnicodeEnd[] = R"cmap(endcmap
CMapName currentdict /CMap defineresource pop
end
end
)cmap";
// Appends `number` to `buffer` as a CMap source code: the four characters
// produced by FXSYS_IntToFourHexChars() wrapped in angle brackets.
// `number` must fit in 16 bits; anything larger trips the CHECK.
void AddCharcode(fxcrt::ostringstream& buffer, uint32_t number) {
  CHECK_LE(number, 0xFFFF);

  char hex_digits[4];
  FXSYS_IntToFourHexChars(number, hex_digits);

  buffer << "<";
  for (const char digit : hex_digits) {
    buffer << digit;
  }
  buffer << ">";
}
// Appends `unicode` to `buffer` as a CMap destination string, i.e. the
// characters returned by FXSYS_ToUTF16BE() wrapped in angle brackets.
// PDF spec 1.7 Section 5.9.2 requires "Unicode character sequences as
// expressed in UTF-16BE encoding"; see
// https://en.wikipedia.org/wiki/UTF-16#Description
//
// A value that is itself a high or low surrogate is not a valid code point on
// its own, so it is written as 0 instead.
void AddUnicode(fxcrt::ostringstream& buffer, uint32_t unicode) {
  const bool is_surrogate =
      pdfium::IsHighSurrogate(unicode) || pdfium::IsLowSurrogate(unicode);
  const uint32_t code_point = is_surrogate ? 0 : unicode;

  char hex_storage[8];
  const pdfium::span<const char> hex_digits =
      FXSYS_ToUTF16BE(code_point, hex_storage);
  CHECK(!hex_digits.empty());

  buffer << "<";
  for (const char digit : hex_digits) {
    buffer << digit;
  }
  buffer << ">";
}
} // namespace
// Converts a CID -> width map into an array made of two kinds of groups:
//   c_first c_last w   every CID in [c_first, c_last] has width w
//   c [w1 w2 ...]      c has width w1, c + 1 has width w2, and so on
// Runs of adjacent CIDs that share one width use the first form. Everything
// else uses the second form, which may hold a single width, e.g. c [w].
RetainPtr<CPDF_Array> CreateWidthsArray(
    const std::map<uint32_t, uint32_t>& widths) {
  using WidthIter = std::map<uint32_t, uint32_t>::const_iterator;

  const WidthIter end = widths.end();
  // True when `candidate` exists and its CID directly follows `prev`'s CID.
  const auto is_adjacent = [end](WidthIter prev, WidthIter candidate) {
    return candidate != end && candidate->first == prev->first + 1;
  };

  auto result = pdfium::MakeRetain<CPDF_Array>();
  WidthIter cursor = widths.begin();
  while (cursor != end) {
    WidthIter lookahead = std::next(cursor);

    // Both group forms begin with the first CID of the group.
    result->AppendNew<CPDF_Number>(static_cast<int>(cursor->first));

    if (is_adjacent(cursor, lookahead) &&
        lookahead->second == cursor->second) {
      // Range form: swallow the whole run of adjacent CIDs with equal width.
      do {
        cursor = lookahead;
        lookahead = std::next(cursor);
      } while (is_adjacent(cursor, lookahead) &&
               lookahead->second == cursor->second);
      result->AppendNew<CPDF_Number>(static_cast<int>(cursor->first));
      result->AppendNew<CPDF_Number>(static_cast<int>(cursor->second));
    } else {
      // List form: collect the widths of all directly adjacent CIDs.
      auto width_list = pdfium::MakeRetain<CPDF_Array>();
      width_list->AppendNew<CPDF_Number>(static_cast<int>(cursor->second));
      while (is_adjacent(cursor, lookahead)) {
        cursor = lookahead;
        lookahead = std::next(cursor);
        width_list->AppendNew<CPDF_Number>(static_cast<int>(cursor->second));
      }
      result->Append(std::move(width_list));
    }

    // `lookahead` is always the entry right after the group just written.
    cursor = lookahead;
  }
  return result;
}
// Builds a ToUnicode CMap stream from `to_unicode`, a charcode -> unicode
// multimap.
//
// The mappings are sorted into three buckets and then written out between
// kToUnicodeStart and kToUnicodeEnd:
//   * bfchar entries:            <code> <unicode>
//   * bfrange with an array:     <first> <last> [<u0> <u1> ...]
//   * bfrange with a start value <first> <last> <u0>
// Every section holds at most kMaxBfCharBfRangeEntries entries. Charcodes must
// fit in 16 bits; AddCharcode() CHECKs this.
//
// A range never spans charcodes that differ in anything but their low byte.
// For the start-value form, PDF spec 1.7 Section 5.9.2 additionally only
// allows the last byte of the destination to change, so such a range is also
// ended before the low byte of the unicode value would be incremented past
// 0xFF. Mappings that would need that carry are written in the array form,
// which spells out every destination explicitly.
RetainPtr<CPDF_Stream> LoadUnicode(
    const std::multimap<uint32_t, uint32_t>& to_unicode) {
  // A map charcode->unicode
  std::map<uint32_t, uint32_t> char_to_unicode_map;
  // A map <char_start, char_end> to vector v of unicode characters of size (end
  // - start + 1). This abbreviates: start->v[0], start+1->v[1], etc.
  std::map<std::pair<uint32_t, uint32_t>, std::vector<uint32_t>>
      char_range_to_unicodes_map;
  // A map <start, end> -> unicode
  // This abbreviates: start->unicode, start+1->unicode+1, etc.
  // Only the last byte of the unicode may change within such a range.
  std::map<std::pair<uint32_t, uint32_t>, uint32_t>
      char_range_to_consecutive_unicodes_map;

  // Calculate the maps
  for (auto it = to_unicode.begin(); it != to_unicode.end(); ++it) {
    const uint32_t first_charcode = it->first;
    const uint32_t first_unicode = it->second;

    auto next_it = std::next(it);
    if (next_it == to_unicode.end() || first_charcode + 1 != next_it->first) {
      // No directly following charcode: standalone bfchar entry.
      char_to_unicode_map[first_charcode] = first_unicode;
      continue;
    }

    it = next_it;
    uint32_t current_charcode = it->first;
    uint32_t current_unicode = it->second;
    if (current_charcode % 256 == 0) {
      // The pair straddles a change in the high byte of the charcode, so it
      // cannot form a range; emit both as standalone entries.
      char_to_unicode_map[first_charcode] = first_unicode;
      char_to_unicode_map[current_charcode] = current_unicode;
      continue;
    }

    // How many more charcodes fit before the low byte of the charcode wraps.
    const size_t max_extra = 255 - (current_charcode % 256);
    next_it = std::next(it);

    // The start-value form is only usable if going from `first_unicode` to
    // `current_unicode` is a +1 step that stays within the last byte.
    const bool starts_consecutive_run =
        first_unicode + 1 == current_unicode && first_unicode % 256 != 255;
    if (!starts_consecutive_run) {
      // Consecutive charcodes mapping to unicodes that cannot be expressed as
      // a start value: list every destination explicitly.
      std::vector<uint32_t> unicodes = {first_unicode, current_unicode};
      for (size_t i = 0; i < max_extra; ++i) {
        if (next_it == to_unicode.end() ||
            current_charcode + 1 != next_it->first) {
          break;
        }
        ++it;
        ++current_charcode;
        unicodes.push_back(it->second);
        next_it = std::next(it);
      }
      CHECK_EQ(it->first - first_charcode + 1, unicodes.size());
      char_range_to_unicodes_map[std::make_pair(first_charcode, it->first)] =
          std::move(unicodes);
      continue;
    }

    // Consecutive charcodes mapping to consecutive unicodes
    for (size_t i = 0; i < max_extra; ++i) {
      if (next_it == to_unicode.end() ||
          current_charcode + 1 != next_it->first ||
          current_unicode + 1 != next_it->second ||
          // Extending further would carry out of the destination's last byte.
          current_unicode % 256 == 255) {
        break;
      }
      ++it;
      ++current_charcode;
      ++current_unicode;
      next_it = std::next(it);
    }
    char_range_to_consecutive_unicodes_map[std::make_pair(
        first_charcode, current_charcode)] = first_unicode;
  }

  fxcrt::ostringstream buffer;
  buffer << kToUnicodeStart;

  // Writes all of `entries` to `buffer` in sections of at most
  // kMaxBfCharBfRangeEntries entries. Each section is "<count><begin_marker>",
  // then one `write_entry` call per entry, then `end_marker`. Nothing is
  // written for an empty map.
  const auto write_sections = [&buffer](const auto& entries,
                                        const char* begin_marker,
                                        const char* end_marker,
                                        const auto& write_entry) {
    uint32_t to_process = pdfium::checked_cast<uint32_t>(entries.size());
    auto it = entries.begin();
    while (to_process) {
      const uint32_t to_process_this_iteration =
          std::min(to_process, kMaxBfCharBfRangeEntries);
      buffer << to_process_this_iteration << begin_marker;
      for (uint32_t i = 0; i < to_process_this_iteration; ++i) {
        CHECK(it != entries.end());
        write_entry(*it);
        ++it;
      }
      buffer << end_marker;
      to_process -= to_process_this_iteration;
    }
  };

  // Add `char_to_unicode_map` to `buffer`.
  write_sections(char_to_unicode_map, " beginbfchar\n", "endbfchar\n",
                 [&buffer](const auto& entry) {
                   AddCharcode(buffer, entry.first);
                   buffer << " ";
                   AddUnicode(buffer, entry.second);
                   buffer << "\n";
                 });

  // Add `char_range_to_unicodes_map` to `buffer`.
  write_sections(char_range_to_unicodes_map, " beginbfrange\n", "endbfrange\n",
                 [&buffer](const auto& entry) {
                   const std::pair<uint32_t, uint32_t>& charcode_range =
                       entry.first;
                   AddCharcode(buffer, charcode_range.first);
                   buffer << " ";
                   AddCharcode(buffer, charcode_range.second);
                   buffer << " [";
                   // Destinations are separated by single spaces.
                   const std::vector<uint32_t>& unicodes = entry.second;
                   for (size_t i = 0; i < unicodes.size(); ++i) {
                     if (i != 0) {
                       buffer << " ";
                     }
                     AddUnicode(buffer, unicodes[i]);
                   }
                   buffer << "]\n";
                 });

  // Add `char_range_to_consecutive_unicodes_map` to `buffer`.
  write_sections(char_range_to_consecutive_unicodes_map, " beginbfrange\n",
                 "endbfrange\n", [&buffer](const auto& entry) {
                   const std::pair<uint32_t, uint32_t>& charcode_range =
                       entry.first;
                   AddCharcode(buffer, charcode_range.first);
                   buffer << " ";
                   AddCharcode(buffer, charcode_range.second);
                   buffer << " ";
                   AddUnicode(buffer, entry.second);
                   buffer << "\n";
                 });

  buffer << kToUnicodeEnd;
  return pdfium::MakeRetain<CPDF_Stream>(&buffer);
}