// blob: 5bbfbd91983e220542800f77ca091cc5b6daa11f [file] [log] [blame]
// Copyright 2014 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Original code copyright 2014 Foxit Software Inc.
#include "core/include/fpdfapi/fpdf_page.h"
#include "core/include/fpdfapi/fpdf_pageobj.h"
#include "core/src/fpdftext/text_int.h"
// Accumulates the Unicode text of successive page text objects into a
// caller-owned wide-character buffer, remembering the previously processed
// object so that spacing and line breaks between objects can be inferred.
class CPDF_TextStream {
 public:
  // |buffer| receives the extracted text. Only a reference is stored, so the
  // buffer must outlive this object. |bUseLF| is consulted by ProcessObject()
  // when a line break between two objects is detected.
  CPDF_TextStream(CFX_WideTextBuf& buffer, FX_BOOL bUseLF)
      : m_Buffer(buffer), m_bUseLF(bUseLF), m_pLastObj(nullptr) {}
  ~CPDF_TextStream() {}

  // Appends the text of |pObj| to the buffer. Returns TRUE only when
  // |bFirstLine| is set and a line break relative to the previous object is
  // detected; returns FALSE in every other case.
  FX_BOOL ProcessObject(const CPDF_TextObject* pObj, FX_BOOL bFirstLine);

  // Members are declared in the same order as the constructor initialises
  // them, so initialisation order matches the initializer list.
  CFX_WideTextBuf& m_Buffer;          // Destination buffer (not owned).
  FX_BOOL m_bUseLF;                   // Line-break handling flag.
  const CPDF_TextObject* m_pLastObj;  // Last object processed (not owned).
};
// Reports whether two text objects look like the same piece of text, judged by
// bounding box, font size, item count and per-item character codes.
// |pTextObj1| is treated as the current object and |pTextObj2| as the previous
// one (see the rcCurObj / rcPreObj naming below). Returns FALSE when either
// pointer is null.
// NOTE(review): this listing contains no closing braces for any of the blocks
// below; they appear to have been lost in extraction. Confirm the nesting
// against the upstream file before relying on it.
FX_BOOL FPDFText_IsSameTextObject(const CPDF_TextObject* pTextObj1,
const CPDF_TextObject* pTextObj2) {
if (!pTextObj1 || !pTextObj2) {
return FALSE;
// Bounding rectangles built from each object's m_Left/m_Bottom/m_Right/m_Top.
CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom,
pTextObj2->m_Right, pTextObj2->m_Top);
CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom,
pTextObj1->m_Right, pTextObj1->m_Top);
// Two empty rectangles are accepted as a match without inspecting the items.
if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty()) {
return TRUE;
// At least one rectangle is non-empty: apply the geometric checks.
if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) {
// NOTE(review): rcPreObj is re-tested for emptiness here with no visible
// statement modifying it since construction; presumably a line that updates
// rcPreObj (e.g. combining it with rcCurObj) is missing. Verify upstream.
if (rcPreObj.IsEmpty()) {
return FALSE;
// Widths that differ by more than half of the current width do not match.
if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) >
rcCurObj.Width() / 2) {
return FALSE;
// Font sizes are compared for exact equality.
if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) {
return FALSE;
// Both objects must hold the same number of items...
int nPreCount = pTextObj2->CountItems();
int nCurCount = pTextObj1->CountItems();
if (nPreCount != nCurCount) {
return FALSE;
// ...and every item pair must carry the same character code.
for (int i = 0; i < nPreCount; i++) {
CPDF_TextObjectItem itemPer, itemCur;
pTextObj2->GetItemInfo(i, &itemPer);
pTextObj1->GetItemInfo(i, &itemCur);
if (itemCur.m_CharCode != itemPer.m_CharCode) {
return FALSE;
return TRUE;
// Returns the width of |charCode| in |pFont|, or 0 when no width can be
// determined. Callers in this file scale the result by font size / 1000.
//
// The width is looked up in three steps, each used only if the previous one
// yielded 0:
//   1. pFont->GetCharWidthF(charCode)
//   2. the width of the one-character string produced by AppendChar()
//   3. the horizontal extent of the character's bounding box
int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) {
  // (FX_DWORD)-1 is the sentinel this file tests for on items that are handled
  // as kerning adjustments rather than characters; such items have no width.
  // The explicit cast avoids a signed/unsigned comparison and matches the
  // other sentinel checks in this file.
  if (charCode == static_cast<FX_DWORD>(-1)) {
    return 0;
  }
  int w = pFont->GetCharWidthF(charCode);
  if (w == 0) {
    CFX_ByteString str;
    pFont->AppendChar(str, charCode);
    w = pFont->GetStringWidth(str, 1);
    if (w == 0) {
      // Last resort: derive the width from the glyph's bounding box.
      FX_RECT BBox;
      pFont->GetCharBBox(charCode, BBox);
      w = BBox.right - BBox.left;
    }
  }
  return w;
}
// Classifies the relationship between the previous text object |pPrevObj| and
// the current one |pObj|. Return values, as consumed by
// CPDF_TextStream::ProcessObject():
//   -1  the two objects are considered the same (duplicate) object
//    0  no separator is needed
//    1  a horizontal gap was detected (caller appends a space)
//    2  the vertical offset exceeds the threshold (caller treats it as a
//       line break)
//    3  the current object starts with a space that overlaps the previous
//       object (caller skips the first item)
// NOTE(review): this listing contains no closing braces and several statements
// look truncated or missing (flagged inline); confirm against upstream.
int FPDFText_ProcessInterObj(const CPDF_TextObject* pPrevObj,
const CPDF_TextObject* pObj) {
if (FPDFText_IsSameTextObject(pPrevObj, pObj)) {
return -1;
// Fetch the last item of the previous object.
// NOTE(review): if CountItems() returns 0 this asks for item index -1; there
// is no visible guard. Confirm callers never pass an empty object.
CPDF_TextObjectItem item;
int nItem = pPrevObj->CountItems();
pPrevObj->GetItemInfo(nItem - 1, &item);
FX_WCHAR preChar = 0, curChar = 0;
// NOTE(review): the initializer of |wstr| is cut off after '='; presumably it
// is the Unicode mapping of the previous object's last character code,
// mirroring the assignment made for |pObj| further down. Verify upstream.
CFX_WideString wstr =
if (wstr.GetLength()) {
preChar = wstr.GetAt(0);
// Origin and absolute scaled width (width * font size / 1000) of the last
// character of the previous object.
FX_FLOAT last_pos = item.m_OriginX;
int nLastWidth = GetCharWidth(item.m_CharCode, pPrevObj->GetFont());
FX_FLOAT last_width = nLastWidth * pPrevObj->GetFontSize() / 1000;
last_width = FXSYS_fabs(last_width);
// Same measurements for the first item of the current object.
pObj->GetItemInfo(0, &item);
wstr = pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
if (wstr.GetLength()) {
curChar = wstr.GetAt(0);
int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont());
FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000;
this_width = FXSYS_fabs(this_width);
// Vertical threshold: a quarter of the wider of the two scaled widths.
FX_FLOAT threshold =
last_width > this_width ? last_width / 4 : this_width / 4;
// NOTE(review): |prev_matrix| and |prev_reverse| are default-constructed and
// no visible statement fills them in before Transform() is called. The names
// suggest the inverse of the previous object's text matrix is intended;
// the setup lines appear to be missing. Verify upstream.
CFX_Matrix prev_matrix, prev_reverse;
FX_FLOAT x = pObj->GetPosX(), y = pObj->GetPosY();
prev_reverse.Transform(x, y);
// A transformed y offset larger than twice the threshold is reported as 2.
if (FXSYS_fabs(y) > threshold * 2) {
return 2;
// Horizontal threshold: start from the larger unscaled width, take 1/2 of it
// when <= 400, 1/4 when in (400, 700), 1/5 otherwise; then scale by the
// absolute font size of whichever object supplied the larger width, / 1000.
threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth);
threshold = threshold > 400
? (threshold < 700 ? threshold / 4 : threshold / 5)
: (threshold / 2);
threshold *= nLastWidth > nThisWidth ? FXSYS_fabs(pPrevObj->GetFontSize())
: FXSYS_fabs(pObj->GetFontSize());
threshold /= 1000;
// Gap checks are only made when neither adjoining character is a space.
// The outer 'if' has no braces, so it governs only the statement that
// follows it; the inner 'if' repeats the same space tests.
if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' &&
preChar != L' ')
if (curChar != L' ' && preChar != L' ') {
// Gap (in either direction) beyond the end of the previous character.
if ((x - last_pos - last_width) > threshold ||
(last_pos - x - last_width) > threshold) {
return 1;
if (x < 0 && (last_pos - x - last_width) > threshold) {
return 1;
// Gap wider than one of the two character widths.
if ((x - last_pos - last_width) > this_width ||
(x - last_pos - this_width) > last_width) {
return 1;
// The current object begins with a space whose extent ends before the end of
// the previous character.
if (last_pos + last_width > x + this_width && curChar == L' ') {
return 3;
return 0;
// Appends the Unicode text of |pObj| to m_Buffer, first emitting whatever
// separator FPDFText_ProcessInterObj() indicates relative to the previously
// processed object, and then records |pObj| as the last object.
// Returns TRUE only when |bFirstLine| is set and a line break is detected
// (result == 2 without a trailing-hyphen join); otherwise returns FALSE.
// NOTE(review): this listing contains no closing braces and some statements
// look missing (flagged inline); confirm nesting and content against upstream.
FX_BOOL CPDF_TextStream::ProcessObject(const CPDF_TextObject* pObj,
FX_BOOL bFirstLine) {
CPDF_Font* pFont = pObj->GetFont();
// NOTE(review): |matrix| is default-constructed and no visible statement sets
// it before TransformDistance() is used below; presumably it should hold the
// object's text matrix. Verify upstream.
CFX_Matrix matrix;
// Index of the first item to emit; bumped to 1 when result == 3.
int item_index = 0;
if (m_pLastObj) {
int result = FPDFText_ProcessInterObj(m_pLastObj, pObj);
if (result == 2) {
// Line break. When m_bUseLF is set and the buffer ends with '-', the
// hyphen is removed so the text continues directly after the break.
int len = m_Buffer.GetLength();
if (len && m_bUseLF && m_Buffer.GetBuffer()[len - 1] == L'-') {
m_Buffer.Delete(len - 1, 1);
} else {
// In first-line mode the first line break ends extraction.
if (bFirstLine) {
return TRUE;
// NOTE(review): the m_bUseLF branch is empty as shown; presumably a
// line-ending sequence is appended here. Verify upstream.
if (m_bUseLF) {
} else {
m_Buffer.AppendChar(' ');
} else if (result == 1) {
// Horizontal gap: separate with a single space.
m_Buffer.AppendChar(L' ');
} else if (result == -1) {
// Duplicate of the previous object: remember it but emit nothing.
m_pLastObj = pObj;
return FALSE;
} else if (result == 3) {
// Skip the first item of this object.
item_index = 1;
m_pLastObj = pObj;
int nItems = pObj->CountItems();
// Pre-scan the odd-indexed items (1, 3, ... < nItems - 1). Items whose
// m_CharCode is (FX_DWORD)-1 contribute their m_OriginX: the first one seeds
// Ignorekerning and later ones lower it when smaller. The trailing 'else'
// resets Ignorekerning to 0.
// NOTE(review): without braces it is unclear which 'if' that 'else' pairs
// with, and whether the loop exits there; verify upstream.
FX_FLOAT Ignorekerning = 0;
for (int i = 1; i < nItems - 1; i += 2) {
CPDF_TextObjectItem item;
pObj->GetItemInfo(i, &item);
if (item.m_CharCode == (FX_DWORD)-1) {
if (i == 1) {
Ignorekerning = item.m_OriginX;
} else if (Ignorekerning > item.m_OriginX) {
Ignorekerning = item.m_OriginX;
} else {
Ignorekerning = 0;
// Main pass over the items, starting at item_index.
FX_FLOAT spacing = 0;
for (; item_index < nItems; item_index++) {
CPDF_TextObjectItem item;
pObj->GetItemInfo(item_index, &item);
if (item.m_CharCode == (FX_DWORD)-1) {
// Kerning-style item: convert its offset (relative to Ignorekerning) into a
// spacing value scaled by the horizontal font size / 1000.
// NOTE(review): the test for an empty buffer or trailing space has no
// visible body; presumably such items are skipped. Verify upstream.
CFX_WideString wstr = m_Buffer.GetWideString();
if (wstr.IsEmpty() || wstr.GetAt(wstr.GetLength() - 1) == L' ') {
FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
spacing = -fontsize_h * (item.m_OriginX - Ignorekerning) / 1000;
// Character spacing from the text state; ignored when the object has more
// than three items and no explicit spacing is pending.
FX_FLOAT charSpace = pObj->m_TextState.GetObject()->m_CharSpace;
if (nItems > 3 && !spacing) {
charSpace = 0;
// Decide whether the accumulated spacing amounts to a word gap. This is
// never done before the first item.
if ((spacing || charSpace) && item_index > 0) {
// NOTE(review): last_width stays 0 in the visible code, so the comparison
// further down always selects this_width unless that is also 0.
int last_width = 0;
FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
// Preliminary threshold derived from the font's space glyph: its scaled
// width, zeroed when wider than a third of the font size, otherwise halved;
// falls back to the font size when it ends up 0.
// NOTE(review): as shown, this value is overwritten a few lines below.
FX_FLOAT threshold = 0;
if (space_charcode != -1) {
threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
if (threshold > fontsize_h / 3) {
threshold = 0;
} else {
threshold /= 2;
if (threshold == 0) {
threshold = fontsize_h;
// Threshold from the current character's width: divide by 2 when < 300,
// by 4 when < 500, by 5 when < 700, by 6 otherwise; then scale by
// fontsize_h / 1000.
int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
threshold = this_width > last_width ? (FX_FLOAT)this_width
: (FX_FLOAT)last_width;
int nDivide = 6;
if (threshold < 300) {
nDivide = 2;
} else if (threshold < 500) {
nDivide = 4;
} else if (threshold < 700) {
nDivide = 5;
threshold = threshold / nDivide;
threshold = fontsize_h * threshold / 1000;
// Fold character spacing into the pending spacing, keeping its sign;
// magnitudes of at most 0.001 are ignored.
if (charSpace > 0.001) {
spacing += matrix.TransformDistance(charSpace);
} else if (charSpace < -0.001) {
spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
// A non-zero spacing of at least the threshold is emitted as one space.
if (threshold && (spacing && spacing >= threshold)) {
m_Buffer.AppendChar(L' ');
// NOTE(review): a second sentinel check follows; presumably it skips the
// character output below for kerning-style items. Verify upstream.
if (item.m_CharCode == (FX_DWORD)-1) {
spacing = 0;
// Emit the Unicode text for this character code, if the font provides one.
// NOTE(review): the empty-string branch has no visible body.
CFX_WideString unicode_str = pFont->UnicodeFromCharCode(item.m_CharCode);
if (unicode_str.IsEmpty()) {
} else {
m_Buffer << unicode_str;
return FALSE;
// Extracts the text of every TEXT-type object in |pPage| into |buffer|, in the
// order the object list yields them, using a CPDF_TextStream.
// NOTE(review): the parameter list is cut off after |pPage|. |bUseLF| is used
// below but not declared in the visible signature; presumably it is a trailing
// FX_BOOL parameter. Closing braces are also absent. Verify upstream.
void GetTextStream_Unicode(CFX_WideTextBuf& buffer,
CPDF_PageObjectList* pPage,
CPDF_TextStream textstream(buffer, bUseLF);
FX_POSITION pos = pPage->GetFirstObjectPosition();
while (pos) {
CPDF_PageObject* pObject = pPage->GetNextObject(pos);
// Null entries and non-text objects are skipped. bFirstLine is FALSE, so
// ProcessObject() never asks for an early stop and its result is ignored.
if (pObject && pObject->m_Type == CPDF_PageObject::TEXT)
textstream.ProcessObject((CPDF_TextObject*)pObject, FALSE);
// Returns the text gathered from the page described by |pPage| in document
// |pDoc|. Objects are fed to a CPDF_TextStream in first-line mode
// (bFirstLine == TRUE), in which ProcessObject() returns TRUE at the first
// detected line break.
// NOTE(review): closing braces are absent from this listing and some
// statements look missing (flagged inline); confirm against upstream.
CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc,
CPDF_Dictionary* pPage) {
CFX_WideTextBuf buffer;
buffer.EstimateSize(0, 1024);
CPDF_Page page;
page.Load(pDoc, pPage);
// Parse options requesting text only, with m_bSeparateForm disabled.
// NOTE(review): |options| is configured but never visibly passed to anything;
// presumably a content-parsing call on |page| that takes these options is
// missing here. Verify upstream.
CPDF_ParseOptions options;
options.m_bTextOnly = TRUE;
options.m_bSeparateForm = FALSE;
// bUseLF is FALSE for this stream.
CPDF_TextStream textstream(buffer, FALSE);
FX_POSITION pos = page.GetFirstObjectPosition();
while (pos) {
CPDF_PageObject* pObject = page.GetNextObject(pos);
// NOTE(review): unlike GetTextStream_Unicode(), |pObject| is dereferenced
// without a null check. The bodies of the two 'if' statements below are not
// visible; presumably non-text objects are skipped and the loop stops once
// ProcessObject() returns TRUE. Verify upstream.
if (pObject->m_Type != CPDF_PageObject::TEXT) {
if (textstream.ProcessObject((CPDF_TextObject*)pObject, TRUE)) {
return buffer.GetWideString();