// Copyright 2014 PDFium Authors. All rights reserved. | |
// Use of this source code is governed by a BSD-style license that can be | |
// found in the LICENSE file. | |
// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com | |
#include "../../include/fpdfapi/fpdf_pageobj.h" | |
#include "../../include/fpdftext/fpdf_text.h" | |
#include "../../include/fpdfapi/fpdf_page.h" | |
class CPDF_TextStream : public CFX_Object | |
{ | |
public: | |
CPDF_TextStream(CFX_WideTextBuf& buffer, FX_BOOL bUseLF, CFX_PtrArray* pObjArray); | |
~CPDF_TextStream() {} | |
FX_BOOL ProcessObject(const CPDF_TextObject* pObj, FX_BOOL bFirstLine); | |
CFX_WideTextBuf& m_Buffer; | |
FX_BOOL m_bUseLF; | |
CFX_PtrArray* m_pObjArray; | |
const CPDF_TextObject* m_pLastObj; | |
}; | |
CPDF_TextStream::CPDF_TextStream(CFX_WideTextBuf& buffer, FX_BOOL bUseLF, CFX_PtrArray* pObjArray) : m_Buffer(buffer) | |
{ | |
m_pLastObj = NULL; | |
m_bUseLF = bUseLF; | |
m_pObjArray = pObjArray; | |
} | |
FX_BOOL FPDFText_IsSameTextObject(const CPDF_TextObject* pTextObj1, const CPDF_TextObject* pTextObj2) | |
{ | |
if (!pTextObj1 || !pTextObj2) { | |
return FALSE; | |
} | |
CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom, pTextObj2->m_Right, pTextObj2->m_Top); | |
CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom, pTextObj1->m_Right, pTextObj1->m_Top); | |
if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty()) { | |
return TRUE; | |
} | |
if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) { | |
rcPreObj.Intersect(rcCurObj); | |
if (rcPreObj.IsEmpty()) { | |
return FALSE; | |
} | |
if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) > rcCurObj.Width() / 2) { | |
return FALSE; | |
} | |
if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) { | |
return FALSE; | |
} | |
} | |
int nPreCount = pTextObj2->CountItems(); | |
int nCurCount = pTextObj1->CountItems(); | |
if (nPreCount != nCurCount) { | |
return FALSE; | |
} | |
for (int i = 0; i < nPreCount; i++) { | |
CPDF_TextObjectItem itemPer, itemCur; | |
pTextObj2->GetItemInfo(i, &itemPer); | |
pTextObj1->GetItemInfo(i, &itemCur); | |
if (itemCur.m_CharCode != itemPer.m_CharCode) { | |
return FALSE; | |
} | |
} | |
return TRUE; | |
} | |
int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) | |
{ | |
if(charCode == -1) { | |
return 0; | |
} | |
int w = pFont->GetCharWidthF(charCode); | |
if(w == 0) { | |
CFX_ByteString str; | |
pFont->AppendChar(str, charCode); | |
w = pFont->GetStringWidth(str, 1); | |
if(w == 0) { | |
FX_RECT BBox; | |
pFont->GetCharBBox(charCode, BBox); | |
w = BBox.right - BBox.left; | |
} | |
} | |
return w; | |
} | |
int FPDFText_ProcessInterObj(const CPDF_TextObject* pPrevObj, const CPDF_TextObject* pObj) | |
{ | |
if(FPDFText_IsSameTextObject(pPrevObj, pObj)) { | |
return -1; | |
} | |
CPDF_TextObjectItem item; | |
int nItem = pPrevObj->CountItems(); | |
pPrevObj->GetItemInfo(nItem - 1, &item); | |
FX_WCHAR preChar = 0, curChar = 0; | |
CFX_WideString wstr = pPrevObj->GetFont()->UnicodeFromCharCode(item.m_CharCode); | |
if(wstr.GetLength()) { | |
preChar = wstr.GetAt(0); | |
} | |
FX_FLOAT last_pos = item.m_OriginX; | |
int nLastWidth = GetCharWidth(item.m_CharCode, pPrevObj->GetFont()); | |
FX_FLOAT last_width = nLastWidth * pPrevObj->GetFontSize() / 1000; | |
last_width = FXSYS_fabs(last_width); | |
pObj->GetItemInfo(0, &item); | |
wstr = pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode); | |
if(wstr.GetLength()) { | |
curChar = wstr.GetAt(0); | |
} | |
int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont()); | |
FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000; | |
this_width = FXSYS_fabs(this_width); | |
FX_FLOAT threshold = last_width > this_width ? last_width / 4 : this_width / 4; | |
CFX_AffineMatrix prev_matrix, prev_reverse; | |
pPrevObj->GetTextMatrix(&prev_matrix); | |
prev_reverse.SetReverse(prev_matrix); | |
FX_FLOAT x = pObj->GetPosX(), y = pObj->GetPosY(); | |
prev_reverse.Transform(x, y); | |
if (FXSYS_fabs(y) > threshold * 2) { | |
return 2; | |
} | |
threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth); | |
threshold = threshold > 400 ? (threshold < 700 ? threshold / 4 : threshold / 5) : (threshold / 2); | |
threshold *= nLastWidth > nThisWidth ? FXSYS_fabs(pPrevObj->GetFontSize()) : FXSYS_fabs(pObj->GetFontSize()); | |
threshold /= 1000; | |
if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' && preChar != L' ') | |
if(curChar != L' ' && preChar != L' ') { | |
if((x - last_pos - last_width) > threshold || (last_pos - x - last_width) > threshold) { | |
return 1; | |
} | |
if(x < 0 && (last_pos - x - last_width) > threshold) { | |
return 1; | |
} | |
if((x - last_pos - last_width) > this_width || (x - last_pos - this_width) > last_width ) { | |
return 1; | |
} | |
} | |
if(last_pos + last_width > x + this_width && curChar == L' ') { | |
return 3; | |
} | |
return 0; | |
} | |
FX_BOOL CPDF_TextStream::ProcessObject(const CPDF_TextObject* pObj, FX_BOOL bFirstLine) | |
{ | |
if(pObj->m_Bottom > 380 && pObj->m_Left < 45 && pObj->m_Top < 402) { | |
int i = 0; | |
} | |
CPDF_Font* pFont = pObj->GetFont(); | |
CFX_AffineMatrix matrix; | |
pObj->GetTextMatrix(&matrix); | |
FX_FLOAT fs = pObj->GetFontSize(); | |
int item_index = 0; | |
if (m_pLastObj) { | |
int result = FPDFText_ProcessInterObj(m_pLastObj, pObj); | |
if (result == 2) { | |
int len = m_Buffer.GetLength(); | |
if (len && m_bUseLF && m_Buffer.GetBuffer()[len - 1] == L'-') { | |
m_Buffer.Delete(len - 1, 1); | |
if (m_pObjArray) { | |
m_pObjArray->RemoveAt((len - 1) * 2, 2); | |
} | |
} else { | |
if (bFirstLine) { | |
return TRUE; | |
} | |
if (m_bUseLF) { | |
m_Buffer.AppendChar(L'\r'); | |
m_Buffer.AppendChar(L'\n'); | |
if (m_pObjArray) { | |
for (int i = 0; i < 4; i ++) { | |
m_pObjArray->Add(NULL); | |
} | |
} | |
} else { | |
m_Buffer.AppendChar(' '); | |
if (m_pObjArray) { | |
m_pObjArray->Add(NULL); | |
m_pObjArray->Add(NULL); | |
} | |
} | |
} | |
} else if (result == 1) { | |
m_Buffer.AppendChar(L' '); | |
if (m_pObjArray) { | |
m_pObjArray->Add(NULL); | |
m_pObjArray->Add(NULL); | |
} | |
} else if (result == -1) { | |
m_pLastObj = pObj; | |
return FALSE; | |
} else if (result == 3) { | |
item_index = 1; | |
} | |
} | |
m_pLastObj = pObj; | |
int nItems = pObj->CountItems(); | |
FX_FLOAT Ignorekerning = 0; | |
for(int i = 1; i < nItems - 1; i += 2) { | |
CPDF_TextObjectItem item; | |
pObj->GetItemInfo(i, &item); | |
if (item.m_CharCode == (FX_DWORD) - 1) { | |
if(i == 1) { | |
Ignorekerning = item.m_OriginX; | |
} else if(Ignorekerning > item.m_OriginX) { | |
Ignorekerning = item.m_OriginX; | |
} | |
} else { | |
Ignorekerning = 0; | |
break; | |
} | |
} | |
FX_FLOAT spacing = 0; | |
for (; item_index < nItems; item_index ++) { | |
CPDF_TextObjectItem item; | |
pObj->GetItemInfo(item_index, &item); | |
if (item.m_CharCode == (FX_DWORD) - 1) { | |
CFX_WideString wstr = m_Buffer.GetWideString(); | |
if (wstr.IsEmpty() || wstr.GetAt(wstr.GetLength() - 1) == L' ') { | |
continue; | |
} | |
FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH(); | |
spacing = -fontsize_h * (item.m_OriginX - Ignorekerning) / 1000; | |
continue; | |
} | |
FX_FLOAT charSpace = pObj->m_TextState.GetObject()->m_CharSpace; | |
if(nItems > 3 && !spacing) { | |
charSpace = 0; | |
} | |
if((spacing || charSpace) && item_index > 0) { | |
int last_width = 0; | |
FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH(); | |
FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' '); | |
FX_FLOAT threshold = 0; | |
if (space_charcode != -1) { | |
threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000 ; | |
} | |
if(threshold > fontsize_h / 3) { | |
threshold = 0; | |
} else { | |
threshold /= 2; | |
} | |
if (threshold == 0) { | |
threshold = fontsize_h; | |
int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont)); | |
threshold = this_width > last_width ? (FX_FLOAT)this_width : (FX_FLOAT)last_width; | |
int nDivide = 6; | |
if (threshold < 300) { | |
nDivide = 2; | |
} else if (threshold < 500) { | |
nDivide = 4; | |
} else if (threshold < 700) { | |
nDivide = 5; | |
} | |
threshold = threshold / nDivide; | |
threshold = fontsize_h * threshold / 1000; | |
} | |
if(charSpace > 0.001) { | |
spacing += matrix.TransformDistance(charSpace); | |
} else if(charSpace < -0.001) { | |
spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace)); | |
} | |
if (threshold && (spacing && spacing >= threshold) ) { | |
m_Buffer.AppendChar(L' '); | |
if (m_pObjArray) { | |
m_pObjArray->Add(NULL); | |
m_pObjArray->Add(NULL); | |
} | |
} | |
if (item.m_CharCode == (FX_DWORD) - 1) { | |
continue; | |
} | |
spacing = 0; | |
} | |
CFX_WideString unicode_str = pFont->UnicodeFromCharCode(item.m_CharCode); | |
if (unicode_str.IsEmpty()) { | |
m_Buffer.AppendChar((FX_WCHAR)item.m_CharCode); | |
if (m_pObjArray) { | |
m_pObjArray->Add((void*)pObj); | |
m_pObjArray->Add((void*)(FX_INTPTR)item_index); | |
} | |
} else { | |
m_Buffer << unicode_str; | |
if (m_pObjArray) { | |
for (int i = 0; i < unicode_str.GetLength(); i ++) { | |
m_pObjArray->Add((void*)pObj); | |
m_pObjArray->Add((void*)(FX_INTPTR)item_index); | |
} | |
} | |
} | |
} | |
return FALSE; | |
} | |
void _PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_PageObjects* pPage, FX_BOOL bUseLF, | |
CFX_PtrArray* pObjArray) | |
{ | |
CPDF_TextStream textstream(buffer, bUseLF, pObjArray); | |
FX_POSITION pos = pPage->GetFirstObjectPosition(); | |
while (pos) { | |
CPDF_PageObject* pObject = pPage->GetNextObject(pos); | |
if (pObject == NULL) { | |
continue; | |
} | |
if (pObject->m_Type != PDFPAGE_TEXT) { | |
continue; | |
} | |
textstream.ProcessObject((CPDF_TextObject*)pObject, FALSE); | |
} | |
} | |
CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, CPDF_Dictionary* pPage) | |
{ | |
CFX_WideTextBuf buffer; | |
buffer.EstimateSize(0, 1024); | |
CPDF_Page page; | |
page.Load(pDoc, pPage); | |
CPDF_ParseOptions options; | |
options.m_bTextOnly = TRUE; | |
options.m_bSeparateForm = FALSE; | |
page.ParseContent(&options); | |
CPDF_TextStream textstream(buffer, FALSE, NULL); | |
FX_POSITION pos = page.GetFirstObjectPosition(); | |
while (pos) { | |
CPDF_PageObject* pObject = page.GetNextObject(pos); | |
if (pObject->m_Type != PDFPAGE_TEXT) { | |
continue; | |
} | |
if (textstream.ProcessObject((CPDF_TextObject*)pObject, TRUE)) { | |
break; | |
} | |
} | |
return buffer.GetWideString(); | |
} |