| // Copyright 2014 PDFium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
| |
| #include "core/include/fpdfapi/fpdf_page.h" |
| #include "core/include/fpdfapi/fpdf_pageobj.h" |
| #include "core/include/fpdfapi/fpdf_resource.h" |
| #include "core/include/fpdftext/fpdf_text.h" |
| #include "core/include/fxcrt/fx_bidi.h" |
| #include "core/include/fxcrt/fx_ucd.h" |
| #include "text_int.h" |
| #include "third_party/base/nonstd_unique_ptr.h" |
| #include "txtproc.h" |
| |
| #include <cctype> |
| #include <cwctype> |
| |
| CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode, |
| int destcp, |
| const FX_CHAR* defchar) { |
| if (destcp == 0) { |
| if (unicode < 0x80) { |
| return CFX_ByteString((char)unicode); |
| } |
| const FX_CHAR* altstr = FCS_GetAltStr(unicode); |
| return CFX_ByteString(altstr ? altstr : defchar); |
| } |
| char buf[10]; |
| int iDef = 0; |
| int ret = FXSYS_WideCharToMultiByte(destcp, 0, (wchar_t*)&unicode, 1, buf, 10, |
| NULL, &iDef); |
| if (ret && !iDef) { |
| return CFX_ByteString(buf, ret); |
| } |
| const FX_CHAR* altstr = FCS_GetAltStr(unicode); |
| return CFX_ByteString(altstr ? altstr : defchar); |
| } |
| CTextPage::CTextPage() {} |
| CTextPage::~CTextPage() { |
| int i; |
| for (i = 0; i < m_BaseLines.GetSize(); i++) { |
| delete m_BaseLines.GetAt(i); |
| } |
| for (i = 0; i < m_TextColumns.GetSize(); i++) { |
| delete m_TextColumns.GetAt(i); |
| } |
| } |
| void CTextPage::ProcessObject(CPDF_PageObject* pObject) { |
| if (pObject->m_Type != PDFPAGE_TEXT) { |
| return; |
| } |
| CPDF_TextObject* pText = (CPDF_TextObject*)pObject; |
| CPDF_Font* pFont = pText->m_TextState.GetFont(); |
| int count = pText->CountItems(); |
| FX_FLOAT* pPosArray = FX_Alloc2D(FX_FLOAT, count, 2); |
| pText->CalcCharPos(pPosArray); |
| |
| FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH(); |
| FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV(); |
| FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' '); |
| FX_FLOAT spacew = 0; |
| if (space_charcode != -1) { |
| spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000; |
| } |
| if (spacew == 0) { |
| spacew = fontsize_h / 4; |
| } |
| if (pText->m_TextState.GetBaselineAngle() != 0) { |
| int cc = 0; |
| CFX_Matrix matrix; |
| pText->GetTextMatrix(&matrix); |
| for (int i = 0; i < pText->m_nChars; i++) { |
| FX_DWORD charcode = pText->m_nChars == 1 |
| ? (FX_DWORD)(uintptr_t)pText->m_pCharCodes |
| : pText->m_pCharCodes[i]; |
| if (charcode == (FX_DWORD)-1) { |
| continue; |
| } |
| FX_RECT char_box; |
| pFont->GetCharBBox(charcode, char_box); |
| FX_FLOAT char_left = |
| pPosArray ? pPosArray[cc * 2] |
| : char_box.left * pText->m_TextState.GetFontSize() / 1000; |
| FX_FLOAT char_right = |
| pPosArray ? pPosArray[cc * 2 + 1] |
| : char_box.right * pText->m_TextState.GetFontSize() / 1000; |
| FX_FLOAT char_top = |
| char_box.top * pText->m_TextState.GetFontSize() / 1000; |
| FX_FLOAT char_bottom = |
| char_box.bottom * pText->m_TextState.GetFontSize() / 1000; |
| cc++; |
| FX_FLOAT char_origx, char_origy; |
| matrix.Transform(char_left, 0, char_origx, char_origy); |
| matrix.TransformRect(char_left, char_right, char_top, char_bottom); |
| CFX_ByteString str; |
| pFont->AppendChar(str, charcode); |
| InsertTextBox(NULL, char_origy, char_left, char_right, char_top, |
| char_bottom, spacew, fontsize_v, str, pFont); |
| } |
| FX_Free(pPosArray); |
| return; |
| } |
| FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize(); |
| for (int ii = 0; ii < count * 2; ii++) { |
| pPosArray[ii] *= ratio_h; |
| } |
| FX_FLOAT baseline = pText->m_PosY; |
| CTextBaseLine* pBaseLine = NULL; |
| FX_FLOAT topy = pText->m_Top; |
| FX_FLOAT bottomy = pText->m_Bottom; |
| FX_FLOAT leftx = pText->m_Left; |
| int cc = 0; |
| CFX_ByteString segment; |
| int space_count = 0; |
| FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0; |
| for (int i = 0; i < pText->m_nChars; i++) { |
| FX_DWORD charcode = pText->m_nChars == 1 |
| ? (FX_DWORD)(uintptr_t)pText->m_pCharCodes |
| : pText->m_pCharCodes[i]; |
| if (charcode == (FX_DWORD)-1) { |
| continue; |
| } |
| FX_FLOAT char_left = pPosArray[cc * 2]; |
| FX_FLOAT char_right = pPosArray[cc * 2 + 1]; |
| cc++; |
| if (char_left < last_left || (char_left - last_right) > spacew / 2) { |
| pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, |
| leftx + segment_right, topy, bottomy, spacew, |
| fontsize_v, segment, pFont); |
| segment_left = char_left; |
| segment = ""; |
| } |
| if (space_count > 1) { |
| pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, |
| leftx + segment_right, topy, bottomy, spacew, |
| fontsize_v, segment, pFont); |
| segment = ""; |
| } else if (space_count == 1) { |
| pFont->AppendChar(segment, ' '); |
| } |
| if (segment.GetLength() == 0) { |
| segment_left = char_left; |
| } |
| segment_right = char_right; |
| pFont->AppendChar(segment, charcode); |
| space_count = 0; |
| last_left = char_left; |
| last_right = char_right; |
| } |
| if (segment.GetLength()) |
| pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, |
| leftx + segment_right, topy, bottomy, spacew, |
| fontsize_v, segment, pFont); |
| FX_Free(pPosArray); |
| } |
| CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine, |
| FX_FLOAT basey, |
| FX_FLOAT leftx, |
| FX_FLOAT rightx, |
| FX_FLOAT topy, |
| FX_FLOAT bottomy, |
| FX_FLOAT spacew, |
| FX_FLOAT fontsize_v, |
| CFX_ByteString& str, |
| CPDF_Font* pFont) { |
| if (str.GetLength() == 0) { |
| return NULL; |
| } |
| if (!pBaseLine) { |
| int i; |
| for (i = 0; i < m_BaseLines.GetSize(); i++) { |
| CTextBaseLine* pExistLine = m_BaseLines.GetAt(i); |
| if (pExistLine->m_BaseLine == basey) { |
| pBaseLine = pExistLine; |
| break; |
| } |
| if (pExistLine->m_BaseLine < basey) { |
| break; |
| } |
| } |
| if (!pBaseLine) { |
| pBaseLine = new CTextBaseLine; |
| pBaseLine->m_BaseLine = basey; |
| m_BaseLines.InsertAt(i, pBaseLine); |
| } |
| } |
| CFX_WideString text; |
| const FX_CHAR* pStr = str; |
| int len = str.GetLength(), offset = 0; |
| while (offset < len) { |
| FX_DWORD ch = pFont->GetNextChar(pStr, len, offset); |
| CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch); |
| if (unicode_str.IsEmpty()) { |
| text += (FX_WCHAR)ch; |
| } else { |
| text += unicode_str; |
| } |
| } |
| pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v, |
| text); |
| return pBaseLine; |
| } |
| void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth) { |
| FX_FLOAT lastheight = -1; |
| FX_FLOAT lastbaseline = -1; |
| FX_FLOAT MinLeftX = 1000000; |
| FX_FLOAT MaxRightX = 0; |
| int i; |
| for (i = 0; i < m_BaseLines.GetSize(); i++) { |
| CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); |
| FX_FLOAT leftx, rightx; |
| if (pBaseLine->GetWidth(leftx, rightx)) { |
| if (leftx < MinLeftX) { |
| MinLeftX = leftx; |
| } |
| if (rightx > MaxRightX) { |
| MaxRightX = rightx; |
| } |
| } |
| } |
| for (i = 0; i < m_BaseLines.GetSize(); i++) { |
| m_BaseLines.GetAt(i)->MergeBoxes(); |
| } |
| for (i = 1; i < m_BaseLines.GetSize(); i++) { |
| CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); |
| CTextBaseLine* pPrevLine = m_BaseLines.GetAt(i - 1); |
| if (pBaseLine->CanMerge(pPrevLine)) { |
| pPrevLine->Merge(pBaseLine); |
| delete pBaseLine; |
| m_BaseLines.RemoveAt(i); |
| i--; |
| } |
| } |
| if (m_bAutoWidth) { |
| int* widths = FX_Alloc(int, m_BaseLines.GetSize()); |
| for (i = 0; i < m_BaseLines.GetSize(); i++) { |
| widths[i] = 0; |
| CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); |
| int TotalChars = 0; |
| FX_FLOAT TotalWidth = 0; |
| int minchars; |
| pBaseLine->CountChars(TotalChars, TotalWidth, minchars); |
| if (TotalChars) { |
| FX_FLOAT charwidth = TotalWidth / TotalChars; |
| widths[i] = (int)((MaxRightX - MinLeftX) / charwidth); |
| } |
| if (widths[i] > 1000) { |
| widths[i] = 1000; |
| } |
| if (widths[i] < minchars) { |
| widths[i] = minchars; |
| } |
| } |
| int AvgWidth = 0, widthcount = 0; |
| for (i = 0; i < m_BaseLines.GetSize(); i++) |
| if (widths[i]) { |
| AvgWidth += widths[i]; |
| widthcount++; |
| } |
| AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5); |
| int MaxWidth = 0; |
| for (i = 0; i < m_BaseLines.GetSize(); i++) |
| if (MaxWidth < widths[i]) { |
| MaxWidth = widths[i]; |
| } |
| if (MaxWidth > AvgWidth * 6 / 5) { |
| MaxWidth = AvgWidth * 6 / 5; |
| } |
| FX_Free(widths); |
| if (iMinWidth < MaxWidth) { |
| iMinWidth = MaxWidth; |
| } |
| } |
| for (i = 0; i < m_BaseLines.GetSize(); i++) { |
| m_BaseLines.GetAt(i)->MergeBoxes(); |
| } |
| if (m_bKeepColumn) { |
| FindColumns(); |
| } |
| for (i = 0; i < m_BaseLines.GetSize(); i++) { |
| CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); |
| if (lastheight >= 0) { |
| FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine; |
| if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) { |
| lines.Add(L""); |
| } |
| } |
| lastheight = pBaseLine->m_MaxFontSizeV; |
| lastbaseline = pBaseLine->m_BaseLine; |
| CFX_WideString str; |
| pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth); |
| lines.Add(str); |
| } |
| } |
| void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest) { |
| wChar = FX_GetMirrorChar(wChar, TRUE, FALSE); |
| FX_WCHAR* pDst = NULL; |
| FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); |
| if (nCount < 1) { |
| sDest += wChar; |
| return; |
| } |
| pDst = new FX_WCHAR[nCount]; |
| FX_Unicode_GetNormalization(wChar, pDst); |
| for (int nIndex = 0; nIndex < nCount; nIndex++) { |
| sDest += pDst[nIndex]; |
| } |
| delete[] pDst; |
| } |
| void NormalizeString(CFX_WideString& str) { |
| if (str.GetLength() <= 0) { |
| return; |
| } |
| CFX_WideString sBuffer; |
| nonstd::unique_ptr<CFX_BidiChar> pBidiChar(new CFX_BidiChar); |
| CFX_WordArray order; |
| FX_BOOL bR2L = FALSE; |
| int32_t start = 0, count = 0, i = 0; |
| int nR2L = 0, nL2R = 0; |
| for (i = 0; i < str.GetLength(); i++) { |
| if (pBidiChar->AppendChar(str.GetAt(i))) { |
| CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count); |
| order.Add(start); |
| order.Add(count); |
| order.Add(ret); |
| if (!bR2L) { |
| if (ret == CFX_BidiChar::RIGHT) { |
| nR2L++; |
| } else if (ret == CFX_BidiChar::LEFT) { |
| nL2R++; |
| } |
| } |
| } |
| } |
| if (pBidiChar->EndChar()) { |
| CFX_BidiChar::Direction ret = pBidiChar->GetBidiInfo(&start, &count); |
| order.Add(start); |
| order.Add(count); |
| order.Add(ret); |
| if (!bR2L) { |
| if (ret == CFX_BidiChar::RIGHT) { |
| nR2L++; |
| } else if (ret == CFX_BidiChar::LEFT) { |
| nL2R++; |
| } |
| } |
| } |
| if (nR2L > 0 && nR2L >= nL2R) { |
| bR2L = TRUE; |
| } |
| if (bR2L) { |
| int count = order.GetSize(); |
| for (int j = count - 1; j > 0; j -= 3) { |
| int ret = order.GetAt(j); |
| int start = order.GetAt(j - 2); |
| int count1 = order.GetAt(j - 1); |
| if (ret == 2 || ret == 0) { |
| for (int i = start + count1 - 1; i >= start; i--) { |
| NormalizeCompositeChar(str[i], sBuffer); |
| } |
| } else { |
| i = j; |
| FX_BOOL bSymbol = FALSE; |
| while (i > 0 && order.GetAt(i) != 2) { |
| bSymbol = !order.GetAt(i); |
| i -= 3; |
| } |
| int end = start + count1; |
| int n = 0; |
| if (bSymbol) { |
| n = i + 6; |
| } else { |
| n = i + 3; |
| } |
| if (n >= j) { |
| for (int m = start; m < end; m++) { |
| sBuffer += str[m]; |
| } |
| } else { |
| i = j; |
| j = n; |
| for (; n <= i; n += 3) { |
| int start = order.GetAt(n - 2); |
| int count1 = order.GetAt(n - 1); |
| int end = start + count1; |
| for (int m = start; m < end; m++) { |
| sBuffer += str[m]; |
| } |
| } |
| } |
| } |
| } |
| } else { |
| int count = order.GetSize(); |
| FX_BOOL bL2R = FALSE; |
| for (int j = 0; j < count; j += 3) { |
| int ret = order.GetAt(j + 2); |
| int start = order.GetAt(j); |
| int count1 = order.GetAt(j + 1); |
| if (ret == 2 || (j == 0 && ret == 0 && !bL2R)) { |
| int i = j + 3; |
| while (bR2L && i < count) { |
| if (order.GetAt(i + 2) == 1) { |
| break; |
| } else { |
| i += 3; |
| } |
| } |
| if (i == 3) { |
| j = -3; |
| bL2R = TRUE; |
| continue; |
| } |
| int end = str.GetLength() - 1; |
| if (i < count) { |
| end = order.GetAt(i) - 1; |
| } |
| j = i - 3; |
| for (int n = end; n >= start; n--) { |
| NormalizeCompositeChar(str[i], sBuffer); |
| } |
| } else { |
| int end = start + count1; |
| for (int i = start; i < end; i++) { |
| sBuffer += str[i]; |
| } |
| } |
| } |
| } |
| str.Empty(); |
| str += sBuffer; |
| } |
| static FX_BOOL IsNumber(CFX_WideString& str) { |
| for (int i = 0; i < str.GetLength(); i++) { |
| FX_WCHAR ch = str[i]; |
| // TODO(dsinclair): --.+ +.-- should probably not be a number. |
| if (!std::iswdigit(ch) && ch != '-' && ch != '+' && ch != '.' && ch != ' ') |
| return FALSE; |
| } |
| return TRUE; |
| } |
| void CTextPage::FindColumns() { |
| int i; |
| for (i = 0; i < m_BaseLines.GetSize(); i++) { |
| CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); |
| for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j++) { |
| CTextBox* pTextBox = pBaseLine->m_TextList.GetAt(j); |
| CTextColumn* pColumn = FindColumn(pTextBox->m_Right); |
| if (pColumn) { |
| pColumn->m_AvgPos = |
| (pColumn->m_Count * pColumn->m_AvgPos + pTextBox->m_Right) / |
| (pColumn->m_Count + 1); |
| pColumn->m_Count++; |
| } else { |
| pColumn = new CTextColumn; |
| pColumn->m_Count = 1; |
| pColumn->m_AvgPos = pTextBox->m_Right; |
| pColumn->m_TextPos = -1; |
| m_TextColumns.Add(pColumn); |
| } |
| } |
| } |
| int mincount = m_BaseLines.GetSize() / 4; |
| for (i = 0; i < m_TextColumns.GetSize(); i++) { |
| CTextColumn* pTextColumn = m_TextColumns.GetAt(i); |
| if (pTextColumn->m_Count >= mincount) { |
| continue; |
| } |
| delete pTextColumn; |
| m_TextColumns.RemoveAt(i); |
| i--; |
| } |
| for (i = 0; i < m_BaseLines.GetSize(); i++) { |
| CTextBaseLine* pBaseLine = m_BaseLines.GetAt(i); |
| for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j++) { |
| CTextBox* pTextBox = pBaseLine->m_TextList.GetAt(j); |
| if (IsNumber(pTextBox->m_Text)) { |
| pTextBox->m_pColumn = FindColumn(pTextBox->m_Right); |
| } |
| } |
| } |
| } |
| CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos) { |
| for (int i = 0; i < m_TextColumns.GetSize(); i++) { |
| CTextColumn* pColumn = m_TextColumns.GetAt(i); |
| if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) { |
| return pColumn; |
| } |
| } |
| return NULL; |
| } |
| void CTextPage::BreakSpace(CPDF_TextObject* pTextObj) {} |
| CTextBaseLine::CTextBaseLine() { |
| m_Top = -100000; |
| m_Bottom = 100000; |
| m_MaxFontSizeV = 0; |
| } |
| CTextBaseLine::~CTextBaseLine() { |
| for (int i = 0; i < m_TextList.GetSize(); i++) { |
| delete m_TextList.GetAt(i); |
| } |
| } |
| void CTextBaseLine::InsertTextBox(FX_FLOAT leftx, |
| FX_FLOAT rightx, |
| FX_FLOAT topy, |
| FX_FLOAT bottomy, |
| FX_FLOAT spacew, |
| FX_FLOAT fontsize_v, |
| const CFX_WideString& text) { |
| if (m_Top < topy) { |
| m_Top = topy; |
| } |
| if (m_Bottom > bottomy) { |
| m_Bottom = bottomy; |
| } |
| if (m_MaxFontSizeV < fontsize_v) { |
| m_MaxFontSizeV = fontsize_v; |
| } |
| int i; |
| for (i = 0; i < m_TextList.GetSize(); i++) { |
| CTextBox* pText = m_TextList.GetAt(i); |
| if (pText->m_Left > leftx) { |
| break; |
| } |
| } |
| CTextBox* pText = new CTextBox; |
| pText->m_Text = text; |
| pText->m_Left = leftx; |
| pText->m_Right = rightx; |
| pText->m_Top = topy; |
| pText->m_Bottom = bottomy; |
| pText->m_SpaceWidth = spacew; |
| pText->m_FontSizeV = fontsize_v; |
| pText->m_pColumn = NULL; |
| m_TextList.InsertAt(i, pText); |
| } |
| FX_BOOL GetIntersection(FX_FLOAT low1, |
| FX_FLOAT high1, |
| FX_FLOAT low2, |
| FX_FLOAT high2, |
| FX_FLOAT& interlow, |
| FX_FLOAT& interhigh); |
| FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther) { |
| FX_FLOAT inter_top, inter_bottom; |
| if (!GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top, |
| inter_bottom, inter_top)) { |
| return FALSE; |
| } |
| FX_FLOAT inter_h = inter_top - inter_bottom; |
| if (inter_h < (m_Top - m_Bottom) / 2 && |
| inter_h < (pOther->m_Top - pOther->m_Bottom) / 2) { |
| return FALSE; |
| } |
| FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine); |
| for (int i = 0; i < m_TextList.GetSize(); i++) { |
| CTextBox* pText = m_TextList.GetAt(i); |
| for (int j = 0; j < pOther->m_TextList.GetSize(); j++) { |
| CTextBox* pOtherText = pOther->m_TextList.GetAt(j); |
| FX_FLOAT inter_left, inter_right; |
| if (!GetIntersection(pText->m_Left, pText->m_Right, pOtherText->m_Left, |
| pOtherText->m_Right, inter_left, inter_right)) { |
| continue; |
| } |
| FX_FLOAT inter_w = inter_right - inter_left; |
| if (inter_w < pText->m_SpaceWidth / 2 && |
| inter_w < pOtherText->m_SpaceWidth / 2) { |
| continue; |
| } |
| if (dy >= (pText->m_Bottom - pText->m_Top) / 2 || |
| dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) { |
| return FALSE; |
| } |
| } |
| } |
| return TRUE; |
| } |
| void CTextBaseLine::Merge(CTextBaseLine* pOther) { |
| for (int i = 0; i < pOther->m_TextList.GetSize(); i++) { |
| CTextBox* pText = pOther->m_TextList.GetAt(i); |
| InsertTextBox(pText->m_Left, pText->m_Right, pText->m_Top, pText->m_Bottom, |
| pText->m_SpaceWidth, pText->m_FontSizeV, pText->m_Text); |
| } |
| } |
| FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx) { |
| int i; |
| for (i = 0; i < m_TextList.GetSize(); i++) { |
| CTextBox* pText = m_TextList.GetAt(i); |
| if (pText->m_Text != L" ") { |
| break; |
| } |
| } |
| if (i == m_TextList.GetSize()) { |
| return FALSE; |
| } |
| CTextBox* pText = m_TextList.GetAt(i); |
| leftx = pText->m_Left; |
| for (i = m_TextList.GetSize() - 1; i >= 0; i--) { |
| CTextBox* pText = m_TextList.GetAt(i); |
| if (pText->m_Text != L" ") { |
| break; |
| } |
| } |
| pText = m_TextList.GetAt(i); |
| rightx = pText->m_Right; |
| return TRUE; |
| } |
| void CTextBaseLine::MergeBoxes() { |
| int i = 0; |
| while (1) { |
| if (i >= m_TextList.GetSize() - 1) { |
| break; |
| } |
| CTextBox* pThisText = m_TextList.GetAt(i); |
| CTextBox* pNextText = m_TextList.GetAt(i + 1); |
| FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right; |
| FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0) |
| ? pNextText->m_SpaceWidth |
| : pThisText->m_SpaceWidth; |
| if (spacew > 0.0 && dx < spacew * 2) { |
| pThisText->m_Right = pNextText->m_Right; |
| if (dx > spacew * 1.5) { |
| pThisText->m_Text += L" "; |
| } else if (dx > spacew / 3) { |
| pThisText->m_Text += L' '; |
| } |
| pThisText->m_Text += pNextText->m_Text; |
| pThisText->m_SpaceWidth = |
| pNextText->m_SpaceWidth == 0.0 ? spacew : pNextText->m_SpaceWidth; |
| m_TextList.RemoveAt(i + 1); |
| delete pNextText; |
| } else { |
| i++; |
| } |
| } |
| } |
| void CTextBaseLine::WriteOutput(CFX_WideString& str, |
| FX_FLOAT leftx, |
| FX_FLOAT pagewidth, |
| int iTextWidth) { |
| int lastpos = -1; |
| for (int i = 0; i < m_TextList.GetSize(); i++) { |
| CTextBox* pText = m_TextList.GetAt(i); |
| int xpos; |
| if (pText->m_pColumn) { |
| xpos = |
| (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pagewidth + |
| 0.5); |
| xpos -= pText->m_Text.GetLength(); |
| } else { |
| xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5); |
| } |
| if (xpos <= lastpos) { |
| xpos = lastpos + 1; |
| } |
| for (int j = lastpos + 1; j < xpos; j++) { |
| str += ' '; |
| } |
| CFX_WideString sSrc(pText->m_Text); |
| NormalizeString(sSrc); |
| str += sSrc; |
| str += ' '; |
| lastpos = xpos + pText->m_Text.GetLength(); |
| } |
| } |
| void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars) { |
| minchars = 0; |
| for (int i = 0; i < m_TextList.GetSize(); i++) { |
| CTextBox* pText = m_TextList.GetAt(i); |
| if (pText->m_Right - pText->m_Left < 0.002) { |
| continue; |
| } |
| count += pText->m_Text.GetLength(); |
| width += pText->m_Right - pText->m_Left; |
| minchars += pText->m_Text.GetLength() + 1; |
| } |
| } |
| #define PI 3.1415926535897932384626433832795 |
| static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox) { |
| int total_count = 0, rotated_count[3] = {0, 0, 0}; |
| FX_POSITION pos = page.GetFirstObjectPosition(); |
| while (pos) { |
| CPDF_PageObject* pObj = page.GetNextObject(pos); |
| if (pObj->m_Type != PDFPAGE_TEXT) { |
| continue; |
| } |
| total_count++; |
| CPDF_TextObject* pText = (CPDF_TextObject*)pObj; |
| FX_FLOAT angle = pText->m_TextState.GetBaselineAngle(); |
| if (angle == 0.0) { |
| continue; |
| } |
| int degree = (int)(angle * 180 / PI + 0.5); |
| if (degree % 90) { |
| continue; |
| } |
| if (degree < 0) { |
| degree += 360; |
| } |
| int index = degree / 90 % 3 - 1; |
| if (index < 0) { |
| continue; |
| } |
| rotated_count[index]++; |
| } |
| if (total_count == 0) { |
| return; |
| } |
| CFX_Matrix matrix; |
| if (rotated_count[0] > total_count * 2 / 3) { |
| matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight()); |
| } else if (rotated_count[1] > total_count * 2 / 3) { |
| matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight()); |
| } else if (rotated_count[2] > total_count * 2 / 3) { |
| matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0); |
| } else { |
| return; |
| } |
| page.Transform(matrix); |
| page_bbox.Transform(&matrix); |
| } |
| void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, |
| CPDF_Document* pDoc, |
| CPDF_Dictionary* pPage, |
| int iMinWidth, |
| FX_DWORD flags) { |
| lines.RemoveAll(); |
| if (!pPage) { |
| return; |
| } |
| CPDF_Page page; |
| page.Load(pDoc, pPage); |
| CPDF_ParseOptions options; |
| options.m_bTextOnly = TRUE; |
| options.m_bSeparateForm = FALSE; |
| page.ParseContent(&options); |
| CFX_FloatRect page_bbox = page.GetPageBBox(); |
| if (flags & PDF2TXT_AUTO_ROTATE) { |
| CheckRotate(page, page_bbox); |
| } |
| CTextPage texts; |
| texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH; |
| texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN; |
| texts.m_bBreakSpace = TRUE; |
| FX_POSITION pos = page.GetFirstObjectPosition(); |
| while (pos) { |
| CPDF_PageObject* pObject = page.GetNextObject(pos); |
| if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) { |
| CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Right, |
| pObject->m_Top); |
| if (!page_bbox.Contains(rect)) { |
| continue; |
| } |
| } |
| texts.ProcessObject(pObject); |
| } |
| texts.WriteOutput(lines, iMinWidth); |
| } |
| void PDF_GetPageText(CFX_ByteStringArray& lines, |
| CPDF_Document* pDoc, |
| CPDF_Dictionary* pPage, |
| int iMinWidth, |
| FX_DWORD flags) { |
| lines.RemoveAll(); |
| CFX_WideStringArray wlines; |
| PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags); |
| for (int i = 0; i < wlines.GetSize(); i++) { |
| CFX_WideString wstr = wlines[i]; |
| CFX_ByteString str; |
| for (int c = 0; c < wstr.GetLength(); c++) { |
| str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?"); |
| } |
| lines.Add(str); |
| } |
| } |
| void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, |
| CPDF_Document* pDoc, |
| CPDF_Dictionary* pPage, |
| FX_DWORD flags) { |
| buffer.EstimateSize(0, 10240); |
| CPDF_Page page; |
| page.Load(pDoc, pPage); |
| CPDF_ParseOptions options; |
| options.m_bTextOnly = TRUE; |
| options.m_bSeparateForm = FALSE; |
| page.ParseContent(&options); |
| GetTextStream_Unicode(buffer, &page, TRUE, NULL); |
| } |