| // Copyright 2014 PDFium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
| |
| #ifndef CORE_SRC_FPDFTEXT_TEXT_INT_H_ |
| #define CORE_SRC_FPDFTEXT_TEXT_INT_H_ |
| |
| #include <deque> |
| |
| #include "core/include/fpdftext/fpdf_text.h" |
| #include "core/include/fxcrt/fx_basic.h" |
| |
| class CFX_BidiChar; /* Appears below only as a pointer parameter of CPDF_TextPage::OnPiece(). */ |
| class CPDF_DocProgressiveSearch; /* Not referenced anywhere else in this header. */ |
| class CPDF_FormObject; /* Appears below only as a pointer parameter of CPDF_TextPage::ProcessFormObject(). */ |
| class CPDF_LinkExtract; /* Fully defined later in this header. */ |
| class CPDF_TextPageFind; /* Fully defined later in this header. */ |
| |
| // Character classification codes. |
| // NOTE(review): presumably the values stored in PAGECHAR_INFO::m_Flag - confirm against the implementation. |
| // The negative value is parenthesized so the macro always expands as a single operand. |
| #define FPDFTEXT_CHAR_ERROR (-1) |
| #define FPDFTEXT_CHAR_NORMAL 0 |
| #define FPDFTEXT_CHAR_GENERATED 1 |
| #define FPDFTEXT_CHAR_UNUNICODE 2 |
| #define FPDFTEXT_CHAR_HYPHEN 3 |
| #define FPDFTEXT_CHAR_PIECE 4 |
| // Marked-content result codes. |
| // NOTE(review): presumably the int32_t values returned by CPDF_TextPage::PreMarkedContent() - confirm. |
| #define FPDFTEXT_MC_PASS 0 |
| #define FPDFTEXT_MC_DONE 1 |
| #define FPDFTEXT_MC_DELAY 2 |
| |
| struct PAGECHAR_INFO { /* Per-character record; CPDF_TextPage keeps these in its m_CharList and m_TempCharList deques. */ |
| int m_CharCode; /* Character code. NOTE(review): presumably the font-specific code rather than Unicode - confirm. */ |
| FX_WCHAR m_Unicode; /* Wide-character value for this entry. */ |
| FX_FLOAT m_OriginX; /* X origin. NOTE(review): the coordinate space is not visible in this header - confirm. */ |
| FX_FLOAT m_OriginY; /* Y origin; same caveat as m_OriginX. */ |
| int32_t m_Flag; /* NOTE(review): presumably one of the FPDFTEXT_CHAR_* codes - confirm. */ |
| CFX_FloatRect m_CharBox; /* Rectangle for the character. NOTE(review): presumably its bounding box - confirm. */ |
| CPDF_TextObject* m_pTextObj; /* Raw pointer to a text object; ownership is not expressed in this header. */ |
| CFX_Matrix m_Matrix; /* NOTE(review): presumably the transform that applies to this character - confirm. */ |
| int m_Index; /* NOTE(review): what this indexes is not visible in this header - confirm. */ |
| }; |
| |
| struct FPDF_SEGMENT { /* Start/count pair; CPDF_TextPage stores these in m_Segments. */ |
| int m_Start; /* First index of the run. NOTE(review): presumably a character index - confirm. */ |
| int m_nCount; /* Number of items in the run. */ |
| }; |
| |
| struct PDFTEXT_Obj { /* Text object pointer paired with a matrix; stored in CPDF_TextPage::m_LineObj and passed by value to its ProcessTextObject/PreMarkedContent/ProcessMarkedContent methods. */ |
| CPDF_TextObject* m_pTextObj; /* Raw pointer; ownership is not expressed in this header. */ |
| CFX_Matrix m_formMatrix; /* NOTE(review): presumably the matrix of the enclosing form object - confirm. */ |
| }; |
| |
| class CPDF_TextPage : public IPDF_TextPage { /* Implements IPDF_TextPage; only IsParsed() and the destructor have bodies in this header. */ |
| public: |
| CPDF_TextPage(const CPDF_Page* pPage, int flags); /* NOTE(review): flags presumably initializes the const member m_parserflag - confirm in the implementation. */ |
| ~CPDF_TextPage() override {} /* Empty body: the raw pointer members are not deleted here. */ |
| |
| // IPDF_TextPage: |
| FX_BOOL ParseTextPage() override; |
| bool IsParsed() const override { return m_bIsParsed; } /* Reports the m_bIsParsed flag. */ |
| int CharIndexFromTextIndex(int TextIndex) const override; /* NOTE(review): presumably maps an index in the page text to an index in the char list - confirm. */ |
| int TextIndexFromCharIndex(int CharIndex) const override; /* NOTE(review): presumably the inverse of CharIndexFromTextIndex() - confirm. */ |
| int CountChars() const override; |
| void GetCharInfo(int index, FPDF_CHAR_INFO* info) const override; /* Returns void; NOTE(review): the result is presumably written through info - confirm. */ |
| void GetRectArray(int start, |
| int nCount, |
| CFX_RectArray& rectArray) const override; /* Returns void; rectArray is a non-const reference and presumably receives the result - confirm. */ |
| int GetIndexAtPos(CPDF_Point point, |
| FX_FLOAT xTolerance, |
| FX_FLOAT yTolerance) const override; /* Point overload of the (x, y) version declared next. */ |
| int GetIndexAtPos(FX_FLOAT x, |
| FX_FLOAT y, |
| FX_FLOAT xTolerance, |
| FX_FLOAT yTolerance) const override; |
| CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const override; |
| void GetRectsArrayByRect(const CFX_FloatRect& rect, |
| CFX_RectArray& resRectArray) const override; |
| CFX_WideString GetPageText(int start = 0, int nCount = -1) const override; /* NOTE(review): the default nCount of -1 presumably means "through the end" - confirm. */ |
| int CountRects(int start, int nCount) override; /* Non-const, unlike GetRect(). NOTE(review): presumably rebuilds m_SelRects - confirm. */ |
| void GetRect(int rectIndex, |
| FX_FLOAT& left, |
| FX_FLOAT& top, |
| FX_FLOAT& right, |
| FX_FLOAT& bottom) const override; /* The four edges are non-const reference parameters. */ |
| FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate) override; |
| FX_BOOL GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate) override; |
| int CountBoundedSegments(FX_FLOAT left, |
| FX_FLOAT top, |
| FX_FLOAT right, |
| FX_FLOAT bottom, |
| FX_BOOL bContains = FALSE) override; /* NOTE(review): presumably fills m_Segments for later GetBoundedSegment() calls - confirm. */ |
| void GetBoundedSegment(int index, int& start, int& count) const override; |
| int GetWordBreak(int index, int direction) const override; |
| |
| static FX_BOOL IsRectIntersect(const CFX_FloatRect& rect1, |
| const CFX_FloatRect& rect2); /* Public static helper; needs no CPDF_TextPage instance. */ |
| static FX_BOOL IsLetter(FX_WCHAR unicode); /* Public static helper; needs no CPDF_TextPage instance. */ |
| |
| private: |
| FX_BOOL IsHyphen(FX_WCHAR curChar); |
| bool IsControlChar(const PAGECHAR_INFO& charInfo); |
| FX_BOOL GetBaselineRotate(int start, int end, int& Rotate); /* Private overload taking a start/end pair instead of a rect index or rectangle. */ |
| void ProcessObject(); |
| void ProcessFormObject(CPDF_FormObject* pFormObj, |
| const CFX_Matrix& formMatrix); |
| void ProcessTextObject(PDFTEXT_Obj pObj); /* Takes the pointer-plus-matrix pair by value. */ |
| void ProcessTextObject(CPDF_TextObject* pTextObj, |
| const CFX_Matrix& formMatrix, |
| FX_POSITION ObjPos); |
| int ProcessInsertObject(const CPDF_TextObject* pObj, |
| const CFX_Matrix& formMatrix); |
| FX_BOOL GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info); |
| FX_BOOL IsSameAsPreTextObject(CPDF_TextObject* pTextObj, FX_POSITION ObjPos); |
| FX_BOOL IsSameTextObject(CPDF_TextObject* pTextObj1, |
| CPDF_TextObject* pTextObj2); |
| int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const; |
| void CloseTempLine(); /* NOTE(review): presumably flushes m_TempCharList/m_TempTextBuf into m_CharList/m_TextBuf - confirm. */ |
| void OnPiece(CFX_BidiChar* pBidi, CFX_WideString& str); |
| int32_t PreMarkedContent(PDFTEXT_Obj pObj); /* NOTE(review): presumably returns one of the FPDFTEXT_MC_* codes - confirm. */ |
| void ProcessMarkedContent(PDFTEXT_Obj pObj); |
| void CheckMarkedContentObject(int32_t& start, int32_t& nCount) const; |
| void FindPreviousTextObject(void); /* The (void) parameter list is equivalent to () in C++. */ |
| void AddCharInfoByLRDirection(CFX_WideString& str, int i); |
| void AddCharInfoByRLDirection(CFX_WideString& str, int i); |
| int32_t GetTextObjectWritingMode(const CPDF_TextObject* pTextObj); |
| int32_t FindTextlineFlowDirection(); |
| |
| void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend); |
| FX_BOOL IsRightToLeft(const CPDF_TextObject* pTextObj, |
| const CPDF_Font* pFont, |
| int nItems) const; |
| |
| CFX_WordArray m_CharIndex; |
| const CPDF_PageObjectList* const m_pPage; /* Const pointer to const: cannot be reseated after construction or used to modify the list. The constructor takes a CPDF_Page pointer - NOTE(review): presumably CPDF_Page derives from CPDF_PageObjectList - confirm. */ |
| std::deque<PAGECHAR_INFO> m_CharList; |
| std::deque<PAGECHAR_INFO> m_TempCharList; /* NOTE(review): presumably the line currently being assembled - confirm. */ |
| CFX_WideTextBuf m_TextBuf; |
| CFX_WideTextBuf m_TempTextBuf; |
| const int m_parserflag; /* Const data member, so it must be set in the constructor's initializer list. */ |
| CPDF_TextObject* m_pPreTextObj; /* Raw pointer; not deleted by the empty destructor above. */ |
| CFX_Matrix m_perMatrix; /* NOTE(review): "per" may be a typo for "pre" (compare m_pPreTextObj) - confirm. */ |
| bool m_bIsParsed; /* Value returned by IsParsed(). */ |
| CFX_Matrix m_DisplayMatrix; |
| CFX_ArrayTemplate<FPDF_SEGMENT> m_Segments; |
| CFX_RectArray m_SelRects; |
| CFX_ArrayTemplate<PDFTEXT_Obj> m_LineObj; |
| int32_t m_TextlineDir; |
| CFX_FloatRect m_CurlineRect; |
| }; |
| |
| class CPDF_TextPageFind : public IPDF_TextPageFind { /* Implements IPDF_TextPageFind over an IPDF_TextPage supplied at construction. */ |
| public: |
| explicit CPDF_TextPageFind(const IPDF_TextPage* pTextPage); /* explicit: no implicit conversion from a text-page pointer. */ |
| ~CPDF_TextPageFind() override {} /* Empty body: m_pTextPage is not deleted here. */ |
| |
| // IPDF_TextPageFind: |
| FX_BOOL FindFirst(const CFX_WideString& findwhat, |
| int flags, |
| int startPos = 0) override; /* startPos defaults to 0. */ |
| FX_BOOL FindNext() override; |
| FX_BOOL FindPrev() override; |
| void GetRectArray(CFX_RectArray& rects) const override; |
| int GetCurOrder() const override; |
| int GetMatchedCount() const override; |
| |
| protected: |
| void ExtractFindWhat(const CFX_WideString& findwhat); /* NOTE(review): presumably splits findwhat into m_csFindWhatArray - confirm. */ |
| FX_BOOL IsMatchWholeWord(const CFX_WideString& csPageText, |
| int startPos, |
| int endPos); |
| FX_BOOL ExtractSubString(CFX_WideString& rString, |
| const FX_WCHAR* lpszFullString, |
| int iSubString, |
| FX_WCHAR chSep); /* NOTE(review): presumably stores the iSubString-th chSep-separated piece of lpszFullString in rString - confirm. */ |
| CFX_WideString MakeReverse(const CFX_WideString& str); |
| int ReverseFind(const CFX_WideString& csPageText, |
| const CFX_WideString& csWord, |
| int nStartPos, |
| int& WordLength); |
| int GetCharIndex(int index) const; |
| |
| private: |
| CFX_WordArray m_CharIndex; |
| const IPDF_TextPage* m_pTextPage; /* Pointer to const text page; only const members can be called through it. */ |
| CFX_WideString m_strText; |
| CFX_WideString m_findWhat; |
| int m_flags; |
| CFX_WideStringArray m_csFindWhatArray; |
| int m_findNextStart; |
| int m_findPreStart; |
| FX_BOOL m_bMatchCase; /* NOTE(review): presumably derived from the flags passed to FindFirst() - confirm. */ |
| FX_BOOL m_bMatchWholeWord; /* NOTE(review): presumably derived from the flags passed to FindFirst() - confirm. */ |
| int m_resStart; |
| int m_resEnd; |
| CFX_RectArray m_resArray; |
| FX_BOOL m_IsFind; |
| }; |
| |
| // One link entry: a start/count pair plus the URL text. |
| // NOTE(review): the pair is presumably a character range in the page text - confirm. |
| class CPDF_LinkExt { |
| public: |
| // Zero both range fields so a default-constructed entry never holds indeterminate values. |
| CPDF_LinkExt() : m_Start(0), m_Count(0) {} |
| int m_Start; |
| int m_Count; |
| CFX_WideString m_strUrl; |
| // Virtual, so deleting a derived object through a CPDF_LinkExt pointer is well-defined. |
| virtual ~CPDF_LinkExt() {} |
| }; |
| |
| using LINK_InfoArray = CFX_ArrayTemplate<CPDF_LinkExt*>; /* Array of raw CPDF_LinkExt pointers; the element type of CPDF_LinkExtract::m_LinkList. */ |
| |
| class CPDF_LinkExtract : public IPDF_LinkExtract { /* Implements IPDF_LinkExtract; keeps its results in m_LinkList. */ |
| public: |
| CPDF_LinkExtract(); |
| ~CPDF_LinkExtract() override; /* Declared only; the body is defined outside this header. */ |
| |
| // IPDF_LinkExtract: |
| FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage) override; /* Takes the interface type while m_pTextPage is a CPDF_TextPage pointer - NOTE(review): implies a downcast in the implementation - confirm. */ |
| int CountLinks() const override; |
| CFX_WideString GetURL(int index) const override; |
| void GetBoundedSegment(int index, int& start, int& count) const override; |
| void GetRects(int index, CFX_RectArray& rects) const override; |
| |
| FX_BOOL IsExtract() const { return m_bIsParsed; } /* Reports m_bIsParsed; not marked override, unlike the methods above. */ |
| |
| protected: |
| void ParseLink(); |
| void DeleteLinkList(); /* NOTE(review): presumably frees the CPDF_LinkExt objects referenced by m_LinkList - confirm. */ |
| FX_BOOL CheckWebLink(CFX_WideString& strBeCheck); /* Non-const reference: the argument may be modified. Returns FX_BOOL, whereas CheckMailLink() returns bool. */ |
| bool CheckMailLink(CFX_WideString& str); /* Non-const reference: the argument may be modified. */ |
| void AppendToLinkList(int start, int count, const CFX_WideString& strUrl); |
| |
| private: |
| LINK_InfoArray m_LinkList; /* Holds raw CPDF_LinkExt pointers. */ |
| const CPDF_TextPage* m_pTextPage; /* Pointer to const text page. */ |
| CFX_WideString m_strPageText; |
| bool m_bIsParsed; /* Value returned by IsExtract(). */ |
| }; |
| |
| FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst); /* NOTE(review): presumably writes the normalized form of wch to pDst and returns its length - confirm; the required pDst capacity is not visible here. */ |
| void NormalizeString(CFX_WideString& str); /* Takes str by non-const reference, so it may be modified in place. */ |
| void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest); /* sDest is a non-const reference. NOTE(review): presumably receives the result - confirm. */ |
| void GetTextStream_Unicode(CFX_WideTextBuf& buffer, |
| CPDF_PageObjectList* pPage, |
| FX_BOOL bUseLF, |
| CFX_PtrArray* pObjArray); /* NOTE(review): presumably appends the page's text to buffer; the roles of bUseLF and pObjArray are not visible in this header - confirm. */ |
| |
| #endif // CORE_SRC_FPDFTEXT_TEXT_INT_H_ |