| // Copyright 2016 The PDFium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
| |
| #ifndef CORE_FPDFTEXT_CPDF_TEXTPAGE_H_ |
| #define CORE_FPDFTEXT_CPDF_TEXTPAGE_H_ |
| |
| #include <stdint.h> |
| |
| #include <deque> |
| #include <functional> |
| #include <vector> |
| |
| #include "core/fpdfapi/page/cpdf_pageobjectholder.h" |
| #include "core/fxcrt/data_vector.h" |
| #include "core/fxcrt/fx_coordinates.h" |
| #include "core/fxcrt/fx_memory_wrappers.h" |
| #include "core/fxcrt/unowned_ptr.h" |
| #include "core/fxcrt/widestring.h" |
| #include "core/fxcrt/widetext_buffer.h" |
| #include "third_party/abseil-cpp/absl/types/optional.h" |
| |
| class CPDF_FormObject; |
| class CPDF_Page; |
| class CPDF_TextObject; |
| |
| struct TextPageCharSegment { |
| int index; |
| int count; |
| }; |
| |
| FX_DATA_PARTITION_EXCEPTION(TextPageCharSegment); |
| |
| class CPDF_TextPage { |
| public: |
| enum class CharType : uint8_t { |
| kNormal, |
| kGenerated, |
| kNotUnicode, |
| kHyphen, |
| kPiece, |
| }; |
| |
| class CharInfo { |
| public: |
| CharInfo(); |
| CharInfo(const CharInfo&); |
| ~CharInfo(); |
| |
| int m_Index = 0; |
| uint32_t m_CharCode = 0; |
| wchar_t m_Unicode = 0; |
| CharType m_CharType = CharType::kNormal; |
| CFX_PointF m_Origin; |
| CFX_FloatRect m_CharBox; |
| UnownedPtr<const CPDF_TextObject> m_pTextObj; |
| CFX_Matrix m_Matrix; |
| }; |
| |
| CPDF_TextPage(const CPDF_Page* pPage, bool rtl); |
| ~CPDF_TextPage(); |
| |
| int CharIndexFromTextIndex(int text_index) const; |
| int TextIndexFromCharIndex(int char_index) const; |
| size_t size() const { return m_CharList.size(); } |
| int CountChars() const; |
| |
| // These methods CHECK() to make sure |index| is within bounds. |
| const CharInfo& GetCharInfo(size_t index) const; |
| float GetCharFontSize(size_t index) const; |
| CFX_FloatRect GetCharLooseBounds(size_t index) const; |
| |
| std::vector<CFX_FloatRect> GetRectArray(int start, int count) const; |
| int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const; |
| WideString GetTextByRect(const CFX_FloatRect& rect) const; |
| WideString GetTextByObject(const CPDF_TextObject* pTextObj) const; |
| |
| // Returns string with the text from |m_TextBuf| that are covered by the input |
| // range. |start| and |count| are in terms of the |m_CharIndices|, so the |
| // range will be converted into appropriate indices. |
| WideString GetPageText(int start, int count) const; |
| WideString GetAllPageText() const { return GetPageText(0, CountChars()); } |
| |
| int CountRects(int start, int nCount); |
| bool GetRect(int rectIndex, CFX_FloatRect* pRect) const; |
| |
| private: |
| enum class TextOrientation { |
| kUnknown, |
| kHorizontal, |
| kVertical, |
| }; |
| |
| enum class GenerateCharacter { |
| kNone, |
| kSpace, |
| kLineBreak, |
| kHyphen, |
| }; |
| |
| enum class MarkedContentState { kPass = 0, kDone, kDelay }; |
| |
| struct TransformedTextObject { |
| TransformedTextObject(); |
| TransformedTextObject(const TransformedTextObject& that); |
| ~TransformedTextObject(); |
| |
| UnownedPtr<const CPDF_TextObject> m_pTextObj; |
| CFX_Matrix m_formMatrix; |
| }; |
| |
| void Init(); |
| bool IsHyphen(wchar_t curChar) const; |
| void ProcessObject(); |
| void ProcessFormObject(CPDF_FormObject* pFormObj, |
| const CFX_Matrix& formMatrix); |
| void ProcessTextObject(const TransformedTextObject& obj); |
| void ProcessTextObject(CPDF_TextObject* pTextObj, |
| const CFX_Matrix& formMatrix, |
| const CPDF_PageObjectHolder* pObjList, |
| CPDF_PageObjectHolder::const_iterator ObjPos); |
| GenerateCharacter ProcessInsertObject(const CPDF_TextObject* pObj, |
| const CFX_Matrix& formMatrix); |
| const CharInfo* GetPrevCharInfo() const; |
| absl::optional<CharInfo> GenerateCharInfo(wchar_t unicode); |
| bool IsSameAsPreTextObject(CPDF_TextObject* pTextObj, |
| const CPDF_PageObjectHolder* pObjList, |
| CPDF_PageObjectHolder::const_iterator iter) const; |
| bool IsSameTextObject(CPDF_TextObject* pTextObj1, |
| CPDF_TextObject* pTextObj2) const; |
| void CloseTempLine(); |
| MarkedContentState PreMarkedContent(const CPDF_TextObject* pTextObj); |
| void ProcessMarkedContent(const TransformedTextObject& obj); |
| void FindPreviousTextObject(); |
| void AddCharInfoByLRDirection(wchar_t wChar, const CharInfo& info); |
| void AddCharInfoByRLDirection(wchar_t wChar, const CharInfo& info); |
| TextOrientation GetTextObjectWritingMode( |
| const CPDF_TextObject* pTextObj) const; |
| TextOrientation FindTextlineFlowOrientation() const; |
| void AppendGeneratedCharacter(wchar_t unicode, const CFX_Matrix& formMatrix); |
| void SwapTempTextBuf(size_t iCharListStartAppend, size_t iBufStartAppend); |
| WideString GetTextByPredicate( |
| const std::function<bool(const CharInfo&)>& predicate) const; |
| |
| UnownedPtr<const CPDF_Page> const m_pPage; |
| DataVector<TextPageCharSegment> m_CharIndices; |
| std::deque<CharInfo> m_CharList; |
| std::deque<CharInfo> m_TempCharList; |
| WideTextBuffer m_TextBuf; |
| WideTextBuffer m_TempTextBuf; |
| UnownedPtr<const CPDF_TextObject> m_pPrevTextObj; |
| CFX_Matrix m_PrevMatrix; |
| const bool m_rtl; |
| const CFX_Matrix m_DisplayMatrix; |
| std::vector<CFX_FloatRect> m_SelRects; |
| std::vector<TransformedTextObject> mTextObjects; |
| TextOrientation m_TextlineDir = TextOrientation::kUnknown; |
| CFX_FloatRect m_CurlineRect; |
| }; |
| |
| #endif // CORE_FPDFTEXT_CPDF_TEXTPAGE_H_ |