// blob: a47d059b413ca6943e85fae47c23d24a5018ead5 [file] [log] [blame] [edit]
// Copyright 2016 The PDFium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
#ifndef CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
#define CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
#include <stdint.h>
#include <deque>
#include <functional>
#include <optional>
#include <vector>
#include "core/fpdfapi/page/cpdf_pageobjectholder.h"
#include "core/fxcrt/data_vector.h"
#include "core/fxcrt/fx_coordinates.h"
#include "core/fxcrt/fx_memory_wrappers.h"
#include "core/fxcrt/unowned_ptr.h"
#include "core/fxcrt/widestring.h"
#include "core/fxcrt/widetext_buffer.h"
class CPDF_FormObject;
class CPDF_Page;
class CPDF_TextObject;
// An (index, count) pair of plain ints. It is the element type of
// CPDF_TextPage::m_CharIndices.
// NOTE(review): presumably each segment describes a run of |count| consecutive
// characters starting at |index| -- confirm against the implementation file.
struct TextPageCharSegment {
// Start of the segment.
int index;
// Length of the segment.
int count;
};
// Applies the FX_DATA_PARTITION_EXCEPTION macro to the struct.
// NOTE(review): presumably this is what permits TextPageCharSegment to be used
// as a DataVector element (see m_CharIndices) -- confirm in
// core/fxcrt/fx_memory_wrappers.h.
FX_DATA_PARTITION_EXCEPTION(TextPageCharSegment);
// Text-oriented view of one CPDF_Page. Holds a per-character record list
// (|m_CharList|) and a wide-text buffer (|m_TextBuf|), and offers lookups by
// index, position, rectangle and text object.
// Every member function except size(), GetAllPageText() and the CharInfo
// accessors is defined out of line; comments marked NOTE(review) below are
// assumptions to be confirmed against the implementation file.
class CPDF_TextPage {
public:
// Classification tag stored in each CharInfo; backed by uint8_t.
// NOTE(review): the exact meaning of each enumerator is decided by the
// implementation. kGenerated presumably marks characters produced by
// GenerateCharInfo()/AppendGeneratedCharacter() rather than read from the
// page -- confirm.
enum class CharType : uint8_t {
kNormal,
kGenerated,
kNotUnicode,
kHyphen,
kPiece,
};
// Record for a single character: its CharType, character code, Unicode value,
// origin point, bounding box, matrix, and a non-owning (UnownedPtr) pointer to
// a CPDF_TextObject.
class CharInfo {
public:
// Out-of-line default constructor. The in-class member initializers below
// apply unless its definition overrides them.
CharInfo();
// Takes one argument per data member (declared in a different order than
// the members themselves).
CharInfo(CharType char_type,
uint32_t char_code,
wchar_t unicode,
CFX_PointF origin,
CFX_FloatRect char_box,
CFX_Matrix matrix,
CPDF_TextObject* text_object);
// Copy constructor and destructor are declared here and defined out of line.
CharInfo(const CharInfo&);
~CharInfo();
// Accessors. Setters exist only for the char type and the Unicode value.
CharType char_type() const { return char_type_; }
void set_char_type(CharType char_type) { char_type_ = char_type; }
uint32_t char_code() const { return char_code_; }
wchar_t unicode() const { return unicode_; }
void set_unicode(wchar_t unicode) { unicode_ = unicode; }
// Geometry accessors return references to the stored members, so the
// references are only valid while this CharInfo is alive.
const CFX_PointF& origin() const { return origin_; }
const CFX_FloatRect& char_box() const { return char_box_; }
const CFX_Matrix& matrix() const { return matrix_; }
// Const and non-const views of the associated text object. The pointer is
// not owned by this record and is null when default-initialized.
const CPDF_TextObject* text_object() const { return text_object_; }
CPDF_TextObject* text_object() { return text_object_; }
private:
// Member order is deliberate (see the packing note on |unicode_|); do not
// reorder without checking the size of the class.
CharType char_type_ = CharType::kNormal;
wchar_t unicode_ = 0; // Above `char_code_` to potentially pack tighter.
uint32_t char_code_ = 0;
CFX_PointF origin_;
CFX_FloatRect char_box_;
CFX_Matrix matrix_;
UnownedPtr<CPDF_TextObject> text_object_;
};
// |pPage| is stored as a non-owning pointer (|m_pPage|).
// NOTE(review): the caller presumably must keep the page alive for the
// lifetime of this object, and |rtl| presumably initializes |m_rtl| -- confirm.
CPDF_TextPage(const CPDF_Page* pPage, bool rtl);
~CPDF_TextPage();
// Conversions between the two index spaces used by this class.
// NOTE(review): presumably "text index" is an offset into the page text
// (|m_TextBuf|) and "char index" is a position in |m_CharList|; the value
// returned for an index with no counterpart is defined by the implementation
// -- confirm before relying on it.
int CharIndexFromTextIndex(int text_index) const;
int TextIndexFromCharIndex(int char_index) const;
// Number of entries in |m_CharList|.
size_t size() const { return m_CharList.size(); }
// Character count as an int; this is the count GetAllPageText() passes to
// GetPageText().
// NOTE(review): its relationship to size() is not visible here -- confirm.
int CountChars() const;
// These methods CHECK() to make sure |index| is within bounds.
const CharInfo& GetCharInfo(size_t index) const;
CharInfo& GetCharInfo(size_t index);
float GetCharFontSize(size_t index) const;
CFX_FloatRect GetCharLooseBounds(size_t index) const;
// Returns rectangles for the range described by |start| and |count|.
// NOTE(review): how out-of-range arguments are treated is defined by the
// implementation -- confirm.
std::vector<CFX_FloatRect> GetRectArray(int start, int count) const;
// Looks up an index from a point, given a tolerance.
// NOTE(review): the value returned when nothing matches is defined by the
// implementation -- confirm.
int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const;
// Text lookups keyed by a rectangle or by a specific text object.
WideString GetTextByRect(const CFX_FloatRect& rect) const;
WideString GetTextByObject(const CPDF_TextObject* pTextObj) const;
// Returns string with the text from |m_TextBuf| that are covered by the input
// range. |start| and |count| are in terms of the |m_CharIndices|, so the
// range will be converted into appropriate indices.
WideString GetPageText(int start, int count) const;
// Convenience wrapper: the page text for the range [0, CountChars()).
WideString GetAllPageText() const { return GetPageText(0, CountChars()); }
// CountRects() is non-const while GetRect() is const.
// NOTE(review): presumably CountRects() computes the rectangles for the given
// range into |m_SelRects| and returns how many there are, and GetRect() then
// reads entry |rectIndex| from that cache, returning false when the index is
// invalid -- confirm.
int CountRects(int start, int nCount);
bool GetRect(int rectIndex, CFX_FloatRect* pRect) const;
private:
// Orientation of a text object or text line; kUnknown is the initial value of
// |m_TextlineDir|.
enum class TextOrientation {
kUnknown,
kHorizontal,
kVertical,
};
// Result type of ProcessInsertObject() and input to
// ProcessGenerateCharacter().
// NOTE(review): presumably names which synthetic character, if any, should be
// inserted before a text object -- confirm.
enum class GenerateCharacter {
kNone,
kSpace,
kLineBreak,
kHyphen,
};
// Result type of PreMarkedContent(). kPass is explicitly 0.
// NOTE(review): the meaning of each state is defined by the implementation.
enum class MarkedContentState { kPass = 0, kDone, kDelay };
// Pairs a non-owning text object pointer with a matrix.
// NOTE(review): |m_formMatrix| is presumably the |form_matrix| that the
// Process* methods receive for that object -- confirm.
struct TransformedTextObject {
TransformedTextObject();
TransformedTextObject(const TransformedTextObject& that);
~TransformedTextObject();
UnownedPtr<CPDF_TextObject> m_pTextObj;
CFX_Matrix m_formMatrix;
};
// NOTE(review): presumably invoked during construction to populate the
// members below from |m_pPage| -- confirm.
void Init();
bool IsHyphen(wchar_t curChar) const;
// Processing entry points for page content. ProcessTextObject() is overloaded:
// one form takes a TransformedTextObject, the other takes the text object
// together with its matrix, its holder and its position in that holder.
void ProcessObject();
void ProcessFormObject(CPDF_FormObject* pFormObj,
const CFX_Matrix& form_matrix);
void ProcessTextObject(const TransformedTextObject& obj);
void ProcessTextObject(CPDF_TextObject* pTextObj,
const CFX_Matrix& form_matrix,
const CPDF_PageObjectHolder* pObjList,
CPDF_PageObjectHolder::const_iterator ObjPos);
GenerateCharacter ProcessInsertObject(const CPDF_TextObject* pObj,
const CFX_Matrix& form_matrix);
// Returns whether to continue or not.
bool ProcessGenerateCharacter(GenerateCharacter type,
const CPDF_TextObject* text_object,
const CFX_Matrix& form_matrix);
void ProcessTextObjectItems(CPDF_TextObject* text_object,
const CFX_Matrix& form_matrix,
const CFX_Matrix& matrix);
// Returns a pointer rather than a reference.
// NOTE(review): presumably nullptr when there is no previous character --
// confirm.
const CharInfo* GetPrevCharInfo() const;
// Returns an optional CharInfo for |unicode|; callers must handle the empty
// case.
// NOTE(review): the condition under which no CharInfo is produced is defined
// by the implementation.
std::optional<CharInfo> GenerateCharInfo(wchar_t unicode,
const CFX_Matrix& form_matrix);
// Comparisons between text objects.
// NOTE(review): the equality criteria are defined by the implementation.
bool IsSameAsPreTextObject(CPDF_TextObject* pTextObj,
const CPDF_PageObjectHolder* pObjList,
CPDF_PageObjectHolder::const_iterator iter) const;
bool IsSameTextObject(CPDF_TextObject* pTextObj1,
CPDF_TextObject* pTextObj2) const;
// NOTE(review): presumably flushes |m_TempCharList| / |m_TempTextBuf| into
// |m_CharList| / |m_TextBuf| -- confirm.
void CloseTempLine();
MarkedContentState PreMarkedContent(const CPDF_TextObject* pTextObj);
void ProcessMarkedContent(const TransformedTextObject& obj);
void FindPreviousTextObject();
// Two variants for adding a character, one per direction (left-to-right and
// right-to-left, going by the names).
void AddCharInfoByLRDirection(wchar_t wChar, const CharInfo& info);
void AddCharInfoByRLDirection(wchar_t wChar, const CharInfo& info);
TextOrientation GetTextObjectWritingMode(
const CPDF_TextObject* pTextObj) const;
TextOrientation FindTextlineFlowOrientation() const;
// NOTE(review): |use_temp_buffer| presumably selects the temporary list and
// buffer instead of the main ones as the destination -- confirm.
void AppendGeneratedCharacter(wchar_t unicode,
const CFX_Matrix& form_matrix,
bool use_temp_buffer);
void SwapTempTextBuf(size_t iCharListStartAppend, size_t iBufStartAppend);
// Shared helper that takes a predicate over CharInfo and returns text.
// NOTE(review): presumably concatenates the characters for which |predicate|
// returns true and backs GetTextByRect()/GetTextByObject() -- confirm.
WideString GetTextByPredicate(
const std::function<bool(const CharInfo&)>& predicate) const;
// Non-owning pointer to the page; the pointer itself is const.
UnownedPtr<const CPDF_Page> const m_pPage;
// Segments referenced by GetPageText()'s |start| / |count| arguments.
DataVector<TextPageCharSegment> m_CharIndices;
// Main character list (size() reports its length) and a temporary companion.
std::deque<CharInfo> m_CharList;
std::deque<CharInfo> m_TempCharList;
// Main text buffer (the source of GetPageText() results) and a temporary
// companion.
WideTextBuffer m_TextBuf;
WideTextBuffer m_TempTextBuf;
// NOTE(review): presumably the most recently processed text object and its
// matrix -- confirm.
UnownedPtr<const CPDF_TextObject> m_pPrevTextObj;
CFX_Matrix m_PrevMatrix;
// Immutable after construction.
const bool m_rtl;
const CFX_Matrix m_DisplayMatrix;
// NOTE(review): presumably the rectangle cache filled by CountRects() and read
// by GetRect() -- confirm.
std::vector<CFX_FloatRect> m_SelRects;
// Note: spelled without the "m_" prefix used by the other data members.
std::vector<TransformedTextObject> mTextObjects;
TextOrientation m_TextlineDir = TextOrientation::kUnknown;
CFX_FloatRect m_CurlineRect;
};
#endif // CORE_FPDFTEXT_CPDF_TEXTPAGE_H_