Clean up CPDF_TextPage.

- Rename ParseTextPage() to Init(), make it private, and call it from
  the ctor. This makes it possible to initialize m_DisplayMatrix in the
  initializer list (so it can be const) and to remove the reset calls
  that are unnecessary when Init() only ever runs once.
- Mark various methods const, or move them into the anonymous namespace.
- Replace the page object count check in FindTextlineFlowOrientation(),
  which will never trigger, with a DCHECK.

Change-Id: I46375413204e52bb63a887d43985bb89a10f28a0
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/65433
Reviewed-by: Tom Sepez <tsepez@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp
index c310cac..4bda087 100644
--- a/core/fpdftext/cpdf_textpage.cpp
+++ b/core/fpdftext/cpdf_textpage.cpp
@@ -112,6 +112,22 @@
   return count / (end - start);
 }
 
+// Returns true when |charInfo| carries one of the control code points checked
+// below, unless the character is flagged as a hyphen.
+bool IsControlChar(const PAGECHAR_INFO& charInfo) {
+  // The hyphen flag takes precedence over the code point test.
+  if (charInfo.m_Flag == FPDFTEXT_CHAR_HYPHEN)
+    return false;
+
+  const auto code = charInfo.m_Unicode;
+  if (code == 0x2 || code == 0x3 || code == 0xfffe)
+    return true;
+
+  // Remaining candidates: 0x93, 0x94 and 0x96 through 0x98 (0x95 excluded).
+  return code == 0x93 || code == 0x94 ||
+         (code >= 0x96 && code <= 0x98);
+}
+
 bool IsHyphenCode(wchar_t c) {
   return c == 0x2D || c == 0xAD;
 }
@@ -204,6 +220,12 @@
   return right <= left;
 }
 
+CFX_Matrix GetPageMatrix(const CPDF_Page* pPage) {
+  const int width = static_cast<int>(pPage->GetPageWidth());    // truncated
+  const int height = static_cast<int>(pPage->GetPageHeight());  // truncated
+  return pPage->GetDisplayMatrix(FX_RECT(0, 0, width, height), 0);
+}
+
 }  // namespace
 
 PDFTEXT_Obj::PDFTEXT_Obj() {}
@@ -223,40 +245,17 @@
 PAGECHAR_INFO::~PAGECHAR_INFO() {}
 
 CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, bool rtl)
-    : m_pPage(pPage), m_rtl(rtl) {
+    : m_pPage(pPage), m_rtl(rtl), m_DisplayMatrix(GetPageMatrix(pPage)) {
+  Init();
+}
+
+CPDF_TextPage::~CPDF_TextPage() = default;
+
+void CPDF_TextPage::Init() {
   m_TextBuf.SetAllocStep(10240);
-  const FX_RECT rect(0, 0, static_cast<int>(pPage->GetPageWidth()),
-                     static_cast<int>(pPage->GetPageHeight()));
-  m_DisplayMatrix = pPage->GetDisplayMatrix(rect, 0);
-}
-
-CPDF_TextPage::~CPDF_TextPage() {}
-
-bool CPDF_TextPage::IsControlChar(const PAGECHAR_INFO& charInfo) {
-  switch (charInfo.m_Unicode) {
-    case 0x2:
-    case 0x3:
-    case 0x93:
-    case 0x94:
-    case 0x96:
-    case 0x97:
-    case 0x98:
-    case 0xfffe:
-      return charInfo.m_Flag != FPDFTEXT_CHAR_HYPHEN;
-    default:
-      return false;
-  }
-}
-
-void CPDF_TextPage::ParseTextPage() {
-  m_bIsParsed = false;
-  m_TextBuf.Clear();
-  m_CharList.clear();
-  m_pPreTextObj = nullptr;
   ProcessObject();
 
   m_bIsParsed = true;
-  m_CharIndex.clear();
   const int nCount = CountChars();
   if (nCount)
     m_CharIndex.push_back(0);
@@ -529,8 +528,7 @@
 
 CPDF_TextPage::TextOrientation CPDF_TextPage::FindTextlineFlowOrientation()
     const {
-  if (m_pPage->GetPageObjectCount() == 0)
-    return TextOrientation::kUnknown;
+  DCHECK_NE(m_pPage->GetPageObjectCount(), 0);
 
   const int32_t nPageWidth = static_cast<int32_t>(m_pPage->GetPageWidth());
   const int32_t nPageHeight = static_cast<int32_t>(m_pPage->GetPageHeight());
@@ -1338,7 +1336,7 @@
 }
 
 bool CPDF_TextPage::IsSameTextObject(CPDF_TextObject* pTextObj1,
-                                     CPDF_TextObject* pTextObj2) {
+                                     CPDF_TextObject* pTextObj2) const {
   if (!pTextObj1 || !pTextObj2)
     return false;
 
@@ -1394,7 +1392,7 @@
 bool CPDF_TextPage::IsSameAsPreTextObject(
     CPDF_TextObject* pTextObj,
     const CPDF_PageObjectHolder* pObjList,
-    CPDF_PageObjectHolder::const_iterator iter) {
+    CPDF_PageObjectHolder::const_iterator iter) const {
   int i = 0;
   while (i < 5 && iter != pObjList->begin()) {
     --iter;
diff --git a/core/fpdftext/cpdf_textpage.h b/core/fpdftext/cpdf_textpage.h
index 5b82776..b4e11d1 100644
--- a/core/fpdftext/cpdf_textpage.h
+++ b/core/fpdftext/cpdf_textpage.h
@@ -81,7 +81,6 @@
   CPDF_TextPage(const CPDF_Page* pPage, bool rtl);
   ~CPDF_TextPage();
 
-  void ParseTextPage();
   bool IsParsed() const { return m_bIsParsed; }
   int CharIndexFromTextIndex(int TextIndex) const;
   int TextIndexFromCharIndex(int CharIndex) const;
@@ -118,8 +117,8 @@
 
   enum class MarkedContentState { kPass = 0, kDone, kDelay };
 
+  void Init();
   bool IsHyphen(wchar_t curChar) const;
-  bool IsControlChar(const PAGECHAR_INFO& charInfo);
   void ProcessObject();
   void ProcessFormObject(CPDF_FormObject* pFormObj,
                          const CFX_Matrix& formMatrix);
@@ -134,8 +133,9 @@
   Optional<PAGECHAR_INFO> GenerateCharInfo(wchar_t unicode);
   bool IsSameAsPreTextObject(CPDF_TextObject* pTextObj,
                              const CPDF_PageObjectHolder* pObjList,
-                             CPDF_PageObjectHolder::const_iterator iter);
-  bool IsSameTextObject(CPDF_TextObject* pTextObj1, CPDF_TextObject* pTextObj2);
+                             CPDF_PageObjectHolder::const_iterator iter) const;
+  bool IsSameTextObject(CPDF_TextObject* pTextObj1,
+                        CPDF_TextObject* pTextObj2) const;
   void CloseTempLine();
   MarkedContentState PreMarkedContent(PDFTEXT_Obj pObj);
   void ProcessMarkedContent(PDFTEXT_Obj pObj);
@@ -160,7 +160,7 @@
   CFX_Matrix m_perMatrix;
   const bool m_rtl;
   bool m_bIsParsed = false;
-  CFX_Matrix m_DisplayMatrix;
+  const CFX_Matrix m_DisplayMatrix;
   std::vector<CFX_FloatRect> m_SelRects;
   std::vector<PDFTEXT_Obj> m_LineObj;
   TextOrientation m_TextlineDir = TextOrientation::kUnknown;
diff --git a/fpdfsdk/fpdf_text.cpp b/fpdfsdk/fpdf_text.cpp
index 9c16958..03ef0d3 100644
--- a/fpdfsdk/fpdf_text.cpp
+++ b/fpdfsdk/fpdf_text.cpp
@@ -50,7 +50,6 @@
   CPDF_ViewerPreferences viewRef(pPDFPage->GetDocument());
   auto textpage =
       pdfium::MakeUnique<CPDF_TextPage>(pPDFPage, viewRef.IsDirectionR2L());
-  textpage->ParseTextPage();
 
   // Caller takes ownership.
   return FPDFTextPageFromCPDFTextPage(textpage.release());