Split CPDF_TextPage::ProcessTextObject() into smaller pieces

Add ProcessGenerateCharacter() and ProcessTextObjectItems() helper
methods for ProcessTextObject() to call. This gets ProcessTextObject()
down to a more manageable size.

Along the way, move some variables closer to where they are used.

Change-Id: I31f6b5092b0879ad8a5a9c38398476bff0303c23
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/124750
Reviewed-by: Tom Sepez <tsepez@google.com>
Commit-Queue: Lei Zhang <thestig@chromium.org>
Reviewed-by: Tom Sepez <tsepez@chromium.org>
diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp
index e631737..a2744a2 100644
--- a/core/fpdftext/cpdf_textpage.cpp
+++ b/core/fpdftext/cpdf_textpage.cpp
@@ -30,6 +30,7 @@
 #include "core/fxcrt/fx_bidi.h"
 #include "core/fxcrt/fx_extension.h"
 #include "core/fxcrt/fx_unicode.h"
+#include "core/fxcrt/notreached.h"
 #include "core/fxcrt/span.h"
 #include "core/fxcrt/stl_util.h"
 
@@ -954,69 +955,29 @@
 }
 
 void CPDF_TextPage::ProcessTextObject(const TransformedTextObject& obj) {
-  CPDF_TextObject* pTextObj = obj.m_pTextObj;
-  if (fabs(pTextObj->GetRect().Width()) < kSizeEpsilon)
+  CPDF_TextObject* const pTextObj = obj.m_pTextObj;
+  if (fabs(pTextObj->GetRect().Width()) < kSizeEpsilon) {
     return;
+  }
 
-  CFX_Matrix form_matrix = obj.m_formMatrix;
-  RetainPtr<CPDF_Font> pFont = pTextObj->GetFont();
-  CFX_Matrix matrix = pTextObj->GetTextMatrix() * form_matrix;
-  MarkedContentState ePreMKC = PreMarkedContent(obj.m_pTextObj);
+  const CFX_Matrix form_matrix = obj.m_formMatrix;
+  const MarkedContentState ePreMKC = PreMarkedContent(pTextObj);
   if (ePreMKC == MarkedContentState::kDone) {
     m_pPrevTextObj = pTextObj;
     m_PrevMatrix = form_matrix;
     return;
   }
-  GenerateCharacter result = GenerateCharacter::kNone;
-  if (m_pPrevTextObj) {
-    result = ProcessInsertObject(pTextObj, form_matrix);
-    if (result == GenerateCharacter::kLineBreak)
-      m_CurlineRect = pTextObj->GetRect();
-    else
-      m_CurlineRect.Union(obj.m_pTextObj->GetRect());
 
-    switch (result) {
-      case GenerateCharacter::kNone:
-        break;
-      case GenerateCharacter::kSpace: {
-        std::optional<CharInfo> pGenerateChar = GenerateCharInfo(L' ');
-        if (pGenerateChar.has_value()) {
-          if (!form_matrix.IsIdentity())
-            pGenerateChar->set_matrix(form_matrix);
-          m_TempTextBuf.AppendChar(L' ');
-          m_TempCharList.push_back(pGenerateChar.value());
-        }
-        break;
-      }
-      case GenerateCharacter::kLineBreak:
-        CloseTempLine();
-        if (m_TextBuf.GetSize()) {
-          AppendGeneratedCharacter(L'\r', form_matrix);
-          AppendGeneratedCharacter(L'\n', form_matrix);
-        }
-        break;
-      case GenerateCharacter::kHyphen:
-        if (pTextObj->CountChars() == 1) {
-          CPDF_TextObject::Item item = pTextObj->GetCharInfo(0);
-          WideString wstrItem =
-              pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
-          if (wstrItem.IsEmpty())
-            wstrItem += (wchar_t)item.m_CharCode;
-          wchar_t curChar = wstrItem[0];
-          if (IsHyphenCode(curChar))
-            return;
-        }
-        while (m_TempTextBuf.GetSize() > 0 &&
-               m_TempTextBuf.AsStringView().Back() == 0x20) {
-          m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
-          m_TempCharList.pop_back();
-        }
-        CharInfo* charinfo = &m_TempCharList.back();
-        m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
-        charinfo->set_unicode(0x2);
-        charinfo->set_char_type(CPDF_TextPage::CharType::kHyphen);
-        m_TempTextBuf.AppendChar(0xfffe);
-        break;
+  if (m_pPrevTextObj) {
+    GenerateCharacter type = ProcessInsertObject(pTextObj, form_matrix);
+    if (type == GenerateCharacter::kLineBreak) {
+      m_CurlineRect = pTextObj->GetRect();
+    } else {
+      m_CurlineRect.Union(pTextObj->GetRect());
+    }
+
+    if (!ProcessGenerateCharacter(type, pTextObj, form_matrix)) {
+      return;
     }
   } else {
     m_CurlineRect = pTextObj->GetRect();
@@ -1028,143 +989,21 @@
     m_PrevMatrix = form_matrix;
     return;
   }
+
   m_pPrevTextObj = pTextObj;
   m_PrevMatrix = form_matrix;
-  float baseSpace = CalculateBaseSpace(pTextObj, matrix);
 
-  const bool bR2L = IsRightToLeft(*pTextObj, *pFont);
+  const bool bR2L = IsRightToLeft(*pTextObj, *pTextObj->GetFont());
+  const CFX_Matrix matrix = pTextObj->GetTextMatrix() * form_matrix;
   const bool bIsBidiAndMirrorInverse =
       bR2L && (matrix.a * matrix.d - matrix.b * matrix.c) < 0;
   const size_t iBufStartAppend = m_TempTextBuf.GetLength();
   const size_t iCharListStartAppend = m_TempCharList.size();
 
-  float spacing = 0;
-  const size_t nItems = pTextObj->CountItems();
-  for (size_t i = 0; i < nItems; ++i) {
-    CharInfo charinfo;
-    CPDF_TextObject::Item item = pTextObj->GetItemInfo(i);
-    if (item.m_CharCode == 0xffffffff) {
-      WideString str = m_TempTextBuf.MakeString();
-      if (str.IsEmpty())
-        str = m_TextBuf.AsStringView();
-      if (str.IsEmpty() || str.Back() == L' ')
-        continue;
-
-      float fontsize_h = pTextObj->text_state().GetFontSizeH();
-      spacing = -fontsize_h * item.m_Origin.x / 1000;
-      continue;
-    }
-    float charSpace = pTextObj->text_state().GetCharSpace();
-    if (charSpace > 0.001)
-      spacing += matrix.TransformDistance(charSpace);
-    else if (charSpace < -0.001)
-      spacing -= matrix.TransformDistance(fabs(charSpace));
-    spacing -= baseSpace;
-    if (spacing && i > 0) {
-      float fontsize_h = pTextObj->text_state().GetFontSizeH();
-      uint32_t space_charcode = pFont->CharCodeFromUnicode(' ');
-      float threshold = 0;
-      if (space_charcode != CPDF_Font::kInvalidCharCode)
-        threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
-      if (threshold > fontsize_h / 3)
-        threshold = 0;
-      else
-        threshold /= 2;
-      if (threshold == 0) {
-        threshold = GetCharWidth(item.m_CharCode, pFont.Get());
-        threshold = NormalizeThreshold(threshold, 300, 500, 700);
-        threshold = fontsize_h * threshold / 1000;
-      }
-      if (threshold && (spacing && spacing >= threshold)) {
-        charinfo.set_unicode(L' ');
-        charinfo.set_char_type(CPDF_TextPage::CharType::kGenerated);
-        charinfo.set_text_object(pTextObj);
-        charinfo.set_char_code(CPDF_Font::kInvalidCharCode);
-        charinfo.set_origin(matrix.Transform(item.m_Origin));
-        charinfo.set_char_box(
-            CFX_FloatRect(charinfo.origin().x, charinfo.origin().y,
-                          charinfo.origin().x, charinfo.origin().y));
-        charinfo.set_matrix(form_matrix);
-        m_TempTextBuf.AppendChar(L' ');
-        m_TempCharList.push_back(charinfo);
-      }
-      if (item.m_CharCode == CPDF_Font::kInvalidCharCode)
-        continue;
-    }
-    spacing = 0;
-    WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
-    bool bNoUnicode = false;
-    if (wstrItem.IsEmpty() && item.m_CharCode) {
-      wstrItem += static_cast<wchar_t>(item.m_CharCode);
-      bNoUnicode = true;
-    }
-    charinfo.set_char_type(bNoUnicode ? CPDF_TextPage::CharType::kNotUnicode
-                                      : CPDF_TextPage::CharType::kNormal);
-    charinfo.set_char_code(item.m_CharCode);
-    charinfo.set_text_object(pTextObj);
-    charinfo.set_origin(matrix.Transform(item.m_Origin));
-
-    const FX_RECT rect =
-        charinfo.text_object()->GetFont()->GetCharBBox(charinfo.char_code());
-    const float fFontSize = pTextObj->GetFontSize() / 1000;
-    CFX_FloatRect char_box(rect.left * fFontSize + item.m_Origin.x,
-                           rect.bottom * fFontSize + item.m_Origin.y,
-                           rect.right * fFontSize + item.m_Origin.x,
-                           rect.top * fFontSize + item.m_Origin.y);
-    if (fabsf(char_box.top - char_box.bottom) < kSizeEpsilon) {
-      char_box.top = char_box.bottom + fFontSize;
-    }
-    if (fabsf(char_box.right - char_box.left) < kSizeEpsilon) {
-      char_box.right =
-          char_box.left + pTextObj->GetCharWidth(charinfo.char_code());
-    }
-    char_box = matrix.TransformRect(char_box);
-    charinfo.set_char_box(char_box);
-    charinfo.set_matrix(matrix);
-    if (wstrItem.IsEmpty()) {
-      charinfo.set_unicode(0);
-      m_TempCharList.push_back(charinfo);
-      m_TempTextBuf.AppendChar(0xfffe);
-      continue;
-    }
-    size_t nTotal = wstrItem.GetLength();
-    bool bDel = false;
-    const int count = std::min(fxcrt::CollectionSize<int>(m_TempCharList), 7);
-    constexpr float kTextCharRatioGapDelta = 0.07f;
-    float threshold = charinfo.matrix().TransformXDistance(
-        kTextCharRatioGapDelta * pTextObj->GetFontSize());
-    for (int n = fxcrt::CollectionSize<int>(m_TempCharList);
-         n > fxcrt::CollectionSize<int>(m_TempCharList) - count; --n) {
-      const CharInfo& charinfo1 = m_TempCharList[n - 1];
-      CFX_PointF diff = charinfo1.origin() - charinfo.origin();
-      if (charinfo1.char_code() == charinfo.char_code() &&
-          charinfo1.text_object()->GetFont() ==
-              charinfo.text_object()->GetFont() &&
-          fabs(diff.x) < threshold && fabs(diff.y) < threshold) {
-        bDel = true;
-        break;
-      }
-    }
-    if (!bDel) {
-      for (size_t nIndex = 0; nIndex < nTotal; ++nIndex) {
-        charinfo.set_unicode(wstrItem[nIndex]);
-        if (charinfo.unicode()) {
-          m_TempTextBuf.AppendChar(charinfo.unicode());
-        } else {
-          m_TempTextBuf.AppendChar(0xfffe);
-        }
-        m_TempCharList.push_back(charinfo);
-      }
-    } else if (i == 0) {
-      WideString str = m_TempTextBuf.MakeString();
-      if (!str.IsEmpty() && str.Back() == L' ') {
-        m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
-        m_TempCharList.pop_back();
-      }
-    }
-  }
-  if (bIsBidiAndMirrorInverse)
+  ProcessTextObjectItems(pTextObj, form_matrix, matrix);
+  if (bIsBidiAndMirrorInverse) {
     SwapTempTextBuf(iCharListStartAppend, iBufStartAppend);
+  }
 }
 
 CPDF_TextPage::TextOrientation CPDF_TextPage::GetTextObjectWritingMode(
@@ -1347,6 +1186,197 @@
              : GenerateCharacter::kNone;
 }
 
+bool CPDF_TextPage::ProcessGenerateCharacter(GenerateCharacter type,
+                                             const CPDF_TextObject* text_object,
+                                             const CFX_Matrix& form_matrix) {
+  switch (type) {
+    case GenerateCharacter::kNone:
+      return true;
+    case GenerateCharacter::kSpace: {
+      std::optional<CharInfo> pGenerateChar = GenerateCharInfo(L' ');
+      if (pGenerateChar.has_value()) {
+        if (!form_matrix.IsIdentity()) {
+          pGenerateChar->set_matrix(form_matrix);
+        }
+        m_TempTextBuf.AppendChar(L' ');
+        m_TempCharList.push_back(pGenerateChar.value());
+      }
+      return true;
+    }
+    case GenerateCharacter::kLineBreak:
+      CloseTempLine();
+      if (m_TextBuf.GetSize()) {
+        AppendGeneratedCharacter(L'\r', form_matrix);
+        AppendGeneratedCharacter(L'\n', form_matrix);
+      }
+      return true;
+    case GenerateCharacter::kHyphen:
+      if (text_object->CountChars() == 1) {
+        CPDF_TextObject::Item item = text_object->GetCharInfo(0);
+        WideString wstrItem =
+            text_object->GetFont()->UnicodeFromCharCode(item.m_CharCode);
+        if (wstrItem.IsEmpty()) {
+          wstrItem += static_cast<wchar_t>(item.m_CharCode);
+        }
+        wchar_t curChar = wstrItem[0];
+        if (IsHyphenCode(curChar)) {
+          return false;
+        }
+      }
+      while (m_TempTextBuf.GetSize() > 0 &&
+             m_TempTextBuf.AsStringView().Back() == 0x20) {
+        m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
+        m_TempCharList.pop_back();
+      }
+      CharInfo* charinfo = &m_TempCharList.back();
+      m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
+      charinfo->set_unicode(0x2);
+      charinfo->set_char_type(CPDF_TextPage::CharType::kHyphen);
+      m_TempTextBuf.AppendChar(0xfffe);
+      return true;
+  }
+  NOTREACHED_NORETURN();
+}
+
+void CPDF_TextPage::ProcessTextObjectItems(CPDF_TextObject* text_object,
+                                           const CFX_Matrix& form_matrix,
+                                           const CFX_Matrix& matrix) {
+  const float base_space = CalculateBaseSpace(text_object, matrix);
+  RetainPtr<CPDF_Font> const font = text_object->GetFont();
+
+  float spacing = 0;
+  const size_t nItems = text_object->CountItems();
+  for (size_t i = 0; i < nItems; ++i) {
+    CharInfo charinfo;
+    CPDF_TextObject::Item item = text_object->GetItemInfo(i);
+    if (item.m_CharCode == 0xffffffff) {
+      WideString str = m_TempTextBuf.MakeString();
+      if (str.IsEmpty()) {
+        str = m_TextBuf.AsStringView();
+      }
+      if (str.IsEmpty() || str.Back() == L' ') {
+        continue;
+      }
+
+      float fontsize_h = text_object->text_state().GetFontSizeH();
+      spacing = -fontsize_h * item.m_Origin.x / 1000;
+      continue;
+    }
+    float charSpace = text_object->text_state().GetCharSpace();
+    if (charSpace > 0.001) {
+      spacing += matrix.TransformDistance(charSpace);
+    } else if (charSpace < -0.001) {
+      spacing -= matrix.TransformDistance(fabs(charSpace));
+    }
+    spacing -= base_space;
+    if (spacing && i > 0) {
+      float fontsize_h = text_object->text_state().GetFontSizeH();
+      uint32_t space_charcode = font->CharCodeFromUnicode(' ');
+      float threshold = 0;
+      if (space_charcode != CPDF_Font::kInvalidCharCode) {
+        threshold = fontsize_h * font->GetCharWidthF(space_charcode) / 1000;
+      }
+      if (threshold > fontsize_h / 3) {
+        threshold = 0;
+      } else {
+        threshold /= 2;
+      }
+      if (threshold == 0) {
+        threshold = GetCharWidth(item.m_CharCode, font.Get());
+        threshold = NormalizeThreshold(threshold, 300, 500, 700);
+        threshold = fontsize_h * threshold / 1000;
+      }
+      if (threshold && (spacing && spacing >= threshold)) {
+        charinfo.set_unicode(L' ');
+        charinfo.set_char_type(CPDF_TextPage::CharType::kGenerated);
+        charinfo.set_text_object(text_object);
+        charinfo.set_char_code(CPDF_Font::kInvalidCharCode);
+        charinfo.set_origin(matrix.Transform(item.m_Origin));
+        charinfo.set_char_box(
+            CFX_FloatRect(charinfo.origin().x, charinfo.origin().y,
+                          charinfo.origin().x, charinfo.origin().y));
+        charinfo.set_matrix(form_matrix);
+        m_TempTextBuf.AppendChar(L' ');
+        m_TempCharList.push_back(charinfo);
+      }
+      if (item.m_CharCode == CPDF_Font::kInvalidCharCode) {
+        continue;
+      }
+    }
+    spacing = 0;
+    WideString wstrItem = font->UnicodeFromCharCode(item.m_CharCode);
+    bool bNoUnicode = false;
+    if (wstrItem.IsEmpty() && item.m_CharCode) {
+      wstrItem += static_cast<wchar_t>(item.m_CharCode);
+      bNoUnicode = true;
+    }
+    charinfo.set_char_type(bNoUnicode ? CPDF_TextPage::CharType::kNotUnicode
+                                      : CPDF_TextPage::CharType::kNormal);
+    charinfo.set_char_code(item.m_CharCode);
+    charinfo.set_text_object(text_object);
+    charinfo.set_origin(matrix.Transform(item.m_Origin));
+
+    const FX_RECT rect =
+        charinfo.text_object()->GetFont()->GetCharBBox(charinfo.char_code());
+    const float fFontSize = text_object->GetFontSize() / 1000;
+    CFX_FloatRect char_box(rect.left * fFontSize + item.m_Origin.x,
+                           rect.bottom * fFontSize + item.m_Origin.y,
+                           rect.right * fFontSize + item.m_Origin.x,
+                           rect.top * fFontSize + item.m_Origin.y);
+    if (fabsf(char_box.top - char_box.bottom) < kSizeEpsilon) {
+      char_box.top = char_box.bottom + fFontSize;
+    }
+    if (fabsf(char_box.right - char_box.left) < kSizeEpsilon) {
+      char_box.right =
+          char_box.left + text_object->GetCharWidth(charinfo.char_code());
+    }
+    char_box = matrix.TransformRect(char_box);
+    charinfo.set_char_box(char_box);
+    charinfo.set_matrix(matrix);
+    if (wstrItem.IsEmpty()) {
+      charinfo.set_unicode(0);
+      m_TempCharList.push_back(charinfo);
+      m_TempTextBuf.AppendChar(0xfffe);
+      continue;
+    }
+    size_t nTotal = wstrItem.GetLength();
+    bool bDel = false;
+    const int count = std::min(fxcrt::CollectionSize<int>(m_TempCharList), 7);
+    constexpr float kTextCharRatioGapDelta = 0.07f;
+    float threshold = charinfo.matrix().TransformXDistance(
+        kTextCharRatioGapDelta * text_object->GetFontSize());
+    for (int n = fxcrt::CollectionSize<int>(m_TempCharList);
+         n > fxcrt::CollectionSize<int>(m_TempCharList) - count; --n) {
+      const CharInfo& charinfo1 = m_TempCharList[n - 1];
+      CFX_PointF diff = charinfo1.origin() - charinfo.origin();
+      if (charinfo1.char_code() == charinfo.char_code() &&
+          charinfo1.text_object()->GetFont() ==
+              charinfo.text_object()->GetFont() &&
+          fabs(diff.x) < threshold && fabs(diff.y) < threshold) {
+        bDel = true;
+        break;
+      }
+    }
+    if (!bDel) {
+      for (size_t nIndex = 0; nIndex < nTotal; ++nIndex) {
+        charinfo.set_unicode(wstrItem[nIndex]);
+        if (charinfo.unicode()) {
+          m_TempTextBuf.AppendChar(charinfo.unicode());
+        } else {
+          m_TempTextBuf.AppendChar(0xfffe);
+        }
+        m_TempCharList.push_back(charinfo);
+      }
+    } else if (i == 0) {
+      WideString str = m_TempTextBuf.MakeString();
+      if (!str.IsEmpty() && str.Back() == L' ') {
+        m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
+        m_TempCharList.pop_back();
+      }
+    }
+  }
+}
+
 bool CPDF_TextPage::IsSameTextObject(CPDF_TextObject* pTextObj1,
                                      CPDF_TextObject* pTextObj2) const {
   if (!pTextObj1 || !pTextObj2)
diff --git a/core/fpdftext/cpdf_textpage.h b/core/fpdftext/cpdf_textpage.h
index 37cf789..35c486b 100644
--- a/core/fpdftext/cpdf_textpage.h
+++ b/core/fpdftext/cpdf_textpage.h
@@ -148,6 +148,13 @@
                          CPDF_PageObjectHolder::const_iterator ObjPos);
   GenerateCharacter ProcessInsertObject(const CPDF_TextObject* pObj,
                                         const CFX_Matrix& form_matrix);
+  // Returns whether to continue or not.
+  bool ProcessGenerateCharacter(GenerateCharacter type,
+                                const CPDF_TextObject* text_object,
+                                const CFX_Matrix& form_matrix);
+  void ProcessTextObjectItems(CPDF_TextObject* text_object,
+                              const CFX_Matrix& form_matrix,
+                              const CFX_Matrix& matrix);
   const CharInfo* GetPrevCharInfo() const;
   std::optional<CharInfo> GenerateCharInfo(wchar_t unicode);
   bool IsSameAsPreTextObject(CPDF_TextObject* pTextObj,