| // Copyright 2016 PDFium Authors. All rights reserved. | 
 | // Use of this source code is governed by a BSD-style license that can be | 
 | // found in the LICENSE file. | 
 |  | 
 | // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com | 
 |  | 
 | #include "core/fpdftext/cpdf_textpagefind.h" | 
 |  | 
 | #include <wchar.h> | 
 |  | 
 | #include <vector> | 
 |  | 
 | #include "core/fpdftext/cpdf_textpage.h" | 
 | #include "core/fxcrt/fx_extension.h" | 
 | #include "core/fxcrt/fx_string.h" | 
 | #include "core/fxcrt/fx_system.h" | 
 | #include "core/fxcrt/fx_unicode.h" | 
 | #include "core/fxcrt/stl_util.h" | 
 | #include "third_party/base/check.h" | 
 | #include "third_party/base/ptr_util.h" | 
 |  | 
 | namespace { | 
 |  | 
 | constexpr wchar_t kNonBreakingSpace = 160; | 
 |  | 
 | bool IsIgnoreSpaceCharacter(wchar_t curChar) { | 
 |   if (curChar < 255 || (curChar >= 0x0600 && curChar <= 0x06FF) || | 
 |       (curChar >= 0xFE70 && curChar <= 0xFEFF) || | 
 |       (curChar >= 0xFB50 && curChar <= 0xFDFF) || | 
 |       (curChar >= 0x0400 && curChar <= 0x04FF) || | 
 |       (curChar >= 0x0500 && curChar <= 0x052F) || | 
 |       (curChar >= 0xA640 && curChar <= 0xA69F) || | 
 |       (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 || | 
 |       (curChar >= 0x2000 && curChar <= 0x206F)) { | 
 |     return false; | 
 |   } | 
 |   return true; | 
 | } | 
 |  | 
 | bool IsMatchWholeWord(const WideString& csPageText, | 
 |                       size_t startPos, | 
 |                       size_t endPos) { | 
 |   if (startPos > endPos) | 
 |     return false; | 
 |   wchar_t char_left = 0; | 
 |   wchar_t char_right = 0; | 
 |   size_t char_count = endPos - startPos + 1; | 
 |   if (char_count == 0) | 
 |     return false; | 
 |   if (char_count == 1 && csPageText[startPos] > 255) | 
 |     return true; | 
 |   if (startPos >= 1) | 
 |     char_left = csPageText[startPos - 1]; | 
 |   if (startPos + char_count < csPageText.GetLength()) | 
 |     char_right = csPageText[startPos + char_count]; | 
 |   if ((char_left > 'A' && char_left < 'a') || | 
 |       (char_left > 'a' && char_left < 'z') || | 
 |       (char_left > 0xfb00 && char_left < 0xfb06) || | 
 |       FXSYS_IsDecimalDigit(char_left) || | 
 |       (char_right > 'A' && char_right < 'a') || | 
 |       (char_right > 'a' && char_right < 'z') || | 
 |       (char_right > 0xfb00 && char_right < 0xfb06) || | 
 |       FXSYS_IsDecimalDigit(char_right)) { | 
 |     return false; | 
 |   } | 
 |   if (!(('A' > char_left || char_left > 'Z') && | 
 |         ('a' > char_left || char_left > 'z') && | 
 |         ('A' > char_right || char_right > 'Z') && | 
 |         ('a' > char_right || char_right > 'z'))) { | 
 |     return false; | 
 |   } | 
 |   if (char_count > 0) { | 
 |     if (FXSYS_IsDecimalDigit(char_left) && | 
 |         FXSYS_IsDecimalDigit(csPageText[startPos])) { | 
 |       return false; | 
 |     } | 
 |     if (FXSYS_IsDecimalDigit(char_right) && | 
 |         FXSYS_IsDecimalDigit(csPageText[endPos])) { | 
 |       return false; | 
 |     } | 
 |   } | 
 |   return true; | 
 | } | 
 |  | 
 | WideString GetStringCase(const WideString& wsOriginal, bool bMatchCase) { | 
 |   if (bMatchCase) | 
 |     return wsOriginal; | 
 |  | 
 |   WideString wsLower = wsOriginal; | 
 |   wsLower.MakeLower(); | 
 |   return wsLower; | 
 | } | 
 |  | 
 | absl::optional<WideString> ExtractSubString(const wchar_t* lpszFullString, | 
 |                                             int iSubString) { | 
 |   DCHECK(lpszFullString); | 
 |  | 
 |   while (iSubString--) { | 
 |     lpszFullString = wcschr(lpszFullString, L' '); | 
 |     if (!lpszFullString) | 
 |       return absl::nullopt; | 
 |  | 
 |     lpszFullString++; | 
 |     while (*lpszFullString == L' ') | 
 |       lpszFullString++; | 
 |   } | 
 |  | 
 |   const wchar_t* lpchEnd = wcschr(lpszFullString, L' '); | 
 |   int nLen = lpchEnd ? static_cast<int>(lpchEnd - lpszFullString) | 
 |                      : static_cast<int>(wcslen(lpszFullString)); | 
 |   if (nLen < 0) | 
 |     return absl::nullopt; | 
 |  | 
 |   return WideString(lpszFullString, static_cast<size_t>(nLen)); | 
 | } | 
 |  | 
 | std::vector<WideString> ExtractFindWhat(const WideString& findwhat) { | 
 |   std::vector<WideString> findwhat_array; | 
 |  | 
 |   size_t len = findwhat.GetLength(); | 
 |   size_t i = 0; | 
 |   for (i = 0; i < len; ++i) | 
 |     if (findwhat[i] != ' ') | 
 |       break; | 
 |   if (i == len) { | 
 |     findwhat_array.push_back(findwhat); | 
 |     return findwhat_array; | 
 |   } | 
 |  | 
 |   int index = 0; | 
 |   while (1) { | 
 |     absl::optional<WideString> word = ExtractSubString(findwhat.c_str(), index); | 
 |     if (!word.has_value()) | 
 |       break; | 
 |  | 
 |     if (word->IsEmpty()) { | 
 |       findwhat_array.push_back(L""); | 
 |       index++; | 
 |       continue; | 
 |     } | 
 |  | 
 |     size_t pos = 0; | 
 |     while (pos < word->GetLength()) { | 
 |       WideString curStr = word->Substr(pos, 1); | 
 |       wchar_t curChar = word.value()[pos]; | 
 |       if (IsIgnoreSpaceCharacter(curChar)) { | 
 |         if (pos > 0 && curChar == pdfium::unicode::kRightSingleQuotationMark) { | 
 |           pos++; | 
 |           continue; | 
 |         } | 
 |         if (pos > 0) | 
 |           findwhat_array.push_back(word->First(pos)); | 
 |         findwhat_array.push_back(curStr); | 
 |         if (pos == word->GetLength() - 1) { | 
 |           word->clear(); | 
 |           break; | 
 |         } | 
 |         word.emplace(word->Last(word->GetLength() - pos - 1)); | 
 |         pos = 0; | 
 |         continue; | 
 |       } | 
 |       pos++; | 
 |     } | 
 |  | 
 |     if (!word->IsEmpty()) | 
 |       findwhat_array.push_back(word.value()); | 
 |     index++; | 
 |   } | 
 |   return findwhat_array; | 
 | } | 
 |  | 
 | }  // namespace | 
 |  | 
 | // static | 
 | std::unique_ptr<CPDF_TextPageFind> CPDF_TextPageFind::Create( | 
 |     const CPDF_TextPage* pTextPage, | 
 |     const WideString& findwhat, | 
 |     const Options& options, | 
 |     absl::optional<size_t> startPos) { | 
 |   std::vector<WideString> findwhat_array = | 
 |       ExtractFindWhat(GetStringCase(findwhat, options.bMatchCase)); | 
 |   auto find = pdfium::WrapUnique( | 
 |       new CPDF_TextPageFind(pTextPage, findwhat_array, options, startPos)); | 
 |   find->FindFirst(); | 
 |   return find; | 
 | } | 
 |  | 
 | CPDF_TextPageFind::CPDF_TextPageFind( | 
 |     const CPDF_TextPage* pTextPage, | 
 |     const std::vector<WideString>& findwhat_array, | 
 |     const Options& options, | 
 |     absl::optional<size_t> startPos) | 
 |     : m_pTextPage(pTextPage), | 
 |       m_strText(GetStringCase(pTextPage->GetAllPageText(), options.bMatchCase)), | 
 |       m_csFindWhatArray(findwhat_array), | 
 |       m_options(options) { | 
 |   if (!m_strText.IsEmpty()) { | 
 |     m_findNextStart = startPos; | 
 |     m_findPreStart = startPos.value_or(m_strText.GetLength() - 1); | 
 |   } | 
 | } | 
 |  | 
 | CPDF_TextPageFind::~CPDF_TextPageFind() = default; | 
 |  | 
 | int CPDF_TextPageFind::GetCharIndex(int index) const { | 
 |   return m_pTextPage->CharIndexFromTextIndex(index); | 
 | } | 
 |  | 
 | bool CPDF_TextPageFind::FindFirst() { | 
 |   return m_strText.IsEmpty() || !m_csFindWhatArray.empty(); | 
 | } | 
 |  | 
 | bool CPDF_TextPageFind::FindNext() { | 
 |   if (m_strText.IsEmpty() || !m_findNextStart.has_value()) | 
 |     return false; | 
 |  | 
 |   size_t strLen = m_strText.GetLength(); | 
 |   if (m_findNextStart.value() > strLen - 1) | 
 |     return false; | 
 |  | 
 |   int nCount = fxcrt::CollectionSize<int>(m_csFindWhatArray); | 
 |   absl::optional<size_t> nResultPos = 0; | 
 |   size_t nStartPos = m_findNextStart.value(); | 
 |   bool bSpaceStart = false; | 
 |   for (int iWord = 0; iWord < nCount; iWord++) { | 
 |     WideString csWord = m_csFindWhatArray[iWord]; | 
 |     if (csWord.IsEmpty()) { | 
 |       if (iWord == nCount - 1) { | 
 |         wchar_t strInsert = m_strText[nStartPos]; | 
 |         if (strInsert == L'\n' || strInsert == L' ' || strInsert == L'\r' || | 
 |             strInsert == kNonBreakingSpace) { | 
 |           nResultPos = nStartPos + 1; | 
 |           break; | 
 |         } | 
 |         iWord = -1; | 
 |       } else if (iWord == 0) { | 
 |         bSpaceStart = true; | 
 |       } | 
 |       continue; | 
 |     } | 
 |     nResultPos = m_strText.Find(csWord.AsStringView(), nStartPos); | 
 |     if (!nResultPos.has_value()) | 
 |       return false; | 
 |  | 
 |     size_t endIndex = nResultPos.value() + csWord.GetLength() - 1; | 
 |     if (iWord == 0) | 
 |       m_resStart = nResultPos.value(); | 
 |     bool bMatch = true; | 
 |     if (iWord != 0 && !bSpaceStart) { | 
 |       size_t PreResEndPos = nStartPos; | 
 |       int curChar = csWord[0]; | 
 |       WideString lastWord = m_csFindWhatArray[iWord - 1]; | 
 |       int lastChar = lastWord.Back(); | 
 |       if (nStartPos == nResultPos.value() && | 
 |           !(IsIgnoreSpaceCharacter(lastChar) || | 
 |             IsIgnoreSpaceCharacter(curChar))) { | 
 |         bMatch = false; | 
 |       } | 
 |       for (size_t d = PreResEndPos; d < nResultPos.value(); d++) { | 
 |         wchar_t strInsert = m_strText[d]; | 
 |         if (strInsert != L'\n' && strInsert != L' ' && strInsert != L'\r' && | 
 |             strInsert != kNonBreakingSpace) { | 
 |           bMatch = false; | 
 |           break; | 
 |         } | 
 |       } | 
 |     } else if (bSpaceStart) { | 
 |       if (nResultPos.value() > 0) { | 
 |         wchar_t strInsert = m_strText[nResultPos.value() - 1]; | 
 |         if (strInsert != L'\n' && strInsert != L' ' && strInsert != L'\r' && | 
 |             strInsert != kNonBreakingSpace) { | 
 |           bMatch = false; | 
 |           m_resStart = nResultPos.value(); | 
 |         } else { | 
 |           m_resStart = nResultPos.value() - 1; | 
 |         } | 
 |       } | 
 |     } | 
 |     if (m_options.bMatchWholeWord && bMatch) | 
 |       bMatch = IsMatchWholeWord(m_strText, nResultPos.value(), endIndex); | 
 |  | 
 |     nStartPos = endIndex + 1; | 
 |     if (!bMatch) { | 
 |       iWord = -1; | 
 |       size_t index = bSpaceStart ? 1 : 0; | 
 |       nStartPos = m_resStart + m_csFindWhatArray[index].GetLength(); | 
 |     } | 
 |   } | 
 |   m_resEnd = nResultPos.value() + m_csFindWhatArray.back().GetLength() - 1; | 
 |   if (m_options.bConsecutive) { | 
 |     m_findNextStart = m_resStart + 1; | 
 |     m_findPreStart = m_resEnd - 1; | 
 |   } else { | 
 |     m_findNextStart = m_resEnd + 1; | 
 |     m_findPreStart = m_resStart - 1; | 
 |   } | 
 |   return true; | 
 | } | 
 |  | 
 | bool CPDF_TextPageFind::FindPrev() { | 
 |   if (m_strText.IsEmpty() || !m_findPreStart.has_value()) | 
 |     return false; | 
 |  | 
 |   CPDF_TextPageFind find_engine(m_pTextPage.Get(), m_csFindWhatArray, m_options, | 
 |                                 0); | 
 |   if (!find_engine.FindFirst()) | 
 |     return false; | 
 |  | 
 |   int order = -1; | 
 |   int matches = 0; | 
 |   while (find_engine.FindNext()) { | 
 |     int cur_order = find_engine.GetCurOrder(); | 
 |     int cur_match = find_engine.GetMatchedCount(); | 
 |     int temp = cur_order + cur_match; | 
 |     if (temp < 0 || static_cast<size_t>(temp) > m_findPreStart.value() + 1) | 
 |       break; | 
 |  | 
 |     order = cur_order; | 
 |     matches = cur_match; | 
 |   } | 
 |   if (order == -1) | 
 |     return false; | 
 |  | 
 |   m_resStart = m_pTextPage->TextIndexFromCharIndex(order); | 
 |   m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + matches - 1); | 
 |   if (m_options.bConsecutive) { | 
 |     m_findNextStart = m_resStart + 1; | 
 |     m_findPreStart = m_resEnd - 1; | 
 |   } else { | 
 |     m_findNextStart = m_resEnd + 1; | 
 |     m_findPreStart = m_resStart - 1; | 
 |   } | 
 |   return true; | 
 | } | 
 |  | 
 | int CPDF_TextPageFind::GetCurOrder() const { | 
 |   return GetCharIndex(m_resStart); | 
 | } | 
 |  | 
 | int CPDF_TextPageFind::GetMatchedCount() const { | 
 |   int resStart = GetCharIndex(m_resStart); | 
 |   int resEnd = GetCharIndex(m_resEnd); | 
 |   return resEnd - resStart + 1; | 
 | } |