| // Copyright 2016 PDFium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
| |
| #include "core/fpdftext/cpdf_textpagefind.h" |
| |
| #include <cwchar> |
| #include <cwctype> |
| #include <vector> |
| |
| #include "core/fpdftext/cpdf_textpage.h" |
| #include "core/fxcrt/fx_extension.h" |
| #include "core/fxcrt/fx_string.h" |
| #include "core/fxcrt/fx_system.h" |
| #include "third_party/base/check.h" |
| #include "third_party/base/ptr_util.h" |
| #include "third_party/base/stl_util.h" |
| |
| namespace { |
| |
// U+00A0 NO-BREAK SPACE. FindNext() treats it as a separator character in the
// same way as ' ', '\n' and '\r'.
constexpr wchar_t kNonBreakingSpace = 0x00A0;
| |
// Classifies |curChar| for the purpose of splitting a search string.
//
// Returns false for every code point below 255 and for the ranges listed in
// the table below; returns true for everything else. ExtractFindWhat() turns
// each character for which this returns true into a stand-alone search term,
// and FindNext() lets such a character sit directly next to the neighbouring
// term without whitespace in between.
bool IsIgnoreSpaceCharacter(wchar_t curChar) {
  struct CodePointRange {
    wchar_t first;
    wchar_t last;  // Inclusive.
  };
  static constexpr CodePointRange kExemptRanges[] = {
      {0x0600, 0x06FF},  // Arabic
      {0xFE70, 0xFEFF},  // Arabic Presentation Forms-B
      {0xFB50, 0xFDFF},  // Arabic Presentation Forms-A
      {0x0400, 0x04FF},  // Cyrillic
      {0x0500, 0x052F},  // Cyrillic Supplement
      {0xA640, 0xA69F},  // Cyrillic Extended-B
      {0x2DE0, 0x2DFF},  // Cyrillic Extended-A
      {0x2113, 0x2113},  // SCRIPT SMALL L (decimal 8467)
      {0x2000, 0x206F},  // General Punctuation
  };

  // Note the strict comparison: code point 255 itself is not exempt.
  if (curChar < 255)
    return false;

  for (const CodePointRange& range : kExemptRanges) {
    if (curChar >= range.first && curChar <= range.last)
      return false;
  }
  return true;
}
| |
| bool IsMatchWholeWord(const WideString& csPageText, |
| size_t startPos, |
| size_t endPos) { |
| if (startPos > endPos) |
| return false; |
| wchar_t char_left = 0; |
| wchar_t char_right = 0; |
| size_t char_count = endPos - startPos + 1; |
| if (char_count == 0) |
| return false; |
| if (char_count == 1 && csPageText[startPos] > 255) |
| return true; |
| if (startPos >= 1) |
| char_left = csPageText[startPos - 1]; |
| if (startPos + char_count < csPageText.GetLength()) |
| char_right = csPageText[startPos + char_count]; |
| if ((char_left > 'A' && char_left < 'a') || |
| (char_left > 'a' && char_left < 'z') || |
| (char_left > 0xfb00 && char_left < 0xfb06) || |
| FXSYS_IsDecimalDigit(char_left) || |
| (char_right > 'A' && char_right < 'a') || |
| (char_right > 'a' && char_right < 'z') || |
| (char_right > 0xfb00 && char_right < 0xfb06) || |
| FXSYS_IsDecimalDigit(char_right)) { |
| return false; |
| } |
| if (!(('A' > char_left || char_left > 'Z') && |
| ('a' > char_left || char_left > 'z') && |
| ('A' > char_right || char_right > 'Z') && |
| ('a' > char_right || char_right > 'z'))) { |
| return false; |
| } |
| if (char_count > 0) { |
| if (FXSYS_IsDecimalDigit(char_left) && |
| FXSYS_IsDecimalDigit(csPageText[startPos])) { |
| return false; |
| } |
| if (FXSYS_IsDecimalDigit(char_right) && |
| FXSYS_IsDecimalDigit(csPageText[endPos])) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| WideString GetStringCase(const WideString& wsOriginal, bool bMatchCase) { |
| if (bMatchCase) |
| return wsOriginal; |
| |
| WideString wsLower = wsOriginal; |
| wsLower.MakeLower(); |
| return wsLower; |
| } |
| |
| Optional<WideString> ExtractSubString(const wchar_t* lpszFullString, |
| int iSubString) { |
| DCHECK(lpszFullString); |
| |
| while (iSubString--) { |
| lpszFullString = std::wcschr(lpszFullString, L' '); |
| if (!lpszFullString) |
| return {}; |
| |
| lpszFullString++; |
| while (*lpszFullString == L' ') |
| lpszFullString++; |
| } |
| |
| const wchar_t* lpchEnd = std::wcschr(lpszFullString, L' '); |
| int nLen = lpchEnd ? static_cast<int>(lpchEnd - lpszFullString) |
| : static_cast<int>(wcslen(lpszFullString)); |
| if (nLen < 0) |
| return {}; |
| |
| return WideString(lpszFullString, static_cast<size_t>(nLen)); |
| } |
| |
| std::vector<WideString> ExtractFindWhat(const WideString& findwhat) { |
| std::vector<WideString> findwhat_array; |
| |
| size_t len = findwhat.GetLength(); |
| size_t i = 0; |
| for (i = 0; i < len; ++i) |
| if (findwhat[i] != ' ') |
| break; |
| if (i == len) { |
| findwhat_array.push_back(findwhat); |
| return findwhat_array; |
| } |
| |
| int index = 0; |
| while (1) { |
| Optional<WideString> word = ExtractSubString(findwhat.c_str(), index); |
| if (!word) |
| break; |
| |
| if (word->IsEmpty()) { |
| findwhat_array.push_back(L""); |
| index++; |
| continue; |
| } |
| |
| size_t pos = 0; |
| while (pos < word->GetLength()) { |
| WideString curStr = word->Substr(pos, 1); |
| wchar_t curChar = (*word)[pos]; |
| if (IsIgnoreSpaceCharacter(curChar)) { |
| if (pos > 0 && curChar == 0x2019) { |
| pos++; |
| continue; |
| } |
| if (pos > 0) |
| findwhat_array.push_back(word->First(pos)); |
| findwhat_array.push_back(curStr); |
| if (pos == word->GetLength() - 1) { |
| word->clear(); |
| break; |
| } |
| word.emplace(word->Last(word->GetLength() - pos - 1)); |
| pos = 0; |
| continue; |
| } |
| pos++; |
| } |
| |
| if (!word->IsEmpty()) |
| findwhat_array.push_back(word.value()); |
| index++; |
| } |
| return findwhat_array; |
| } |
| |
| } // namespace |
| |
| // static |
| std::unique_ptr<CPDF_TextPageFind> CPDF_TextPageFind::Create( |
| const CPDF_TextPage* pTextPage, |
| const WideString& findwhat, |
| const Options& options, |
| Optional<size_t> startPos) { |
| std::vector<WideString> findwhat_array = |
| ExtractFindWhat(GetStringCase(findwhat, options.bMatchCase)); |
| auto find = pdfium::WrapUnique( |
| new CPDF_TextPageFind(pTextPage, findwhat_array, options, startPos)); |
| find->FindFirst(); |
| return find; |
| } |
| |
| CPDF_TextPageFind::CPDF_TextPageFind( |
| const CPDF_TextPage* pTextPage, |
| const std::vector<WideString>& findwhat_array, |
| const Options& options, |
| Optional<size_t> startPos) |
| : m_pTextPage(pTextPage), |
| m_strText(GetStringCase(pTextPage->GetAllPageText(), options.bMatchCase)), |
| m_csFindWhatArray(findwhat_array), |
| m_options(options) { |
| if (!m_strText.IsEmpty()) { |
| m_findNextStart = startPos; |
| m_findPreStart = startPos.value_or(m_strText.GetLength() - 1); |
| } |
| } |
| |
| CPDF_TextPageFind::~CPDF_TextPageFind() = default; |
| |
| int CPDF_TextPageFind::GetCharIndex(int index) const { |
| return m_pTextPage->CharIndexFromTextIndex(index); |
| } |
| |
| bool CPDF_TextPageFind::FindFirst() { |
| return m_strText.IsEmpty() || !m_csFindWhatArray.empty(); |
| } |
| |
| bool CPDF_TextPageFind::FindNext() { |
| if (m_strText.IsEmpty() || !m_findNextStart.has_value()) |
| return false; |
| |
| size_t strLen = m_strText.GetLength(); |
| if (m_findNextStart.value() > strLen - 1) |
| return false; |
| |
| int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray); |
| Optional<size_t> nResultPos = 0; |
| size_t nStartPos = m_findNextStart.value(); |
| bool bSpaceStart = false; |
| for (int iWord = 0; iWord < nCount; iWord++) { |
| WideString csWord = m_csFindWhatArray[iWord]; |
| if (csWord.IsEmpty()) { |
| if (iWord == nCount - 1) { |
| wchar_t strInsert = m_strText[nStartPos]; |
| if (strInsert == L'\n' || strInsert == L' ' || strInsert == L'\r' || |
| strInsert == kNonBreakingSpace) { |
| nResultPos = nStartPos + 1; |
| break; |
| } |
| iWord = -1; |
| } else if (iWord == 0) { |
| bSpaceStart = true; |
| } |
| continue; |
| } |
| nResultPos = m_strText.Find(csWord.AsStringView(), nStartPos); |
| if (!nResultPos.has_value()) |
| return false; |
| |
| size_t endIndex = nResultPos.value() + csWord.GetLength() - 1; |
| if (iWord == 0) |
| m_resStart = nResultPos.value(); |
| bool bMatch = true; |
| if (iWord != 0 && !bSpaceStart) { |
| size_t PreResEndPos = nStartPos; |
| int curChar = csWord[0]; |
| WideString lastWord = m_csFindWhatArray[iWord - 1]; |
| int lastChar = lastWord.Back(); |
| if (nStartPos == nResultPos.value() && |
| !(IsIgnoreSpaceCharacter(lastChar) || |
| IsIgnoreSpaceCharacter(curChar))) { |
| bMatch = false; |
| } |
| for (size_t d = PreResEndPos; d < nResultPos.value(); d++) { |
| wchar_t strInsert = m_strText[d]; |
| if (strInsert != L'\n' && strInsert != L' ' && strInsert != L'\r' && |
| strInsert != kNonBreakingSpace) { |
| bMatch = false; |
| break; |
| } |
| } |
| } else if (bSpaceStart) { |
| if (nResultPos.value() > 0) { |
| wchar_t strInsert = m_strText[nResultPos.value() - 1]; |
| if (strInsert != L'\n' && strInsert != L' ' && strInsert != L'\r' && |
| strInsert != kNonBreakingSpace) { |
| bMatch = false; |
| m_resStart = nResultPos.value(); |
| } else { |
| m_resStart = nResultPos.value() - 1; |
| } |
| } |
| } |
| if (m_options.bMatchWholeWord && bMatch) |
| bMatch = IsMatchWholeWord(m_strText, nResultPos.value(), endIndex); |
| |
| nStartPos = endIndex + 1; |
| if (!bMatch) { |
| iWord = -1; |
| size_t index = bSpaceStart ? 1 : 0; |
| nStartPos = m_resStart + m_csFindWhatArray[index].GetLength(); |
| } |
| } |
| m_resEnd = nResultPos.value() + m_csFindWhatArray.back().GetLength() - 1; |
| if (m_options.bConsecutive) { |
| m_findNextStart = m_resStart + 1; |
| m_findPreStart = m_resEnd - 1; |
| } else { |
| m_findNextStart = m_resEnd + 1; |
| m_findPreStart = m_resStart - 1; |
| } |
| return true; |
| } |
| |
| bool CPDF_TextPageFind::FindPrev() { |
| if (m_strText.IsEmpty() || !m_findPreStart.has_value()) |
| return false; |
| |
| CPDF_TextPageFind find_engine(m_pTextPage.Get(), m_csFindWhatArray, m_options, |
| 0); |
| if (!find_engine.FindFirst()) |
| return false; |
| |
| int order = -1; |
| int matches = 0; |
| while (find_engine.FindNext()) { |
| int cur_order = find_engine.GetCurOrder(); |
| int cur_match = find_engine.GetMatchedCount(); |
| int temp = cur_order + cur_match; |
| if (temp < 0 || static_cast<size_t>(temp) > m_findPreStart.value() + 1) |
| break; |
| |
| order = cur_order; |
| matches = cur_match; |
| } |
| if (order == -1) |
| return false; |
| |
| m_resStart = m_pTextPage->TextIndexFromCharIndex(order); |
| m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + matches - 1); |
| if (m_options.bConsecutive) { |
| m_findNextStart = m_resStart + 1; |
| m_findPreStart = m_resEnd - 1; |
| } else { |
| m_findNextStart = m_resEnd + 1; |
| m_findPreStart = m_resStart - 1; |
| } |
| return true; |
| } |
| |
| int CPDF_TextPageFind::GetCurOrder() const { |
| return GetCharIndex(m_resStart); |
| } |
| |
| int CPDF_TextPageFind::GetMatchedCount() const { |
| int resStart = GetCharIndex(m_resStart); |
| int resEnd = GetCharIndex(m_resEnd); |
| return resEnd - resStart + 1; |
| } |