|  | // Copyright 2016 PDFium Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style license that can be | 
|  | // found in the LICENSE file. | 
|  |  | 
|  | // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com | 
|  |  | 
|  | #include "core/fpdftext/cpdf_linkextract.h" | 
|  |  | 
|  | #include <vector> | 
|  |  | 
|  | #include "core/fpdftext/cpdf_textpage.h" | 
|  | #include "core/fxcrt/fx_extension.h" | 
|  | #include "core/fxcrt/fx_string.h" | 
|  | #include "core/fxcrt/fx_system.h" | 
|  |  | 
|  | namespace { | 
|  |  | 
|  | // Find the end of a web link starting from offset |start| and ending at offset | 
|  | // |end|. The purpose of this function is to separate url from the surrounding | 
|  | // context characters, we do not intend to fully validate the url. |str| | 
|  | // contains lower case characters only. | 
|  | size_t FindWebLinkEnding(const WideString& str, size_t start, size_t end) { | 
|  | if (str.Contains(L'/', start)) { | 
|  | // When there is a path and query after '/', most ASCII chars are allowed. | 
|  | // We don't sanitize in this case. | 
|  | return end; | 
|  | } | 
|  |  | 
|  | // When there is no path, it only has IP address or host name. | 
|  | // Port is optional at the end. | 
|  | if (str[start] == L'[') { | 
|  | // IPv6 reference. | 
|  | // Find the end of the reference. | 
|  | auto result = str.Find(L']', start + 1); | 
|  | if (result.has_value()) { | 
|  | end = result.value(); | 
|  | if (end > start + 1) {  // Has content inside brackets. | 
|  | size_t len = str.GetLength(); | 
|  | size_t off = end + 1; | 
|  | if (off < len && str[off] == L':') { | 
|  | off++; | 
|  | while (off < len && FXSYS_IsDecimalDigit(str[off])) | 
|  | off++; | 
|  | if (off > end + 2 && | 
|  | off <= len)   // At least one digit in port number. | 
|  | end = off - 1;  // |off| is offset of the first invalid char. | 
|  | } | 
|  | } | 
|  | } | 
|  | return end; | 
|  | } | 
|  |  | 
|  | // According to RFC1123, host name only has alphanumeric chars, hyphens, | 
|  | // and periods. Hyphen should not at the end though. | 
|  | // Non-ASCII chars are ignored during checking. | 
|  | while (end > start && str[end] < 0x80) { | 
|  | if (FXSYS_IsDecimalDigit(str[end]) || | 
|  | (str[end] >= L'a' && str[end] <= L'z') || str[end] == L'.') { | 
|  | break; | 
|  | } | 
|  | end--; | 
|  | } | 
|  | return end; | 
|  | } | 
|  |  | 
|  | // Remove characters from the end of |str|, delimited by |start| and |end|, up | 
|  | // to and including |charToFind|. No-op if |charToFind| is not present. Updates | 
|  | // |end| if characters were removed. | 
|  | void TrimBackwardsToChar(const WideString& str, | 
|  | wchar_t charToFind, | 
|  | size_t start, | 
|  | size_t* end) { | 
|  | for (size_t pos = *end; pos >= start; pos--) { | 
|  | if (str[pos] == charToFind) { | 
|  | *end = pos - 1; | 
|  | break; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // Finds opening brackets ()[]{}<> and quotes "'  before the URL delimited by | 
|  | // |start| and |end| in |str|. Matches a closing bracket or quote for each | 
|  | // opening character and, if present, removes everything afterwards. Returns the | 
|  | // new end position for the string. | 
|  | size_t TrimExternalBracketsFromWebLink(const WideString& str, | 
|  | size_t start, | 
|  | size_t end) { | 
|  | for (size_t pos = 0; pos < start; pos++) { | 
|  | if (str[pos] == '(') { | 
|  | TrimBackwardsToChar(str, ')', start, &end); | 
|  | } else if (str[pos] == '[') { | 
|  | TrimBackwardsToChar(str, ']', start, &end); | 
|  | } else if (str[pos] == '{') { | 
|  | TrimBackwardsToChar(str, '}', start, &end); | 
|  | } else if (str[pos] == '<') { | 
|  | TrimBackwardsToChar(str, '>', start, &end); | 
|  | } else if (str[pos] == '"') { | 
|  | TrimBackwardsToChar(str, '"', start, &end); | 
|  | } else if (str[pos] == '\'') { | 
|  | TrimBackwardsToChar(str, '\'', start, &end); | 
|  | } | 
|  | } | 
|  | return end; | 
|  | } | 
|  |  | 
|  | }  // namespace | 
|  |  | 
// Binds the extractor to |pTextPage|. Only the pointer is stored here; the
// link list stays empty until ExtractLinks() is called.
// NOTE(review): presumably |pTextPage| must outlive this object - confirm
// against the member declaration in the header.
CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
    : m_pTextPage(pTextPage) {}
|  |  | 
// Defaulted destructor, defined out-of-line in this translation unit.
CPDF_LinkExtract::~CPDF_LinkExtract() = default;
|  |  | 
|  | void CPDF_LinkExtract::ExtractLinks() { | 
|  | m_LinkArray.clear(); | 
|  | int start = 0; | 
|  | int pos = 0; | 
|  | bool bAfterHyphen = false; | 
|  | bool bLineBreak = false; | 
|  | const int nTotalChar = m_pTextPage->CountChars(); | 
|  | const WideString page_text = m_pTextPage->GetAllPageText(); | 
|  | while (pos < nTotalChar) { | 
|  | const CPDF_TextPage::CharInfo& char_info = m_pTextPage->GetCharInfo(pos); | 
|  | if (char_info.m_CharType != CPDF_TextPage::CharType::kGenerated && | 
|  | char_info.m_Unicode != L' ' && pos != nTotalChar - 1) { | 
|  | bAfterHyphen = | 
|  | (char_info.m_CharType == CPDF_TextPage::CharType::kHyphen || | 
|  | (char_info.m_CharType == CPDF_TextPage::CharType::kNormal && | 
|  | char_info.m_Unicode == L'-')); | 
|  | ++pos; | 
|  | continue; | 
|  | } | 
|  |  | 
|  | int nCount = pos - start; | 
|  | if (pos == nTotalChar - 1) { | 
|  | ++nCount; | 
|  | } else if (bAfterHyphen && | 
|  | (char_info.m_Unicode == L'\n' || char_info.m_Unicode == L'\r')) { | 
|  | // Handle text breaks with a hyphen to the next line. | 
|  | bLineBreak = true; | 
|  | ++pos; | 
|  | continue; | 
|  | } | 
|  |  | 
|  | WideString strBeCheck = page_text.Substr(start, nCount); | 
|  | if (bLineBreak) { | 
|  | strBeCheck.Remove(L'\n'); | 
|  | strBeCheck.Remove(L'\r'); | 
|  | bLineBreak = false; | 
|  | } | 
|  | // Replace the generated code with the hyphen char. | 
|  | strBeCheck.Replace(L"\xfffe", L"-"); | 
|  |  | 
|  | if (strBeCheck.GetLength() > 5) { | 
|  | while (strBeCheck.GetLength() > 0) { | 
|  | wchar_t ch = strBeCheck.Back(); | 
|  | if (ch != L')' && ch != L',' && ch != L'>' && ch != L'.') | 
|  | break; | 
|  |  | 
|  | strBeCheck = strBeCheck.First(strBeCheck.GetLength() - 1); | 
|  | nCount--; | 
|  | } | 
|  |  | 
|  | // Check for potential web URLs and email addresses. | 
|  | // Ftp address, file system links, data, blob etc. are not checked. | 
|  | if (nCount > 5) { | 
|  | int32_t nStartOffset; | 
|  | int32_t nCountOverload; | 
|  | if (CheckWebLink(&strBeCheck, &nStartOffset, &nCountOverload)) { | 
|  | m_LinkArray.push_back( | 
|  | {start + nStartOffset, nCountOverload, strBeCheck}); | 
|  | } else if (CheckMailLink(&strBeCheck)) { | 
|  | m_LinkArray.push_back({start, nCount, strBeCheck}); | 
|  | } | 
|  | } | 
|  | } | 
|  | start = ++pos; | 
|  | } | 
|  | } | 
|  |  | 
|  | bool CPDF_LinkExtract::CheckWebLink(WideString* strBeCheck, | 
|  | int32_t* nStart, | 
|  | int32_t* nCount) { | 
|  | static const wchar_t kHttpScheme[] = L"http"; | 
|  | static const wchar_t kWWWAddrStart[] = L"www."; | 
|  |  | 
|  | const size_t kHttpSchemeLen = FXSYS_len(kHttpScheme); | 
|  | const size_t kWWWAddrStartLen = FXSYS_len(kWWWAddrStart); | 
|  |  | 
|  | WideString str = *strBeCheck; | 
|  | str.MakeLower(); | 
|  |  | 
|  | size_t len = str.GetLength(); | 
|  | // First, try to find the scheme. | 
|  | auto start = str.Find(kHttpScheme); | 
|  | if (start.has_value()) { | 
|  | size_t off = start.value() + kHttpSchemeLen;  // move after "http". | 
|  | if (len > off + 4) {     // At least "://<char>" follows. | 
|  | if (str[off] == L's')  // "https" scheme is accepted. | 
|  | off++; | 
|  | if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') { | 
|  | off += 3; | 
|  | size_t end = TrimExternalBracketsFromWebLink(str, start.value(), | 
|  | str.GetLength() - 1); | 
|  | end = FindWebLinkEnding(str, off, end); | 
|  | if (end > off) {  // Non-empty host name. | 
|  | *nStart = start.value(); | 
|  | *nCount = end - start.value() + 1; | 
|  | *strBeCheck = strBeCheck->Substr(*nStart, *nCount); | 
|  | return true; | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // When there is no scheme, try to find url starting with "www.". | 
|  | start = str.Find(kWWWAddrStart); | 
|  | if (start.has_value() && len > start.value() + kWWWAddrStartLen) { | 
|  | size_t end = TrimExternalBracketsFromWebLink(str, start.value(), | 
|  | str.GetLength() - 1); | 
|  | end = FindWebLinkEnding(str, start.value(), end); | 
|  | if (end > start.value() + kWWWAddrStartLen) { | 
|  | *nStart = start.value(); | 
|  | *nCount = end - start.value() + 1; | 
|  | *strBeCheck = L"http://" + strBeCheck->Substr(*nStart, *nCount); | 
|  | return true; | 
|  | } | 
|  | } | 
|  | return false; | 
|  | } | 
|  |  | 
|  | bool CPDF_LinkExtract::CheckMailLink(WideString* str) { | 
|  | auto aPos = str->Find(L'@'); | 
|  | // Invalid when no '@' or when starts/ends with '@'. | 
|  | if (!aPos.has_value() || aPos.value() == 0 || aPos == str->GetLength() - 1) | 
|  | return false; | 
|  |  | 
|  | // Check the local part. | 
|  | size_t pPos = aPos.value();  // Used to track the position of '@' or '.'. | 
|  | for (size_t i = aPos.value(); i > 0; i--) { | 
|  | wchar_t ch = (*str)[i - 1]; | 
|  | if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) | 
|  | continue; | 
|  |  | 
|  | if (ch != L'.' || i == pPos || i == 1) { | 
|  | if (i == aPos.value()) { | 
|  | // There is '.' or invalid char before '@'. | 
|  | return false; | 
|  | } | 
|  | // End extracting for other invalid chars, '.' at the beginning, or | 
|  | // consecutive '.'. | 
|  | size_t removed_len = i == pPos ? i + 1 : i; | 
|  | *str = str->Last(str->GetLength() - removed_len); | 
|  | break; | 
|  | } | 
|  | // Found a valid '.'. | 
|  | pPos = i - 1; | 
|  | } | 
|  |  | 
|  | // Check the domain name part. | 
|  | aPos = str->Find(L'@'); | 
|  | if (!aPos.has_value() || aPos.value() == 0) | 
|  | return false; | 
|  |  | 
|  | str->TrimRight(L'.'); | 
|  | // At least one '.' in domain name, but not at the beginning. | 
|  | // TODO(weili): RFC5322 allows domain names to be a local name without '.'. | 
|  | // Check whether we should remove this check. | 
|  | auto ePos = str->Find(L'.', aPos.value() + 1); | 
|  | if (!ePos.has_value() || ePos.value() == aPos.value() + 1) | 
|  | return false; | 
|  |  | 
|  | // Validate all other chars in domain name. | 
|  | size_t nLen = str->GetLength(); | 
|  | pPos = 0;  // Used to track the position of '.'. | 
|  | for (size_t i = aPos.value() + 1; i < nLen; i++) { | 
|  | wchar_t wch = (*str)[i]; | 
|  | if (wch == L'-' || FXSYS_iswalnum(wch)) | 
|  | continue; | 
|  |  | 
|  | if (wch != L'.' || i == pPos + 1) { | 
|  | // Domain name should end before invalid char. | 
|  | size_t host_end = i == pPos + 1 ? i - 2 : i - 1; | 
|  | if (pPos > 0 && host_end - aPos.value() >= 3) { | 
|  | // Trim the ending invalid chars if there is at least one '.' and name. | 
|  | *str = str->First(host_end + 1); | 
|  | break; | 
|  | } | 
|  | return false; | 
|  | } | 
|  | pPos = i; | 
|  | } | 
|  |  | 
|  | if (!str->Contains(L"mailto:")) | 
|  | *str = L"mailto:" + *str; | 
|  |  | 
|  | return true; | 
|  | } | 
|  |  | 
|  | WideString CPDF_LinkExtract::GetURL(size_t index) const { | 
|  | return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl | 
|  | : WideString(); | 
|  | } | 
|  |  | 
|  | std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const { | 
|  | if (index >= m_LinkArray.size()) | 
|  | return std::vector<CFX_FloatRect>(); | 
|  |  | 
|  | return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start, | 
|  | m_LinkArray[index].m_Count); | 
|  | } | 
|  |  | 
|  | bool CPDF_LinkExtract::GetTextRange(size_t index, | 
|  | int* start_char_index, | 
|  | int* char_count) const { | 
|  | if (index >= m_LinkArray.size()) | 
|  | return false; | 
|  | *start_char_index = m_LinkArray[index].m_Start; | 
|  | *char_count = m_LinkArray[index].m_Count; | 
|  | return true; | 
|  | } |