| // Copyright 2016 PDFium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
| |
| #include "core/fpdftext/cpdf_linkextract.h" |
| |
| #include <vector> |
| |
| #include "core/fpdftext/cpdf_textpage.h" |
| #include "core/fxcrt/fx_extension.h" |
| #include "core/fxcrt/fx_string.h" |
| #include "core/fxcrt/fx_system.h" |
| |
| namespace { |
| |
| // Find the end of a web link starting from offset |start| and ending at offset |
| // |end|. The purpose of this function is to separate url from the surrounding |
| // context characters, we do not intend to fully validate the url. |str| |
| // contains lower case characters only. |
| size_t FindWebLinkEnding(const WideString& str, size_t start, size_t end) { |
| if (str.Contains(L'/', start)) { |
| // When there is a path and query after '/', most ASCII chars are allowed. |
| // We don't sanitize in this case. |
| return end; |
| } |
| |
| // When there is no path, it only has IP address or host name. |
| // Port is optional at the end. |
| if (str[start] == L'[') { |
| // IPv6 reference. |
| // Find the end of the reference. |
| auto result = str.Find(L']', start + 1); |
| if (result.has_value()) { |
| end = result.value(); |
| if (end > start + 1) { // Has content inside brackets. |
| size_t len = str.GetLength(); |
| size_t off = end + 1; |
| if (off < len && str[off] == L':') { |
| off++; |
| while (off < len && FXSYS_IsDecimalDigit(str[off])) |
| off++; |
| if (off > end + 2 && |
| off <= len) // At least one digit in port number. |
| end = off - 1; // |off| is offset of the first invalid char. |
| } |
| } |
| } |
| return end; |
| } |
| |
| // According to RFC1123, host name only has alphanumeric chars, hyphens, |
| // and periods. Hyphen should not at the end though. |
| // Non-ASCII chars are ignored during checking. |
| while (end > start && str[end] < 0x80) { |
| if (FXSYS_IsDecimalDigit(str[end]) || |
| (str[end] >= L'a' && str[end] <= L'z') || str[end] == L'.') { |
| break; |
| } |
| end--; |
| } |
| return end; |
| } |
| |
| // Remove characters from the end of |str|, delimited by |start| and |end|, up |
| // to and including |charToFind|. No-op if |charToFind| is not present. Updates |
| // |end| if characters were removed. |
| void TrimBackwardsToChar(const WideString& str, |
| wchar_t charToFind, |
| size_t start, |
| size_t* end) { |
| for (size_t pos = *end; pos >= start; pos--) { |
| if (str[pos] == charToFind) { |
| *end = pos - 1; |
| break; |
| } |
| } |
| } |
| |
| // Finds opening brackets ()[]{}<> and quotes "' before the URL delimited by |
| // |start| and |end| in |str|. Matches a closing bracket or quote for each |
| // opening character and, if present, removes everything afterwards. Returns the |
| // new end position for the string. |
| size_t TrimExternalBracketsFromWebLink(const WideString& str, |
| size_t start, |
| size_t end) { |
| for (size_t pos = 0; pos < start; pos++) { |
| if (str[pos] == '(') { |
| TrimBackwardsToChar(str, ')', start, &end); |
| } else if (str[pos] == '[') { |
| TrimBackwardsToChar(str, ']', start, &end); |
| } else if (str[pos] == '{') { |
| TrimBackwardsToChar(str, '}', start, &end); |
| } else if (str[pos] == '<') { |
| TrimBackwardsToChar(str, '>', start, &end); |
| } else if (str[pos] == '"') { |
| TrimBackwardsToChar(str, '"', start, &end); |
| } else if (str[pos] == '\'') { |
| TrimBackwardsToChar(str, '\'', start, &end); |
| } |
| } |
| return end; |
| } |
| |
| } // namespace |
| |
| CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage) |
| : m_pTextPage(pTextPage) {} |
| |
| CPDF_LinkExtract::~CPDF_LinkExtract() = default; |
| |
| void CPDF_LinkExtract::ExtractLinks() { |
| m_LinkArray.clear(); |
| int start = 0; |
| int pos = 0; |
| bool bAfterHyphen = false; |
| bool bLineBreak = false; |
| const int nTotalChar = m_pTextPage->CountChars(); |
| const WideString page_text = m_pTextPage->GetAllPageText(); |
| while (pos < nTotalChar) { |
| const CPDF_TextPage::CharInfo& char_info = m_pTextPage->GetCharInfo(pos); |
| if (char_info.m_CharType != CPDF_TextPage::CharType::kGenerated && |
| char_info.m_Unicode != L' ' && pos != nTotalChar - 1) { |
| bAfterHyphen = |
| (char_info.m_CharType == CPDF_TextPage::CharType::kHyphen || |
| (char_info.m_CharType == CPDF_TextPage::CharType::kNormal && |
| char_info.m_Unicode == L'-')); |
| ++pos; |
| continue; |
| } |
| |
| int nCount = pos - start; |
| if (pos == nTotalChar - 1) { |
| ++nCount; |
| } else if (bAfterHyphen && |
| (char_info.m_Unicode == L'\n' || char_info.m_Unicode == L'\r')) { |
| // Handle text breaks with a hyphen to the next line. |
| bLineBreak = true; |
| ++pos; |
| continue; |
| } |
| |
| WideString strBeCheck = page_text.Substr(start, nCount); |
| if (bLineBreak) { |
| strBeCheck.Remove(L'\n'); |
| strBeCheck.Remove(L'\r'); |
| bLineBreak = false; |
| } |
| // Replace the generated code with the hyphen char. |
| strBeCheck.Replace(L"\xfffe", L"-"); |
| |
| if (strBeCheck.GetLength() > 5) { |
| while (strBeCheck.GetLength() > 0) { |
| wchar_t ch = strBeCheck.Back(); |
| if (ch != L')' && ch != L',' && ch != L'>' && ch != L'.') |
| break; |
| |
| strBeCheck = strBeCheck.First(strBeCheck.GetLength() - 1); |
| nCount--; |
| } |
| |
| // Check for potential web URLs and email addresses. |
| // Ftp address, file system links, data, blob etc. are not checked. |
| if (nCount > 5) { |
| int32_t nStartOffset; |
| int32_t nCountOverload; |
| if (CheckWebLink(&strBeCheck, &nStartOffset, &nCountOverload)) { |
| m_LinkArray.push_back( |
| {start + nStartOffset, nCountOverload, strBeCheck}); |
| } else if (CheckMailLink(&strBeCheck)) { |
| m_LinkArray.push_back({start, nCount, strBeCheck}); |
| } |
| } |
| } |
| start = ++pos; |
| } |
| } |
| |
| bool CPDF_LinkExtract::CheckWebLink(WideString* strBeCheck, |
| int32_t* nStart, |
| int32_t* nCount) { |
| static const wchar_t kHttpScheme[] = L"http"; |
| static const wchar_t kWWWAddrStart[] = L"www."; |
| |
| const size_t kHttpSchemeLen = FXSYS_len(kHttpScheme); |
| const size_t kWWWAddrStartLen = FXSYS_len(kWWWAddrStart); |
| |
| WideString str = *strBeCheck; |
| str.MakeLower(); |
| |
| size_t len = str.GetLength(); |
| // First, try to find the scheme. |
| auto start = str.Find(kHttpScheme); |
| if (start.has_value()) { |
| size_t off = start.value() + kHttpSchemeLen; // move after "http". |
| if (len > off + 4) { // At least "://<char>" follows. |
| if (str[off] == L's') // "https" scheme is accepted. |
| off++; |
| if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') { |
| off += 3; |
| size_t end = TrimExternalBracketsFromWebLink(str, start.value(), |
| str.GetLength() - 1); |
| end = FindWebLinkEnding(str, off, end); |
| if (end > off) { // Non-empty host name. |
| *nStart = start.value(); |
| *nCount = end - start.value() + 1; |
| *strBeCheck = strBeCheck->Substr(*nStart, *nCount); |
| return true; |
| } |
| } |
| } |
| } |
| |
| // When there is no scheme, try to find url starting with "www.". |
| start = str.Find(kWWWAddrStart); |
| if (start.has_value() && len > start.value() + kWWWAddrStartLen) { |
| size_t end = TrimExternalBracketsFromWebLink(str, start.value(), |
| str.GetLength() - 1); |
| end = FindWebLinkEnding(str, start.value(), end); |
| if (end > start.value() + kWWWAddrStartLen) { |
| *nStart = start.value(); |
| *nCount = end - start.value() + 1; |
| *strBeCheck = L"http://" + strBeCheck->Substr(*nStart, *nCount); |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| bool CPDF_LinkExtract::CheckMailLink(WideString* str) { |
| auto aPos = str->Find(L'@'); |
| // Invalid when no '@' or when starts/ends with '@'. |
| if (!aPos.has_value() || aPos.value() == 0 || aPos == str->GetLength() - 1) |
| return false; |
| |
| // Check the local part. |
| size_t pPos = aPos.value(); // Used to track the position of '@' or '.'. |
| for (size_t i = aPos.value(); i > 0; i--) { |
| wchar_t ch = (*str)[i - 1]; |
| if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) |
| continue; |
| |
| if (ch != L'.' || i == pPos || i == 1) { |
| if (i == aPos.value()) { |
| // There is '.' or invalid char before '@'. |
| return false; |
| } |
| // End extracting for other invalid chars, '.' at the beginning, or |
| // consecutive '.'. |
| size_t removed_len = i == pPos ? i + 1 : i; |
| *str = str->Last(str->GetLength() - removed_len); |
| break; |
| } |
| // Found a valid '.'. |
| pPos = i - 1; |
| } |
| |
| // Check the domain name part. |
| aPos = str->Find(L'@'); |
| if (!aPos.has_value() || aPos.value() == 0) |
| return false; |
| |
| str->TrimRight(L'.'); |
| // At least one '.' in domain name, but not at the beginning. |
| // TODO(weili): RFC5322 allows domain names to be a local name without '.'. |
| // Check whether we should remove this check. |
| auto ePos = str->Find(L'.', aPos.value() + 1); |
| if (!ePos.has_value() || ePos.value() == aPos.value() + 1) |
| return false; |
| |
| // Validate all other chars in domain name. |
| size_t nLen = str->GetLength(); |
| pPos = 0; // Used to track the position of '.'. |
| for (size_t i = aPos.value() + 1; i < nLen; i++) { |
| wchar_t wch = (*str)[i]; |
| if (wch == L'-' || FXSYS_iswalnum(wch)) |
| continue; |
| |
| if (wch != L'.' || i == pPos + 1) { |
| // Domain name should end before invalid char. |
| size_t host_end = i == pPos + 1 ? i - 2 : i - 1; |
| if (pPos > 0 && host_end - aPos.value() >= 3) { |
| // Trim the ending invalid chars if there is at least one '.' and name. |
| *str = str->First(host_end + 1); |
| break; |
| } |
| return false; |
| } |
| pPos = i; |
| } |
| |
| if (!str->Contains(L"mailto:")) |
| *str = L"mailto:" + *str; |
| |
| return true; |
| } |
| |
| WideString CPDF_LinkExtract::GetURL(size_t index) const { |
| return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl |
| : WideString(); |
| } |
| |
| std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const { |
| if (index >= m_LinkArray.size()) |
| return std::vector<CFX_FloatRect>(); |
| |
| return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start, |
| m_LinkArray[index].m_Count); |
| } |
| |
| bool CPDF_LinkExtract::GetTextRange(size_t index, |
| int* start_char_index, |
| int* char_count) const { |
| if (index >= m_LinkArray.size()) |
| return false; |
| *start_char_index = m_LinkArray[index].m_Start; |
| *char_count = m_LinkArray[index].m_Count; |
| return true; |
| } |