blob: 47d0754bd25e1ad35770678d42608e3cc001690f [file] [log] [blame]
// Copyright 2016 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
#include "core/fpdftext/cpdf_linkextract.h"
#include <vector>
#include "core/fpdftext/cpdf_textpage.h"
#include "core/fxcrt/fx_ext.h"
#include "core/fxcrt/fx_string.h"
#include "core/fxcrt/fx_system.h"
CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
: m_pTextPage(pTextPage) {}
// Nothing to release explicitly; members clean up after themselves.
CPDF_LinkExtract::~CPDF_LinkExtract() = default;
// Rebuilds the link list from the associated text page. Previously extracted
// links are always discarded first; parsing is skipped when the text page has
// not been parsed yet or when it yields no text.
void CPDF_LinkExtract::ExtractLinks() {
  m_LinkArray.clear();
  if (m_pTextPage->IsParsed()) {
    m_strPageText = m_pTextPage->GetPageText(0, -1);
    if (!m_strPageText.IsEmpty())
      ParseLink();
  }
}
void CPDF_LinkExtract::ParseLink() {
int start = 0;
int pos = 0;
int nTotalChar = m_pTextPage->CountChars();
bool bAfterHyphen = false;
bool bLineBreak = false;
while (pos < nTotalChar) {
FPDF_CHAR_INFO pageChar;
m_pTextPage->GetCharInfo(pos, &pageChar);
if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED ||
pageChar.m_Unicode == TEXT_SPACE_CHAR || pos == nTotalChar - 1) {
int nCount = pos - start;
if (pos == nTotalChar - 1) {
nCount++;
} else if (bAfterHyphen && (pageChar.m_Unicode == TEXT_LINEFEED_CHAR ||
pageChar.m_Unicode == TEXT_RETURN_CHAR)) {
// Handle text breaks with a hyphen to the next line.
bLineBreak = true;
pos++;
continue;
}
CFX_WideString strBeCheck;
strBeCheck = m_pTextPage->GetPageText(start, nCount);
if (bLineBreak) {
strBeCheck.Remove(TEXT_LINEFEED_CHAR);
strBeCheck.Remove(TEXT_RETURN_CHAR);
bLineBreak = false;
}
// Replace the generated code with the hyphen char.
strBeCheck.Replace(L"\xfffe", TEXT_HYPHEN);
if (strBeCheck.GetLength() > 5) {
while (strBeCheck.GetLength() > 0) {
wchar_t ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);
if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {
strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1);
nCount--;
} else {
break;
}
}
if (nCount > 5 &&
(CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) {
m_LinkArray.push_back({start, nCount, strBeCheck});
}
}
start = ++pos;
} else {
bAfterHyphen = (pageChar.m_Flag == FPDFTEXT_CHAR_HYPHEN ||
(pageChar.m_Flag == FPDFTEXT_CHAR_NORMAL &&
pageChar.m_Unicode == TEXT_HYPHEN_CHAR));
pos++;
}
}
}
// Searches |strBeCheck| case-insensitively for a web address prefix. The
// prefixes are tried in a fixed order: "http://www.", "http://",
// "https://www.", "https://" and finally a bare "www.".
//
// On a match, |strBeCheck| is reduced to its trailing part beginning at the
// matched prefix (original letter case preserved), "http://" is prepended
// when only "www." was found, and true is returned. Without a match the
// string is left unchanged and false is returned.
bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
  // Matching is done on a lower-cased copy so the caller's text keeps its
  // original case.
  CFX_WideString strLower = strBeCheck;
  strLower.MakeLower();

  int nPrefixPos = strLower.Find(L"http://www.");
  if (nPrefixPos == -1)
    nPrefixPos = strLower.Find(L"http://");
  if (nPrefixPos == -1)
    nPrefixPos = strLower.Find(L"https://www.");
  if (nPrefixPos == -1)
    nPrefixPos = strLower.Find(L"https://");
  if (nPrefixPos != -1) {
    strBeCheck = strBeCheck.Right(strLower.GetLength() - nPrefixPos);
    return true;
  }

  // No explicit scheme: accept a "www." address and supply the scheme.
  nPrefixPos = strLower.Find(L"www.");
  if (nPrefixPos == -1)
    return false;

  strBeCheck = strBeCheck.Right(strLower.GetLength() - nPrefixPos);
  strBeCheck = L"http://" + strBeCheck;
  return true;
}
// Decides whether |str| contains an e-mail address around its first '@'.
//
// The local part is scanned backwards from '@' and may contain letters,
// digits, '_', '-' and single interior '.' characters; anything in front of
// the first offending character is cut off. The domain part is scanned
// forwards and may contain letters, digits, '-' and '.'; it must contain a
// '.' that does not directly follow '@'. Trailing text after the first
// offending domain character is cut off when enough of a host name remains.
//
// Returns true and rewrites |str| to the extracted address prefixed with
// "mailto:" on success. Returns false otherwise; |str| may already have been
// trimmed in that case.
bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
  int nAtIndex = str.Find(L'@');
  // Reject when '@' is missing or is the very first character.
  if (nAtIndex < 1)
    return false;

  // --- Local part: walk backwards from the character before '@'. ---
  int nBoundary = nAtIndex;  // Index of '@' or of the latest accepted '.'.
  for (int nIdx = nAtIndex - 1; nIdx >= 0; --nIdx) {
    const wchar_t wch = str.GetAt(nIdx);
    if (wch == L'_' || wch == L'-' || FXSYS_iswalnum(wch))
      continue;

    const bool bTouchesBoundary = nIdx == nBoundary - 1;
    if (wch == L'.' && !bTouchesBoundary && nIdx != 0) {
      // A '.' with valid characters on both sides is fine.
      nBoundary = nIdx;
      continue;
    }

    // Here: an invalid character, a leading '.', or a '.' adjacent to the
    // previous boundary. Directly before '@' this invalidates the address.
    if (nIdx == nAtIndex - 1)
      return false;

    // Otherwise the address starts after this character; when it touches the
    // previously accepted '.', that '.' is dropped as well.
    const int nDropLen = bTouchesBoundary ? nIdx + 2 : nIdx + 1;
    str = str.Right(str.GetLength() - nDropLen);
    break;
  }

  // --- Domain part: '@' may have moved after trimming the front. ---
  nAtIndex = str.Find(L'@');
  if (nAtIndex < 1)
    return false;

  str.TrimRight(L'.');
  // At least one '.' in domain name, but not at the beginning.
  // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
  // Check whether we should remove this check.
  const int nFirstDot = str.Find(L'.', nAtIndex + 1);
  if (nFirstDot == -1 || nFirstDot == nAtIndex + 1)
    return false;

  const int nLen = str.GetLength();
  int nLastDot = 0;  // Index of the latest accepted '.', 0 while none seen.
  for (int nIdx = nAtIndex + 1; nIdx < nLen; ++nIdx) {
    const wchar_t wch = str.GetAt(nIdx);
    if (wch == L'-' || FXSYS_iswalnum(wch))
      continue;

    const bool bFollowsDot = nIdx == nLastDot + 1;
    if (wch == L'.' && !bFollowsDot) {
      nLastDot = nIdx;
      continue;
    }

    // The host name has to end before this character, and before the
    // preceding '.' too when this character directly follows one.
    const int nHostEnd = bFollowsDot ? nIdx - 2 : nIdx - 1;
    // Only trim when a '.' was seen and enough of the name is left.
    if (nLastDot <= 0 || nHostEnd - nAtIndex < 3)
      return false;

    str = str.Left(nHostEnd + 1);
    break;
  }

  if (str.Find(L"mailto:") == -1)
    str = L"mailto:" + str;
  return true;
}
// Returns the URL stored for the link at |index|, or an empty string when
// |index| is out of range.
CFX_WideString CPDF_LinkExtract::GetURL(size_t index) const {
  if (index >= m_LinkArray.size())
    return L"";
  return m_LinkArray[index].m_strUrl;
}
// Returns the rectangles the text page reports for the character range of
// the link at |index|, or an empty vector when |index| is out of range.
std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
  if (index < m_LinkArray.size()) {
    const auto& link = m_LinkArray[index];
    return m_pTextPage->GetRectArray(link.m_Start, link.m_Count);
  }
  return std::vector<CFX_FloatRect>();
}