Rework CPDF_LinkExtract::CheckWebLink()
Convert to size_t and avoid out parameters.
Change-Id: I89eaef3fef2cb4225c2063315f55ac26c4057b92
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/91470
Reviewed-by: Lei Zhang <thestig@chromium.org>
Commit-Queue: Tom Sepez <tsepez@chromium.org>
diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp
index f822f3b..68bca2d 100644
--- a/core/fpdftext/cpdf_linkextract.cpp
+++ b/core/fpdftext/cpdf_linkextract.cpp
@@ -112,11 +112,11 @@
void CPDF_LinkExtract::ExtractLinks() {
m_LinkArray.clear();
- int start = 0;
- int pos = 0;
+ size_t start = 0;
+ size_t pos = 0;
bool bAfterHyphen = false;
bool bLineBreak = false;
- const int nTotalChar = m_pTextPage->CountChars();
+ const size_t nTotalChar = m_pTextPage->CountChars();
const WideString page_text = m_pTextPage->GetAllPageText();
while (pos < nTotalChar) {
const CPDF_TextPage::CharInfo& char_info = m_pTextPage->GetCharInfo(pos);
@@ -130,7 +130,7 @@
continue;
}
- int nCount = pos - start;
+ size_t nCount = pos - start;
if (pos == nTotalChar - 1) {
++nCount;
} else if (bAfterHyphen &&
@@ -163,13 +163,12 @@
// Check for potential web URLs and email addresses.
// Ftp address, file system links, data, blob etc. are not checked.
if (nCount > 5) {
- int32_t nStartOffset;
- int32_t nCountOverload;
- if (CheckWebLink(&strBeCheck, &nStartOffset, &nCountOverload)) {
- m_LinkArray.push_back(
- {start + nStartOffset, nCountOverload, strBeCheck});
+ auto maybe_link = CheckWebLink(strBeCheck);
+ if (maybe_link.has_value()) {
+ maybe_link.value().m_Start += start;
+ m_LinkArray.push_back(maybe_link.value());
} else if (CheckMailLink(&strBeCheck)) {
- m_LinkArray.push_back({start, nCount, strBeCheck});
+ m_LinkArray.push_back(Link{start, nCount, strBeCheck});
}
}
}
@@ -177,36 +176,34 @@
}
}
-bool CPDF_LinkExtract::CheckWebLink(WideString* strBeCheck,
- int32_t* nStart,
- int32_t* nCount) {
+absl::optional<CPDF_LinkExtract::Link> CPDF_LinkExtract::CheckWebLink(
+ const WideString& strBeCheck) {
static const wchar_t kHttpScheme[] = L"http";
static const wchar_t kWWWAddrStart[] = L"www.";
const size_t kHttpSchemeLen = FXSYS_len(kHttpScheme);
const size_t kWWWAddrStartLen = FXSYS_len(kWWWAddrStart);
- WideString str = *strBeCheck;
+ WideString str = strBeCheck;
str.MakeLower();
- size_t len = str.GetLength();
// First, try to find the scheme.
auto start = str.Find(kHttpScheme);
if (start.has_value()) {
size_t off = start.value() + kHttpSchemeLen; // move after "http".
- if (len > off + 4) { // At least "://<char>" follows.
+ if (str.GetLength() > off + 4) { // At least "://<char>" follows.
if (str[off] == L's') // "https" scheme is accepted.
off++;
if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') {
off += 3;
- size_t end = TrimExternalBracketsFromWebLink(str, start.value(),
- str.GetLength() - 1);
- end = FindWebLinkEnding(str, off, end);
+ const size_t end =
+ FindWebLinkEnding(str, off,
+ TrimExternalBracketsFromWebLink(
+ str, start.value(), str.GetLength() - 1));
if (end > off) { // Non-empty host name.
- *nStart = start.value();
- *nCount = end - start.value() + 1;
- *strBeCheck = strBeCheck->Substr(*nStart, *nCount);
- return true;
+ const size_t nStart = start.value();
+ const size_t nCount = end - nStart + 1;
+ return Link{nStart, nCount, strBeCheck.Substr(nStart, nCount)};
}
}
}
@@ -214,18 +211,23 @@
// When there is no scheme, try to find url starting with "www.".
start = str.Find(kWWWAddrStart);
- if (start.has_value() && len > start.value() + kWWWAddrStartLen) {
- size_t end = TrimExternalBracketsFromWebLink(str, start.value(),
- str.GetLength() - 1);
- end = FindWebLinkEnding(str, start.value(), end);
- if (end > start.value() + kWWWAddrStartLen) {
- *nStart = start.value();
- *nCount = end - start.value() + 1;
- *strBeCheck = L"http://" + strBeCheck->Substr(*nStart, *nCount);
- return true;
+ if (start.has_value()) {
+ size_t off = start.value() + kWWWAddrStartLen;
+ if (str.GetLength() > off) {
+ const size_t end =
+ FindWebLinkEnding(str, start.value(),
+ TrimExternalBracketsFromWebLink(
+ str, start.value(), str.GetLength() - 1));
+ if (end > off) {
+ const size_t nStart = start.value();
+ const size_t nCount = end - nStart + 1;
+ return Link{nStart, nCount,
+ L"http://" + strBeCheck.Substr(nStart, nCount)};
+ }
}
}
- return false;
+
+ return absl::nullopt;
}
bool CPDF_LinkExtract::CheckMailLink(WideString* str) {
diff --git a/core/fpdftext/cpdf_linkextract.h b/core/fpdftext/cpdf_linkextract.h
index e86b7a7..d44db70 100644
--- a/core/fpdftext/cpdf_linkextract.h
+++ b/core/fpdftext/cpdf_linkextract.h
@@ -15,6 +15,7 @@
#include "core/fxcrt/fx_coordinates.h"
#include "core/fxcrt/unowned_ptr.h"
#include "core/fxcrt/widestring.h"
+#include "third_party/abseil-cpp/absl/types/optional.h"
class CPDF_TextPage;
@@ -30,16 +31,15 @@
bool GetTextRange(size_t index, int* start_char_index, int* char_count) const;
protected:
- bool CheckWebLink(WideString* str, int32_t* nStart, int32_t* nCount);
- bool CheckMailLink(WideString* str);
-
- private:
struct Link {
- int m_Start;
- int m_Count;
+ size_t m_Start;
+ size_t m_Count;
WideString m_strUrl;
};
+ absl::optional<Link> CheckWebLink(const WideString& str);
+ bool CheckMailLink(WideString* str);
+
UnownedPtr<const CPDF_TextPage> const m_pTextPage;
std::vector<Link> m_LinkArray;
};
diff --git a/core/fpdftext/cpdf_linkextract_unittest.cpp b/core/fpdftext/cpdf_linkextract_unittest.cpp
index 66b30f4..f75853e 100644
--- a/core/fpdftext/cpdf_linkextract_unittest.cpp
+++ b/core/fpdftext/cpdf_linkextract_unittest.cpp
@@ -65,7 +65,10 @@
// Check cases that fail to extract valid web link.
// The last few are legit web addresses that we don't handle now.
const wchar_t* const kInvalidCases[] = {
- L"", L"http", L"www.", L"https-and-www",
+ L"", // Blank.
+ L"http", // No colon.
+ L"www.", // Missing domain.
+ L"https-and-www", // Dash not colon.
L"http:/abc.com", // Missing slash.
L"http://((()),", // Only invalid chars in host name.
L"ftp://example.com", // Ftp scheme is not supported.
@@ -73,18 +76,11 @@
L"http//[example.com", // Invalid IPv6 address.
L"http//[00:00:00:00:00:00", // Invalid IPv6 address.
L"http//[]", // Empty IPv6 address.
- // Web addresses that in correct format that we don't handle.
- L"abc.example.com", // URL without scheme.
+ L"abc.example.com", // URL without scheme.
};
- constexpr int32_t kDefaultValue = -42;
for (const wchar_t* input : kInvalidCases) {
- WideString text_str(input);
- int32_t start_offset = kDefaultValue;
- int32_t count = kDefaultValue;
- EXPECT_FALSE(extractor.CheckWebLink(&text_str, &start_offset, &count))
- << input;
- EXPECT_EQ(kDefaultValue, start_offset) << input;
- EXPECT_EQ(kDefaultValue, count) << input;
+ auto maybe_link = extractor.CheckWebLink(input);
+ EXPECT_FALSE(maybe_link.has_value()) << input;
}
// Check cases that can extract valid web link.
@@ -92,8 +88,8 @@
struct ValidCase {
const wchar_t* const input_string;
const wchar_t* const url_extracted;
- const int32_t start_offset;
- const int32_t count;
+ const size_t start_offset;
+ const size_t count;
};
const ValidCase kValidCases[] = {
{L"http://www.example.com", L"http://www.example.com", 0,
@@ -172,14 +168,10 @@
{L"www.测试.net;", L"http://www.测试.net;", 0, 11},
};
for (const auto& it : kValidCases) {
- const wchar_t* const input = it.input_string;
- WideString text_str(input);
- int32_t start_offset = kDefaultValue;
- int32_t count = kDefaultValue;
- EXPECT_TRUE(extractor.CheckWebLink(&text_str, &start_offset, &count))
- << input;
- EXPECT_STREQ(it.url_extracted, text_str.c_str());
- EXPECT_EQ(it.start_offset, start_offset) << input;
- EXPECT_EQ(it.count, count) << input;
+ auto maybe_link = extractor.CheckWebLink(it.input_string);
+ ASSERT_TRUE(maybe_link.has_value()) << it.input_string;
+ EXPECT_STREQ(it.url_extracted, maybe_link.value().m_strUrl.c_str());
+ EXPECT_EQ(it.start_offset, maybe_link.value().m_Start) << it.input_string;
+ EXPECT_EQ(it.count, maybe_link.value().m_Count) << it.input_string;
}
}