Rework CPDF_LinkExtract::CheckWebLink() Convert to size_t and avoid out parameters. Change-Id: I89eaef3fef2cb4225c2063315f55ac26c4057b92 Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/91470 Reviewed-by: Lei Zhang <thestig@chromium.org> Commit-Queue: Tom Sepez <tsepez@chromium.org>
diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp index f822f3b..68bca2d 100644 --- a/core/fpdftext/cpdf_linkextract.cpp +++ b/core/fpdftext/cpdf_linkextract.cpp
@@ -112,11 +112,11 @@ void CPDF_LinkExtract::ExtractLinks() { m_LinkArray.clear(); - int start = 0; - int pos = 0; + size_t start = 0; + size_t pos = 0; bool bAfterHyphen = false; bool bLineBreak = false; - const int nTotalChar = m_pTextPage->CountChars(); + const size_t nTotalChar = m_pTextPage->CountChars(); const WideString page_text = m_pTextPage->GetAllPageText(); while (pos < nTotalChar) { const CPDF_TextPage::CharInfo& char_info = m_pTextPage->GetCharInfo(pos); @@ -130,7 +130,7 @@ continue; } - int nCount = pos - start; + size_t nCount = pos - start; if (pos == nTotalChar - 1) { ++nCount; } else if (bAfterHyphen && @@ -163,13 +163,12 @@ // Check for potential web URLs and email addresses. // Ftp address, file system links, data, blob etc. are not checked. if (nCount > 5) { - int32_t nStartOffset; - int32_t nCountOverload; - if (CheckWebLink(&strBeCheck, &nStartOffset, &nCountOverload)) { - m_LinkArray.push_back( - {start + nStartOffset, nCountOverload, strBeCheck}); + auto maybe_link = CheckWebLink(strBeCheck); + if (maybe_link.has_value()) { + maybe_link.value().m_Start += start; + m_LinkArray.push_back(maybe_link.value()); } else if (CheckMailLink(&strBeCheck)) { - m_LinkArray.push_back({start, nCount, strBeCheck}); + m_LinkArray.push_back(Link{start, nCount, strBeCheck}); } } } @@ -177,36 +176,34 @@ } } -bool CPDF_LinkExtract::CheckWebLink(WideString* strBeCheck, - int32_t* nStart, - int32_t* nCount) { +absl::optional<CPDF_LinkExtract::Link> CPDF_LinkExtract::CheckWebLink( + const WideString& strBeCheck) { static const wchar_t kHttpScheme[] = L"http"; static const wchar_t kWWWAddrStart[] = L"www."; const size_t kHttpSchemeLen = FXSYS_len(kHttpScheme); const size_t kWWWAddrStartLen = FXSYS_len(kWWWAddrStart); - WideString str = *strBeCheck; + WideString str = strBeCheck; str.MakeLower(); - size_t len = str.GetLength(); // First, try to find the scheme. auto start = str.Find(kHttpScheme); if (start.has_value()) { size_t off = start.value() + kHttpSchemeLen; // move after "http". - if (len > off + 4) { // At least "://<char>" follows. + if (str.GetLength() > off + 4) { // At least "://<char>" follows. if (str[off] == L's') // "https" scheme is accepted. off++; if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') { off += 3; - size_t end = TrimExternalBracketsFromWebLink(str, start.value(), - str.GetLength() - 1); - end = FindWebLinkEnding(str, off, end); + const size_t end = + FindWebLinkEnding(str, off, + TrimExternalBracketsFromWebLink( + str, start.value(), str.GetLength() - 1)); if (end > off) { // Non-empty host name. - *nStart = start.value(); - *nCount = end - start.value() + 1; - *strBeCheck = strBeCheck->Substr(*nStart, *nCount); - return true; + const size_t nStart = start.value(); + const size_t nCount = end - nStart + 1; + return Link{nStart, nCount, strBeCheck.Substr(nStart, nCount)}; } } } @@ -214,18 +211,23 @@ // When there is no scheme, try to find url starting with "www.". start = str.Find(kWWWAddrStart); - if (start.has_value() && len > start.value() + kWWWAddrStartLen) { - size_t end = TrimExternalBracketsFromWebLink(str, start.value(), - str.GetLength() - 1); - end = FindWebLinkEnding(str, start.value(), end); - if (end > start.value() + kWWWAddrStartLen) { - *nStart = start.value(); - *nCount = end - start.value() + 1; - *strBeCheck = L"http://" + strBeCheck->Substr(*nStart, *nCount); - return true; + if (start.has_value()) { + size_t off = start.value() + kWWWAddrStartLen; + if (str.GetLength() > off) { + const size_t end = + FindWebLinkEnding(str, start.value(), + TrimExternalBracketsFromWebLink( + str, start.value(), str.GetLength() - 1)); + if (end > off) { + const size_t nStart = start.value(); + const size_t nCount = end - nStart + 1; + return Link{nStart, nCount, + L"http://" + strBeCheck.Substr(nStart, nCount)}; + } } } - return false; + + return absl::nullopt; } bool CPDF_LinkExtract::CheckMailLink(WideString* str) {
diff --git a/core/fpdftext/cpdf_linkextract.h b/core/fpdftext/cpdf_linkextract.h index e86b7a7..d44db70 100644 --- a/core/fpdftext/cpdf_linkextract.h +++ b/core/fpdftext/cpdf_linkextract.h
@@ -15,6 +15,7 @@ #include "core/fxcrt/fx_coordinates.h" #include "core/fxcrt/unowned_ptr.h" #include "core/fxcrt/widestring.h" +#include "third_party/abseil-cpp/absl/types/optional.h" class CPDF_TextPage; @@ -30,16 +31,15 @@ bool GetTextRange(size_t index, int* start_char_index, int* char_count) const; protected: - bool CheckWebLink(WideString* str, int32_t* nStart, int32_t* nCount); - bool CheckMailLink(WideString* str); - - private: struct Link { - int m_Start; - int m_Count; + size_t m_Start; + size_t m_Count; WideString m_strUrl; }; + absl::optional<Link> CheckWebLink(const WideString& str); + bool CheckMailLink(WideString* str); + UnownedPtr<const CPDF_TextPage> const m_pTextPage; std::vector<Link> m_LinkArray; };
diff --git a/core/fpdftext/cpdf_linkextract_unittest.cpp b/core/fpdftext/cpdf_linkextract_unittest.cpp index 66b30f4..f75853e 100644 --- a/core/fpdftext/cpdf_linkextract_unittest.cpp +++ b/core/fpdftext/cpdf_linkextract_unittest.cpp
@@ -65,7 +65,10 @@ // Check cases that fail to extract valid web link. // The last few are legit web addresses that we don't handle now. const wchar_t* const kInvalidCases[] = { - L"", L"http", L"www.", L"https-and-www", + L"", // Blank. + L"http", // No colon. + L"www.", // Missing domain. + L"https-and-www", // Dash not colon. L"http:/abc.com", // Missing slash. L"http://((()),", // Only invalid chars in host name. L"ftp://example.com", // Ftp scheme is not supported. @@ -73,18 +76,11 @@ L"http//[example.com", // Invalid IPv6 address. L"http//[00:00:00:00:00:00", // Invalid IPv6 address. L"http//[]", // Empty IPv6 address. - // Web addresses that in correct format that we don't handle. - L"abc.example.com", // URL without scheme. + L"abc.example.com", // URL without scheme. }; - constexpr int32_t kDefaultValue = -42; for (const wchar_t* input : kInvalidCases) { - WideString text_str(input); - int32_t start_offset = kDefaultValue; - int32_t count = kDefaultValue; - EXPECT_FALSE(extractor.CheckWebLink(&text_str, &start_offset, &count)) - << input; - EXPECT_EQ(kDefaultValue, start_offset) << input; - EXPECT_EQ(kDefaultValue, count) << input; + auto maybe_link = extractor.CheckWebLink(input); + EXPECT_FALSE(maybe_link.has_value()) << input; } // Check cases that can extract valid web link. @@ -92,8 +88,8 @@ struct ValidCase { const wchar_t* const input_string; const wchar_t* const url_extracted; - const int32_t start_offset; - const int32_t count; + const size_t start_offset; + const size_t count; }; const ValidCase kValidCases[] = { {L"http://www.example.com", L"http://www.example.com", 0, @@ -172,14 +168,10 @@ {L"www.测试.net;", L"http://www.测试.net;", 0, 11}, }; for (const auto& it : kValidCases) { - const wchar_t* const input = it.input_string; - WideString text_str(input); - int32_t start_offset = kDefaultValue; - int32_t count = kDefaultValue; - EXPECT_TRUE(extractor.CheckWebLink(&text_str, &start_offset, &count)) - << input; - EXPECT_STREQ(it.url_extracted, text_str.c_str()); - EXPECT_EQ(it.start_offset, start_offset) << input; - EXPECT_EQ(it.count, count) << input; + auto maybe_link = extractor.CheckWebLink(it.input_string); + ASSERT_TRUE(maybe_link.has_value()) << it.input_string; + EXPECT_STREQ(it.url_extracted, maybe_link.value().m_strUrl.c_str()); + EXPECT_EQ(it.start_offset, maybe_link.value().m_Start) << it.input_string; + EXPECT_EQ(it.count, maybe_link.value().m_Count) << it.input_string; } }