Rework CPDF_LinkExtract::CheckWebLink() Convert to size_t and avoid out parameters. Change-Id: I89eaef3fef2cb4225c2063315f55ac26c4057b92 Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/91470 Reviewed-by: Lei Zhang <thestig@chromium.org> Commit-Queue: Tom Sepez <tsepez@chromium.org>

commit: fabcfa2baa4d7a63caafcd415e96b8bfa4ada31e [log] [tgz]
author: Tom Sepez <tsepez@chromium.org> Tue Mar 15 17:09:09 2022 +0000
committer: Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com> Tue Mar 15 17:09:09 2022 +0000
tree: 5393f891824b7f0c6484267dff5a58028be5ba05
parent: 7dd9b67616a1e4b862382da30be89293c392f6ff [diff]
diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp
index f822f3b..68bca2d 100644
--- a/core/fpdftext/cpdf_linkextract.cpp
+++ b/core/fpdftext/cpdf_linkextract.cpp

@@ -112,11 +112,11 @@
 
 void CPDF_LinkExtract::ExtractLinks() {
   m_LinkArray.clear();
-  int start = 0;
-  int pos = 0;
+  size_t start = 0;
+  size_t pos = 0;
   bool bAfterHyphen = false;
   bool bLineBreak = false;
-  const int nTotalChar = m_pTextPage->CountChars();
+  const size_t nTotalChar = m_pTextPage->CountChars();
   const WideString page_text = m_pTextPage->GetAllPageText();
   while (pos < nTotalChar) {
     const CPDF_TextPage::CharInfo& char_info = m_pTextPage->GetCharInfo(pos);
@@ -130,7 +130,7 @@
       continue;
     }
 
-    int nCount = pos - start;
+    size_t nCount = pos - start;
     if (pos == nTotalChar - 1) {
       ++nCount;
     } else if (bAfterHyphen &&
@@ -163,13 +163,12 @@
       // Check for potential web URLs and email addresses.
       // Ftp address, file system links, data, blob etc. are not checked.
       if (nCount > 5) {
-        int32_t nStartOffset;
-        int32_t nCountOverload;
-        if (CheckWebLink(&strBeCheck, &nStartOffset, &nCountOverload)) {
-          m_LinkArray.push_back(
-              {start + nStartOffset, nCountOverload, strBeCheck});
+        auto maybe_link = CheckWebLink(strBeCheck);
+        if (maybe_link.has_value()) {
+          maybe_link.value().m_Start += start;
+          m_LinkArray.push_back(maybe_link.value());
         } else if (CheckMailLink(&strBeCheck)) {
-          m_LinkArray.push_back({start, nCount, strBeCheck});
+          m_LinkArray.push_back(Link{start, nCount, strBeCheck});
         }
       }
     }
@@ -177,36 +176,34 @@
   }
 }
 
-bool CPDF_LinkExtract::CheckWebLink(WideString* strBeCheck,
-                                    int32_t* nStart,
-                                    int32_t* nCount) {
+absl::optional<CPDF_LinkExtract::Link> CPDF_LinkExtract::CheckWebLink(
+    const WideString& strBeCheck) {
   static const wchar_t kHttpScheme[] = L"http";
   static const wchar_t kWWWAddrStart[] = L"www.";
 
   const size_t kHttpSchemeLen = FXSYS_len(kHttpScheme);
   const size_t kWWWAddrStartLen = FXSYS_len(kWWWAddrStart);
 
-  WideString str = *strBeCheck;
+  WideString str = strBeCheck;
   str.MakeLower();
 
-  size_t len = str.GetLength();
   // First, try to find the scheme.
   auto start = str.Find(kHttpScheme);
   if (start.has_value()) {
     size_t off = start.value() + kHttpSchemeLen;  // move after "http".
-    if (len > off + 4) {     // At least "://<char>" follows.
+    if (str.GetLength() > off + 4) {  // At least "://<char>" follows.
       if (str[off] == L's')  // "https" scheme is accepted.
         off++;
       if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') {
         off += 3;
-        size_t end = TrimExternalBracketsFromWebLink(str, start.value(),
-                                                     str.GetLength() - 1);
-        end = FindWebLinkEnding(str, off, end);
+        const size_t end =
+            FindWebLinkEnding(str, off,
+                              TrimExternalBracketsFromWebLink(
+                                  str, start.value(), str.GetLength() - 1));
         if (end > off) {  // Non-empty host name.
-          *nStart = start.value();
-          *nCount = end - start.value() + 1;
-          *strBeCheck = strBeCheck->Substr(*nStart, *nCount);
-          return true;
+          const size_t nStart = start.value();
+          const size_t nCount = end - nStart + 1;
+          return Link{nStart, nCount, strBeCheck.Substr(nStart, nCount)};
         }
       }
     }
@@ -214,18 +211,23 @@
 
   // When there is no scheme, try to find url starting with "www.".
   start = str.Find(kWWWAddrStart);
-  if (start.has_value() && len > start.value() + kWWWAddrStartLen) {
-    size_t end = TrimExternalBracketsFromWebLink(str, start.value(),
-                                                 str.GetLength() - 1);
-    end = FindWebLinkEnding(str, start.value(), end);
-    if (end > start.value() + kWWWAddrStartLen) {
-      *nStart = start.value();
-      *nCount = end - start.value() + 1;
-      *strBeCheck = L"http://" + strBeCheck->Substr(*nStart, *nCount);
-      return true;
+  if (start.has_value()) {
+    size_t off = start.value() + kWWWAddrStartLen;
+    if (str.GetLength() > off) {
+      const size_t end =
+          FindWebLinkEnding(str, start.value(),
+                            TrimExternalBracketsFromWebLink(
+                                str, start.value(), str.GetLength() - 1));
+      if (end > off) {
+        const size_t nStart = start.value();
+        const size_t nCount = end - nStart + 1;
+        return Link{nStart, nCount,
+                    L"http://" + strBeCheck.Substr(nStart, nCount)};
+      }
     }
   }
-  return false;
+
+  return absl::nullopt;
 }
 
 bool CPDF_LinkExtract::CheckMailLink(WideString* str) {

diff --git a/core/fpdftext/cpdf_linkextract.h b/core/fpdftext/cpdf_linkextract.h
index e86b7a7..d44db70 100644
--- a/core/fpdftext/cpdf_linkextract.h
+++ b/core/fpdftext/cpdf_linkextract.h

@@ -15,6 +15,7 @@
 #include "core/fxcrt/fx_coordinates.h"
 #include "core/fxcrt/unowned_ptr.h"
 #include "core/fxcrt/widestring.h"
+#include "third_party/abseil-cpp/absl/types/optional.h"
 
 class CPDF_TextPage;
 
@@ -30,16 +31,15 @@
   bool GetTextRange(size_t index, int* start_char_index, int* char_count) const;
 
  protected:
-  bool CheckWebLink(WideString* str, int32_t* nStart, int32_t* nCount);
-  bool CheckMailLink(WideString* str);
-
- private:
   struct Link {
-    int m_Start;
-    int m_Count;
+    size_t m_Start;
+    size_t m_Count;
     WideString m_strUrl;
   };
 
+  absl::optional<Link> CheckWebLink(const WideString& str);
+  bool CheckMailLink(WideString* str);
+
   UnownedPtr<const CPDF_TextPage> const m_pTextPage;
   std::vector<Link> m_LinkArray;
 };

diff --git a/core/fpdftext/cpdf_linkextract_unittest.cpp b/core/fpdftext/cpdf_linkextract_unittest.cpp
index 66b30f4..f75853e 100644
--- a/core/fpdftext/cpdf_linkextract_unittest.cpp
+++ b/core/fpdftext/cpdf_linkextract_unittest.cpp

@@ -65,7 +65,10 @@
   // Check cases that fail to extract valid web link.
   // The last few are legit web addresses that we don't handle now.
   const wchar_t* const kInvalidCases[] = {
-      L"", L"http", L"www.", L"https-and-www",
+      L"",                          // Blank.
+      L"http",                      // No colon.
+      L"www.",                      // Missing domain.
+      L"https-and-www",             // Dash not colon.
       L"http:/abc.com",             // Missing slash.
       L"http://((()),",             // Only invalid chars in host name.
       L"ftp://example.com",         // Ftp scheme is not supported.
@@ -73,18 +76,11 @@
       L"http//[example.com",        // Invalid IPv6 address.
       L"http//[00:00:00:00:00:00",  // Invalid IPv6 address.
       L"http//[]",                  // Empty IPv6 address.
-      // Web addresses that in correct format that we don't handle.
-      L"abc.example.com",  // URL without scheme.
+      L"abc.example.com",           // URL without scheme.
   };
-  constexpr int32_t kDefaultValue = -42;
   for (const wchar_t* input : kInvalidCases) {
-    WideString text_str(input);
-    int32_t start_offset = kDefaultValue;
-    int32_t count = kDefaultValue;
-    EXPECT_FALSE(extractor.CheckWebLink(&text_str, &start_offset, &count))
-        << input;
-    EXPECT_EQ(kDefaultValue, start_offset) << input;
-    EXPECT_EQ(kDefaultValue, count) << input;
+    auto maybe_link = extractor.CheckWebLink(input);
+    EXPECT_FALSE(maybe_link.has_value()) << input;
   }
 
   // Check cases that can extract valid web link.
@@ -92,8 +88,8 @@
   struct ValidCase {
     const wchar_t* const input_string;
     const wchar_t* const url_extracted;
-    const int32_t start_offset;
-    const int32_t count;
+    const size_t start_offset;
+    const size_t count;
   };
   const ValidCase kValidCases[] = {
       {L"http://www.example.com", L"http://www.example.com", 0,
@@ -172,14 +168,10 @@
       {L"www.测试.net；", L"http://www.测试.net；", 0, 11},
   };
   for (const auto& it : kValidCases) {
-    const wchar_t* const input = it.input_string;
-    WideString text_str(input);
-    int32_t start_offset = kDefaultValue;
-    int32_t count = kDefaultValue;
-    EXPECT_TRUE(extractor.CheckWebLink(&text_str, &start_offset, &count))
-        << input;
-    EXPECT_STREQ(it.url_extracted, text_str.c_str());
-    EXPECT_EQ(it.start_offset, start_offset) << input;
-    EXPECT_EQ(it.count, count) << input;
+    auto maybe_link = extractor.CheckWebLink(it.input_string);
+    ASSERT_TRUE(maybe_link.has_value()) << it.input_string;
+    EXPECT_STREQ(it.url_extracted, maybe_link.value().m_strUrl.c_str());
+    EXPECT_EQ(it.start_offset, maybe_link.value().m_Start) << it.input_string;
+    EXPECT_EQ(it.count, maybe_link.value().m_Count) << it.input_string;
   }
 }
commit	fabcfa2baa4d7a63caafcd415e96b8bfa4ada31e	[log] [tgz]
author	Tom Sepez <tsepez@chromium.org>	Tue Mar 15 17:09:09 2022 +0000
committer	Pdfium LUCI CQ <pdfium-scoped@luci-project-accounts.iam.gserviceaccount.com>	Tue Mar 15 17:09:09 2022 +0000
tree	5393f891824b7f0c6484267dff5a58028be5ba05
parent	7dd9b67616a1e4b862382da30be89293c392f6ff [diff]