API to return text range for weblinks

PDF text is parsed to find links using regex matching. These links are
not present as annotations in the PDF, but are still considered links
(categorized as Auto-Detected links).
While parsing the links, PDFium identifies the start char index and char
count of each detected link.

This CL exposes a public method to get the start char index and char
count of such auto-detected links. This is required for accessibility to
generate the relative mapping of links and text.

This CL adds a unit test that asks for the text range of a valid link,
an invalid link index, a null page link, and a negative link index.

Bug: pdfium:1334
Change-Id: I6d98b35eb9c6e7fcf4d800776b9129588fce0e09
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/56810
Reviewed-by: Lei Zhang <thestig@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp
index cdbf46a..399db50 100644
--- a/core/fpdftext/cpdf_linkextract.cpp
+++ b/core/fpdftext/cpdf_linkextract.cpp
@@ -317,3 +317,13 @@
   return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,
                                    m_LinkArray[index].m_Count);
 }
+
+bool CPDF_LinkExtract::GetTextRange(size_t index,
+                                    int* start_char_index,
+                                    int* char_count) const {
+  if (index >= m_LinkArray.size())
+    return false;  // No such link; the out-params are left unmodified.
+  *start_char_index = m_LinkArray[index].m_Start;  // Start of the link text.
+  *char_count = m_LinkArray[index].m_Count;  // Number of chars in the link.
+  return true;
+}
diff --git a/core/fpdftext/cpdf_linkextract.h b/core/fpdftext/cpdf_linkextract.h
index 713f658..1bb7b2f 100644
--- a/core/fpdftext/cpdf_linkextract.h
+++ b/core/fpdftext/cpdf_linkextract.h
@@ -24,6 +24,7 @@
   size_t CountLinks() const { return m_LinkArray.size(); }
   WideString GetURL(size_t index) const;
   std::vector<CFX_FloatRect> GetRects(size_t index) const;
+  bool GetTextRange(size_t index, int* start_char_index, int* char_count) const;
 
  protected:
   void ParseLink();
diff --git a/fpdfsdk/fpdf_text.cpp b/fpdfsdk/fpdf_text.cpp
index cc965bd..cc7ca5b 100644
--- a/fpdfsdk/fpdf_text.cpp
+++ b/fpdfsdk/fpdf_text.cpp
@@ -402,6 +402,18 @@
   return true;
 }
 
+FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
+FPDFLink_GetTextRange(FPDF_PAGELINK link_page,
+                      int link_index,
+                      int* start_char_index,
+                      int* char_count) {
+  if (!link_page || link_index < 0 || !start_char_index || !char_count)
+    return false;  // Bad handle/index, or a null out-param we cannot fill.
+
+  CPDF_LinkExtract* page_link = CPDFLinkExtractFromFPDFPageLink(link_page);
+  return page_link->GetTextRange(link_index, start_char_index, char_count);
+}
+
 FPDF_EXPORT void FPDF_CALLCONV FPDFLink_CloseWebLinks(FPDF_PAGELINK link_page) {
   delete CPDFLinkExtractFromFPDFPageLink(link_page);
 }
diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index 37e8dfa..12b09a7 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp
@@ -582,6 +582,54 @@
   UnloadPage(page);
 }
 
+TEST_F(FPDFTextEmbedderTest, WebLinksCharRanges) {
+  ASSERT_TRUE(OpenDocument("weblinks.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);
+
+  FPDF_TEXTPAGE text_page = FPDFText_LoadPage(page);
+  ASSERT_TRUE(text_page);
+
+  FPDF_PAGELINK page_link = FPDFLink_LoadWebLinks(text_page);
+  ASSERT_TRUE(page_link);
+
+  // A valid link index fills in the start char index and char count.
+  int start_char_index;
+  int char_count;
+  ASSERT_TRUE(
+      FPDFLink_GetTextRange(page_link, 0, &start_char_index, &char_count));
+  EXPECT_EQ(35, start_char_index);
+  EXPECT_EQ(24, char_count);
+
+  // An invalid link index fails and leaves the out-params unmodified.
+  start_char_index = -10;
+  char_count = -8;
+  ASSERT_FALSE(
+      FPDFLink_GetTextRange(page_link, 6, &start_char_index, &char_count));
+  EXPECT_EQ(-10, start_char_index);
+  EXPECT_EQ(-8, char_count);
+
+  // A null |link_page| fails and leaves the out-params unmodified.
+  start_char_index = -10;
+  char_count = -8;
+  ASSERT_FALSE(
+      FPDFLink_GetTextRange(nullptr, 0, &start_char_index, &char_count));
+  EXPECT_EQ(-10, start_char_index);
+  EXPECT_EQ(-8, char_count);
+
+  // A negative |link_index| fails and leaves the out-params unmodified.
+  start_char_index = -10;
+  char_count = -8;
+  ASSERT_FALSE(
+      FPDFLink_GetTextRange(page_link, -4, &start_char_index, &char_count));
+  EXPECT_EQ(-10, start_char_index);
+  EXPECT_EQ(-8, char_count);
+
+  FPDFLink_CloseWebLinks(page_link);
+  FPDFText_ClosePage(text_page);
+  UnloadPage(page);
+}
+
 TEST_F(FPDFTextEmbedderTest, GetFontSize) {
   ASSERT_TRUE(OpenDocument("hello_world.pdf"));
   FPDF_PAGE page = LoadPage(0);
diff --git a/fpdfsdk/fpdf_view_c_api_test.c b/fpdfsdk/fpdf_view_c_api_test.c
index 32bb18d..0e64a00 100644
--- a/fpdfsdk/fpdf_view_c_api_test.c
+++ b/fpdfsdk/fpdf_view_c_api_test.c
@@ -311,6 +311,7 @@
     CHK(FPDFLink_CloseWebLinks);
     CHK(FPDFLink_CountRects);
     CHK(FPDFLink_CountWebLinks);
+    CHK(FPDFLink_GetTextRange);
     CHK(FPDFLink_GetRect);
     CHK(FPDFLink_GetURL);
     CHK(FPDFLink_LoadWebLinks);
diff --git a/public/fpdf_text.h b/public/fpdf_text.h
index 008c236..7bd84bf 100644
--- a/public/fpdf_text.h
+++ b/public/fpdf_text.h
@@ -469,6 +469,26 @@
                                                      double* right,
                                                      double* bottom);
 
+// Experimental API.
+// Function: FPDFLink_GetTextRange
+//          Fetch the start char index and char count for a link.
+// Parameters:
+//          link_page         -   Handle returned by FPDFLink_LoadWebLinks.
+//          link_index        -   Zero-based index for the link.
+//          start_char_index  -   Pointer to int receiving the start char index.
+//          char_count        -   Pointer to int receiving the char count.
+// Return Value:
+//          On success, return TRUE and fill in |start_char_index| and
+//          |char_count|. If |link_page| is invalid or if |link_index| does
+//          not correspond to a valid link, then return FALSE and the out
+//          parameters remain unmodified.
+//
+FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
+FPDFLink_GetTextRange(FPDF_PAGELINK link_page,
+                      int link_index,
+                      int* start_char_index,
+                      int* char_count);
+
 // Function: FPDFLink_CloseWebLinks
 //          Release resources used by weblink feature.
 // Parameters: