Handle web links across lines

When a web link ends with a hyphen at the end of a line, we consider it to
continue on the next line. For example, "http://www.abc.com/my-\r\ntest"
should be extracted as "http://www.abc.com/my-test".

BUG=pdfium:650

Change-Id: I64a93d9c66faf2be0abdaf8cfe8ee496c435d0ca
Reviewed-on: https://pdfium-review.googlesource.com/3092
Commit-Queue: Wei Li <weili@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp
index 686b6a2..47d0754 100644
--- a/core/fpdftext/cpdf_linkextract.cpp
+++ b/core/fpdftext/cpdf_linkextract.cpp
@@ -31,18 +31,36 @@
 }
 
 void CPDF_LinkExtract::ParseLink() {
-  int start = 0, pos = 0;
-  int TotalChar = m_pTextPage->CountChars();
-  while (pos < TotalChar) {
+  int start = 0;
+  int pos = 0;
+  int nTotalChar = m_pTextPage->CountChars();
+  bool bAfterHyphen = false;
+  bool bLineBreak = false;
+  while (pos < nTotalChar) {
     FPDF_CHAR_INFO pageChar;
     m_pTextPage->GetCharInfo(pos, &pageChar);
     if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED ||
-        pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) {
+        pageChar.m_Unicode == TEXT_SPACE_CHAR || pos == nTotalChar - 1) {
       int nCount = pos - start;
-      if (pos == TotalChar - 1)
+      if (pos == nTotalChar - 1) {
         nCount++;
+      } else if (bAfterHyphen && (pageChar.m_Unicode == TEXT_LINEFEED_CHAR ||
+                                  pageChar.m_Unicode == TEXT_RETURN_CHAR)) {
+        // Handle text breaks with a hyphen to the next line.
+        bLineBreak = true;
+        pos++;
+        continue;
+      }
       CFX_WideString strBeCheck;
       strBeCheck = m_pTextPage->GetPageText(start, nCount);
+      if (bLineBreak) {
+        strBeCheck.Remove(TEXT_LINEFEED_CHAR);
+        strBeCheck.Remove(TEXT_RETURN_CHAR);
+        bLineBreak = false;
+      }
+      // Replace the generated code with the hyphen char.
+      strBeCheck.Replace(L"\xfffe", TEXT_HYPHEN);
+
       if (strBeCheck.GetLength() > 5) {
         while (strBeCheck.GetLength() > 0) {
           wchar_t ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);
@@ -60,6 +78,9 @@
       }
       start = ++pos;
     } else {
+      bAfterHyphen = (pageChar.m_Flag == FPDFTEXT_CHAR_HYPHEN ||
+                      (pageChar.m_Flag == FPDFTEXT_CHAR_NORMAL &&
+                       pageChar.m_Unicode == TEXT_HYPHEN_CHAR));
       pos++;
     }
   }
diff --git a/core/fpdftext/cpdf_textpage.h b/core/fpdftext/cpdf_textpage.h
index ebe58eb..d7e29ed 100644
--- a/core/fpdftext/cpdf_textpage.h
+++ b/core/fpdftext/cpdf_textpage.h
@@ -34,10 +34,12 @@
 #define TEXT_SPACE_CHAR L' '
 #define TEXT_LINEFEED_CHAR L'\n'
 #define TEXT_RETURN_CHAR L'\r'
+#define TEXT_HYPHEN_CHAR L'-'
 #define TEXT_EMPTY L""
 #define TEXT_SPACE L" "
 #define TEXT_RETURN_LINEFEED L"\r\n"
 #define TEXT_LINEFEED L"\n"
+#define TEXT_HYPHEN L"-"
 #define TEXT_CHARRATIO_GAPDELTA 0.070
 
 enum class FPDFText_MarkedContent { Pass = 0, Done, Delay };
diff --git a/fpdfsdk/fpdftext_embeddertest.cpp b/fpdfsdk/fpdftext_embeddertest.cpp
index 198ef8a..3d496bc 100644
--- a/fpdfsdk/fpdftext_embeddertest.cpp
+++ b/fpdfsdk/fpdftext_embeddertest.cpp
@@ -370,6 +370,82 @@
   UnloadPage(page);
 }
 
+// Verifies web link extraction for URLs that end a line of text: a URL is
+// only continued onto the following line when it ends with a hyphen.
+TEST_F(FPDFTextEmbeddertest, WebLinksAcrossLines) {
+  // Abort early if the document or page fails to load; nothing below is
+  // meaningful without them.
+  ASSERT_TRUE(OpenDocument("weblinks_across_lines.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);
+
+  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
+  EXPECT_TRUE(textpage);
+
+  FPDF_PAGELINK pagelink = FPDFLink_LoadWebLinks(textpage);
+  EXPECT_TRUE(pagelink);
+
+  static const char* const kExpectedUrls[] = {
+      "http://example.com?",          // from "http://example.com?\r\nfoo"
+      "http://example.com/",          // from "http://example.com/\r\nfoo"
+      "http://example.com/test-foo",  // from "http://example.com/test-\r\nfoo"
+      "http://abc.com/test-foo",      // from "http://abc.com/test-\r\n\r\nfoo"
+      // Next two links from "http://example.com/\r\nhttp://www.abc.com"
+      "http://example.com/", "http://www.abc.com",
+  };
+  static const int kNumLinks = static_cast<int>(FX_ArraySize(kExpectedUrls));
+
+  EXPECT_EQ(kNumLinks, FPDFLink_CountWebLinks(pagelink));
+
+  unsigned short fixed_buffer[128];
+  for (int i = 0; i < kNumLinks; i++) {
+    const size_t expected_len = strlen(kExpectedUrls[i]) + 1;
+    // memset() takes a size in bytes, so use sizeof to clear the whole buffer.
+    memset(fixed_buffer, 0, sizeof(fixed_buffer));
+    EXPECT_EQ(static_cast<int>(expected_len),
+              FPDFLink_GetURL(pagelink, i, nullptr, 0));
+    EXPECT_EQ(
+        static_cast<int>(expected_len),
+        FPDFLink_GetURL(pagelink, i, fixed_buffer, FX_ArraySize(fixed_buffer)));
+    EXPECT_TRUE(
+        check_unsigned_shorts(kExpectedUrls[i], fixed_buffer, expected_len));
+  }
+
+  FPDFLink_CloseWebLinks(pagelink);
+  FPDFText_ClosePage(textpage);
+  UnloadPage(page);
+}
+
+// Regression test for pdfium:650 using bug_650.pdf: the second web link on
+// the page is expected to be extracted as one complete URL.
+TEST_F(FPDFTextEmbeddertest, WebLinksAcrossLinesBug) {
+  ASSERT_TRUE(OpenDocument("bug_650.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);
+
+  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
+  EXPECT_TRUE(textpage);
+
+  FPDF_PAGELINK pagelink = FPDFLink_LoadWebLinks(textpage);
+  EXPECT_TRUE(pagelink);
+
+  EXPECT_EQ(2, FPDFLink_CountWebLinks(pagelink));
+  unsigned short fixed_buffer[128] = {0};
+  static const char kExpectedUrl[] =
+      "http://tutorial45.com/learn-autocad-basics-day-166/";
+  // sizeof() includes the terminating NUL, which the expected length counts.
+  static const int kUrlSize = static_cast<int>(sizeof(kExpectedUrl));
+
+  EXPECT_EQ(kUrlSize, FPDFLink_GetURL(pagelink, 1, nullptr, 0));
+  EXPECT_EQ(kUrlSize, FPDFLink_GetURL(pagelink, 1, fixed_buffer,
+                                      FX_ArraySize(fixed_buffer)));
+  EXPECT_TRUE(check_unsigned_shorts(kExpectedUrl, fixed_buffer, kUrlSize));
+
+  FPDFLink_CloseWebLinks(pagelink);
+  FPDFText_ClosePage(textpage);
+  UnloadPage(page);
+}
+
 TEST_F(FPDFTextEmbeddertest, GetFontSize) {
   EXPECT_TRUE(OpenDocument("hello_world.pdf"));
   FPDF_PAGE page = LoadPage(0);
diff --git a/testing/resources/bug_650.pdf b/testing/resources/bug_650.pdf
new file mode 100644
index 0000000..5e46032
--- /dev/null
+++ b/testing/resources/bug_650.pdf
Binary files differ
diff --git a/testing/resources/weblinks_across_lines.in b/testing/resources/weblinks_across_lines.in
new file mode 100644
index 0000000..bb04b5e
--- /dev/null
+++ b/testing/resources/weblinks_across_lines.in
@@ -0,0 +1,74 @@
+{{header}}
+{{object 1 0}} <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+{{object 2 0}} <<
+  /Type /Pages
+  /MediaBox [ 0 0 600 600 ]
+  /Count 1
+  /Kids [ 3 0 R ]
+>>
+endobj
+{{object 3 0}} <<
+  /Type /Page
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 4 0 R
+      /F2 5 0 R
+    >>
+  >>
+  /Contents 6 0 R
+>>
+endobj
+{{object 4 0}} <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Times-Roman
+>>
+endobj
+{{object 5 0}} <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Helvetica
+>>
+endobj
+{{object 6 0}} <<
+>>
+stream
+BT
+/F1 12 Tf
+50 50 Td
+(Hello, world! This is not a link.) Tj
+0 50 Td
+(Is this http://example.com?) Tj
+0 50 Td
+(foo a link?) Tj
+/F2 14 Tf
+0 50 Td
+(How about this http://example.com/) Tj
+0 50 Td
+(foo a link?) Tj
+0 50 Td
+(Is this http://example.com/test-) Tj
+0 50 Td
+(foo a link?) Tj
+(Is this http://abc.com/test-) Tj
+0 50 Td
+0 50 Td
+(foo a link?) Tj
+0 50 Td
+(And this http://example.com/) Tj
+0 50 Td
+(http://www.abc.com a link?) Tj
+ET
+endstream
+endobj
+{{xref}}
+trailer <<
+  /Size 6
+  /Root 1 0 R
+>>
+{{startxref}}
+%%EOF
diff --git a/testing/resources/weblinks_across_lines.pdf b/testing/resources/weblinks_across_lines.pdf
new file mode 100644
index 0000000..e9327c4
--- /dev/null
+++ b/testing/resources/weblinks_across_lines.pdf
@@ -0,0 +1,84 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+2 0 obj <<
+  /Type /Pages
+  /MediaBox [ 0 0 600 600 ]
+  /Count 1
+  /Kids [ 3 0 R ]
+>>
+endobj
+3 0 obj <<
+  /Type /Page
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 4 0 R
+      /F2 5 0 R
+    >>
+  >>
+  /Contents 6 0 R
+>>
+endobj
+4 0 obj <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Times-Roman
+>>
+endobj
+5 0 obj <<
+  /Type /Font
+  /Subtype /Type1
+  /BaseFont /Helvetica
+>>
+endobj
+6 0 obj <<
+>>
+stream
+BT
+/F1 12 Tf
+50 50 Td
+(Hello, world! This is not a link.) Tj
+0 50 Td
+(Is this http://example.com?) Tj
+0 50 Td
+(foo a link?) Tj
+/F2 14 Tf
+0 50 Td
+(How about this http://example.com/) Tj
+0 50 Td
+(foo a link?) Tj
+0 50 Td
+(Is this http://example.com/test-) Tj
+0 50 Td
+(foo a link?) Tj
+(Is this http://abc.com/test-) Tj
+0 50 Td
+0 50 Td
+(foo a link?) Tj
+0 50 Td
+(And this http://example.com/) Tj
+0 50 Td
+(http://www.abc.com a link?) Tj
+ET
+endstream
+endobj
+xref
+0 7
+0000000000 65535 f 
+0000000015 00000 n 
+0000000061 00000 n 
+0000000154 00000 n 
+0000000296 00000 n 
+0000000374 00000 n 
+0000000450 00000 n 
+trailer <<
+  /Size 6
+  /Root 1 0 R
+>>
+startxref
+921
+%%EOF