Rewrite IsHyphen using string operations

The existing code did end of range checks by making sure that the 
value was never less then 0. This isn't correct when using an unsigned
type, since 0 - 1 will wrap around to the max possible value, and
thus still be less then 0. Additionally the existing code was hard to
follow due to the complexity of some of the low level operations being 
performed.

It has been rewritten using higher level string operations to make it
clearer and correct.

BUG=chromium:763256

Change-Id: Ib8bf5ca0e29e73724c4a1c4781362e8a8fc30149
Reviewed-on: https://pdfium-review.googlesource.com/13690
Commit-Queue: Ryan Harrison <rharrison@chromium.org>
Reviewed-by: Tom Sepez <tsepez@chromium.org>
diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp
index 3006ebc..ee1d51b 100644
--- a/core/fpdftext/cpdf_textpage.cpp
+++ b/core/fpdftext/cpdf_textpage.cpp
@@ -106,6 +106,10 @@
   return count / (end - start);
 }
 
+bool IsHyphenCode(wchar_t c) {
+  return c == 0x2D || c == 0xAD;
+}
+
 }  // namespace
 
 PDFTEXT_Obj::PDFTEXT_Obj() {}
@@ -1215,36 +1219,37 @@
   return m_TextlineDir;
 }
 
-bool CPDF_TextPage::IsHyphen(wchar_t curChar) {
-  CFX_WideString strCurText = m_TempTextBuf.MakeString();
-  if (strCurText.IsEmpty())
-    strCurText = m_TextBuf.AsStringC();
-  FX_STRSIZE nCount = strCurText.GetLength();
-  if (nCount < 1)
+bool CPDF_TextPage::IsHyphen(wchar_t curChar) const {
+  CFX_WideStringC curText;
+  if (!m_TempTextBuf.IsEmpty())
+    curText = m_TempTextBuf.AsStringC();
+  else if (!m_TextBuf.IsEmpty())
+    curText = m_TextBuf.AsStringC();
+  else
     return false;
-  FX_STRSIZE nIndex = nCount - 1;
-  wchar_t wcTmp = strCurText[nIndex];
-  while (wcTmp == 0x20 && nIndex > 0 && nIndex <= nCount - 1)
-    wcTmp = strCurText[--nIndex];
-  if (0x2D == wcTmp || 0xAD == wcTmp) {
-    if (--nIndex > 0) {
-      wchar_t preChar = strCurText[nIndex];
-      if (FXSYS_iswalpha(preChar) && FXSYS_iswalpha(curChar))
-        return true;
-    }
-    const PAGECHAR_INFO* preInfo;
-    if (!m_TempCharList.empty())
-      preInfo = &m_TempCharList.back();
-    else if (!m_CharList.empty())
-      preInfo = &m_CharList.back();
-    else
-      return false;
-    if (FPDFTEXT_CHAR_PIECE == preInfo->m_Flag &&
-        (0xAD == preInfo->m_Unicode || 0x2D == preInfo->m_Unicode)) {
-      return true;
-    }
-  }
-  return false;
+
+  curText = curText.TrimmedRight(0x20);
+  if (curText.GetLength() < 2)
+    return false;
+
+  // Extracting the last 2 characters, since they are all that matter
+  curText = curText.Right(2);
+  if (!IsHyphenCode(curText.Last()))
+    return false;
+
+  if (FXSYS_iswalpha(curText.First() && FXSYS_iswalnum(curChar)))
+    return true;
+
+  const PAGECHAR_INFO* preInfo;
+  if (!m_TempCharList.empty())
+    preInfo = &m_TempCharList.back();
+  else if (!m_CharList.empty())
+    preInfo = &m_CharList.back();
+  else
+    return false;
+
+  return FPDFTEXT_CHAR_PIECE == preInfo->m_Flag &&
+         IsHyphenCode(preInfo->m_Unicode);
 }
 
 CPDF_TextPage::GenerateCharacter CPDF_TextPage::ProcessInsertObject(
diff --git a/core/fpdftext/cpdf_textpage.h b/core/fpdftext/cpdf_textpage.h
index 6a4c873..41892ea 100644
--- a/core/fpdftext/cpdf_textpage.h
+++ b/core/fpdftext/cpdf_textpage.h
@@ -132,7 +132,7 @@
     Hyphen,
   };
 
-  bool IsHyphen(wchar_t curChar);
+  bool IsHyphen(wchar_t curChar) const;
   bool IsControlChar(const PAGECHAR_INFO& charInfo);
   void ProcessObject();
   void ProcessFormObject(CPDF_FormObject* pFormObj,
diff --git a/core/fxcrt/cfx_binarybuf.cpp b/core/fxcrt/cfx_binarybuf.cpp
index 73fe945..b826fdd 100644
--- a/core/fxcrt/cfx_binarybuf.cpp
+++ b/core/fxcrt/cfx_binarybuf.cpp
@@ -28,6 +28,10 @@
   m_DataSize -= count;
 }
 
+FX_STRSIZE CFX_BinaryBuf::GetLength() const {
+  return m_DataSize;
+}
+
 void CFX_BinaryBuf::Clear() {
   m_DataSize = 0;
 }
diff --git a/core/fxcrt/cfx_binarybuf.h b/core/fxcrt/cfx_binarybuf.h
index 3081d02..4c795ed 100644
--- a/core/fxcrt/cfx_binarybuf.h
+++ b/core/fxcrt/cfx_binarybuf.h
@@ -17,10 +17,12 @@
  public:
   CFX_BinaryBuf();
   explicit CFX_BinaryBuf(FX_STRSIZE size);
-  ~CFX_BinaryBuf();
+  virtual ~CFX_BinaryBuf();
 
   uint8_t* GetBuffer() const { return m_pBuffer.get(); }
   FX_STRSIZE GetSize() const { return m_DataSize; }
+  virtual FX_STRSIZE GetLength() const;
+  bool IsEmpty() const { return GetLength() == 0; }
 
   void Clear();
   void EstimateSize(FX_STRSIZE size, FX_STRSIZE alloc_step = 0);
diff --git a/core/fxcrt/cfx_bytestring_unittest.cpp b/core/fxcrt/cfx_bytestring_unittest.cpp
index dcb8577..a4c5187 100644
--- a/core/fxcrt/cfx_bytestring_unittest.cpp
+++ b/core/fxcrt/cfx_bytestring_unittest.cpp
@@ -999,6 +999,14 @@
   EXPECT_EQ("", longer_string.Mid(4, 3));
 }
 
+TEST(fxcrt, ByteStringCTrimmedRight) {
+  CFX_ByteStringC fred("FRED");
+  EXPECT_EQ("FRED", fred.TrimmedRight('E'));
+  EXPECT_EQ("FRE", fred.TrimmedRight('D'));
+  CFX_ByteStringC fredd("FREDD");
+  EXPECT_EQ("FRE", fred.TrimmedRight('D'));
+}
+
 TEST(fxcrt, ByteStringCElementAccess) {
   // CFX_ByteStringC includes the NUL terminator for non-empty strings.
   CFX_ByteStringC abc("abc");
diff --git a/core/fxcrt/cfx_string_c_template.h b/core/fxcrt/cfx_string_c_template.h
index 8585d73..b0e17a1 100644
--- a/core/fxcrt/cfx_string_c_template.h
+++ b/core/fxcrt/cfx_string_c_template.h
@@ -176,6 +176,20 @@
     return Mid(GetLength() - count, count);
   }
 
+  CFX_StringCTemplate TrimmedRight(CharType ch) const {
+    if (IsEmpty())
+      return CFX_StringCTemplate();
+
+    FX_STRSIZE pos = GetLength();
+    while (pos && CharAt(pos - 1) == ch)
+      pos--;
+
+    if (pos == 0)
+      return CFX_StringCTemplate();
+
+    return CFX_StringCTemplate(m_Ptr.Get(), pos);
+  }
+
   bool operator<(const CFX_StringCTemplate& that) const {
     int result = FXSYS_cmp(reinterpret_cast<const CharType*>(m_Ptr.Get()),
                            reinterpret_cast<const CharType*>(that.m_Ptr.Get()),
diff --git a/core/fxcrt/cfx_widestring_unittest.cpp b/core/fxcrt/cfx_widestring_unittest.cpp
index 7b12d50..1c8aca9 100644
--- a/core/fxcrt/cfx_widestring_unittest.cpp
+++ b/core/fxcrt/cfx_widestring_unittest.cpp
@@ -1038,6 +1038,14 @@
   EXPECT_FALSE(pdfium::ContainsValue(str, L'z'));
 }
 
+TEST(fxcrt, WideStringCTrimmedRight) {
+  CFX_WideStringC fred(L"FRED");
+  EXPECT_EQ(L"FRED", fred.TrimmedRight(L'E'));
+  EXPECT_EQ(L"FRE", fred.TrimmedRight(L'D'));
+  CFX_WideStringC fredd(L"FREDD");
+  EXPECT_EQ(L"FRE", fred.TrimmedRight(L'D'));
+}
+
 TEST(fxcrt, WideStringFormatWidth) {
   {
     CFX_WideString str;
diff --git a/core/fxcrt/cfx_widetextbuf.cpp b/core/fxcrt/cfx_widetextbuf.cpp
index 246124b..81b5fd4 100644
--- a/core/fxcrt/cfx_widetextbuf.cpp
+++ b/core/fxcrt/cfx_widetextbuf.cpp
@@ -6,6 +6,10 @@
 
 #include "core/fxcrt/cfx_widetextbuf.h"
 
+FX_STRSIZE CFX_WideTextBuf::GetLength() const {
+  return m_DataSize / sizeof(wchar_t);
+}
+
 void CFX_WideTextBuf::AppendChar(wchar_t ch) {
   ExpandBuf(sizeof(wchar_t));
   *(wchar_t*)(m_pBuffer.get() + m_DataSize) = ch;
diff --git a/core/fxcrt/cfx_widetextbuf.h b/core/fxcrt/cfx_widetextbuf.h
index 0c9b6ff..389f2e5 100644
--- a/core/fxcrt/cfx_widetextbuf.h
+++ b/core/fxcrt/cfx_widetextbuf.h
@@ -14,7 +14,7 @@
 class CFX_WideTextBuf : public CFX_BinaryBuf {
  public:
   void AppendChar(wchar_t wch);
-  FX_STRSIZE GetLength() const { return m_DataSize / sizeof(wchar_t); }
+  FX_STRSIZE GetLength() const override;
   wchar_t* GetBuffer() const {
     return reinterpret_cast<wchar_t*>(m_pBuffer.get());
   }