Improve RTL support for numbers with separators

Numbers with separators (e.g. 123-456, 1/1/2000, 1,111.11)
were reversed due to Issue 1994 (also partially mentioned in Issue
1456).

This fixes the bug by creating a new LeftWeak Bidi class
(see https://unicode.org/reports/tr9/). The new class makes sure that
numbers are in the same segment together with their separators.
This prevents the multiple segments inside the number from being
reversed in CFX_BidiString::SetOverallDirectionRight().

Bug: pdfium:1994, pdfium:1456
Change-Id: Ib195accc80d24a6a00ab9ed78d3c0716c70a5d2b
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/104130
Reviewed-by: Lei Zhang <thestig@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
diff --git a/AUTHORS b/AUTHORS
index a4c3f99..f8e3ed9 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -16,6 +16,7 @@
 Antonio Gomes <tonikitoo@igalia.com>
 Chery Cherian <cherycherian@gmail.com>
 Claudio DeSouza <claudiomdsjr@gmail.com>
+Dan Ilan <danilan@gmail.com>
 Felix Kauselmann <licorn@gmail.com>
 GiWan Go <gogil@stealien.com>
 Huy Ngo <huyna89@gmail.com>
diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp
index 825be03..68c2211 100644
--- a/core/fpdftext/cpdf_textpage.cpp
+++ b/core/fpdftext/cpdf_textpage.cpp
@@ -734,7 +734,9 @@
       for (int m = segment.start + segment.count; m > segment.start; --m)
         AddCharInfoByRLDirection(str[m - 1], m_TempCharList[m - 1]);
     } else {
-      eCurrentDirection = CFX_BidiChar::Direction::kLeft;
+      if (segment.direction != CFX_BidiChar::Direction::kLeftWeak) {
+        eCurrentDirection = CFX_BidiChar::Direction::kLeft;
+      }
       for (int m = segment.start; m < segment.start + segment.count; ++m)
         AddCharInfoByLRDirection(str[m], m_TempCharList[m]);
     }
diff --git a/core/fxcrt/fx_bidi.cpp b/core/fxcrt/fx_bidi.cpp
index 0a13ff7..756aa12 100644
--- a/core/fxcrt/fx_bidi.cpp
+++ b/core/fxcrt/fx_bidi.cpp
@@ -9,7 +9,7 @@
 #include <algorithm>
 
 #include "core/fxcrt/fx_unicode.h"
-#include "third_party/base/check.h"
+#include "third_party/base/check_op.h"
 
 CFX_BidiChar::CFX_BidiChar()
     : m_CurrentSegment({0, 0, Direction::kNeutral}),
@@ -19,9 +19,16 @@
   Direction direction;
   switch (pdfium::unicode::GetBidiClass(wch)) {
     case FX_BIDICLASS::kL:
+      direction = Direction::kLeft;
+      break;
     case FX_BIDICLASS::kAN:
     case FX_BIDICLASS::kEN:
-      direction = Direction::kLeft;
+    case FX_BIDICLASS::kNSM:
+    case FX_BIDICLASS::kCS:
+    case FX_BIDICLASS::kES:
+    case FX_BIDICLASS::kET:
+    case FX_BIDICLASS::kBN:
+      direction = Direction::kLeftWeak;
       break;
     case FX_BIDICLASS::kR:
     case FX_BIDICLASS::kAL:
@@ -78,7 +85,8 @@
 CFX_BidiString::~CFX_BidiString() = default;
 
 CFX_BidiChar::Direction CFX_BidiString::OverallDirection() const {
-  DCHECK(m_eOverallDirection != CFX_BidiChar::Direction::kNeutral);
+  DCHECK_NE(m_eOverallDirection, CFX_BidiChar::Direction::kNeutral);
+  DCHECK_NE(m_eOverallDirection, CFX_BidiChar::Direction::kLeftWeak);
   return m_eOverallDirection;
 }
 
diff --git a/core/fxcrt/fx_bidi.h b/core/fxcrt/fx_bidi.h
index 86e7f37..19301c4 100644
--- a/core/fxcrt/fx_bidi.h
+++ b/core/fxcrt/fx_bidi.h
@@ -16,7 +16,7 @@
 // Processes characters and group them into segments based on text direction.
 class CFX_BidiChar {
  public:
-  enum class Direction { kNeutral, kLeft, kRight };
+  enum class Direction { kNeutral, kLeft, kRight, kLeftWeak };
   struct Segment {
     int32_t start;        // Start position.
     int32_t count;        // Character count.
@@ -25,7 +25,7 @@
 
   CFX_BidiChar();
 
-  // Append a character and classify it as left, right, or neutral.
+  // Append a character and classify it as left, left-weak, right, or neutral.
   // Returns true if the character has a different direction than the
   // existing direction to indicate there is a segment to process.
   bool AppendChar(wchar_t wch);
diff --git a/core/fxcrt/fx_bidi_unittest.cpp b/core/fxcrt/fx_bidi_unittest.cpp
index b029819..ce2ae66 100644
--- a/core/fxcrt/fx_bidi_unittest.cpp
+++ b/core/fxcrt/fx_bidi_unittest.cpp
@@ -7,9 +7,10 @@
 
 namespace {
 
-const wchar_t kNeutralChar = 32;
-const wchar_t kLeftChar = 65;
-const wchar_t kRightChar = 1424;
+const wchar_t kNeutralChar = 32;   // ' '
+const wchar_t kLeftChar = 65;      // 'A'
+const wchar_t kRightChar = 1488;   // 'א'
+const wchar_t kLeftWeakChar = 46;  // '.'
 
 }  // namespace
 
@@ -84,6 +85,40 @@
   EXPECT_FALSE(bidi.EndChar());
 }
 
+TEST(fxcrt, BidiCharLeftLeftWeakRight) {
+  CFX_BidiChar bidi;
+  CFX_BidiChar::Segment info;
+
+  EXPECT_TRUE(bidi.AppendChar(kLeftChar));
+  info = bidi.GetSegmentInfo();
+  EXPECT_EQ(0, info.start);
+  EXPECT_EQ(0, info.count);
+
+  EXPECT_FALSE(bidi.AppendChar(kLeftChar));
+  EXPECT_FALSE(bidi.AppendChar(kLeftChar));
+  EXPECT_TRUE(bidi.AppendChar(kLeftWeakChar));
+  info = bidi.GetSegmentInfo();
+  EXPECT_EQ(0, info.start);
+  EXPECT_EQ(3, info.count);
+
+  EXPECT_FALSE(bidi.AppendChar(kLeftWeakChar));
+  EXPECT_FALSE(bidi.AppendChar(kLeftWeakChar));
+  EXPECT_FALSE(bidi.AppendChar(kLeftWeakChar));
+  EXPECT_TRUE(bidi.AppendChar(kRightChar));
+  info = bidi.GetSegmentInfo();
+  EXPECT_EQ(CFX_BidiChar::Direction::kLeftWeak, info.direction);
+  EXPECT_EQ(3, info.start);
+  EXPECT_EQ(4, info.count);
+
+  EXPECT_TRUE(bidi.EndChar());
+  info = bidi.GetSegmentInfo();
+  EXPECT_EQ(CFX_BidiChar::Direction::kRight, info.direction);
+  EXPECT_EQ(7, info.start);
+  EXPECT_EQ(1, info.count);
+
+  EXPECT_FALSE(bidi.EndChar());
+}
+
 TEST(fxcrt, BidiCharLeftRightLeft) {
   CFX_BidiChar bidi;
   CFX_BidiChar::Segment info;
@@ -198,6 +233,49 @@
   }
 }
 
+TEST(fxcrt, BidiStringAllLeftWeak) {
+  {
+    const wchar_t str[] = {kLeftWeakChar, 0};
+    CFX_BidiString bidi(str);
+    EXPECT_EQ(CFX_BidiChar::Direction::kLeft, bidi.OverallDirection());
+
+    auto it = bidi.begin();
+    ASSERT_FALSE(it == bidi.end());
+    EXPECT_EQ(0, it->start);
+    EXPECT_EQ(0, it->count);
+    EXPECT_EQ(CFX_BidiChar::Direction::kNeutral, it->direction);
+
+    ++it;
+    ASSERT_FALSE(it == bidi.end());
+    EXPECT_EQ(0, it->start);
+    EXPECT_EQ(1, it->count);
+    EXPECT_EQ(CFX_BidiChar::Direction::kLeftWeak, it->direction);
+
+    ++it;
+    EXPECT_TRUE(it == bidi.end());
+  }
+  {
+    const wchar_t str[] = {kLeftWeakChar, kLeftWeakChar, kLeftWeakChar, 0};
+    CFX_BidiString bidi(str);
+    EXPECT_EQ(CFX_BidiChar::Direction::kLeft, bidi.OverallDirection());
+
+    auto it = bidi.begin();
+    ASSERT_FALSE(it == bidi.end());
+    EXPECT_EQ(0, it->start);
+    EXPECT_EQ(0, it->count);
+    EXPECT_EQ(CFX_BidiChar::Direction::kNeutral, it->direction);
+
+    ++it;
+    ASSERT_FALSE(it == bidi.end());
+    EXPECT_EQ(0, it->start);
+    EXPECT_EQ(3, it->count);
+    EXPECT_EQ(CFX_BidiChar::Direction::kLeftWeak, it->direction);
+
+    ++it;
+    EXPECT_TRUE(it == bidi.end());
+  }
+}
+
 TEST(fxcrt, BidiStringAllRight) {
   {
     const wchar_t str[] = {kRightChar, 0};
@@ -323,24 +401,69 @@
   EXPECT_TRUE(it == bidi.end());
 }
 
+TEST(fxcrt, BidiStringRightLeftWeakLeftRight) {
+  const wchar_t str[] = {kRightChar, kLeftWeakChar, kLeftChar, kRightChar, 0};
+  CFX_BidiString bidi(str);
+  EXPECT_EQ(CFX_BidiChar::Direction::kRight, bidi.OverallDirection());
+
+  auto it = bidi.begin();
+  ASSERT_FALSE(it == bidi.end());
+  EXPECT_EQ(3, it->start);
+  EXPECT_EQ(1, it->count);
+  EXPECT_EQ(CFX_BidiChar::Direction::kRight, it->direction);
+
+  ++it;
+  ASSERT_FALSE(it == bidi.end());
+  EXPECT_EQ(2, it->start);
+  EXPECT_EQ(1, it->count);
+  EXPECT_EQ(CFX_BidiChar::Direction::kLeft, it->direction);
+
+  ++it;
+  ASSERT_FALSE(it == bidi.end());
+  EXPECT_EQ(1, it->start);
+  EXPECT_EQ(1, it->count);
+  EXPECT_EQ(CFX_BidiChar::Direction::kLeftWeak, it->direction);
+
+  ++it;
+  ASSERT_FALSE(it == bidi.end());
+  EXPECT_EQ(0, it->start);
+  EXPECT_EQ(1, it->count);
+  EXPECT_EQ(CFX_BidiChar::Direction::kRight, it->direction);
+
+  ++it;
+  ASSERT_FALSE(it == bidi.end());
+  EXPECT_EQ(0, it->start);
+  EXPECT_EQ(0, it->count);
+  EXPECT_EQ(CFX_BidiChar::Direction::kNeutral, it->direction);
+
+  ++it;
+  EXPECT_TRUE(it == bidi.end());
+}
+
 TEST(fxcrt, BidiStringReverse) {
-  const wchar_t str[] = {kLeftChar, kNeutralChar, kRightChar, kLeftChar, 0};
+  const wchar_t str[] = {kLeftChar,     kNeutralChar, kRightChar,
+                         kLeftWeakChar, kLeftChar,    0};
   CFX_BidiString bidi(str);
   EXPECT_EQ(CFX_BidiChar::Direction::kLeft, bidi.OverallDirection());
   bidi.SetOverallDirectionRight();
 
   auto it = bidi.begin();
   ASSERT_FALSE(it == bidi.end());
-  EXPECT_EQ(3, it->start);
+  EXPECT_EQ(4, it->start);
   EXPECT_EQ(1, it->count);
   EXPECT_EQ(CFX_BidiChar::Direction::kLeft, it->direction);
 
   ++it;
   ASSERT_FALSE(it == bidi.end());
+  EXPECT_EQ(3, it->start);
+  EXPECT_EQ(1, it->count);
+  EXPECT_EQ(CFX_BidiChar::Direction::kLeftWeak, it->direction);
+
+  ++it;
+  ASSERT_FALSE(it == bidi.end());
   EXPECT_EQ(2, it->start);
   EXPECT_EQ(1, it->count);
   EXPECT_EQ(CFX_BidiChar::Direction::kRight, it->direction);
-  ASSERT_FALSE(it == bidi.end());
 
   ++it;
   ASSERT_FALSE(it == bidi.end());