New API to retrieve a character's loose bounds

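This change adds a new API `FPDFText_GetLooseCharBox` that retrieves the
"loose" bounds of a character.  I.e., the returned bounds will cover the
entire possible extent of the character (given its width).  For
horizontal text, the height of the box is the same as the font size and
the width equals the entire "advance" of the character.  For vertical
text in a CID font the roles are swapped: the width of the box is the
font size and the height is the character's vertical advance.  If the
character has no text object or the font metrics are unusable, the
tight bounds are returned instead.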

Compare with `FPDFText_GetCharBox`, which returns a "tight" box around
the actual curves that make up the glyph.

This API will be used to make the PDF a11y line-breaking heuristics
more reliable, so that, e.g., lines are treated the same way
(mathematically) even if they don't have any ascenders or descenders.

Change-Id: I2860f12865b6c81e90cb2621fd8adac1ab5f2530
Bug: chromium:985604
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/59971
Reviewed-by: Lei Zhang <thestig@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
diff --git a/fpdfsdk/fpdf_text.cpp b/fpdfsdk/fpdf_text.cpp
index e204465..1387473 100644
--- a/fpdfsdk/fpdf_text.cpp
+++ b/fpdfsdk/fpdf_text.cpp
@@ -11,6 +11,7 @@
 #include <vector>
 
 #include "build/build_config.h"
+#include "core/fpdfapi/font/cpdf_cidfont.h"
 #include "core/fpdfapi/font/cpdf_font.h"
 #include "core/fpdfapi/page/cpdf_page.h"
 #include "core/fpdfapi/page/cpdf_textobject.h"
@@ -236,6 +237,80 @@
 }
 
+// Computes the "loose" box of the character at |index|: a box derived from
+// the font size and the character's advance instead of the glyph outline.
+// Returns false without writing to the out parameters when |text_page| or
+// |index| is invalid, or when any out parameter is null.
 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
+FPDFText_GetLooseCharBox(FPDF_TEXTPAGE text_page,
+                         int index,
+                         double* left,
+                         double* right,
+                         double* bottom,
+                         double* top) {
+  // Every out parameter is written below, so reject null pointers up front
+  // instead of dereferencing them.
+  if (!left || !right || !bottom || !top)
+    return false;
+
+  CPDF_TextPage* textpage = GetTextPageForValidIndex(text_page, index);
+  if (!textpage)
+    return false;
+
+  FPDF_CHAR_INFO charinfo;
+  textpage->GetCharInfo(index, &charinfo);
+
+  if (charinfo.m_pTextObj && !IsFloatZero(charinfo.m_FontSize)) {
+    bool is_vert_writing = charinfo.m_pTextObj->GetFont()->IsVertWriting();
+    if (is_vert_writing && charinfo.m_pTextObj->GetFont()->IsCIDFont()) {
+      // Vertical CID text: the box is one font size wide and one vertical
+      // width tall, anchored at the origin shifted by the vertical origin.
+      CPDF_CIDFont* pCIDFont = charinfo.m_pTextObj->GetFont()->AsCIDFont();
+      uint16_t cid = pCIDFont->CIDFromCharCode(charinfo.m_Charcode);
+
+      short vx;
+      short vy;
+      pCIDFont->GetVertOrigin(cid, vx, vy);
+      // |vx|, |vy| and the vertical width are scaled by font size / 1000;
+      // |vx| is taken relative to 500.
+      double offsetx = (vx - 500) * charinfo.m_FontSize / 1000.0;
+      double offsety = vy * charinfo.m_FontSize / 1000.0;
+      short vert_width = pCIDFont->GetVertWidth(cid);
+      double height = vert_width * charinfo.m_FontSize / 1000.0;
+
+      // |height| keeps the sign of the vertical width, so |*top| ends up
+      // below |*bottom| when that width is negative.
+      *left = charinfo.m_Origin.x + offsetx;
+      *right = *left + charinfo.m_FontSize;
+      *bottom = charinfo.m_Origin.y + offsety;
+      *top = *bottom + height;
+      return true;
+    }
+
+    int ascent = charinfo.m_pTextObj->GetFont()->GetTypeAscent();
+    int descent = charinfo.m_pTextObj->GetFont()->GetTypeDescent();
+    // Equal ascent and descent would make the scale below divide by zero.
+    if (ascent != descent) {
+      float width = charinfo.m_pTextObj->GetCharWidth(charinfo.m_Charcode);
+      // Scale so that |*top| - |*bottom| equals the font size.
+      float font_scale = charinfo.m_FontSize / (ascent - descent);
+
+      *left = charinfo.m_Origin.x;
+      *right = charinfo.m_Origin.x + (is_vert_writing ? -width : width);
+      *bottom = charinfo.m_Origin.y + descent * font_scale;
+      *top = charinfo.m_Origin.y + ascent * font_scale;
+      return true;
+    }
+  }
+
+  // Fallback to the tight bounds in empty text scenarios, or bad font metrics
+  *left = charinfo.m_CharBox.left;
+  *right = charinfo.m_CharBox.right;
+  *bottom = charinfo.m_CharBox.bottom;
+  *top = charinfo.m_CharBox.top;
+  return true;
+}
+
+FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
 FPDFText_GetCharOrigin(FPDF_TEXTPAGE text_page,
                        int index,
                        double* x,
diff --git a/fpdfsdk/fpdf_text_embeddertest.cpp b/fpdfsdk/fpdf_text_embeddertest.cpp
index e4d11ed..e3b8216 100644
--- a/fpdfsdk/fpdf_text_embeddertest.cpp
+++ b/fpdfsdk/fpdf_text_embeddertest.cpp
@@ -117,6 +117,14 @@
   EXPECT_NEAR(49.844, bottom, 0.001);
   EXPECT_NEAR(55.520, top, 0.001);
 
+  EXPECT_TRUE(
+      FPDFText_GetLooseCharBox(textpage, 4, &left, &right, &bottom, &top));
+  EXPECT_NEAR(40.664, left, 0.001);
+  EXPECT_NEAR(46.664, right, 0.001);
+  EXPECT_NEAR(47.667, bottom, 0.001);
+  EXPECT_NEAR(59.667, top, 0.001);
+  EXPECT_NEAR(12.000, top - bottom, 0.001);
+
   double x = 0.0;
   double y = 0.0;
   EXPECT_TRUE(FPDFText_GetCharOrigin(textpage, 4, &x, &y));
@@ -193,6 +201,50 @@
   UnloadPage(page);
 }
 
+TEST_F(FPDFTextEmbedderTest, TextVertical) {  // Vertical CID text boxes.
+  ASSERT_TRUE(OpenDocument("vertical_text.pdf"));
+  FPDF_PAGE page = LoadPage(0);
+  ASSERT_TRUE(page);
+
+  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
+  ASSERT_TRUE(textpage);
+
+  EXPECT_EQ(12.0, FPDFText_GetFontSize(textpage, 0));
+
+  double x = 0.0;
+  double y = 0.0;
+  EXPECT_TRUE(FPDFText_GetCharOrigin(textpage, 1, &x, &y));  // 'e'
+  EXPECT_NEAR(6.664, x, 0.001);
+  EXPECT_NEAR(171.508, y, 0.001);
+
+  EXPECT_TRUE(FPDFText_GetCharOrigin(textpage, 2, &x, &y));  // 'l'
+  EXPECT_NEAR(8.668, x, 0.001);
+  EXPECT_NEAR(160.492, y, 0.001);
+
+  double left = 0.0;
+  double right = 0.0;
+  double bottom = 0.0;
+  double top = 0.0;
+  EXPECT_TRUE(
+      FPDFText_GetLooseCharBox(textpage, 1, &left, &right, &bottom, &top));
+  EXPECT_NEAR(4, left, 0.001);  // origin.x + (278 - 500) * 12 / 1000
+  EXPECT_NEAR(16, right, 0.001);
+  EXPECT_NEAR(178.984, bottom, 0.001);  // origin.y + 623 * 12 / 1000
+  EXPECT_NEAR(170.308, top, 0.001);  // Below |bottom|: W2 width -723 < 0.
+  EXPECT_NEAR(12.000, right - left, 0.001);  // Box width == font size.
+
+  EXPECT_TRUE(
+      FPDFText_GetLooseCharBox(textpage, 2, &left, &right, &bottom, &top));
+  EXPECT_NEAR(4, left, 0.001);  // origin.x + (111 - 500) * 12 / 1000
+  EXPECT_NEAR(16, right, 0.001);
+  EXPECT_NEAR(170.308, bottom, 0.001);  // Equals |top| of the previous char.
+  EXPECT_NEAR(159.292, top, 0.001);  // Below |bottom|: W2 width -918 < 0.
+  EXPECT_NEAR(12.000, right - left, 0.001);  // Box width == font size.
+
+  FPDFText_ClosePage(textpage);
+  UnloadPage(page);
+}
+
 TEST_F(FPDFTextEmbedderTest, TextSearch) {
   ASSERT_TRUE(OpenDocument("hello_world.pdf"));
   FPDF_PAGE page = LoadPage(0);
diff --git a/fpdfsdk/fpdf_view_c_api_test.c b/fpdfsdk/fpdf_view_c_api_test.c
index 0a7baf7..e7d1026 100644
--- a/fpdfsdk/fpdf_view_c_api_test.c
+++ b/fpdfsdk/fpdf_view_c_api_test.c
@@ -340,6 +340,7 @@
     CHK(FPDFText_GetFontInfo);
     CHK(FPDFText_GetFontSize);
     CHK(FPDFText_GetFontWeight);
+    CHK(FPDFText_GetLooseCharBox);
     CHK(FPDFText_GetRect);
     CHK(FPDFText_GetSchCount);
     CHK(FPDFText_GetSchResultIndex);
diff --git a/public/fpdf_text.h b/public/fpdf_text.h
index 04664be..d71e146 100644
--- a/public/fpdf_text.h
+++ b/public/fpdf_text.h
@@ -237,6 +237,38 @@
                                                         double* bottom,
                                                         double* top);
 
+// Experimental API.
+// Function: FPDFText_GetLooseCharBox
+//          Get a "loose" bounding box of a particular character, i.e., a box
+//          sized from the font size and the character's advance instead of the
+//          actual glyph shape. Compare with FPDFText_GetCharBox.
+// Parameters:
+//          text_page   -   Handle to a text page information structure.
+//                          Returned by FPDFText_LoadPage function.
+//          index       -   Zero-based index of the character.
+//          left        -   Pointer to a double number receiving left position
+//                          of the character box.
+//          right       -   Pointer to a double number receiving right position
+//                          of the character box.
+//          bottom      -   Pointer to a double number receiving bottom position
+//                          of the character box.
+//          top         -   Pointer to a double number receiving top position of
+//                          the character box.
+// Return Value:
+//          On success, return TRUE and fill in |left|, |right|, |bottom|, and
+//          |top|. If |text_page| is invalid, or if |index| is out of bounds,
+//          then return FALSE, and the out parameters remain unmodified.
+// Comments:
+//          All positions are measured in PDF "user space".
+//
+FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
+FPDFText_GetLooseCharBox(FPDF_TEXTPAGE text_page,
+                         int index,
+                         double* left,
+                         double* right,
+                         double* bottom,
+                         double* top);
+
 // Function: FPDFText_GetCharOrigin
 //          Get origin of a particular character.
 // Parameters:
diff --git a/testing/resources/vertical_text.in b/testing/resources/vertical_text.in
new file mode 100644
index 0000000..4a874c3
--- /dev/null
+++ b/testing/resources/vertical_text.in
@@ -0,0 +1,92 @@
+{{header}}
+{{object 1 0}} <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+endobj
+{{object 2 0}} <<
+  /Type /Pages
+  /MediaBox [ 0 0 200 200 ]
+  /Count 1
+  /Kids [ 3 0 R ]
+>>
+endobj
+{{object 3 0}} <<
+  /Type /Page
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 4 0 R
+    >>
+  >>
+  /Contents 7 0 R
+>>
+endobj
+{{object 4 0}} <<
+  /Type /Font
+  /Subtype /Type0
+  /Encoding /UniGB-UTF16-V
+  /BaseFont /Helvetica
+  /DescendantFonts [5 0 R]
+>>
+endobj
+{{object 5 0}} <<
+  /Type /Font
+  /Subtype /CIDFontType2
+  /BaseFont /Helvetica
+  /CIDSystemInfo <</Registry (Adobe) /Ordering (GB1) /Supplement 4>>
+  /FontDescriptor 6 0 R
+  /DW 1000
+  /W [
+    1 [278]  %space
+    2 [278]  %!
+    41 [722] %H
+    56 [944] %W
+    69 [556] %d
+    70 [556] %e
+    77 [222] %l
+    80 [556] %o
+    83 [333] %r
+  ]
+  /DW2 [0 -1000]
+  /W2 [
+    1 [-723 139 623]    %space
+    2 [-918 139 818]    %!
+    41 [-918 361 818]   %H
+    56 [-918 472 818]   %W
+    69 [-918 278 818]   %d
+    70 [-723 278 623]   %e
+    77 [-918 111 818]   %l
+    80 [-723 278 623]   %o
+    83 [-723 166.5 623] %r
+  ]
+>>
+endobj
+{{object 6 0}} <<
+  /Type /FontDescriptor
+  /Ascent 718
+  /CapHeight 500
+  /Descent -207
+  /Flags 32
+  /FontBBox [-166 -225 1000 931]
+  /FontName /Helvetica
+  /ItalicAngle 0
+  /StemV 80
+>>
+endobj
+{{object 7 0}} <<
+  {{streamlen}}
+>>
+stream
+BT
+/F1 12 Tf
+10 190 Td
+(\000H\000e\000l\000l\000o\000 ) Tj
+(\000W\000o\000r\000l\000d\000!) Tj
+ET
+endstream
+endobj
+{{xref}}
+{{trailer}}
+{{startxref}}
+%%EOF
diff --git a/testing/resources/vertical_text.pdf b/testing/resources/vertical_text.pdf
new file mode 100644
index 0000000..37d8115
--- /dev/null
+++ b/testing/resources/vertical_text.pdf
@@ -0,0 +1,106 @@
+%PDF-1.7
+% ò¤ô
+1 0 obj <<
+  /Type /Catalog
+  /Pages 2 0 R
+>>
+endobj
+2 0 obj <<
+  /Type /Pages
+  /MediaBox [ 0 0 200 200 ]
+  /Count 1
+  /Kids [ 3 0 R ]
+>>
+endobj
+3 0 obj <<
+  /Type /Page
+  /Parent 2 0 R
+  /Resources <<
+    /Font <<
+      /F1 4 0 R
+    >>
+  >>
+  /Contents 7 0 R
+>>
+endobj
+4 0 obj <<
+  /Type /Font
+  /Subtype /Type0
+  /Encoding /UniGB-UTF16-V
+  /BaseFont /Helvetica
+  /DescendantFonts [5 0 R]
+>>
+endobj
+5 0 obj <<
+  /Type /Font
+  /Subtype /CIDFontType2
+  /BaseFont /Helvetica
+  /CIDSystemInfo <</Registry (Adobe) /Ordering (GB1) /Supplement 4>>
+  /FontDescriptor 6 0 R
+  /DW 1000
+  /W [
+    1 [278]  %space
+    2 [278]  %!
+    41 [722] %H
+    56 [944] %W
+    69 [556] %d
+    70 [556] %e
+    77 [222] %l
+    80 [556] %o
+    83 [333] %r
+  ]
+  /DW2 [0 -1000]
+  /W2 [
+    1 [-723 139 623]    %space
+    2 [-918 139 818]    %!
+    41 [-918 361 818]   %H
+    56 [-918 472 818]   %W
+    69 [-918 278 818]   %d
+    70 [-723 278 623]   %e
+    77 [-918 111 818]   %l
+    80 [-723 278 623]   %o
+    83 [-723 166.5 623] %r
+  ]
+>>
+endobj
+6 0 obj <<
+  /Type /FontDescriptor
+  /Ascent 718
+  /CapHeight 500
+  /Descent -207
+  /Flags 32
+  /FontBBox [-166 -225 1000 931]
+  /FontName /Helvetica
+  /ItalicAngle 0
+  /StemV 80
+>>
+endobj
+7 0 obj <<
+  /Length 98
+>>
+stream
+BT
+/F1 12 Tf
+10 190 Td
+(\000H\000e\000l\000l\000o\000 ) Tj
+(\000W\000o\000r\000l\000d\000!) Tj
+ET
+endstream
+endobj
+xref
+0 8
+0000000000 65535 f 
+0000000015 00000 n 
+0000000068 00000 n 
+0000000161 00000 n 
+0000000287 00000 n 
+0000000417 00000 n 
+0000001039 00000 n 
+0000001228 00000 n 
+trailer <<
+  /Root 1 0 R
+  /Size 8
+>>
+startxref
+1377
+%%EOF