blob: de5490a766d40e26c933b2df9dddba3fb04774de [file] [log] [blame]
// Copyright 2017 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
#include "xfa/fxfa/cxfa_textparser.h"
#include <algorithm>
#include <utility>
#include <vector>
#include "core/fxcrt/fx_codepage.h"
#include "core/fxcrt/xml/cfx_xmlelement.h"
#include "core/fxcrt/xml/cfx_xmlnode.h"
#include "third_party/base/ptr_util.h"
#include "xfa/fde/css/cfde_csscomputedstyle.h"
#include "xfa/fde/css/cfde_cssstyleselector.h"
#include "xfa/fde/css/cfde_cssstylesheet.h"
#include "xfa/fde/css/fde_css.h"
#include "xfa/fgas/font/cfgas_fontmgr.h"
#include "xfa/fxfa/cxfa_ffapp.h"
#include "xfa/fxfa/cxfa_ffdoc.h"
#include "xfa/fxfa/cxfa_fontmgr.h"
#include "xfa/fxfa/cxfa_textparsecontext.h"
#include "xfa/fxfa/cxfa_textprovider.h"
#include "xfa/fxfa/cxfa_texttabstopscontext.h"
#include "xfa/fxfa/parser/cxfa_measurement.h"
namespace {
// States of the scanner that CXFA_TextParser::GetTabstops() runs over a
// tab-stops style value.
enum class TabStopStatus {
// Error and EOS are not referenced by the code visible in this file.
Error,
EOS,
// Between entries: skipping characters <= ' ' until an entry starts.
None,
// Reading the alignment keyword of an entry (terminated by a space).
Alignment,
// Just after the alignment keyword: decides whether a leader specification
// (starting with 'l') has to be skipped before the location.
StartLeader,
// Not referenced by the code visible in this file.
Leader,
// Reading the location (measurement) of an entry.
Location,
};
} // namespace
// Creates a parser that has not parsed anything yet and has not installed its
// default CSS data.
CXFA_TextParser::CXFA_TextParser()
: m_bParsed(false), m_cssInitialized(false) {}
// Nothing to release by hand; the members clean up after themselves.
CXFA_TextParser::~CXFA_TextParser() = default;
// Forgets the result of the previous parse so that DoParse() can run again.
// The CSS selector and its initialization flag are left untouched.
void CXFA_TextParser::Reset() {
  m_bParsed = false;
  m_mapXMLNodeToParseContext.clear();
}
// Prepares the CSS machinery used while parsing rich text.
//
// The style selector is created on first use, with its default font size
// taken from the provider's font node (10 when there is no font node). The
// built-in user-agent style sheet is installed only once per parser.
// Does nothing when |pTextProvider| is null.
void CXFA_TextParser::InitCSSData(CXFA_TextProvider* pTextProvider) {
  if (!pTextProvider)
    return;

  if (!m_pSelector) {
    CXFA_Font fontNode = pTextProvider->GetFontNode();
    const float fDefaultFontSize = fontNode ? fontNode.GetFontSize() : 10.0f;
    m_pSelector = pdfium::MakeUnique<CFDE_CSSStyleSelector>();
    m_pSelector->SetDefFontSize(fDefaultFontSize);
  }

  if (!m_cssInitialized) {
    m_cssInitialized = true;
    m_pSelector->SetUAStyleSheet(LoadDefaultSheetStyle());
    m_pSelector->UpdateStyleIndex();
  }
}
std::unique_ptr<CFDE_CSSStyleSheet> CXFA_TextParser::LoadDefaultSheetStyle() {
static const wchar_t s_pStyle[] =
L"html,body,ol,p,ul{display:block}"
L"li{display:list-item}"
L"ol,ul{padding-left:33px;margin:1.12em 0}"
L"ol{list-style-type:decimal}"
L"a{color:#0000ff;text-decoration:underline}"
L"b{font-weight:bolder}"
L"i{font-style:italic}"
L"sup{vertical-align:+15em;font-size:.66em}"
L"sub{vertical-align:-15em;font-size:.66em}";
auto sheet = pdfium::MakeUnique<CFDE_CSSStyleSheet>();
return sheet->LoadBuffer(s_pStyle, FXSYS_wcslen(s_pStyle)) ? std::move(sheet)
: nullptr;
}
// Creates the top-level computed style from the provider's para and font
// nodes.
//
// The para node, when present, supplies line height, text indent, horizontal
// alignment and margins. The font node, when present, supplies colour,
// italic/bold, baseline shift, font size, letter spacing and text decoration.
// Without them the line height is 0 and the font size is 10.
CFX_RetainPtr<CFDE_CSSComputedStyle> CXFA_TextParser::CreateRootStyle(
    CXFA_TextProvider* pTextProvider) {
  CXFA_Font font = pTextProvider->GetFontNode();
  CXFA_Para para = pTextProvider->GetParaNode();
  auto pRootStyle = m_pSelector->CreateComputedStyle(nullptr);

  float fRootLineHeight = 0;
  if (para) {
    fRootLineHeight = para.GetLineHeight();

    FDE_CSSLength textIndent;
    textIndent.Set(FDE_CSSLengthUnit::Point, para.GetTextIndent());
    pRootStyle->SetTextIndent(textIndent);

    // Anything other than the four values below keeps the left alignment.
    FDE_CSSTextAlign eTextAlign = FDE_CSSTextAlign::Left;
    switch (para.GetHorizontalAlign()) {
      case XFA_ATTRIBUTEENUM_JustifyAll:
        eTextAlign = FDE_CSSTextAlign::JustifyAll;
        break;
      case XFA_ATTRIBUTEENUM_Justify:
        eTextAlign = FDE_CSSTextAlign::Justify;
        break;
      case XFA_ATTRIBUTEENUM_Right:
        eTextAlign = FDE_CSSTextAlign::Right;
        break;
      case XFA_ATTRIBUTEENUM_Center:
        eTextAlign = FDE_CSSTextAlign::Center;
        break;
    }
    pRootStyle->SetTextAlign(eTextAlign);

    FDE_CSSRect margins;
    margins.left.Set(FDE_CSSLengthUnit::Point, para.GetMarginLeft());
    margins.top.Set(FDE_CSSLengthUnit::Point, para.GetSpaceAbove());
    margins.right.Set(FDE_CSSLengthUnit::Point, para.GetMarginRight());
    margins.bottom.Set(FDE_CSSLengthUnit::Point, para.GetSpaceBelow());
    pRootStyle->SetMarginWidth(margins);
  }

  float fRootFontSize = 10;
  if (font) {
    pRootStyle->SetColor(font.GetColor());
    if (font.IsItalic())
      pRootStyle->SetFontStyle(FDE_CSSFontStyle::Italic);
    else
      pRootStyle->SetFontStyle(FDE_CSSFontStyle::Normal);
    if (font.IsBold())
      pRootStyle->SetFontWeight(FXFONT_FW_BOLD);
    else
      pRootStyle->SetFontWeight(FXFONT_FW_NORMAL);
    pRootStyle->SetNumberVerticalAlign(-font.GetBaselineShift());
    fRootFontSize = font.GetFontSize();

    FDE_CSSLength spacing;
    spacing.Set(FDE_CSSLengthUnit::Point, font.GetLetterSpacing());
    pRootStyle->SetLetterSpacing(spacing);

    // An underline value above 1 means double; otherwise any positive value
    // means a single underline.
    uint32_t dwTextDecoration = 0;
    if (font.GetLineThrough() > 0)
      dwTextDecoration |= FDE_CSSTEXTDECORATION_LineThrough;
    if (font.GetUnderline() > 1) {
      dwTextDecoration |= FDE_CSSTEXTDECORATION_Double;
    } else if (font.GetUnderline() > 0) {
      dwTextDecoration |= FDE_CSSTEXTDECORATION_Underline;
    }
    pRootStyle->SetTextDecoration(dwTextDecoration);
  }

  pRootStyle->SetLineHeight(fRootLineHeight);
  pRootStyle->SetFontSize(fRootFontSize);
  return pRootStyle;
}
// Creates a computed style derived from |pParentStyle| (which may be null).
//
// In addition to whatever the selector sets up, the new style explicitly
// copies the parent's text decoration, its numeric vertical alignment (0 when
// the parent's vertical alignment is not numeric) and its margins, if any.
CFX_RetainPtr<CFDE_CSSComputedStyle> CXFA_TextParser::CreateStyle(
    CFDE_CSSComputedStyle* pParentStyle) {
  auto pChildStyle = m_pSelector->CreateComputedStyle(pParentStyle);
  ASSERT(pChildStyle);
  if (pParentStyle) {
    const uint32_t dwParentDecoration = pParentStyle->GetTextDecoration();
    float fParentBaseline = 0;
    if (pParentStyle->GetVerticalAlign() == FDE_CSSVerticalAlign::Number)
      fParentBaseline = pParentStyle->GetNumberVerticalAlign();

    pChildStyle->SetTextDecoration(dwParentDecoration);
    pChildStyle->SetNumberVerticalAlign(fParentBaseline);
    if (const FDE_CSSRect* pParentMargins = pParentStyle->GetMarginWidth())
      pChildStyle->SetMarginWidth(*pParentMargins);
  }
  return pChildStyle;
}
// Computes the style of |pXMLNode| given the style of its parent.
//
// Returns null when the node has no parse context recorded by ParseRichText()
// or when the node is text content. As a side effect the node's parse context
// remembers |pParentStyle|, even when null is returned for a content node.
CFX_RetainPtr<CFDE_CSSComputedStyle> CXFA_TextParser::ComputeStyle(
    CFX_XMLNode* pXMLNode,
    CFDE_CSSComputedStyle* pParentStyle) {
  auto found = m_mapXMLNodeToParseContext.find(pXMLNode);
  CXFA_TextParseContext* pParseContext =
      found != m_mapXMLNodeToParseContext.end() ? found->second.get()
                                                : nullptr;
  if (!pParseContext)
    return nullptr;

  pParseContext->m_pParentStyle.Reset(pParentStyle);

  auto pTagInfo = ParseTagInfo(pXMLNode);
  if (pTagInfo->m_bContent)
    return nullptr;

  auto pComputed = CreateStyle(pParentStyle);
  m_pSelector->ComputeStyle(pParseContext->GetDecls(),
                            pTagInfo->GetAttribute(L"style"),
                            pTagInfo->GetAttribute(L"align"), pComputed.Get());
  return pComputed;
}
// Parses the rich text below |pXMLContainer| once. Subsequent calls are
// ignored until Reset() is called; null arguments are ignored as well.
void CXFA_TextParser::DoParse(CFX_XMLNode* pXMLContainer,
                              CXFA_TextProvider* pTextProvider) {
  if (m_bParsed)
    return;
  if (!pXMLContainer || !pTextProvider)
    return;

  m_bParsed = true;
  InitCSSData(pTextProvider);
  CFX_RetainPtr<CFDE_CSSComputedStyle> pRoot = CreateRootStyle(pTextProvider);
  ParseRichText(pXMLContainer, pRoot.Get());
}
// Recursively walks the XML tree rooted at |pXMLNode| and records a
// CXFA_TextParseContext for every visited node in m_mapXMLNodeToParseContext.
//
// Nodes whose tag is not available (see ParseTagInfo()) are skipped together
// with their whole subtree. For element nodes the matching CSS declarations
// and the computed display type are stored in the context; text content nodes
// get an inline display and no declarations. Children are parsed with the
// style computed for this node as their parent style.
void CXFA_TextParser::ParseRichText(CFX_XMLNode* pXMLNode,
CFDE_CSSComputedStyle* pParentStyle) {
if (!pXMLNode)
return;
auto tagProvider = ParseTagInfo(pXMLNode);
if (!tagProvider->m_bTagAvailable)
return;
CFX_RetainPtr<CFDE_CSSComputedStyle> pNewStyle;
// NOTE(review): this condition is always true, because a tag name cannot be
// equal to both "body" and "html" at once. "body" and "html" nodes therefore
// get a parse context like every other node. Replacing || with && would leave
// |pNewStyle| null for those nodes, so their children would be parsed with a
// null parent style -- confirm the intended behavior before changing it.
if ((tagProvider->GetTagName() != L"body") ||
(tagProvider->GetTagName() != L"html")) {
auto pTextContext = pdfium::MakeUnique<CXFA_TextParseContext>();
// Text content keeps this default; elements take the computed display.
FDE_CSSDisplay eDisplay = FDE_CSSDisplay::Inline;
if (!tagProvider->m_bContent) {
auto declArray =
m_pSelector->MatchDeclarations(tagProvider->GetTagName());
pNewStyle = CreateStyle(pParentStyle);
m_pSelector->ComputeStyle(declArray, tagProvider->GetAttribute(L"style"),
tagProvider->GetAttribute(L"align"),
pNewStyle.Get());
if (!declArray.empty())
pTextContext->SetDecls(std::move(declArray));
eDisplay = pNewStyle->GetDisplay();
}
pTextContext->SetDisplay(eDisplay);
m_mapXMLNodeToParseContext[pXMLNode] = std::move(pTextContext);
}
// |pNewStyle| is null here when this node is text content.
for (CFX_XMLNode* pXMLChild = pXMLNode->GetNodeItem(CFX_XMLNode::FirstChild);
pXMLChild;
pXMLChild = pXMLChild->GetNodeItem(CFX_XMLNode::NextSibling)) {
ParseRichText(pXMLChild, pNewStyle.Get());
}
}
// Returns true when the hash of |wsName| is one of the supported rich-text
// tags listed below.
bool CXFA_TextParser::TagValidate(const CFX_WideString& wsName) const {
  // Must stay sorted in ascending order: the lookup is a binary search.
  static const uint32_t kSupportedTagHashes[] = {
      0x61,        // a
      0x62,        // b
      0x69,        // i
      0x70,        // p
      0x0001f714,  // br
      0x00022a55,  // li
      0x000239bb,  // ol
      0x00025881,  // ul
      0x0bd37faa,  // sub
      0x0bd37fb8,  // sup
      0xa73e3af2,  // span
      0xb182eaae,  // body
      0xdb8ac455,  // html
  };
  const uint32_t dwNameHash = FX_HashCode_GetW(wsName.AsStringC(), true);
  return std::binary_search(std::begin(kSupportedTagHashes),
                            std::end(kSupportedTagHashes), dwNameHash);
}
// Describes |pXMLNode| as a TagProvider.
//
// Text nodes are flagged as available content. Element nodes get their local
// tag name, an availability flag from TagValidate() and, when non-empty, the
// value of their "style" attribute. Any other node type yields a provider
// with both flags left false.
std::unique_ptr<CXFA_TextParser::TagProvider> CXFA_TextParser::ParseTagInfo(
    CFX_XMLNode* pXMLNode) {
  auto pProvider = pdfium::MakeUnique<TagProvider>();

  if (pXMLNode->GetType() == FX_XMLNODE_Element) {
    CFX_XMLElement* pElement = static_cast<CFX_XMLElement*>(pXMLNode);
    CFX_WideString wsTagName;
    wsTagName = pElement->GetLocalTagName();
    pProvider->SetTagName(wsTagName);
    pProvider->m_bTagAvailable = TagValidate(wsTagName);

    CFX_WideString wsStyle = pElement->GetString(L"style");
    if (!wsStyle.IsEmpty())
      pProvider->SetAttribute(L"style", wsStyle);
    return pProvider;
  }

  if (pXMLNode->GetType() == FX_XMLNODE_Text) {
    pProvider->m_bTagAvailable = true;
    pProvider->m_bContent = true;
  }
  return pProvider;
}
// Returns the vertical alignment of the provider's para node, or
// XFA_ATTRIBUTEENUM_Top when there is no para node.
int32_t CXFA_TextParser::GetVAlign(CXFA_TextProvider* pTextProvider) const {
  CXFA_Para paraNode = pTextProvider->GetParaNode();
  if (!paraNode)
    return XFA_ATTRIBUTEENUM_Top;
  return paraNode.GetVerticalAlign();
}
// Returns the "tab-interval" custom style of |pStyle| converted to points, or
// 36 when there is no style or the style does not define it.
float CXFA_TextParser::GetTabInterval(CFDE_CSSComputedStyle* pStyle) const {
  if (!pStyle)
    return 36;

  CFX_WideString wsInterval;
  if (!pStyle->GetCustomStyle(L"tab-interval", wsInterval))
    return 36;

  return CXFA_Measurement(wsInterval.AsStringC()).ToUnit(XFA_UNIT_Pt);
}
// Returns the integer value of the "xfa-tab-count" custom style of |pStyle|,
// or 0 when there is no style or the style does not define it.
int32_t CXFA_TextParser::CountTabs(CFDE_CSSComputedStyle* pStyle) const {
  if (!pStyle)
    return 0;

  CFX_WideString wsTabCount;
  if (!pStyle->GetCustomStyle(L"xfa-tab-count", wsTabCount))
    return 0;

  return wsTabCount.GetInteger();
}
// Returns true when the "xfa-spacerun" custom style of |pStyle| equals "yes"
// after lower-casing; false when there is no style or no such custom style.
bool CXFA_TextParser::IsSpaceRun(CFDE_CSSComputedStyle* pStyle) const {
  if (!pStyle)
    return false;

  CFX_WideString wsSpaceRun;
  if (!pStyle->GetCustomStyle(L"xfa-spacerun", wsSpaceRun))
    return false;

  wsSpaceRun.MakeLower();
  return wsSpaceRun == L"yes";
}
// Resolves the font used to render text styled by |pStyle|.
//
// The family defaults to "Courier" with no style flags. The provider's font
// node, when present, supplies the typeface and bold/italic flags. A computed
// style, when present, overrides the family with the last font family it
// lists (if it lists any) and always recomputes the bold/italic flags from the
// style alone. The lookup itself is delegated to the document's XFA font
// manager.
CFX_RetainPtr<CFGAS_GEFont> CXFA_TextParser::GetFont(
    CXFA_TextProvider* pTextProvider,
    CFDE_CSSComputedStyle* pStyle) const {
  CFX_WideStringC wsFamily = L"Courier";
  // Owns the family name obtained from |pStyle|. |wsFamily| is a non-owning
  // view, so the string it points into has to outlive the font lookup below
  // rather than being a temporary.
  CFX_WideString wsStyleFamily;
  uint32_t dwStyle = 0;
  CXFA_Font font = pTextProvider->GetFontNode();
  if (font) {
    font.GetTypeface(wsFamily);
    if (font.IsBold())
      dwStyle |= FX_FONTSTYLE_Bold;
    if (font.IsItalic())
      dwStyle |= FX_FONTSTYLE_Italic;
  }
  if (pStyle) {
    int32_t iCount = pStyle->CountFontFamilies();
    if (iCount > 0) {
      wsStyleFamily = pStyle->GetFontFamily(iCount - 1);
      wsFamily = wsStyleFamily.AsStringC();
    }
    // The style fully replaces the flags derived from the font node.
    dwStyle = 0;
    if (pStyle->GetFontWeight() > FXFONT_FW_NORMAL)
      dwStyle |= FX_FONTSTYLE_Bold;
    if (pStyle->GetFontStyle() == FDE_CSSFontStyle::Italic)
      dwStyle |= FX_FONTSTYLE_Italic;
  }
  CXFA_FFDoc* pDoc = pTextProvider->GetDocNode();
  CXFA_FontMgr* pFontMgr = pDoc->GetApp()->GetXFAFontMgr();
  return pFontMgr->GetFont(pDoc, wsFamily, dwStyle);
}
// Returns the font size of |pStyle| when given; otherwise the size of the
// provider's font node, or 10 when there is no font node either.
float CXFA_TextParser::GetFontSize(CXFA_TextProvider* pTextProvider,
                                   CFDE_CSSComputedStyle* pStyle) const {
  if (pStyle)
    return pStyle->GetFontSize();

  CXFA_Font fontNode = pTextProvider->GetFontNode();
  if (!fontNode)
    return 10;
  return fontNode.GetFontSize();
}
// Returns the horizontal font scale for |pXMLNode|.
//
// With a style, the "xfa-font-horizontal-scale" custom style is looked up
// first on |pStyle| and then on the parent styles recorded in the parse
// contexts of |pXMLNode| and its ancestors. If nothing is found (or there is
// no style), the provider's font node is used, and 100 is the final default.
int32_t CXFA_TextParser::GetHorScale(CXFA_TextProvider* pTextProvider,
                                     CFDE_CSSComputedStyle* pStyle,
                                     CFX_XMLNode* pXMLNode) const {
  if (pStyle) {
    CFX_WideString wsScale;
    if (pStyle->GetCustomStyle(L"xfa-font-horizontal-scale", wsScale))
      return wsScale.GetInteger();

    for (CFX_XMLNode* pNode = pXMLNode; pNode;
         pNode = pNode->GetNodeItem(CFX_XMLNode::Parent)) {
      auto found = m_mapXMLNodeToParseContext.find(pNode);
      if (found == m_mapXMLNodeToParseContext.end())
        continue;

      CXFA_TextParseContext* pParseContext = found->second.get();
      if (!pParseContext || !pParseContext->m_pParentStyle)
        continue;

      if (pParseContext->m_pParentStyle->GetCustomStyle(
              L"xfa-font-horizontal-scale", wsScale)) {
        return wsScale.GetInteger();
      }
    }
  }

  CXFA_Font fontNode = pTextProvider->GetFontNode();
  if (!fontNode)
    return 100;
  return static_cast<int32_t>(fontNode.GetHorizontalScale());
}
// Returns the vertical font scale.
//
// The "xfa-font-vertical-scale" custom style of |pStyle| wins when present;
// otherwise the vertical scale of the provider's font node is used, and 100
// when there is no font node.
int32_t CXFA_TextParser::GetVerScale(CXFA_TextProvider* pTextProvider,
                                     CFDE_CSSComputedStyle* pStyle) const {
  if (pStyle) {
    CFX_WideString wsValue;
    if (pStyle->GetCustomStyle(L"xfa-font-vertical-scale", wsValue))
      return wsValue.GetInteger();
  }
  // Named cast instead of a C-style cast, matching GetHorScale().
  if (CXFA_Font font = pTextProvider->GetFontNode())
    return static_cast<int32_t>(font.GetVerticalScale());
  return 100;
}
// Reports the underline count and underline period.
//
// |iUnderline| and |iPeriod| start as 0 and XFA_ATTRIBUTEENUM_All. Without a
// style both come from the provider's font node, if any. With a style the
// count comes from its text decoration (2 for double, 1 for single) and the
// period from its "underlinePeriod" custom style ("word" selects
// XFA_ATTRIBUTEENUM_Word); only when that custom style is missing is the font
// node's period used.
void CXFA_TextParser::GetUnderline(CXFA_TextProvider* pTextProvider,
                                   CFDE_CSSComputedStyle* pStyle,
                                   int32_t& iUnderline,
                                   int32_t& iPeriod) const {
  iUnderline = 0;
  iPeriod = XFA_ATTRIBUTEENUM_All;

  if (!pStyle) {
    if (CXFA_Font fontNode = pTextProvider->GetFontNode()) {
      iUnderline = fontNode.GetUnderline();
      iPeriod = fontNode.GetUnderlinePeriod();
    }
    return;
  }

  const uint32_t dwTextDecoration = pStyle->GetTextDecoration();
  if (dwTextDecoration & FDE_CSSTEXTDECORATION_Double) {
    iUnderline = 2;
  } else if (dwTextDecoration & FDE_CSSTEXTDECORATION_Underline) {
    iUnderline = 1;
  }

  CFX_WideString wsPeriod;
  if (!pStyle->GetCustomStyle(L"underlinePeriod", wsPeriod)) {
    if (CXFA_Font fontNode = pTextProvider->GetFontNode())
      iPeriod = fontNode.GetUnderlinePeriod();
    return;
  }

  if (wsPeriod == L"word")
    iPeriod = XFA_ATTRIBUTEENUM_Word;
}
// Reports the line-through setting in |iLinethrough|.
//
// With a style, the result is 1 when the style's text decoration contains the
// line-through flag and 0 otherwise. Without a style, the value of the
// provider's font node is used. When there is neither a style nor a font node
// the result is 0.
void CXFA_TextParser::GetLinethrough(CXFA_TextProvider* pTextProvider,
                                     CFDE_CSSComputedStyle* pStyle,
                                     int32_t& iLinethrough) const {
  // Always give the out-parameter a defined value, as GetUnderline() does for
  // its outputs; previously it was left untouched when no font node existed.
  iLinethrough = 0;
  if (pStyle) {
    uint32_t dwDecoration = pStyle->GetTextDecoration();
    iLinethrough = (dwDecoration & FDE_CSSTEXTDECORATION_LineThrough) ? 1 : 0;
    return;
  }
  CXFA_Font font = pTextProvider->GetFontNode();
  if (font)
    iLinethrough = font.GetLineThrough();
}
// Returns the colour of |pStyle| when given; otherwise the colour of the
// provider's font node, or 0xFF000000 when there is no font node.
FX_ARGB CXFA_TextParser::GetColor(CXFA_TextProvider* pTextProvider,
                                  CFDE_CSSComputedStyle* pStyle) const {
  if (pStyle)
    return pStyle->GetColor();

  CXFA_Font fontNode = pTextProvider->GetFontNode();
  if (!fontNode)
    return 0xFF000000;
  return fontNode.GetColor();
}
// Returns the baseline shift.
//
// With a style, this is its numeric vertical alignment, or 0 when the
// vertical alignment is not numeric; the font node is not consulted. Without
// a style, it is the baseline shift of the provider's font node, or 0 when
// there is no font node.
float CXFA_TextParser::GetBaseline(CXFA_TextProvider* pTextProvider,
                                   CFDE_CSSComputedStyle* pStyle) const {
  if (pStyle) {
    if (pStyle->GetVerticalAlign() != FDE_CSSVerticalAlign::Number)
      return 0;
    return pStyle->GetNumberVerticalAlign();
  }

  CXFA_Font fontNode = pTextProvider->GetFontNode();
  if (!fontNode)
    return 0;
  return fontNode.GetBaselineShift();
}
// Returns the line height multiplied by |fVerScale|.
//
// The base value comes from |pStyle|, or from the provider's para node when
// no style is given. A value below 0.1 counts as unset. For the first line
// (|bFirst|) an unset value becomes the font size and a set value is capped
// at the font size; for other lines an unset value becomes 1.2 times the font
// size.
float CXFA_TextParser::GetLineHeight(CXFA_TextProvider* pTextProvider,
                                     CFDE_CSSComputedStyle* pStyle,
                                     bool bFirst,
                                     float fVerScale) const {
  float fHeight = 0;
  if (pStyle) {
    fHeight = pStyle->GetLineHeight();
  } else if (CXFA_Para paraNode = pTextProvider->GetParaNode()) {
    fHeight = paraNode.GetLineHeight();
  }

  const bool bUnset = fHeight < 0.1f;
  if (bFirst) {
    const float fSize = GetFontSize(pTextProvider, pStyle);
    fHeight = bUnset ? fSize : std::min(fHeight, fSize);
  } else if (bUnset) {
    fHeight = GetFontSize(pTextProvider, pStyle) * 1.2f;
  }

  return fHeight * fVerScale;
}
// Resolves the embedded object referenced by an element's "xfa:embed"
// attribute into |wsValue| through the text provider.
//
// |wsValue| is always cleared first. Returns false for null or non-element
// nodes, for a missing "xfa:embed" attribute, for an "xfa:embedType" other
// than "uri"/"som" (default "som") and for an "xfa:embedMode" other than
// "raw"/"formatted" (default "formatted"). Otherwise the provider's result is
// returned. A leading '#' of the reference is stripped before the lookup.
bool CXFA_TextParser::GetEmbbedObj(CXFA_TextProvider* pTextProvider,
                                   CFX_XMLNode* pXMLNode,
                                   CFX_WideString& wsValue) {
  wsValue.clear();
  if (!pXMLNode)
    return false;
  if (pXMLNode->GetType() != FX_XMLNODE_Element)
    return false;

  CFX_XMLElement* pEmbedElement = static_cast<CFX_XMLElement*>(pXMLNode);
  CFX_WideString wsReference = pEmbedElement->GetString(L"xfa:embed");
  if (wsReference.IsEmpty())
    return false;
  if (wsReference.GetAt(0) == L'#')
    wsReference.Delete(0);

  CFX_WideString wsEmbedType = pEmbedElement->GetString(L"xfa:embedType");
  if (wsEmbedType.IsEmpty())
    wsEmbedType = L"som";
  else
    wsEmbedType.MakeLower();
  const bool bIsURI = (wsEmbedType == L"uri");
  if (!bIsURI && wsEmbedType != L"som")
    return false;

  CFX_WideString wsEmbedMode = pEmbedElement->GetString(L"xfa:embedMode");
  if (wsEmbedMode.IsEmpty())
    wsEmbedMode = L"formatted";
  else
    wsEmbedMode.MakeLower();
  const bool bIsRaw = (wsEmbedMode == L"raw");
  if (!bIsRaw && wsEmbedMode != L"formatted")
    return false;

  return pTextProvider->GetEmbbedObj(bIsURI, bIsRaw, wsReference, wsValue);
}
// Returns the parse context recorded for |pXMLNode|, or null when the node
// has none. The returned pointer is not owned by the caller.
CXFA_TextParseContext* CXFA_TextParser::GetParseContextFromMap(
    CFX_XMLNode* pXMLNode) {
  auto found = m_mapXMLNodeToParseContext.find(pXMLNode);
  if (found == m_mapXMLNodeToParseContext.end())
    return nullptr;
  return found->second.get();
}
// Parses the tab stops declared on |pStyle| and appends them to
// |pTabstopContext|.
//
// The value is read from the "xfa-tab-stops" custom style, falling back to
// "tab-stops". Returns false when either argument is null or when neither
// custom style exists; returns true otherwise, even if no entry was appended.
//
// The scanner below treats the value as a sequence of entries made of an
// alignment keyword, an optional leader specification starting with 'l', and
// a location. Each complete entry is appended as (hash of the alignment
// keyword, location converted to points).
bool CXFA_TextParser::GetTabstops(CFDE_CSSComputedStyle* pStyle,
CXFA_TextTabstopsContext* pTabstopContext) {
if (!pStyle || !pTabstopContext)
return false;
CFX_WideString wsValue;
if (!pStyle->GetCustomStyle(L"xfa-tab-stops", wsValue) &&
!pStyle->GetCustomStyle(L"tab-stops", wsValue)) {
return false;
}
int32_t iLength = wsValue.GetLength();
const wchar_t* pTabStops = wsValue.c_str();
// |iCur| is the scan position; |iLast| marks where the current token began.
int32_t iCur = 0;
int32_t iLast = 0;
// Alignment keyword of the entry being read; empty between entries.
CFX_WideString wsAlign;
TabStopStatus eStatus = TabStopStatus::None;
wchar_t ch;
while (iCur < iLength) {
ch = pTabStops[iCur];
switch (eStatus) {
case TabStopStatus::None:
// Skip characters <= ' '; the first other character starts the alignment
// keyword and is re-examined in the Alignment state.
if (ch <= ' ') {
iCur++;
} else {
eStatus = TabStopStatus::Alignment;
iLast = iCur;
}
break;
case TabStopStatus::Alignment:
// The keyword ends at the first space (only ' ' terminates it here). The
// characters <= ' ' that follow are skipped as well.
if (ch == ' ') {
wsAlign = CFX_WideStringC(pTabStops + iLast, iCur - iLast);
eStatus = TabStopStatus::StartLeader;
iCur++;
while (iCur < iLength && pTabStops[iCur] <= ' ')
iCur++;
iLast = iCur;
} else {
iCur++;
}
break;
case TabStopStatus::StartLeader:
// Anything not starting with 'l' is taken to be the location already; the
// character is not consumed here.
if (ch != 'l') {
eStatus = TabStopStatus::Location;
} else {
// Skip the leader specification up to the ')' that brings the parenthesis
// depth back to 0 (or to the end of the value if that never happens).
int32_t iCount = 0;
while (iCur < iLength) {
ch = pTabStops[iCur];
iCur++;
if (ch == '(') {
iCount++;
} else if (ch == ')') {
iCount--;
if (iCount == 0)
break;
}
}
while (iCur < iLength && pTabStops[iCur] <= ' ')
iCur++;
iLast = iCur;
eStatus = TabStopStatus::Location;
}
break;
case TabStopStatus::Location:
// A space ends the location: emit the entry and start over.
if (ch == ' ') {
uint32_t dwHashCode = FX_HashCode_GetW(wsAlign.AsStringC(), true);
CXFA_Measurement ms(CFX_WideStringC(pTabStops + iLast, iCur - iLast));
float fPos = ms.ToUnit(XFA_UNIT_Pt);
pTabstopContext->Append(dwHashCode, fPos);
wsAlign.clear();
eStatus = TabStopStatus::None;
}
iCur++;
break;
default:
// The remaining states are never assigned to |eStatus| in this function.
break;
}
}
// Emit the last entry when the value ended before a terminating space.
if (!wsAlign.IsEmpty()) {
uint32_t dwHashCode = FX_HashCode_GetW(wsAlign.AsStringC(), true);
CXFA_Measurement ms(CFX_WideStringC(pTabStops + iLast, iCur - iLast));
float fPos = ms.ToUnit(XFA_UNIT_Pt);
pTabstopContext->Append(dwHashCode, fPos);
}
return true;
}
// Creates a provider describing a node that is neither an available tag nor
// text content; ParseTagInfo() sets the flags afterwards.
CXFA_TextParser::TagProvider::TagProvider()
: m_bTagAvailable(false), m_bContent(false) {}
// Nothing to release by hand; the members clean up after themselves.
CXFA_TextParser::TagProvider::~TagProvider() = default;