Rework inline image parsing in content streams
Currently, CPDF_StreamContentParser:Handle_BeginImage() will try to
parse inline images in content streams when it sees the "BI" operator.
The parts that deal with the related "ID" and "EI" operators
(Handle_BeginImageData() and Handle_EndImage(), respectively) are
no-ops, since Handle_BeginImage() did all the work.
Rework Handle_BeginImage() and Handle_BeginImageData() so
CPDF_StreamContentParser records their respective keywords's positions.
Then handle the inline image parsing in Handle_EndImage() with those
positions. At this point, CPDF_StreamContentParser know the distance
between the "ID" and "EI" operators, so it knows where the length of the
inline image stream. Pass this length to
CPDF_StreamParser::ReadInlineStream() and prevent it from reading the
"EI" operator or past it.
As a result of this change:
- Handle_BeginImage() no longer needs to look for "ID" and "EI".
- ReadInlineStream() no longer needs to looks for "EI".
- bug_1236805.in fails because the code was reading "EI" and past it.
Fix this by adding a placeholder value so the inline stream is not
empty. Remove the temporary suppression for this file.
- The bug_407752631.in test case can be checked in and will no longer
cause a crash.
To deal with malforms images that may not have "BI", "ID", and "EI"
operators in that order, do some state tracking so that
CPDF_StreamContentParser only looks for "ID" and "EI" when appropriate,
like the existing code.
Bug: 407752631
Change-Id: Ie14908d8ed72c92c2ae880c18f62469a5cf4eef3
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/130911
Commit-Queue: Lei Zhang <thestig@chromium.org>
Reviewed-by: Tom Sepez <tsepez@chromium.org>
diff --git a/core/fpdfapi/page/cpdf_streamcontentparser.cpp b/core/fpdfapi/page/cpdf_streamcontentparser.cpp
index fdd3ac9..8e12bef 100644
--- a/core/fpdfapi/page/cpdf_streamcontentparser.cpp
+++ b/core/fpdfapi/page/cpdf_streamcontentparser.cpp
@@ -10,6 +10,7 @@
#include <array>
#include <map>
#include <memory>
+#include <optional>
#include <utility>
#include <vector>
@@ -41,11 +42,13 @@
#include "core/fxcrt/check.h"
#include "core/fxcrt/compiler_specific.h"
#include "core/fxcrt/containers/contains.h"
+#include "core/fxcrt/fx_memory.h"
#include "core/fxcrt/fx_safe_types.h"
#include "core/fxcrt/notreached.h"
#include "core/fxcrt/scoped_set_insertion.h"
#include "core/fxcrt/span.h"
#include "core/fxcrt/stl_util.h"
+#include "core/fxcrt/unowned_ptr.h"
#include "core/fxge/cfx_graphstate.h"
#include "core/fxge/cfx_graphstatedata.h"
@@ -257,6 +260,20 @@
}
}
+template <typename T>
+class AutoOptionalResetter {
+ public:
+ FX_STACK_ALLOCATED();
+
+ explicit AutoOptionalResetter(std::optional<T>* value) : value_(value) {
+ CHECK(value_);
+ }
+ ~AutoOptionalResetter() { value_->reset(); }
+
+ private:
+ const UnownedPtr<std::optional<T>> value_;
+};
+
} // namespace
// static
@@ -589,7 +606,25 @@
}
void CPDF_StreamContentParser::OnOperator(ByteStringView op) {
- auto it = g_opcodes->find(op.GetID());
+ const uint32_t op_id = op.GetID();
+ if (inline_image_context_.has_value()) {
+ switch (inline_image_context_.value().state) {
+ case InlineImageContext::State::kLookingForID:
+ // Also tolerate the case with "ID" missing.
+ if (op_id != FXBSTR_ID('I', 'D', 0, 0) &&
+ op_id != FXBSTR_ID('E', 'I', 0, 0)) {
+ return;
+ }
+ break;
+ case InlineImageContext::State::kLookingForEI:
+ if (op_id != FXBSTR_ID('E', 'I', 0, 0)) {
+ return;
+ }
+ break;
+ }
+ }
+
+ auto it = g_opcodes->find(op_id);
if (it != g_opcodes->end()) {
(this->*it->second)();
}
@@ -640,63 +675,8 @@
}
void CPDF_StreamContentParser::Handle_BeginImage() {
- FX_FILESIZE savePos = syntax_->GetPos();
- auto pDict = document_->New<CPDF_Dictionary>();
- while (true) {
- CPDF_StreamParser::ElementType type = syntax_->ParseNextElement();
- if (type == CPDF_StreamParser::ElementType::kKeyword) {
- if (syntax_->GetWord() != "ID") {
- syntax_->SetPos(savePos);
- return;
- }
- }
- if (type != CPDF_StreamParser::ElementType::kName) {
- break;
- }
- // Next `syntax_` read below may invalidate `word`. Must save to `key`.
- ByteStringView word = syntax_->GetWord();
- ByteString key(word.Last(word.GetLength() - 1));
- auto pObj = syntax_->ReadNextObject(false, false, 0);
- if (pObj && !pObj->IsInline()) {
- pDict->SetNewFor<CPDF_Reference>(key, document_, pObj->GetObjNum());
- } else {
- pDict->SetFor(key, std::move(pObj));
- }
- }
- ReplaceAbbr(pDict);
- RetainPtr<const CPDF_Object> pCSObj;
- if (pDict->KeyExist("ColorSpace")) {
- pCSObj = pDict->GetDirectObjectFor("ColorSpace");
- if (pCSObj->IsName()) {
- ByteString name = pCSObj->GetString();
- if (name != "DeviceRGB" && name != "DeviceGray" && name != "DeviceCMYK") {
- pCSObj = FindResourceObj("ColorSpace", name);
- if (pCSObj && pCSObj->IsInline()) {
- pDict->SetFor("ColorSpace", pCSObj->Clone());
- }
- }
- }
- }
- pDict->SetNewFor<CPDF_Name>("Subtype", "Image");
- RetainPtr<CPDF_Stream> pStream =
- syntax_->ReadInlineStream(document_, std::move(pDict), pCSObj.Get());
- while (true) {
- CPDF_StreamParser::ElementType type = syntax_->ParseNextElement();
- if (type == CPDF_StreamParser::ElementType::kEndOfData) {
- return;
- }
-
- if (type == CPDF_StreamParser::ElementType::kKeyword &&
- syntax_->GetWord() == "EI") {
- break;
- }
- }
- CPDF_ImageObject* pObj = AddImageFromStream(std::move(pStream), /*name=*/"");
- // Record the bounding box of this image, so rendering code can draw it
- // properly.
- if (pObj && pObj->GetImage()->IsMask()) {
- object_holder_->AddImageMaskBoundingBox(pObj->GetRect());
- }
+ CHECK(!inline_image_context_.has_value());
+ inline_image_context_.emplace().begin_pos = syntax_->GetPos();
}
void CPDF_StreamContentParser::Handle_BeginMarkedContent() {
@@ -899,7 +879,79 @@
void CPDF_StreamContentParser::Handle_MarkPlace_Dictionary() {}
-void CPDF_StreamContentParser::Handle_EndImage() {}
+void CPDF_StreamContentParser::Handle_EndImage() {
+ if (!inline_image_context_.has_value()) {
+ return; // No matching "BI" operator.
+ }
+
+ AutoOptionalResetter auto_resetter(&inline_image_context_);
+ const InlineImageContext& inline_image_context =
+ inline_image_context_.value();
+ if (inline_image_context.state != InlineImageContext::State::kLookingForEI) {
+ return; // No matching "ID" operator.
+ }
+
+ CHECK_LT(inline_image_context.begin_pos, inline_image_context.data_pos);
+
+ const uint32_t saved_position = syntax_->GetPos();
+ auto synthesized_dict = document_->New<CPDF_Dictionary>();
+ syntax_->SetPos(inline_image_context.begin_pos);
+ while (true) {
+ CPDF_StreamParser::ElementType type = syntax_->ParseNextElement();
+ if (syntax_->GetPos() >= inline_image_context.data_pos) {
+ break; // Reached "ID" operator.
+ }
+ if (type == CPDF_StreamParser::ElementType::kKeyword) {
+ continue; // Already handled by CPDF_StreamContentParser itself.
+ }
+ if (type != CPDF_StreamParser::ElementType::kName) {
+ break;
+ }
+ // Next `syntax_` read below may invalidate `word`. Must save to `key`.
+ ByteStringView word = syntax_->GetWord();
+ ByteString key(word.Last(word.GetLength() - 1));
+ auto value_object = syntax_->ReadNextObject(false, false, 0);
+ if (value_object && !value_object->IsInline()) {
+ synthesized_dict->SetNewFor<CPDF_Reference>(key, document_,
+ value_object->GetObjNum());
+ } else {
+ synthesized_dict->SetFor(key, std::move(value_object));
+ }
+ }
+ ReplaceAbbr(synthesized_dict);
+ synthesized_dict->SetNewFor<CPDF_Name>("Subtype", "Image");
+
+ RetainPtr<const CPDF_Object> colorspace;
+ if (synthesized_dict->KeyExist("ColorSpace")) {
+ colorspace = synthesized_dict->GetDirectObjectFor("ColorSpace");
+ if (colorspace->IsName()) {
+ ByteString name = colorspace->GetString();
+ if (name != "DeviceRGB" && name != "DeviceGray" && name != "DeviceCMYK") {
+ colorspace = FindResourceObj("ColorSpace", name);
+ if (colorspace && colorspace->IsInline()) {
+ synthesized_dict->SetFor("ColorSpace", colorspace->Clone());
+ }
+ }
+ }
+ }
+
+ syntax_->SetPos(inline_image_context.data_pos);
+ FX_SAFE_UINT32 stream_length = saved_position;
+ stream_length -= inline_image_context.data_pos;
+ stream_length -= 2; // Account for "EI" operator.
+ RetainPtr<CPDF_Stream> inline_stream =
+ syntax_->ReadInlineStream(document_, std::move(synthesized_dict),
+ colorspace, stream_length.ValueOrDie());
+ CPDF_ImageObject* new_image_object =
+ AddImageFromStream(std::move(inline_stream), /*name=*/"");
+ // Record the bounding box of this image, so rendering code can draw it
+ // properly.
+ if (new_image_object && new_image_object->GetImage()->IsMask()) {
+ object_holder_->AddImageMaskBoundingBox(new_image_object->GetRect());
+ }
+
+ syntax_->SetPos(saved_position);
+}
void CPDF_StreamContentParser::Handle_EndMarkedContent() {
// First element is a sentinel, so do not pop it, ever. This may come up if
@@ -975,7 +1027,20 @@
cur_states_->mutable_general_state().SetFlatness(GetNumber(0));
}
-void CPDF_StreamContentParser::Handle_BeginImageData() {}
+void CPDF_StreamContentParser::Handle_BeginImageData() {
+ InlineImageContext& inline_image_context = inline_image_context_.value();
+ CHECK_EQ(inline_image_context.state,
+ InlineImageContext::State::kLookingForID);
+ CHECK_EQ(inline_image_context.data_pos, 0u);
+
+ inline_image_context.state = InlineImageContext::State::kLookingForEI;
+
+ uint32_t pos = syntax_->GetPos();
+ // Something else, like the "BI" operator, must be before the "ID" operator,
+ // so "ID" cannot be at position 0.
+ CHECK_NE(pos, 0u);
+ inline_image_context.data_pos = pos;
+}
void CPDF_StreamContentParser::Handle_SetLineJoin() {
cur_states_->mutable_graph_state().SetLineJoin(
diff --git a/core/fpdfapi/page/cpdf_streamcontentparser.h b/core/fpdfapi/page/cpdf_streamcontentparser.h
index 80b097c..a1bc9ed 100644
--- a/core/fpdfapi/page/cpdf_streamcontentparser.h
+++ b/core/fpdfapi/page/cpdf_streamcontentparser.h
@@ -9,6 +9,7 @@
#include <array>
#include <memory>
+#include <optional>
#include <stack>
#include <variant>
#include <vector>
@@ -73,6 +74,14 @@
private:
enum class RenderType : bool { kFill = false, kStroke = true };
+ struct InlineImageContext {
+ enum class State : bool { kLookingForID, kLookingForEI };
+
+ State state = State::kLookingForID;
+ uint32_t begin_pos = 0;
+ uint32_t data_pos = 0;
+ };
+
using ContentParam =
std::variant<RetainPtr<CPDF_Object>, FX_Number, ByteString>;
@@ -242,6 +251,8 @@
// The merged stream offset at which the last |syntax_| started parsing.
uint32_t start_parse_offset_ = 0;
+
+ std::optional<InlineImageContext> inline_image_context_;
};
#endif // CORE_FPDFAPI_PAGE_CPDF_STREAMCONTENTPARSER_H_
diff --git a/core/fpdfapi/page/cpdf_streamparser.cpp b/core/fpdfapi/page/cpdf_streamparser.cpp
index 4bdfb61..7f750ff 100644
--- a/core/fpdfapi/page/cpdf_streamparser.cpp
+++ b/core/fpdfapi/page/cpdf_streamparser.cpp
@@ -25,7 +25,6 @@
#include "core/fxcodec/data_and_bytes_consumed.h"
#include "core/fxcodec/jpeg/jpegmodule.h"
#include "core/fxcodec/scanlinedecoder.h"
-#include "core/fxcrt/autorestorer.h"
#include "core/fxcrt/check.h"
#include "core/fxcrt/data_vector.h"
#include "core/fxcrt/fx_extension.h"
@@ -140,14 +139,14 @@
RetainPtr<CPDF_Stream> CPDF_StreamParser::ReadInlineStream(
CPDF_Document* pDoc,
RetainPtr<CPDF_Dictionary> pDict,
- const CPDF_Object* pCSObj) {
- auto stream_span = buf_.subspan(pos_);
+ const CPDF_Object* pCSObj,
+ uint32_t stream_length) {
+ auto stream_span = buf_.subspan(pos_).first(stream_length);
if (stream_span.empty()) {
return nullptr;
}
if (PDFCharIsWhitespace(stream_span.front())) {
- pos_++;
stream_span = stream_span.subspan<1>();
if (stream_span.empty()) {
return nullptr;
@@ -201,7 +200,6 @@
auto src_span = stream_span.first(original_size);
data = DataVector<uint8_t>(src_span.begin(), src_span.end());
actual_stream_size = original_size;
- pos_ += original_size;
} else {
actual_stream_size =
DecodeInlineStream(stream_span, width, height, decoder,
@@ -210,26 +208,8 @@
return nullptr;
}
- {
- AutoRestorer<uint32_t> saved_position(&pos_);
- pos_ += actual_stream_size;
- while (true) {
- uint32_t saved_iteration_position = pos_;
- ElementType type = ParseNextElement();
- if (type == ElementType::kEndOfData) {
- return nullptr;
- }
-
- if (type == ElementType::kKeyword && GetWord() == "EI") {
- break;
- }
-
- actual_stream_size += pos_ - saved_iteration_position;
- }
- }
auto src_span = stream_span.first(actual_stream_size);
data = DataVector<uint8_t>(src_span.begin(), src_span.end());
- pos_ += actual_stream_size;
}
pDict->SetNewFor<CPDF_Number>("Length", static_cast<int>(actual_stream_size));
return pdfium::MakeRetain<CPDF_Stream>(std::move(data), std::move(pDict));
diff --git a/core/fpdfapi/page/cpdf_streamparser.h b/core/fpdfapi/page/cpdf_streamparser.h
index 417f5f2..a00cc71 100644
--- a/core/fpdfapi/page/cpdf_streamparser.h
+++ b/core/fpdfapi/page/cpdf_streamparser.h
@@ -44,7 +44,8 @@
uint32_t dwRecursionLevel);
RetainPtr<CPDF_Stream> ReadInlineStream(CPDF_Document* pDoc,
RetainPtr<CPDF_Dictionary> pDict,
- const CPDF_Object* pCSObj);
+ const CPDF_Object* pCSObj,
+ uint32_t stream_length);
private:
friend class CPDFStreamParserTest_ReadHexString_Test;
diff --git a/testing/SUPPRESSIONS b/testing/SUPPRESSIONS
index ae5308a..aa1db92 100644
--- a/testing/SUPPRESSIONS
+++ b/testing/SUPPRESSIONS
@@ -622,9 +622,6 @@
# TODO(chromium:1131694): Remove after associated bug is fixed
bug_1131694.in * * * *
-# TODO(thestig): Remove after fixing inline image parsing.
-bug_1236805.in * * * *
-
# TODO(pdfium:1747): Remove after associated bug is fixed
bug_1258634.in * * * *
diff --git a/testing/resources/pixel/bug_1236805_expected.pdf.0.png b/testing/resources/pixel/bug_1236805_expected.pdf.0.png
index 7d4eebe..08c11b0 100644
--- a/testing/resources/pixel/bug_1236805_expected.pdf.0.png
+++ b/testing/resources/pixel/bug_1236805_expected.pdf.0.png
Binary files differ
diff --git a/testing/resources/pixel/bug_407752631.in b/testing/resources/pixel/bug_407752631.in
new file mode 100644
index 0000000..3993fa4
--- /dev/null
+++ b/testing/resources/pixel/bug_407752631.in
@@ -0,0 +1,69 @@
+{{header}}
+{{object 1 0}} <<
+ /Type /Catalog
+ /Pages 2 0 R
+>>
+endobj
+{{object 2 0}} <<
+ /Type /Pages
+ /Count 1
+ /Kids [3 0 R]
+ /MediaBox [0 0 200 200]
+>>
+endobj
+{{object 3 0}} <<
+ /Type /Page
+ /Parent 2 0 R
+ /Contents 4 0 R
+>>
+endobj
+{{object 4 0}} <<
+ /Filter [
+ % The stream contains a few hundred megabytes of spaces. Do a few rounds of
+ % compression to reduce the file size.
+ /ASCII85Decode
+ /FlateDecode
+ /ASCII85Decode
+ /FlateDecode
+ /ASCII85Decode
+ /FlateDecode
+ ]
+ {{streamlen}}
+>>
+stream
+GhT!\fp\An(5C/G$NX+E5XeK\'GUb@;+(Ei6pVM%E&d6#hpohD-_7E?O=04LO#=ZO>,%oD/DX*[+M+AH
+n@\iELsngCp8V,_>&67C1Ck_\NC('?^J9rQ:5YAbfgouJk$tOnR.c1L:(6k[0Bc\"VOmu:([Q2:\]e>S
+2o35";9H3iiCU_#G+I/NM"9J[3=PIUaJBQ*K7`]DX*EH,ADNiI$\W\)*%@N":8=s+]kU1PR-8&p\k-GP
+2.uFEWbfLNl!ef$?B^R'"nQ.,Ou%%M=a:<ZMs^X.DjfWFV(X@5*JNTeS_FAA-Kj6`l11DGB-u!$>YTa5
+-a$sNpJ?ol^EKZtT''G8.5PB%lMWUP\B4/9X[1.Fm7cNaL^0o!^6,71c8sniWe'7"SRug4A3Jk/c,O+P
+]!>JJ;R_Y0i&!UCN\o(u$)']kWu*U!9TmG8*!,f=G=W^tCPZD%q68T"F>3:r;bA&^F$&j&ok/lHI$.QX
+3Ws@K*[p.!TiF\0[u72J!6JC2(c?d=o>b$mp3C]7O82IGcL/5t3n+5Hg=YH>.'3+iT?$PUC)q+U)h1sp
+SlDSUbZrHPEVQ>PV4)a17_gZ@riTU!R*"L^L;&Dg>4]-CN=p25XCK4ic^>lOk?&nAHjdk`2F27f"ZA1J
+=f$][QS)armMr;\C[_'"]W8GDk+FDb;s2GRbG68Q_'nRInt<nBXVF&"q`c,;:8MHVGa8!F;oH,l0#FN;
+L0JhZ8B2%rX2u&)"*Y^%-i7-%/e"]K*kZ(s#[9bPfS`\"^J2)d%C,\"0<FhH"\f)$2`#roQlSW#bJQ+0
+B>2pWFU%2?">=q`Emh4dd2+keN[#!VdPSpGY*i1gl=N6jO];[DH7c+[^ACb;InGef-%%Ht%92cFWPeT7
+_PP_Xd:.d:Z-S'tYn%(3k3U]T]?ZR9ZQ!'3c#%J#O;G/,B.udsJ3_GcE='1$V23)ViHcQJ%u=I0E,cT3
+8=kA^+hX?SZmCXf%k@u8?RmdD/g[J&bZ\d<f$#85@+uNp8?.h&;,ZOWh=6u2T"43*`!i#o;o'b$<8L<G
+$-(T1]I"\t6h[f9W\&=d')^G*lGe^oU9Fd=6UK(QQ'qK)]I2.A**\8d#L6T1>O0_[dM'(\>laX^7KP42
+/!@WgNKNjLAQKC[c,pm0AMO5Oer.;CnorV_1H2#m%6'a+EUJ))NDJA^aI8Z;j@li)9*Y<pAq_I3m6air
+eZ#9G]C;=iRLNqI2Ri?ngjo[FIKF[52!m$4:R:aj!rN(%OS]aYAC]);?&,$t&%RUM_Z5\3H9L<2:gHoj
+=,X/G5q9g82B743=unGpqsuW?Z/@6?//K572Nr+7A@aYG;YC!NVa#!Xs3Sb.NMaA-RI^[tWnFG>P>WKc
+jZPhY59\42>aPh%I_jhI5fOB5j=4YqBIM^@4mik2^=8qA`q6Y/i;iul4m=C_U+s+X7)MZqG`M8!pd1$^
+V/7.#Ss#o0*fa"4LtGqqY9rAO8"K=];#*ETQRl&?LO6MM4#q]'=076j7P;4RX6X)?^m<A?CL.KZX$TDn
+I/:s\r"PQt8_c4Ef-d7p]8U>+a#?/paf#tV"T<-n!On]AoW;tV"XBKtaD.T5HEPpZ)0CiU,#St+Eq*?q
+*KWCTI9C#nVof.]W0XT$KlT/!GaI=B_dmOd,U@N@"5%[,oUt/[S(.+R*fR3T:2%/aej/lo2a%UE^$MDB
+GL%nG2qgUb2nit>\NPNta$c"eCDb1KXn8c)p`.!RL:'gCV&r\R-r_X;!]D%'phh>!KJa>p(`lFYXn]gm
+NTHm3UfVBH`IR!neB_DWa/!W.,5ES\]N8ff#0h*LBm#+e'nCB2[[48.>5Ai0I)=Q!!I1g0XQ-7t<d8:_
+BPBPWb/A.ZKp'UC&LLV'KuSrl6TO!?0XU@QNkK\N77-g+f%Z\Mn_V+\M^^ROmYd#7fTeI!RWWu5&6dj3
+,F-AP8ZH3\r8H$(]Q\L[ak?LP@pYYiWYh`)1[u/u_p0h4,t4b`Ep@9mf:HGEU(W^BrLkU)nY/\-2uB><
+1o/d!pSKgYoic'Ni%916M[troWAYl%59R;2nK+>ei=cP]JUggfdmY<hSE/Y;:="XSmYla`BZ)Ec(?S0N
+EAaAX%\LLS$O7eqlWdfa.&W&G_'E'IrkGSBn"E(WPR@u8<QGL;4hj0,9*8S[AdB[ekIt@WG=g;-NBZ$%
+M)[N4cP[$a@#i?>N('9(lRNUZ<qWh[5b(t\"=mP777gQ_8$_1$ZKgi)@S;($/Pa&H*\7q^TW`M#R^Gp<
+J,M!Le'^s$1nVA*Ug\q/r)Vs7*a>k(no*>MJC\`Rf#1Mc/ke;hUN4o]jZg;.J=d=F5JDc#Gg-3!X].;]
+hX,&5T&Ga2b:)8hRH83=Ri^/B0TE21pL)t\=g7C?Xe3gO>/$WP9U"o@0>X6Fg4(u1rrJ1r11^~>
+endstream
+endobj
+{{xref}}
+{{trailer}}
+{{startxref}}
+%%EOF
diff --git a/testing/resources/pixel/bug_407752631_expected.pdf.0.png b/testing/resources/pixel/bug_407752631_expected.pdf.0.png
new file mode 100644
index 0000000..f97e340
--- /dev/null
+++ b/testing/resources/pixel/bug_407752631_expected.pdf.0.png
Binary files differ