Rework inline image parsing in content streams Currently, CPDF_StreamContentParser:Handle_BeginImage() will try to parse inline images in content streams when it sees the "BI" operator. The parts that deal with the related "ID" and "EI" operators (Handle_BeginImageData() and Handle_EndImage(), respectively) are no-ops, since Handle_BeginImage() did all the work. Rework Handle_BeginImage() and Handle_BeginImageData() so CPDF_StreamContentParser records their respective keywords's positions. Then handle the inline image parsing in Handle_EndImage() with those positions. At this point, CPDF_StreamContentParser know the distance between the "ID" and "EI" operators, so it knows where the length of the inline image stream. Pass this length to CPDF_StreamParser::ReadInlineStream() and prevent it from reading the "EI" operator or past it. As a result of this change: - Handle_BeginImage() no longer needs to look for "ID" and "EI". - ReadInlineStream() no longer needs to looks for "EI". - bug_1236805.in fails because the code was reading "EI" and past it. Fix this by adding a placeholder value so the inline stream is not empty. Remove the temporary suppression for this file. - The bug_407752631.in test case can be checked in and will no longer cause a crash. To deal with malforms images that may not have "BI", "ID", and "EI" operators in that order, do some state tracking so that CPDF_StreamContentParser only looks for "ID" and "EI" when appropriate, like the existing code. Bug: 407752631 Change-Id: Ie14908d8ed72c92c2ae880c18f62469a5cf4eef3 Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/130911 Commit-Queue: Lei Zhang <thestig@chromium.org> Reviewed-by: Tom Sepez <tsepez@chromium.org>
diff --git a/core/fpdfapi/page/cpdf_streamcontentparser.cpp b/core/fpdfapi/page/cpdf_streamcontentparser.cpp index fdd3ac9..8e12bef 100644 --- a/core/fpdfapi/page/cpdf_streamcontentparser.cpp +++ b/core/fpdfapi/page/cpdf_streamcontentparser.cpp
@@ -10,6 +10,7 @@ #include <array> #include <map> #include <memory> +#include <optional> #include <utility> #include <vector> @@ -41,11 +42,13 @@ #include "core/fxcrt/check.h" #include "core/fxcrt/compiler_specific.h" #include "core/fxcrt/containers/contains.h" +#include "core/fxcrt/fx_memory.h" #include "core/fxcrt/fx_safe_types.h" #include "core/fxcrt/notreached.h" #include "core/fxcrt/scoped_set_insertion.h" #include "core/fxcrt/span.h" #include "core/fxcrt/stl_util.h" +#include "core/fxcrt/unowned_ptr.h" #include "core/fxge/cfx_graphstate.h" #include "core/fxge/cfx_graphstatedata.h" @@ -257,6 +260,20 @@ } } +template <typename T> +class AutoOptionalResetter { + public: + FX_STACK_ALLOCATED(); + + explicit AutoOptionalResetter(std::optional<T>* value) : value_(value) { + CHECK(value_); + } + ~AutoOptionalResetter() { value_->reset(); } + + private: + const UnownedPtr<std::optional<T>> value_; +}; + } // namespace // static @@ -589,7 +606,25 @@ } void CPDF_StreamContentParser::OnOperator(ByteStringView op) { - auto it = g_opcodes->find(op.GetID()); + const uint32_t op_id = op.GetID(); + if (inline_image_context_.has_value()) { + switch (inline_image_context_.value().state) { + case InlineImageContext::State::kLookingForID: + // Also tolerate the case with "ID" missing. + if (op_id != FXBSTR_ID('I', 'D', 0, 0) && + op_id != FXBSTR_ID('E', 'I', 0, 0)) { + return; + } + break; + case InlineImageContext::State::kLookingForEI: + if (op_id != FXBSTR_ID('E', 'I', 0, 0)) { + return; + } + break; + } + } + + auto it = g_opcodes->find(op_id); if (it != g_opcodes->end()) { (this->*it->second)(); } @@ -640,63 +675,8 @@ } void CPDF_StreamContentParser::Handle_BeginImage() { - FX_FILESIZE savePos = syntax_->GetPos(); - auto pDict = document_->New<CPDF_Dictionary>(); - while (true) { - CPDF_StreamParser::ElementType type = syntax_->ParseNextElement(); - if (type == CPDF_StreamParser::ElementType::kKeyword) { - if (syntax_->GetWord() != "ID") { - syntax_->SetPos(savePos); - return; - } - } - if (type != CPDF_StreamParser::ElementType::kName) { - break; - } - // Next `syntax_` read below may invalidate `word`. Must save to `key`. - ByteStringView word = syntax_->GetWord(); - ByteString key(word.Last(word.GetLength() - 1)); - auto pObj = syntax_->ReadNextObject(false, false, 0); - if (pObj && !pObj->IsInline()) { - pDict->SetNewFor<CPDF_Reference>(key, document_, pObj->GetObjNum()); - } else { - pDict->SetFor(key, std::move(pObj)); - } - } - ReplaceAbbr(pDict); - RetainPtr<const CPDF_Object> pCSObj; - if (pDict->KeyExist("ColorSpace")) { - pCSObj = pDict->GetDirectObjectFor("ColorSpace"); - if (pCSObj->IsName()) { - ByteString name = pCSObj->GetString(); - if (name != "DeviceRGB" && name != "DeviceGray" && name != "DeviceCMYK") { - pCSObj = FindResourceObj("ColorSpace", name); - if (pCSObj && pCSObj->IsInline()) { - pDict->SetFor("ColorSpace", pCSObj->Clone()); - } - } - } - } - pDict->SetNewFor<CPDF_Name>("Subtype", "Image"); - RetainPtr<CPDF_Stream> pStream = - syntax_->ReadInlineStream(document_, std::move(pDict), pCSObj.Get()); - while (true) { - CPDF_StreamParser::ElementType type = syntax_->ParseNextElement(); - if (type == CPDF_StreamParser::ElementType::kEndOfData) { - return; - } - - if (type == CPDF_StreamParser::ElementType::kKeyword && - syntax_->GetWord() == "EI") { - break; - } - } - CPDF_ImageObject* pObj = AddImageFromStream(std::move(pStream), /*name=*/""); - // Record the bounding box of this image, so rendering code can draw it - // properly. - if (pObj && pObj->GetImage()->IsMask()) { - object_holder_->AddImageMaskBoundingBox(pObj->GetRect()); - } + CHECK(!inline_image_context_.has_value()); + inline_image_context_.emplace().begin_pos = syntax_->GetPos(); } void CPDF_StreamContentParser::Handle_BeginMarkedContent() { @@ -899,7 +879,79 @@ void CPDF_StreamContentParser::Handle_MarkPlace_Dictionary() {} -void CPDF_StreamContentParser::Handle_EndImage() {} +void CPDF_StreamContentParser::Handle_EndImage() { + if (!inline_image_context_.has_value()) { + return; // No matching "BI" operator. + } + + AutoOptionalResetter auto_resetter(&inline_image_context_); + const InlineImageContext& inline_image_context = + inline_image_context_.value(); + if (inline_image_context.state != InlineImageContext::State::kLookingForEI) { + return; // No matching "ID" operator. + } + + CHECK_LT(inline_image_context.begin_pos, inline_image_context.data_pos); + + const uint32_t saved_position = syntax_->GetPos(); + auto synthesized_dict = document_->New<CPDF_Dictionary>(); + syntax_->SetPos(inline_image_context.begin_pos); + while (true) { + CPDF_StreamParser::ElementType type = syntax_->ParseNextElement(); + if (syntax_->GetPos() >= inline_image_context.data_pos) { + break; // Reached "ID" operator. + } + if (type == CPDF_StreamParser::ElementType::kKeyword) { + continue; // Already handled by CPDF_StreamContentParser itself. + } + if (type != CPDF_StreamParser::ElementType::kName) { + break; + } + // Next `syntax_` read below may invalidate `word`. Must save to `key`. + ByteStringView word = syntax_->GetWord(); + ByteString key(word.Last(word.GetLength() - 1)); + auto value_object = syntax_->ReadNextObject(false, false, 0); + if (value_object && !value_object->IsInline()) { + synthesized_dict->SetNewFor<CPDF_Reference>(key, document_, + value_object->GetObjNum()); + } else { + synthesized_dict->SetFor(key, std::move(value_object)); + } + } + ReplaceAbbr(synthesized_dict); + synthesized_dict->SetNewFor<CPDF_Name>("Subtype", "Image"); + + RetainPtr<const CPDF_Object> colorspace; + if (synthesized_dict->KeyExist("ColorSpace")) { + colorspace = synthesized_dict->GetDirectObjectFor("ColorSpace"); + if (colorspace->IsName()) { + ByteString name = colorspace->GetString(); + if (name != "DeviceRGB" && name != "DeviceGray" && name != "DeviceCMYK") { + colorspace = FindResourceObj("ColorSpace", name); + if (colorspace && colorspace->IsInline()) { + synthesized_dict->SetFor("ColorSpace", colorspace->Clone()); + } + } + } + } + + syntax_->SetPos(inline_image_context.data_pos); + FX_SAFE_UINT32 stream_length = saved_position; + stream_length -= inline_image_context.data_pos; + stream_length -= 2; // Account for "EI" operator. + RetainPtr<CPDF_Stream> inline_stream = + syntax_->ReadInlineStream(document_, std::move(synthesized_dict), + colorspace, stream_length.ValueOrDie()); + CPDF_ImageObject* new_image_object = + AddImageFromStream(std::move(inline_stream), /*name=*/""); + // Record the bounding box of this image, so rendering code can draw it + // properly. + if (new_image_object && new_image_object->GetImage()->IsMask()) { + object_holder_->AddImageMaskBoundingBox(new_image_object->GetRect()); + } + + syntax_->SetPos(saved_position); +} void CPDF_StreamContentParser::Handle_EndMarkedContent() { // First element is a sentinel, so do not pop it, ever. This may come up if @@ -975,7 +1027,20 @@ cur_states_->mutable_general_state().SetFlatness(GetNumber(0)); } -void CPDF_StreamContentParser::Handle_BeginImageData() {} +void CPDF_StreamContentParser::Handle_BeginImageData() { + InlineImageContext& inline_image_context = inline_image_context_.value(); + CHECK_EQ(inline_image_context.state, + InlineImageContext::State::kLookingForID); + CHECK_EQ(inline_image_context.data_pos, 0u); + + inline_image_context.state = InlineImageContext::State::kLookingForEI; + + uint32_t pos = syntax_->GetPos(); + // Something else, like the "BI" operator, must be before the "ID" operator, + // so "ID" cannot be at position 0. + CHECK_NE(pos, 0u); + inline_image_context.data_pos = pos; +} void CPDF_StreamContentParser::Handle_SetLineJoin() { cur_states_->mutable_graph_state().SetLineJoin(
diff --git a/core/fpdfapi/page/cpdf_streamcontentparser.h b/core/fpdfapi/page/cpdf_streamcontentparser.h index 80b097c..a1bc9ed 100644 --- a/core/fpdfapi/page/cpdf_streamcontentparser.h +++ b/core/fpdfapi/page/cpdf_streamcontentparser.h
@@ -9,6 +9,7 @@ #include <array> #include <memory> +#include <optional> #include <stack> #include <variant> #include <vector> @@ -73,6 +74,14 @@ private: enum class RenderType : bool { kFill = false, kStroke = true }; + struct InlineImageContext { + enum class State : bool { kLookingForID, kLookingForEI }; + + State state = State::kLookingForID; + uint32_t begin_pos = 0; + uint32_t data_pos = 0; + }; + using ContentParam = std::variant<RetainPtr<CPDF_Object>, FX_Number, ByteString>; @@ -242,6 +251,8 @@ // The merged stream offset at which the last |syntax_| started parsing. uint32_t start_parse_offset_ = 0; + + std::optional<InlineImageContext> inline_image_context_; }; #endif // CORE_FPDFAPI_PAGE_CPDF_STREAMCONTENTPARSER_H_
diff --git a/core/fpdfapi/page/cpdf_streamparser.cpp b/core/fpdfapi/page/cpdf_streamparser.cpp index 4bdfb61..7f750ff 100644 --- a/core/fpdfapi/page/cpdf_streamparser.cpp +++ b/core/fpdfapi/page/cpdf_streamparser.cpp
@@ -25,7 +25,6 @@ #include "core/fxcodec/data_and_bytes_consumed.h" #include "core/fxcodec/jpeg/jpegmodule.h" #include "core/fxcodec/scanlinedecoder.h" -#include "core/fxcrt/autorestorer.h" #include "core/fxcrt/check.h" #include "core/fxcrt/data_vector.h" #include "core/fxcrt/fx_extension.h" @@ -140,14 +139,14 @@ RetainPtr<CPDF_Stream> CPDF_StreamParser::ReadInlineStream( CPDF_Document* pDoc, RetainPtr<CPDF_Dictionary> pDict, - const CPDF_Object* pCSObj) { - auto stream_span = buf_.subspan(pos_); + const CPDF_Object* pCSObj, + uint32_t stream_length) { + auto stream_span = buf_.subspan(pos_).first(stream_length); if (stream_span.empty()) { return nullptr; } if (PDFCharIsWhitespace(stream_span.front())) { - pos_++; stream_span = stream_span.subspan<1>(); if (stream_span.empty()) { return nullptr; @@ -201,7 +200,6 @@ auto src_span = stream_span.first(original_size); data = DataVector<uint8_t>(src_span.begin(), src_span.end()); actual_stream_size = original_size; - pos_ += original_size; } else { actual_stream_size = DecodeInlineStream(stream_span, width, height, decoder, @@ -210,26 +208,8 @@ return nullptr; } - { - AutoRestorer<uint32_t> saved_position(&pos_); - pos_ += actual_stream_size; - while (true) { - uint32_t saved_iteration_position = pos_; - ElementType type = ParseNextElement(); - if (type == ElementType::kEndOfData) { - return nullptr; - } - - if (type == ElementType::kKeyword && GetWord() == "EI") { - break; - } - - actual_stream_size += pos_ - saved_iteration_position; - } - } auto src_span = stream_span.first(actual_stream_size); data = DataVector<uint8_t>(src_span.begin(), src_span.end()); - pos_ += actual_stream_size; } pDict->SetNewFor<CPDF_Number>("Length", static_cast<int>(actual_stream_size)); return pdfium::MakeRetain<CPDF_Stream>(std::move(data), std::move(pDict));
diff --git a/core/fpdfapi/page/cpdf_streamparser.h b/core/fpdfapi/page/cpdf_streamparser.h index 417f5f2..a00cc71 100644 --- a/core/fpdfapi/page/cpdf_streamparser.h +++ b/core/fpdfapi/page/cpdf_streamparser.h
@@ -44,7 +44,8 @@ uint32_t dwRecursionLevel); RetainPtr<CPDF_Stream> ReadInlineStream(CPDF_Document* pDoc, RetainPtr<CPDF_Dictionary> pDict, - const CPDF_Object* pCSObj); + const CPDF_Object* pCSObj, + uint32_t stream_length); private: friend class CPDFStreamParserTest_ReadHexString_Test;
diff --git a/testing/SUPPRESSIONS b/testing/SUPPRESSIONS index ae5308a..aa1db92 100644 --- a/testing/SUPPRESSIONS +++ b/testing/SUPPRESSIONS
@@ -622,9 +622,6 @@ # TODO(chromium:1131694): Remove after associated bug is fixed bug_1131694.in * * * * -# TODO(thestig): Remove after fixing inline image parsing. -bug_1236805.in * * * * - # TODO(pdfium:1747): Remove after associated bug is fixed bug_1258634.in * * * *
diff --git a/testing/resources/pixel/bug_1236805_expected.pdf.0.png b/testing/resources/pixel/bug_1236805_expected.pdf.0.png index 7d4eebe..08c11b0 100644 --- a/testing/resources/pixel/bug_1236805_expected.pdf.0.png +++ b/testing/resources/pixel/bug_1236805_expected.pdf.0.png Binary files differ
diff --git a/testing/resources/pixel/bug_407752631.in b/testing/resources/pixel/bug_407752631.in new file mode 100644 index 0000000..3993fa4 --- /dev/null +++ b/testing/resources/pixel/bug_407752631.in
@@ -0,0 +1,69 @@ +{{header}} +{{object 1 0}} << + /Type /Catalog + /Pages 2 0 R +>> +endobj +{{object 2 0}} << + /Type /Pages + /Count 1 + /Kids [3 0 R] + /MediaBox [0 0 200 200] +>> +endobj +{{object 3 0}} << + /Type /Page + /Parent 2 0 R + /Contents 4 0 R +>> +endobj +{{object 4 0}} << + /Filter [ + % The stream contains a few hundred megabytes of spaces. Do a few rounds of + % compression to reduce the file size. + /ASCII85Decode + /FlateDecode + /ASCII85Decode + /FlateDecode + /ASCII85Decode + /FlateDecode + ] + {{streamlen}} +>> +stream +GhT!\fp\An(5C/G$NX+E5XeK\'GUb@;+(Ei6pVM%E&d6#hpohD-_7E?O=04LO#=ZO>,%oD/DX*[+M+AH +n@\iELsngCp8V,_>&67C1Ck_\NC('?^J9rQ:5YAbfgouJk$tOnR.c1L:(6k[0Bc\"VOmu:([Q2:\]e>S +2o35";9H3iiCU_#G+I/NM"9J[3=PIUaJBQ*K7`]DX*EH,ADNiI$\W\)*%@N":8=s+]kU1PR-8&p\k-GP +2.uFEWbfLNl!ef$?B^R'"nQ.,Ou%%M=a:<ZMs^X.DjfWFV(X@5*JNTeS_FAA-Kj6`l11DGB-u!$>YTa5 +-a$sNpJ?ol^EKZtT''G8.5PB%lMWUP\B4/9X[1.Fm7cNaL^0o!^6,71c8sniWe'7"SRug4A3Jk/c,O+P +]!>JJ;R_Y0i&!UCN\o(u$)']kWu*U!9TmG8*!,f=G=W^tCPZD%q68T"F>3:r;bA&^F$&j&ok/lHI$.QX +3Ws@K*[p.!TiF\0[u72J!6JC2(c?d=o>b$mp3C]7O82IGcL/5t3n+5Hg=YH>.'3+iT?$PUC)q+U)h1sp +SlDSUbZrHPEVQ>PV4)a17_gZ@riTU!R*"L^L;&Dg>4]-CN=p25XCK4ic^>lOk?&nAHjdk`2F27f"ZA1J +=f$][QS)armMr;\C[_'"]W8GDk+FDb;s2GRbG68Q_'nRInt<nBXVF&"q`c,;:8MHVGa8!F;oH,l0#FN; +L0JhZ8B2%rX2u&)"*Y^%-i7-%/e"]K*kZ(s#[9bPfS`\"^J2)d%C,\"0<FhH"\f)$2`#roQlSW#bJQ+0 +B>2pWFU%2?">=q`Emh4dd2+keN[#!VdPSpGY*i1gl=N6jO];[DH7c+[^ACb;InGef-%%Ht%92cFWPeT7 +_PP_Xd:.d:Z-S'tYn%(3k3U]T]?ZR9ZQ!'3c#%J#O;G/,B.udsJ3_GcE='1$V23)ViHcQJ%u=I0E,cT3 +8=kA^+hX?SZmCXf%k@u8?RmdD/g[J&bZ\d<f$#85@+uNp8?.h&;,ZOWh=6u2T"43*`!i#o;o'b$<8L<G +$-(T1]I"\t6h[f9W\&=d')^G*lGe^oU9Fd=6UK(QQ'qK)]I2.A**\8d#L6T1>O0_[dM'(\>laX^7KP42 +/!@WgNKNjLAQKC[c,pm0AMO5Oer.;CnorV_1H2#m%6'a+EUJ))NDJA^aI8Z;j@li)9*Y<pAq_I3m6air +eZ#9G]C;=iRLNqI2Ri?ngjo[FIKF[52!m$4:R:aj!rN(%OS]aYAC]);?&,$t&%RUM_Z5\3H9L<2:gHoj +=,X/G5q9g82B743=unGpqsuW?Z/@6?//K572Nr+7A@aYG;YC!NVa#!Xs3Sb.NMaA-RI^[tWnFG>P>WKc +jZPhY59\42>aPh%I_jhI5fOB5j=4YqBIM^@4mik2^=8qA`q6Y/i;iul4m=C_U+s+X7)MZqG`M8!pd1$^ +V/7.#Ss#o0*fa"4LtGqqY9rAO8"K=];#*ETQRl&?LO6MM4#q]'=076j7P;4RX6X)?^m<A?CL.KZX$TDn +I/:s\r"PQt8_c4Ef-d7p]8U>+a#?/paf#tV"T<-n!On]AoW;tV"XBKtaD.T5HEPpZ)0CiU,#St+Eq*?q +*KWCTI9C#nVof.]W0XT$KlT/!GaI=B_dmOd,U@N@"5%[,oUt/[S(.+R*fR3T:2%/aej/lo2a%UE^$MDB +GL%nG2qgUb2nit>\NPNta$c"eCDb1KXn8c)p`.!RL:'gCV&r\R-r_X;!]D%'phh>!KJa>p(`lFYXn]gm +NTHm3UfVBH`IR!neB_DWa/!W.,5ES\]N8ff#0h*LBm#+e'nCB2[[48.>5Ai0I)=Q!!I1g0XQ-7t<d8:_ +BPBPWb/A.ZKp'UC&LLV'KuSrl6TO!?0XU@QNkK\N77-g+f%Z\Mn_V+\M^^ROmYd#7fTeI!RWWu5&6dj3 +,F-AP8ZH3\r8H$(]Q\L[ak?LP@pYYiWYh`)1[u/u_p0h4,t4b`Ep@9mf:HGEU(W^BrLkU)nY/\-2uB>< +1o/d!pSKgYoic'Ni%916M[troWAYl%59R;2nK+>ei=cP]JUggfdmY<hSE/Y;:="XSmYla`BZ)Ec(?S0N +EAaAX%\LLS$O7eqlWdfa.&W&G_'E'IrkGSBn"E(WPR@u8<QGL;4hj0,9*8S[AdB[ekIt@WG=g;-NBZ$% +M)[N4cP[$a@#i?>N('9(lRNUZ<qWh[5b(t\"=mP777gQ_8$_1$ZKgi)@S;($/Pa&H*\7q^TW`M#R^Gp< +J,M!Le'^s$1nVA*Ug\q/r)Vs7*a>k(no*>MJC\`Rf#1Mc/ke;hUN4o]jZg;.J=d=F5JDc#Gg-3!X].;] +hX,&5T&Ga2b:)8hRH83=Ri^/B0TE21pL)t\=g7C?Xe3gO>/$WP9U"o@0>X6Fg4(u1rrJ1r11^~> +endstream +endobj +{{xref}} +{{trailer}} +{{startxref}} +%%EOF
diff --git a/testing/resources/pixel/bug_407752631_expected.pdf.0.png b/testing/resources/pixel/bug_407752631_expected.pdf.0.png new file mode 100644 index 0000000..f97e340 --- /dev/null +++ b/testing/resources/pixel/bug_407752631_expected.pdf.0.png Binary files differ