core/fpdfapi/parser/cpdf_syntax_parser.cpp - pdfium - Git at Google

 // Copyright 2016 The PDFium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com

 #include "core/fpdfapi/parser/cpdf_syntax_parser.h"

 #include <ctype.h>

 #include <algorithm>
 #include <utility>

 #include "core/fpdfapi/parser/cpdf_array.h"
 #include "core/fpdfapi/parser/cpdf_boolean.h"
 #include "core/fpdfapi/parser/cpdf_crypto_handler.h"
 #include "core/fpdfapi/parser/cpdf_dictionary.h"
 #include "core/fpdfapi/parser/cpdf_name.h"
 #include "core/fpdfapi/parser/cpdf_null.h"
 #include "core/fpdfapi/parser/cpdf_number.h"
 #include "core/fpdfapi/parser/cpdf_read_validator.h"
 #include "core/fpdfapi/parser/cpdf_reference.h"
 #include "core/fpdfapi/parser/cpdf_stream.h"
 #include "core/fpdfapi/parser/cpdf_string.h"
 #include "core/fpdfapi/parser/fpdf_parser_utility.h"
 #include "core/fxcrt/autorestorer.h"
 #include "core/fxcrt/cfx_read_only_vector_stream.h"
 #include "core/fxcrt/check.h"
 #include "core/fxcrt/check_op.h"
 #include "core/fxcrt/fixed_size_data_vector.h"
 #include "core/fxcrt/fx_extension.h"
 #include "core/fxcrt/fx_safe_types.h"

 namespace {

 // Scanner states used by CPDF_SyntaxParser::ReadString() while decoding a
 // literal "(...)" string.
 enum class ReadStatus {
   kNormal,         // Not inside an escape sequence.
   kBackslash,      // A backslash was just consumed.
   kOctal,          // First octal digit of a backslash escape consumed.
   kFinishOctal,    // Second octal digit consumed; a third one is optional.
   kCarriageReturn  // Backslash + '\r' consumed; a following '\n' is dropped.
 };

 // Read-only window onto the byte range
 // [part_offset, part_offset + part_size) of another seekable stream. Offsets
 // given to this class are relative to the start of that window.
 class ReadableSubStream final : public IFX_SeekableReadStream {
  public:
   ReadableSubStream(RetainPtr<IFX_SeekableReadStream> pFileRead,
                     FX_FILESIZE part_offset,
                     FX_FILESIZE part_size)
       : m_pFileRead(std::move(pFileRead)),
         m_PartOffset(part_offset),
         m_PartSize(part_size) {}

   ~ReadableSubStream() override = default;

   // IFX_SeekableReadStream overrides:

   // Fills `buffer` from window-relative `offset`. Returns false, without
   // touching the underlying stream, when the requested range does not lie
   // entirely inside the window.
   bool ReadBlockAtOffset(pdfium::span<uint8_t> buffer,
                          FX_FILESIZE offset) override {
     // A negative offset would satisfy the end-of-range check below and make
     // the read start before the window in the underlying stream.
     if (offset < 0)
       return false;

     FX_SAFE_FILESIZE safe_end = offset;
     safe_end += buffer.size();
     // Check that requested range is valid, to prevent calling of ReadBlock
     // of original m_pFileRead with incorrect params.
     if (!safe_end.IsValid() || safe_end.ValueOrDie() > m_PartSize)
       return false;

     return m_pFileRead->ReadBlockAtOffset(buffer, m_PartOffset + offset);
   }

   // The size of the window, not of the underlying stream.
   FX_FILESIZE GetSize() override { return m_PartSize; }

  private:
   RetainPtr<IFX_SeekableReadStream> m_pFileRead;
   FX_FILESIZE m_PartOffset;
   FX_FILESIZE m_PartSize;
 };

 }  // namespace

 // static
 // Nesting depth of in-flight GetObjectBodyInternal() calls; that function
 // compares it against kParserMaxRecursionDepth.
 int CPDF_SyntaxParser::s_CurrentRecursionDepth = 0;

 // static
 // Builds a parser over `pFileAccess`, wrapped in a CPDF_ReadValidator whose
 // second constructor argument is null, using `HeaderOffset` as header offset.
 std::unique_ptr<CPDF_SyntaxParser> CPDF_SyntaxParser::CreateForTesting(
     RetainPtr<IFX_SeekableReadStream> pFileAccess,
     FX_FILESIZE HeaderOffset) {
   auto validator =
       pdfium::MakeRetain<CPDF_ReadValidator>(std::move(pFileAccess), nullptr);
   return std::make_unique<CPDF_SyntaxParser>(std::move(validator),
                                              HeaderOffset);
 }

 // Convenience constructor: wraps `pFileAccess` in a CPDF_ReadValidator (with a
 // null second argument) and delegates with a header offset of 0.
 CPDF_SyntaxParser::CPDF_SyntaxParser(
     RetainPtr<IFX_SeekableReadStream> pFileAccess)
     : CPDF_SyntaxParser(
           pdfium::MakeRetain<CPDF_ReadValidator>(std::move(pFileAccess),
                                                  nullptr),
           0) {}

 // Stores `validator` as the file accessor, remembers `HeaderOffset` (added to
 // m_Pos by the read functions to form an absolute file position), and caches
 // the accessor's size as the file length.
 CPDF_SyntaxParser::CPDF_SyntaxParser(RetainPtr<CPDF_ReadValidator> validator,
                                      FX_FILESIZE HeaderOffset)
     : m_pFileAccess(std::move(validator)),
       m_HeaderOffset(HeaderOffset),
       // NOTE(review): this reads m_pFileAccess, so it relies on m_pFileAccess
       // being declared before m_FileLen in the class - confirm in the header.
       m_FileLen(m_pFileAccess->GetSize()) {
   DCHECK(m_HeaderOffset <= m_FileLen);
 }

 // Defaulted: the members release whatever they hold.
 CPDF_SyntaxParser::~CPDF_SyntaxParser() = default;

 // Reads the byte at position `pos` into `ch` and leaves the current parse
 // position as it was. Returns false when no byte could be read, in which case
 // `ch` is not written.
 bool CPDF_SyntaxParser::GetCharAt(FX_FILESIZE pos, uint8_t& ch) {
   // Puts m_Pos back to its present value when this scope ends.
   AutoRestorer<FX_FILESIZE> position_guard(&m_Pos);
   m_Pos = pos;
   const bool did_read = GetNextChar(ch);
   return did_read;
 }

 // Refills the read buffer with up to m_ReadBufferSize bytes starting at the
 // absolute file position `read_pos`; the read is shortened so that it never
 // runs past the end of the file. On failure the buffer is left empty and
 // false is returned.
 bool CPDF_SyntaxParser::ReadBlockAt(FX_FILESIZE read_pos) {
   if (read_pos >= m_FileLen)
     return false;

   size_t bytes_to_read = m_ReadBufferSize;
   FX_SAFE_FILESIZE block_end = read_pos;
   block_end += bytes_to_read;
   const bool fits_in_file =
       block_end.IsValid() && block_end.ValueOrDie() <= m_FileLen;
   if (!fits_in_file)
     bytes_to_read = m_FileLen - read_pos;

   m_pFileBuf.resize(bytes_to_read);
   if (m_pFileAccess->ReadBlockAtOffset(m_pFileBuf, read_pos)) {
     m_BufOffset = read_pos;
     return true;
   }

   m_pFileBuf.clear();
   return false;
 }

 bool CPDF_SyntaxParser::GetNextChar(uint8_t& ch) {
   FX_FILESIZE pos = m_Pos + m_HeaderOffset;
   if (pos >= m_FileLen)
     return false;

   if (!IsPositionRead(pos) && !ReadBlockAt(pos))
     return false;

   ch = m_pFileBuf[pos - m_BufOffset];
   m_Pos++;
   return true;
 }

 // Number of bytes in the file from the header offset to the end.
 FX_FILESIZE CPDF_SyntaxParser::GetDocumentSize() const {
   const FX_FILESIZE bytes_after_header = m_FileLen - m_HeaderOffset;
   return bytes_after_header;
 }

 // Stores the byte at position `pos` in `*ch` without moving the parse
 // position. When the byte is not buffered, the buffer is refilled from
 // CPDF_Stream::kFileBufSize - 1 bytes before it (clamped to the file start).
 // Returns false if the byte cannot be made available.
 bool CPDF_SyntaxParser::GetCharAtBackward(FX_FILESIZE pos, uint8_t* ch) {
   const FX_FILESIZE file_pos = pos + m_HeaderOffset;
   if (file_pos >= m_FileLen)
     return false;

   if (!IsPositionRead(file_pos)) {
     const FX_FILESIZE block_start =
         file_pos >= CPDF_Stream::kFileBufSize
             ? file_pos - CPDF_Stream::kFileBufSize + 1
             : 0;
     if (!ReadBlockAt(block_start))
       return false;
     // The refilled block may still not reach `file_pos`.
     if (!IsPositionRead(file_pos))
       return false;
   }

   *ch = m_pFileBuf[file_pos - m_BufOffset];
   return true;
 }

 // Fills `buffer` from the current position, bypassing the internal read
 // buffer. The position advances by buffer.size() only when the read succeeds.
 bool CPDF_SyntaxParser::ReadBlock(pdfium::span<uint8_t> buffer) {
   const FX_FILESIZE file_pos = m_Pos + m_HeaderOffset;
   if (m_pFileAccess->ReadBlockAtOffset(buffer, file_pos)) {
     m_Pos += buffer.size();
     return true;
   }
   return false;
 }

 // Skips to the next word, then scans one token into m_WordBuffer and records
 // its length in m_WordSize. Bytes beyond the buffer capacity are consumed but
 // not stored. Returns kNumber only if every byte of the token is numeric;
 // note that kNumber is also returned, with m_WordSize == 0, when no byte could
 // be read at all.
 CPDF_SyntaxParser::WordType CPDF_SyntaxParser::GetNextWordInternal() {
   m_WordSize = 0;
   ToNextWord();

   uint8_t ch;
   if (!GetNextChar(ch))
     return WordType::kNumber;

   if (PDFCharIsDelimiter(ch)) {
     m_WordBuffer[m_WordSize++] = ch;

     if (ch == '/') {
       // A name: keep taking bytes until one that is neither "other" nor
       // numeric shows up, and un-read that terminating byte.
       while (GetNextChar(ch)) {
         const bool is_name_char = PDFCharIsOther(ch) || PDFCharIsNumeric(ch);
         if (!is_name_char) {
           m_Pos--;
           break;
         }
         if (m_WordSize < sizeof(m_WordBuffer) - 1)
           m_WordBuffer[m_WordSize++] = ch;
       }
       return WordType::kWord;
     }

     if (ch == '<' || ch == '>') {
       // "<<" and ">>" are two-byte tokens; a lone '<' or '>' stands alone.
       const uint8_t first_bracket = ch;
       if (GetNextChar(ch)) {
         if (ch == first_bracket)
           m_WordBuffer[m_WordSize++] = ch;
         else
           m_Pos--;
       }
     }
     return WordType::kWord;
   }

   // Regular token: runs up to the next delimiter or whitespace byte, which is
   // un-read so that the following call sees it.
   WordType word_type = WordType::kNumber;
   do {
     if (m_WordSize < sizeof(m_WordBuffer) - 1)
       m_WordBuffer[m_WordSize++] = ch;

     if (!PDFCharIsNumeric(ch))
       word_type = WordType::kWord;

     if (!GetNextChar(ch))
       return word_type;
   } while (!PDFCharIsDelimiter(ch) && !PDFCharIsWhitespace(ch));

   m_Pos--;
   return word_type;
 }

 // Decodes the body of a literal string. GetObjectBodyInternal() calls this
 // after it has consumed the opening '('. Reading stops at the ')' that is not
 // matched by a nested '(' or at the end of the data; the bytes decoded so far
 // are returned in both cases. Nested parentheses are kept in the result and
 // backslash escapes are resolved.
 ByteString CPDF_SyntaxParser::ReadString() {
   uint8_t ch;
   if (!GetNextChar(ch))
     return ByteString();

   ByteString buf;
   // Number of currently open nested '(' characters.
   int32_t parlevel = 0;
   ReadStatus status = ReadStatus::kNormal;
   // Value of the octal escape being accumulated.
   int32_t iEscCode = 0;
   while (true) {
     switch (status) {
       case ReadStatus::kNormal:
         if (ch == ')') {
           // An unnested ')' terminates the string and is not stored.
           if (parlevel == 0)
             return ByteString(buf);
           parlevel--;
         } else if (ch == '(') {
           parlevel++;
         }
         // The backslash itself is never stored; everything else is.
         if (ch == '\\')
           status = ReadStatus::kBackslash;
         else
           buf += static_cast<char>(ch);
         break;
       case ReadStatus::kBackslash:
         if (FXSYS_IsOctalDigit(ch)) {
           iEscCode = FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
           status = ReadStatus::kOctal;
           break;
         }
         if (ch == '\r') {
           status = ReadStatus::kCarriageReturn;
           break;
         }
         if (ch == 'n') {
           buf += '\n';
         } else if (ch == 'r') {
           buf += '\r';
         } else if (ch == 't') {
           buf += '\t';
         } else if (ch == 'b') {
           buf += '\b';
         } else if (ch == 'f') {
           buf += '\f';
         } else if (ch != '\n') {
           // Any other escaped byte stands for itself; a backslash followed by
           // '\n' contributes nothing to the result.
           buf += static_cast<char>(ch);
         }
         status = ReadStatus::kNormal;
         break;
       case ReadStatus::kOctal:
         if (FXSYS_IsOctalDigit(ch)) {
           iEscCode =
               iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
           status = ReadStatus::kFinishOctal;
         } else {
           // One-digit escape: emit it, then process `ch` again in the normal
           // state without reading a new byte.
           buf += static_cast<char>(iEscCode);
           status = ReadStatus::kNormal;
           continue;
         }
         break;
       case ReadStatus::kFinishOctal:
         status = ReadStatus::kNormal;
         if (FXSYS_IsOctalDigit(ch)) {
           // Third and last digit; the value is narrowed to char when stored.
           iEscCode =
               iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
           buf += static_cast<char>(iEscCode);
         } else {
           // Two-digit escape: emit it and process `ch` again.
           buf += static_cast<char>(iEscCode);
           continue;
         }
         break;
       case ReadStatus::kCarriageReturn:
         status = ReadStatus::kNormal;
         // Backslash + CR + LF is dropped as a whole; any other byte after the
         // CR is processed again in the normal state.
         if (ch != '\n')
           continue;
         break;
     }

     if (!GetNextChar(ch))
       break;
   }

   // Only reached after the GetNextChar() call above has failed.
   GetNextChar(ch);
   return buf;
 }

 // Decodes the body of a hexadecimal string up to and including the closing
 // '>' (or up to the end of the data). Bytes that are not hex digits are
 // skipped. A trailing unpaired digit is emitted as the high nibble of a final
 // byte whose low nibble is zero.
 ByteString CPDF_SyntaxParser::ReadHexString() {
   uint8_t ch;
   if (!GetNextChar(ch))
     return ByteString();

   ByteString decoded;
   bool expecting_high_nibble = true;
   uint8_t pending = 0;
   do {
     if (ch == '>')
       break;

     if (isxdigit(ch)) {
       const int digit = FXSYS_HexCharToInt(ch);
       if (expecting_high_nibble) {
         pending = digit * 16;
       } else {
         pending += digit;
         decoded += static_cast<char>(pending);
       }
       expecting_high_nibble = !expecting_high_nibble;
     }
   } while (GetNextChar(ch));

   if (!expecting_high_nibble)
     decoded += static_cast<char>(pending);

   return decoded;
 }

 // Advances the position to just past the next line ending ("\n", "\r\n" or a
 // lone "\r"), or to the end of the data if there is none.
 void CPDF_SyntaxParser::ToNextLine() {
   uint8_t ch;
   while (GetNextChar(ch)) {
     if (ch == '\n')
       break;

     if (ch == '\r') {
       // Un-read the byte after '\r' only if one was actually read. When '\r'
       // is the last byte, GetNextChar() fails without advancing, and stepping
       // back would leave the position on the '\r' instead of after it.
       if (GetNextChar(ch) && ch != '\n')
         --m_Pos;
       break;
     }
   }
 }

 // Moves the position to the first byte that is neither whitespace nor part of
 // a '%' comment. If the data runs out first, the position stays at the end.
 // When m_TrailerEnds is set, the recording variant does the work instead.
 void CPDF_SyntaxParser::ToNextWord() {
   if (m_TrailerEnds) {
     RecordingToNextWord();
     return;
   }

   uint8_t ch;
   if (!GetNextChar(ch))
     return;

   for (;;) {
     while (PDFCharIsWhitespace(ch)) {
       if (!GetNextChar(ch))
         return;
     }

     if (ch != '%') {
       // First byte of the next word: un-read it and stop.
       m_Pos--;
       return;
     }

     // Inside a comment: discard bytes up to and including the line ending,
     // which the whitespace loop above then swallows.
     do {
       if (!GetNextChar(ch))
         return;
     } while (!PDFCharIsLineEnding(ch));
   }
 }

 // A state machine which goes % -> E -> O -> F -> line ending.
 // Used by CPDF_SyntaxParser::RecordingToNextWord().
 enum class EofState {
   kInitial = 0,  // At the start of a line; only whitespace seen so far.
   kNonPercent,   // First non-whitespace byte is not '%': a word starts here.
   kPercent,      // Inside a comment that so far consists of '%' bytes only.
   kE,            // "%...%E" seen.
   kO,            // "%...%EO" seen.
   kF,            // "%...%EOF" seen; a line ending must follow.
   kInvalid,      // Inside a comment that is not an EOF marker.
 };

 // Variant of ToNextWord() used when m_TrailerEnds is set: skips whitespace and
 // comments like ToNextWord(), and additionally appends to *m_TrailerEnds the
 // position right after every "%%EOF" comment that is terminated by a line
 // ending.
 void CPDF_SyntaxParser::RecordingToNextWord() {
   DCHECK(m_TrailerEnds);

   EofState eof_state = EofState::kInitial;
   // Find the first character which is neither whitespace, nor part of a
   // comment.
   while (true) {
     uint8_t ch;
     // Running out of data ends the scan without un-reading anything.
     if (!GetNextChar(ch))
       return;
     switch (eof_state) {
       case EofState::kInitial:
         if (!PDFCharIsWhitespace(ch))
           eof_state = ch == '%' ? EofState::kPercent : EofState::kNonPercent;
         break;
       case EofState::kNonPercent:
         break;
       case EofState::kPercent:
         // Any number of '%' may precede the 'E'.
         if (ch == 'E')
           eof_state = EofState::kE;
         else if (ch != '%')
           eof_state = EofState::kInvalid;
         break;
       case EofState::kE:
         eof_state = ch == 'O' ? EofState::kO : EofState::kInvalid;
         break;
       case EofState::kO:
         eof_state = ch == 'F' ? EofState::kF : EofState::kInvalid;
         break;
       case EofState::kF:
         if (ch == '\r') {
           // See if \r has to be combined with a \n that follows it
           // immediately.
           if (GetNextChar(ch) && ch != '\n') {
             ch = '\r';
             m_Pos--;
           }
         }
         // If we now have a \r, that's not followed by a \n, so both are OK.
         // m_Pos is just past the line ending at this point.
         if (ch == '\r' || ch == '\n')
           m_TrailerEnds->push_back(m_Pos);
         eof_state = EofState::kInvalid;
         break;
       case EofState::kInvalid:
         break;
     }
     // A line ending closes any comment and restarts the machine.
     if (PDFCharIsLineEnding(ch))
       eof_state = EofState::kInitial;
     if (eof_state == EofState::kNonPercent)
       break;
   }
   // Un-read the first byte of the word that ended the scan.
   m_Pos--;
 }

 // Reads the next token inside a validator session. The returned word is empty
 // if the validator reported read problems; `is_number` reflects the token
 // type reported by GetNextWordInternal() either way.
 CPDF_SyntaxParser::WordResult CPDF_SyntaxParser::GetNextWord() {
   CPDF_ReadValidator::ScopedSession read_session(GetValidator());
   const bool is_number = GetNextWordInternal() == WordType::kNumber;
   if (GetValidator()->has_read_problems())
     return {ByteString(), is_number};

   return {ByteString(m_WordBuffer.data(), m_WordSize), is_number};
 }

 // Returns the next word and then puts the parse position back where it was.
 ByteString CPDF_SyntaxParser::PeekNextWord() {
   AutoRestorer<FX_FILESIZE> position_guard(&m_Pos);
   WordResult upcoming = GetNextWord();
   return upcoming.word;
 }

 // Consumes the next word and returns its text; the number flag is discarded.
 ByteString CPDF_SyntaxParser::GetKeyword() {
   WordResult keyword = GetNextWord();
   return keyword.word;
 }

 // Moves the parse position to `pos`, clamped to the file length. `pos` must
 // not be negative.
 void CPDF_SyntaxParser::SetPos(FX_FILESIZE pos) {
   DCHECK_GE(pos, 0);
   m_Pos = m_FileLen < pos ? m_FileLen : pos;
 }

 // Parses one object in loose mode inside a validator session. Returns null if
 // parsing failed or if the validator reported read problems.
 RetainPtr<CPDF_Object> CPDF_SyntaxParser::GetObjectBody(
     CPDF_IndirectObjectHolder* pObjList) {
   CPDF_ReadValidator::ScopedSession read_session(GetValidator());
   RetainPtr<CPDF_Object> parsed =
       GetObjectBodyInternal(pObjList, ParseType::kLoose);
   return GetValidator()->has_read_problems() ? nullptr : parsed;
 }

 // Parses the object that starts at the current position and returns it, or
 // null when there is no word left, the word does not start an object, the
 // recursion limit is exceeded, or (in strict mode) the object is malformed.
 // `pObjList` is handed to any CPDF_Reference created. `parse_type` applies to
 // this object only; nested objects are always parsed in loose mode.
 RetainPtr<CPDF_Object> CPDF_SyntaxParser::GetObjectBodyInternal(
     CPDF_IndirectObjectHolder* pObjList,
     ParseType parse_type) {
   // The depth counter is restored to its previous value on every exit path.
   AutoRestorer<int> depth_restorer(&s_CurrentRecursionDepth);
   if (++s_CurrentRecursionDepth > kParserMaxRecursionDepth)
     return nullptr;

   FX_FILESIZE SavedObjPos = m_Pos;
   WordResult word_result = GetNextWord();
   const ByteString& word = word_result.word;
   if (word.IsEmpty())
     return nullptr;

   if (word_result.is_number) {
     // Look ahead for "<num> <num> R". Unless that pattern is found, the
     // position is put back to just after the first number.
     AutoRestorer<FX_FILESIZE> pos_restorer(&m_Pos);
     WordResult nextword = GetNextWord();
     if (!nextword.is_number)
       return pdfium::MakeRetain<CPDF_Number>(word.AsStringView());

     WordResult nextword2 = GetNextWord();
     if (nextword2.word != "R")
       return pdfium::MakeRetain<CPDF_Number>(word.AsStringView());

     // It is a reference: keep the position after the "R". The second number
     // is not used when building the reference.
     pos_restorer.AbandonRestoration();
     uint32_t refnum = FXSYS_atoui(word.c_str());
     if (refnum == CPDF_Object::kInvalidObjNum)
       return nullptr;

     return pdfium::MakeRetain<CPDF_Reference>(pObjList, refnum);
   }

   if (word == "true" || word == "false")
     return pdfium::MakeRetain<CPDF_Boolean>(word == "true");

   if (word == "null")
     return pdfium::MakeRetain<CPDF_Null>();

   if (word == "(") {
     ByteString str = ReadString();
     return pdfium::MakeRetain<CPDF_String>(m_pPool, str, false);
   }
   if (word == "<") {
     ByteString str = ReadHexString();
     return pdfium::MakeRetain<CPDF_String>(m_pPool, str, true);
   }
   if (word == "[") {
     auto pArray = pdfium::MakeRetain<CPDF_Array>();
     // Elements are read until a nested parse returns null, which is what
     // happens on "]" as well as on any word that does not start an object.
     while (RetainPtr<CPDF_Object> pObj =
                GetObjectBodyInternal(pObjList, ParseType::kLoose)) {
       // `pObj` cannot be a stream, per ISO 32000-1:2008 section 7.3.8.1.
       if (!pObj->IsStream()) {
         pArray->Append(std::move(pObj));
       }
     }
     // Strict mode additionally requires that the word which ended the loop
     // begins with ']'.
     return (parse_type == ParseType::kLoose || m_WordBuffer[0] == ']')
                ? std::move(pArray)
                : nullptr;
   }
   if (word[0] == '/') {
     // Decode the name from the word buffer, without its leading '/'.
     auto word_span = pdfium::make_span(m_WordBuffer).first(m_WordSize);
     return pdfium::MakeRetain<CPDF_Name>(
         m_pPool, PDF_NameDecode(ByteStringView(word_span).Substr(1)));
   }
   if (word == "<<") {
     RetainPtr<CPDF_Dictionary> pDict =
         pdfium::MakeRetain<CPDF_Dictionary>(m_pPool);
     while (true) {
       WordResult inner_word_result = GetNextWord();
       const ByteString& inner_word = inner_word_result.word;
       if (inner_word.IsEmpty())
         return nullptr;

       // Position at which `inner_word` started.
       FX_FILESIZE SavedPos = m_Pos - inner_word.GetLength();
       if (inner_word == ">>")
         break;

       // A dictionary that runs into "endobj" is treated as finished; the
       // keyword is un-read.
       if (inner_word == "endobj") {
         m_Pos = SavedPos;
         break;
       }
       // Words that are not names cannot be keys and are skipped.
       if (inner_word[0] != '/')
         continue;

       ByteString key = PDF_NameDecode(inner_word.AsStringView());
       if (key.IsEmpty() && parse_type == ParseType::kLoose)
         continue;

       RetainPtr<CPDF_Object> pObj =
           GetObjectBodyInternal(pObjList, ParseType::kLoose);
       if (!pObj) {
         // Loose mode drops the key and carries on; strict mode gives up.
         if (parse_type == ParseType::kLoose)
           continue;

         ToNextLine();
         return nullptr;
       }

       // `key` has to be "/X" at the minimum.
       // `pObj` cannot be a stream, per ISO 32000-1:2008 section 7.3.8.1.
       if (key.GetLength() > 1 && !pObj->IsStream()) {
         pDict->SetFor(key.Substr(1), std::move(pObj));
       }
     }

     // A dictionary followed by the "stream" keyword is the dictionary of a
     // stream object; otherwise the peeked word is un-read.
     AutoRestorer<FX_FILESIZE> pos_restorer(&m_Pos);
     if (GetNextWord().word != "stream")
       return pDict;
     pos_restorer.AbandonRestoration();
     return ReadStream(std::move(pDict));
   }
   // Un-read a stray ">>".
   // NOTE(review): presumably so that an enclosing dictionary loop sees it as
   // its terminator - confirm.
   if (word == ">>")
     m_Pos = SavedObjPos;

   return nullptr;
 }

 // Parses "<objnum> <gennum> obj <body>" at the current position and returns
 // the body tagged with both numbers. If the three-word header is not present,
 // the position is put back to where it was and null is returned. Null is also
 // returned when the body cannot be parsed or the validator reported read
 // problems.
 RetainPtr<CPDF_Object> CPDF_SyntaxParser::GetIndirectObject(
     CPDF_IndirectObjectHolder* pObjList,
     ParseType parse_type) {
   CPDF_ReadValidator::ScopedSession read_session(GetValidator());
   const FX_FILESIZE start_pos = GetPos();

   const WordResult objnum_word = GetNextWord();
   const bool has_objnum =
       objnum_word.is_number && !objnum_word.word.IsEmpty();
   if (!has_objnum) {
     SetPos(start_pos);
     return nullptr;
   }

   const WordResult gennum_word = GetNextWord();
   const bool has_gennum =
       gennum_word.is_number && !gennum_word.word.IsEmpty();
   if (!has_gennum) {
     SetPos(start_pos);
     return nullptr;
   }

   if (GetKeyword() != "obj") {
     SetPos(start_pos);
     return nullptr;
   }

   const uint32_t parser_objnum = FXSYS_atoui(objnum_word.word.c_str());
   const uint32_t parser_gennum = FXSYS_atoui(gennum_word.word.c_str());

   RetainPtr<CPDF_Object> body = GetObjectBodyInternal(pObjList, parse_type);
   if (body) {
     body->SetObjNum(parser_objnum);
     body->SetGenNum(parser_gennum);
   }

   if (GetValidator()->has_read_problems())
     return nullptr;
   return body;
 }

 // Returns the length of the line ending that starts at `pos`: 2 for "\r\n",
 // 1 for a lone '\r' or '\n', 0 otherwise. The parse position is not moved.
 unsigned int CPDF_SyntaxParser::ReadEOLMarkers(FX_FILESIZE pos) {
   // A byte that cannot be read keeps its initial 0 and matches nothing.
   unsigned char first = 0;
   unsigned char second = 0;
   GetCharAt(pos, first);
   GetCharAt(pos + 1, second);

   if (first == '\r')
     return second == '\n' ? 2 : 1;

   return first == '\n' ? 1 : 0;
 }

 // Searches forward from the current position for the first occurrence of
 // `word` that passes the whole-word (keyword) test and returns the position
 // where it starts, or -1 if there is none. The parse position is not moved.
 FX_FILESIZE CPDF_SyntaxParser::FindWordPos(ByteStringView word) {
   AutoRestorer<FX_FILESIZE> pos_restorer(&m_Pos);
   for (FX_FILESIZE found = FindTag(word); found >= 0; found = FindTag(word)) {
     // FindTag() leaves the position right after the match.
     const FX_FILESIZE word_start = GetPos() - word.GetLength();
     if (IsWholeWord(word_start, m_FileLen, word, true))
       return word_start;
   }
   return -1;
 }

 // Locates the end of the stream data that starts at the current position by
 // looking for the nearer of the keywords "endstream" and "endobj". A line
 // ending directly in front of the keyword is excluded from the data. Returns
 // -1 if neither keyword is found or the result lies before the current
 // position.
 FX_FILESIZE CPDF_SyntaxParser::FindStreamEndPos() {
   const FX_FILESIZE endstream_pos = FindWordPos(ByteStringView("endstream"));
   const FX_FILESIZE endobj_pos = FindWordPos(ByteStringView("endobj"));

   // Can't find "endstream" or "endobj".
   if (endstream_pos < 0 && endobj_pos < 0)
     return -1;

   // Use whichever keyword was found; if both were, the one that comes first.
   FX_FILESIZE end_pos;
   if (endstream_pos < 0)
     end_pos = endobj_pos;
   else if (endobj_pos < 0)
     end_pos = endstream_pos;
   else
     end_pos = std::min(endstream_pos, endobj_pos);

   // Prefer a two-byte "\r\n" before the keyword, then a one-byte line ending.
   if (ReadEOLMarkers(end_pos - 2) == 2)
     end_pos -= 2;
   else if (ReadEOLMarkers(end_pos - 1) == 1)
     end_pos -= 1;

   return end_pos < GetPos() ? -1 : end_pos;
 }

 // Reads the data of a stream object whose dictionary `pDict` and "stream"
 // keyword have already been consumed, and returns the resulting CPDF_Stream
 // (which takes over `pDict`). The data length comes from the dictionary's
 // "Length" entry when that leads to an "endstream" keyword; otherwise it is
 // found by searching for "endstream" / "endobj". Returns null when the data is
 // not available or no end can be found.
 RetainPtr<CPDF_Stream> CPDF_SyntaxParser::ReadStream(
     RetainPtr<CPDF_Dictionary> pDict) {
   // -1 means "length unknown".
   RetainPtr<const CPDF_Number> pLenObj =
       ToNumber(pDict->GetDirectObjectFor("Length"));
   FX_FILESIZE len = pLenObj ? pLenObj->GetInteger() : -1;

   // Locate the start of stream.
   ToNextLine();
   const FX_FILESIZE streamStartPos = GetPos();

   // Distrust a length that overflows or reaches the file length.
   if (len > 0) {
     FX_SAFE_FILESIZE pos = GetPos();
     pos += len;
     if (!pos.IsValid() || pos.ValueOrDie() >= m_FileLen)
       len = -1;
   }

   RetainPtr<IFX_SeekableReadStream> substream;
   if (len > 0) {
     // Check data availability first to allow the Validator to request data
     // smoothly, without jumps.
     if (!GetValidator()->CheckDataRangeAndRequestIfUnavailable(
             m_HeaderOffset + GetPos(), len)) {
       return nullptr;
     }

     substream = pdfium::MakeRetain<ReadableSubStream>(
         GetValidator(), m_HeaderOffset + GetPos(), len);
     SetPos(GetPos() + len);
   }

   const ByteStringView kEndStreamStr("endstream");
   const ByteStringView kEndObjStr("endobj");

   // Note, we allow zero length streams as we need to pass them through when we
   // are importing pages into a new document.
   if (len >= 0) {
     CPDF_ReadValidator::ScopedSession read_session(GetValidator());
     m_Pos += ReadEOLMarkers(GetPos());
     // Clear the part of the word buffer that is compared below, so that bytes
     // left over from an earlier word cannot produce a match.
     memset(m_WordBuffer.data(), 0, kEndStreamStr.GetLength() + 1);
     GetNextWordInternal();
     if (GetValidator()->has_read_problems())
       return nullptr;

     // Earlier version of PDF specification doesn't require EOL marker before
     // 'endstream' keyword. If keyword 'endstream' follows the bytes in
     // specified length, it signals the end of stream.
     if (memcmp(m_WordBuffer.data(), kEndStreamStr.unterminated_unsigned_str(),
                kEndStreamStr.GetLength()) != 0) {
       // The declared length did not lead to "endstream": drop it and fall
       // back to searching from the start of the data.
       substream.Reset();
       len = -1;
       SetPos(streamStartPos);
     }
   }

   if (len < 0) {
     // If len is not available or incorrect, len needs to be calculated
     // by searching the keywords "endstream" or "endobj".
     const FX_FILESIZE streamEndPos = FindStreamEndPos();
     if (streamEndPos < 0)
       return nullptr;

     len = streamEndPos - streamStartPos;
     DCHECK_GE(len, 0);
     if (len > 0) {
       SetPos(streamStartPos);
       // Check data availability first to allow the Validator to request data
       // smoothly, without jumps.
       if (!GetValidator()->CheckDataRangeAndRequestIfUnavailable(
               m_HeaderOffset + GetPos(), len)) {
         return nullptr;
       }

       substream = pdfium::MakeRetain<ReadableSubStream>(
           GetValidator(), m_HeaderOffset + GetPos(), len);
       SetPos(GetPos() + len);
     }
   }

   RetainPtr<CPDF_Stream> stream;
   if (substream) {
     // It is unclear from CPDF_SyntaxParser's perspective what object
     // `substream` is ultimately holding references to. To avoid unexpectedly
     // changing object lifetimes by handing `substream` to `stream`, make a
     // copy of the data here.
     auto data = FixedSizeDataVector<uint8_t>::Uninit(substream->GetSize());
     bool did_read = substream->ReadBlockAtOffset(data.span(), 0);
     CHECK(did_read);
     auto data_as_stream =
         pdfium::MakeRetain<CFX_ReadOnlyVectorStream>(std::move(data));

     stream = pdfium::MakeRetain<CPDF_Stream>(std::move(data_as_stream),
                                              std::move(pDict));
   } else {
     // No substream was created, so the stream has no data.
     DCHECK(!len);
     stream = pdfium::MakeRetain<CPDF_Stream>(std::move(pDict));
   }
   // Read the word that follows the data and remember where it started.
   const FX_FILESIZE end_stream_offset = GetPos();
   memset(m_WordBuffer.data(), 0, kEndObjStr.GetLength() + 1);
   GetNextWordInternal();

   // Allow whitespace after endstream and before a newline.
   unsigned char ch = 0;
   while (GetNextChar(ch)) {
     if (!PDFCharIsWhitespace(ch) || PDFCharIsLineEnding(ch))
       break;
   }
   SetPos(GetPos() - 1);

   // If that word is "endobj" followed by a line ending, the "endstream"
   // keyword is missing: rewind so that "endobj" is read again by the caller.
   int numMarkers = ReadEOLMarkers(GetPos());
   if (m_WordSize == static_cast<unsigned int>(kEndObjStr.GetLength()) &&
       numMarkers != 0 &&
       memcmp(m_WordBuffer.data(), kEndObjStr.unterminated_unsigned_str(),
              kEndObjStr.GetLength()) == 0) {
     SetPos(end_stream_offset);
   }
   return stream;
 }

 uint32_t CPDF_SyntaxParser::GetDirectNum() {
   if (GetNextWordInternal() != WordType::kNumber)
     return 0;

   m_WordBuffer[m_WordSize] = 0;
   return FXSYS_atoui(pdfium::as_chars(pdfium::make_span(m_WordBuffer)).data());
 }

 // Returns the read validator through which this parser accesses the file.
 RetainPtr<CPDF_ReadValidator> CPDF_SyntaxParser::GetValidator() const {
   return m_pFileAccess;
 }

 // Tells whether the occurrence of `tag` at `startpos` stands on its own, i.e.
 // is not glued to a neighbouring byte that would make it part of a longer
 // token. A side is only examined when the tag's byte on that side is a
 // regular character, and the right-hand side only when the byte after the
 // tag is at a position <= `limit`. With `checkKeyword` set, a delimiter next
 // to the tag also disqualifies it. Neighbours that cannot be read are
 // ignored. `tag` must not be empty.
 bool CPDF_SyntaxParser::IsWholeWord(FX_FILESIZE startpos,
                                     FX_FILESIZE limit,
                                     ByteStringView tag,
                                     bool checkKeyword) {
   const uint32_t tag_length = tag.GetLength();
   const uint8_t first_char = tag[0];
   const uint8_t last_char = tag[tag_length - 1];

   const bool check_left =
       !(PDFCharIsDelimiter(first_char) || PDFCharIsWhitespace(first_char));
   const bool check_right =
       !(PDFCharIsDelimiter(last_char) || PDFCharIsWhitespace(last_char));

   // True for a neighbouring byte that makes the tag part of a longer token.
   const auto joins_word = [checkKeyword](uint8_t c) {
     return PDFCharIsNumeric(c) || PDFCharIsOther(c) ||
            (checkKeyword && PDFCharIsDelimiter(c));
   };

   uint8_t neighbour;
   if (check_right) {
     const FX_FILESIZE after_tag = startpos + static_cast<int32_t>(tag_length);
     if (after_tag <= limit && GetCharAt(after_tag, neighbour) &&
         joins_word(neighbour)) {
       return false;
     }
   }

   if (check_left && startpos > 0 && GetCharAt(startpos - 1, neighbour) &&
       joins_word(neighbour)) {
     return false;
   }
   return true;
 }

 // Searches backwards from the current position for `word`, comparing it from
 // its last byte to its first. On success the position is moved to the first
 // byte of the match and true is returned; otherwise the position is left
 // unchanged and false is returned. A non-zero `limit` bounds how far back
 // from the starting position the search may go; 0 means no bound. An empty
 // `word` is never found.
 bool CPDF_SyntaxParser::BackwardsSearchToWord(ByteStringView word,
                                               FX_FILESIZE limit) {
   int32_t taglen = word.GetLength();
   if (taglen == 0)
     return false;

   FX_FILESIZE pos = m_Pos;
   // Index of the byte of `word` that the byte at `pos` has to match.
   int32_t offset = taglen - 1;
   while (true) {
     if (limit && pos <= m_Pos - limit)
       return false;

     uint8_t byte;
     if (!GetCharAtBackward(pos, &byte))
       return false;

     if (byte == word[offset]) {
       offset--;
       if (offset >= 0) {
         pos--;
         continue;
       }
       // All bytes matched; accept only a free-standing occurrence.
       if (IsWholeWord(pos, limit, word, false)) {
         m_Pos = pos;
         return true;
       }
     }
     // Restart the comparison. If the current byte equals the last byte of
     // `word`, it already counts as the first matched byte. For a one-byte
     // word that shortcut would leave `offset` at -1 and index `word` out of
     // range on the next round, so it is only taken for longer words.
     offset = (taglen > 1 && byte == word[taglen - 1]) ? taglen - 2
                                                       : taglen - 1;
     pos--;
     if (pos < 0)
       return false;
   }
 }

 // Scans forward from the current position for the byte sequence `tag`, which
 // must not be empty. On success the position is left right after the match
 // and the distance from the starting position to the start of the match is
 // returned. Returns -1 when the data runs out first.
 FX_FILESIZE CPDF_SyntaxParser::FindTag(ByteStringView tag) {
   const FX_FILESIZE search_start = GetPos();
   const int32_t tag_length = tag.GetLength();
   DCHECK_GT(tag_length, 0);

   // Number of leading bytes of `tag` matched so far.
   int32_t matched = 0;
   uint8_t ch;
   while (GetNextChar(ch)) {
     if (ch != tag[matched]) {
       // Mismatch: the current byte may still begin a new match.
       matched = (ch == tag[0]) ? 1 : 0;
       continue;
     }
     if (++matched == tag_length)
       return GetPos() - search_start - tag_length;
   }
   return -1;
 }

 // True if the absolute file position `pos` lies inside the range currently
 // held by the read buffer.
 bool CPDF_SyntaxParser::IsPositionRead(FX_FILESIZE pos) const {
   if (pos < m_BufOffset)
     return false;

   const FX_FILESIZE buffer_end =
       static_cast<FX_FILESIZE>(m_BufOffset + m_pFileBuf.size());
   return pos < buffer_end;
 }