 // core/fpdfapi/parser/cpdf_syntax_parser.h - pdfium - Git at Google

 // Copyright 2016 The PDFium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com

 #ifndef CORE_FPDFAPI_PARSER_CPDF_SYNTAX_PARSER_H_
 #define CORE_FPDFAPI_PARSER_CPDF_SYNTAX_PARSER_H_

 #include <stdint.h>

 #include <array>
 #include <memory>
 #include <vector>

 #include "core/fpdfapi/parser/cpdf_stream.h"
 #include "core/fxcrt/data_vector.h"
 #include "core/fxcrt/fx_types.h"
 #include "core/fxcrt/retain_ptr.h"
 #include "core/fxcrt/span.h"
 #include "core/fxcrt/string_pool_template.h"
 #include "core/fxcrt/unowned_ptr.h"
 #include "core/fxcrt/weak_ptr.h"

 class CPDF_Dictionary;
 class CPDF_IndirectObjectHolder;
 class CPDF_Object;
 class CPDF_ReadValidator;
 class CPDF_Stream;
 class IFX_SeekableReadStream;

 // Syntax parser that holds a RetainPtr to a CPDF_ReadValidator
 // (|m_pFileAccess|) and tracks a current position (|m_Pos|). Only the three
 // trivial inline accessors are defined in this header; every other member
 // function is defined elsewhere, so the notes below describe what the
 // declarations show and hedge anything inferred from names.
 class CPDF_SyntaxParser {
  public:
   // Parse mode passed to GetIndirectObject() and GetObjectBodyInternal().
   // NOTE(review): what strict vs. loose parsing actually changes is decided
   // in the implementation file, not here.
   enum class ParseType : bool { kStrict, kLoose };

   // Return type of GetNextWord(): the word plus a flag for whether it is a
   // number (presumably mirroring WordType::kNumber -- confirm).
   // NOTE(review): |is_number| has no default member initializer, so a
   // default-initialized WordResult leaves it indeterminate.
   struct WordResult {
     ByteString word;
     bool is_number;
   };

   // Factory for tests (per its name). Takes the input stream and a header
   // offset; presumably forwards them to the constructors below -- confirm in
   // the implementation file.
   static std::unique_ptr<CPDF_SyntaxParser> CreateForTesting(
       RetainPtr<IFX_SeekableReadStream> pFileAccess,
       FX_FILESIZE HeaderOffset);

   // Constructs a parser over |pFileAccess|. Marked explicit, so a stream is
   // never implicitly converted into a parser.
   explicit CPDF_SyntaxParser(RetainPtr<IFX_SeekableReadStream> pFileAccess);
   // Constructs a parser over |pValidator|. |HeaderOffset| is presumably the
   // value kept in |m_HeaderOffset| -- TODO confirm.
   CPDF_SyntaxParser(RetainPtr<CPDF_ReadValidator> pValidator,
                     FX_FILESIZE HeaderOffset);
   ~CPDF_SyntaxParser();

   // Replaces |m_ReadBufferSize|, whose initial value is
   // CPDF_Stream::kFileBufSize.
   void SetReadBufferSize(uint32_t read_buffer_size) {
     m_ReadBufferSize = read_buffer_size;
   }

   // Returns the current parse position |m_Pos|.
   FX_FILESIZE GetPos() const { return m_Pos; }
   // Sets the current parse position. Defined out of line, so any clamping or
   // validation of |pos| is not visible here.
   void SetPos(FX_FILESIZE pos);

   // Object-parsing entry points. |pObjList| is the holder of indirect
   // objects; how it is consulted is defined in the implementation file.
   RetainPtr<CPDF_Object> GetObjectBody(CPDF_IndirectObjectHolder* pObjList);
   RetainPtr<CPDF_Object> GetIndirectObject(CPDF_IndirectObjectHolder* pObjList,
                                            ParseType parse_type);

   // Word/keyword scanning and searching helpers. Their names suggest they
   // advance or search relative to the current position; the exact contracts
   // (failure return values, effect on |m_Pos|) live in the implementation.
   // NOTE(review): GetCharAt() reports the byte through a reference, while the
   // private GetCharAtBackward() uses a pointer out-parameter.
   ByteString GetKeyword();
   void ToNextLine();
   void ToNextWord();
   void RecordingToNextWord();
   bool BackwardsSearchToWord(ByteStringView word, FX_FILESIZE limit);
   FX_FILESIZE FindTag(ByteStringView tag);
   bool ReadBlock(pdfium::span<uint8_t> buffer);
   bool GetCharAt(FX_FILESIZE pos, uint8_t& ch);
   WordResult GetNextWord();
   ByteString PeekNextWord();

   // Returns a RetainPtr to a CPDF_ReadValidator (presumably |m_pFileAccess|
   // -- confirm).
   RetainPtr<CPDF_ReadValidator> GetValidator() const;
   uint32_t GetDirectNum();
   bool GetNextChar(uint8_t& ch);

   // The document size may be smaller than the file size.
   // The syntax parser uses positions relative to the document
   // offset (|m_HeaderOffset|).
   // The document size is FileSize - "Header offset".
   // Offsets read from the document should not be greater than the document
   // size. Use this for checks instead of the real file size.
   FX_FILESIZE GetDocumentSize() const;

   ByteString ReadString();
   ByteString ReadHexString();

   // Stores |trailer_ends| in |m_TrailerEnds|, an UnownedPtr: this class does
   // not take ownership of the vector.
   void SetTrailerEnds(std::vector<unsigned int>* trailer_ends) {
     m_TrailerEnds = trailer_ends;
   }

  private:
   // Return type of GetNextWordInternal().
   enum class WordType : bool { kWord, kNumber };

   // These classes are granted access to the private members below.
   friend class CPDF_DataAvail;
   friend class cpdf_syntax_parser_ReadHexString_Test;

   // |s_CurrentRecursionDepth| is static, so a single counter is shared by
   // every CPDF_SyntaxParser instance. Presumably it is compared against
   // |kParserMaxRecursionDepth| during nested object parsing -- confirm in the
   // implementation file.
   static constexpr int kParserMaxRecursionDepth = 64;
   static int s_CurrentRecursionDepth;

   bool ReadBlockAt(FX_FILESIZE read_pos);
   bool GetCharAtBackward(FX_FILESIZE pos, uint8_t* ch);
   WordType GetNextWordInternal();
   bool IsWholeWord(FX_FILESIZE startpos,
                    FX_FILESIZE limit,
                    ByteStringView tag,
                    bool checkKeyword);

   unsigned int ReadEOLMarkers(FX_FILESIZE pos);
   FX_FILESIZE FindWordPos(ByteStringView word);
   FX_FILESIZE FindStreamEndPos();
   RetainPtr<CPDF_Stream> ReadStream(RetainPtr<CPDF_Dictionary> pDict);

   bool IsPositionRead(FX_FILESIZE pos) const;

   // Private counterpart of the public object-parsing entry points that also
   // takes the parse mode.
   RetainPtr<CPDF_Object> GetObjectBodyInternal(
       CPDF_IndirectObjectHolder* pObjList,
       ParseType parse_type);

   RetainPtr<CPDF_ReadValidator> m_pFileAccess;
   // The syntax parser uses positions relative to the header offset.
   // The header is normally at the start of the file, but it may be preceded
   // by other data; that leading data is ignored.
   // Both members below are const without default initializers, so they are
   // fixed by the constructor's initializer list.
   const FX_FILESIZE m_HeaderOffset;
   const FX_FILESIZE m_FileLen;
   // Current parse position, as returned by GetPos().
   FX_FILESIZE m_Pos = 0;
   WeakPtr<ByteStringPool> m_pPool;
   // Read buffer and, presumably, the position its first byte corresponds to
   // -- TODO confirm against ReadBlockAt().
   DataVector<uint8_t> m_pFileBuf;
   FX_FILESIZE m_BufOffset = 0;
   uint32_t m_WordSize = 0;
   uint32_t m_ReadBufferSize = CPDF_Stream::kFileBufSize;
   // Fixed 257-byte, zero-initialized buffer; presumably holds the bytes of
   // the current word with |m_WordSize| as its length -- TODO confirm.
   std::array<uint8_t, 257> m_WordBuffer = {};

   // The syntax parser records traversed trailer end byte offsets here.
   UnownedPtr<std::vector<unsigned int>> m_TrailerEnds;
 };

 #endif  // CORE_FPDFAPI_PARSER_CPDF_SYNTAX_PARSER_H_
	// Copyright 2016 The PDFium Authors
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com

	#ifndef CORE_FPDFAPI_PARSER_CPDF_SYNTAX_PARSER_H_
	#define CORE_FPDFAPI_PARSER_CPDF_SYNTAX_PARSER_H_

	#include <stdint.h>

	#include <array>
	#include <memory>
	#include <vector>

	#include "core/fpdfapi/parser/cpdf_stream.h"
	#include "core/fxcrt/data_vector.h"
	#include "core/fxcrt/fx_types.h"
	#include "core/fxcrt/retain_ptr.h"
	#include "core/fxcrt/span.h"
	#include "core/fxcrt/string_pool_template.h"
	#include "core/fxcrt/unowned_ptr.h"
	#include "core/fxcrt/weak_ptr.h"

	class CPDF_Dictionary;
	class CPDF_IndirectObjectHolder;
	class CPDF_Object;
	class CPDF_ReadValidator;
	class CPDF_Stream;
	class IFX_SeekableReadStream;

	// Syntax parser that holds a RetainPtr to a CPDF_ReadValidator
	// (|m_pFileAccess|) and tracks a current position (|m_Pos|). Only the three
	// trivial inline accessors are defined in this header; every other member
	// function is defined elsewhere, so the notes below describe what the
	// declarations show and hedge anything inferred from names.
	class CPDF_SyntaxParser {
	public:
	// Parse mode passed to GetIndirectObject() and GetObjectBodyInternal().
	// NOTE(review): what strict vs. loose parsing actually changes is decided
	// in the implementation file, not here.
	enum class ParseType : bool { kStrict, kLoose };

	// Return type of GetNextWord(): the word plus a flag for whether it is a
	// number (presumably mirroring WordType::kNumber -- confirm).
	// NOTE(review): |is_number| has no default member initializer, so a
	// default-initialized WordResult leaves it indeterminate.
	struct WordResult {
	ByteString word;
	bool is_number;
	};

	// Factory for tests (per its name). Takes the input stream and a header
	// offset; presumably forwards them to the constructors below -- confirm in
	// the implementation file.
	static std::unique_ptr<CPDF_SyntaxParser> CreateForTesting(
	RetainPtr<IFX_SeekableReadStream> pFileAccess,
	FX_FILESIZE HeaderOffset);

	// Constructs a parser over |pFileAccess|. Marked explicit, so a stream is
	// never implicitly converted into a parser.
	explicit CPDF_SyntaxParser(RetainPtr<IFX_SeekableReadStream> pFileAccess);
	// Constructs a parser over |pValidator|. |HeaderOffset| is presumably the
	// value kept in |m_HeaderOffset| -- TODO confirm.
	CPDF_SyntaxParser(RetainPtr<CPDF_ReadValidator> pValidator,
	FX_FILESIZE HeaderOffset);
	~CPDF_SyntaxParser();

	// Replaces |m_ReadBufferSize|, whose initial value is
	// CPDF_Stream::kFileBufSize.
	void SetReadBufferSize(uint32_t read_buffer_size) {
	m_ReadBufferSize = read_buffer_size;
	}

	// Returns the current parse position |m_Pos|.
	FX_FILESIZE GetPos() const { return m_Pos; }
	// Sets the current parse position. Defined out of line, so any clamping or
	// validation of |pos| is not visible here.
	void SetPos(FX_FILESIZE pos);

	// Object-parsing entry points. |pObjList| is the holder of indirect
	// objects; how it is consulted is defined in the implementation file.
	RetainPtr<CPDF_Object> GetObjectBody(CPDF_IndirectObjectHolder* pObjList);
	RetainPtr<CPDF_Object> GetIndirectObject(CPDF_IndirectObjectHolder* pObjList,
	ParseType parse_type);

	// Word/keyword scanning and searching helpers. Their names suggest they
	// advance or search relative to the current position; the exact contracts
	// (failure return values, effect on |m_Pos|) live in the implementation.
	// NOTE(review): GetCharAt() reports the byte through a reference, while the
	// private GetCharAtBackward() uses a pointer out-parameter.
	ByteString GetKeyword();
	void ToNextLine();
	void ToNextWord();
	void RecordingToNextWord();
	bool BackwardsSearchToWord(ByteStringView word, FX_FILESIZE limit);
	FX_FILESIZE FindTag(ByteStringView tag);
	bool ReadBlock(pdfium::span<uint8_t> buffer);
	bool GetCharAt(FX_FILESIZE pos, uint8_t& ch);
	WordResult GetNextWord();
	ByteString PeekNextWord();

	// Returns a RetainPtr to a CPDF_ReadValidator (presumably |m_pFileAccess|
	// -- confirm).
	RetainPtr<CPDF_ReadValidator> GetValidator() const;
	uint32_t GetDirectNum();
	bool GetNextChar(uint8_t& ch);

	// The document size may be smaller than the file size.
	// The syntax parser uses positions relative to the document
	// offset (|m_HeaderOffset|).
	// The document size is FileSize - "Header offset".
	// Offsets read from the document should not be greater than the document
	// size. Use this for checks instead of the real file size.
	FX_FILESIZE GetDocumentSize() const;

	ByteString ReadString();
	ByteString ReadHexString();

	// Stores |trailer_ends| in |m_TrailerEnds|, an UnownedPtr: this class does
	// not take ownership of the vector.
	void SetTrailerEnds(std::vector<unsigned int>* trailer_ends) {
	m_TrailerEnds = trailer_ends;
	}

	private:
	// Return type of GetNextWordInternal().
	enum class WordType : bool { kWord, kNumber };

	// These classes are granted access to the private members below.
	friend class CPDF_DataAvail;
	friend class cpdf_syntax_parser_ReadHexString_Test;

	// |s_CurrentRecursionDepth| is static, so a single counter is shared by
	// every CPDF_SyntaxParser instance. Presumably it is compared against
	// |kParserMaxRecursionDepth| during nested object parsing -- confirm in the
	// implementation file.
	static constexpr int kParserMaxRecursionDepth = 64;
	static int s_CurrentRecursionDepth;

	bool ReadBlockAt(FX_FILESIZE read_pos);
	bool GetCharAtBackward(FX_FILESIZE pos, uint8_t* ch);
	WordType GetNextWordInternal();
	bool IsWholeWord(FX_FILESIZE startpos,
	FX_FILESIZE limit,
	ByteStringView tag,
	bool checkKeyword);

	unsigned int ReadEOLMarkers(FX_FILESIZE pos);
	FX_FILESIZE FindWordPos(ByteStringView word);
	FX_FILESIZE FindStreamEndPos();
	RetainPtr<CPDF_Stream> ReadStream(RetainPtr<CPDF_Dictionary> pDict);

	bool IsPositionRead(FX_FILESIZE pos) const;

	// Private counterpart of the public object-parsing entry points that also
	// takes the parse mode.
	RetainPtr<CPDF_Object> GetObjectBodyInternal(
	CPDF_IndirectObjectHolder* pObjList,
	ParseType parse_type);

	RetainPtr<CPDF_ReadValidator> m_pFileAccess;
	// The syntax parser uses positions relative to the header offset.
	// The header is normally at the start of the file, but it may be preceded
	// by other data; that leading data is ignored.
	// Both members below are const without default initializers, so they are
	// fixed by the constructor's initializer list.
	const FX_FILESIZE m_HeaderOffset;
	const FX_FILESIZE m_FileLen;
	// Current parse position, as returned by GetPos().
	FX_FILESIZE m_Pos = 0;
	WeakPtr<ByteStringPool> m_pPool;
	// Read buffer and, presumably, the position its first byte corresponds to
	// -- TODO confirm against ReadBlockAt().
	DataVector<uint8_t> m_pFileBuf;
	FX_FILESIZE m_BufOffset = 0;
	uint32_t m_WordSize = 0;
	uint32_t m_ReadBufferSize = CPDF_Stream::kFileBufSize;
	// Fixed 257-byte, zero-initialized buffer; presumably holds the bytes of
	// the current word with |m_WordSize| as its length -- TODO confirm.
	std::array<uint8_t, 257> m_WordBuffer = {};

	// The syntax parser records traversed trailer end byte offsets here.
	UnownedPtr<std::vector<unsigned int>> m_TrailerEnds;
	};

	#endif // CORE_FPDFAPI_PARSER_CPDF_SYNTAX_PARSER_H_