// blob: cce2f287bbca68de9c14bac6e52d5fd33183ff6e [file] [log] [blame]
// Copyright 2016 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
#ifndef CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
#define CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
#include <stddef.h>
#include <stdint.h>
#include <limits>
#include <map>
#include <memory>
#include <set>
#include <vector>
#include "core/fpdfapi/parser/cpdf_cross_ref_table.h"
#include "core/fpdfapi/parser/cpdf_indirect_object_holder.h"
#include "core/fxcrt/bytestring.h"
#include "core/fxcrt/fx_types.h"
#include "core/fxcrt/retain_ptr.h"
#include "core/fxcrt/unowned_ptr.h"
class CPDF_Array;
class CPDF_Dictionary;
class CPDF_LinearizedHeader;
class CPDF_Object;
class CPDF_ObjectStream;
class CPDF_ReadValidator;
class CPDF_SecurityHandler;
class CPDF_SyntaxParser;
class IFX_SeekableReadStream;
class CPDF_Parser {
public:
using ObjectType = CPDF_CrossRefTable::ObjectType;
using ObjectInfo = CPDF_CrossRefTable::ObjectInfo;
class ParsedObjectsHolder : public CPDF_IndirectObjectHolder {
public:
virtual bool TryInit() = 0;
};
enum Error {
SUCCESS = 0,
FILE_ERROR,
FORMAT_ERROR,
PASSWORD_ERROR,
HANDLER_ERROR
};
// A limit on the maximum object number in the xref table. Theoretical limits
// are higher, but this may be large enough in practice.
// Note: This was 1M, but https://crbug.com/910009 encountered a PDF with
// object numbers in the 1.7M range. The PDF only has 10K objects, but they
// are non-consecutive.
static constexpr uint32_t kMaxObjectNumber = 4 * 1024 * 1024;
static constexpr size_t kInvalidPos = std::numeric_limits<size_t>::max();
explicit CPDF_Parser(ParsedObjectsHolder* holder);
CPDF_Parser();
~CPDF_Parser();
Error StartParse(const RetainPtr<IFX_SeekableReadStream>& pFile,
const ByteString& password);
Error StartLinearizedParse(const RetainPtr<CPDF_ReadValidator>& validator,
const ByteString& password);
void SetPassword(const ByteString& password) { m_Password = password; }
ByteString GetPassword() const { return m_Password; }
// Take the GetPassword() value and encode it, if necessary, based on the
// password encoding conversion.
ByteString GetEncodedPassword() const;
const CPDF_Dictionary* GetTrailer() const;
CPDF_Dictionary* GetMutableTrailerForTesting();
// Returns a new trailer which combines the last read trailer with the /Root
// and /Info from previous ones.
RetainPtr<CPDF_Dictionary> GetCombinedTrailer() const;
FX_FILESIZE GetLastXRefOffset() const { return m_LastXRefOffset; }
uint32_t GetPermissions() const;
uint32_t GetRootObjNum() const;
uint32_t GetInfoObjNum() const;
const CPDF_Array* GetIDArray() const;
CPDF_Dictionary* GetRoot() const;
const CPDF_Dictionary* GetEncryptDict() const;
RetainPtr<CPDF_Object> ParseIndirectObject(uint32_t objnum);
uint32_t GetLastObjNum() const;
bool IsValidObjectNumber(uint32_t objnum) const;
FX_FILESIZE GetObjectPositionOrZero(uint32_t objnum) const;
bool IsObjectFreeOrNull(uint32_t objnum) const;
const RetainPtr<CPDF_SecurityHandler>& GetSecurityHandler() const {
return m_pSecurityHandler;
}
bool IsObjectFree(uint32_t objnum) const;
int GetFileVersion() const { return m_FileVersion; }
bool IsXRefStream() const { return m_bXRefStream; }
RetainPtr<CPDF_Object> ParseIndirectObjectAt(FX_FILESIZE pos,
uint32_t objnum);
uint32_t GetFirstPageNo() const;
const CPDF_LinearizedHeader* GetLinearizedHeader() const {
return m_pLinearized.get();
}
const CPDF_CrossRefTable* GetCrossRefTable() const {
return m_CrossRefTable.get();
}
bool xref_table_rebuilt() const { return m_bXRefTableRebuilt; }
CPDF_SyntaxParser* GetSyntax() const { return m_pSyntax.get(); }
void SetLinearizedHeaderForTesting(
std::unique_ptr<CPDF_LinearizedHeader> pLinearized);
protected:
bool LoadCrossRefV4(FX_FILESIZE pos, bool bSkip);
bool RebuildCrossRef();
Error StartParseInternal();
FX_FILESIZE ParseStartXRef();
std::unique_ptr<CPDF_LinearizedHeader> ParseLinearizedHeader();
void SetSyntaxParserForTesting(std::unique_ptr<CPDF_SyntaxParser> parser);
private:
friend class CPDF_DataAvail;
struct CrossRefObjData {
uint32_t obj_num = 0;
ObjectInfo info;
};
bool LoadAllCrossRefV4(FX_FILESIZE xref_offset);
bool LoadAllCrossRefV5(FX_FILESIZE xref_offset);
bool LoadCrossRefV5(FX_FILESIZE* pos, bool bMainXRef);
void ProcessCrossRefV5Entry(pdfium::span<const uint8_t> entry_span,
pdfium::span<const uint32_t> field_widths,
uint32_t obj_num);
RetainPtr<CPDF_Dictionary> LoadTrailerV4();
Error SetEncryptHandler();
void ReleaseEncryptHandler();
bool LoadLinearizedAllCrossRefV4(FX_FILESIZE main_xref_offset);
bool LoadLinearizedAllCrossRefV5(FX_FILESIZE main_xref_offset);
Error LoadLinearizedMainXRefTable();
const CPDF_ObjectStream* GetObjectStream(uint32_t object_number);
void ShrinkObjectMap(uint32_t size);
// A simple check whether the cross reference table matches with
// the objects.
bool VerifyCrossRefV4();
// If out_objects is null, the parser position will be moved to end subsection
// without additional validation.
bool ParseAndAppendCrossRefSubsectionData(
uint32_t start_objnum,
uint32_t count,
std::vector<CrossRefObjData>* out_objects);
bool ParseCrossRefV4(std::vector<CrossRefObjData>* out_objects);
void MergeCrossRefObjectsData(const std::vector<CrossRefObjData>& objects);
bool InitSyntaxParser(const RetainPtr<CPDF_ReadValidator>& validator);
bool ParseFileVersion();
ObjectType GetObjectType(uint32_t objnum) const;
std::unique_ptr<CPDF_SyntaxParser> m_pSyntax;
std::unique_ptr<ParsedObjectsHolder> m_pOwnedObjectsHolder;
UnownedPtr<ParsedObjectsHolder> m_pObjectsHolder;
bool m_bHasParsed = false;
bool m_bXRefStream = false;
bool m_bXRefTableRebuilt = false;
int m_FileVersion = 0;
// m_CrossRefTable must be destroyed after m_pSecurityHandler due to the
// ownership of the ID array data.
std::unique_ptr<CPDF_CrossRefTable> m_CrossRefTable;
FX_FILESIZE m_LastXRefOffset;
RetainPtr<CPDF_SecurityHandler> m_pSecurityHandler;
ByteString m_Password;
std::unique_ptr<CPDF_LinearizedHeader> m_pLinearized;
// A map of object numbers to indirect streams.
std::map<uint32_t, std::unique_ptr<CPDF_ObjectStream>> m_ObjectStreamMap;
// All indirect object numbers that are being parsed.
std::set<uint32_t> m_ParsingObjNums;
uint32_t m_MetadataObjnum = 0;
};
#endif // CORE_FPDFAPI_PARSER_CPDF_PARSER_H_