Use document size instead of file size while parsing.

We should use document size instead of File size,
because all offsets and sizes was read from document should
take into account of header offset.

Added some tests of parsing of documents with header offset.
Also drop friendship of CPDF_SyntaxParser with CPDF_Parser.

Change-Id: Iebec75ab2ee07fb644a6c653b4ef5c2e09af09fe
Reviewed-on: https://pdfium-review.googlesource.com/35830
Commit-Queue: Art Snake <art-snake@yandex-team.ru>
Reviewed-by: Lei Zhang <thestig@chromium.org>
diff --git a/core/fpdfapi/edit/cpdf_creator.cpp b/core/fpdfapi/edit/cpdf_creator.cpp
index aa8c751..11b0628 100644
--- a/core/fpdfapi/edit/cpdf_creator.cpp
+++ b/core/fpdfapi/edit/cpdf_creator.cpp
@@ -17,6 +17,7 @@
 #include "core/fpdfapi/parser/cpdf_parser.h"
 #include "core/fpdfapi/parser/cpdf_security_handler.h"
 #include "core/fpdfapi/parser/cpdf_string.h"
+#include "core/fpdfapi/parser/cpdf_syntax_parser.h"
 #include "core/fpdfapi/parser/fpdf_parser_decode.h"
 #include "core/fpdfapi/parser/fpdf_parser_utility.h"
 #include "core/fxcrt/fx_extension.h"
@@ -254,20 +255,19 @@
       }
       m_iStage = Stage::kInitWriteObjs20;
     } else {
-      m_SavedOffset = m_pParser->GetFileAccess()->GetSize();
+      m_SavedOffset = m_pParser->GetSyntax()->GetDocumentSize();
       m_iStage = Stage::kWriteIncremental15;
     }
   }
   if (m_iStage == Stage::kWriteIncremental15) {
     if (m_IsOriginal && m_SavedOffset > 0) {
-      RetainPtr<IFX_SeekableReadStream> pSrcFile = m_pParser->GetFileAccess();
-      std::vector<uint8_t> buffer(4096);
+      static constexpr FX_FILESIZE kBufferSize = 4096;
+      std::vector<uint8_t> buffer(kBufferSize);
       FX_FILESIZE src_size = m_SavedOffset;
+      m_pParser->GetSyntax()->SetPos(0);
       while (src_size) {
-        uint32_t block_size = src_size > 4096 ? 4096 : src_size;
-        if (!pSrcFile->ReadBlock(buffer.data(),
-                                 m_Archive->CurrentOffset() - src_size,
-                                 block_size)) {
+        const FX_FILESIZE block_size = std::min(kBufferSize, src_size);
+        if (!m_pParser->GetSyntax()->ReadBlock(buffer.data(), block_size)) {
           return Stage::kInvalid;
         }
         if (!m_Archive->WriteBlock(buffer.data(), block_size))
diff --git a/core/fpdfapi/parser/cpdf_linearized_header.cpp b/core/fpdfapi/parser/cpdf_linearized_header.cpp
index 5032bc3..8286e0f 100644
--- a/core/fpdfapi/parser/cpdf_linearized_header.cpp
+++ b/core/fpdfapi/parser/cpdf_linearized_header.cpp
@@ -38,17 +38,17 @@
 }
 
 bool IsLinearizedHeaderValid(const CPDF_LinearizedHeader* header,
-                             FX_FILESIZE file_size) {
+                             FX_FILESIZE document_size) {
   ASSERT(header);
-  return header->GetFileSize() == file_size &&
+  return header->GetFileSize() == document_size &&
          static_cast<int>(header->GetFirstPageNo()) >= 0 &&
          header->GetFirstPageNo() < kMaxInt &&
          header->GetFirstPageNo() < header->GetPageCount() &&
-         header->GetMainXRefTableFirstEntryOffset() < file_size &&
+         header->GetMainXRefTableFirstEntryOffset() < document_size &&
          header->GetPageCount() > 0 &&
-         header->GetFirstPageEndOffset() < file_size &&
-         header->GetLastXRefOffset() < file_size &&
-         header->GetHintStart() < file_size;
+         header->GetFirstPageEndOffset() < document_size &&
+         header->GetLastXRefOffset() < document_size &&
+         header->GetHintStart() < document_size;
 }
 
 }  // namespace
@@ -78,10 +78,9 @@
   auto result = pdfium::WrapUnique(
       new CPDF_LinearizedHeader(pDict.get(), parser->GetPos()));
 
-  if (!IsLinearizedHeaderValid(result.get(),
-                               parser->GetFileAccess()->GetSize())) {
+  if (!IsLinearizedHeaderValid(result.get(), parser->GetDocumentSize()))
     return nullptr;
-  }
+
   return result;
 }
 
diff --git a/core/fpdfapi/parser/cpdf_parser.cpp b/core/fpdfapi/parser/cpdf_parser.cpp
index 16077fd..be18560 100644
--- a/core/fpdfapi/parser/cpdf_parser.cpp
+++ b/core/fpdfapi/parser/cpdf_parser.cpp
@@ -119,10 +119,6 @@
   return GetObjectType(objnum) == ObjectType::kFree;
 }
 
-RetainPtr<IFX_SeekableReadStream> CPDF_Parser::GetFileAccess() const {
-  return m_pSyntax->GetFileAccess();
-}
-
 void CPDF_Parser::ShrinkObjectMap(uint32_t objnum) {
   m_CrossRefTable->ShrinkObjectMap(objnum);
 }
@@ -230,8 +226,7 @@
 
 FX_FILESIZE CPDF_Parser::ParseStartXRef() {
   static constexpr char kStartXRefKeyword[] = "startxref";
-  m_pSyntax->SetPos(m_pSyntax->m_FileLen - m_pSyntax->m_HeaderOffset -
-                    strlen(kStartXRefKeyword));
+  m_pSyntax->SetPos(m_pSyntax->GetDocumentSize() - strlen(kStartXRefKeyword));
   if (!m_pSyntax->BackwardsSearchToWord(kStartXRefKeyword, 4096))
     return 0;
 
@@ -245,7 +240,7 @@
     return 0;
 
   const FX_SAFE_FILESIZE result = FXSYS_atoi64(xrefpos_str.c_str());
-  if (!result.IsValid() || result.ValueOrDie() >= GetFileAccess()->GetSize())
+  if (!result.IsValid() || result.ValueOrDie() >= m_pSyntax->GetDocumentSize())
     return 0;
 
   return result.ValueOrDie();
@@ -453,7 +448,7 @@
     return false;
 
   const size_t max_entries_in_file =
-      m_pSyntax->GetFileAccess()->GetSize() / kEntryConstSize;
+      m_pSyntax->GetDocumentSize() / kEntryConstSize;
   if (new_size.ValueOrDie() > max_entries_in_file)
     return false;
 
diff --git a/core/fpdfapi/parser/cpdf_parser.h b/core/fpdfapi/parser/cpdf_parser.h
index 0af39d7..7cd98c2 100644
--- a/core/fpdfapi/parser/cpdf_parser.h
+++ b/core/fpdfapi/parser/cpdf_parser.h
@@ -90,7 +90,6 @@
   CPDF_SecurityHandler* GetSecurityHandler() const {
     return m_pSecurityHandler.get();
   }
-  RetainPtr<IFX_SeekableReadStream> GetFileAccess() const;
   bool IsObjectFree(uint32_t objnum) const;
 
   int GetFileVersion() const { return m_FileVersion; }
@@ -109,6 +108,8 @@
     return m_CrossRefTable.get();
   }
 
+  CPDF_SyntaxParser* GetSyntax() const { return m_pSyntax.get(); }
+
   void SetLinearizedHeader(std::unique_ptr<CPDF_LinearizedHeader> pLinearized);
 
  protected:
@@ -121,6 +122,9 @@
   bool RebuildCrossRef();
 
  private:
+  friend class cpdf_parser_ParseStartXRefWithHeaderOffset_Test;
+  friend class cpdf_parser_ParseStartXRef_Test;
+  friend class cpdf_parser_ParseLinearizedWithHeaderOffset_Test;
   friend class CPDF_DataAvail;
 
   struct CrossRefObjData {
diff --git a/core/fpdfapi/parser/cpdf_parser_unittest.cpp b/core/fpdfapi/parser/cpdf_parser_unittest.cpp
index db48493..4ce1238 100644
--- a/core/fpdfapi/parser/cpdf_parser_unittest.cpp
+++ b/core/fpdfapi/parser/cpdf_parser_unittest.cpp
@@ -3,8 +3,12 @@
 // found in the LICENSE file.
 
 #include <limits>
+#include <memory>
 #include <string>
+#include <vector>
 
+#include "core/fpdfapi/parser/cpdf_linearized_header.h"
+#include "core/fpdfapi/parser/cpdf_object.h"
 #include "core/fpdfapi/parser/cpdf_parser.h"
 #include "core/fpdfapi/parser/cpdf_syntax_parser.h"
 #include "core/fxcrt/fx_extension.h"
@@ -43,13 +47,19 @@
   }
 
   // Setup reading from a buffer and initial states.
-  bool InitTestFromBuffer(const unsigned char* buffer, size_t len) {
-    // For the test file, the header is set at the beginning.
+  bool InitTestFromBufferWithOffset(const unsigned char* buffer,
+                                    size_t len,
+                                    int header_offset) {
     m_pSyntax->InitParser(
-        pdfium::MakeRetain<CFX_BufferSeekableReadStream>(buffer, len), 0);
+        pdfium::MakeRetain<CFX_BufferSeekableReadStream>(buffer, len),
+        header_offset);
     return true;
   }
 
+  bool InitTestFromBuffer(const unsigned char* buffer, size_t len) {
+    return InitTestFromBufferWithOffset(buffer, len, 0 /*header_offset*/);
+  }
+
  private:
   // Add test cases here as private friend so that protected members in
   // CPDF_Parser can be accessed by test cases.
@@ -226,3 +236,58 @@
     }
   }
 }
+
+TEST(cpdf_parser, ParseStartXRef) {
+  CPDF_TestParser parser;
+  std::string test_file;
+  ASSERT_TRUE(
+      PathService::GetTestFilePath("annotation_stamp_with_ap.pdf", &test_file));
+  ASSERT_TRUE(parser.InitTestFromFile(test_file.c_str())) << test_file;
+
+  EXPECT_EQ(100940, parser.ParseStartXRef());
+  std::unique_ptr<CPDF_Object> cross_ref_v5_obj =
+      parser.ParseIndirectObjectAt(100940, 0);
+  ASSERT_TRUE(cross_ref_v5_obj);
+  EXPECT_EQ(75u, cross_ref_v5_obj->GetObjNum());
+}
+
+TEST(cpdf_parser, ParseStartXRefWithHeaderOffset) {
+  static constexpr FX_FILESIZE kTestHeaderOffset = 765;
+  std::string test_file;
+  ASSERT_TRUE(
+      PathService::GetTestFilePath("annotation_stamp_with_ap.pdf", &test_file));
+  RetainPtr<IFX_SeekableReadStream> pFileAccess =
+      IFX_SeekableReadStream::CreateFromFilename(test_file.c_str());
+  ASSERT_TRUE(pFileAccess);
+
+  std::vector<unsigned char> data(pFileAccess->GetSize() + kTestHeaderOffset);
+  ASSERT_TRUE(pFileAccess->ReadBlock(&data.front() + kTestHeaderOffset, 0,
+                                     pFileAccess->GetSize()));
+  CPDF_TestParser parser;
+  parser.InitTestFromBufferWithOffset(&data.front(), data.size(),
+                                      kTestHeaderOffset);
+
+  EXPECT_EQ(100940, parser.ParseStartXRef());
+  std::unique_ptr<CPDF_Object> cross_ref_v5_obj =
+      parser.ParseIndirectObjectAt(100940, 0);
+  ASSERT_TRUE(cross_ref_v5_obj);
+  EXPECT_EQ(75u, cross_ref_v5_obj->GetObjNum());
+}
+
+TEST(cpdf_parser, ParseLinearizedWithHeaderOffset) {
+  static constexpr FX_FILESIZE kTestHeaderOffset = 765;
+  std::string test_file;
+  ASSERT_TRUE(PathService::GetTestFilePath("linearized.pdf", &test_file));
+  RetainPtr<IFX_SeekableReadStream> pFileAccess =
+      IFX_SeekableReadStream::CreateFromFilename(test_file.c_str());
+  ASSERT_TRUE(pFileAccess);
+
+  std::vector<unsigned char> data(pFileAccess->GetSize() + kTestHeaderOffset);
+  ASSERT_TRUE(pFileAccess->ReadBlock(&data.front() + kTestHeaderOffset, 0,
+                                     pFileAccess->GetSize()));
+  CPDF_TestParser parser;
+  parser.InitTestFromBufferWithOffset(&data.front(), data.size(),
+                                      kTestHeaderOffset);
+
+  EXPECT_TRUE(parser.ParseLinearizedHeader());
+}
diff --git a/core/fpdfapi/parser/cpdf_syntax_parser.cpp b/core/fpdfapi/parser/cpdf_syntax_parser.cpp
index f8f36ae..af5ff48 100644
--- a/core/fpdfapi/parser/cpdf_syntax_parser.cpp
+++ b/core/fpdfapi/parser/cpdf_syntax_parser.cpp
@@ -112,6 +112,10 @@
   return true;
 }
 
+FX_FILESIZE CPDF_SyntaxParser::GetDocumentSize() const {
+  return m_FileLen - m_HeaderOffset;
+}
+
 bool CPDF_SyntaxParser::GetCharAtBackward(FX_FILESIZE pos, uint8_t* ch) {
   pos += m_HeaderOffset;
   if (pos >= m_FileLen)
@@ -714,6 +718,7 @@
   m_pFileBuf.clear();
   m_HeaderOffset = HeaderOffset;
   m_FileLen = validator->GetSize();
+  ASSERT(m_HeaderOffset <= m_FileLen);
   m_Pos = 0;
   m_pFileAccess = validator;
   m_BufOffset = 0;
@@ -813,10 +818,6 @@
   return -1;
 }
 
-RetainPtr<IFX_SeekableReadStream> CPDF_SyntaxParser::GetFileAccess() const {
-  return m_pFileAccess;
-}
-
 bool CPDF_SyntaxParser::IsPositionRead(FX_FILESIZE pos) const {
   return m_BufOffset <= pos &&
          pos < static_cast<FX_FILESIZE>(m_BufOffset + m_pFileBuf.size());
diff --git a/core/fpdfapi/parser/cpdf_syntax_parser.h b/core/fpdfapi/parser/cpdf_syntax_parser.h
index 962c327..ebd844c 100644
--- a/core/fpdfapi/parser/cpdf_syntax_parser.h
+++ b/core/fpdfapi/parser/cpdf_syntax_parser.h
@@ -60,16 +60,24 @@
   ByteString GetNextWord(bool* bIsNumber);
   ByteString PeekNextWord(bool* bIsNumber);
 
-  RetainPtr<IFX_SeekableReadStream> GetFileAccess() const;
-
   const RetainPtr<CPDF_ReadValidator>& GetValidator() const {
     return m_pFileAccess;
   }
   uint32_t GetDirectNum();
   bool GetNextChar(uint8_t& ch);
 
+  // The document size may be smaller than the file size.
+  // The syntax parser use position relative to document
+  // offset (|m_HeaderOffset|).
+  // The document size will be FileSize - "Header offset".
+  // All offsets was readed from document, should not be great than document
+  // size. Use it for checks instead of real file size.
+  FX_FILESIZE GetDocumentSize() const;
+
+  ByteString ReadString();
+  ByteString ReadHexString();
+
  private:
-  friend class CPDF_Parser;
   friend class CPDF_DataAvail;
   friend class cpdf_syntax_parser_ReadHexString_Test;
 
@@ -84,8 +92,6 @@
                    const ByteStringView& tag,
                    bool checkKeyword);
 
-  ByteString ReadString();
-  ByteString ReadHexString();
   unsigned int ReadEOLMarkers(FX_FILESIZE pos);
   FX_FILESIZE FindWordPos(const ByteStringView& word);
   FX_FILESIZE FindStreamEndPos();
@@ -99,6 +105,9 @@
       ParseType parse_type);
 
   FX_FILESIZE m_Pos;
+  // The syntax parser use position relative to header offset.
+  // The header contains at file start, and can follow after some stuff. We
+  // ignore this stuff.
   FX_FILESIZE m_HeaderOffset;
   FX_FILESIZE m_FileLen;
   WeakPtr<ByteStringPool> m_pPool;