Enhance encode_pdf_filter.py to extract PNG IDAT

Adds a new "virtual" filter to encode_pdf_filter.py that extracts the
contents of IDAT chunks from a PNG file. The extracted IDAT data can be
used to "embed" PNG images in a PDF file.

For example, a 640x480 24-bit RGB PNG image might be embedded like so:

  {{object 5 0}} <<
    /Type /XObject
    /Subtype /Image
    /Width 640
    /Height 480
    /BitsPerComponent 8
    /ColorSpace /DeviceRGB
    /Filter /FlateDecode
    /DecodeParms <<
      /Columns 640
      /BitsPerComponent 8
      /Colors 3
      /Predictor 10
    >>
    {{streamlen}}
  >>
  stream
  {{include extracted-png.idat}}

  endstream
  endobj

The filter is "virtual" because it's not an actual PDF filter, although
it does contribute a /FlateDecode pass.

Bug: chromium:587182
Change-Id: Id4f3989e2225627d9d46df1aeba10b60eb04ee75
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/68195
Auto-Submit: K Moon <kmoon@chromium.org>
Commit-Queue: K Moon <kmoon@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
diff --git a/testing/tools/encode_pdf_filter.py b/testing/tools/encode_pdf_filter.py
index 2d56543..5f29c82 100755
--- a/testing/tools/encode_pdf_filter.py
+++ b/testing/tools/encode_pdf_filter.py
@@ -38,18 +38,24 @@
 
     return filter_class
 
-  @staticmethod
-  def RegisterFilter(filter_class):
-    assert filter_class not in _PdfStream._unique_filter_classes
-    _PdfStream._unique_filter_classes.append(filter_class)
+  @classmethod
+  def Register(cls):
+    assert cls not in _PdfStream._unique_filter_classes
+    _PdfStream._unique_filter_classes.append(cls)
+    cls.RegisterByName()
+    cls.RegisterByAliases()
 
-    assert filter_class.name[0] == '/'
-    lower_name = filter_class.name.lower()
-    _PdfStream._filter_classes[lower_name] = filter_class
-    _PdfStream._filter_classes[lower_name[1:]] = filter_class
+  @classmethod
+  def RegisterByName(cls):
+    assert cls.name[0] == '/'
+    lower_name = cls.name.lower()
+    _PdfStream._filter_classes[lower_name] = cls
+    _PdfStream._filter_classes[lower_name[1:]] = cls
 
-    for alias in filter_class.aliases:
-      _PdfStream._filter_classes[alias.lower()] = filter_class
+  @classmethod
+  def RegisterByAliases(cls):
+    for alias in cls.aliases:
+      _PdfStream._filter_classes[alias.lower()] = cls
 
   @staticmethod
   def GetHelp():
@@ -59,6 +65,21 @@
                                             ', '.join(filter_class.aliases))
     return text
 
+  @classmethod
+  def AddEntries(cls, entries):
+    _PdfStream.AddListEntry(entries, 'Filter', cls.name)
+
+  @staticmethod
+  def AddListEntry(entries, key, value):
+    old_value = entries.get(key)
+    if old_value is None:
+      entries[key] = value
+    else:
+      if not isinstance(old_value, collections.abc.MutableSequence):
+        old_value = [old_value]
+        entries[key] = old_value
+      old_value.append(value)
+
   def __init__(self, out_buffer, **kwargs):
     del kwargs
     self.buffer = out_buffer
@@ -166,9 +187,94 @@
     self.buffer.close()
 
 
-_PdfStream.RegisterFilter(_Ascii85DecodePdfStream)
-_PdfStream.RegisterFilter(_AsciiHexDecodePdfStream)
-_PdfStream.RegisterFilter(_FlateDecodePdfStream)
+class _VirtualPdfStream(_PdfStream):
+
+  @classmethod
+  def RegisterByName(cls):
+    pass
+
+  @classmethod
+  def AddEntries(cls, entries):
+    pass
+
+
+class _PassthroughPdfStream(_VirtualPdfStream):
+  name = '(virtual) passthrough'
+  aliases = ('noop', 'passthrough')
+
+
+class _PngIdatPdfStream(_VirtualPdfStream):
+  name = '(virtual) PNG IDAT'
+  aliases = ('png',)
+
+  _EXPECT_HEADER = -1
+  _EXPECT_LENGTH = -2
+  _EXPECT_CHUNK_TYPE = -3
+  _EXPECT_CRC = -4
+
+  _PNG_HEADER = 0x89504E470D0A1A0A
+  _PNG_CHUNK_IDAT = 0x49444154
+
+  @classmethod
+  def AddEntries(cls, entries):
+    # Technically only true for compression method 0 (zlib), but no other
+    # methods have been standardized.
+    _PdfStream.AddListEntry(entries, 'Filter', '/FlateDecode')
+
+  def __init__(self, out_buffer, **kwargs):
+    super().__init__(out_buffer, **kwargs)
+    self.chunk = _PngIdatPdfStream._EXPECT_HEADER
+    self.remaining = 8
+    self.accumulator = 0
+    self.length = 0
+
+  def write(self, data):
+    position = 0
+    while position < len(data):
+      if self.chunk >= 0:
+        # Only pass through IDAT chunk data.
+        read_size = min(self.remaining, len(data) - position)
+        if self.chunk == _PngIdatPdfStream._PNG_CHUNK_IDAT:
+          self.buffer.write(data[position:position + read_size])
+        self.remaining -= read_size
+        if self.remaining == 0:
+          self.ResetAccumulator(_PngIdatPdfStream._EXPECT_CRC, 4)
+        position += read_size
+      else:
+        # As far as we're concerned, PNG files are just a header followed by a
+        # series of (length, chunk type, data[length], CRC) chunks.
+        if self.AccumulateByte(data[position]):
+          if self.chunk == _PngIdatPdfStream._EXPECT_HEADER:
+            if self.accumulator != _PngIdatPdfStream._PNG_HEADER:
+              raise ValueError('Invalid PNG header', self.accumulator)
+            self.ResetAccumulator(_PngIdatPdfStream._EXPECT_LENGTH, 4)
+          elif self.chunk == _PngIdatPdfStream._EXPECT_LENGTH:
+            self.length = self.accumulator
+            self.ResetAccumulator(_PngIdatPdfStream._EXPECT_CHUNK_TYPE, 4)
+          elif self.chunk == _PngIdatPdfStream._EXPECT_CHUNK_TYPE:
+            self.ResetAccumulator(self.accumulator, self.length)
+          elif self.chunk == _PngIdatPdfStream._EXPECT_CRC:
+            # Don't care if the CRC is correct.
+            self.ResetAccumulator(_PngIdatPdfStream._EXPECT_LENGTH, 4)
+        position += 1
+
+  def ResetAccumulator(self, chunk, remaining):
+    self.chunk = chunk
+    self.remaining = remaining
+    self.accumulator = 0
+
+  def AccumulateByte(self, byte):
+    assert self.remaining > 0
+    self.accumulator = self.accumulator << 8 | byte
+    self.remaining -= 1
+    return self.remaining == 0
+
+
+_Ascii85DecodePdfStream.Register()
+_AsciiHexDecodePdfStream.Register()
+_FlateDecodePdfStream.Register()
+_PassthroughPdfStream.Register()
+_PngIdatPdfStream.Register()
 
 _DEFAULT_FILTERS = (_Ascii85DecodePdfStream, _FlateDecodePdfStream)
 
@@ -261,7 +367,7 @@
 
 
 def _EncodePdfValue(value):
-  if isinstance(value, collections.abc.Sequence):
+  if isinstance(value, collections.abc.MutableSequence):
     value = '[' + ' '.join(value) + ']'
   return value
 
@@ -276,7 +382,8 @@
     out_buffer.close()
 
   entries = collections.OrderedDict()
-  entries['Filter'] = [f.name for f in args.filter]
+  for f in args.filter:
+    f.AddEntries(entries)
   _WritePdfStreamObject(
       args.outfile.buffer,
       data=encoded_sink.getbuffer(),