testing/tools/strip_jp2_comments.py - pdfium - Git at Google

 #!/usr/bin/env python3
 # Copyright 2023 The PDFium Authors
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 """Strips comments from a JP2 file.

 This is a simple filter script to strip comments from a JP2 file, in order to
 save a few bytes from the final file size.
 """

 import struct
 import sys

 # Size in bytes of the box header read by parse_box(): a 4-byte big-endian
 # length followed by a 4-byte type tag.
 BOX_HEADER_SIZE = 8
 # Type tag of the codestream box, the only box that main() rewrites.
 BOX_TAG_JP2C = b'jp2c'

 # A codestream marker is always 2 bytes: MARKER_START followed by a tag byte.
 MARKER_SIZE = 2
 MARKER_START = 0xff
 # A start byte followed by this tag is not a real marker and is skipped.
 MARKER_TAG_IGNORE = 0x00
 # Tag of the comment marker that rewrite_jp2c() removes.
 MARKER_TAG_COMMENT = 0x64
 # A start byte followed by another start byte is treated as a possible fill
 # byte; the second byte is then examined as the start of a marker.
 MARKER_TAG_FILL = 0xff


 def parse_box(buffer, offset):
   """Parses the next box in a JP2 file.

   A box starts with an 8-byte header: a 4-byte big-endian length, which counts
   the header itself, followed by a 4-byte type tag.

   Args:
     buffer: A buffer containing the JP2 file contents.
     offset: The starting offset into the buffer.

   Returns:
     A tuple (next_offset, tag) where next_offset is the ending offset, and tag
     is the type tag. The box contents will be buffer[offset + 8:next_offset].
     A length field of 0 means the box runs to the end of the file, so
     next_offset is len(buffer) in that case.

   Raises:
     ValueError: If the length field is between 1 and 7. Such a box cannot be
       described by the 8-byte header assumed here, and stepping by it would
       land inside the header instead of on the next box.
     struct.error: If fewer than 8 bytes are available at offset.
   """
   header_format = '>I4s'
   header_size = struct.calcsize(header_format)

   length, tag = struct.unpack_from(header_format, buffer, offset)
   if length == 0:
     # The final box may leave its length unset; it then extends to the end of
     # the file. Returning offset + 0 would stall any caller scanning boxes.
     return len(buffer), tag
   if length < header_size:
     # 1 announces a 64-bit extended length and 2-7 are not valid sizes; either
     # way the fixed 8-byte header assumed here cannot describe the box.
     raise ValueError('Unsupported box length {} at offset {}'.format(
         length, offset))
   return offset + length, tag


 def parse_marker(buffer, offset):
   """Parses the next marker in a codestream.

   A marker is the byte 0xff followed by a tag byte. A 0xff followed by 0x00 is
   not a marker and is skipped. A 0xff followed by another 0xff is treated as a
   fill byte, and the second 0xff is examined as a possible marker start.

   Args:
     buffer: A buffer containing the codestream.
     offset: The starting offset into the buffer.

   Returns:
     A tuple (next_offset, tag) where next_offset is the offset after the marker,
     and tag is the type tag. If no marker was found, next_offset will point to
     the end of the buffer, and tag will be None. A marker is always 2 bytes.
   """
   end = len(buffer)
   while True:
     # Search for start of marker.
     start = buffer.find(MARKER_START, offset)
     if start == -1 or start + 1 == end:
       # Nothing left to parse, or a lone trailing start byte with no tag.
       # Always return a 2-tuple so callers can unpack the result.
       return end, None

     tag = buffer[start + 1]
     if tag == MARKER_TAG_FILL:
       # Possible fill byte, reparse as start of marker. The search position
       # must move forward or find() would return this same byte forever.
       offset = start + 1
     elif tag == MARKER_TAG_IGNORE:
       # Not a real marker; resume the search after both bytes.
       offset = start + MARKER_SIZE
     else:
       return start + MARKER_SIZE, tag


 def rewrite_jp2c(buffer):
   """Builds a replacement "jp2c" box with comment markers removed.

   The extent of a comment is determined by scanning for the next marker after
   the comment marker; everything in between is dropped.

   Args:
     buffer: The contents of the original "jp2c" box, without its box header.

   Returns:
     A bytearray holding a complete "jp2c" box: a fresh 8-byte header followed
     by the codestream with each comment removed.
   """
   # Reserve room for the box header; it is filled in once the size is known.
   output = bytearray(BOX_HEADER_SIZE)

   cursor = 0
   keep_from = 0
   while cursor < len(buffer):
     cursor, marker = parse_marker(buffer, cursor)
     if marker != MARKER_TAG_COMMENT:
       # Any other marker stays part of the pending pass-through range.
       continue

     # Emit everything up to, but not including, the comment marker.
     output += buffer[keep_from:cursor - MARKER_SIZE]

     # The comment ends where the next marker begins.
     cursor, marker = parse_marker(buffer, cursor)
     if marker is not None:
       # Step back so the loop examines this marker again; it may itself be
       # another comment.
       cursor -= MARKER_SIZE
     keep_from = cursor

   # Emit whatever follows the last comment.
   output += buffer[keep_from:]

   struct.pack_into('>I4s', output, 0, len(output), BOX_TAG_JP2C)
   return output


 def main(in_file, out_file):
   """Copies a JP2 file from in_file to out_file, stripping codestream comments.

   Args:
     in_file: A binary file object to read the whole JP2 file from.
     out_file: A binary file object to write the filtered JP2 file to.
   """
   data = in_file.read()
   total = len(data)

   # Walk the file one box at a time.
   pos = 0
   while pos < total:
     box_end, box_tag = parse_box(data, pos)
     if box_tag != BOX_TAG_JP2C:
       # Every box other than the codestream is copied verbatim.
       chunk = data[pos:box_end]
     else:
       # The "jp2c" (codestream) box is rebuilt from its contents.
       chunk = rewrite_jp2c(data[pos + BOX_HEADER_SIZE:box_end])
     out_file.write(chunk)
     pos = box_end

   out_file.flush()


 if __name__ == '__main__':
   # Act as a binary filter: JP2 in on stdin, stripped JP2 out on stdout.
   main(in_file=sys.stdin.buffer, out_file=sys.stdout.buffer)
	#!/usr/bin/env python3
	# Copyright 2023 The PDFium Authors
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.
	"""Strips comments from a JP2 file.

	This is a simple filter script to strip comments from a JP2 file, in order to
	save a few bytes from the final file size.
	"""

	import struct
	import sys

	# Size in bytes of the box header read by parse_box(): a 4-byte big-endian
	# length followed by a 4-byte type tag.
	BOX_HEADER_SIZE = 8
	# Type tag of the codestream box, the only box that main() rewrites.
	BOX_TAG_JP2C = b'jp2c'

	# A codestream marker is always 2 bytes: MARKER_START followed by a tag byte.
	MARKER_SIZE = 2
	MARKER_START = 0xff
	# A start byte followed by this tag is not a real marker and is skipped.
	MARKER_TAG_IGNORE = 0x00
	# Tag of the comment marker that rewrite_jp2c() removes.
	MARKER_TAG_COMMENT = 0x64
	# A start byte followed by another start byte is treated as a possible fill
	# byte; the second byte is then examined as the start of a marker.
	MARKER_TAG_FILL = 0xff


	def parse_box(buffer, offset):
	  """Parses the next box in a JP2 file.

	  A box starts with an 8-byte header: a 4-byte big-endian length, which counts
	  the header itself, followed by a 4-byte type tag.

	  Args:
	    buffer: A buffer containing the JP2 file contents.
	    offset: The starting offset into the buffer.

	  Returns:
	    A tuple (next_offset, tag) where next_offset is the ending offset, and tag
	    is the type tag. The box contents will be buffer[offset + 8:next_offset].
	    A length field of 0 means the box runs to the end of the file, so
	    next_offset is len(buffer) in that case.

	  Raises:
	    ValueError: If the length field is between 1 and 7. Such a box cannot be
	      described by the 8-byte header assumed here, and stepping by it would
	      land inside the header instead of on the next box.
	    struct.error: If fewer than 8 bytes are available at offset.
	  """
	  header_format = '>I4s'
	  header_size = struct.calcsize(header_format)

	  length, tag = struct.unpack_from(header_format, buffer, offset)
	  if length == 0:
	    # The final box may leave its length unset; it then extends to the end of
	    # the file. Returning offset + 0 would stall any caller scanning boxes.
	    return len(buffer), tag
	  if length < header_size:
	    # 1 announces a 64-bit extended length and 2-7 are not valid sizes; either
	    # way the fixed 8-byte header assumed here cannot describe the box.
	    raise ValueError('Unsupported box length {} at offset {}'.format(
	        length, offset))
	  return offset + length, tag


	def parse_marker(buffer, offset):
	  """Parses the next marker in a codestream.

	  A marker is the byte 0xff followed by a tag byte. A 0xff followed by 0x00 is
	  not a marker and is skipped. A 0xff followed by another 0xff is treated as a
	  fill byte, and the second 0xff is examined as a possible marker start.

	  Args:
	    buffer: A buffer containing the codestream.
	    offset: The starting offset into the buffer.

	  Returns:
	    A tuple (next_offset, tag) where next_offset is the offset after the
	    marker, and tag is the type tag. If no marker was found, next_offset will
	    point to the end of the buffer, and tag will be None. A marker is always 2
	    bytes.
	  """
	  end = len(buffer)
	  while True:
	    # Search for start of marker.
	    start = buffer.find(MARKER_START, offset)
	    if start == -1 or start + 1 == end:
	      # Nothing left to parse, or a lone trailing start byte with no tag.
	      # Always return a 2-tuple so callers can unpack the result.
	      return end, None

	    tag = buffer[start + 1]
	    if tag == MARKER_TAG_FILL:
	      # Possible fill byte, reparse as start of marker. The search position
	      # must move forward or find() would return this same byte forever.
	      offset = start + 1
	    elif tag == MARKER_TAG_IGNORE:
	      # Not a real marker; resume the search after both bytes.
	      offset = start + MARKER_SIZE
	    else:
	      return start + MARKER_SIZE, tag


	def rewrite_jp2c(buffer):
	  """Builds a replacement "jp2c" box with comment markers removed.

	  The extent of a comment is determined by scanning for the next marker after
	  the comment marker; everything in between is dropped.

	  Args:
	    buffer: The contents of the original "jp2c" box, without its box header.

	  Returns:
	    A bytearray holding a complete "jp2c" box: a fresh 8-byte header followed
	    by the codestream with each comment removed.
	  """
	  # Reserve room for the box header; it is filled in once the size is known.
	  output = bytearray(BOX_HEADER_SIZE)

	  cursor = 0
	  keep_from = 0
	  while cursor < len(buffer):
	    cursor, marker = parse_marker(buffer, cursor)
	    if marker != MARKER_TAG_COMMENT:
	      # Any other marker stays part of the pending pass-through range.
	      continue

	    # Emit everything up to, but not including, the comment marker.
	    output += buffer[keep_from:cursor - MARKER_SIZE]

	    # The comment ends where the next marker begins.
	    cursor, marker = parse_marker(buffer, cursor)
	    if marker is not None:
	      # Step back so the loop examines this marker again; it may itself be
	      # another comment.
	      cursor -= MARKER_SIZE
	    keep_from = cursor

	  # Emit whatever follows the last comment.
	  output += buffer[keep_from:]

	  struct.pack_into('>I4s', output, 0, len(output), BOX_TAG_JP2C)
	  return output


	def main(in_file, out_file):
	  """Copies a JP2 file from in_file to out_file, stripping codestream comments.

	  Args:
	    in_file: A binary file object to read the whole JP2 file from.
	    out_file: A binary file object to write the filtered JP2 file to.
	  """
	  data = in_file.read()
	  total = len(data)

	  # Walk the file one box at a time.
	  pos = 0
	  while pos < total:
	    box_end, box_tag = parse_box(data, pos)
	    if box_tag != BOX_TAG_JP2C:
	      # Every box other than the codestream is copied verbatim.
	      chunk = data[pos:box_end]
	    else:
	      # The "jp2c" (codestream) box is rebuilt from its contents.
	      chunk = rewrite_jp2c(data[pos + BOX_HEADER_SIZE:box_end])
	    out_file.write(chunk)
	    pos = box_end

	  out_file.flush()


	if __name__ == '__main__':
	  # Act as a binary filter: JP2 in on stdin, stripped JP2 out on stdout.
	  main(in_file=sys.stdin.buffer, out_file=sys.stdout.buffer)