testing/tools/pdftoin_shrdlu.py - pdfium - Git at Google

 #!/usr/bin/env python3
 # Copyright 2025 The PDFium Authors
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 """
 Convert PDF files to .in format on a best-effort basis.

 Requires `mutool` on PATH.
 """

 import argparse
 import re
 import subprocess
 import sys
 import tempfile


 class PDFParser:
   """Best-effort, regex-based extractor of indirect objects from PDF bytes.

   This does not implement the PDF grammar. It scans the raw bytes for
   `<num> <gen> obj ... endobj` spans, so an `endobj` token appearing inside
   stream data or a string truncates that object.
   """

   # `<num> <gen> obj <body> endobj`; the body is captured without the
   # surrounding whitespace.
   _OBJ_RE = re.compile(rb'(\d+)\s+(\d+)\s+obj\s*(.*?)\s*endobj', re.DOTALL)

   # The `stream` keyword is only recognised directly after the `>>` that
   # closes the stream dictionary, so the word "stream" inside a name or a
   # string in the dictionary is not mistaken for the start of the data.
   _STREAM_RE = re.compile(r'(>>)\s*stream\b\s*(.*?)\s*endstream', re.DOTALL)

   # A direct (`/Length 42`) or indirect (`/Length 5 0 R`) length entry.
   # `/Length1` and friends are not matched because whitespace must follow
   # the key.
   _LENGTH_RE = re.compile(r'/Length\s+\d+(?:\s+\d+\s+R\b)?')

   def __init__(self, pdf_data):
     """Store the raw PDF bytes to scan.

     Args:
       pdf_data: The complete PDF file contents as `bytes`.
     """
     self.data = pdf_data
     self.objects = {}

   def parse(self):
     """Collect every indirect object found in the PDF data.

     Returns:
       The `self.objects` dict, mapping `(object_number, generation_number)`
       to the raw bytes between `obj` and `endobj`. When the same key occurs
       more than once, the last occurrence wins.
     """
     for match in self._OBJ_RE.finditer(self.data):
       key = (int(match.group(1)), int(match.group(2)))
       self.objects[key] = match.group(3)
     return self.objects

   def format_object_content(self, content):
     """Format object content, handling dictionaries and streams.

     For a stream object the dictionary's `/Length` entry is replaced with the
     `{{streamlen}}` placeholder and the result is laid out as
     `<dict>\\nstream\\n<data>\\nendstream`. Any other object is returned
     unchanged apart from stripping surrounding whitespace; in particular a
     `/Length` key in a non-stream dictionary is kept as-is, since
     `{{streamlen}}` only makes sense in front of a stream.

     Args:
       content: Raw object body as `bytes`, as produced by `parse()`.

     Returns:
       The formatted object body as `str`.
     """
     # latin-1 maps every byte to one code point, so decoding never fails.
     content_str = content.decode('latin-1', errors='replace')

     stream_match = self._STREAM_RE.search(content_str)
     if stream_match is None:
       return content_str.strip()

     # Everything up to and including the closing `>>` is the dictionary.
     dict_part = content_str[:stream_match.end(1)].strip()
     dict_part = self._LENGTH_RE.sub('{{streamlen}}', dict_part)
     stream_data = stream_match.group(2).strip()
     return f"{dict_part}\nstream\n{stream_data}\nendstream"


 def pdf_to_in(pdf_path, output_path=None):
   """Convert a PDF file to .in template text on a best-effort basis.

   The PDF is first rewritten by `mutool clean -a` into a scratch file, then
   its indirect objects are extracted with `PDFParser` and emitted in
   ascending (object number, generation) order between the `{{header}}` and
   `{{xref}}` / `{{trailer}}` / `{{startxref}}` placeholders.

   Args:
     pdf_path: Path of the PDF file to convert.
     output_path: Optional path of the .in file to write. When omitted, the
       converted text is printed to stdout instead.

   Returns:
     The converted text, without a trailing newline.

   Raises:
     RuntimeError: If `mutool` exits with a non-zero status.
   """
   import os  # Local import: only needed to build the scratch file path.

   # Use a private scratch directory rather than a kept-open NamedTemporaryFile:
   # it is removed on exit even when mutool fails, and mutool never has to
   # write to a path that this process still holds open.
   with tempfile.TemporaryDirectory() as tmp_dir:
     tmp_path = os.path.join(tmp_dir, 'cleaned.pdf')

     # Run mutool clean -a
     result = subprocess.run(['mutool', 'clean', '-a', pdf_path, tmp_path],
                             capture_output=True,
                             check=False,
                             text=True)

     if result.returncode != 0:
       raise RuntimeError(f"mutool failed: {result.stderr}")

     # Read the cleaned PDF before the scratch directory is deleted.
     with open(tmp_path, 'rb') as f:
       pdf_data = f.read()

   parser = PDFParser(pdf_data)
   objects = parser.parse()

   output_lines = ['{{header}}', '']

   # Keys are (object number, generation) tuples, so the natural tuple order
   # is the desired one.
   for obj_num, gen_num in sorted(objects):
     formatted_content = parser.format_object_content(
         objects[(obj_num, gen_num)])
     header = f'{{{{object {obj_num} {gen_num}}}}}'

     if formatted_content.startswith('<<\n'):
       # Dictionary opening on its own line: fold the `<<` into the header
       # line.
       output_lines.append(f'{header} <<')
       output_lines.append(formatted_content.removeprefix('<<\n'))
     else:
       # Arrays, numbers, strings and single-line dictionaries must not get
       # an extra `<<` prepended; emit them verbatim after the header.
       output_lines.append(header)
       output_lines.append(formatted_content)
     output_lines.append('endobj')
     output_lines.append('')

   output_lines.extend(['{{xref}}', '{{trailer}}', '{{startxref}}', '%%EOF'])
   output_text = '\n'.join(output_lines)

   if output_path:
     with open(output_path, 'w', encoding='utf-8') as f:
       f.write(output_text + '\n')
     print(f"Converted PDF written to: {output_path}")
   else:
     print(output_text)

   return output_text


 def main():
   """Parse the command line and convert the given PDF.

   Takes one positional `input` path and an optional `-o/--output` path, and
   forwards both to `pdf_to_in()`.
   """
   arg_parser = argparse.ArgumentParser(
       formatter_class=argparse.RawDescriptionHelpFormatter,
       description=__doc__)
   arg_parser.add_argument('input')
   arg_parser.add_argument('-o', '--output')
   options = arg_parser.parse_args()
   pdf_to_in(pdf_path=options.input, output_path=options.output)

 # Script entry point: run the converter and hand main()'s result to
 # sys.exit() as the process exit status.
 if __name__ == '__main__':
   exit_status = main()
   sys.exit(exit_status)
	#!/usr/bin/env python3
	# Copyright 2025 The PDFium Authors
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.
	"""
	Convert PDF files to .in format on a best-effort basis.

	Requires `mutool` on PATH.
	"""

	import argparse
	import re
	import subprocess
	import sys
	import tempfile


	class PDFParser:
	  """Best-effort, regex-based extractor of indirect objects from PDF bytes.

	  This does not implement the PDF grammar. It scans the raw bytes for
	  `<num> <gen> obj ... endobj` spans, so an `endobj` token appearing inside
	  stream data or a string truncates that object.
	  """

	  # `<num> <gen> obj <body> endobj`; the body is captured without the
	  # surrounding whitespace.
	  _OBJ_RE = re.compile(rb'(\d+)\s+(\d+)\s+obj\s*(.*?)\s*endobj', re.DOTALL)

	  # The `stream` keyword is only recognised directly after the `>>` that
	  # closes the stream dictionary, so the word "stream" inside a name or a
	  # string in the dictionary is not mistaken for the start of the data.
	  _STREAM_RE = re.compile(r'(>>)\s*stream\b\s*(.*?)\s*endstream', re.DOTALL)

	  # A direct (`/Length 42`) or indirect (`/Length 5 0 R`) length entry.
	  # `/Length1` and friends are not matched because whitespace must follow
	  # the key.
	  _LENGTH_RE = re.compile(r'/Length\s+\d+(?:\s+\d+\s+R\b)?')

	  def __init__(self, pdf_data):
	    """Store the raw PDF bytes to scan.

	    Args:
	      pdf_data: The complete PDF file contents as `bytes`.
	    """
	    self.data = pdf_data
	    self.objects = {}

	  def parse(self):
	    """Collect every indirect object found in the PDF data.

	    Returns:
	      The `self.objects` dict, mapping `(object_number, generation_number)`
	      to the raw bytes between `obj` and `endobj`. When the same key occurs
	      more than once, the last occurrence wins.
	    """
	    for match in self._OBJ_RE.finditer(self.data):
	      key = (int(match.group(1)), int(match.group(2)))
	      self.objects[key] = match.group(3)
	    return self.objects

	  def format_object_content(self, content):
	    """Format object content, handling dictionaries and streams.

	    For a stream object the dictionary's `/Length` entry is replaced with the
	    `{{streamlen}}` placeholder and the result is laid out as
	    `<dict>\\nstream\\n<data>\\nendstream`. Any other object is returned
	    unchanged apart from stripping surrounding whitespace; in particular a
	    `/Length` key in a non-stream dictionary is kept as-is, since
	    `{{streamlen}}` only makes sense in front of a stream.

	    Args:
	      content: Raw object body as `bytes`, as produced by `parse()`.

	    Returns:
	      The formatted object body as `str`.
	    """
	    # latin-1 maps every byte to one code point, so decoding never fails.
	    content_str = content.decode('latin-1', errors='replace')

	    stream_match = self._STREAM_RE.search(content_str)
	    if stream_match is None:
	      return content_str.strip()

	    # Everything up to and including the closing `>>` is the dictionary.
	    dict_part = content_str[:stream_match.end(1)].strip()
	    dict_part = self._LENGTH_RE.sub('{{streamlen}}', dict_part)
	    stream_data = stream_match.group(2).strip()
	    return f"{dict_part}\nstream\n{stream_data}\nendstream"


	def pdf_to_in(pdf_path, output_path=None):
	  """Convert a PDF file to .in template text on a best-effort basis.

	  The PDF is first rewritten by `mutool clean -a` into a scratch file, then
	  its indirect objects are extracted with `PDFParser` and emitted in
	  ascending (object number, generation) order between the `{{header}}` and
	  `{{xref}}` / `{{trailer}}` / `{{startxref}}` placeholders.

	  Args:
	    pdf_path: Path of the PDF file to convert.
	    output_path: Optional path of the .in file to write. When omitted, the
	      converted text is printed to stdout instead.

	  Returns:
	    The converted text, without a trailing newline.

	  Raises:
	    RuntimeError: If `mutool` exits with a non-zero status.
	  """
	  import os  # Local import: only needed to build the scratch file path.

	  # Use a private scratch directory rather than a kept-open
	  # NamedTemporaryFile: it is removed on exit even when mutool fails, and
	  # mutool never has to write to a path that this process still holds open.
	  with tempfile.TemporaryDirectory() as tmp_dir:
	    tmp_path = os.path.join(tmp_dir, 'cleaned.pdf')

	    # Run mutool clean -a
	    result = subprocess.run(['mutool', 'clean', '-a', pdf_path, tmp_path],
	                            capture_output=True,
	                            check=False,
	                            text=True)

	    if result.returncode != 0:
	      raise RuntimeError(f"mutool failed: {result.stderr}")

	    # Read the cleaned PDF before the scratch directory is deleted.
	    with open(tmp_path, 'rb') as f:
	      pdf_data = f.read()

	  parser = PDFParser(pdf_data)
	  objects = parser.parse()

	  output_lines = ['{{header}}', '']

	  # Keys are (object number, generation) tuples, so the natural tuple order
	  # is the desired one.
	  for obj_num, gen_num in sorted(objects):
	    formatted_content = parser.format_object_content(
	        objects[(obj_num, gen_num)])
	    header = f'{{{{object {obj_num} {gen_num}}}}}'

	    if formatted_content.startswith('<<\n'):
	      # Dictionary opening on its own line: fold the `<<` into the header
	      # line.
	      output_lines.append(f'{header} <<')
	      output_lines.append(formatted_content.removeprefix('<<\n'))
	    else:
	      # Arrays, numbers, strings and single-line dictionaries must not get
	      # an extra `<<` prepended; emit them verbatim after the header.
	      output_lines.append(header)
	      output_lines.append(formatted_content)
	    output_lines.append('endobj')
	    output_lines.append('')

	  output_lines.extend(['{{xref}}', '{{trailer}}', '{{startxref}}', '%%EOF'])
	  output_text = '\n'.join(output_lines)

	  if output_path:
	    with open(output_path, 'w', encoding='utf-8') as f:
	      f.write(output_text + '\n')
	    print(f"Converted PDF written to: {output_path}")
	  else:
	    print(output_text)

	  return output_text


	def main():
	  """Parse the command line and convert the given PDF.

	  Takes one positional `input` path and an optional `-o/--output` path, and
	  forwards both to `pdf_to_in()`.
	  """
	  parser = argparse.ArgumentParser(
	      description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
	  parser.add_argument('input')
	  parser.add_argument('-o', '--output')
	  args = parser.parse_args()
	  pdf_to_in(args.input, args.output)


	# Script entry point: run the converter and hand main()'s result to
	# sys.exit() as the process exit status.
	if __name__ == '__main__':
	  sys.exit(main())