blob: 742b8cf6607e846e7167652571a06da152ff474f [file] [log] [blame] [edit]
#!/usr/bin/env python3
# Copyright 2025 The PDFium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""
Convert PDF files to .in format on a best-effort basis.
Requires `mutool` on PATH.
"""
import argparse
import os
import re
import subprocess
import sys
import tempfile
class PDFParser:
  """Best-effort, regex-based extractor of indirect objects from PDF bytes.

  The parser does not follow the cross-reference table; it simply scans the
  raw bytes for `N G obj ... endobj` spans.
  """

  # `N G obj <body> endobj`; DOTALL because object bodies span several lines.
  _OBJ_RE = re.compile(rb'(\d+)\s+(\d+)\s+obj\s*(.*?)\s*endobj', re.DOTALL)

  # The `stream` keyword up to the first `endstream`. The word boundaries
  # keep the keyword from matching inside `endstream` itself or inside a
  # longer name in the dictionary (for example `/Bitstream`).
  _STREAM_RE = re.compile(r'\bstream\b\s*(.*?)\s*endstream', re.DOTALL)

  # A direct `/Length <int>` entry. The `\b` stops the integer from being
  # split by backtracking, and the lookahead rejects an indirect reference
  # (`/Length 12 0 R`), which would otherwise turn into the broken
  # `{{streamlen}} 0 R`.
  _LENGTH_RE = re.compile(r'/Length\s+\d+\b(?!\s+\d+\s+R\b)')

  def __init__(self, pdf_data):
    """Store the raw PDF bytes to be scanned.

    Args:
      pdf_data: the complete PDF file contents as `bytes`.
    """
    self.data = pdf_data
    # Maps (object number, generation number) -> raw object body bytes.
    self.objects = {}

  def parse(self):
    """Collect every `N G obj ... endobj` span found in the data.

    When the same (number, generation) pair occurs more than once, the last
    occurrence in the file wins.

    Returns:
      The `self.objects` dict, mapping (obj_num, gen_num) to the bytes
      between `obj` and `endobj` with surrounding whitespace removed.
    """
    for match in self._OBJ_RE.finditer(self.data):
      key = (int(match.group(1)), int(match.group(2)))
      self.objects[key] = match.group(3)
    return self.objects

  def format_object_content(self, content):
    """Format object content, handling dictionaries and streams.

    For a stream object, the direct `/Length <int>` entry of its dictionary
    is replaced with the `{{streamlen}}` placeholder, and the stream is
    re-emitted with `stream` / `endstream` each on a line of their own.
    Objects without a stream are returned unchanged apart from stripping
    surrounding whitespace: a `/Length` key there (for example the key
    length of an encryption dictionary) does not describe a stream.

    Args:
      content: raw object body bytes, as stored by `parse()`.

    Returns:
      The formatted object body as a `str`.
    """
    # latin-1 maps every byte to exactly one code point, so decoding cannot
    # fail and no bytes are lost.
    content_str = content.decode('latin-1', errors='replace')

    stream_match = self._STREAM_RE.search(content_str)
    if stream_match is None:
      return content_str.strip()

    dict_part = content_str[:stream_match.start()].strip()
    dict_part = self._LENGTH_RE.sub('{{streamlen}}', dict_part)
    stream_data = stream_match.group(1).strip()
    return f"{dict_part}\nstream\n{stream_data}\nendstream"
def pdf_to_in(pdf_path, output_path=None):
  """Convert the PDF at `pdf_path` into .in template text.

  The input is first normalized with `mutool clean -a`, then every indirect
  object found in the cleaned file is emitted as an `{{object N G}}` entry,
  ordered by object and generation number, followed by the `{{xref}}`,
  `{{trailer}}` and `{{startxref}}` placeholders.

  Args:
    pdf_path: path of the PDF file to convert.
    output_path: optional path of the .in file to write. When omitted, the
      result is printed to stdout instead.

  Returns:
    The generated template text (without a trailing newline).

  Raises:
    RuntimeError: if `mutool` exits with a non-zero status.
  """
  # Only a unique name is needed here; the file is closed right away so
  # that mutool can overwrite it.
  with tempfile.NamedTemporaryFile(
      mode='wb', suffix='.pdf', delete=False) as tmp_file:
    tmp_path = tmp_file.name

  try:
    # Run mutool clean -a
    result = subprocess.run(['mutool', 'clean', '-a', pdf_path, tmp_path],
                            capture_output=True,
                            check=False,
                            text=True)
    if result.returncode != 0:
      raise RuntimeError(f"mutool failed: {result.stderr}")

    # Read the cleaned PDF
    with open(tmp_path, 'rb') as f:
      pdf_data = f.read()
  finally:
    # The file was created with delete=False, so it has to be removed
    # explicitly. Cleanup is best-effort and must not mask the real outcome.
    try:
      os.unlink(tmp_path)
    except OSError:
      pass

  parser = PDFParser(pdf_data)
  objects = parser.parse()

  output_lines = ['{{header}}', '']

  # Keys are (obj_num, gen_num) tuples, which already sort in that order.
  for obj_num, gen_num in sorted(objects):
    body = parser.format_object_content(objects[(obj_num, gen_num)])
    header = f'{{{{object {obj_num} {gen_num}}}}}'
    if body.startswith('<<'):
      # Dictionaries open on the header line. Drop the body's own `<<`
      # (and the line break right after it, if any) so it is not doubled.
      header += ' <<'
      body = re.sub(r'\A<<\r?\n?', '', body)
    # Arrays, numbers, strings etc. are emitted as-is under a bare header.
    output_lines.append(header)
    output_lines.append(body)
    output_lines.append('endobj')
    output_lines.append('')

  output_lines.append('{{xref}}')
  output_lines.append('{{trailer}}')
  output_lines.append('{{startxref}}')
  output_lines.append('%%EOF')
  output_text = '\n'.join(output_lines)

  if output_path:
    with open(output_path, 'w', encoding='utf-8') as f:
      f.write(output_text + '\n')
    print(f"Converted PDF written to: {output_path}")
  else:
    print(output_text)
  return output_text
def main():
  """Parse the command line and convert the given PDF.

  Takes one positional argument, the input PDF path, and an optional
  `-o/--output` path; both are forwarded to `pdf_to_in()`.
  """
  arg_parser = argparse.ArgumentParser(
      formatter_class=argparse.RawDescriptionHelpFormatter,
      description=__doc__)
  arg_parser.add_argument('input')
  arg_parser.add_argument('-o', '--output')
  options = arg_parser.parse_args()
  pdf_to_in(options.input, options.output)
if __name__ == '__main__':
  # Run the converter only when executed as a script, and hand main()'s
  # return value to sys.exit() as the process exit status.
  exit_status = main()
  sys.exit(exit_status)