Add a script for converting pdf files to .in format
Used to create the .in files in
https://pdfium-review.googlesource.com/c/pdfium/+/136530
Bug: none
Change-Id: Ib96357ccbb865efbeee05e6acc0b91d8b8261dc1
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/136590
Reviewed-by: Nico Weber <thakis@google.com>
Commit-Queue: Nico Weber <thakis@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
diff --git a/testing/tools/pdftoin_shrdlu.py b/testing/tools/pdftoin_shrdlu.py
new file mode 100755
index 0000000..742b8cf
--- /dev/null
+++ b/testing/tools/pdftoin_shrdlu.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+# Copyright 2025 The PDFium Authors
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+"""
+Convert PDF files to .in format on a best-effort basis.
+
+Requires `mutool` on PATH.
+"""
+
+import argparse
+import re
+import subprocess
+import sys
+import tempfile
+
+
class PDFParser:
  """Best-effort, regex-based extractor of indirect objects from PDF bytes.

  This is not a real PDF parser: objects are located purely by matching
  `N G obj ... endobj` in the raw bytes, so binary stream data that happens
  to contain `endobj` will truncate the object it belongs to.
  """

  # `<num> <gen> obj <body> endobj`; the body is matched non-greedily so each
  # match ends at the first `endobj` that follows.
  _OBJECT_RE = re.compile(rb'(\d+)\s+(\d+)\s+obj\s*(.*?)\s*endobj', re.DOTALL)

  # A stream's /Length entry, given either directly (`/Length 42`) or as an
  # indirect reference (`/Length 7 0 R`). `/Length1` and friends do not match
  # because whitespace is required right after `/Length`.
  _LENGTH_RE = re.compile(r'/Length\s+\d+(?:\s+\d+\s+R\b)?')

  # Payload between the `stream` and `endstream` keywords.
  _STREAM_RE = re.compile(r'stream\s*(.*?)\s*endstream', re.DOTALL)

  def __init__(self, pdf_data):
    """Store the raw PDF bytes; nothing is parsed until parse() is called.

    Args:
      pdf_data: complete contents of a PDF file, as bytes.
    """
    self.data = pdf_data
    # Maps (object number, generation number) -> raw object body (bytes).
    self.objects = {}

  def parse(self):
    """Collect every `N G obj ... endobj` body found in the data.

    When the same (number, generation) pair occurs more than once, the last
    occurrence wins.

    Returns:
      The `self.objects` dict, mapping (obj_num, gen_num) tuples to the bytes
      between `obj` and `endobj` with surrounding whitespace removed.
    """
    for match in self._OBJECT_RE.finditer(self.data):
      key = (int(match.group(1)), int(match.group(2)))
      self.objects[key] = match.group(3)
    return self.objects

  def format_object_content(self, content):
    """Return an object body as text suitable for a .in template.

    For stream objects the dictionary's /Length entry is replaced by the
    `{{streamlen}}` placeholder and the payload is re-emitted between
    `stream` / `endstream` lines. Objects without a stream are returned
    unchanged (apart from whitespace stripping): a /Length key there is not a
    stream length, so it must not be turned into a placeholder.

    Limitations: the first occurrence of the text `stream` is taken to be the
    stream keyword, and leading/trailing whitespace of the payload is dropped.

    Args:
      content: raw object body (bytes), as stored by parse().

    Returns:
      The formatted body as a str containing only code points below 256.
    """
    # latin-1 maps every byte to exactly one code point, so decoding cannot
    # fail and the text can be encoded back to the same bytes.
    text = content.decode('latin-1')

    stream_start = text.find('stream')
    if stream_start != -1 and 'endstream' in text:
      stream_match = self._STREAM_RE.search(text, stream_start)
      if stream_match:
        dict_part = self._LENGTH_RE.sub('{{streamlen}}',
                                        text[:stream_start].strip())
        stream_data = stream_match.group(1).strip()
        return f"{dict_part}\nstream\n{stream_data}\nendstream"

    # Not a stream object: keep the body exactly as it is.
    return text.strip()
+
+
def _clean_pdf_with_mutool(pdf_path):
  """Return the bytes of `pdf_path` after rewriting it with `mutool clean -a`."""
  import os  # Local import: only needed to build the scratch file path.

  # The scratch directory is removed when the `with` block exits, including
  # when mutool fails or reading the result raises, so nothing is left behind.
  with tempfile.TemporaryDirectory() as tmp_dir:
    tmp_path = os.path.join(tmp_dir, 'cleaned.pdf')

    result = subprocess.run(['mutool', 'clean', '-a', pdf_path, tmp_path],
                            capture_output=True,
                            check=False,
                            text=True)
    if result.returncode != 0:
      raise RuntimeError(f"mutool failed: {result.stderr}")

    with open(tmp_path, 'rb') as f:
      return f.read()


def _render_in_template(parser, objects):
  """Build the .in template text for `objects`, ordered by (number, generation)."""
  lines = ['{{header}}', '']

  for obj_num, gen_num in sorted(objects):
    formatted = parser.format_object_content(objects[(obj_num, gen_num)])
    marker = f'{{{{object {obj_num} {gen_num}}}}}'

    if formatted.startswith('<<\n'):
      # Dictionary body: put the opening `<<` on the object line.
      lines.append(f'{marker} <<')
      lines.append(formatted.removeprefix('<<\n'))
    else:
      # Arrays, numbers, strings, or dictionaries laid out on one line: emit
      # the body untouched so no extra `<<` is introduced.
      lines.append(marker)
      lines.append(formatted)

    lines.append('endobj')
    lines.append('')

  lines.extend(['{{xref}}', '{{trailer}}', '{{startxref}}', '%%EOF'])
  return '\n'.join(lines)


def pdf_to_in(pdf_path, output_path=None):
  """Convert a PDF file to .in template text on a best-effort basis.

  The PDF is first rewritten with `mutool clean -a`, then every indirect
  object found in the result is emitted between `{{header}}` and the
  `{{xref}}` / `{{trailer}}` / `{{startxref}}` placeholders.

  Args:
    pdf_path: path of the PDF file to convert.
    output_path: where to write the template. When empty or None the
      template is printed to stdout instead.

  Returns:
    The template text (without the trailing newline that is added when
    writing to `output_path`).

  Raises:
    RuntimeError: if mutool exits with a non-zero status.
    OSError: if mutool cannot be started (for example, it is not on PATH) or
      a file cannot be read or written.
  """
  pdf_data = _clean_pdf_with_mutool(pdf_path)

  parser = PDFParser(pdf_data)
  output_text = _render_in_template(parser, parser.parse())

  if output_path:
    # Object bodies are decoded from bytes as latin-1, so writing them back
    # as latin-1 reproduces the original bytes; UTF-8 would expand every
    # byte >= 0x80 into two bytes.
    with open(output_path, 'w', encoding='latin-1') as f:
      f.write(output_text + '\n')
    print(f"Converted PDF written to: {output_path}")
  else:
    print(output_text)

  return output_text
+
+
def main():
  """Command-line entry point: convert one PDF file to .in format.

  Takes the PDF path as the positional `input` argument and an optional
  `-o` / `--output` path; both are handed to pdf_to_in(), which prints the
  result to stdout when no output path is given.
  """
  cli = argparse.ArgumentParser(
      formatter_class=argparse.RawDescriptionHelpFormatter,
      description=__doc__)
  cli.add_argument('input')
  cli.add_argument('-o', '--output')

  options = cli.parse_args()
  pdf_to_in(pdf_path=options.input, output_path=options.output)
+
+
if __name__ == '__main__':
  # main() has no return statement, so this exits with status 0 unless an
  # exception propagates out of it.
  exit_status = main()
  sys.exit(exit_status)