Add a script for converting PDF files to .in format

Used to create the .in files in
https://pdfium-review.googlesource.com/c/pdfium/+/136530

Bug: none
Change-Id: Ib96357ccbb865efbeee05e6acc0b91d8b8261dc1
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/136590
Reviewed-by: Nico Weber <thakis@google.com>
Commit-Queue: Nico Weber <thakis@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
diff --git a/testing/tools/pdftoin_shrdlu.py b/testing/tools/pdftoin_shrdlu.py
new file mode 100755
index 0000000..742b8cf
--- /dev/null
+++ b/testing/tools/pdftoin_shrdlu.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+# Copyright 2025 The PDFium Authors
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+"""
+Convert PDF files to .in format on a best-effort basis.
+
+Requires `mutool` on PATH.
+"""
+
+import argparse
+import re
+import subprocess
+import sys
+import tempfile
+
+
+class PDFParser:
+
+  def __init__(self, pdf_data):
+    self.data = pdf_data
+    self.objects = {}
+
+  def parse(self):
+    # Find all objects in the PDF
+    obj_pattern = rb'(\d+)\s+(\d+)\s+obj\s*(.*?)\s*endobj'
+    matches = re.finditer(obj_pattern, self.data, re.DOTALL)
+
+    for match in matches:
+      obj_num = int(match.group(1))
+      gen_num = int(match.group(2))
+      obj_content = match.group(3)
+
+      self.objects[(obj_num, gen_num)] = obj_content
+
+    return self.objects
+
+  def format_object_content(self, content):
+    """Format object content, handling dictionaries and streams."""
+    content_str = content.decode('latin-1', errors='replace')
+
+    # Check if this is a stream object
+    if 'stream' in content_str and 'endstream' in content_str:
+      # Split into dictionary and stream parts
+      stream_start = content_str.find('stream')
+      dict_part = content_str[:stream_start].strip()
+      stream_part = content_str[stream_start:]
+
+      # Replace /Length with {{streamlen}}
+      dict_part = re.sub(r'/Length\s+\d+', '{{streamlen}}', dict_part)
+
+      # Extract actual stream data
+      stream_match = re.search(r'stream\s*(.*?)\s*endstream', stream_part,
+                               re.DOTALL)
+      if stream_match:
+        stream_data = stream_match.group(1).strip()
+        return f"{dict_part}\nstream\n{stream_data}\nendstream"
+
+    # Regular dictionary object
+    content_str = re.sub(r'/Length\s+\d+', '{{streamlen}}', content_str)
+    return content_str.strip()
+
+
+def pdf_to_in(pdf_path, output_path=None):
+  with tempfile.NamedTemporaryFile(
+      mode='wb', suffix='.pdf', delete=False) as tmp_file:
+    tmp_path = tmp_file.name
+
+    # Run mutool clean -a
+    result = subprocess.run(['mutool', 'clean', '-a', pdf_path, tmp_path],
+                            capture_output=True,
+                            check=False,
+                            text=True)
+
+    if result.returncode != 0:
+      raise RuntimeError(f"mutool failed: {result.stderr}")
+
+    # Read the cleaned PDF
+    with open(tmp_path, 'rb') as f:
+      pdf_data = f.read()
+
+  # Parse PDF
+  parser = PDFParser(pdf_data)
+  objects = parser.parse()
+
+  # Build output
+  output_lines = ['{{header}}', '']
+
+  # Sort objects by number
+  sorted_objects = sorted(objects.keys(), key=lambda x: (x[0], x[1]))
+
+  # Add objects
+  for obj_num, gen_num in sorted_objects:
+    obj_content = objects[(obj_num, gen_num)]
+    formatted_content = parser.format_object_content(obj_content)
+
+    output_lines.append(f'{{{{object {obj_num} {gen_num}}}}} <<')
+    output_lines.append(formatted_content.removeprefix('<<\n'))
+    output_lines.append('endobj')
+    output_lines.append('')
+
+  output_lines.append('{{xref}}')
+  output_lines.append('{{trailer}}')
+  output_lines.append('{{startxref}}')
+  output_lines.append('%%EOF')
+  output_text = '\n'.join(output_lines)
+
+  if output_path:
+    with open(output_path, 'w', encoding='utf-8') as f:
+      f.write(output_text + '\n')
+    print(f"Converted PDF written to: {output_path}")
+  else:
+    print(output_text)
+
+  return output_text
+
+
+def main():
+  parser = argparse.ArgumentParser(
+      description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+  parser.add_argument('input')
+  parser.add_argument('-o', '--output')
+  args = parser.parse_args()
+  pdf_to_in(args.input, args.output)
+
+
+if __name__ == '__main__':
+  sys.exit(main())  # main()'s return value becomes the exit status.