#!/usr/bin/env python3
"""Create minimal valid PDF fixtures with proper xref tables."""

import os
import re

# NOTE(review): many object bodies, the catalog body and the trailer body
# in this file are the literal strings "<>" / "<>>>>>>" rather than
# "<< ... >>" dictionaries. They are reproduced exactly as found; confirm
# the intended dictionary contents before relying on these fixtures.

# Default directory the fixtures are written to.
FIXTURES_DIR = "/home/coding/pdftract/tests/document_model/fixtures"

# An object header on a line of its own, e.g. b"12 0 obj". Anchoring to the
# start of a line keeps text inside an object body from being mistaken for
# a header.
_OBJ_HEADER_RE = re.compile(rb"^(\d+) 0 obj\b", re.MULTILINE)

# Every fixture starts with the version header followed by a blank line.
_PDF_HEADER = ["%PDF-1.4", ""]

_FREE_ENTRY = "0000000000 65535 f "


def _indirect_object(obj_num, *body_lines):
    """Return the lines of one indirect object plus a trailing blank line."""
    return [f"{obj_num} 0 obj", *body_lines, "endobj", ""]


def build_pdf_bytes(lines, fallback_max_obj):
    """Assemble body lines, xref table and trailer into final PDF bytes.

    Args:
        lines: Body lines (header and objects); joined with "\\n".
        fallback_max_obj: Highest object number to assume when no object
            header is found in ``lines``.

    Returns:
        The complete file content as ``bytes`` (no newline after %%EOF).

    Offsets are measured on the UTF-8 encoded body, so they are true byte
    offsets even when a line contains non-ASCII text. Entry 0 is always the
    free-list head, so an object literally numbered 0 in the body is not
    addressable through the table. Any object number between 1 and the
    highest one found that has no header gets a free entry, keeping the
    number of entries equal to the count declared in the subsection header.
    """
    body = "\n".join(lines).encode("utf-8")

    # Later duplicates of an object number win, matching a plain loop.
    offsets = {
        int(match.group(1)): match.start()
        for match in _OBJ_HEADER_RE.finditer(body)
    }

    # The xref keyword starts right after the newline that follows the body.
    xref_offset = len(body) + 1
    max_obj = max(offsets) if offsets else fallback_max_obj

    xref_lines = ["xref", f"0 {max_obj + 1}", _FREE_ENTRY]
    for obj_num in range(1, max_obj + 1):
        offset = offsets.get(obj_num)
        if offset is None:
            xref_lines.append(_FREE_ENTRY)
        else:
            # Trailing space + "\n" makes each entry exactly 20 bytes.
            xref_lines.append(f"{offset:010d} 00000 n ")

    # NOTE(review): the trailer body is the literal "<>" as found in the
    # original; confirm the intended trailer dictionary (/Size, /Root, ...).
    trailer_lines = ["trailer", "<>", "startxref", f"{xref_offset}", "%%EOF"]

    tail = "\n".join(xref_lines) + "\n" + "\n".join(trailer_lines)
    return body + b"\n" + tail.encode("ascii")


def _write_fixture(fixture_name, lines, fallback_max_obj, output_dir=None):
    """Build the PDF for ``lines`` and write it as ``<fixture_name>.pdf``.

    The file is written in binary mode so the bytes on disk are exactly the
    bytes the xref offsets were computed from (no newline translation, no
    locale-dependent encoding). The target directory is created if missing.
    """
    target_dir = FIXTURES_DIR if output_dir is None else output_dir
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    output_path = os.path.join(target_dir, f"{fixture_name}.pdf")

    with open(output_path, "wb") as f:
        f.write(build_pdf_bytes(lines, fallback_max_obj))

    print(f"Created {output_path}")


def create_simple_pdf(fixture_name, extra_catalog_entries=None,
                      extra_objects=None, output_dir=None):
    """
    Create a minimal two-page PDF with proper xref table.

    Args:
        fixture_name: Name of the fixture (without .pdf)
        extra_catalog_entries: Extra dictionary entries intended for the
            catalog (e.g., /OCProperties). See the NOTE in the body: they
            are currently not written to the output.
        extra_objects: List of (obj_num, dict_string) tuples for additional
            objects; they are emitted before the catalog object.
        output_dir: Directory to write into; defaults to FIXTURES_DIR.
    """
    # The catalog is always object 5; extra objects keep their own numbers.
    catalog_obj_num = 5

    lines = [
        *_PDF_HEADER,
        *_indirect_object(0, "<>"),
        *_indirect_object(1, "<>>>>>>"),
        *_indirect_object(2, "<>>>>>>"),
        *_indirect_object(
            3, "<>", "stream",
            "BT", "/F1 12 Tf", "100 700 Td", "(Page 1) Tj", "ET",
            "endstream",
        ),
        *_indirect_object(
            4, "<>", "stream",
            "BT", "/F1 12 Tf", "100 700 Td", "(Page 2) Tj", "ET",
            "endstream",
        ),
    ]

    for obj_num, obj_content in extra_objects or []:
        lines.extend(_indirect_object(obj_num, obj_content))

    # NOTE(review): the catalog body is the same literal whether or not
    # extra_catalog_entries is given, so the argument never reaches the
    # output. Confirm the intended catalog dictionary.
    catalog_dict = "<>"
    lines.extend(_indirect_object(catalog_obj_num, catalog_dict))

    _write_fixture(fixture_name, lines, catalog_obj_num, output_dir)


def create_ocg_default_off(output_dir=None):
    """Create OCG fixture with /D /BaseState /OFF."""
    extra_objects = [
        (6, "<>"),
        (7, "<>"),
        (8, "<>"),
    ]
    create_simple_pdf(
        "ocg_default_off",
        extra_catalog_entries="/OCProperties 8 0 R",
        extra_objects=extra_objects,
        output_dir=output_dir,
    )


def create_missing_mediabox(output_dir=None):
    """Create PDF with missing MediaBox (EC-09)."""
    lines = [
        *_PDF_HEADER,
        *_indirect_object(0, "<>"),
        *_indirect_object(1, "<>"),
        *_indirect_object(2, "<>"),
    ]
    _write_fixture("missing_mediabox", lines, 2, output_dir)


def create_inheritance_grandparent_mediabox(output_dir=None):
    """Create PDF where page inherits MediaBox from grandparent /Pages."""
    lines = [
        *_PDF_HEADER,
        *_indirect_object(0, "<>"),
        *_indirect_object(1, "<>"),
        *_indirect_object(2, "<>"),
    ]
    _write_fixture("inheritance_grandparent_mediabox", lines, 2, output_dir)


def create_js_in_openaction(output_dir=None):
    """Create PDF with JavaScript in /OpenAction."""
    create_simple_pdf(
        "js_in_openaction",
        extra_catalog_entries="/OpenAction<>",
        output_dir=output_dir,
    )


def create_xfa_form(output_dir=None):
    """Create PDF with XFA form."""
    create_simple_pdf(
        "xfa_form",
        extra_catalog_entries="/AcroForm<>",
        output_dir=output_dir,
    )


def create_pdfa_1b_conformance(output_dir=None):
    """Create PDF with PDF/A-1B XMP metadata."""
    lines = [
        *_PDF_HEADER,
        *_indirect_object(0, "<>"),
        *_indirect_object(1, "<>>>>>>"),
        *_indirect_object(
            2, "<>", "stream",
            "BT", "/F1 12 Tf", "100 700 Td", "(Page 1) Tj", "ET",
            "endstream",
        ),
        *_indirect_object(3, "<>"),
        # Metadata stream; the payload lines are kept exactly as found.
        *_indirect_object(
            4, "<>", "stream",
            '', '', ' ', ' 1', ' B', ' ', '',
            "endstream",
        ),
    ]
    _write_fixture("pdfa_1b_conformance", lines, 4, output_dir)


def create_multi_revision_3(output_dir=None):
    """Create PDF with 3 incremental revisions.

    NOTE(review): only a single revision (one body, one xref section, one
    trailer) is written here; confirm whether the two incremental updates
    are still to be added.
    """
    # First revision: 2-page PDF
    lines = [
        *_PDF_HEADER,
        *_indirect_object(0, "<>"),
        *_indirect_object(1, "<>>>>>>"),
        *_indirect_object(2, "<>>>>>>"),
        *_indirect_object(
            3, "<>", "stream",
            "BT", "/F1 12 Tf", "100 700 Td", "(Page 1) Tj", "ET",
            "endstream",
        ),
        *_indirect_object(
            4, "<>", "stream",
            "BT", "/F1 12 Tf", "100 700 Td", "(Page 2) Tj", "ET",
            "endstream",
        ),
        *_indirect_object(5, "<>"),
    ]
    _write_fixture("multi_revision_3", lines, 5, output_dir)


def create_partial_resource_override(output_dir=None):
    """Create PDF with partial resource override."""
    lines = [
        *_PDF_HEADER,
        *_indirect_object(0, "<>>>/ProcSet[/PDF]>>>"),
        *_indirect_object(1, "<>>>>>>"),
        *_indirect_object(
            2, "<>", "stream",
            "BT", "/F1 12 Tf", "100 700 Td", "(Page 1) Tj", "ET",
            "endstream",
        ),
        *_indirect_object(3, "<>"),
    ]
    _write_fixture("partial_resource_override", lines, 3, output_dir)


def create_tagged_3_level_outline(output_dir=None):
    """Create PDF with 3-level outline structure."""
    lines = [
        *_PDF_HEADER,
        *_indirect_object(0, "<>"),
        *_indirect_object(1, "<>>>>>>"),
        *_indirect_object(
            2, "<>", "stream",
            "BT", "/F1 12 Tf", "100 700 Td", "(Page 1) Tj", "ET",
            "endstream",
        ),
        *_indirect_object(3, "<>"),
        *_indirect_object(4, "<>"),
        *_indirect_object(5, "<>"),
        *_indirect_object(6, "<>"),
        *_indirect_object(7, "<>"),
        *_indirect_object(8, "<>"),
        *_indirect_object(9, "<>"),
    ]
    _write_fixture("tagged_3_level_outline", lines, 9, output_dir)


def create_page_labels_roman_arabic(output_dir=None):
    """Create PDF with roman numerals for pages 0-3 and arabic for page 4+."""
    lines = [
        *_PDF_HEADER,
        *_indirect_object(0, "<>"),
        *_indirect_object(1, "<>>>>>>"),
        *_indirect_object(2, "<>>>>>>"),
        *_indirect_object(3, "<>>>>>>"),
        *_indirect_object(4, "<>>>>>>"),
        *_indirect_object(5, "<>>>>>>"),
        *_indirect_object(
            6, "<>", "stream",
            "BT", "/F1 12 Tf", "100 700 Td", "(Page i) Tj", "ET",
            "endstream",
        ),
        *_indirect_object(
            7, "<>", "stream",
            "BT", "/F1 12 Tf", "100 700 Td", "(Page ii) Tj", "ET",
            "endstream",
        ),
        *_indirect_object(
            8, "<>", "stream",
            "BT", "/F1 12 Tf", "100 700 Td", "(Page iii) Tj", "ET",
            "endstream",
        ),
        *_indirect_object(
            9, "<>", "stream",
            "BT", "/F1 12 Tf", "100 700 Td", "(Page iv) Tj", "ET",
            "endstream",
        ),
        *_indirect_object(
            10, "<>", "stream",
            "BT", "/F1 12 Tf", "100 700 Td", "(Page 1) Tj", "ET",
            "endstream",
        ),
        *_indirect_object(11, "<>"),
        *_indirect_object(12, "<>4<>]>>"),
    ]
    _write_fixture("page_labels_roman_arabic", lines, 12, output_dir)


def create_encrypted_unknown_handler(output_dir=None):
    """Create PDF with unsupported encryption handler (Adobe.PubSec)."""
    lines = [
        *_PDF_HEADER,
        *_indirect_object(0, "<>"),
        *_indirect_object(1, "<>>>>>>"),
        *_indirect_object(
            2, "<>", "stream",
            "BT", "/F1 12 Tf", "100 700 Td", "(Page 1) Tj", "ET",
            "endstream",
        ),
        *_indirect_object(3, "<>"),
        *_indirect_object(4, "<>"),
    ]
    _write_fixture("encrypted_unknown_handler", lines, 4, output_dir)


if __name__ == "__main__":
    print("Creating valid PDF fixtures...")

    create_simple_pdf("base_hello")
    create_ocg_default_off()
    create_missing_mediabox()
    create_inheritance_grandparent_mediabox()
    create_js_in_openaction()
    create_xfa_form()
    create_pdfa_1b_conformance()
    create_multi_revision_3()
    create_partial_resource_override()
    create_tagged_3_level_outline()
    create_page_labels_roman_arabic()
    create_encrypted_unknown_handler()

    print("\nAll fixtures created successfully!")