pdftract/fix_fixtures.py
jedarden 432514d350 wip: AcroForm improvements, debug tooling, test corpus, and fixture updates
Collects in-progress work across forms (Ch/Tx field handling, value_text
edge cases), layout corrections, stream parser fixes, conformance test
expansion, security audit test (TH-08), stream-decoder bomb fixture,
debug examples reorganization under examples/debug/, sdk module scaffold,
xtask CLI enhancements, and provenance entries for new fixtures.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-30 09:48:14 -04:00

43 lines
1.5 KiB
Python

#!/usr/bin/env python3
"""Fix malformed PDF fixtures with incorrect startxref offsets."""
import re
import subprocess
# PDF fixtures whose trailing ``startxref`` offset is rewritten in place.
# Paths are relative to the directory the script is run from.
fixtures = [
    "tests/document_model/fixtures/ocg_default_off.pdf",
    "tests/document_model/fixtures/tagged_3_level_outline.pdf",
    "tests/document_model/fixtures/multi_revision_3.pdf",
    "tests/document_model/fixtures/inheritance_grandparent_mediabox.pdf",
    "tests/document_model/fixtures/missing_mediabox.pdf",
    "tests/document_model/fixtures/partial_resource_override.pdf",
    "tests/document_model/fixtures/js_in_openaction.pdf",
    "tests/document_model/fixtures/xfa_form.pdf",
    "tests/document_model/fixtures/pdfa_1b_conformance.pdf",
    "tests/document_model/fixtures/page_labels_roman_arabic.pdf",
    "tests/document_model/fixtures/encrypted_unknown_handler.pdf",
]

# The ``xref`` keyword that opens a classic cross-reference table.  The
# negative lookbehind keeps the pattern from matching the last four letters
# of ``startxref``; the keyword may be terminated by LF, CR or CRLF.
_XREF_KEYWORD = re.compile(rb"(?<!start)xref[\r\n]")

# A ``startxref`` line followed by the decimal offset on the next line.
# Group 1 captures the keyword together with its original line ending so the
# replacement can reproduce it unchanged.
_STARTXREF_ENTRY = re.compile(rb"(startxref(?:\r\n|\r|\n))\d+")


def find_xref_offset(data: bytes) -> int:
    """Return the byte offset of the first ``xref`` table keyword in *data*.

    Returns -1 (the ``bytes.find`` convention) when *data* contains no
    ``xref`` keyword other than the tail of ``startxref``.
    """
    match = _XREF_KEYWORD.search(data)
    return match.start() if match else -1


def rewrite_startxref(data: bytes, offset: int) -> tuple:
    """Point every ``startxref`` entry in *data* at *offset*.

    Returns ``(new_data, count)`` where *count* is the number of entries that
    were rewritten (0 means *data* has no ``startxref`` entry).

    NOTE(review): all entries receive the same offset, so in a file with
    several revisions every ``startxref`` ends up pointing at the first
    table.  Confirm this is what the multi-revision fixture needs.
    """
    offset_bytes = str(offset).encode("ascii")
    # A callable replacement avoids any template-escape processing and lets
    # the original line ending (group 1) pass through untouched.
    return _STARTXREF_ENTRY.subn(lambda m: m.group(1) + offset_bytes, data)


def fix_fixture(path: str) -> str:
    """Repair the ``startxref`` offset of the PDF at *path* in place.

    Returns the one-line status message describing what happened.

    Raises:
        OSError: if the file cannot be read or written.
    """
    with open(path, 'rb') as f:
        data = f.read()

    correct_offset = find_xref_offset(data)
    if correct_offset < 0:
        return f"Skipping {path}: no xref found"

    new_data, replaced = rewrite_startxref(data, correct_offset)
    if not replaced:
        return f"Skipping {path}: no startxref found"

    # Leave the file alone when its offset is already right.
    if new_data != data:
        with open(path, 'wb') as f:
            f.write(new_data)
    return f"Fixed {path}: startxref now points to {correct_offset}"


for fixture_path in fixtures:
    try:
        print(fix_fixture(fixture_path))
    except OSError as e:
        # Best effort: report the failure and carry on with the next fixture.
        print(f"Error fixing {fixture_path}: {e}")