pdftract/tests/fingerprint/fixtures/create_fixtures.py
2026-05-29 08:25:23 -04:00

199 lines
7.8 KiB
Python

#!/usr/bin/env python3
"""
Create fingerprint test fixtures as v1/v2 PDF pairs.
Depending on the scenario, a pair is byte-identical, differs only outside the
content stream, or (content_edit_*) differs in the text its content stream draws.
"""
import struct
import zlib
import os
def _escape_pdf_literal(text):
    """Backslash-escape the characters that delimit or escape a PDF literal string."""
    # The backslash must be handled first so the escapes added for the
    # parentheses are not themselves doubled.
    return text.replace("\\", "\\\\").replace("(", "\\(").replace(")", "\\)")


def create_simple_pdf(content_text, output_path):
    """
    Write a minimal one-page PDF whose content stream shows ``content_text``.

    File layout, in order: header, catalog (1), pages (2), page (3),
    Flate-compressed content stream (4), xref table, trailer.

    Args:
        content_text: Text placed in the ``Tj`` operand. Parentheses and
            backslashes are escaped so they cannot terminate or corrupt the
            PDF literal string; any other text yields the same bytes as before.
        output_path: Destination file; overwritten if it exists.

    Raises:
        UnicodeEncodeError: If ``content_text`` contains non-ASCII characters.

    NOTE(review): the font resource writes ``(/Helvetica)``, ``(/Type1)`` and
    ``(/Font)`` as literal strings rather than names, and the content stream
    never selects a font with ``Tf``. Both are kept unchanged so that the
    generated fixture bytes stay stable -- confirm whether that is intended.
    """
    # BT/ET delimit the text object, Td positions it, Tj shows the string.
    content_stream = (
        f"BT 50 700 Td ({_escape_pdf_literal(content_text)}) Tj ET".encode('ascii')
    )
    compressed_content = zlib.compress(content_stream, 9)

    header = b"%PDF-1.3\n%abcdefghijklmnopqrstuvwxyz\n"
    objects = [
        # Object 1: Catalog
        b"1 0 obj\n<< /Pages 2 0 R /Type /Catalog >>\nendobj\n",
        # Object 2: Pages
        b"2 0 obj\n<< /Count 1 /Kids [ 3 0 R ] /Type /Pages >>\nendobj\n",
        # Object 3: Page
        b"3 0 obj\n"
        b"<< /Contents 4 0 R /MediaBox [ 0 0 612 792 ] /Parent 2 0 R"
        b" /Resources << /Font << /F1 << /BaseFont (/Helvetica)"
        b" /Subtype (/Type1) /Type (/Font) >> >> >> /Type /Page >>\n"
        b"endobj\n",
        # Object 4: Content stream (compressed)
        b"4 0 obj\n<< /Length "
        + str(len(compressed_content)).encode('ascii')
        + b" /Filter /FlateDecode >>\nstream\n"
        + compressed_content
        + b"\nendstream\nendobj\n",
    ]

    # Record the absolute byte offset of every object for the xref table;
    # the position after the last object is where the xref table itself starts.
    offsets = []
    position = len(header)
    for obj in objects:
        offsets.append(position)
        position += len(obj)
    xref_start = position

    tail_lines = ["xref", "0 5", "0000000000 65535 f"]
    tail_lines.extend(f"{offset:010d} 00000 n" for offset in offsets)
    tail_lines.extend([
        "trailer",
        "<< /Root 1 0 R /Size 5 >>",
        "startxref",
        str(xref_start),
        "%%EOF",
        "",  # produces the final newline after %%EOF
    ])
    trailer = "\n".join(tail_lines).encode('ascii')

    with open(output_path, 'wb') as f:
        f.write(b"".join([header, *objects, trailer]))
def create_linearized_pdf(input_path, output_path):
    """
    Write a copy of a PDF with a slightly different byte layout.

    This does not perform real linearization (no linearization dictionary,
    no hint tables, no object reordering). It is a stand-in for a re-saved
    file: the input is split on newline bytes, and every piece that contains
    the bytes ``trailer`` gets two spaces prepended. All other pieces are
    copied through unchanged.

    Args:
        input_path: PDF file to read.
        output_path: File to write the modified bytes to.
    """
    with open(input_path, 'rb') as source:
        original = source.read()

    # Splitting and re-joining on b'\n' is lossless, so only the pieces
    # that mention the trailer keyword end up different from the input.
    shifted = b'\n'.join(
        b'  ' + piece if b'trailer' in piece else piece
        for piece in original.split(b'\n')
    )

    with open(output_path, 'wb') as target:
        target.write(shifted)
def _derive_fixture(source_path, output_path, transform):
    """Read ``source_path``, apply ``transform`` to its bytes, write ``output_path``."""
    with open(source_path, 'rb') as f:
        data = f.read()
    with open(output_path, 'wb') as f:
        f.write(transform(data))


def main():
    """
    Generate every v1/v2 fixture pair under ``tests/fingerprint/fixtures``.

    The directory is relative to the current working directory. Each scenario
    gets its own sub-directory, created if it does not exist yet. v1 is
    always produced by ``create_simple_pdf``; v2 is either a second
    ``create_simple_pdf`` call or a byte-level edit of v1.

    Raises:
        RuntimeError: If the trailer dictionary written by
            ``create_simple_pdf`` cannot be found when adding the ``/ID``
            entries for the ``metadata_only`` pair.
    """
    fixtures_dir = "tests/fingerprint/fixtures"

    def fixture(scenario, name):
        # Make sure the scenario directory exists before anything is written.
        scenario_dir = os.path.join(fixtures_dir, scenario)
        os.makedirs(scenario_dir, exist_ok=True)
        return os.path.join(scenario_dir, name)

    # 1. byte_identical: two copies of the same file
    create_simple_pdf("Hello World", fixture("byte_identical", "v1.pdf"))
    create_simple_pdf("Hello World", fixture("byte_identical", "v2.pdf"))
    print("Created byte_identical fixtures")

    # 2. acrobat_resave: same content, extra spaces before the trailer keyword.
    # The trailer follows the xref table, so no recorded offset moves.
    create_simple_pdf("Hello World", fixture("acrobat_resave", "v1.pdf"))
    _derive_fixture(
        fixture("acrobat_resave", "v1.pdf"),
        fixture("acrobat_resave", "v2.pdf"),
        lambda data: data.replace(b'\ntrailer', b'\n  trailer'),
    )
    print("Created acrobat_resave fixtures")

    # 3. pdftk_resave: extra spaces before every endobj.
    # NOTE(review): this shifts later objects without updating the xref
    # offsets written by create_simple_pdf -- confirm that is acceptable.
    create_simple_pdf("Hello World", fixture("pdftk_resave", "v1.pdf"))
    _derive_fixture(
        fixture("pdftk_resave", "v1.pdf"),
        fixture("pdftk_resave", "v2.pdf"),
        lambda data: data.replace(b'\nendobj', b'\n  endobj'),
    )
    print("Created pdftk_resave fixtures")

    # 4. qpdf_resave: extra spaces after every "N 0 obj" header.
    # NOTE(review): same stale-xref caveat as pdftk_resave.
    create_simple_pdf("Hello World", fixture("qpdf_resave", "v1.pdf"))
    _derive_fixture(
        fixture("qpdf_resave", "v1.pdf"),
        fixture("qpdf_resave", "v2.pdf"),
        lambda data: data.replace(b' 0 obj', b' 0 obj  '),
    )
    print("Created qpdf_resave fixtures")

    # 5. content_edit_one_glyph: change ONE character in the text ('e' -> 'a')
    create_simple_pdf("Hello World", fixture("content_edit_one_glyph", "v1.pdf"))
    create_simple_pdf("Hallo World", fixture("content_edit_one_glyph", "v2.pdf"))
    print("Created content_edit_one_glyph fixtures")

    # 6. content_edit_one_paragraph: change the entire text
    create_simple_pdf("Hello World", fixture("content_edit_one_paragraph", "v1.pdf"))
    create_simple_pdf("Goodbye World", fixture("content_edit_one_paragraph", "v2.pdf"))
    print("Created content_edit_one_paragraph fixtures")

    # 7. metadata_only: same content, different trailer /ID.
    # create_simple_pdf writes no /ID, so one is inserted into the trailer
    # dictionary of both files. The trailer dictionary sits after the xref
    # table, so growing it leaves every recorded offset valid.
    plain_trailer = b"<< /Root 1 0 R /Size 5 >>"

    def with_file_id(hex_id):
        def transform(data):
            if plain_trailer not in data:
                # Fail loudly instead of silently emitting two identical files.
                raise RuntimeError(
                    "trailer dictionary not found; cannot add /ID to metadata_only fixture"
                )
            tagged = (
                b"<< /ID [ <" + hex_id + b"> <" + hex_id + b"> ] /Root 1 0 R /Size 5 >>"
            )
            return data.replace(plain_trailer, tagged)
        return transform

    meta_v1 = fixture("metadata_only", "v1.pdf")
    meta_v2 = fixture("metadata_only", "v2.pdf")
    create_simple_pdf("Hello World", meta_v1)
    # v2 is derived first, while v1 still holds the untagged bytes.
    _derive_fixture(meta_v1, meta_v2, with_file_id(b"2a0f4c4240b8dcded0b53514a2805332"))
    _derive_fixture(meta_v1, meta_v1, with_file_id(b"1b9f3b313fa7bcbcf4a42403f1794221"))
    print("Created metadata_only fixtures")

    # 8. linearization_toggle: qpdf is not available, so v2 only gets an
    # extra comment line after the header; objects are not reordered.
    # NOTE(review): the inserted line shifts every object without updating
    # the xref offsets -- confirm that is acceptable.
    create_simple_pdf("Hello World", fixture("linearization_toggle", "v1.pdf"))
    _derive_fixture(
        fixture("linearization_toggle", "v1.pdf"),
        fixture("linearization_toggle", "v2.pdf"),
        lambda data: b"%PDF-1.3\n% Linearized: No\n" + data.split(b'%PDF-1.3\n')[-1],
    )
    print("Created linearization_toggle fixtures")

    print("\nAll fixtures created successfully!")
# Generate the fixtures only when run as a script, not when imported.
if __name__ == "__main__":
    main()