# pdftract/tests/fixtures/generate_encoding_fixtures.py

#!/usr/bin/env python3
"""
Generate encoding test fixtures for Phase 2.2–2.5 Unicode recovery.

Creates four PDF fixtures exercising Level 2–4 Unicode recovery:
- no-mapping.pdf: Font with no ToUnicode and no standard encoding (worst case)
- agl-only.pdf: Font with only AGL glyph names (Level 2 recovery)
- fingerprint-match.pdf: Font embedded for fingerprint matching (Level 3)
- shape-match.pdf: Font for shape-based recognition (Level 4)

Each fixture has a paired .txt ground truth file.
"""

import os
import struct

def _stream_object(data, extra_entries=b""):
    """
    Return the body of a PDF stream object holding *data*.

    Args:
        data: Raw stream bytes, without a trailing end-of-line.
        extra_entries: Pre-formatted, newline-terminated dictionary entries
            emitted ahead of /Length.

    Returns:
        The stream dictionary plus ``stream``/``endstream`` wrapper as bytes.
        /Length is computed from *data*; the end-of-line written before
        ``endstream`` is not stream data and is therefore not counted.
    """
    length_entry = b"/Length %d\n" % len(data)
    return (
        b"<<\n" + extra_entries + length_entry + b">>\n"
        + b"stream\n" + data + b"\nendstream"
    )


def _serialize_pdf(object_bodies):
    """
    Serialize object bodies into a complete PDF 1.4 file.

    Objects are numbered 1..N in the order given (generation 0) and object 1
    is used as the document /Root. Byte offsets for the cross-reference table
    and the ``startxref`` value are measured from the output as it is built,
    so they cannot drift from the actual object positions.

    Args:
        object_bodies: Iterable of bytes, one per indirect object, without
            the ``N 0 obj`` / ``endobj`` wrapper.

    Returns:
        The full PDF file as bytes, ending in ``%%EOF`` and a newline.
    """
    out = bytearray(b"%PDF-1.4\n")
    offsets = []
    for number, body in enumerate(object_bodies, start=1):
        offsets.append(len(out))
        out += b"%d 0 obj\n" % number
        out += body
        out += b"\nendobj\n"

    xref_offset = len(out)
    size = len(offsets) + 1  # +1 for the free-list head (object 0)
    out += b"xref\n0 %d\n" % size
    # Each xref entry must be exactly 20 bytes, hence the "space + LF" ending.
    out += b"0000000000 65535 f \n"
    for offset in offsets:
        out += b"%010d 00000 n \n" % offset
    out += b"trailer\n<<\n/Size %d\n/Root 1 0 R\n>>\n" % size
    out += b"startxref\n%d\n" % xref_offset
    out += b"%%EOF\n"
    return bytes(out)


def _single_page_pdf(font_object, content, extra_objects=()):
    """
    Build a one-page PDF whose page uses font /F1 and the given content.

    Object layout: 1 catalog, 2 page tree, 3 page, 4 font (*font_object*),
    5 content stream (*content*), then *extra_objects* numbered from 6.

    Args:
        font_object: Body of the font dictionary (object 4).
        content: Content-stream operators as bytes (object 5 data).
        extra_objects: Bodies of any further objects (font descriptor,
            font file stream, ...), in numbering order.

    Returns:
        The serialized PDF as bytes.
    """
    catalog = b"<<\n/Type /Catalog\n/Pages 2 0 R\n>>"
    pages = b"<<\n/Type /Pages\n/Kids [3 0 R]\n/Count 1\n>>"
    page = (
        b"<<\n"
        b"/Type /Page\n"
        b"/Parent 2 0 R\n"
        b"/MediaBox [0 0 612 792]\n"
        b"/Resources <<\n"
        b"/Font <<\n"
        b"/F1 4 0 R\n"
        b">>\n"
        b">>\n"
        b"/Contents 5 0 R\n"
        b">>"
    )
    objects = [catalog, pages, page, font_object, _stream_object(content)]
    objects.extend(extra_objects)
    return _serialize_pdf(objects)


def create_no_mapping_pdf():
    """
    Create PDF with no ToUnicode CMap and custom encoding.

    This PDF uses a Type1 font with custom glyph names that don't map to AGL:
    character codes 0-5 are mapped to /g00../g05 through /Differences.
    The page shows codes 0-3 on the first line and codes 4-5 on the second.
    Expected behavior: All glyphs fail Levels 1-3, only Level 4 shape recognition
    might recover some content (U+FFFD otherwise).

    Returns:
        The PDF file contents as bytes.
    """
    font = (
        b"<<\n"
        b"/Type /Font\n"
        b"/Subtype /Type1\n"
        b"/BaseFont /CustomFont\n"
        b"/Encoding <<\n"
        b"/Type /Encoding\n"
        b"/Differences [0 /g00 /g01 /g02 /g03 /g04 /g05]\n"
        b">>\n"
        b">>"
    )
    content = b"\n".join([
        b"BT",
        b"/F1 12 Tf",
        b"50 700 Td",
        # Tj takes a string operand; hex strings carry the raw codes 0..5
        # that /Differences maps to the custom glyph names.
        b"<00010203> Tj",
        # Td is relative to the start of the current line: move down 20 units.
        b"0 -20 Td",
        b"<0405> Tj",
        b"ET",
    ])
    return _single_page_pdf(font, content)


def create_agl_only_pdf():
    """
    Create PDF with AGL-compatible glyph names but no ToUnicode.

    This PDF uses the standard Type1 font Helvetica with no explicit
    /Encoding and no /ToUnicode entry.
    Expected behavior: Level 2 AGL lookup successfully recovers all content.
    Glyph names used: /H /e /l /o (Hello), /W /o /r /l /d (World)

    Returns:
        The PDF file contents as bytes.
    """
    font = (
        b"<<\n"
        b"/Type /Font\n"
        b"/Subtype /Type1\n"
        b"/BaseFont /Helvetica\n"
        b">>"
    )
    content = b"\n".join([
        b"BT",
        b"/F1 12 Tf",
        b"100 700 Td",
        b"(Hello) Tj",
        # Relative move to the next line; an absolute "100 680" here would
        # place the second line far above the 792-unit-high page.
        b"0 -20 Td",
        b"(World) Tj",
        b"ET",
    ])
    return _single_page_pdf(font, content)


def create_fingerprint_match_pdf():
    """
    Create PDF with embedded font program for fingerprint matching.

    This PDF embeds a font program (referenced via /FontFile3) that can be
    SHA-256 hashed.
    Expected behavior: Level 3 fingerprint lookup matches the embedded font
    and recovers content from the fingerprint database.

    Returns:
        The PDF file contents as bytes.
    """
    font = (
        b"<<\n"
        b"/Type /Font\n"
        b"/Subtype /Type1\n"
        b"/BaseFont /TestFingerprintFont\n"
        b"/FontDescriptor 6 0 R\n"
        b">>"
    )
    content = b"\n".join([
        b"BT",
        b"/F1 12 Tf",
        b"100 700 Td",
        b"(Test) Tj",
        b"ET",
    ])
    descriptor = (
        b"<<\n"
        b"/Type /FontDescriptor\n"
        b"/FontName /TestFingerprintFont\n"
        b"/Flags 4\n"
        b"/FontBBox [0 0 100 100]\n"
        b"/ItalicAngle 0\n"
        b"/Ascent 100\n"
        b"/Descent 0\n"
        b"/CapHeight 100\n"
        b"/StemV 80\n"
        b"/FontFile3 7 0 R\n"
        b">>"
    )
    # This uses a minimal embedded font program (would be larger in production).
    # These exact bytes are the stream data, so they are what gets hashed.
    font_program = b"\n".join([
        b"%!PS-AdobeFont-1.0: TestFingerprintFont",
        b"%%CreationDate: Mon Jun 6 00:00:00 2026",
        b"% Minimal font program for fingerprint testing",
    ])
    # NOTE(review): /Length1../Length3 are kept verbatim from the previous
    # fixture; they do not describe this payload, and the payload is not real
    # CFF data despite /Subtype /Type1C. Confirm nothing depends on them.
    font_file = _stream_object(
        font_program,
        extra_entries=(
            b"/Length1 52\n"
            b"/Length2 28\n"
            b"/Length3 0\n"
            b"/Subtype /Type1C\n"
        ),
    )
    return _single_page_pdf(font, content, (descriptor, font_file))


def create_shape_match_pdf():
    """
    Create PDF with subset font for shape-based recognition.

    This PDF uses a subset font (ABCDEF+Helvetica) with no ToUnicode.
    Expected behavior: Level 4 glyph shape recognition compares rendered
    glyph shapes against the shape database.

    Returns:
        The PDF file contents as bytes.
    """
    font = (
        b"<<\n"
        b"/Type /Font\n"
        b"/Subtype /TrueType\n"
        b"/BaseFont /ABCDEF+Helvetica\n"
        b"/FontDescriptor 6 0 R\n"
        b">>"
    )
    content = b"\n".join([
        b"BT",
        b"/F1 12 Tf",
        b"100 700 Td",
        b"(Shape) Tj",
        b"ET",
    ])
    descriptor = (
        b"<<\n"
        b"/Type /FontDescriptor\n"
        b"/FontName /ABCDEF+Helvetica\n"
        b"/Flags 4\n"
        b"/FontBBox [0 0 100 100]\n"
        b"/ItalicAngle 0\n"
        b"/Ascent 100\n"
        b"/Descent 0\n"
        b"/CapHeight 100\n"
        b"/StemV 80\n"
        b"/FontFile2 7 0 R\n"
        b">>"
    )
    # NOTE(review): placeholder text, not a parseable TrueType program.
    font_file = _stream_object(
        b"Minimal TrueType font program for shape testing"
    )
    return _single_page_pdf(font, content, (descriptor, font_file))

def main(output_dir="tests/fixtures/encoding"):
    """
    Generate all encoding fixtures.

    For each fixture, writes ``<name>.pdf`` and its paired ``<name>.txt``
    ground truth file into *output_dir*, creating the directory if needed,
    and prints one "Created:" line per PDF.

    Args:
        output_dir: Destination directory. The default is a relative path,
            so it resolves against the current working directory.
    """
    os.makedirs(output_dir, exist_ok=True)

    # (file stem, PDF builder, expected extracted text)
    fixtures = (
        # Ground truth: expected to be all U+FFFD with current implementation
        # (four glyphs on the first line, two on the second).
        ("no-mapping", create_no_mapping_pdf, "\ufffd" * 4 + "\n" + "\ufffd" * 2),
        # Ground truth: "Hello\nWorld" (AGL successfully maps glyph names)
        ("agl-only", create_agl_only_pdf, "Hello\nWorld"),
        # Ground truth: "Test" (fingerprint DB lookup succeeds)
        ("fingerprint-match", create_fingerprint_match_pdf, "Test"),
        # Ground truth: "Shape" (shape DB lookup succeeds)
        ("shape-match", create_shape_match_pdf, "Shape"),
    )

    for stem, build_pdf, ground_truth in fixtures:
        pdf_path = os.path.join(output_dir, stem + ".pdf")
        txt_path = os.path.join(output_dir, stem + ".txt")

        with open(pdf_path, "wb") as f:
            f.write(build_pdf())

        # Explicit UTF-8 so U+FFFD is encodable under any locale, and
        # newline="\n" so the ground truth bytes are identical on every OS.
        with open(txt_path, "w", encoding="utf-8", newline="\n") as f:
            f.write(ground_truth)

        print("Created:", pdf_path)

    print("\nAll encoding fixtures created successfully!")

if __name__ == "__main__":
    main()