pdftract/scripts/generate_document_model_fixtures.sh
jedarden bb7146cffe fix(pdftract-2uk9z): wrap native module results in typed Python objects
The native PyO3 module returns raw dicts via pythonize, but the Python SDK
API expects typed dataclass objects (Document, Page, Metadata, etc.) to be
consistent with the subprocess fallback and test expectations.

Updated wrapper functions in __init__.py to convert native results:
- extract(): wraps dict in Document.from_dict()
- extract_stream(): wraps yielded page dicts in Page.from_dict()
- get_metadata(): wraps dict in Metadata()
- hash(): wraps string in Fingerprint.from_string()
- classify(): wraps dict in Classification()
- search(): wraps yielded match dicts in Match

The native PyO3 entry points (extract, extract_text, extract_stream) were
already implemented with:
- extract: uses extract_pdf + pythonize for PyDict conversion
- extract_text: uses extract_text for plain String return
- extract_stream: uses extract_pdf_streaming with custom StreamIterator

All kwargs parsing with strict validation (unknown kwargs raise TypeError)
was already in place.

Acceptance criteria:
- pdftract.extract() returns Document object with pages/metadata
- pdftract.extract_text() returns plain text string
- pdftract.extract_stream() yields Page objects
- Unknown kwarg raises TypeError
2026-05-28 21:18:38 -04:00

380 lines
11 KiB
Bash
Executable file

#!/usr/bin/env bash
# Generate document model test fixtures.
#
# Requires: qpdf (run through `nix-shell -p qpdf`).
# The paths below are relative, so fixtures are written beneath the
# directory the script is invoked from.

# Strict mode: stop on a failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Directory that receives every generated fixture.
FIXTURES_DIR="tests/document_model/fixtures"
# Unencrypted input PDF that the encrypted variants are derived from.
BASE_PDF="$FIXTURES_DIR/base_hello.pdf"
# Create the minimal single-page "Hello World" PDF used as input for the
# encrypted fixtures.
#
# Globals:  BASE_PDF (read) - destination path; its directory must exist.
# Outputs:  writes the PDF to $BASE_PDF and one status line to stdout.
# Returns:  non-zero when the file cannot be written.
#
# The /Length value, the cross-reference offsets and the startxref value are
# computed from the assembled bytes, so the table always matches the real
# object positions. All content is ASCII, hence ${#var} equals the byte count.
create_base_pdf() {
  local content_stream='BT /F1 12 Tf 100 700 Td (Hello World) Tj ET'
  # Object bodies in object-number order (element 0 is object 1).
  local -a bodies=(
    '<</Type/Catalog/Pages 2 0 R>>'
    '<</Type/Pages/Count 1/Kids[3 0 R]>>'
    '<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/Resources<</Font<</F1 4 0 R>>>>/Contents 5 0 R>>'
    '<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>'
    "<</Length ${#content_stream}>>stream"$'\n'"${content_stream}"$'\n''endstream'
  )

  local pdf=$'%PDF-1.4\n'
  # Each xref row is exactly 20 bytes: offset, generation, flag, SP + LF.
  local xref_rows=$'0000000000 65535 f \n'
  local idx row
  for idx in "${!bodies[@]}"; do
    # The current length of the document is the offset of the next object.
    printf -v row '%010d 00000 n \n' "${#pdf}"
    xref_rows+=$row
    pdf+="$((idx + 1)) 0 obj"$'\n'"${bodies[idx]}"$'\n''endobj'$'\n'
  done

  local xref_offset=${#pdf}
  local entry_count=$(( ${#bodies[@]} + 1 ))
  pdf+='xref'$'\n'"0 ${entry_count}"$'\n'"${xref_rows}"
  pdf+="trailer<</Size ${entry_count}/Root 1 0 R>>"$'\n'
  # The offset goes on its own line after the startxref keyword.
  pdf+='startxref'$'\n'"${xref_offset}"$'\n''%%EOF'$'\n'

  printf '%s' "$pdf" > "$BASE_PDF" || return
  echo "Created base PDF: $BASE_PDF"
}
# Run qpdf inside `nix-shell -p qpdf`, shell-quoting every argument so that
# paths containing spaces or shell metacharacters (and empty passwords)
# survive the trip through the --run command string.
_run_qpdf() {
  local cmd
  printf -v cmd '%q ' qpdf "$@"
  nix-shell -p qpdf --run "$cmd"
}

# Generate the encrypted fixtures from $BASE_PDF.
#
# Globals:  BASE_PDF (read), FIXTURES_DIR (read)
# Outputs:  four encrypted PDFs in $FIXTURES_DIR; progress lines on stdout.
# Returns:  the status of the first failing qpdf invocation, else 0.
generate_encrypted() {
  echo "Generating encrypted fixtures..."

  # RC4-40 with password "test" (EC-04)
  _run_qpdf --allow-weak-crypto --encrypt test test 40 -- \
    "$BASE_PDF" "$FIXTURES_DIR/encrypted_rc4_test.pdf" || return

  # AES-128 with password "test" (EC-05). A 128-bit key defaults to RC4 in
  # qpdf; --use-aes=y is what actually selects AES.
  _run_qpdf --encrypt test test 128 --use-aes=y -- \
    "$BASE_PDF" "$FIXTURES_DIR/encrypted_aes128_test.pdf" || return

  # AES-256 with password "test" (EC-06), written as PDF 2.0.
  # --force-version is a general option, so it must sit outside the
  # "--encrypt ... --" section, which only accepts encryption options.
  _run_qpdf --force-version=2.0 --encrypt test test 256 -- \
    "$BASE_PDF" "$FIXTURES_DIR/encrypted_aes256_test.pdf" || return

  # Empty password (RC4-40)
  _run_qpdf --allow-weak-crypto --encrypt '' '' 40 -- \
    "$BASE_PDF" "$FIXTURES_DIR/encrypted_empty_password.pdf" || return

  echo "Encrypted fixtures generated."
}
# Write tagged_3_level_outline.pdf: a two-page document whose outline root
# (object 3) lists "Chapter 1" (with child "Section 1.1") and "Chapter 2".
#
# Globals:  FIXTURES_DIR (read)
# Outputs:  the fixture file plus two progress lines on stdout.
#
# NOTE(review): the catalog carries no /StructTreeRoot or /MarkInfo, and the
# hand-written xref offsets do not appear to match the object positions -
# confirm that consumers of this fixture expect that.
generate_tagged_outline() {
  local target="$FIXTURES_DIR/tagged_3_level_outline.pdf"

  printf '%s\n' "Generating tagged_3_level_outline.pdf..."
  cat <<'EOF' > "$target"
%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R/Outlines 3 0 R>>endobj
2 0 obj<</Type/Pages/Count 2/Kids[4 0 R 5 0 R]>>endobj
3 0 obj<</Type/Outlines/First 6 0 R/Last 7 0 R/Count 2>>endobj
4 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
5 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
6 0 obj<</Title(Chapter 1)/Parent 3 0 R/Next 7 0 R/First 8 0 R/Count 1/Dest[4 0 R /XYZ 0 792 null]>>endobj
7 0 obj<</Title(Chapter 2)/Parent 3 0 R/Prev 6 0 R/Dest[5 0 R /XYZ 0 792 null]>>endobj
8 0 obj<</Title(Section 1.1)/Parent 6 0 R/Dest[4 0 R /XYZ 0 700 null]>>endobj
xref
0 9
0000000000 65535 f
0000000009 00000 n
0000000066 00000 n
0000000133 00000 n
0000000222 00000 n
0000000313 00000 n
0000000404 00000 n
0000000549 00000 n
0000000680 00000 n
trailer<</Size 9/Root 1 0 R>>
startxref 795
%%EOF
EOF
  printf '%s\n' "Generated tagged_3_level_outline.pdf"
}
# Write ocg_default_off.pdf (EC-16): a one-page document whose catalog has an
# /OCProperties entry with /BaseState/OFF and whose page references an /OCMD.
#
# Globals:  FIXTURES_DIR (read)
# Outputs:  the fixture file plus two progress lines on stdout.
#
# NOTE(review): the /OCProperties and /D values open with a single "<"
# rather than "<<", object 5 is the array [/OCG1], and the xref offsets do
# not appear to match the object positions - confirm this is intentional.
generate_ocg_off() {
  local target="$FIXTURES_DIR/ocg_default_off.pdf"

  printf '%s\n' "Generating ocg_default_off.pdf..."
  cat <<'EOF' > "$target"
%PDF-1.5
1 0 obj<</Type/Catalog/Pages 2 0 R/OCProperties</D</BaseState/OFF/ON[]/OFF[5 0 R]>>>>>endobj
2 0 obj<</Type/Pages/Count 1/Kids[3 0 R]>>endobj
3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/OCMD 4 0 R>>endobj
4 0 obj<</OCGs 5 0 R/P/ON>>endobj
5 0 obj[/OCG1]endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000157 00000 n
0000000232 00000 n
0000000331 00000 n
0000000424 00000 n
trailer<</Size 6/Root 1 0 R>>
startxref 509
%%EOF
EOF
  printf '%s\n' "Generated ocg_default_off.pdf"
}
# Write multi_revision_3.pdf: a document with three page objects under one
# page tree.
#
# Globals:  FIXTURES_DIR (read)
# Outputs:  the fixture file plus two progress lines on stdout.
#
# NOTE(review): the file holds a single xref section, trailer and %%EOF, so
# no incremental-update revisions are present despite the name; the xref
# offsets also do not appear to match the object positions - confirm.
generate_multi_revision() {
  local target="$FIXTURES_DIR/multi_revision_3.pdf"

  printf '%s\n' "Generating multi_revision_3.pdf..."
  cat <<'EOF' > "$target"
%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Count 3/Kids[3 0 R 4 0 R 5 0 R]>>endobj
3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
4 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
5 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000125 00000 n
0000000222 00000 n
0000000319 00000 n
trailer<</Size 6/Root 1 0 R>>
startxref 416
%%EOF
EOF
  printf '%s\n' "Generated multi_revision_3.pdf"
}
# Write the two MediaBox inheritance fixtures:
#   - inheritance_grandparent_mediabox.pdf: /MediaBox appears only on the
#     root Pages node (object 2); the intermediate Pages node (object 3) and
#     the page (object 4) do not declare one.
#   - missing_mediabox.pdf: no /MediaBox at any level.
#
# Globals:  FIXTURES_DIR (read)
# Outputs:  both fixture files plus two progress lines on stdout.
#
# NOTE(review): the xref offsets in both files do not appear to match the
# object positions - confirm that this is expected by the tests.
generate_inheritance() {
  local grandparent_pdf="$FIXTURES_DIR/inheritance_grandparent_mediabox.pdf"
  local no_box_pdf="$FIXTURES_DIR/missing_mediabox.pdf"

  printf '%s\n' "Generating inheritance fixtures..."
  cat <<'EOF' > "$grandparent_pdf"
%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Count 1/Kids[3 0 R]/MediaBox[0 0 612 792]>>endobj
3 0 obj<</Type/Pages/Count 1/Kids[4 0 R]>>endobj
4 0 obj<</Type/Page/Parent 3 0 R>>endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000157 00000 n
0000000240 00000 n
trailer<</Size 5/Root 1 0 R>>
startxref 325
%%EOF
EOF
  cat <<'EOF' > "$no_box_pdf"
%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Count 1/Kids[3 0 R]>>endobj
3 0 obj<</Type/Page/Parent 2 0 R>>endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000125 00000 n
trailer<</Size 4/Root 1 0 R>>
startxref 210
%%EOF
EOF
  printf '%s\n' "Generated inheritance fixtures."
}
# Write partial_resource_override.pdf: the Pages node declares fonts F1 and
# F2 in its /Resources, page object 3 declares its own /Resources with only
# F3, and page object 4 declares no /Resources.
#
# Globals:  FIXTURES_DIR (read)
# Outputs:  the fixture file plus two progress lines on stdout.
#
# NOTE(review): the nested dictionaries on objects 2 and 3 close with an odd
# number of ">" characters, object 8 declares /Length 44 for a 48-character
# content line, and the xref offsets do not appear to match the object
# positions - confirm this is what the tests expect.
generate_partial_override() {
  local target="$FIXTURES_DIR/partial_resource_override.pdf"

  printf '%s\n' "Generating partial_resource_override.pdf..."
  cat <<'EOF' > "$target"
%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Count 2/Kids[3 0 R 4 0 R]/Resources<</Font<</F1 5 0 R/F2 6 0 R>>>>>endobj
3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/Resources<</Font<</F3 7 0 R>>>/Contents 8 0 R>>endobj
4 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
5 0 obj<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>endobj
6 0 obj<</Type/Font/Subtype/Type1/BaseFont/Times-Roman>>endobj
7 0 obj<</Type/Font/Subtype/Type1/BaseFont/Courier>>endobj
8 0 obj<</Length 44>>stream
BT /F3 12 Tf 100 700 Td (Partial override) Tj ET
endstream endobj
xref
0 9
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000245 00000 n
0000000450 00000 n
0000000547 00000 n
0000000636 00000 n
0000000747 00000 n
0000000838 00000 n
trailer<</Size 9/Root 1 0 R>>
startxref 945
%%EOF
EOF
  printf '%s\n' "Generated partial_resource_override.pdf"
}
# Write js_in_openaction.pdf: a one-page document whose catalog /OpenAction
# is a JavaScript action containing app.alert('Hello').
#
# Globals:  FIXTURES_DIR (read)
# Outputs:  the fixture file plus two progress lines on stdout.
#
# NOTE(review): the /OpenAction value opens with a single "<" rather than
# "<<", and the xref offsets do not appear to match the object positions -
# confirm this is intentional.
generate_js() {
  local target="$FIXTURES_DIR/js_in_openaction.pdf"

  printf '%s\n' "Generating js_in_openaction.pdf..."
  cat <<'EOF' > "$target"
%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R/OpenAction</S/JavaScript/JS(app.alert('Hello'))>>>>endobj
2 0 obj<</Type/Pages/Count 1/Kids[3 0 R]>>endobj
3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000176 00000 n
0000000263 00000 n
trailer<</Size 4/Root 1 0 R>>
startxref 348
%%EOF
EOF
  printf '%s\n' "Generated js_in_openaction.pdf"
}
# Write xfa_form.pdf: a one-page document whose catalog references an
# /AcroForm (object 3) carrying an /XFA array and one field (object 5).
#
# Globals:  FIXTURES_DIR (read)
# Outputs:  the fixture file plus two progress lines on stdout.
#
# NOTE(review): the /XFA array holds only the string (xfa.xml), and the xref
# offsets do not appear to match the object positions - confirm that the
# tests rely on nothing more.
generate_xfa() {
  local target="$FIXTURES_DIR/xfa_form.pdf"

  printf '%s\n' "Generating xfa_form.pdf..."
  cat <<'EOF' > "$target"
%PDF-1.6
1 0 obj<</Type/Catalog/Pages 2 0 R/AcroForm 3 0 R>>endobj
2 0 obj<</Type/Pages/Count 1/Kids[4 0 R]>>endobj
3 0 obj<</XFA[(xfa.xml)]/Fields[5 0 R]>>endobj
4 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
5 0 obj<</T(Field1)/V(Test value)>>endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000134 00000 n
0000000227 00000 n
0000000330 00000 n
0000000439 00000 n
trailer<</Size 6/Root 1 0 R>>
startxref 528
%%EOF
EOF
  printf '%s\n' "Generated xfa_form.pdf"
}
# Write pdfa_1b_conformance.pdf: a one-page document whose catalog points at
# an XMP metadata stream declaring pdfaid:part 1 and pdfaid:conformance B.
#
# Globals:  FIXTURES_DIR (read)
# Outputs:  the fixture file plus two progress lines on stdout.
#
# NOTE(review): /Length is hard-coded to 220 and the xref offsets are
# hand-written; neither is derived from the actual bytes - confirm they are
# not relied upon.
generate_pdfa() {
  local target="$FIXTURES_DIR/pdfa_1b_conformance.pdf"

  printf '%s\n' "Generating pdfa_1b_conformance.pdf..."
  cat <<'EOF' > "$target"
%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R/Metadata 3 0 R>>endobj
2 0 obj<</Type/Pages/Count 1/Kids[4 0 R]>>endobj
3 0 obj<</Type/Metadata/Subtype/XML/Length 220>>stream
<?xpacket begin="utf-8"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/">
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<rdf:Description rdf:about="" xmlns:pdfaid="http://www.aiim.org/pdfa/ns/id/">
<pdfaid:part>1</pdfaid:part>
<pdfaid:conformance>B</pdfaid:conformance>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>
<?xpacket end="w"?>
endstream endobj
4 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000134 00000 n
0000000235 00000 n
0000000609 00000 n
trailer<</Size 5/Root 1 0 R>>
startxref 682
%%EOF
EOF
  printf '%s\n' "Generated pdfa_1b_conformance.pdf"
}
# Write page_labels_roman_arabic.pdf: a six-page document whose catalog
# references a /PageLabels object with a /Nums array that assigns style /R
# from index 0 and style /D from index 4.
#
# Globals:  FIXTURES_DIR (read)
# Outputs:  the fixture file plus two progress lines on stdout.
#
# NOTE(review): the label dictionaries inside /Nums open with a single "<"
# rather than "<<", and the xref offsets do not appear to match the object
# positions - confirm this is intentional.
generate_page_labels() {
  local target="$FIXTURES_DIR/page_labels_roman_arabic.pdf"

  printf '%s\n' "Generating page_labels_roman_arabic.pdf..."
  cat <<'EOF' > "$target"
%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R/PageLabels 3 0 R>>endobj
2 0 obj<</Type/Pages/Count 6/Kids[4 0 R 5 0 R 6 0 R 7 0 R 8 0 R 9 0 R]>>endobj
3 0 obj<</Nums[0</S/R>>4</S/D>>]>>endobj
4 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
5 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
6 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
7 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
8 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
9 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
xref
0 10
0000000000 65535 f
0000000009 00000 n
0000000134 00000 n
0000000269 00000 n
0000000447 00000 n
0000000554 00000 n
0000000661 00000 n
0000000768 00000 n
0000000875 00000 n
0000000982 00000 n
trailer<</Size 10/Root 1 0 R>>
startxref 1089
%%EOF
EOF
  printf '%s\n' "Generated page_labels_roman_arabic.pdf"
}
# Write encrypted_unknown_handler.pdf: a one-page document whose trailer has
# an /Encrypt entry naming the /Adobe.PubSec filter, with /O and /U pointing
# at objects 4 and 5.
#
# Globals:  FIXTURES_DIR (read)
# Outputs:  the fixture file plus two progress lines on stdout.
#
# NOTE(review): the /Encrypt value opens with a single "<" rather than "<<",
# objects 4 and 5 are identical, and the xref offsets do not appear to match
# the object positions - confirm this is intentional.
generate_unknown_handler() {
  local target="$FIXTURES_DIR/encrypted_unknown_handler.pdf"

  printf '%s\n' "Generating encrypted_unknown_handler.pdf..."
  cat <<'EOF' > "$target"
%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Count 1/Kids[3 0 R]>>endobj
3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
4 0 obj<</O(1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef)/U(1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef)/P -1340>>endobj
5 0 obj<</O(1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef)/U(1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef)/P -1340>>endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000125 00000 n
0000000204 00000 n
0000000409 00000 n
trailer<</Size 6/Root 1 0 R/Encrypt</Filter/Adobe.PubSec/V 2/R 2/P -1340/O 4 0 R/U 5 0 R>>/ID[<1234567890abcdef1234567890abcdef><fedcba0987654321fedcba0987654321>]>>
startxref 614
%%EOF
EOF
  printf '%s\n' "Generated encrypted_unknown_handler.pdf"
}
# Entry point: create the fixtures directory, then run every generator in a
# fixed order (the base PDF first, the encrypted variants right after it).
#
# Globals:   FIXTURES_DIR (read)
# Arguments: none are consulted.
# Outputs:   progress lines on stdout.
main() {
  local step

  echo "Generating document model test fixtures..."
  mkdir -p "$FIXTURES_DIR"

  for step in \
    create_base_pdf \
    generate_encrypted \
    generate_tagged_outline \
    generate_ocg_off \
    generate_multi_revision \
    generate_inheritance \
    generate_partial_override \
    generate_js \
    generate_xfa \
    generate_pdfa \
    generate_page_labels \
    generate_unknown_handler; do
    "$step"
  done

  echo "All fixtures generated successfully!"
  echo "Fixtures are in: $FIXTURES_DIR"
}
# Run the generator, forwarding the script's command-line arguments to main.
main "$@"