pdftract/scripts/generate_document_model_fixtures.sh
jedarden f85e5149dd feat(pdftract-91e1i): HTTP fetch sequence implementation
Implement orchestration layer connecting HttpRangeSource to Phase 1.3
xref resolver and Phase 1.4 document model for remote PDF access:

- Document::open_remote() public API for remote PDF loading
- Progressive tail fetch (16 KB → 1 MB) for startxref location
- Xref forward-scan disabled for remote sources (via is_remote check)
- Page-by-page on-demand fetch via HttpRangeSource caching
- Resource lazy load through XrefResolver cache
- HEAD probe with fallbacks for a 405 response and a missing Content-Length header

Acceptance criteria:
 open_remote(url) returns Document with correct page count
 HEAD failure modes (405, no Content-Length, 401) handled
 xref forward-scan disabled for remote (is_remote check)
 Page-by-page on-demand fetch (HttpRangeSource LRU cache)
 INV-8 maintained (all errors return Result)

Files modified:
- crates/pdftract-core/src/document.rs (Document::open_remote, from_source)
- crates/pdftract-core/src/remote.rs (progressive tail fetch)
- crates/pdftract-core/src/lib.rs (re-exports)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 13:17:00 -04:00

380 lines
11 KiB
Bash
Executable file

#!/usr/bin/env bash
#
# Generate document model test fixtures.
#
# Requires: qpdf (obtained on demand through nix-shell).
#
# Both paths below are relative, so fixtures are written beneath whatever
# directory the script is invoked from.

# Strict mode: abort on a failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Directory that receives every generated fixture.
FIXTURES_DIR="tests/document_model/fixtures"
# Unencrypted PDF that the qpdf-based fixtures are produced from.
BASE_PDF="$FIXTURES_DIR/base_hello.pdf"
# Create the minimal single-page "Hello World" PDF used as the input for the
# encrypted fixtures.
#
# Globals:  BASE_PDF (read) - destination path; its directory must exist
# Outputs:  writes the PDF to $BASE_PDF and a confirmation line to stdout
#
# The heredoc delimiter is quoted, so the PDF text is written literally.
# Object 3 nests /Resources -> /Font inside the page dictionary; each of the
# three dictionaries is closed by its own ">>" (">>>>" after the font
# reference, ">>" before endobj).
#
# NOTE(review): the xref offsets and the startxref value are hand-written
# constants, not computed from the file contents; verify them if a consumer
# depends on exact offsets.
create_base_pdf() {
  cat > "$BASE_PDF" <<'EOF'
%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Count 1/Kids[3 0 R]>>endobj
3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/Resources<</Font<</F1 4 0 R>>>>/Contents 5 0 R>>endobj
4 0 obj<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>endobj
5 0 obj<</Length 44>>stream
BT /F1 12 Tf 100 700 Td (Hello World) Tj ET
endstream endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000302 00000 n
0000000377 00000 n
trailer<</Size 6/Root 1 0 R>>
startxref 445
%%EOF
EOF
  echo "Created base PDF: $BASE_PDF"
}
# Generate the password-protected fixtures by running qpdf over $BASE_PDF.
#
# Globals:  BASE_PDF (read), FIXTURES_DIR (read)
# Outputs:  progress messages on stdout; four PDFs under $FIXTURES_DIR
#
# Each qpdf call goes through `nix-shell -p qpdf --run <string>`, and that
# string is parsed a second time by the shell nix-shell starts. The input and
# output paths are therefore escaped with printf %q before being embedded, so
# a path containing whitespace or shell metacharacters still reaches qpdf as
# a single argument.
#
# NOTE(review): qpdf exits with status 3 when it only emitted warnings; with
# `set -e` active that still stops the script. Confirm qpdf reads the base
# PDF without warnings.
generate_encrypted() {
  local src out_dir
  printf -v src '%q' "$BASE_PDF"
  printf -v out_dir '%q' "$FIXTURES_DIR"

  echo "Generating encrypted fixtures..."
  # RC4-40 with password "test" (EC-04)
  nix-shell -p qpdf --run "qpdf --allow-weak-crypto --encrypt test test 40 -- $src $out_dir/encrypted_rc4_test.pdf"
  # AES-128 with password "test" (EC-05)
  nix-shell -p qpdf --run "qpdf --encrypt test test 128 -- $src $out_dir/encrypted_aes128_test.pdf"
  # AES-256 with password "test" (EC-06) - requires PDF 2.0
  nix-shell -p qpdf --run "qpdf --encrypt test test 256 --force-version=2.0 -- $src $out_dir/encrypted_aes256_test.pdf"
  # Empty user and owner password (RC4-40)
  nix-shell -p qpdf --run "qpdf --allow-weak-crypto --encrypt '' '' 40 -- $src $out_dir/encrypted_empty_password.pdf"
  echo "Encrypted fixtures generated."
}
# Write tagged_3_level_outline.pdf: a two-page PDF whose catalog points at an
# outline tree made of an outline root (object 3), two top-level items
# ("Chapter 1", "Chapter 2") and one child ("Section 1.1") under Chapter 1.
#
# Globals:  FIXTURES_DIR (read) - must already exist
# Outputs:  the fixture file plus two progress lines on stdout
#
# NOTE(review): no /StructTreeRoot or /MarkInfo entry is written, so the
# "tagged" in the file name is not reflected in the content - confirm.
# NOTE(review): xref offsets and startxref are hand-written constants.
generate_tagged_outline() {
  local target="$FIXTURES_DIR/tagged_3_level_outline.pdf"

  printf '%s\n' "Generating tagged_3_level_outline.pdf..."
  # Quoted delimiter: the PDF text is emitted verbatim, without expansion.
  cat <<'EOF' > "$target"
%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R/Outlines 3 0 R>>endobj
2 0 obj<</Type/Pages/Count 2/Kids[4 0 R 5 0 R]>>endobj
3 0 obj<</Type/Outlines/First 6 0 R/Last 7 0 R/Count 2>>endobj
4 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
5 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
6 0 obj<</Title(Chapter 1)/Parent 3 0 R/Next 7 0 R/First 8 0 R/Count 1/Dest[4 0 R /XYZ 0 792 null]>>endobj
7 0 obj<</Title(Chapter 2)/Parent 3 0 R/Prev 6 0 R/Dest[5 0 R /XYZ 0 792 null]>>endobj
8 0 obj<</Title(Section 1.1)/Parent 6 0 R/Dest[4 0 R /XYZ 0 700 null]>>endobj
xref
0 9
0000000000 65535 f
0000000009 00000 n
0000000066 00000 n
0000000133 00000 n
0000000222 00000 n
0000000313 00000 n
0000000404 00000 n
0000000549 00000 n
0000000680 00000 n
trailer<</Size 9/Root 1 0 R>>
startxref 795
%%EOF
EOF
  printf '%s\n' "Generated tagged_3_level_outline.pdf"
}
# Write ocg_default_off.pdf (EC-16): a one-page PDF whose catalog carries an
# /OCProperties dictionary with a default configuration /D that sets
# /BaseState /OFF, an empty /ON list and an /OFF list referencing object 5.
#
# Globals:  FIXTURES_DIR (read) - must already exist
# Outputs:  the fixture file plus two progress lines on stdout
#
# /OCProperties and its /D entry are both dictionaries, so each is opened
# with "<<" and the catalog line ends with three ">>" closers.
#
# NOTE(review): object 5 is written as the array [/OCG1] rather than an
# optional-content-group dictionary, and /OCProperties has no /OCGs entry;
# presumably enough for the consuming test - confirm.
# NOTE(review): xref offsets and startxref are hand-written constants, not
# computed from the file contents.
generate_ocg_off() {
  echo "Generating ocg_default_off.pdf..."
  cat > "$FIXTURES_DIR/ocg_default_off.pdf" <<'EOF'
%PDF-1.5
1 0 obj<</Type/Catalog/Pages 2 0 R/OCProperties<</D<</BaseState/OFF/ON[]/OFF[5 0 R]>>>>>>endobj
2 0 obj<</Type/Pages/Count 1/Kids[3 0 R]>>endobj
3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/OCMD 4 0 R>>endobj
4 0 obj<</OCGs 5 0 R/P/ON>>endobj
5 0 obj[/OCG1]endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000157 00000 n
0000000232 00000 n
0000000331 00000 n
0000000424 00000 n
trailer<</Size 6/Root 1 0 R>>
startxref 509
%%EOF
EOF
  echo "Generated ocg_default_off.pdf"
}
# Write multi_revision_3.pdf: a catalog and a three-page page tree followed by
# one xref section and one trailer.
#
# Globals:  FIXTURES_DIR (read) - must already exist
# Outputs:  the fixture file plus two progress lines on stdout
#
# NOTE(review): only a single xref/trailer pair is emitted - no incremental
# update sections are appended, although the name suggests three revisions.
# Confirm this is what the consuming test expects.
# NOTE(review): xref offsets and startxref are hand-written constants.
generate_multi_revision() {
  local target="$FIXTURES_DIR/multi_revision_3.pdf"

  printf '%s\n' "Generating multi_revision_3.pdf..."
  # Quoted delimiter: the PDF text is emitted verbatim, without expansion.
  cat <<'EOF' > "$target"
%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Count 3/Kids[3 0 R 4 0 R 5 0 R]>>endobj
3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
4 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
5 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000125 00000 n
0000000222 00000 n
0000000319 00000 n
trailer<</Size 6/Root 1 0 R>>
startxref 416
%%EOF
EOF
  printf '%s\n' "Generated multi_revision_3.pdf"
}
# Write the two page-attribute inheritance fixtures.
#
#   inheritance_grandparent_mediabox.pdf
#       /MediaBox is set only on the root page-tree node (object 2); the
#       intermediate node (object 3) and the page (object 4) do not define it.
#   missing_mediabox.pdf
#       neither the page (object 3) nor its parent (object 2) has a /MediaBox.
#
# Globals:  FIXTURES_DIR (read) - must already exist
# Outputs:  both fixture files plus two progress lines on stdout
#
# NOTE(review): xref offsets and startxref are hand-written constants.
generate_inheritance() {
  local inherited="$FIXTURES_DIR/inheritance_grandparent_mediabox.pdf"
  local absent="$FIXTURES_DIR/missing_mediabox.pdf"

  printf '%s\n' "Generating inheritance fixtures..."
  cat <<'EOF' > "$inherited"
%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Count 1/Kids[3 0 R]/MediaBox[0 0 612 792]>>endobj
3 0 obj<</Type/Pages/Count 1/Kids[4 0 R]>>endobj
4 0 obj<</Type/Page/Parent 3 0 R>>endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000157 00000 n
0000000240 00000 n
trailer<</Size 5/Root 1 0 R>>
startxref 325
%%EOF
EOF
  cat <<'EOF' > "$absent"
%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Count 1/Kids[3 0 R]>>endobj
3 0 obj<</Type/Page/Parent 2 0 R>>endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000125 00000 n
trailer<</Size 4/Root 1 0 R>>
startxref 210
%%EOF
EOF
  printf '%s\n' "Generated inheritance fixtures."
}
# Write partial_resource_override.pdf: a two-page PDF in which the page-tree
# node (object 2) declares fonts F1 and F2 in /Resources, page 1 (object 3)
# declares its own /Resources containing only F3, and page 2 (object 4)
# declares no /Resources at all.
#
# Globals:  FIXTURES_DIR (read) - must already exist
# Outputs:  the fixture file plus two progress lines on stdout
#
# Objects 2 and 3 each nest /Resources -> /Font inside their own dictionary,
# so every one of those three levels gets its own ">>" closer.
# /Length of the content stream is 48: the byte count of the single content
# line, not counting the line break that precedes "endstream".
#
# NOTE(review): xref offsets and startxref are hand-written constants, not
# computed from the file contents; verify them if a consumer depends on
# exact offsets.
generate_partial_override() {
  echo "Generating partial_resource_override.pdf..."
  cat > "$FIXTURES_DIR/partial_resource_override.pdf" <<'EOF'
%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Count 2/Kids[3 0 R 4 0 R]/Resources<</Font<</F1 5 0 R/F2 6 0 R>>>>>>endobj
3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/Resources<</Font<</F3 7 0 R>>>>/Contents 8 0 R>>endobj
4 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
5 0 obj<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>endobj
6 0 obj<</Type/Font/Subtype/Type1/BaseFont/Times-Roman>>endobj
7 0 obj<</Type/Font/Subtype/Type1/BaseFont/Courier>>endobj
8 0 obj<</Length 48>>stream
BT /F3 12 Tf 100 700 Td (Partial override) Tj ET
endstream endobj
xref
0 9
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000245 00000 n
0000000450 00000 n
0000000547 00000 n
0000000636 00000 n
0000000747 00000 n
0000000838 00000 n
trailer<</Size 9/Root 1 0 R>>
startxref 945
%%EOF
EOF
  echo "Generated partial_resource_override.pdf"
}
# Write js_in_openaction.pdf: a one-page PDF whose catalog has an /OpenAction
# action dictionary of type /JavaScript carrying the script
# app.alert('Hello').
#
# Globals:  FIXTURES_DIR (read) - must already exist
# Outputs:  the fixture file plus two progress lines on stdout
#
# The action is a dictionary, so it is opened with "<<"; a single "<" would
# begin a hex string instead.
#
# NOTE(review): xref offsets and startxref are hand-written constants, not
# computed from the file contents; verify them if a consumer depends on
# exact offsets.
generate_js() {
  echo "Generating js_in_openaction.pdf..."
  cat > "$FIXTURES_DIR/js_in_openaction.pdf" <<'EOF'
%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R/OpenAction<</S/JavaScript/JS(app.alert('Hello'))>>>>endobj
2 0 obj<</Type/Pages/Count 1/Kids[3 0 R]>>endobj
3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000176 00000 n
0000000263 00000 n
trailer<</Size 4/Root 1 0 R>>
startxref 348
%%EOF
EOF
  echo "Generated js_in_openaction.pdf"
}
# Write xfa_form.pdf: a one-page PDF 1.6 file whose catalog references an
# /AcroForm dictionary (object 3). That dictionary holds an /XFA array with
# the single string (xfa.xml) and a /Fields array with one field (object 5,
# /T (Field1), /V (Test value)).
#
# Globals:  FIXTURES_DIR (read) - must already exist
# Outputs:  the fixture file plus two progress lines on stdout
#
# NOTE(review): xref offsets and startxref are hand-written constants.
generate_xfa() {
  local target="$FIXTURES_DIR/xfa_form.pdf"

  printf '%s\n' "Generating xfa_form.pdf..."
  # Quoted delimiter: the PDF text is emitted verbatim, without expansion.
  cat <<'EOF' > "$target"
%PDF-1.6
1 0 obj<</Type/Catalog/Pages 2 0 R/AcroForm 3 0 R>>endobj
2 0 obj<</Type/Pages/Count 1/Kids[4 0 R]>>endobj
3 0 obj<</XFA[(xfa.xml)]/Fields[5 0 R]>>endobj
4 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
5 0 obj<</T(Field1)/V(Test value)>>endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000134 00000 n
0000000227 00000 n
0000000330 00000 n
0000000439 00000 n
trailer<</Size 6/Root 1 0 R>>
startxref 528
%%EOF
EOF
  printf '%s\n' "Generated xfa_form.pdf"
}
# Write pdfa_1b_conformance.pdf: a one-page PDF whose catalog references an
# XMP /Metadata stream (object 3) declaring pdfaid:part 1 and
# pdfaid:conformance B.
#
# Globals:  FIXTURES_DIR (read) - must already exist
# Outputs:  the fixture file plus two progress lines on stdout
#
# /Length is 341: the byte count of the ten XMP lines between "stream" and
# "endstream", not counting the line break that precedes "endstream". Keep it
# in sync when editing the packet.
#
# NOTE(review): xref offsets and startxref are hand-written constants, not
# computed from the file contents; verify them if a consumer depends on
# exact offsets.
generate_pdfa() {
  echo "Generating pdfa_1b_conformance.pdf..."
  cat > "$FIXTURES_DIR/pdfa_1b_conformance.pdf" <<'EOF'
%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R/Metadata 3 0 R>>endobj
2 0 obj<</Type/Pages/Count 1/Kids[4 0 R]>>endobj
3 0 obj<</Type/Metadata/Subtype/XML/Length 341>>stream
<?xpacket begin="utf-8"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/">
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<rdf:Description rdf:about="" xmlns:pdfaid="http://www.aiim.org/pdfa/ns/id/">
<pdfaid:part>1</pdfaid:part>
<pdfaid:conformance>B</pdfaid:conformance>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>
<?xpacket end="w"?>
endstream endobj
4 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000134 00000 n
0000000235 00000 n
0000000609 00000 n
trailer<</Size 5/Root 1 0 R>>
startxref 682
%%EOF
EOF
  echo "Generated pdfa_1b_conformance.pdf"
}
# Write page_labels_roman_arabic.pdf: a six-page PDF whose catalog references
# a /PageLabels number tree (object 3) with two ranges - style /R (upper-case
# roman) starting at page index 0 and style /D (decimal) starting at page
# index 4.
#
# Globals:  FIXTURES_DIR (read) - must already exist
# Outputs:  the fixture file plus two progress lines on stdout
#
# Each value in /Nums is a label dictionary and is therefore opened with
# "<<"; a single "<" would begin a hex string instead.
#
# NOTE(review): xref offsets and startxref are hand-written constants, not
# computed from the file contents; verify them if a consumer depends on
# exact offsets.
generate_page_labels() {
  echo "Generating page_labels_roman_arabic.pdf..."
  cat > "$FIXTURES_DIR/page_labels_roman_arabic.pdf" <<'EOF'
%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R/PageLabels 3 0 R>>endobj
2 0 obj<</Type/Pages/Count 6/Kids[4 0 R 5 0 R 6 0 R 7 0 R 8 0 R 9 0 R]>>endobj
3 0 obj<</Nums[0<</S/R>>4<</S/D>>]>>endobj
4 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
5 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
6 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
7 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
8 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
9 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
xref
0 10
0000000000 65535 f
0000000009 00000 n
0000000134 00000 n
0000000269 00000 n
0000000447 00000 n
0000000554 00000 n
0000000661 00000 n
0000000768 00000 n
0000000875 00000 n
0000000982 00000 n
trailer<</Size 10/Root 1 0 R>>
startxref 1089
%%EOF
EOF
  echo "Generated page_labels_roman_arabic.pdf"
}
# Write encrypted_unknown_handler.pdf: a one-page PDF whose trailer carries an
# /Encrypt dictionary with /Filter /Adobe.PubSec, plus an /ID array.
#
# Globals:  FIXTURES_DIR (read) - must already exist
# Outputs:  the fixture file plus two progress lines on stdout
#
# /Encrypt is a dictionary and is therefore opened with "<<"; a single "<"
# would begin a hex string instead.
#
# NOTE(review): objects 4 and 5 are identical dictionaries holding /O, /U and
# /P, and the trailer's /O and /U point at them indirectly; presumably only
# the /Filter name matters to the consuming test - confirm.
# NOTE(review): xref offsets and startxref are hand-written constants, not
# computed from the file contents.
generate_unknown_handler() {
  echo "Generating encrypted_unknown_handler.pdf..."
  cat > "$FIXTURES_DIR/encrypted_unknown_handler.pdf" <<'EOF'
%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Count 1/Kids[3 0 R]>>endobj
3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R>>endobj
4 0 obj<</O(1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef)/U(1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef)/P -1340>>endobj
5 0 obj<</O(1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef)/U(1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef)/P -1340>>endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000125 00000 n
0000000204 00000 n
0000000409 00000 n
trailer<</Size 6/Root 1 0 R/Encrypt<</Filter/Adobe.PubSec/V 2/R 2/P -1340/O 4 0 R/U 5 0 R>>/ID[<1234567890abcdef1234567890abcdef><fedcba0987654321fedcba0987654321>]>>
startxref 614
%%EOF
EOF
  echo "Generated encrypted_unknown_handler.pdf"
}
# Entry point: create the fixture directory, then run every generator in a
# fixed order.
#
# Globals:  FIXTURES_DIR (read)
# Outputs:  progress lines on stdout; fixture files written by the generators
#
# create_base_pdf runs first because generate_encrypted reads $BASE_PDF.
main() {
  local step
  local -a steps=(
    create_base_pdf
    generate_encrypted
    generate_tagged_outline
    generate_ocg_off
    generate_multi_revision
    generate_inheritance
    generate_partial_override
    generate_js
    generate_xfa
    generate_pdfa
    generate_page_labels
    generate_unknown_handler
  )

  printf '%s\n' "Generating document model test fixtures..."
  mkdir -p "$FIXTURES_DIR"

  for step in "${steps[@]}"; do
    "$step"
  done

  printf '%s\n' "All fixtures generated successfully!"
  printf '%s\n' "Fixtures are in: $FIXTURES_DIR"
}
# Run the generator. Command-line arguments are forwarded unchanged, although
# main does not read any positional parameters.
main "$@"