pdftract/crates/pdftract-core/examples/test_docstrum.rs
jedarden f85e5149dd feat(pdftract-91e1i): HTTP fetch sequence implementation
Implement orchestration layer connecting HttpRangeSource to Phase 1.3
xref resolver and Phase 1.4 document model for remote PDF access:

- Document::open_remote() public API for remote PDF loading
- Progressive tail fetch (16 KB → 1 MB) for startxref location
- Xref forward-scan disabled for remote sources (via is_remote check)
- Page-by-page on-demand fetch via HttpRangeSource caching
- Resource lazy load through XrefResolver cache
- HEAD probe with 405 fallback, no Content-Length handling

Acceptance criteria:
 open_remote(url) returns Document with correct page count
 HEAD failure modes (405, no Content-Length, 401) handled
 xref forward-scan disabled for remote (is_remote check)
 Page-by-page on-demand fetch (HttpRangeSource LRU cache)
 INV-8 maintained (all errors return Result)

Files modified:
- crates/pdftract-core/src/document.rs (Document::open_remote, from_source)
- crates/pdftract-core/src/remote.rs (progressive tail fetch)
- crates/pdftract-core/src/lib.rs (re-exports)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 13:17:00 -04:00

86 lines
3.3 KiB
Rust

//! Standalone test for Docstrum algorithm verification.
//!
//! This verifies the acceptance criteria for bead pdftract-4bylb.
use pdftract_core::layout::reading_order::{docstrum, BlockWithBBox};
/// Runs four Docstrum scenarios in sequence and panics on the first failed check.
///
/// Each scenario builds a small `Vec<BlockWithBBox>` (a block id plus four
/// coordinates), passes it to `docstrum`, prints the returned `order`, and
/// asserts on that order. Reaching the final `println!` means every scenario
/// passed.
///
/// NOTE(review): the coordinate arrays look like `[x_min, y_min, x_max, y_max]`
/// with larger `y` nearer the top of the page (inferred from the top/mid/bot
/// labels below) — confirm against `BlockWithBBox`.
///
/// # Panics
///
/// Panics via `assert!` / `assert_eq!` when a scenario's expectation is not met.
fn main() {
    println!("Testing Docstrum algorithm...\n");

    // Test 1: Magazine main + sidebar
    println!("Test 1: Magazine main + sidebar");
    let blocks = vec![
        BlockWithBBox::new(0, [50.0, 700.0, 250.0, 750.0]), // main, top
        BlockWithBBox::new(1, [50.0, 600.0, 250.0, 650.0]), // main, mid
        BlockWithBBox::new(2, [50.0, 500.0, 250.0, 550.0]), // main, bot
        BlockWithBBox::new(3, [350.0, 680.0, 450.0, 720.0]), // sidebar, top
        BlockWithBBox::new(4, [350.0, 620.0, 450.0, 660.0]), // sidebar, mid
    ];
    let result = docstrum(&blocks);
    let order = &result.order;
    println!(" Order: {:?}", order);
    // Completeness check, matching the other scenarios. Without it, an output
    // that drops the sidebar blocks would still pass, because the position
    // lookup below falls back to `order.len()` when no sidebar id is found.
    assert_eq!(order.len(), 5, "all 5 blocks should be in the order");
    // Index of the first sidebar block (ids 3 and 4) in the reading order.
    let sidebar_pos = order.iter().position(|&i| i >= 3).unwrap_or(order.len());
    // Number of main-column blocks (ids 0..=2) present in the reading order.
    let main_count = order.iter().filter(|&&i| i < 3).count();
    assert_eq!(main_count, 3, "main column should have 3 blocks");
    // Three main blocks with the first sidebar block at index >= 3 means the
    // whole main column precedes the sidebar. The relative order inside the
    // main column is intentionally not checked here.
    assert!(sidebar_pos >= 3, "sidebar should start after main column");
    println!(" PASS: Main column (0,1,2) before sidebar (3,4)\n");

    // Test 2: Pathological scattered
    println!("Test 2: Pathological scattered");
    let blocks = vec![
        BlockWithBBox::new(0, [50.0, 700.0, 100.0, 750.0]),
        BlockWithBBox::new(1, [150.0, 600.0, 200.0, 650.0]),
        BlockWithBBox::new(2, [250.0, 500.0, 300.0, 550.0]),
        BlockWithBBox::new(3, [350.0, 400.0, 400.0, 450.0]),
    ];
    let result = docstrum(&blocks);
    let order = &result.order;
    println!(" Order: {:?}", order);
    assert_eq!(order.len(), 4, "all 4 blocks should be in the order");
    // No duplicate blocks: sorting then deduplicating must not shrink the list.
    let mut unique_ids = order.clone();
    unique_ids.sort_unstable();
    unique_ids.dedup();
    assert_eq!(unique_ids.len(), 4, "no duplicate blocks");
    println!(" PASS: All blocks in order, no duplicates\n");

    // Test 3: All one line horizontal
    println!("Test 3: All one line horizontal");
    let blocks = vec![
        BlockWithBBox::new(0, [50.0, 700.0, 100.0, 750.0]),
        BlockWithBBox::new(1, [120.0, 700.0, 170.0, 750.0]),
        BlockWithBBox::new(2, [190.0, 700.0, 240.0, 750.0]),
    ];
    let result = docstrum(&blocks);
    let order = &result.order;
    println!(" Order: {:?}", order);
    assert_eq!(order.len(), 3, "all blocks should be in one component");
    assert_eq!(*order, vec![0, 1, 2], "order should be left-to-right (0, 1, 2)");
    println!(" PASS: Single component, left-to-right order\n");

    // Test 4: All one column vertical
    println!("Test 4: All one column vertical");
    let blocks = vec![
        BlockWithBBox::new(0, [50.0, 700.0, 100.0, 750.0]), // top
        BlockWithBBox::new(1, [50.0, 600.0, 100.0, 650.0]), // middle
        BlockWithBBox::new(2, [50.0, 500.0, 100.0, 550.0]), // bottom
    ];
    let result = docstrum(&blocks);
    let order = &result.order;
    println!(" Order: {:?}", order);
    assert_eq!(order.len(), 3, "all blocks should be in one component");
    assert_eq!(*order, vec![0, 1, 2], "order should be top-to-bottom (0, 1, 2)");
    println!(" PASS: Single component, top-to-bottom order\n");

    println!("All Docstrum acceptance criteria tests PASSED!");
}