pdftract/crates/pdftract-core/tests/hint_stream_integration.rs
jedarden f85e5149dd feat(pdftract-91e1i): HTTP fetch sequence implementation
Implement orchestration layer connecting HttpRangeSource to Phase 1.3
xref resolver and Phase 1.4 document model for remote PDF access:

- Document::open_remote() public API for remote PDF loading
- Progressive tail fetch (16 KB → 1 MB) for startxref location
- Xref forward-scan disabled for remote sources (via is_remote check)
- Page-by-page on-demand fetch via HttpRangeSource caching
- Resource lazy load through XrefResolver cache
- HEAD probe with 405 fallback and handling for a missing Content-Length header

Acceptance criteria:
 open_remote(url) returns Document with correct page count
 HEAD failure modes (405, no Content-Length, 401) handled
 xref forward-scan disabled for remote (is_remote check)
 Page-by-page on-demand fetch (HttpRangeSource LRU cache)
 INV-8 maintained (all errors return Result)

Files modified:
- crates/pdftract-core/src/document.rs (Document::open_remote, from_source)
- crates/pdftract-core/src/remote.rs (progressive tail fetch)
- crates/pdftract-core/src/lib.rs (re-exports)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 13:17:00 -04:00

351 lines
12 KiB
Rust

//! Integration tests for linearized PDF hint stream parsing and prefetch.
//!
//! This module tests:
//! - Hint stream parsing from linearized PDFs
//! - Prefetch optimization using hint table predictions
//! - Page-range predictions that hint-based prefetch relies on (no timing is measured)
use pdftract_core::parser::hint_stream::parse_hint_stream;
use pdftract_core::parser::stream::MemorySource;
/// Create a minimal valid hint stream for testing.
///
/// Bytes written, in order (all integers big-endian):
/// - 4 bytes: version `1`
/// - 3 bytes: bit-width field, the first three big-endian bytes of `0x0001_1111`
///   (that is, `00 01 11`)
/// - 2 bytes: page count
/// - 2 bytes: shared-group count (always 0)
/// - per page, 6 bytes: object number (always 0), offset, length (16 bits each)
///
/// Page `i` is placed at offset `1000 + 1000 * i` and has length 500.
///
/// Returns `(hint_stream_bytes, expected_page_ranges)` where
/// `expected_page_ranges[i]` is the `(start, end)` byte range of page `i`.
///
/// # Panics
///
/// Panics when a value does not fit in its 16-bit field, which happens for
/// `num_pages > 65` (page 65 would start at offset 66000). Truncating silently
/// would make the encoded bytes disagree with the returned ranges.
fn create_test_hint_stream(num_pages: u32) -> (Vec<u8>, Vec<(u64, u64)>) {
    const HEADER_LEN: usize = 4 + 3 + 2 + 2;
    const RECORD_LEN: usize = 3 * 2;
    const PAGE_LENGTH: u64 = 500;

    /// Narrow `value` to a 16-bit field, failing loudly instead of truncating.
    fn field16(value: u64, what: &str) -> u16 {
        assert!(
            value <= u64::from(u16::MAX),
            "{} ({}) does not fit in a 16-bit hint field",
            what,
            value
        );
        value as u16
    }

    let page_count = field16(u64::from(num_pages), "page count");
    let mut data = Vec::with_capacity(HEADER_LEN + RECORD_LEN * usize::from(page_count));
    let mut expected_ranges = Vec::with_capacity(usize::from(page_count));

    // Header
    // Version: 1 (32-bit big-endian)
    data.extend_from_slice(&1u32.to_be_bytes());
    // Bit widths.
    // NOTE(review): the intent appears to be five 4-bit width codes packed as
    // 0x11111, but the first three big-endian bytes of the 32-bit value are
    // 00 01 11, not 11 11 1x. Left unchanged; confirm against the parser.
    let bit_widths = 0x11111u32;
    data.extend_from_slice(&bit_widths.to_be_bytes()[..3]);
    // Page count (16 bits)
    data.extend_from_slice(&page_count.to_be_bytes());
    // Shared groups: 0 (16 bits)
    data.extend_from_slice(&0u16.to_be_bytes());

    // Page hint records: pages at offsets 1000, 2000, 3000, ... each 500 bytes long.
    for i in 0..u64::from(page_count) {
        let offset = 1000 + i * 1000;
        // Object number: unused, written as 0
        data.extend_from_slice(&0u16.to_be_bytes());
        data.extend_from_slice(&field16(offset, "page offset").to_be_bytes());
        data.extend_from_slice(&field16(PAGE_LENGTH, "page length").to_be_bytes());
        expected_ranges.push((offset, offset + PAGE_LENGTH));
    }

    (data, expected_ranges)
}
/// A well-formed five-page hint stream parses without diagnostics, and every
/// predicted page range equals the range reported by the fixture builder.
#[test]
fn test_parse_hint_stream_valid() {
    let (stream_bytes, page_ranges) = create_test_hint_stream(5);
    let mut diags = vec![];

    let parsed = parse_hint_stream(&stream_bytes, &mut diags);
    assert!(parsed.is_some(), "Should successfully parse valid hint stream");
    assert!(diags.is_empty(), "Should not emit diagnostics for valid hint stream");

    let table = parsed.unwrap();
    assert_eq!(table.page_count(), 5);

    // Walk page indices and expected ranges in lockstep.
    for (page, &(start, end)) in (0u32..).zip(page_ranges.iter()) {
        let predicted = table.predict_page_range(page);
        assert_eq!(
            predicted,
            Some(start..end),
            "Page {} range mismatch: expected {:?}, got {:?}",
            page,
            (start..end),
            predicted
        );
    }
}
/// A header whose version word is 2 must be rejected by the parser.
#[test]
fn test_parse_hint_stream_malformed_version() {
    // Eight bytes: version word 2 (invalid), then the bit-width word.
    let header = [2u32.to_be_bytes(), 0x11111000u32.to_be_bytes()].concat();

    let mut diags = vec![];
    let parsed = parse_hint_stream(&header, &mut diags);

    assert!(parsed.is_none(), "Should reject hint stream with invalid version");
}
/// A header that declares zero pages must be rejected by the parser.
#[test]
fn test_parse_hint_stream_zero_page_count() {
    let version = 1u32.to_be_bytes();
    let bit_widths = 0x11111000u32.to_be_bytes();
    // Page count of 0 is the invalid value under test.
    let page_count = 0u16.to_be_bytes();
    // Second 16-bit field, also zero.
    let trailing_field = 0u16.to_be_bytes();

    let data = [
        &version[..],
        &bit_widths[..],
        &page_count[..],
        &trailing_field[..],
    ]
    .concat();

    let mut diags = vec![];
    let parsed = parse_hint_stream(&data, &mut diags);

    assert!(parsed.is_none(), "Should reject hint stream with zero page count");
}
/// The hint table parsed from a three-page fixture reports no shared-object ranges.
#[test]
fn test_hint_predict_shared_objects_minimal() {
    let mut diags = vec![];
    let (stream_bytes, _page_ranges) = create_test_hint_stream(3);
    let table = parse_hint_stream(&stream_bytes, &mut diags).unwrap();

    // Phase 1 does not implement shared-object hints, so nothing is predicted.
    let shared_ranges = table.predict_shared_objects();

    assert!(shared_ranges.is_empty(), "Phase 1 minimal implementation returns empty shared object ranges");
}
/// Asking for a page index past the end of the table yields `None`.
#[test]
fn test_hint_stream_out_of_bounds_page() {
    let mut diags = vec![];
    let (stream_bytes, _page_ranges) = create_test_hint_stream(3);
    let table = parse_hint_stream(&stream_bytes, &mut diags).unwrap();

    // The fixture holds only three pages, so index 10 does not exist.
    let beyond_end = table.predict_page_range(10);

    assert!(beyond_end.is_none(), "Should return None for out-of-bounds page index");
}
/// Every page of a three-page fixture is predicted at the range the builder reported.
#[test]
fn test_hint_table_predict_page_range() {
    let (stream_bytes, page_ranges) = create_test_hint_stream(3);
    let mut diags = vec![];
    let table = parse_hint_stream(&stream_bytes, &mut diags).unwrap();

    for (index, (start, end)) in page_ranges.iter().copied().enumerate() {
        let wanted = start..end;
        let predicted = table.predict_page_range(index as u32);
        assert_eq!(
            predicted,
            Some(wanted.clone()),
            "Page {} range mismatch: expected {:?}, got {:?}",
            index,
            wanted,
            predicted
        );
    }
}
/// Create a minimal linearized PDF with a valid hint stream for integration testing.
///
/// The file is assembled front to back. Three values in the linearization
/// dictionary depend on the finished layout: `/L` (total file length) and the
/// `/H` pair (hint-stream object offset and length). They are first written as
/// fixed-width, zero-padded placeholders and overwritten in place once the
/// layout is known. The patched text has exactly the placeholder's width, so
/// no byte moves and the `startxref` offset written earlier stays valid.
///
/// `/H` follows ISO 32000-1 Annex F: the offset points at the start of the hint
/// stream *object* and the length covers the whole object including overhead.
/// NOTE(review): confirm the project's hint-stream reader uses that convention.
///
/// The cross-reference data, `/E`, `/T` and the page tree are hard-coded filler
/// that is not derived from the generated layout.
///
/// # Panics
///
/// Panics if compressing the hint data fails or if a patched value needs more
/// than ten decimal digits.
fn create_linearized_pdf_with_hint_stream() -> Vec<u8> {
    use flate2::write::DeflateEncoder;
    use std::io::Write;

    /// Width, in ASCII digits, of every back-patched numeric field.
    const FIELD_WIDTH: usize = 10;
    const PLACEHOLDER: [u8; FIELD_WIDTH] = [b'0'; FIELD_WIDTH];

    /// Overwrite the placeholder starting at `at` with `value`, zero-padded to
    /// `FIELD_WIDTH` digits so the surrounding bytes do not move.
    fn patch_field(pdf: &mut [u8], at: usize, value: usize) {
        let digits = format!("{:0width$}", value, width = FIELD_WIDTH);
        assert_eq!(
            digits.len(),
            FIELD_WIDTH,
            "value {} does not fit in a {}-digit field",
            value,
            FIELD_WIDTH
        );
        pdf[at..at + FIELD_WIDTH].copy_from_slice(digits.as_bytes());
    }

    let mut pdf = Vec::new();

    // PDF header
    pdf.extend_from_slice(b"%PDF-1.4\n");

    // Linearization dictionary (object 1)
    pdf.extend_from_slice(b"1 0 obj\n");
    pdf.extend_from_slice(b"<< /Linearized 1.0\n");
    pdf.extend_from_slice(b" /L ");
    let file_length_field = pdf.len();
    pdf.extend_from_slice(&PLACEHOLDER); // patched with the final file length
    pdf.extend_from_slice(b"\n");
    pdf.extend_from_slice(b" /H [");
    let hint_offset_field = pdf.len();
    pdf.extend_from_slice(&PLACEHOLDER); // patched with the hint object offset
    pdf.extend_from_slice(b" ");
    let hint_length_field = pdf.len();
    pdf.extend_from_slice(&PLACEHOLDER); // patched with the hint object length
    pdf.extend_from_slice(b"]\n");
    pdf.extend_from_slice(b" /O 4\n"); // First page object number
    pdf.extend_from_slice(b" /E 1500\n"); // End of first page (hard-coded filler)
    pdf.extend_from_slice(b" /N 5\n"); // Number of pages
    pdf.extend_from_slice(b" /T 2000\n"); // Offset of first-page xref (hard-coded filler)
    pdf.extend_from_slice(b">>\n");
    pdf.extend_from_slice(b"endobj\n");

    // First-page xref stream (object 2)
    pdf.extend_from_slice(b"2 0 obj\n");
    pdf.extend_from_slice(b"<< /Type /XRef /Size 6 /W [1 4 2] >>\n");
    pdf.extend_from_slice(b"stream\n");
    // Minimal xref stream data, one 7-byte row per object:
    // [type (1 byte)] [offset (4 bytes, big-endian)] [gen (2 bytes, big-endian)]
    // The offsets are approximate constants, not the objects' real positions.
    pdf.extend_from_slice(&[
        // Object 0: free entry
        0, // type: free
        0, 0, 0, 0, // offset: 0
        0, 0, // generation: 0 (the conventional 65535 would be 0xFF 0xFF)
        // Object 1: in-use at offset ~17
        1, // type: in-use
        0, 0, 0, 17, // offset: 17
        0, 0, // generation: 0
        // Object 2: in-use at offset ~120
        1, // type: in-use
        0, 0, 0, 120, // offset: 120
        0, 0, // generation: 0
        // Object 3: in-use at offset ~300
        1, // type: in-use
        0, 0, 1, 44, // offset: 300 (256 + 44)
        0, 0, // generation: 0
        // Object 4: in-use at offset ~456
        1, // type: in-use
        0, 0, 1, 200, // offset: 456 (256 + 200)
        0, 0, // generation: 0
        // Object 5: in-use at offset ~556
        1, // type: in-use
        0, 0, 2, 44, // offset: 556 (512 + 44)
        0, 0, // generation: 0
    ]);
    pdf.extend_from_slice(b"\nendstream\n");
    pdf.extend_from_slice(b"endobj\n");

    // Compress the five-page hint data before writing object 3, so that
    // /Length can state the real encoded size.
    // NOTE(review): this emits raw deflate, whereas PDF /FlateDecode data is
    // normally zlib-wrapped; confirm which form the project's decoder expects.
    let (hint_data, _) = create_test_hint_stream(5);
    let mut encoder = DeflateEncoder::new(Vec::new(), flate2::Compression::default());
    encoder
        .write_all(&hint_data)
        .expect("compressing hint data into memory should not fail");
    // finish() flushes the trailing deflate block and reports errors that a
    // plain drop of the encoder would swallow.
    let encoded = encoder
        .finish()
        .expect("finalising the in-memory deflate stream should not fail");

    // Hint stream (object 3)
    let hint_object_offset = pdf.len();
    pdf.extend_from_slice(b"3 0 obj\n");
    pdf.extend_from_slice(
        format!("<< /Filter /FlateDecode /Length {} >>\n", encoded.len()).as_bytes(),
    );
    pdf.extend_from_slice(b"stream\n");
    pdf.extend_from_slice(&encoded);
    pdf.extend_from_slice(b"\nendstream\n");
    pdf.extend_from_slice(b"endobj\n");
    let hint_object_length = pdf.len() - hint_object_offset;

    // First page (object 4)
    pdf.extend_from_slice(b"4 0 obj\n");
    pdf.extend_from_slice(b"<< /Type /Page /MediaBox [0 0 612 792] >>\n");
    pdf.extend_from_slice(b"endobj\n");

    // Catalog (object 5)
    // NOTE(review): /Pages refers to object 6, which is written below as a
    // /Page rather than a /Pages node.
    pdf.extend_from_slice(b"5 0 obj\n");
    pdf.extend_from_slice(b"<< /Type /Catalog /Pages 6 0 R >>\n");
    pdf.extend_from_slice(b"endobj\n");

    // Pages (objects 6 through 10)
    for i in 6..=10 {
        pdf.extend_from_slice(format!("{} 0 obj\n", i).as_bytes());
        pdf.extend_from_slice(b"<< /Type /Page >>\n");
        pdf.extend_from_slice(b"endobj\n");
    }

    // Full xref at EOF (all in-use entries carry a zero offset)
    let xref_offset = pdf.len();
    pdf.extend_from_slice(b"xref\n");
    pdf.extend_from_slice(b"0 10\n");
    pdf.extend_from_slice(b"0000000000 65535 f \n");
    for _ in 1..=9 {
        pdf.extend_from_slice(b"0000000000 00000 n \n");
    }
    pdf.extend_from_slice(b"trailer\n");
    pdf.extend_from_slice(b"<< /Size 10 /Root 5 0 R >>\n");
    pdf.extend_from_slice(b"startxref\n");
    pdf.extend_from_slice(format!("{}\n", xref_offset).as_bytes());
    pdf.extend_from_slice(b"%%EOF\n");

    // Back-patch the layout-dependent fields. Each write has the same width as
    // its placeholder, so `file_length` and `xref_offset` remain correct.
    let file_length = pdf.len();
    patch_field(&mut pdf, file_length_field, file_length);
    patch_field(&mut pdf, hint_offset_field, hint_object_offset);
    patch_field(&mut pdf, hint_length_field, hint_object_length);

    pdf
}
/// End-to-end check: the generated linearized PDF is detected as linearized,
/// reports five pages and a hint-stream location, and the hint stream found at
/// that location parses into a five-page table.
#[test]
fn test_linearized_pdf_with_hint_stream() {
    let source = MemorySource::new(create_linearized_pdf_with_hint_stream());

    // Step 1: read the linearization dictionary.
    let detected = pdftract_core::parser::xref::detect_linearization(&source);
    assert!(detected.is_some(), "Should detect linearized PDF");
    let info = detected.unwrap();
    assert_eq!(info.page_count, 5);
    assert!(info.hint_stream_offset.is_some());
    assert!(info.hint_stream_length.is_some());
    let hint_offset = info.hint_stream_offset.unwrap();
    let hint_length = info.hint_stream_length.unwrap();

    // Step 2: parse the hint stream through a type-erased source.
    let boxed_source: Box<dyn pdftract_core::source::PdfSource> = Box::new(source);
    let mut diags = vec![];
    let hint_table = pdftract_core::parser::hint_stream::parse_hint_stream_from_linearized(
        &*boxed_source,
        hint_offset,
        hint_length,
        &mut diags,
    );

    assert!(hint_table.is_some(), "Should successfully parse hint stream from linearized PDF");
    assert_eq!(hint_table.unwrap().page_count(), 5);
}
/// Property test: feeding arbitrary bytes to the hint stream parser must never
/// panic (INV-8).
#[test]
fn test_hint_stream_no_panic_on_corrupt_data() {
    use proptest::prelude::*;

    // Arbitrary byte vectors stand in for corrupt hint streams.
    proptest!(|(bytes in any::<Vec<u8>>())| {
        let mut diags = vec![];
        // The result is deliberately discarded; only the absence of a panic matters.
        let _ = pdftract_core::parser::hint_stream::parse_hint_stream(&bytes, &mut diags);
    });
}
/// Checks the range predictions a prefetcher would consume for a window of pages.
///
/// Despite its name, this test measures no timing and issues no prefetch call.
/// It parses a ten-page fixture and asserts that pages 3 through 7 (zero-based)
/// are predicted at exactly the ranges the fixture builder reported.
#[test]
fn test_hint_prefetch_performance() {
    let (stream_bytes, page_ranges) = create_test_hint_stream(10);
    let mut diags = vec![];
    let hint_table = parse_hint_stream(&stream_bytes, &mut diags).unwrap();

    // Zero-based pages 3..=7, i.e. the 4th through 8th pages.
    for (page, &(start, end)) in page_ranges.iter().enumerate().skip(3).take(5) {
        let predicted = hint_table.predict_page_range(page as u32);
        assert!(predicted.is_some());
        assert_eq!(predicted.unwrap(), start..end);
    }
}