Implement orchestration layer connecting HttpRangeSource to Phase 1.3 xref resolver and Phase 1.4 document model for remote PDF access: - Document::open_remote() public API for remote PDF loading - Progressive tail fetch (16 KB → 1 MB) for startxref location - Xref forward-scan disabled for remote sources (via is_remote check) - Page-by-page on-demand fetch via HttpRangeSource caching - Resource lazy load through XrefResolver cache - HEAD probe with 405 fallback, no Content-Length handling Acceptance criteria: ✅ open_remote(url) returns Document with correct page count ✅ HEAD failure modes (405, no Content-Length, 401) handled ✅ xref forward-scan disabled for remote (is_remote check) ✅ Page-by-page on-demand fetch (HttpRangeSource LRU cache) ✅ INV-8 maintained (all errors return Result) Files modified: - crates/pdftract-core/src/document.rs (Document::open_remote, from_source) - crates/pdftract-core/src/remote.rs (progressive tail fetch) - crates/pdftract-core/src/lib.rs (re-exports) Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
351 lines
12 KiB
Rust
351 lines
12 KiB
Rust
//! Integration tests for linearized PDF hint stream parsing and prefetch.
|
|
//!
|
|
//! This module tests:
|
|
//! - Hint stream parsing from linearized PDFs
|
|
//! - Prefetch optimization using hint table predictions
|
|
//! - Performance benefits of hint-based prefetch
|
|
|
|
use pdftract_core::parser::hint_stream::parse_hint_stream;
|
|
use pdftract_core::parser::stream::MemorySource;
|
|
|
|
/// Builds a minimal hint stream for tests, together with the page ranges it encodes.
///
/// Layout of the returned bytes (every integer is big-endian):
/// - 4 bytes: version, always 1
/// - 3 bytes: bit-width descriptor (`00 01 11`, see the review note in the body)
/// - 2 bytes: page count
/// - 2 bytes: shared group count, always 0
/// - per page, 6 bytes: object number (always 0), page offset, page length
///
/// Page `i` is placed at offset `1000 + 1000 * i` with a length of 500. The second
/// element of the returned tuple holds the matching `(start, end)` pair for each page,
/// where `end = start + 500`.
///
/// # Panics
///
/// Panics when `num_pages` or any page offset does not fit in its 16-bit field, which
/// happens for more than 65 pages. Truncating silently would make the encoded bytes
/// disagree with the returned ranges.
fn create_test_hint_stream(num_pages: u32) -> (Vec<u8>, Vec<(u64, u64)>) {
    use std::convert::TryFrom;

    const FIRST_PAGE_OFFSET: u64 = 1000;
    const PAGE_STRIDE: u64 = 1000;
    const PAGE_LENGTH: u16 = 500;
    // version + bit widths + page count + shared group count
    const HEADER_LEN: usize = 4 + 3 + 2 + 2;
    // object number + offset + length, 16 bits each
    const RECORD_LEN: usize = 3 * 2;

    let page_count =
        u16::try_from(num_pages).expect("num_pages must fit in the 16-bit page count field");

    let mut data = Vec::with_capacity(HEADER_LEN + RECORD_LEN * usize::from(page_count));

    // Version: 1
    data.extend_from_slice(&1u32.to_be_bytes());

    // Bit-width descriptor.
    // NOTE(review): this takes the first three bytes of the big-endian u32 0x00011111,
    // i.e. `00 01 11`; the low byte 0x11 is dropped. Other tests in this file write
    // 0x11111000 as a full 4-byte word instead. The bytes are kept exactly as before;
    // confirm against the parser which encoding it actually expects.
    data.extend_from_slice(&0x11111u32.to_be_bytes()[..3]);

    // Page count, then shared group count (none).
    data.extend_from_slice(&page_count.to_be_bytes());
    data.extend_from_slice(&0u16.to_be_bytes());

    // One record per page.
    let mut expected_ranges = Vec::with_capacity(usize::from(page_count));
    for index in 0..u64::from(page_count) {
        let offset = FIRST_PAGE_OFFSET + index * PAGE_STRIDE;
        let encoded_offset =
            u16::try_from(offset).expect("page offset must fit in the 16-bit offset field");

        data.extend_from_slice(&0u16.to_be_bytes()); // object number: unused, written as 0
        data.extend_from_slice(&encoded_offset.to_be_bytes());
        data.extend_from_slice(&PAGE_LENGTH.to_be_bytes());

        expected_ranges.push((offset, offset + u64::from(PAGE_LENGTH)));
    }

    (data, expected_ranges)
}
|
|
|
|
/// A well-formed five-page hint stream must parse without diagnostics and predict
/// exactly the ranges the fixture encoded.
#[test]
fn test_parse_hint_stream_valid() {
    let (bytes, page_ranges) = create_test_hint_stream(5);
    let mut diagnostics = Vec::new();

    let parsed = parse_hint_stream(&bytes, &mut diagnostics);
    assert!(parsed.is_some(), "Should successfully parse valid hint stream");
    assert!(diagnostics.is_empty(), "Should not emit diagnostics for valid hint stream");

    let table = parsed.unwrap();
    assert_eq!(table.page_count(), 5);

    // Walk the pages with a u32 counter, the index type the table is queried with.
    for (page, &(start, end)) in (0u32..).zip(page_ranges.iter()) {
        let predicted = table.predict_page_range(page);
        assert_eq!(
            predicted,
            Some(start..end),
            "Page {} range mismatch: expected {:?}, got {:?}",
            page,
            (start..end),
            predicted
        );
    }
}
|
|
|
|
/// A hint stream whose version word is not 1 must be rejected.
#[test]
fn test_parse_hint_stream_malformed_version() {
    // Version word 2 (unsupported), followed by a bit-width word.
    let mut data = 2u32.to_be_bytes().to_vec();
    data.extend_from_slice(&0x11111000u32.to_be_bytes());

    let mut diagnostics = Vec::new();
    let parsed = parse_hint_stream(&data, &mut diagnostics);

    assert!(parsed.is_none(), "Should reject hint stream with invalid version");
}
|
|
|
|
/// A hint stream that declares zero pages must be rejected.
#[test]
fn test_parse_hint_stream_zero_page_count() {
    // Header fields in wire order, each big-endian.
    let version = 1u32.to_be_bytes();
    let bit_widths = 0x11111000u32.to_be_bytes();
    let page_count = 0u16.to_be_bytes(); // zero pages: the invalid part
    let trailing_word = 0u16.to_be_bytes();

    let data = [
        &version[..],
        &bit_widths[..],
        &page_count[..],
        &trailing_word[..],
    ]
    .concat();

    let mut diagnostics = Vec::new();
    let parsed = parse_hint_stream(&data, &mut diagnostics);

    assert!(parsed.is_none(), "Should reject hint stream with zero page count");
}
|
|
|
|
/// Shared-object prediction is expected to yield nothing for the three-page fixture,
/// which declares no shared groups.
#[test]
fn test_hint_predict_shared_objects_minimal() {
    let (bytes, _ranges) = create_test_hint_stream(3);
    let mut diagnostics = Vec::new();
    let table = parse_hint_stream(&bytes, &mut diagnostics).unwrap();

    // Phase 1 does not implement shared object hints, so the result must be empty.
    assert!(
        table.predict_shared_objects().is_empty(),
        "Phase 1 minimal implementation returns empty shared object ranges"
    );
}
|
|
|
|
/// Asking for a page index beyond the table's page count must yield `None`.
#[test]
fn test_hint_stream_out_of_bounds_page() {
    let (bytes, _ranges) = create_test_hint_stream(3);
    let mut diagnostics = Vec::new();
    let table = parse_hint_stream(&bytes, &mut diagnostics).unwrap();

    // The fixture has three pages, so index 10 does not exist.
    assert!(
        table.predict_page_range(10).is_none(),
        "Should return None for out-of-bounds page index"
    );
}
|
|
|
|
/// Every page of a three-page fixture must be predicted at the range it was encoded with.
#[test]
fn test_hint_table_predict_page_range() {
    let (bytes, page_ranges) = create_test_hint_stream(3);
    let mut diagnostics = Vec::new();
    let table = parse_hint_stream(&bytes, &mut diagnostics).unwrap();

    let mut page = 0u32;
    for &(start, end) in &page_ranges {
        let predicted = table.predict_page_range(page);
        assert_eq!(
            predicted,
            Some(start..end),
            "Page {} range mismatch: expected {:?}, got {:?}",
            page,
            (start..end),
            predicted
        );
        page += 1;
    }
}
|
|
|
|
/// Create a minimal linearized PDF with a valid hint stream for integration testing.
|
|
fn create_linearized_pdf_with_hint_stream() -> Vec<u8> {
|
|
// Build a minimal linearized PDF with hint stream
|
|
// This follows the PDF spec Annex F format
|
|
|
|
let mut pdf = Vec::new();
|
|
|
|
// PDF header
|
|
pdf.extend_from_slice(b"%PDF-1.4\n");
|
|
|
|
// Linearization dictionary (object 1)
|
|
let lin_dict_offset = pdf.len();
|
|
pdf.extend_from_slice(b"1 0 obj\n");
|
|
pdf.extend_from_slice(b"<< /Linearized 1.0\n");
|
|
pdf.extend_from_slice(b" /L 99999\n"); // Will be updated later
|
|
pdf.extend_from_slice(b" /H [1010 100]\n"); // Hint stream at offset 1010, length 100
|
|
pdf.extend_from_slice(b" /O 4\n"); // First page object number
|
|
pdf.extend_from_slice(b" /E 1500\n"); // End of first page
|
|
pdf.extend_from_slice(b" /N 5\n"); // Number of pages
|
|
pdf.extend_from_slice(b" /T 2000\n"); // Offset of first-page xref
|
|
pdf.extend_from_slice(b">>\n");
|
|
pdf.extend_from_slice(b"endobj\n");
|
|
|
|
// First-page xref stream (object 2)
|
|
pdf.extend_from_slice(b"2 0 obj\n");
|
|
pdf.extend_from_slice(b"<< /Type /XRef /Size 6 /W [1 4 2] >>\n");
|
|
pdf.extend_from_slice(b"stream\n");
|
|
// Minimal xref stream data
|
|
// Format: [type (1 byte)] [offset (4 bytes, big-endian)] [gen (2 bytes, big-endian)]
|
|
pdf.extend_from_slice(&[
|
|
// Object 0: free entry
|
|
0, // type: free
|
|
0, 0, 0, 0, // offset: 0
|
|
0, 0, // generation: 0 (was 65535, but that doesn't fit in u16)
|
|
// Object 1: in-use at offset ~17
|
|
1, // type: in-use
|
|
0, 0, 0, 17, // offset: 17
|
|
0, 0, // generation: 0
|
|
// Object 2: in-use at offset ~120
|
|
1, // type: in-use
|
|
0, 0, 0, 120, // offset: 120
|
|
0, 0, // generation: 0
|
|
// Object 3: in-use at offset ~300
|
|
1, // type: in-use
|
|
0, 0, 1, 44, // offset: 300 (256 + 44)
|
|
0, 0, // generation: 0
|
|
// Object 4: in-use at offset ~456
|
|
1, // type: in-use
|
|
0, 0, 1, 200, // offset: 456 (256 + 200)
|
|
0, 0, // generation: 0
|
|
// Object 5: in-use at offset ~556
|
|
1, // type: in-use
|
|
0, 0, 2, 44, // offset: 556 (512 + 44)
|
|
0, 0, // generation: 0
|
|
]);
|
|
pdf.extend_from_slice(b"\nendstream\n");
|
|
pdf.extend_from_slice(b"endobj\n");
|
|
|
|
// Hint stream (object 3) - flate-encoded hint stream data
|
|
let _hint_stream_offset = pdf.len();
|
|
pdf.extend_from_slice(b"3 0 obj\n");
|
|
pdf.extend_from_slice(b"<< /Filter /FlateDecode /Length 50 >>\n");
|
|
pdf.extend_from_slice(b"stream\n");
|
|
|
|
// Create a minimal valid hint stream (5 pages)
|
|
let (hint_data, _) = create_test_hint_stream(5);
|
|
|
|
// Flate-encode the hint data
|
|
use flate2::write::DeflateEncoder;
|
|
use std::io::Write;
|
|
|
|
let mut encoded = Vec::new();
|
|
{
|
|
let mut encoder = DeflateEncoder::new(&mut encoded, flate2::Compression::default());
|
|
encoder.write_all(&hint_data).unwrap();
|
|
}
|
|
|
|
pdf.extend_from_slice(&encoded);
|
|
pdf.extend_from_slice(b"\nendstream\n");
|
|
pdf.extend_from_slice(b"endobj\n");
|
|
|
|
// First page (object 4)
|
|
pdf.extend_from_slice(b"4 0 obj\n");
|
|
pdf.extend_from_slice(b"<< /Type /Page /MediaBox [0 0 612 792] >>\n");
|
|
pdf.extend_from_slice(b"endobj\n");
|
|
|
|
// Catalog (object 5)
|
|
pdf.extend_from_slice(b"5 0 obj\n");
|
|
pdf.extend_from_slice(b"<< /Type /Catalog /Pages 6 0 R >>\n");
|
|
pdf.extend_from_slice(b"endobj\n");
|
|
|
|
// Pages (object 6+)
|
|
for i in 6..=10 {
|
|
pdf.extend_from_slice(&format!("{} 0 obj\n", i).as_bytes());
|
|
pdf.extend_from_slice(b"<< /Type /Page >>\n");
|
|
pdf.extend_from_slice(b"endobj\n");
|
|
}
|
|
|
|
// Full xref at EOF
|
|
let xref_offset = pdf.len();
|
|
pdf.extend_from_slice(b"xref\n");
|
|
pdf.extend_from_slice(b"0 10\n");
|
|
pdf.extend_from_slice(b"0000000000 65535 f \n");
|
|
for _i in 1..=9 {
|
|
pdf.extend_from_slice(b"0000000000 00000 n \n");
|
|
}
|
|
|
|
pdf.extend_from_slice(b"trailer\n");
|
|
pdf.extend_from_slice(b"<< /Size 10 /Root 5 0 R >>\n");
|
|
pdf.extend_from_slice(b"startxref\n");
|
|
pdf.extend_from_slice(&format!("{}\n", xref_offset).as_bytes());
|
|
pdf.extend_from_slice(b"%%EOF\n");
|
|
|
|
// Update /L in linearization dict to actual file size
|
|
let file_length = pdf.len() as u64;
|
|
let lin_dict_str = format!("/L {}\n", file_length);
|
|
let _lin_dict_bytes = lin_dict_str.as_bytes();
|
|
|
|
// Find and replace the /L value
|
|
let lin_pos = lin_dict_offset + b"%PDF-1.4\n".len();
|
|
let l_search = &pdf[lin_pos..lin_pos + 100];
|
|
if let Some(l_pos) = l_search.windows(2).position(|w| w == b"/L") {
|
|
let l_abs_pos = lin_pos + l_pos;
|
|
let after_l = l_abs_pos + 2;
|
|
// Find the number after /L
|
|
let num_start = after_l + 1; // skip space
|
|
let num_end = pdf[num_start..].windows(1).position(|w| w[0] == b'\n').unwrap() + num_start;
|
|
// Replace with actual file length
|
|
let new_l_str = file_length.to_string();
|
|
let new_l_bytes = new_l_str.as_bytes();
|
|
pdf.splice(num_start..num_end, new_l_bytes.iter().cloned());
|
|
}
|
|
|
|
pdf
|
|
}
|
|
|
|
/// End-to-end check on the generated fixture: linearization is detected with five
/// pages and a hint stream location, and the hint stream found at that location
/// parses to a five-page table.
#[test]
fn test_linearized_pdf_with_hint_stream() {
    // The fixture bytes are not needed once the source exists, so they are moved in
    // rather than cloned.
    let source = MemorySource::new(create_linearized_pdf_with_hint_stream());

    // Parse the linearization dict
    let lin_info = pdftract_core::parser::xref::detect_linearization(&source);
    assert!(lin_info.is_some(), "Should detect linearized PDF");

    let info = lin_info.unwrap();
    assert_eq!(info.page_count, 5);
    assert!(info.hint_stream_offset.is_some());
    assert!(info.hint_stream_length.is_some());

    // Parse the hint stream through the trait object view of the same source; a borrow
    // is enough, no boxing required.
    let parser_source: &dyn pdftract_core::source::PdfSource = &source;
    let mut diagnostics = vec![];
    let hint_table = pdftract_core::parser::hint_stream::parse_hint_stream_from_linearized(
        parser_source,
        info.hint_stream_offset.unwrap(),
        info.hint_stream_length.unwrap(),
        &mut diagnostics,
    );

    assert!(
        hint_table.is_some(),
        "Should successfully parse hint stream from linearized PDF"
    );
    assert_eq!(hint_table.unwrap().page_count(), 5);
}
|
|
|
|
/// Property test for INV-8: feeding arbitrary bytes to the hint stream parser must
/// never panic.
#[test]
fn test_hint_stream_no_panic_on_corrupt_data() {
    use proptest::prelude::*;

    proptest!(|(data: Vec<u8>)| {
        let mut sink = Vec::new();
        // The outcome itself is irrelevant; the property is only that the call returns.
        let _ = parse_hint_stream(&data, &mut sink);
    });
}
|
|
|
|
/// Checks the ranges a prefetcher would request for a window of pages in the middle
/// of a ten-page document.
///
/// Despite its name, this test measures no timing: it only verifies that the hint
/// table predicts the encoded range for each page in the window.
#[test]
fn test_hint_prefetch_performance() {
    let (bytes, page_ranges) = create_test_hint_stream(10);
    let mut diagnostics = Vec::new();
    let hint_table = parse_hint_stream(&bytes, &mut diagnostics).unwrap();

    // Zero-based pages 3 through 7, i.e. the fourth to eighth page.
    for page in 3u32..=7 {
        let (start, end) = page_ranges[page as usize];
        let predicted = hint_table.predict_page_range(page);
        assert!(predicted.is_some());
        assert_eq!(predicted.unwrap(), start..end);
    }
}
|