Implement orchestration layer connecting HttpRangeSource to Phase 1.3 xref resolver and Phase 1.4 document model for remote PDF access: - Document::open_remote() public API for remote PDF loading - Progressive tail fetch (16 KB → 1 MB) for startxref location - Xref forward-scan disabled for remote sources (via is_remote check) - Page-by-page on-demand fetch via HttpRangeSource caching - Resource lazy load through XrefResolver cache - HEAD probe with 405 fallback, no Content-Length handling Acceptance criteria: ✅ open_remote(url) returns Document with correct page count ✅ HEAD failure modes (405, no Content-Length, 401) handled ✅ xref forward-scan disabled for remote (is_remote check) ✅ Page-by-page on-demand fetch (HttpRangeSource LRU cache) ✅ INV-8 maintained (all errors return Result) Files modified: - crates/pdftract-core/src/document.rs (Document::open_remote, from_source) - crates/pdftract-core/src/remote.rs (progressive tail fetch) - crates/pdftract-core/src/lib.rs (re-exports) Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
265 lines
9 KiB
Rust
265 lines
9 KiB
Rust
//! Property-based tests for PDF stream decoder filters and filter pipelines.
//!
//! This module tests the core invariants of PDF stream decoding:
//! - No panic on any input (INV-8)
//! - Round-trip correctness for encodable filters
//! - Bomb limit enforcement
//! - Filter pipeline ordering
|
|
|
|
use pdftract_core::parser::stream::{
|
|
FlateDecoder, LZWDecoder, ASCII85Decoder, ASCIIHexDecoder, RunLengthDecoder,
|
|
DCTDecoder, JpxStreamDecoder, CCITTFaxDecoder, CryptDecoder,
|
|
DEFAULT_MAX_DECOMPRESS_BYTES,
|
|
};
|
|
use indexmap::IndexMap;
|
|
use pdftract_core::parser::object::{PdfObject, PdfDict};
|
|
use pdftract_core::diagnostics::DiagCode;
|
|
|
|
/// Property: no stream filter panics on arbitrary input.
///
/// `filter` selects one of eight decoders and `data` is an arbitrary byte
/// string of up to 100 000 bytes. Each case feeds `data` to exactly one
/// decoder with no decode parameters and the default decompression limit.
/// Both `Ok` and `Err` are acceptable outcomes: the property under test
/// (INV-8) is only that the call returns instead of panicking.
#[cfg(feature = "proptest")]
proptest::proptest! {
    #[test]
    fn prop_filter_pipeline_never_panics(
        filter in 0usize..8usize,
        data in proptest::collection::vec(proptest::num::u8::ANY, 0..100_000)
    ) {
        // NOTE(review): presumably a running count of decoded bytes shared
        // across a filter chain — confirm against the `decode` signature.
        let mut counter = 0;

        // Dispatch to the decoder chosen by the strategy. A panic inside any
        // arm is what makes this test fail; the returned value itself is
        // irrelevant, so it is deliberately discarded. (The previous
        // `is_ok() || is_err()` assertion was a tautology.)
        let _ = match filter {
            0 => FlateDecoder.decode(&data, None, &mut counter, DEFAULT_MAX_DECOMPRESS_BYTES),
            1 => LZWDecoder.decode(&data, None, &mut counter, DEFAULT_MAX_DECOMPRESS_BYTES),
            2 => ASCII85Decoder.decode(&data, None, &mut counter, DEFAULT_MAX_DECOMPRESS_BYTES),
            3 => ASCIIHexDecoder.decode(&data, None, &mut counter, DEFAULT_MAX_DECOMPRESS_BYTES),
            4 => RunLengthDecoder.decode(&data, None, &mut counter, DEFAULT_MAX_DECOMPRESS_BYTES),
            5 => DCTDecoder.decode(&data, None, &mut counter, DEFAULT_MAX_DECOMPRESS_BYTES),
            6 => JpxStreamDecoder.decode(&data, None, &mut counter, DEFAULT_MAX_DECOMPRESS_BYTES),
            7 => CCITTFaxDecoder.decode(&data, None, &mut counter, DEFAULT_MAX_DECOMPRESS_BYTES),
            // The strategy range is 0..8, so no other value can be generated.
            _ => unreachable!(),
        };
    }
}
|
|
|
|
/// Property: FlateDecode round-trip — encode then decode yields the original.
///
/// Arbitrary input (up to 50 000 bytes) is compressed with flate2's
/// `ZlibEncoder` at the default compression level and then passed to
/// `FlateDecoder`. The decoded output must be byte-identical to the input.
#[cfg(feature = "proptest")]
proptest::proptest! {
    #[test]
    fn prop_flate_roundtrip(
        data in proptest::collection::vec(proptest::num::u8::ANY, 0..50_000)
    ) {
        use flate2::write::ZlibEncoder;
        use flate2::Compression;
        use std::io::Write;

        // Produce a zlib-wrapped deflate stream entirely in memory.
        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        encoder
            .write_all(&data)
            .expect("writing to an in-memory Vec should not fail");
        let encoded = encoder
            .finish()
            .expect("finishing an in-memory zlib stream should not fail");

        // Decode with the filter under test.
        let mut counter = 0;
        let result = FlateDecoder.decode(&encoded, None, &mut counter, DEFAULT_MAX_DECOMPRESS_BYTES);

        // The assertion macros are path-qualified because this file does not
        // import the proptest prelude.
        proptest::prop_assert!(
            result.is_ok(),
            "FlateDecoder rejected a valid zlib stream: {:?}",
            result.as_ref().err()
        );
        let decoded = result.unwrap();

        // Should round-trip perfectly.
        proptest::prop_assert_eq!(decoded, data);
    }
}
|
|
|
|
/// Property: ASCII85Decode round-trip — encode then decode yields the original.
///
/// Arbitrary input (up to 10 000 bytes) is encoded with this file's
/// `ascii85_encode` helper and then passed to `ASCII85Decoder`. The decoded
/// output must be byte-identical to the input.
#[cfg(feature = "proptest")]
proptest::proptest! {
    #[test]
    fn prop_a85_roundtrip(
        data in proptest::collection::vec(proptest::num::u8::ANY, 0..10_000)
    ) {
        let encoded = ascii85_encode(&data);

        // Decode with the filter under test.
        let mut counter = 0;
        let result = ASCII85Decoder.decode(&encoded, None, &mut counter, DEFAULT_MAX_DECOMPRESS_BYTES);

        // The assertion macros are path-qualified because this file does not
        // import the proptest prelude.
        proptest::prop_assert!(
            result.is_ok(),
            "ASCII85Decoder rejected encoder output: {:?}",
            result.as_ref().err()
        );
        let decoded = result.unwrap();

        // Should round-trip perfectly.
        proptest::prop_assert_eq!(decoded, data);
    }
}
|
|
|
|
/// Property: RunLengthDecode round-trip — encode then decode yields the original.
///
/// Arbitrary input (up to 10 000 bytes) is encoded with this file's
/// `runlength_encode` helper and then passed to `RunLengthDecoder`. The
/// decoded output must be byte-identical to the input.
#[cfg(feature = "proptest")]
proptest::proptest! {
    #[test]
    fn prop_runlength_roundtrip(
        data in proptest::collection::vec(proptest::num::u8::ANY, 0..10_000)
    ) {
        let encoded = runlength_encode(&data);

        // Decode with the filter under test.
        let mut counter = 0;
        let result = RunLengthDecoder.decode(&encoded, None, &mut counter, DEFAULT_MAX_DECOMPRESS_BYTES);

        // The assertion macros are path-qualified because this file does not
        // import the proptest prelude.
        proptest::prop_assert!(
            result.is_ok(),
            "RunLengthDecoder rejected encoder output: {:?}",
            result.as_ref().err()
        );
        let decoded = result.unwrap();

        // Should round-trip perfectly.
        proptest::prop_assert_eq!(decoded, data);
    }
}
|
|
|
|
/// Property: Bomb limit enforced for synthetic FlateDecode bombs.
///
/// Compresses 10–47 MiB of zero bytes with zlib and decodes the stream under
/// a randomly chosen `bomb_limit`. The decode must return `Ok`, and the output
/// may exceed `bomb_limit` by at most 100 000 bytes (slack for chunked
/// processing).
///
/// Both strategy ranges are subsets of the ones used previously. The earlier
/// limit range reached 10 GB while the payload never exceeded 50 MB, so the
/// limit was almost never the binding constraint; the earlier size range
/// collapsed every value above 47 onto the same capped payload.
#[cfg(feature = "proptest")]
proptest::proptest! {
    #[test]
    fn prop_bomb_limit_enforced(
        // Uncompressed payload size in MiB (10 ..= 47, i.e. below 50 000 000 bytes).
        size_mb in 10usize..48usize,
        // Bomb limit in bytes; the range straddles the payload sizes so that
        // both "limit hit" and "limit not hit" cases are generated.
        bomb_limit in 100_000u64..60_000_000u64,
    ) {
        use flate2::write::ZlibEncoder;
        use flate2::Compression;
        use std::io::Write;

        // A run of identical bytes compresses extremely well, which is what
        // makes the stream a decompression bomb.
        let payload_len = size_mb * 1024 * 1024;
        let pattern = vec![0u8; payload_len];

        // Encode with flate2 (fast level: ratio is irrelevant for this input).
        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::fast());
        encoder
            .write_all(&pattern)
            .expect("writing to an in-memory Vec should not fail");
        let encoded = encoder
            .finish()
            .expect("finishing an in-memory zlib stream should not fail");

        // Decode with the bomb limit under test.
        let mut counter = 0;
        let result = FlateDecoder.decode(&encoded, None, &mut counter, bomb_limit);

        // NOTE(review): this asserts that the decoder reports success even
        // when the limit is hit (i.e. it truncates rather than errors) —
        // confirm that this matches FlateDecoder's documented contract.
        proptest::prop_assert!(
            result.is_ok(),
            "FlateDecoder failed under bomb limit {}: {:?}",
            bomb_limit,
            result.as_ref().err()
        );
        let decoded = result.unwrap();

        // Output should not exceed the bomb limit significantly
        // (allowing a small margin for chunk processing).
        proptest::prop_assert!(
            decoded.len() as u64 <= bomb_limit + 100_000,
            "Decoded {} bytes exceeds bomb limit {} by more than 100KB",
            decoded.len(),
            bomb_limit
        );
    }
}
|
|
|
|
/// Helper: Encode bytes in ASCII85 format (Base85).
///
/// The input is processed in 4-byte groups, each interpreted as a big-endian
/// `u32` and written as five base-85 digits offset by `'!'`:
///
/// - A full group of four zero bytes is written as the single character `z`.
/// - A final partial group of `n` bytes (1..=3) is zero-padded to four bytes
///   and only the first `n + 1` digits are written, so that a decoder
///   reconstructs exactly `n` bytes. Emitting all five digits would make the
///   decoder append spurious zero bytes.
///
/// The output is wrapped in `<~` … `~>`.
///
/// NOTE(review): the `<~` prefix is kept from the previous implementation;
/// confirm that `ASCII85Decoder` skips it rather than treating `<` as data.
///
/// Returns the encoded bytes, delimiters included. Empty input yields `<~~>`.
// Only the feature-gated property tests call this helper.
#[cfg_attr(not(feature = "proptest"), allow(dead_code))]
fn ascii85_encode(data: &[u8]) -> Vec<u8> {
    let mut result = Vec::with_capacity(data.len() / 4 * 5 + 10);
    result.extend_from_slice(b"<~");

    for group in data.chunks(4) {
        // The `z` shortcut is only valid for a complete all-zero group.
        if group.len() == 4 && group.iter().all(|&b| b == 0) {
            result.push(b'z');
            continue;
        }

        // Zero-pad a short trailing group to a full 32-bit word.
        let mut padded = [0u8; 4];
        padded[..group.len()].copy_from_slice(group);
        let mut value = u32::from_be_bytes(padded);

        // Extract base-85 digits least-significant first, filling from the back.
        let mut digits = [0u8; 5];
        for digit in digits.iter_mut().rev() {
            // `value % 85` is always < 85, so the narrowing cast is lossless.
            *digit = (value % 85) as u8 + b'!';
            value /= 85;
        }

        // Full group -> 5 digits; partial group of n bytes -> n + 1 digits.
        result.extend_from_slice(&digits[..group.len() + 1]);
    }

    result.extend_from_slice(b"~>");
    result
}
|
|
|
|
/// Helper: Encode bytes using RunLength encoding (PDF spec).
///
/// The output is a sequence of segments followed by the end-of-data byte 128:
///
/// - Run segment: a length byte `257 - n` followed by one data byte, meaning
///   "repeat this byte `n` times". Emitted for runs of 3 to 127 equal bytes.
/// - Literal segment: a length byte `n - 1` followed by `n` data bytes, meaning
///   "copy these `n` bytes". Holds 1 to 127 bytes and stops early where a run
///   of three equal bytes begins.
///
/// Returns the encoded bytes. Empty input yields just the end-of-data marker.
// Only the feature-gated property tests call this helper.
#[cfg_attr(not(feature = "proptest"), allow(dead_code))]
fn runlength_encode(data: &[u8]) -> Vec<u8> {
    /// Largest number of source bytes this encoder packs into one segment.
    const MAX_SEGMENT: usize = 127;
    /// Shortest repetition that is worth encoding as a run segment.
    const MIN_RUN: usize = 3;
    /// End-of-data marker.
    const EOD: u8 = 128;

    let mut result = Vec::new();
    let mut pos = 0;

    while pos < data.len() {
        let current = data[pos];

        // Count equal bytes starting at `pos`, capped at MAX_SEGMENT.
        let run_len = data[pos..]
            .iter()
            .take(MAX_SEGMENT)
            .take_while(|&&b| b == current)
            .count();

        if run_len >= MIN_RUN {
            // run_len is in 3..=127, so 257 - run_len is in 130..=254 and fits in u8.
            result.push((257 - run_len) as u8);
            result.push(current);
            pos += run_len;
            continue;
        }

        // Collect literal bytes until the segment is full, the input ends, or
        // a run of MIN_RUN equal bytes starts.
        let mut literal_len = 0;
        while pos + literal_len < data.len() && literal_len < MAX_SEGMENT {
            let rest = &data[pos + literal_len..];
            if rest.len() >= MIN_RUN && rest[0] == rest[1] && rest[0] == rest[2] {
                break;
            }
            literal_len += 1;
        }

        // No run starts at `pos` (checked above), so the scan always accepts
        // at least one byte; a separate "single byte" fallback is unnecessary.
        debug_assert!(literal_len >= 1);

        // literal_len is in 1..=127, so the length byte is in 0..=126.
        result.push((literal_len - 1) as u8);
        result.extend_from_slice(&data[pos..pos + literal_len]);
        pos += literal_len;
    }

    result.push(EOD);
    result
}
|