pdftract/crates/pdftract-core/tests/remote_fetch_sequence.rs
jedarden 895f1ce43d fix(bf-1avnz): remove .code field access on String diagnostics in serve.rs
Fix two compilation errors at lines 584 and 658 where code was calling
.code on &String diagnostics. Replaced d.code.to_string() with direct
Vec<String> clone since diagnostics is already Vec<String>.

Acceptance criteria:
- cargo check -p pdftract-cli emits no 'no field code' errors
- serve.rs compiles cleanly
2026-06-01 04:14:05 -04:00

752 lines
24 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! Integration tests for HTTP fetch sequence (Phase 1.8).
//!
//! These tests verify the complete HTTP fetch sequence:
//! 1. HEAD probe → Content-Length, Accept-Ranges
//! 2. Tail fetch (16 KB) → startxref, trailer, root xref
//! 3. Xref parsing (strategies 1-3, forward-scan disabled for remote)
//! 4. Page-by-page on-demand fetch
//! 5. Bandwidth verification (< 5 MB for 5 pages from 500-page PDF)
#![cfg(feature = "remote")]
use std::io::{self, Read, Write};
use std::net::{TcpListener, TcpStream};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::thread;
use std::time::Duration;
use pdftract_core::source::{open_remote, RemoteOpts};
use pdftract_core::extract::extract_pdf_from_source;
/// Bandwidth tracking HTTP server for testing.
///
/// Serves one in-memory PDF over plain HTTP/1.1 on a loopback port and answers
/// exactly one request per accepted connection. The behaviour of the server is
/// selected with [`ServerMode`] so tests can simulate misbehaving origins.
struct BandwidthTrackingServer {
    /// Listening socket, bound to an ephemeral port on 127.0.0.1.
    listener: TcpListener,
    /// The complete document handed out to clients.
    pdf_data: Vec<u8>,
    /// Total size (headers + body) of every successful GET response.
    bytes_sent: Arc<AtomicUsize>,
    /// Number of request heads read from accepted connections.
    request_count: Arc<AtomicUsize>,
    /// Scenario currently being simulated.
    mode: ServerMode,
}

/// Scenario simulated by [`BandwidthTrackingServer`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ServerMode {
    /// HEAD reports length and range support; GET honours `Range`.
    Normal,
    /// HEAD omits `Content-Length`; GET behaves as in `Normal`.
    NoContentLength,
    /// HEAD is rejected with 405 and `Allow: GET`; GET behaves as in `Normal`.
    MethodNotAllowed,
    /// HEAD and GET are both rejected with 401.
    Unauthorized,
    /// HEAD advertises `Accept-Ranges: none`; GET ignores `Range` and sends the whole document.
    NoRangeSupport,
    /// The request is read and the connection is closed without any response.
    DropConnection,
}

impl BandwidthTrackingServer {
    /// Upper bound on the number of request-head bytes buffered per connection.
    const MAX_REQUEST_HEAD: usize = 64 * 1024;

    /// Binds to an ephemeral loopback port and returns the server together
    /// with the URL under which `pdf_data` is served.
    fn bind(pdf_data: Vec<u8>) -> io::Result<(Self, String)> {
        let listener = TcpListener::bind("127.0.0.1:0")?;
        let addr = listener.local_addr()?;
        let url = format!("http://{}:{}/test.pdf", addr.ip(), addr.port());
        let server = Self {
            listener,
            pdf_data,
            bytes_sent: Arc::new(AtomicUsize::new(0)),
            request_count: Arc::new(AtomicUsize::new(0)),
            mode: ServerMode::Normal,
        };
        Ok((server, url))
    }

    /// Selects the scenario to simulate for subsequent requests.
    fn set_mode(&mut self, mode: ServerMode) {
        self.mode = mode;
    }

    /// Bytes written so far in successful GET responses (headers included).
    fn get_bytes_sent(&self) -> usize {
        self.bytes_sent.load(Ordering::SeqCst)
    }

    /// Number of requests read so far.
    fn get_request_count(&self) -> usize {
        self.request_count.load(Ordering::SeqCst)
    }

    /// Accepts connections forever, answering one request on each.
    ///
    /// # Errors
    /// Returns only when accepting a connection fails. Errors on an individual
    /// connection are ignored so that one client hanging up early cannot take
    /// the server down for the requests that follow.
    fn serve(&self) -> io::Result<()> {
        for stream in self.listener.incoming() {
            let mut stream = stream?;
            // Best effort per connection: a reset or early close is not fatal.
            let _ = self.handle_connection(&mut stream);
        }
        Ok(())
    }

    /// Reads one request head from `stream` and writes the matching response.
    ///
    /// Nothing is written when the request line is unusable or when the mode
    /// is [`ServerMode::DropConnection`]; the caller then closes the socket.
    fn handle_connection(&self, stream: &mut TcpStream) -> io::Result<()> {
        let head = Self::read_request_head(stream)?;
        self.request_count.fetch_add(1, Ordering::SeqCst);
        let request = String::from_utf8_lossy(&head);

        let (response, counted) =
            match Self::build_response(self.mode, &self.pdf_data, &request) {
                Some(built) => built,
                None => return Ok(()),
            };
        if counted {
            // Updated before writing so the counter is already current by the
            // time the client has received the response.
            self.bytes_sent.fetch_add(response.len(), Ordering::SeqCst);
        }
        stream.write_all(&response)?;
        stream.flush()
    }

    /// Reads until the blank line ending the request head, EOF, or the size cap.
    ///
    /// A single `read` may return only part of the head, so chunks are
    /// accumulated until `\r\n\r\n` has been seen.
    fn read_request_head<R: Read>(reader: &mut R) -> io::Result<Vec<u8>> {
        let mut head = Vec::new();
        let mut chunk = [0u8; 8192];
        while head.len() < Self::MAX_REQUEST_HEAD {
            let read = reader.read(&mut chunk)?;
            if read == 0 {
                break;
            }
            // Re-scan only the bytes that could complete a terminator.
            let scan_from = head.len().saturating_sub(3);
            head.extend_from_slice(&chunk[..read]);
            if head[scan_from..].windows(4).any(|w| w == b"\r\n\r\n") {
                break;
            }
        }
        Ok(head)
    }

    /// Builds the raw response for `request` under `mode`.
    ///
    /// Returns `None` when nothing must be written (unusable request line or
    /// [`ServerMode::DropConnection`]). The `bool` is `true` for successful GET
    /// responses, which are the ones counted towards the bandwidth total.
    fn build_response(
        mode: ServerMode,
        pdf_data: &[u8],
        request: &str,
    ) -> Option<(Vec<u8>, bool)> {
        let mut lines = request.lines();
        let mut request_line = lines.next()?.split_whitespace();
        let method = request_line.next()?;
        // A request line without a target is ignored.
        request_line.next()?;

        let total = pdf_data.len();
        let response = match (method, mode) {
            (_, ServerMode::DropConnection) => return None,
            ("HEAD", ServerMode::Normal) => format!(
                "HTTP/1.1 200 OK\r\nContent-Length: {}\r\nAccept-Ranges: bytes\r\nContent-Type: application/pdf\r\n\r\n",
                total
            )
            .into_bytes(),
            ("HEAD", ServerMode::NoContentLength) => {
                b"HTTP/1.1 200 OK\r\nAccept-Ranges: bytes\r\nContent-Type: application/pdf\r\n\r\n"
                    .to_vec()
            }
            ("HEAD", ServerMode::MethodNotAllowed) => {
                b"HTTP/1.1 405 Method Not Allowed\r\nAllow: GET\r\nContent-Length: 0\r\n\r\n"
                    .to_vec()
            }
            ("HEAD", ServerMode::Unauthorized) | ("GET", ServerMode::Unauthorized) => {
                b"HTTP/1.1 401 Unauthorized\r\nContent-Length: 0\r\n\r\n".to_vec()
            }
            ("HEAD", ServerMode::NoRangeSupport) => format!(
                "HTTP/1.1 200 OK\r\nContent-Length: {}\r\nAccept-Ranges: none\r\nContent-Type: application/pdf\r\n\r\n",
                total
            )
            .into_bytes(),
            // These modes only alter the HEAD probe; GET must keep working so
            // that the client's fallback paths can be exercised.
            ("GET", ServerMode::Normal)
            | ("GET", ServerMode::MethodNotAllowed)
            | ("GET", ServerMode::NoContentLength) => {
                // Header names are case-insensitive.
                let range_value = lines
                    .take_while(|line| !line.is_empty())
                    .filter_map(|line| line.split_once(':'))
                    .find(|(name, _)| name.trim().eq_ignore_ascii_case("range"))
                    .map(|(_, value)| value.trim());
                return Some(Self::ranged_get_response(pdf_data, range_value));
            }
            ("GET", ServerMode::NoRangeSupport) => {
                // Always return 200 OK, ignore Range header (fallback path)
                let mut full =
                    format!("HTTP/1.1 200 OK\r\nContent-Length: {}\r\n\r\n", total).into_bytes();
                full.extend_from_slice(pdf_data);
                return Some((full, true));
            }
            _ => b"HTTP/1.1 400 Bad Request\r\nContent-Length: 0\r\n\r\n".to_vec(),
        };
        Some((response, false))
    }

    /// Response to a GET in a range-capable mode.
    ///
    /// Without a `Range` header the whole document is sent with 200; with a
    /// satisfiable range the slice is sent with 206; otherwise 416 is sent
    /// (and not counted towards the bandwidth total).
    fn ranged_get_response(pdf_data: &[u8], range_value: Option<&str>) -> (Vec<u8>, bool) {
        let total = pdf_data.len();
        let value = match range_value {
            Some(value) => value,
            None => {
                let mut full = format!(
                    "HTTP/1.1 200 OK\r\nContent-Length: {}\r\nAccept-Ranges: bytes\r\n\r\n",
                    total
                )
                .into_bytes();
                full.extend_from_slice(pdf_data);
                return (full, true);
            }
        };

        match Self::resolve_byte_range(value, total) {
            Some((start, end)) => {
                let body = &pdf_data[start..=end];
                let mut partial = format!(
                    "HTTP/1.1 206 Partial Content\r\nContent-Range: bytes {}-{}/{}\r\nContent-Length: {}\r\nAccept-Ranges: bytes\r\n\r\n",
                    start,
                    end,
                    total,
                    body.len()
                )
                .into_bytes();
                partial.extend_from_slice(body);
                (partial, true)
            }
            None => (
                format!(
                    "HTTP/1.1 416 Range Not Satisfiable\r\nContent-Range: bytes */{}\r\nContent-Length: 0\r\n\r\n",
                    total
                )
                .into_bytes(),
                false,
            ),
        }
    }

    /// Resolves a `Range` header value against a document of `total_len` bytes.
    ///
    /// Supports `bytes=START-END`, `bytes=START-` and the suffix form
    /// `bytes=-COUNT`. Returns the inclusive `(start, end)` byte positions, with
    /// `end` clamped to the last byte, or `None` when the value is malformed,
    /// names several ranges, or cannot be satisfied.
    fn resolve_byte_range(header_value: &str, total_len: usize) -> Option<(usize, usize)> {
        let spec = header_value.trim().strip_prefix("bytes=")?;
        if total_len == 0 || spec.contains(',') {
            return None;
        }
        let last = total_len - 1;
        let (first, second) = spec.split_once('-')?;
        match (first.trim(), second.trim()) {
            ("", "") => None,
            ("", suffix) => {
                // Suffix form: the final `count` bytes of the document.
                let count: usize = suffix.parse().ok()?;
                if count == 0 {
                    None
                } else {
                    Some((total_len.saturating_sub(count), last))
                }
            }
            (start, "") => {
                let start: usize = start.parse().ok()?;
                if start > last {
                    None
                } else {
                    Some((start, last))
                }
            }
            (start, end) => {
                let start: usize = start.parse().ok()?;
                let end: usize = end.parse().ok()?;
                if start > end || start > last {
                    None
                } else {
                    Some((start, end.min(last)))
                }
            }
        }
    }
}
/// Create a multi-page PDF with `page_count` pages.
///
/// Each page gets its own uncompressed content stream made of one text block
/// repeated 100 times (roughly 19 KB per page), so that reading a single page
/// is clearly distinguishable from reading the whole file in bandwidth tests.
///
/// Object numbering: 1 = catalog, 2 = page tree, 3 = font, then `page_count`
/// page objects followed by `page_count` content streams. Objects are written
/// in object-number order and the cross-reference table records the real byte
/// offset of each one.
fn create_multipage_pdf(page_count: usize) -> Vec<u8> {
    const FONT_OBJ: usize = 3;
    const FIRST_PAGE_OBJ: usize = 4;
    let first_content_obj = FIRST_PAGE_OBJ + page_count;
    // Number of xref entries, including the free entry for object 0.
    let object_count = first_content_obj + page_count;

    // Page content (repeated for each page)
    let page_content = "BT /F1 12 Tf 50 700 Td (Page content line 1) Tj 0 -14 Td (Page content line 2) Tj 0 -14 Td (Page content line 3) Tj 0 -14 Td (Page content line 4) Tj 0 -14 Td (Page content line 5) Tj ET\n";
    let repeated_content = page_content.repeat(100);

    let mut pdf = String::with_capacity(page_count * (repeated_content.len() + 512) + 1024);
    // Byte offset of every in-use object, indexed by `object number - 1`.
    let mut offsets: Vec<usize> = Vec::with_capacity(object_count - 1);

    // Header
    pdf.push_str("%PDF-1.4\n");

    // Object 1: catalog
    offsets.push(pdf.len());
    pdf.push_str("1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");

    // Object 2: page tree with the Kids array
    offsets.push(pdf.len());
    pdf.push_str("2 0 obj\n<< /Type /Pages /Kids [ ");
    for i in 0..page_count {
        pdf.push_str(&format!("{} 0 R ", FIRST_PAGE_OBJ + i));
    }
    pdf.push_str(&format!("] /Count {} >>\nendobj\n", page_count));

    // Object 3: the font shared by all pages. It has its own number so it
    // cannot collide with a page object.
    offsets.push(pdf.len());
    pdf.push_str(&format!(
        "{} 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n",
        FONT_OBJ
    ));

    // Page objects
    for i in 0..page_count {
        offsets.push(pdf.len());
        pdf.push_str(&format!(
            "{} 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents {} 0 R /Resources << /Font << /F1 {} 0 R >> >> >>\nendobj\n",
            FIRST_PAGE_OBJ + i,
            first_content_obj + i,
            FONT_OBJ
        ));
    }

    // Content streams
    for i in 0..page_count {
        offsets.push(pdf.len());
        pdf.push_str(&format!(
            "{} 0 obj\n<< /Length {} >>\nstream\n{}\nendstream\nendobj\n",
            first_content_obj + i,
            repeated_content.len(),
            repeated_content
        ));
    }

    // Xref table: one 20-byte entry per object, starting with the free entry.
    let xref_offset = pdf.len();
    pdf.push_str(&format!("xref\n0 {}\n", object_count));
    pdf.push_str("0000000000 65535 f \n");
    for offset in &offsets {
        pdf.push_str(&format!("{:010} 00000 n \n", offset));
    }

    // Trailer
    pdf.push_str("trailer\n");
    pdf.push_str(&format!("<< /Size {} /Root 1 0 R >>\n", object_count));
    pdf.push_str(&format!("startxref\n{}\n", xref_offset));
    pdf.push_str("%%EOF\n");
    pdf.into_bytes()
}
/// Create a minimal valid PDF for basic tests.
///
/// Returns the bytes of a hand-written PDF 1.4 document consisting of a
/// catalog (object 1), a page tree with a single kid (object 2), one page
/// (object 3), a Helvetica Type1 font (object 4) and a content stream that
/// draws "Hello World" (object 5), followed by a classic cross-reference
/// table and trailer.
///
/// NOTE(review): the xref offsets and the `startxref` value are hard-coded in
/// the literal below rather than computed — confirm they match the actual
/// byte positions whenever the literal is edited.
fn create_minimal_pdf() -> Vec<u8> {
// The literal is kept flush-left: any leading whitespace would become part of
// the document bytes.
let pdf = b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [ 3 0 R ] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << /Font << /F1 4 0 R >> >> /Contents 5 0 R >>
endobj
4 0 obj
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
endobj
5 0 obj
<< /Length 44 >>
stream
BT /F1 12 Tf 100 700 Td (Hello World) Tj ET
endstream
endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000268 00000 n
0000000345 00000 n
trailer
<< /Size 6 /Root 1 0 R >>
startxref
439
%%EOF
";
// Copy the static literal into an owned buffer for the caller.
pdf.to_vec()
}
/// Test 1: Basic HEAD probe captures metadata.
///
/// Opens the minimal PDF through the test server and checks that the length
/// reported by the remote source equals the size of the served document.
#[test]
fn test_head_probe_captures_metadata() {
    let pdf_data = create_minimal_pdf();
    // Take the expectation from the fixture itself instead of a hard-coded
    // byte count, so editing the fixture cannot silently invalidate the test.
    let expected_len = pdf_data.len() as u64;
    let (server, url) = BandwidthTrackingServer::bind(pdf_data).unwrap();
    thread::spawn(move || {
        let _ = server.serve();
    });
    thread::sleep(Duration::from_millis(100));

    let opts = RemoteOpts::new();
    let result = open_remote(&url, &opts, None);
    // The source should be created successfully
    assert!(result.is_ok());
    let source = result.unwrap();
    // The reported length must match the Content-Length the server advertised.
    assert_eq!(source.len(), expected_len);
}
/// Test 2: a 405 answer to the HEAD probe is followed by a GET fallback.
#[test]
fn test_405_fallback_to_get_probe() {
    let fixture = create_minimal_pdf();
    let (mut http_server, pdf_url) = BandwidthTrackingServer::bind(fixture).unwrap();
    // Select the scenario first, then hand the server over to its thread.
    http_server.set_mode(ServerMode::MethodNotAllowed);
    thread::spawn(move || {
        let _ = http_server.serve();
    });
    thread::sleep(Duration::from_millis(100));

    let remote_opts = RemoteOpts::new();
    let opened = open_remote(&pdf_url, &remote_opts, None);
    // Opening is expected to succeed through the GET fallback.
    assert!(opened.is_ok());
}
/// Test 3: a 401 answer surfaces as a permission error.
#[test]
fn test_unauthorized_returns_error() {
    let fixture = create_minimal_pdf();
    let (mut http_server, pdf_url) = BandwidthTrackingServer::bind(fixture).unwrap();
    http_server.set_mode(ServerMode::Unauthorized);
    thread::spawn(move || {
        let _ = http_server.serve();
    });
    thread::sleep(Duration::from_millis(100));

    let remote_opts = RemoteOpts::new();
    let opened = open_remote(&pdf_url, &remote_opts, None);
    // Opening must fail, and the failure must be reported as PermissionDenied.
    assert!(opened.is_err());
    if let Some(err) = opened.err() {
        assert_eq!(err.kind(), io::ErrorKind::PermissionDenied);
    }
}
/// Test 4: a HEAD answer without Content-Length is tolerated.
#[test]
fn test_no_content_length_handled() {
    let fixture = create_minimal_pdf();
    let (mut http_server, pdf_url) = BandwidthTrackingServer::bind(fixture).unwrap();
    http_server.set_mode(ServerMode::NoContentLength);
    thread::spawn(move || {
        let _ = http_server.serve();
    });
    thread::sleep(Duration::from_millis(100));

    let remote_opts = RemoteOpts::new();
    let opened = open_remote(&pdf_url, &remote_opts, None);
    // Content-Length is optional, so opening should still work.
    assert!(opened.is_ok());
}
/// Test 5: a server advertising `Accept-Ranges: none` is detected.
#[test]
fn test_no_range_support_detected() {
    let fixture = create_minimal_pdf();
    let (mut http_server, pdf_url) = BandwidthTrackingServer::bind(fixture).unwrap();
    http_server.set_mode(ServerMode::NoRangeSupport);
    thread::spawn(move || {
        let _ = http_server.serve();
    });
    thread::sleep(Duration::from_millis(100));

    let remote_opts = RemoteOpts::new();
    let opened = open_remote(&pdf_url, &remote_opts, None);
    // Opening works; it is the ranged reads that are expected to be refused.
    assert!(opened.is_ok());
    let source = opened.unwrap();

    // A ranged read must be rejected as Unsupported.
    let first_read = source.read_range(0, 100);
    assert!(first_read.is_err());
    if let Some(err) = first_read.err() {
        assert_eq!(err.kind(), io::ErrorKind::Unsupported);
    }
}
/// Test 6: Bandwidth test for partial page extraction.
///
/// Acceptance scenario: from a 500-page PDF only pages 47-52 are extracted and
/// less than 5 MB may be transferred. At present the test only opens the
/// source and fetches the 16 KB tail needed to locate the xref.
#[test]
#[ignore = "Requires real HTTP server timing; bandwidth measurement is approximate"]
fn test_bandwidth_partial_extraction() {
    let page_count = 500;
    let fixture = create_multipage_pdf(page_count);
    let (http_server, pdf_url) = BandwidthTrackingServer::bind(fixture).unwrap();
    thread::spawn(move || {
        let _ = http_server.serve();
    });
    thread::sleep(Duration::from_millis(100));

    let remote_opts = RemoteOpts::new();
    let opened = open_remote(&pdf_url, &remote_opts, None);
    assert!(opened.is_ok());
    // Full page extraction needs more setup; for now only the source is used.
    let source = opened.unwrap();

    // The tail must be readable so the xref can be located.
    let tail_size = 16 * 1024;
    let tail_offset = source.len().saturating_sub(tail_size as u64);
    let tail = source.read_range(tail_offset, tail_size);
    assert!(tail.is_ok());

    // Expected transfer once pages 47-52 are actually extracted:
    // - HEAD response: ~100 bytes
    // - Tail fetch: ~16 KB
    // - 6 pages of content streams
    // which stays far below the 5 MB limit.
}
/// Test 7: Page-by-page on-demand fetch.
///
/// Opens a 10-page document and reads only the tail, as a first step towards
/// fetching individual pages on demand.
#[test]
fn test_page_by_page_on_demand_fetch() {
    let page_count = 10;
    let fixture = create_multipage_pdf(page_count);
    let (http_server, pdf_url) = BandwidthTrackingServer::bind(fixture).unwrap();
    thread::spawn(move || {
        let _ = http_server.serve();
    });
    thread::sleep(Duration::from_millis(100));

    let remote_opts = RemoteOpts::new();
    let opened = open_remote(&pdf_url, &remote_opts, None);
    assert!(opened.is_ok());
    let source = opened.unwrap();

    // Fetch the tail that contains startxref.
    let tail_offset = source.len() - 16384;
    let tail = source.read_range(tail_offset, 16384);
    assert!(tail.is_ok());

    // Reading the content of page 5 alone should need only:
    // 1. the HEAD probe (already done by open_remote)
    // 2. the tail fetch
    // 3. one fetch for the content stream of page 5
    // TODO: measure bandwidth; the server has been moved into its thread, so
    // its counters need to be shared (Arc clone or channel) before that works.
}
/// Test 8: Progressive tail fetch when startxref points before initial tail.
///
/// For now this only verifies that the initial tail read works on a document
/// that is smaller than the tail window itself.
#[test]
fn test_progressive_tail_fetch() {
    let pdf_data = create_minimal_pdf();
    let (server, url) = BandwidthTrackingServer::bind(pdf_data).unwrap();
    thread::spawn(move || {
        let _ = server.serve();
    });
    thread::sleep(Duration::from_millis(100));

    let opts = RemoteOpts::new();
    let result = open_remote(&url, &opts, None);
    assert!(result.is_ok());
    let source = result.unwrap();

    // The minimal PDF is shorter than the 16 KB tail window, so a plain
    // `len - 16384` underflows (and panics in debug builds). Saturate at the
    // start of the file instead, as the bandwidth test does.
    let tail_size = 16384;
    let tail_offset = source.len().saturating_sub(tail_size as u64);
    let tail_result = source.read_range(tail_offset, tail_size);
    assert!(tail_result.is_ok());
}
/// Test 9: opening works when custom request headers are configured.
#[test]
fn test_custom_headers() {
    let fixture = create_minimal_pdf();
    let (http_server, pdf_url) = BandwidthTrackingServer::bind(fixture).unwrap();
    thread::spawn(move || {
        let _ = http_server.serve();
    });
    thread::sleep(Duration::from_millis(100));

    let with_auth = RemoteOpts::new().with_header("Authorization", "Bearer test-token");
    let remote_opts = with_auth.with_header("X-API-Key", "test-key");
    let opened = open_remote(&pdf_url, &remote_opts, None);
    // The extra headers must not prevent the source from opening.
    assert!(opened.is_ok());
}
/// Test 10: opening works when basic-auth credentials are configured.
#[test]
fn test_basic_authentication() {
    let fixture = create_minimal_pdf();
    let (http_server, pdf_url) = BandwidthTrackingServer::bind(fixture).unwrap();
    thread::spawn(move || {
        let _ = http_server.serve();
    });
    thread::sleep(Duration::from_millis(100));

    let remote_opts = RemoteOpts::new().with_credentials("testuser", "testpass");
    let opened = open_remote(&pdf_url, &remote_opts, None);
    // Supplying credentials must not prevent the source from opening.
    assert!(opened.is_ok());
}
/// Test 11: Verify forward-scan is disabled for remote sources.
///
/// Runs `forward_scan_xref` against a source that reports itself as remote and
/// expects an empty xref section plus the `XrefRemoteNoForwardScan` diagnostic.
#[test]
fn test_forward_scan_disabled_remote() {
    use pdftract_core::diagnostics::DiagCode;
    use pdftract_core::parser::stream::PdfSource;
    use pdftract_core::parser::xref::{forward_scan_xref, XrefSection};

    /// Stand-in source that claims to be remote and never yields any bytes.
    struct MockRemote {
        data: Vec<u8>,
    }

    impl PdfSource for MockRemote {
        fn len(&self) -> io::Result<u64> {
            Ok(self.data.len() as u64)
        }

        fn read_at(&self, _offset: u64, _length: usize) -> io::Result<Vec<u8>> {
            Ok(Vec::new())
        }

        fn is_remote(&self) -> bool {
            true
        }
    }

    let remote_source = MockRemote {
        data: create_minimal_pdf(),
    };
    let scan = forward_scan_xref(&remote_source, false);

    // No entries may be produced for a remote source.
    assert!(scan.entries.is_empty());

    // The skipped scan has to be reported through a diagnostic.
    let flagged = scan
        .diagnostics
        .iter()
        .any(|diag| matches!(diag.code, DiagCode::XrefRemoteNoForwardScan));
    assert!(flagged);
}
/// Test 12: Connection reuse (keep-alive).
///
/// Issues several consecutive ranged reads through one source and requires
/// every one of them to succeed.
#[test]
fn test_connection_reuse() {
    // HttpRangeSource uses ureq Agent which maintains a connection pool
    let pdf_data = create_minimal_pdf();
    let (server, url) = BandwidthTrackingServer::bind(pdf_data).unwrap();
    thread::spawn(move || {
        let _ = server.serve();
    });
    thread::sleep(Duration::from_millis(100));

    let opts = RemoteOpts::new();
    let result = open_remote(&url, &opts, None);
    assert!(result.is_ok());
    let source = result.unwrap();

    // All reads should succeed. The results used to be discarded, which let
    // this test pass even when every read failed.
    assert!(source.read_range(0, 100).is_ok());
    assert!(source.read_range(100, 100).is_ok());
    assert!(source.read_range(200, 100).is_ok());
}
/// Test 13: a prefetch hint is accepted and a later read still works.
#[test]
fn test_prefetch_hint() {
    let fixture = create_minimal_pdf();
    let (http_server, pdf_url) = BandwidthTrackingServer::bind(fixture).unwrap();
    thread::spawn(move || {
        let _ = http_server.serve();
    });
    thread::sleep(Duration::from_millis(100));

    let remote_opts = RemoteOpts::new();
    let opened = open_remote(&pdf_url, &remote_opts, None);
    assert!(opened.is_ok());
    let source = opened.unwrap();

    // Prefetching is only a hint; the call must simply not panic.
    source.prefetch(0, 16384);

    // A read inside the hinted region has to succeed afterwards.
    let after_hint = source.read_range(0, 100);
    assert!(after_hint.is_ok());
}
/// Test 14: repeated and overlapping reads run without panicking.
#[test]
fn test_cache_hit_on_repeated_read() {
    let fixture = create_minimal_pdf();
    let (http_server, pdf_url) = BandwidthTrackingServer::bind(fixture).unwrap();
    thread::spawn(move || {
        let _ = http_server.serve();
    });
    thread::sleep(Duration::from_millis(100));

    let remote_opts = RemoteOpts::new();
    let opened = open_remote(&pdf_url, &remote_opts, None);
    assert!(opened.is_ok());
    let source = opened.unwrap();

    // Same range twice (the second is meant to come from the cache), then a
    // range overlapping the first (meant to be a partial cache hit). The
    // outcomes are deliberately not inspected.
    for (offset, length) in [(0, 1000), (0, 1000), (500, 1000)] {
        let _ = source.read_range(offset, length);
    }
}
/// Test 15: Block boundary handling.
///
/// Reads a range that starts 1000 bytes before the end of the first 64 KB
/// block and ends 1000 bytes into the second one.
#[test]
fn test_block_boundary_handling() {
    const BLOCK_SIZE: u64 = 65536;

    // The minimal PDF is far smaller than one block, so a read around the
    // 64 KB mark would lie entirely past its end. Use a document that really
    // spans the boundary.
    let pdf_data = create_multipage_pdf(10);
    assert!(
        pdf_data.len() as u64 >= BLOCK_SIZE + 1000,
        "fixture must extend past the first block boundary"
    );
    let (server, url) = BandwidthTrackingServer::bind(pdf_data).unwrap();
    thread::spawn(move || {
        let _ = server.serve();
    });
    thread::sleep(Duration::from_millis(100));

    let opts = RemoteOpts::new();
    let result = open_remote(&url, &opts, None);
    assert!(result.is_ok());
    let source = result.unwrap();

    // Start near end of block 0, read into block 1
    let offset = BLOCK_SIZE - 1000;
    let length = 2000;
    let result = source.read_range(offset, length);
    assert!(result.is_ok());
}
/// Test 16: INV-8 - a failing open reports an error instead of panicking.
#[test]
fn test_inv8_no_panic_on_errors() {
    let attempt = std::panic::catch_unwind(|| {
        pdftract_core::source::HttpRangeSource::open("http://localhost:9999/test.pdf")
    });
    // The call itself must return normally ...
    assert!(attempt.is_ok());
    // ... and what it returns must be an error.
    let open_result = attempt.unwrap();
    assert!(open_result.is_err());
}