pdftract/crates/pdftract-core/tests/remote_integration.rs
jedarden 432514d350 wip: AcroForm improvements, debug tooling, test corpus, and fixture updates
Collects in-progress work across forms (Ch/Tx field handling, value_text
edge cases), layout corrections, stream parser fixes, conformance test
expansion, security audit test (TH-08), stream-decoder bomb fixture,
debug examples reorganization under examples/debug/, sdk module scaffold,
xtask CLI enhancements, and provenance entries for new fixtures.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-30 09:48:14 -04:00

896 lines
33 KiB
Rust

//! Integration tests for remote HTTP PDF fetching.
//!
//! These tests use wiremock to simulate HTTP servers with various behaviors:
//! - Range request support
//! - No Range support (returns 200 for Range requests)
//! - 416 Range Not Satisfiable responses
//! - Connection drops mid-stream
//! - TLS handshake failures
//! - Linearized PDFs with hint streams
//!
//! Run with: `cargo test --features remote -p pdftract-core -- remote`
#![cfg(feature = "remote")]
use std::fs;
use std::io::{self, Read};
use std::path::PathBuf;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Duration;
use pdftract_core::source::{HttpRangeSource, PdfSource};
use wiremock::{matchers, Mock, MockServer, ResponseTemplate};
use wiremock::Request as WiremockRequest;
/// Pair of shared tallies: body bytes served and requests handled.
///
/// Each tally lives behind an `Arc` so a test can hand a clone to a responder
/// and read the value back once the exchange is over.
pub struct ByteCounter {
    total: Arc<AtomicU64>,
    request_count: Arc<AtomicU64>,
}

impl ByteCounter {
    /// Builds a counter whose two tallies both start at zero.
    fn new() -> Self {
        let fresh_tally = || Arc::new(AtomicU64::new(0));
        Self {
            total: fresh_tally(),
            request_count: fresh_tally(),
        }
    }

    /// Reads the byte tally.
    fn total(&self) -> u64 {
        let bytes: &AtomicU64 = &self.total;
        bytes.load(Ordering::SeqCst)
    }

    /// Reads the request tally.
    fn request_count(&self) -> u64 {
        let requests: &AtomicU64 = &self.request_count;
        requests.load(Ordering::SeqCst)
    }
}
/// Configuration for a responder that serves a fixed payload and tallies traffic.
#[derive(Clone)]
struct ByteCountingResponder {
    /// Payload handed out to clients.
    data: Vec<u8>,
    /// Tally of body bytes served.
    counter: Arc<AtomicU64>,
    /// Tally of requests handled.
    request_counter: Arc<AtomicU64>,
    /// Base status code; `new` sets it to 200 and no builder changes it.
    status: u16,
    /// Whether `Range` requests are honoured.
    supports_range: bool,
    /// When set, the responder answers 416 once (used by the 416-retry tests).
    force_416_first: bool,
}

impl ByteCountingResponder {
    /// Starts from a Range-capable, status-200 responder with private, zeroed tallies.
    fn new(data: Vec<u8>) -> Self {
        let counter = Arc::new(AtomicU64::new(0));
        let request_counter = Arc::new(AtomicU64::new(0));
        Self {
            data,
            counter,
            request_counter,
            status: 200,
            supports_range: true,
            force_416_first: false,
        }
    }

    /// Chooses whether `Range` requests are honoured.
    fn with_supports_range(self, supports: bool) -> Self {
        Self {
            supports_range: supports,
            ..self
        }
    }

    /// Swaps in an externally owned byte tally.
    fn with_counter(self, counter: Arc<AtomicU64>) -> Self {
        Self { counter, ..self }
    }

    /// Swaps in an externally owned request tally.
    fn with_request_counter(self, counter: Arc<AtomicU64>) -> Self {
        Self {
            request_counter: counter,
            ..self
        }
    }

    /// Turns on the one-off 416 reply.
    fn with_force_416_first(self) -> Self {
        Self {
            force_416_first: true,
            ..self
        }
    }
}
impl wiremock::Respond for ByteCountingResponder {
fn respond(&self, request: &WiremockRequest) -> wiremock::Response {
let request_num = self.request_counter.fetch_add(1, Ordering::SeqCst);
let mut response = ResponseTemplate::new(self.status);
// Add Accept-Ranges header if Range is supported
if self.supports_range {
response = response.append_header("Accept-Ranges", "bytes");
response = response.append_header("Content-Length", self.data.len().to_string());
}
// Handle Range requests
let range_header = request.headers.get("range").and_then(|v| v.to_str().ok());
if let Some(range_str) = range_header {
if !self.supports_range {
// Server doesn't support Range - return full content with 200
self.counter.fetch_add(self.data.len() as u64, Ordering::SeqCst);
return response
.set_body_bytes(self.data.clone())
.set_status(200);
}
// Test 416 behavior on first Range request if configured
if self.force_416_first && request_num == 0 {
response = response
.append_header("Content-Range", format!("bytes */{}", self.data.len()))
.append_header("Accept-Ranges", "bytes");
return response.set_status(416);
}
// Parse Range header: "bytes=START-END"
if let Some(range_part) = range_str.strip_prefix("bytes=") {
let parts: Vec<&str> = range_part.split('-').collect();
if parts.len() == 2 {
if let (Ok(start), Ok(end)) = (parts[0].parse::<u64>(), parts[1].parse::<u64>()) {
let data_len = self.data.len() as u64;
// Check if range is satisfiable
if start >= data_len {
// Return 416 Range Not Satisfiable
response = response
.append_header("Content-Range", format!("bytes */{}", data_len))
.set_status(416);
} else {
let end = end.min(data_len - 1);
let slice_start = start as usize;
let slice_end = (end + 1) as usize;
let slice_data = self.data[slice_start..slice_end.min(self.data.len())].to_vec();
self.counter.fetch_add(slice_data.len() as u64, Ordering::SeqCst);
response = response
.append_header("Content-Range", format!("bytes {}-{}/{}", start, end, data_len))
.append_header("Content-Length", slice_data.len().to_string())
.set_body_bytes(slice_data)
.set_status(206);
}
return response.into();
}
}
}
}
// No Range header or parsing failed - return full content
self.counter.fetch_add(self.data.len() as u64, Ordering::SeqCst);
response.set_body_bytes(self.data.clone()).into()
}
}
/// Reads `<name>.pdf` from the fixture directories.
///
/// `tests/remote/fixtures` is consulted first and its file is accepted only if
/// it begins with `%PDF`; otherwise `tests/fixtures` is read with no header
/// check.
///
/// # Panics
///
/// Panics when the fallback file in `tests/fixtures` cannot be read.
fn load_fixture(name: &str) -> Vec<u8> {
    let file_name = format!("{}.pdf", name);

    let remote_candidate = PathBuf::from("tests/remote/fixtures").join(&file_name);
    match fs::read(&remote_candidate) {
        // Accept the remote-specific copy only when it looks like a PDF.
        Ok(bytes) if bytes.starts_with(b"%PDF") => bytes,
        _ => {
            let shared_candidate = PathBuf::from("tests/fixtures").join(&file_name);
            match fs::read(&shared_candidate) {
                Ok(bytes) => bytes,
                Err(e) => panic!(
                    "Failed to load fixture {}: {}. Use existing PDFs from tests/fixtures/ as basis.",
                    name, e
                ),
            }
        }
    }
}
/// Reads `filename` verbatim from `tests/remote/fixtures`.
///
/// # Panics
///
/// Panics when the file cannot be read.
fn load_fixture_file(filename: &str) -> Vec<u8> {
    let location = PathBuf::from("tests/remote/fixtures").join(filename);
    match fs::read(&location) {
        Ok(contents) => contents,
        Err(e) => panic!(
            "Failed to load fixture file {}: {}. Ensure the file exists in tests/remote/fixtures/.",
            filename, e
        ),
    }
}
/// Fails the calling test when the counter's byte tally exceeds `max_bytes`.
///
/// # Panics
///
/// Panics with the observed and permitted totals when the tally is too high.
fn assert_bytes_transferred(counter: &ByteCounter, max_bytes: u64) {
    let observed = counter.total();
    if observed > max_bytes {
        panic!(
            "Transferred {} bytes, expected <= {} bytes",
            observed, max_bytes
        );
    }
}
/// Test 1: ranged read against a server that honours `Range`.
///
/// Plan Section 1.8 calls for "extract page 5 of a 100-page PDF, < 100 KB
/// transferred"; here that is approximated with a single 4 KiB ranged read
/// against the `valid-minimal` fixture.
#[tokio::test(flavor = "multi_thread")]
async fn test_range_request_partial_extraction() {
    let server = MockServer::start().await;
    let fixture = load_fixture("valid-minimal");
    let fixture_len = fixture.len();
    let tally = ByteCounter::new();

    // HEAD advertises Range support and the payload size.
    let head_reply = ResponseTemplate::new(200)
        .append_header("Accept-Ranges", "bytes")
        .append_header("Content-Length", fixture_len.to_string());
    Mock::given(matchers::method("HEAD"))
        .respond_with(head_reply)
        .mount(&server)
        .await;

    // GET goes through the counting responder.
    let get_reply = ByteCountingResponder::new(fixture.clone())
        .with_supports_range(true)
        .with_counter(tally.total.clone())
        .with_request_counter(tally.request_count.clone());
    Mock::given(matchers::method("GET"))
        .respond_with(get_reply)
        .named("pdf-get")
        .mount(&server)
        .await;

    let url = format!("{}/test.pdf", server.uri());
    let source = HttpRangeSource::open(&url).expect("Failed to open remote PDF");

    assert!(source.supports_range(), "Server should support Range");
    assert_eq!(source.len(), fixture_len as u64);

    // NOTE(review): the slice comparison below presumes the fixture holds at
    // least 5096 bytes; a smaller file would panic here — confirm.
    let start = 1000;
    let span = 4096;
    let chunk = source.read_range(start, span).expect("Failed to read range");
    assert_eq!(chunk.len(), span);
    assert_eq!(&chunk[..], &fixture[start..start + span]);

    // The budget from the plan: stay under 100 KB on the wire.
    assert_bytes_transferred(&tally, 100_000);

    let gets_seen = tally.request_count();
    assert!(gets_seen >= 1, "Expected at least 1 request");
}
/// Test 2: server that does not honour `Range`.
///
/// Plan Section 1.8: mock server without Range, fallback to full download
/// with a documented warning.
#[tokio::test(flavor = "multi_thread")]
async fn test_no_range_support_fallback() {
    let server = MockServer::start().await;
    let fixture = load_fixture("valid-minimal");
    let tally = ByteCounter::new();

    // HEAD deliberately omits Accept-Ranges.
    let head_reply = ResponseTemplate::new(200)
        .append_header("Content-Length", fixture.len().to_string());
    Mock::given(matchers::method("HEAD"))
        .respond_with(head_reply)
        .mount(&server)
        .await;

    // GET answers any Range request with the full body and status 200.
    let get_reply = ByteCountingResponder::new(fixture.clone())
        .with_supports_range(false)
        .with_counter(tally.total.clone())
        .with_request_counter(tally.request_count.clone());
    Mock::given(matchers::method("GET"))
        .respond_with(get_reply)
        .named("pdf-get-no-range")
        .mount(&server)
        .await;

    let url = format!("{}/test.pdf", server.uri());
    let source = HttpRangeSource::open(&url).expect("Failed to open remote PDF");

    assert!(!source.supports_range(), "Server should NOT support Range");

    // A ranged read must be rejected as unsupported.
    let outcome = source.read_range(1000, 4096);
    assert!(outcome.is_err());
    let failure = outcome.unwrap_err();
    assert_eq!(failure.kind(), io::ErrorKind::Unsupported);
    assert!(failure.to_string().contains("Server does not support Range"));

    // NOTE(review): this expects exactly one full-body GET to have been
    // served, while the error above suggests the ranged read was refused —
    // confirm which request is meant to transfer these bytes.
    let expected_bytes = fixture.len() as u64;
    assert_eq!(tally.total(), expected_bytes);
}
/// Test 3: a 416 reply makes the client retry without `Range`.
///
/// Plan Section 1.8: mock server returning 416, emit diagnostic; retry
/// without Range.
#[tokio::test(flavor = "multi_thread")]
async fn test_416_range_not_satisfiable_retry() {
    let server = MockServer::start().await;
    let fixture = load_fixture("valid-minimal");
    let tally = ByteCounter::new();

    let head_reply = ResponseTemplate::new(200)
        .append_header("Accept-Ranges", "bytes")
        .append_header("Content-Length", fixture.len().to_string());
    Mock::given(matchers::method("HEAD"))
        .respond_with(head_reply)
        .mount(&server)
        .await;

    // The responder answers its first request with 416 when that request
    // carries a Range header.
    let get_reply = ByteCountingResponder::new(fixture.clone())
        .with_supports_range(true)
        .with_counter(tally.total.clone())
        .with_request_counter(tally.request_count.clone())
        .with_force_416_first();
    Mock::given(matchers::method("GET"))
        .respond_with(get_reply)
        .named("pdf-get-416-retry")
        .mount(&server)
        .await;

    let url = format!("{}/test.pdf", server.uri());
    // HEAD advertises Range support, so opening is expected to succeed.
    let source = HttpRangeSource::open(&url).expect("Failed to open remote PDF");

    // The ranged attempt hits the 416; the retry should deliver the bytes.
    let chunk = match source.read_range(1000, 4096) {
        Ok(bytes) => bytes,
        Err(_) => panic!("416 should trigger retry and succeed"),
    };
    assert_eq!(chunk.len(), 4096);
    assert_eq!(&chunk[..], &fixture[1000..1000 + 4096]);

    // One rejected ranged GET plus one retry.
    let gets_seen = tally.request_count();
    assert!(gets_seen >= 2, "Expected at least 2 requests (Range + retry)");
}
/// Test 4: Connection drop after trailer.
///
/// Critical test from plan Section 1.8: Connection drop after the trailer
/// is fetched, extraction emits REMOTE_FETCH_INTERRUPTED.
///
/// The GET mock always answers with at most the first 1 KiB of the fixture,
/// so a 4 KiB read cannot be satisfied.
#[tokio::test(flavor = "multi_thread")]
async fn test_connection_drop_after_trailer() {
    let mock_server = MockServer::start().await;
    let pdf_data = load_fixture("valid-minimal");

    // Serve HEAD normally.
    Mock::given(matchers::method("HEAD"))
        .respond_with(
            ResponseTemplate::new(200)
                .append_header("Accept-Ranges", "bytes")
                .append_header("Content-Length", pdf_data.len().to_string()),
        )
        .mount(&mock_server)
        .await;

    // A plain closure is used as the responder; no wrapper type is involved.
    // `pdf_data` is moved in here, after its last use above.
    let partial_responder = move |_request: &WiremockRequest| {
        // Return only the first 1 KiB (or less) of the payload.
        let partial_len = pdf_data.len().min(1024);
        let partial_data = &pdf_data[..partial_len];
        ResponseTemplate::new(206)
            .append_header("Accept-Ranges", "bytes")
            .append_header(
                "Content-Range",
                // saturating_sub keeps an empty fixture from underflowing.
                format!(
                    "bytes 0-{}/{}",
                    partial_len.saturating_sub(1),
                    pdf_data.len()
                ),
            )
            .append_header("Content-Length", partial_len.to_string())
            .set_body_bytes(partial_data.to_vec())
    };
    Mock::given(matchers::method("GET"))
        .respond_with(partial_responder)
        .mount(&mock_server)
        .await;

    let url = format!("{}/test.pdf", mock_server.uri());
    let source = HttpRangeSource::open(&url).expect("Failed to open remote PDF");

    // Ask for more than the server will ever deliver.
    // NOTE(review): Content-Length matches the body that is sent, so the HTTP
    // exchange itself completes; the error presumably has to come from the
    // client noticing the short body — confirm against HttpRangeSource.
    let result = source.read_range(0, 4096);
    assert!(result.is_err());
    let err = result.unwrap_err();
    // Should be an Interrupted error or similar connection error.
    assert!(matches!(
        err.kind(),
        io::ErrorKind::Interrupted | io::ErrorKind::UnexpectedEof
    ));
}
/// Test 5: TLS handshake failure.
///
/// Critical test from plan Section 1.8: TLS-handshake failure, clear error
/// message with the certificate-chain reason; exit code 6.
///
/// Note: This test is marked as ignore because wiremock doesn't easily
/// support custom TLS certificates. Manual verification required. The body
/// only generates a self-signed certificate and prints the manual steps; it
/// performs no network I/O and asserts nothing.
#[tokio::test(flavor = "multi_thread")]
#[ignore = "Manual test - requires real TLS server with bad cert"]
async fn test_tls_handshake_failure_self_signed() {
    use rcgen::{CertificateParams, DistinguishedName, DnType, KeyPair, SanType};

    // Describe a certificate for "localhost" (rcgen 0.13 API).
    let mut params = CertificateParams::default();
    params.distinguished_name = DistinguishedName::new();
    params
        .distinguished_name
        .push(DnType::CommonName, "localhost");
    // DNS SANs are IA5 strings in rcgen 0.13, hence the fallible conversion.
    params.subject_alt_names = vec![SanType::DnsName(
        "localhost"
            .try_into()
            .expect("\"localhost\" should be a valid DNS name"),
    )];

    // In rcgen 0.13 the key pair is generated separately and passed to
    // `self_signed`; `CertificateParams` no longer carries one.
    let key_pair = KeyPair::generate().expect("Failed to generate key pair");
    let cert = params
        .self_signed(&key_pair)
        .expect("Failed to generate certificate");
    let cert_pem = cert.pem();
    let key_pem = key_pair.serialize_pem();

    // Manual verification steps (documented here):
    // 1. Serve a PDF over HTTPS with self-signed cert
    // 2. Run: pdftract extract https://localhost:8443/test.pdf
    // 3. Expected: Exit code 6, stderr contains "TLS handshake failed"
    println!("TLS cert generated: {} bytes", cert_pem.len());
    println!("Key generated: {} bytes", key_pem.len());
    println!("Manual test required: serve PDF with self-signed cert and run pdftract against it");
    // For manual testing against known bad TLS servers:
    // pdftract extract https://expired.badssl.com/fake.pdf
    // Expected: Exit code 6
}
/// Test 6: Linearized PDF with hint stream prefetch.
///
/// Critical test from plan Section 1.8: Document with a linearized hint
/// stream, page-offset hints utilized to predict and prefetch.
///
/// At present this only checks that a basic ranged fetch works and that the
/// GET responder was reached at least once; the prefetch timeline itself is
/// not asserted yet (see the comments in the body).
#[tokio::test(flavor = "multi_thread")]
async fn test_linearized_hint_stream_prefetch() {
    use std::sync::Mutex;

    let mock_server = MockServer::start().await;
    let pdf_data = load_fixture("valid-minimal");
    // Captured before `pdf_data` moves into the responder closure, because the
    // HEAD mock below still needs the size.
    let pdf_len = pdf_data.len();

    // Timestamps of every GET the responder sees.
    let request_times = Arc::new(Mutex::new(Vec::new()));
    let request_times_clone = Arc::clone(&request_times);

    let tracking_responder = move |request: &WiremockRequest| {
        request_times_clone
            .lock()
            .unwrap()
            .push(std::time::Instant::now());

        let range_header = request.headers.get("range").and_then(|v| v.to_str().ok());
        if let Some(range_str) = range_header {
            println!("Range request at {:?}", std::time::Instant::now());
            println!("Range header: {}", range_str);

            // Only the closed form "bytes=START-END" is served as a range.
            let bounds = range_str
                .strip_prefix("bytes=")
                .and_then(|spec| spec.split_once('-'))
                .and_then(|(first, last)| {
                    Some((first.parse::<usize>().ok()?, last.parse::<usize>().ok()?))
                });
            if let Some((start, end)) = bounds {
                let end = end.min(pdf_data.len().saturating_sub(1));
                // Ranges that start past the payload (or are inverted) fall
                // through to the full-content reply rather than panicking on
                // the slice.
                if start <= end && end < pdf_data.len() {
                    let slice_data = &pdf_data[start..=end];
                    return ResponseTemplate::new(206)
                        .append_header(
                            "Content-Range",
                            format!("bytes {}-{}/{}", start, end, pdf_data.len()),
                        )
                        .append_header("Content-Length", slice_data.len().to_string())
                        .set_body_bytes(slice_data.to_vec());
                }
            }
        }

        // Fallback to full content.
        ResponseTemplate::new(200)
            .append_header("Accept-Ranges", "bytes")
            .append_header("Content-Length", pdf_data.len().to_string())
            .set_body_bytes(pdf_data.clone())
    };

    Mock::given(matchers::method("HEAD"))
        .respond_with(
            ResponseTemplate::new(200)
                .append_header("Accept-Ranges", "bytes")
                .append_header("Content-Length", pdf_len.to_string())
                .append_header("Content-Type", "application/pdf"),
        )
        .mount(&mock_server)
        .await;
    Mock::given(matchers::method("GET"))
        .respond_with(tracking_responder)
        .named("linearized-get")
        .mount(&mock_server)
        .await;

    let url = format!("{}/test.pdf", mock_server.uri());
    let source = HttpRangeSource::open(&url).expect("Failed to open remote PDF");
    assert!(source.supports_range(), "Server should support Range");

    // In a real linearized PDF, we would:
    // 1. Parse the hint stream to get page offsets
    // 2. Verify that prefetch() is called with page N+1 offsets before page N is fully consumed
    // 3. Check that the request timeline shows prefetch behavior
    // For now, we verify the basic fetch works.
    let data = source.read_range(0, 1024).expect("Failed to read range");
    assert_eq!(data.len(), 1024);

    let times = request_times.lock().unwrap();
    println!("Total requests made: {}", times.len());
    // In a real linearized PDF scenario, we'd see:
    // - Request 1: HEAD (metadata)
    // - Request 2: Tail (startxref, trailer)
    // - Request 3: Hint stream or linearized dictionary
    // - Request N: Prefetch for page 2 starts before page 1 is done
    assert!(!times.is_empty(), "At least one request should be made");
}
/// Test: caller-supplied headers (Authorization, API keys) reach the server.
///
/// Both mocks only match when `Authorization: Bearer test123` is present.
#[tokio::test(flavor = "multi_thread")]
async fn test_custom_headers() {
    use wiremock::matchers::header;

    let server = MockServer::start().await;
    let fixture = load_fixture("valid-minimal");
    let tally = ByteCounter::new();

    let head_reply = ResponseTemplate::new(200)
        .append_header("Accept-Ranges", "bytes")
        .append_header("Content-Length", fixture.len().to_string());
    Mock::given(matchers::method("HEAD"))
        .and(header("Authorization", "Bearer test123"))
        .respond_with(head_reply)
        .mount(&server)
        .await;

    let get_reply = ByteCountingResponder::new(fixture.clone())
        .with_supports_range(true)
        .with_counter(tally.total.clone());
    Mock::given(matchers::method("GET"))
        .and(header("Authorization", "Bearer test123"))
        .respond_with(get_reply)
        .mount(&server)
        .await;

    let url = format!("{}/test.pdf", server.uri());
    let auth_header = ("Authorization".to_string(), "Bearer test123".to_string());
    let source = HttpRangeSource::with_headers(&url, vec![auth_header])
        .expect("Failed to open remote PDF");

    let chunk = source.read_range(0, 1024).expect("Failed to read range");
    assert_eq!(chunk.len(), 1024);
}
/// Test: bandwidth use when reading a slice of a large file.
///
/// Reads 100 KB out of a synthetic 1 MB payload and checks that far less than
/// the whole payload crossed the wire.
#[tokio::test(flavor = "multi_thread")]
async fn test_bandwidth_efficiency() {
    const PAYLOAD_SIZE: usize = 1_000_000;

    let server = MockServer::start().await;

    // Repeat the minimal fixture until the payload reaches 1 MB, then trim.
    let seed = load_fixture("valid-minimal");
    let mut payload = Vec::new();
    while payload.len() < PAYLOAD_SIZE {
        payload.extend_from_slice(&seed);
    }
    payload.truncate(PAYLOAD_SIZE);

    let tally = ByteCounter::new();

    let head_reply = ResponseTemplate::new(200)
        .append_header("Accept-Ranges", "bytes")
        .append_header("Content-Length", payload.len().to_string());
    Mock::given(matchers::method("HEAD"))
        .respond_with(head_reply)
        .mount(&server)
        .await;

    let get_reply = ByteCountingResponder::new(payload.clone())
        .with_supports_range(true)
        .with_counter(tally.total.clone());
    Mock::given(matchers::method("GET"))
        .respond_with(get_reply)
        .mount(&server)
        .await;

    let url = format!("{}/large.pdf", server.uri());
    let source = HttpRangeSource::open(&url).expect("Failed to open remote PDF");

    // 100 KB starting 100 KB into the payload.
    let start = 100_000;
    let span = 100_000;
    let chunk = source.read_range(start, span).expect("Failed to read range");
    assert_eq!(chunk.len(), span);

    // Budget: the original author's estimate is roughly two 64 KB blocks for
    // this read, so 200 KB leaves headroom.
    assert_bytes_transferred(&tally, 200_000);
    let whole_payload = payload.len() as u64;
    assert!(tally.total() < whole_payload, "Should not transfer full file");
}
/// Test: Verify Range request count.
///
/// Verify that a repeated read of the same range is served from cache, i.e.
/// that it causes no additional GET request.
///
/// Only GET requests pass through the counting responder (HEAD is answered by
/// a separate mock), so the request tally compared below is a GET tally.
#[tokio::test(flavor = "multi_thread")]
async fn test_cache_hit_reduces_requests() {
    let mock_server = MockServer::start().await;
    let pdf_data = load_fixture("valid-minimal");
    let counter = ByteCounter::new();
    let responder = ByteCountingResponder::new(pdf_data.clone())
        .with_supports_range(true)
        .with_counter(counter.total.clone())
        .with_request_counter(counter.request_count.clone());

    Mock::given(matchers::method("HEAD"))
        .respond_with(
            ResponseTemplate::new(200)
                .append_header("Accept-Ranges", "bytes")
                .append_header("Content-Length", pdf_data.len().to_string()),
        )
        .mount(&mock_server)
        .await;
    Mock::given(matchers::method("GET"))
        .respond_with(responder)
        .mount(&mock_server)
        .await;

    let url = format!("{}/test.pdf", mock_server.uri());
    let source = HttpRangeSource::open(&url).expect("Failed to open remote PDF");

    // First read - should fetch from server.
    let data1 = source.read_range(1000, 4096).expect("Failed to read range");
    let requests_after_first = counter.request_count();
    assert!(
        requests_after_first >= 1,
        "First read should have reached the server"
    );

    // Second read of same range - should hit cache.
    let data2 = source.read_range(1000, 4096).expect("Failed to read range");
    let requests_after_second = counter.request_count();

    assert_eq!(data1, data2, "Data should be identical");
    // A cache hit means zero new GETs. The previous `<= first + 1` bound also
    // passed when the second read was a plain cache miss costing one GET.
    assert_eq!(
        requests_after_second, requests_after_first,
        "Cache should reduce requests"
    );
}
/// Test: Verify error classification for various failure modes.
#[tokio::test(flavor = "multi_thread")]
async fn test_error_classification_timeout() {
use wiremock::respond::FnResponder;
use std::thread;
use std::time::Duration;
let mock_server = MockServer::start().await;
// Responder that delays response to trigger timeout
let slow_responder = FnResponder::new(|_request: &WiremockRequest| {
thread::sleep(Duration::from_secs(35)); // Longer than 30s read timeout
ResponseTemplate::new(200).set_body_bytes(vec![1, 2, 3])
});
Mock::given(matchers::method("GET"))
.respond_with(slow_responder)
.mount(&mock_server)
.await;
let url = format!("{}/slow.pdf", mock_server.uri());
// This should timeout during the open call
let result = HttpRangeSource::open(&url);
assert!(result.is_err());
let err = result.unwrap_err();
// Timeout should be classified as Interrupted
assert!(matches!(err.kind(), io::ErrorKind::Interrupted | io::ErrorKind::TimedOut));
}
/// Test: a 401 on HEAD makes `open` fail with a message naming the status.
#[tokio::test(flavor = "multi_thread")]
async fn test_unauthorized_access() {
    let server = MockServer::start().await;

    let denied = ResponseTemplate::new(401).set_body_string("Unauthorized");
    Mock::given(matchers::method("HEAD"))
        .respond_with(denied)
        .mount(&server)
        .await;

    let url = format!("{}/protected.pdf", server.uri());
    let outcome = HttpRangeSource::open(&url);
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    let mentions_status = ["401", "Unauthorized"]
        .iter()
        .any(|needle| message.contains(needle));
    assert!(mentions_status);
}
/// Test: a 403 on HEAD makes `open` fail with a message naming the status.
#[tokio::test(flavor = "multi_thread")]
async fn test_forbidden_access() {
    let server = MockServer::start().await;

    let denied = ResponseTemplate::new(403).set_body_string("Forbidden");
    Mock::given(matchers::method("HEAD"))
        .respond_with(denied)
        .mount(&server)
        .await;

    let url = format!("{}/forbidden.pdf", server.uri());
    let outcome = HttpRangeSource::open(&url);
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    let mentions_status = ["403", "Forbidden"]
        .iter()
        .any(|needle| message.contains(needle));
    assert!(mentions_status);
}
/// Test: Basic auth credentials passed as a header allow the source to open.
///
/// Both mocks only match `Authorization: Basic dXNlcjpwYXNz`, i.e.
/// base64("user:pass").
#[tokio::test(flavor = "multi_thread")]
async fn test_basic_auth_success() {
    use wiremock::matchers::header;

    let server = MockServer::start().await;
    let fixture = load_fixture("valid-minimal");
    let tally = ByteCounter::new();

    let head_reply = ResponseTemplate::new(200)
        .append_header("Accept-Ranges", "bytes")
        .append_header("Content-Length", fixture.len().to_string());
    Mock::given(matchers::method("HEAD"))
        .and(header("Authorization", "Basic dXNlcjpwYXNz"))
        .respond_with(head_reply)
        .mount(&server)
        .await;

    let get_reply = ByteCountingResponder::new(fixture.clone())
        .with_supports_range(true)
        .with_counter(tally.total.clone());
    Mock::given(matchers::method("GET"))
        .and(header("Authorization", "Basic dXNlcjpwYXNz"))
        .respond_with(get_reply)
        .mount(&server)
        .await;

    let url = format!("{}/protected.pdf", server.uri());
    let credentials = ("Authorization".to_string(), "Basic dXNlcjpwYXNz".to_string());
    let source = HttpRangeSource::with_headers(&url, vec![credentials])
        .expect("Failed to open remote PDF");

    assert!(source.supports_range());
}
/// Test: page 5 of a 100-page PDF is fetched with < 100 KB transferred.
///
/// Plan Section 1.8: mock HTTP server with Range support, extract page 5 of
/// a 100-page PDF, < 100 KB transferred.
///
/// Page extraction is simulated by one 64 KB ranged read placed about 5% of
/// the way into the `multipage-100.pdf` fixture.
#[tokio::test(flavor = "multi_thread")]
async fn test_page_5_of_100_bandwidth_limited() {
    let fixture = load_fixture_file("multipage-100.pdf");
    let total_size = fixture.len() as u64;

    let server = MockServer::start().await;
    let tally = ByteCounter::new();

    let head_reply = ResponseTemplate::new(200)
        .append_header("Accept-Ranges", "bytes")
        .append_header("Content-Length", total_size.to_string());
    Mock::given(matchers::method("HEAD"))
        .respond_with(head_reply)
        .mount(&server)
        .await;

    let get_reply = ByteCountingResponder::new(fixture.clone())
        .with_supports_range(true)
        .with_counter(tally.total.clone())
        .with_request_counter(tally.request_count.clone());
    Mock::given(matchers::method("GET"))
        .respond_with(get_reply)
        .named("pdf-get-range")
        .mount(&server)
        .await;

    let url = format!("{}/100page.pdf", server.uri());
    let source = HttpRangeSource::open(&url).expect("Failed to open remote PDF");

    assert!(source.supports_range(), "Server should support Range");
    assert_eq!(source.len(), total_size);

    // A real extraction would walk the xref to page 5's content stream; this
    // stand-in reads 64 KB (one cache block, per the original author)
    // starting ~5% into the file.
    let page_5_offset = (total_size as f64 * 0.05) as u64;
    let page_5_length = 65536;
    let chunk = source
        .read_range(page_5_offset, page_5_length)
        .expect("Failed to read page 5 range");
    assert_eq!(chunk.len(), page_5_length, "Should read exactly 64 KB");

    // Bandwidth budget: one ~64 KB ranged body must stay below 100 KB.
    // (The byte tally only sees GET bodies served by the counting responder.)
    assert_bytes_transferred(&tally, 100_000);

    // And certainly not the whole file.
    assert!(
        tally.total() < total_size,
        "Should transfer {} bytes, not full file {} bytes",
        tally.total(),
        total_size
    );

    // The request tally covers requests routed to the counting responder.
    let in_expected_window = tally.request_count() >= 1 && tally.request_count() <= 3;
    assert!(
        in_expected_window,
        "Expected 1-3 requests (HEAD + Range + potential cache miss), got {}",
        tally.request_count()
    );
}
/// Test: exact request count in the 416 retry scenario.
///
/// After a 416 on the ranged request, exactly one retry without a Range
/// header is expected, for two GETs in total.
#[tokio::test(flavor = "multi_thread")]
async fn test_416_range_request_count_exact() {
    let server = MockServer::start().await;
    let fixture = load_fixture("valid-minimal");
    let tally = ByteCounter::new();

    let head_reply = ResponseTemplate::new(200)
        .append_header("Accept-Ranges", "bytes")
        .append_header("Content-Length", fixture.len().to_string());
    Mock::given(matchers::method("HEAD"))
        .respond_with(head_reply)
        .mount(&server)
        .await;

    let get_reply = ByteCountingResponder::new(fixture.clone())
        .with_supports_range(true)
        .with_force_416_first()
        .with_counter(tally.total.clone())
        .with_request_counter(tally.request_count.clone());
    Mock::given(matchers::method("GET"))
        .respond_with(get_reply)
        .named("pdf-get-416")
        .mount(&server)
        .await;

    let url = format!("{}/test.pdf", server.uri());
    let source = HttpRangeSource::open(&url).expect("Failed to open remote PDF");

    // The ranged attempt is rejected with 416; the retry must succeed.
    let _chunk = source
        .read_range(1000, 4096)
        .expect("Read should succeed after retry");

    // One ranged GET (416) followed by one plain GET (200).
    let gets_seen = tally.request_count();
    assert_eq!(
        gets_seen,
        2,
        "Expected exactly 2 requests (1 Range with 416 + 1 retry without Range), got {}",
        tally.request_count()
    );
}
#[cfg(test)]
mod verification_helpers {
    use super::*;

    /// Sanity check for `ByteCounter`: both tallies start at zero and reflect
    /// updates made through the shared atomics.
    #[test]
    fn test_byte_counter() {
        let tally = ByteCounter::new();
        assert_eq!(tally.total(), 0);
        assert_eq!(tally.request_count(), 0);

        let (bytes, requests) = (&tally.total, &tally.request_count);
        bytes.fetch_add(1000, Ordering::SeqCst);
        requests.fetch_add(1, Ordering::SeqCst);

        assert_eq!(tally.total(), 1000);
        assert_eq!(tally.request_count(), 1);
    }
}