// pdftract/crates/pdftract-core/tests/remote_fetch_integration.rs

//! Integration tests for remote PDF HTTP fetch sequence.
//!
//! These tests verify the complete HTTP fetch sequence:
//! 1. HEAD probe to get Content-Length, Accept-Ranges, Content-Type
//! 2. Tail fetch (16 KB) to parse startxref
//! 3. Xref resolution with forward-scan disabled
//! 4. Document model building

/// Checks that `open_remote_url` returns an error when the host cannot be resolved.
///
/// Only the failure path is asserted. The HEAD probe itself (recording
/// Content-Length / Accept-Ranges, tolerating 405 Method Not Allowed) is NOT
/// exercised by this test; covering it needs an HTTP endpoint to talk to.
#[test]
#[cfg(feature = "remote")]
fn test_open_remote_head_probe() {
    use pdftract_core::document::open_remote_url;

    // `.invalid` is reserved by RFC 6761 and is never delegated, so this name
    // cannot start resolving in the future. A subdomain of example.com carries
    // no such guarantee.
    let result = open_remote_url("https://nonexistent.invalid/test.pdf");
    assert!(
        result.is_err(),
        "opening a URL on an unresolvable host must return an error"
    );
}

/// Checks the arithmetic that locates the start of the 16 KB tail fetch.
///
/// The tail starts `TAIL_SIZE` bytes before the end of the document and is
/// clamped to offset 0 when the document is shorter than the tail window.
/// This test works on a local constant only; it does not call into the crate.
#[test]
#[cfg(feature = "remote")]
fn test_tail_fetch_size() {
    // 16 KB tail window.
    const TAIL_SIZE: u64 = 16 * 1024;

    // 1 MiB document: 1_048_576 - 16_384 = 1_032_192.
    let content_length = 1_048_576u64;
    let tail_start = content_length.saturating_sub(TAIL_SIZE);
    assert_eq!(tail_start, 1_032_192);

    // Document smaller than the tail window: `saturating_sub` clamps to 0
    // instead of underflowing, so the whole file is fetched.
    let content_length = 8192u64;
    let tail_start = content_length.saturating_sub(TAIL_SIZE);
    assert_eq!(tail_start, 0);
}

/// Checks that a local `FileSource` does not report itself as remote.
///
/// NOTE(review): despite the test name, no remote source is constructed here, so
/// the "forward-scan disabled for remote" behavior itself is not asserted. Only
/// the local side of `is_remote()` is covered.
#[test]
#[cfg(feature = "remote")]
fn test_forward_scan_disabled_for_remote() {
    use pdftract_core::source::PdfSource;

    // Queries `is_remote()` through a trait object, so the dynamically
    // dispatched path is checked alongside the static call below.
    fn check_is_remote(source: &dyn pdftract_core::source::PdfSource) -> bool {
        source.is_remote()
    }

    // NOTE(review): "/dev/null" exists only on Unix-like systems; this test is
    // not portable to other platforms as written.
    let file_source = pdftract_core::source::FileSource::open("/dev/null")
        .expect("opening /dev/null as a FileSource should succeed");

    assert!(!PdfSource::is_remote(&file_source));
    assert!(!check_is_remote(&file_source));
}

/// Placeholder for verifying page-by-page on-demand fetch behavior.
///
/// The body contains no assertions yet, so the test is marked `#[ignore]`; an
/// empty test would otherwise be reported as a passing check.
#[test]
#[cfg(feature = "remote")]
#[ignore = "placeholder: on-demand fetch behavior is not asserted yet"]
fn test_page_by_page_on_demand() {
    // Intended scenario: extracting a subset of pages from a large document
    // fetches only the byte ranges that are actually needed.
    //
    // For a 500-page document extracting pages 47-52:
    // - Should fetch: tail (16 KB) + catalog + page tree nodes,
    //   plus the content streams of pages 47-52
    // - Should NOT fetch: the content streams of any other page
    //
    // Expected to be observable through cache-hit behavior in HttpRangeSource:
    // each read_range() should batch contiguous blocks into single requests.
}

/// Checks the block-index arithmetic that Range request batching relies on.
///
/// Works on local values only: a byte range is mapped onto fixed-size blocks
/// and the first block, last block, and block count are asserted.
#[test]
fn test_range_batching() {
    const BLOCK_SIZE: u64 = 65536;

    // A 200_000-byte read beginning at byte 50_000.
    let (read_start, read_len) = (50_000u64, 200_000usize);

    // Index of the fixed-size block that contains a given byte position.
    let block_index = |byte_pos: u64| byte_pos / BLOCK_SIZE;

    let first_block = block_index(read_start);
    // Position of the final byte covered by the read (inclusive).
    let last_byte = read_start + read_len as u64 - 1;
    let last_block = block_index(last_byte);
    let blocks_touched = last_block - first_block + 1;

    // The read spans blocks 0 through 3, i.e. 4 blocks. Batching expectations:
    // - all 4 blocks contiguous and uncached -> 1 Range request
    // - blocks 0-1 cached, 2-3 not          -> 1 Range request covering 2-3
    assert_eq!(first_block, 0);
    assert_eq!(last_block, 3);
    assert_eq!(blocks_touched, 4);
}

/// Placeholder documenting the acceptance criteria for a 500-page PDF with
/// pages 47-52 extracted.
///
/// Nothing is asserted here, so the test is marked `#[ignore]`; an empty test
/// would otherwise be reported as a passing check.
#[test]
#[ignore = "placeholder: acceptance criteria are documented but not asserted"]
fn test_acceptance_criteria_500_page() {
    // Criteria for a 500-page PDF:
    // - Total pages: 500
    // - Extracted pages: 47-52 (6 pages)
    // - Total downloaded: < 5 MB

    // The implementation is expected to fetch only:
    // 1. Tail (16 KB) for startxref
    // 2. Catalog and page tree (a few KB)
    // 3. Content streams for pages 47-52 only
    // 4. Shared resources (fonts, XObjects), lazily

    // Budget estimate: 6 pages at ~500 KB each = 3 MB, plus overhead, stays < 5 MB.
}

/// Exercises HEAD-probe failure handling against a live endpoint.
///
/// Only the 401 case is actually requested, and only the presence of an error
/// is asserted; the other modes are listed for reference.
#[test]
#[cfg(feature = "remote")]
fn test_head_failure_modes() {
    use pdftract_core::document::open_remote_url;

    // 405 Method Not Allowed: expected to fall back to GET with Range: bytes=0-0,
    // described as handled automatically by HttpRangeSource::with_headers.
    // Not asserted here.

    // 401/403 Unauthorized: expected to surface as a PermissionDenied error.
    // The error kind is not inspected, only that the call fails.
    let unauthorized = open_remote_url("https://httpbin.org/status/401");
    assert!(unauthorized.is_err());

    // Missing Content-Length: expected to emit REMOTE_NO_CONTENT_LENGTH,
    // described as checked in HttpRangeSource::with_headers. Not asserted here.
}

/// Placeholder for verifying that xref forward-scan is skipped for remote sources.
///
/// Nothing is asserted here, so the test is marked `#[ignore]`; an empty test
/// would otherwise be reported as a passing check.
#[test]
#[ignore = "placeholder: forward-scan skipping is not asserted here"]
fn test_remote_no_forward_scan() {
    // Expected behavior (covered, per these notes, by the xref integration):
    // the forward_scan_xref function in xref.rs checks source.is_remote() and
    // returns an empty XrefSection with the XREF_REMOTE_NO_FORWARD_SCAN diagnostic.

    // Remote sources should therefore never trigger forward-scan (strategy 4).
}

/// Placeholder documenting the performance requirement: under 3 seconds to
/// extract pages 47-52 from a 500-page PDF.
///
/// The timing is not measured here (the notes below defer it to integration
/// benchmarks), so the test is marked `#[ignore]`; an empty test would
/// otherwise be reported as a passing check.
///
/// NOTE(review): pages 47-52 inclusive is six pages, while the requirement was
/// previously summarized as "5 pages" — confirm which figure is intended.
#[test]
#[ignore = "placeholder: performance is verified by integration benchmarks"]
fn test_performance_requirement() {
    // Performance target: < 3 seconds for extracting pages 47-52 from a 500-page PDF.
    // This is verified through integration benchmarks, not unit tests.

    // The implementation is expected to meet the target by:
    // - Using Range requests to fetch only needed data
    // - Batching contiguous blocks into single requests
    // - Caching fetched blocks for reuse
    // - Lazy-loading resources (fonts, XObjects)
}

/// Placeholder for verifying that extracting page 5 triggers a minimal number
/// of HTTP requests.
///
/// Nothing is asserted here, so the test is marked `#[ignore]`; an empty test
/// would otherwise be reported as a passing check.
#[test]
#[ignore = "placeholder: request count for page 5 is not asserted yet"]
fn test_page_5_fetch_behavior() {
    // Expected requests when extracting page 5 only:
    //   1. HEAD probe (metadata)
    //   2. Tail fetch (startxref, trailer)
    //   3. Catalog object (if not already inside the tail)
    //   4. Page tree nodes leading to page 5
    //   5. Page 5's /Contents stream(s)
    //   6. Shared resources (fonts, XObjects) as needed

    // With good caching this should come to roughly 5-6 requests in total.
}

/// Placeholder for verifying the progressively larger tail fetch.
///
/// Nothing is asserted here, so the test is marked `#[ignore]`; an empty test
/// would otherwise be reported as a passing check.
#[test]
#[cfg(feature = "remote")]
#[ignore = "placeholder: progressive tail growth is not asserted yet"]
fn test_large_tail_fetch() {
    // Expected behavior: if startxref points before the 16 KB tail offset, the
    // implementation should fetch a progressively larger tail:
    // 16 KB -> 32 KB -> 64 KB -> ... -> 1024 KB

    // This is a rare edge case but should be handled.
}

/// Placeholder for verifying Linearized PDF hint stream handling.
///
/// Nothing is asserted here, so the test is marked `#[ignore]`; an empty test
/// would otherwise be reported as a passing check.
#[test]
#[ignore = "placeholder: hint stream handling is not asserted here"]
fn test_linearized_hint_stream() {
    // Expected behavior for Linearized PDFs with hint streams:
    // - Prefetch optimization should use the hint stream data
    // - If the hint stream is invalid, prefetch is disabled (extraction still works)

    // Per these notes, this is covered by the xref integration tests.
}

/// Verifies that opening a URL whose TLS handshake fails yields an error.
///
/// Only the presence of an error is asserted; the error kind is not inspected.
#[test]
#[cfg(feature = "remote")]
fn test_tls_failure_handling() {
    use pdftract_core::document::open_remote_url;

    // Expected (not asserted): the handshake failure maps to the
    // PermissionDenied kind, which in turn triggers exit code 6.
    // The endpoint below presumably serves an expired certificate.
    let handshake_outcome = open_remote_url("https://expired.badssl.com/");

    assert!(handshake_outcome.is_err());
}