pdftract/crates/pdftract-core/src/remote.rs
jedarden 84981f7c9b
Some checks are pending
Schema Generation Validation / Validate JSON Schema (push) Waiting to run
Schema Generation Validation / Validate JSON Syntax (push) Waiting to run
fix(pdftract-25igv): fix emit! macro usage in codespace parser
The emit! macro expects diagnostic codes without the DiagCode:: prefix.
Changed three occurrences in codespace.rs:
- Line 281: DiagCode::CmapInvalidCodespace → CmapInvalidCodespace
- Line 290: DiagCode::CmapInvalidCodespace → CmapInvalidCodespace
- Line 412: DiagCode::CmapInvalidCodespace → CmapInvalidCodespace

This fixes compilation errors that prevented the codebase from building.

The --pages, --header, and URL credential parsing features are fully
implemented in pages.rs, header.rs, and url.rs modules with comprehensive
tests and integration in main.rs, grep/mod.rs, and hash.rs.

References: pdftract-25igv, notes/pdftract-25igv.md
2026-05-28 07:29:33 -04:00

331 lines
12 KiB
Rust

//! Remote PDF loading and extraction.
//!
//! This module provides the HTTP fetch sequence for remote PDFs:
//! 1. HEAD probe to verify Range support and get Content-Length
//! 2. Tail Range fetch to parse startxref, trailer, and root xref subsection
//! 3. Xref parsing with forward-scan disabled for remote sources
//! 4. Page-by-page on-demand fetch as the document model dereferences each page
//! 5. Resource lazy load (fonts and XObjects fetched on first reference)
//!
//! # Example
//!
//! ```ignore
//! use pdftract_core::remote::{open_remote, RemoteOpts};
//! use pdftract_core::options::ExtractionOptions;
//!
//! let opts = RemoteOpts::new()
//! .with_header("Authorization", "Bearer token");
//!
//! // Just open the remote PDF (for custom processing)
//! let (catalog, resolver, source, fingerprint) = open_remote("https://example.com/doc.pdf", &opts)?;
//!
//! // Or extract directly
//! let result = extract_remote("https://example.com/doc.pdf", &opts, &ExtractionOptions::default())?;
//! ```
use crate::document::compute_fingerprint_lazy;
use crate::extract::{extract_pdf_from_source, ExtractionSource};
use crate::options::ExtractionOptions;
use crate::parser::catalog::{parse_catalog, Catalog};
use crate::parser::hint_stream;
use crate::parser::xref::{detect_linearization, load_xref_with_prev_chain, XrefResolver};
use crate::source::{open_remote as open_remote_source, RemoteOpts};
use anyhow::{Context, Result};
/// Open a PDF from a remote HTTP/HTTPS URL.
///
/// This function performs the HTTP fetch sequence:
/// 1. HEAD request to verify Range support and get Content-Length
/// 2. Tail Range fetch (last 16 KB) to parse startxref and trailer
/// 3. Xref parsing with forward-scan disabled for remote sources
/// 4. Returns the parsed catalog, resolver, source, and fingerprint
///
/// # Arguments
///
/// * `url` - HTTP/HTTPS URL to the PDF file
/// * `opts` - Remote options (headers, credentials, etc.)
///
/// # Returns
///
/// A tuple of (catalog, resolver, source, fingerprint) for further processing.
///
/// # Errors
///
/// Returns an error if:
/// - URL is invalid or DNS fails → Error kind "NotFound"
/// - TLS handshake fails → Error kind "PermissionDenied"
/// - Server returns 401/403 → Error kind "PermissionDenied"
/// - Server doesn't support Range → Error kind "Unsupported"
/// - HEAD fails with 405 → Falls back to GET with Range: bytes=0-0
/// - No Content-Length → Returns error with REMOTE_NO_CONTENT_LENGTH diagnostic
///
/// # Example
///
/// ```ignore
/// use pdftract_core::remote::{open_remote, RemoteOpts};
///
/// let opts = RemoteOpts::new()
/// .with_header("Authorization", "Bearer token");
///
/// let (catalog, resolver, source, fingerprint) = open_remote("https://example.com/doc.pdf", &opts)?;
/// // Use catalog, resolver, source for custom processing
/// ```
pub fn open_remote(
url: &str,
opts: &RemoteOpts,
) -> Result<(Catalog, XrefResolver, Box<dyn crate::parser::stream::PdfSource>, String)> {
use crate::parser::stream::PdfSource as ParserPdfSource;
// Open the remote PDF source
let source = open_remote_source(url, opts).context("Failed to open remote PDF source")?;
// Find the startxref offset (reads last 1 KB of the file)
let startxref_offset = find_startxref(&*source).context("Failed to find startxref offset")?;
// Load the xref table (forward-scan is disabled for remote sources)
let xref_section = load_xref_with_prev_chain(&*source, startxref_offset);
// Create resolver from xref section
let resolver = XrefResolver::from_section(xref_section.clone());
// Get the root reference from trailer
let root_ref = xref_section
.trailer
.as_ref()
.and_then(|trailer| trailer.get("Root"))
.and_then(|obj| obj.as_ref())
.ok_or_else(|| anyhow::anyhow!("No /Root reference in trailer"))?;
// Parse the catalog
let catalog = parse_catalog(&resolver, root_ref, Some(&*source as &dyn ParserPdfSource)).map_err(
|diagnostics| {
let msg = diagnostics
.first()
.map(|d| d.message.as_ref())
.unwrap_or("unknown error");
anyhow::anyhow!("Failed to parse catalog: {}", msg)
},
)?;
// Resolve AcroForm dictionary if present (for XFA detection and fingerprint)
let acroform = catalog
.acroform_ref
.and_then(|r| resolver.resolve(r).ok())
.and_then(|o| o.as_dict())
.cloned();
// Build fingerprint input (without full page tree for lazy extraction)
let fingerprint = compute_fingerprint_lazy(&catalog, &resolver, &acroform);
Ok((catalog, resolver, source, fingerprint))
}
/// Extract pages from a remote PDF using the extraction options.
///
/// This is a convenience function that combines `open_remote` with extraction.
/// It performs the HTTP fetch sequence and then extracts the specified pages.
///
/// # Arguments
///
/// * `url` - HTTP/HTTPS URL to the PDF file
/// * `opts` - Remote options (headers, credentials, etc.)
/// * `extraction_opts` - Extraction options (page range, receipts, etc.)
///
/// # Returns
///
/// An `ExtractionResult` containing the extracted pages and metadata.
///
/// # Example
///
/// ```ignore
/// use pdftract_core::remote::{extract_remote, RemoteOpts};
/// use pdftract_core::options::ExtractionOptions;
///
/// let remote_opts = RemoteOpts::new()
/// .with_header("Authorization", "Bearer token");
///
/// let extraction_opts = ExtractionOptions::default();
///
/// let result = extract_remote("https://example.com/doc.pdf", &remote_opts, &extraction_opts)?;
/// ```
pub fn extract_remote(
url: &str,
opts: &RemoteOpts,
extraction_opts: &ExtractionOptions,
) -> Result<crate::extract::ExtractionResult> {
// Open the remote PDF source
let source = open_remote_source(url, opts).context("Failed to open remote PDF source")?;
// Prefetch pages using hint stream if available (optimization for linearized PDFs)
prefetch_hint_stream(&*source, extraction_opts);
// Use the extraction pipeline with the remote source
let extraction_source = ExtractionSource::Remote(source);
extract_pdf_from_source(extraction_source, extraction_opts)
}
/// Prefetch pages using the hint stream from a linearized PDF.
///
/// This function:
/// 1. Detects if the PDF is linearized
/// 2. Parses the hint stream if present
/// 3. Prefetches the requested page ranges using the hint table predictions
///
/// # Parameters
/// - `source`: The PDF source to read from
/// - `extraction_opts`: Extraction options containing page ranges
///
/// # Returns
/// Nothing; prefetch is a performance optimization that doesn't affect correctness.
pub fn prefetch_hint_stream(
source: &dyn crate::parser::stream::PdfSource,
extraction_opts: &ExtractionOptions,
) {
// Detect linearization
let lin_info = match detect_linearization(source) {
Some(info) => info,
None => return, // Not linearized, no hint stream
};
// Check if hint stream info is available
let (hint_offset, hint_length) = match (lin_info.hint_stream_offset, lin_info.hint_stream_length) {
(Some(offset), Some(length)) => (offset, length),
_ => return, // No hint stream, nothing to prefetch
};
// Parse the hint stream
let mut diagnostics = Vec::new();
let hint_table = match hint_stream::parse_hint_stream_from_linearized(
source,
hint_offset,
hint_length,
&mut diagnostics,
) {
Some(table) => table,
None => return, // Failed to parse hint stream, continue without prefetch
};
// Get the requested page range (if any)
let page_ranges = extraction_opts.pages.as_ref();
let page_indices: Vec<u32> = match page_ranges {
Some(ranges) => {
// Convert page ranges to 0-based indices
ranges
.iter()
.flat_map(|r| {
let start = r.start.saturating_sub(1) as u32; // Convert to 0-based
let end = r.end.saturating_sub(1) as u32;
start..=end
})
.collect()
}
None => {
// No page range specified, prefetch all pages (up to a limit)
(0..hint_table.page_count().min(100)).collect()
}
};
// Prefetch each requested page
for page_idx in page_indices {
if let Some(range) = hint_table.predict_page_range(page_idx) {
let length = range.end.saturating_sub(range.start) as usize;
source.prefetch(range.start, length);
}
}
// Note: Shared object hints are not yet implemented (Phase 2)
let _shared_ranges = hint_table.predict_shared_objects();
}
/// Find the startxref offset in a PDF file.
///
/// Scans the last 1024 bytes of the file for "startxref" keyword.
fn find_startxref(source: &dyn crate::parser::stream::PdfSource) -> Result<u64> {
let len = source.len()? as usize;
let scan_start = len.saturating_sub(1024);
let scan_end = len;
let tail_data = source
.read_at(scan_start as u64, scan_end - scan_start)
.context("Failed to read PDF tail")?;
// Find "startxref" in the tail data
let startxref_pos = tail_data
.windows(9)
.rposition(|w| w == b"startxref")
.ok_or_else(|| anyhow!("startxref not found in PDF"))?;
// Parse the offset after "startxref"
// Skip the "startxref" keyword (9 chars) and any following whitespace
let offset_data = &tail_data[startxref_pos + 9..];
// Skip leading whitespace (space, \r, \n, \t)
let offset_start = offset_data
.iter()
.position(|&b| !matches!(b, b' ' | b'\r' | b'\n' | b'\t'))
.unwrap_or(offset_data.len());
let offset_data_trimmed = &offset_data[offset_start..];
// Find the newline after the offset
let newline_pos = offset_data_trimmed
.iter()
.position(|&b| b == b'\n' || b == b'\r')
.unwrap_or(offset_data_trimmed.len());
let offset_str = std::str::from_utf8(&offset_data_trimmed[..newline_pos])
.context("startxref offset is not valid UTF-8")?;
let offset: u64 = offset_str
.trim()
.parse()
.context("startxref offset is not a valid number")?;
Ok(offset)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_find_startxref() {
// Test data with startxref at the end
let test_data = b"Some PDF content...%%EOF\nstartxref\n12345\n%%EOF";
let source = crate::parser::stream::MemorySource::new(test_data.to_vec());
let offset = find_startxref(&source).unwrap();
assert_eq!(offset, 12345);
}
#[test]
fn test_find_startxref_with_crlf() {
// Test data with CRLF line endings
let test_data = b"Some PDF content...%%EOF\r\nstartxref\r\n67890\r\n%%EOF";
let source = crate::parser::stream::MemorySource::new(test_data.to_vec());
let offset = find_startxref(&source).unwrap();
assert_eq!(offset, 67890);
}
#[test]
fn test_find_startxref_with_extra_whitespace() {
// Test data with extra whitespace
let test_data = b"Some PDF content...%%EOF\nstartxref\t \n99999\n%%EOF";
let source = crate::parser::stream::MemorySource::new(test_data.to_vec());
let offset = find_startxref(&source).unwrap();
assert_eq!(offset, 99999);
}
#[test]
fn test_find_startxref_not_found() {
// Test data without startxref
let test_data = b"Some PDF content...%%EOF\n%%EOF";
let source = crate::parser::stream::MemorySource::new(test_data.to_vec());
let result = find_startxref(&source);
assert!(result.is_err());
}
}