The native PyO3 module returns raw dicts via pythonize, but the Python SDK API expects typed dataclass objects (Document, Page, Metadata, etc.) to be consistent with the subprocess fallback and test expectations.

Updated wrapper functions in __init__.py to convert native results:
- extract(): wraps dict in Document.from_dict()
- extract_stream(): wraps yielded page dicts in Page.from_dict()
- get_metadata(): wraps dict in Metadata()
- hash(): wraps string in Fingerprint.from_string()
- classify(): wraps dict in Classification()
- search(): wraps yielded match dicts in Match

The native PyO3 entry points (extract, extract_text, extract_stream) were already implemented with:
- extract: uses extract_pdf + pythonize for PyDict conversion
- extract_text: uses extract_text for plain String return
- extract_stream: uses extract_pdf_streaming with custom StreamIterator

All kwargs parsing with strict validation (unknown kwargs raise TypeError) was already in place.

Acceptance criteria:
- pdftract.extract() returns Document object with pages/metadata
- pdftract.extract_text() returns plain text string
- pdftract.extract_stream() yields Page objects
- Unknown kwarg raises TypeError
1207 lines
38 KiB
Rust
1207 lines
38 KiB
Rust
//! PDF document parsing helper.
//!
//! This module provides high-level functions for parsing PDF documents
//! and extracting the information needed for receipt verification.
//!
//! ## Lazy Page Iteration
//!
//! For memory-efficient extraction of large documents, this module provides
//! `PageIter`, which yields pages lazily without materializing the entire page tree.
//! Use `PdfExtractor::pages()` or `Document::pages()` to get an iterator that
//! extracts each page on demand.
|
||
|
||
use crate::detection::{detect_javascript, detect_xfa};
|
||
use crate::fingerprint::{
|
||
compute_fingerprint, CatalogFlags, ContentStreamData, FingerprintInput, PageFingerprintData,
|
||
};
|
||
use crate::parser::catalog::{parse_catalog, Catalog};
|
||
use crate::parser::object::PdfDict;
|
||
use crate::parser::pages::{flatten_page_tree, LazyPageIter, PageDict};
|
||
use crate::parser::stream::{FileSource as ParserFileSource, PdfSource as ParserPdfSource};
|
||
use crate::source::{FileSource, PdfSource};
|
||
use crate::parser::xref::{load_xref_with_prev_chain, XrefResolver, XrefSection};
|
||
use crate::receipts::verifier::SpanData;
|
||
use anyhow::{anyhow, Context, Result};
|
||
use serde::{Deserialize, Serialize};
|
||
use std::path::Path;
|
||
|
||
#[cfg(feature = "remote")]
|
||
use crate::source::RemoteOpts;
|
||
|
||
/// Parse a PDF file and return the document components needed for verification.
|
||
///
|
||
/// This is a high-level function that:
|
||
/// 1. Opens the PDF file
|
||
/// 2. Loads the xref table
|
||
/// 3. Parses the catalog
|
||
/// 4. Flattens the page tree
|
||
/// 5. Computes the fingerprint
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `pdf_path` - Path to the PDF file
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A tuple of (fingerprint, catalog, pages, resolver)
|
||
pub fn parse_pdf_file(
|
||
pdf_path: &std::path::Path,
|
||
) -> Result<(
|
||
String,
|
||
Catalog,
|
||
Vec<crate::parser::pages::PageDict>,
|
||
XrefResolver,
|
||
)> {
|
||
// Open the PDF file
|
||
let source = ParserFileSource::open(pdf_path).context("Failed to open PDF file")?;
|
||
|
||
// Find the startxref offset
|
||
let startxref_offset = find_startxref(&source).context("Failed to find startxref offset")?;
|
||
|
||
// Load the xref table
|
||
let xref_section = load_xref_with_prev_chain(&source, startxref_offset);
|
||
|
||
// Create resolver from xref section
|
||
let resolver = XrefResolver::from_section(xref_section.clone());
|
||
|
||
// Get the root reference from trailer
|
||
let root_ref = xref_section
|
||
.trailer
|
||
.as_ref()
|
||
.and_then(|trailer| trailer.get("Root"))
|
||
.and_then(|obj| obj.as_ref())
|
||
.ok_or_else(|| anyhow!("No /Root reference in trailer"))?;
|
||
|
||
// Parse the catalog
|
||
let catalog = parse_catalog(&resolver, root_ref, Some(&source as &dyn ParserPdfSource)).map_err(
|
||
|diagnostics| {
|
||
let msg = diagnostics
|
||
.first()
|
||
.map(|d| d.message.as_ref())
|
||
.unwrap_or("unknown error");
|
||
anyhow!("Failed to parse catalog: {}", msg)
|
||
},
|
||
)?;
|
||
|
||
// Flatten the page tree
|
||
let pages = flatten_page_tree(&resolver, catalog.pages_ref).map_err(|diagnostics| {
|
||
let msg = diagnostics
|
||
.first()
|
||
.map(|d| d.message.as_ref())
|
||
.unwrap_or("unknown error");
|
||
anyhow!("Failed to flatten page tree: {}", msg)
|
||
})?;
|
||
|
||
// Resolve AcroForm dictionary if present
|
||
let acroform = catalog.acroform_ref
|
||
.and_then(|r| resolver.resolve(r).ok())
|
||
.and_then(|o| o.as_dict().map(|d| d.clone()));
|
||
|
||
// Build fingerprint input
|
||
let fingerprint_input = build_fingerprint_input(&catalog, &pages, &resolver, &acroform);
|
||
|
||
// Compute fingerprint with source available for content stream decoding
|
||
let fingerprint = compute_fingerprint(&fingerprint_input, &resolver, Some(&source as &dyn ParserPdfSource));
|
||
|
||
Ok((fingerprint, catalog, pages, resolver))
|
||
}
|
||
|
||
/// Parse a PDF from a generic source and return document components.
|
||
///
|
||
/// This is a variant of `parse_pdf_file` that works with any `PdfSource`
|
||
/// implementation (local files, HTTP sources, memory buffers, etc.).
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `source` - A PDF source (FileSource, HttpRangeSource, etc.)
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A tuple of (fingerprint, catalog, pages, resolver)
|
||
pub fn parse_pdf_source(
|
||
source: Box<dyn ParserPdfSource>,
|
||
) -> Result<(
|
||
String,
|
||
Catalog,
|
||
Vec<crate::parser::pages::PageDict>,
|
||
XrefResolver,
|
||
)> {
|
||
// Find the startxref offset
|
||
let startxref_offset = find_startxref(&*source).context("Failed to find startxref offset")?;
|
||
|
||
// Load the xref table
|
||
let xref_section = load_xref_with_prev_chain(&*source, startxref_offset);
|
||
|
||
// Create resolver from xref section
|
||
let resolver = XrefResolver::from_section(xref_section.clone());
|
||
|
||
// Get the root reference from trailer
|
||
let root_ref = xref_section
|
||
.trailer
|
||
.as_ref()
|
||
.and_then(|trailer| trailer.get("Root"))
|
||
.and_then(|obj| obj.as_ref())
|
||
.ok_or_else(|| anyhow!("No /Root reference in trailer"))?;
|
||
|
||
// Parse the catalog
|
||
let catalog = parse_catalog(&resolver, root_ref, Some(&*source as &dyn ParserPdfSource)).map_err(
|
||
|diagnostics| {
|
||
let msg = diagnostics
|
||
.first()
|
||
.map(|d| d.message.as_ref())
|
||
.unwrap_or("unknown error");
|
||
anyhow!("Failed to parse catalog: {}", msg)
|
||
},
|
||
)?;
|
||
|
||
// Flatten the page tree
|
||
let pages = flatten_page_tree(&resolver, catalog.pages_ref).map_err(|diagnostics| {
|
||
let msg = diagnostics
|
||
.first()
|
||
.map(|d| d.message.as_ref())
|
||
.unwrap_or("unknown error");
|
||
anyhow!("Failed to flatten page tree: {}", msg)
|
||
})?;
|
||
|
||
// Resolve AcroForm dictionary if present
|
||
let acroform = catalog.acroform_ref
|
||
.and_then(|r| resolver.resolve(r).ok())
|
||
.and_then(|o| o.as_dict().map(|d| d.clone()));
|
||
|
||
// Build fingerprint input
|
||
let fingerprint_input = build_fingerprint_input(&catalog, &pages, &resolver, &acroform);
|
||
|
||
// Compute fingerprint with source available
|
||
let fingerprint = compute_fingerprint(&fingerprint_input, &resolver, Some(&*source as &dyn ParserPdfSource));
|
||
|
||
Ok((fingerprint, catalog, pages, resolver))
|
||
}
|
||
|
||
/// Find the startxref offset in a PDF file.
|
||
///
|
||
/// Scans the last 1024 bytes of the file for "startxref" keyword.
|
||
fn find_startxref(source: &dyn ParserPdfSource) -> Result<u64> {
|
||
let len = source.len()? as usize;
|
||
let scan_start = len.saturating_sub(1024);
|
||
let scan_end = len;
|
||
|
||
let tail_data = source
|
||
.read_at(scan_start as u64, scan_end - scan_start)
|
||
.context("Failed to read PDF tail")?;
|
||
|
||
// Find "startxref" in the tail data
|
||
let startxref_pos = tail_data
|
||
.windows(9)
|
||
.rposition(|w| w == b"startxref")
|
||
.ok_or_else(|| anyhow!("startxref not found in PDF"))?;
|
||
|
||
// Parse the offset after "startxref"
|
||
// Skip the "startxref" keyword (9 chars) and any following whitespace
|
||
let offset_data = &tail_data[startxref_pos + 9..];
|
||
|
||
// Skip leading whitespace (space, \r, \n, \t)
|
||
let offset_start = offset_data
|
||
.iter()
|
||
.position(|&b| !matches!(b, b' ' | b'\r' | b'\n' | b'\t'))
|
||
.unwrap_or(offset_data.len());
|
||
|
||
let offset_data_trimmed = &offset_data[offset_start..];
|
||
|
||
// Find the newline after the offset
|
||
let newline_pos = offset_data_trimmed
|
||
.iter()
|
||
.position(|&b| b == b'\n' || b == b'\r')
|
||
.unwrap_or(offset_data_trimmed.len());
|
||
|
||
let offset_str = std::str::from_utf8(&offset_data_trimmed[..newline_pos])
|
||
.context("startxref offset is not valid UTF-8")?;
|
||
|
||
let offset: u64 = offset_str
|
||
.trim()
|
||
.parse()
|
||
.context("startxref offset is not a valid number")?;
|
||
|
||
Ok(offset)
|
||
}
|
||
|
||
/// Build FingerprintInput from catalog and pages.
|
||
fn build_fingerprint_input(
|
||
catalog: &Catalog,
|
||
pages: &[crate::parser::pages::PageDict],
|
||
resolver: &XrefResolver,
|
||
acroform: &Option<PdfDict>,
|
||
) -> FingerprintInput {
|
||
let page_count = pages.len() as u32;
|
||
|
||
let fingerprint_pages = pages
|
||
.iter()
|
||
.map(|page| {
|
||
PageFingerprintData {
|
||
content_streams: page
|
||
.contents
|
||
.iter()
|
||
.map(|&obj_ref| ContentStreamData::Indirect(obj_ref))
|
||
.collect(),
|
||
resources: None, // TODO: convert ResourceDict to PdfDict
|
||
media_box: page.media_box,
|
||
crop_box: page.crop_box,
|
||
rotate: page.rotate,
|
||
}
|
||
})
|
||
.collect();
|
||
|
||
// Detect JavaScript and XFA presence
|
||
let contains_javascript = detect_javascript(catalog, pages, acroform, resolver);
|
||
let contains_xfa = detect_xfa(acroform);
|
||
|
||
// Build catalog flags
|
||
let catalog_flags = CatalogFlags {
|
||
is_encrypted: false, // TODO: detect encryption
|
||
contains_javascript,
|
||
contains_xfa,
|
||
ocg_present: catalog
|
||
.oc_properties
|
||
.as_ref()
|
||
.map(|props| props.present)
|
||
.unwrap_or(false),
|
||
};
|
||
|
||
FingerprintInput {
|
||
page_count,
|
||
pages: fingerprint_pages,
|
||
struct_tree_root_ref: catalog.struct_tree_root_ref,
|
||
is_tagged: catalog.mark_info.is_tagged,
|
||
catalog_flags,
|
||
}
|
||
}
|
||
|
||
/// Extract text spans from a specific page.
|
||
///
|
||
/// This is a minimal implementation that extracts basic text information.
|
||
/// In a full implementation, this would use the complete text extraction pipeline.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `pdf_path` - Path to the PDF file
|
||
/// * `page_index` - 0-based page index
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A vector of SpanData objects containing text and bbox information
|
||
pub fn extract_spans_from_page(
|
||
pdf_path: &std::path::Path,
|
||
page_index: usize,
|
||
) -> Result<Vec<SpanData>> {
|
||
// Parse the PDF
|
||
let (_fingerprint, _catalog, pages, _resolver) = parse_pdf_file(pdf_path)?;
|
||
|
||
// Check page index bounds
|
||
if page_index >= pages.len() {
|
||
return Err(anyhow!(
|
||
"Page index {} out of bounds (document has {} pages)",
|
||
page_index,
|
||
pages.len()
|
||
));
|
||
}
|
||
|
||
let page = &pages[page_index];
|
||
|
||
// For now, return a placeholder span
|
||
// In a full implementation, this would:
|
||
// 1. Parse the content streams
|
||
// 2. Extract text with positioning information
|
||
// 3. Build spans with text and bbox
|
||
|
||
// Return a single span covering the entire page as a placeholder
|
||
let [x0, y0, x1, y1] = page.media_box;
|
||
let spans = vec![SpanData {
|
||
text: format!("[Page {} text extraction not yet implemented]", page_index),
|
||
bbox: [x0, y0, x1, y1],
|
||
}];
|
||
|
||
Ok(spans)
|
||
}
|
||
|
||
/// Compute the fingerprint of a PDF file.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `pdf_path` - Path to the PDF file
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// The fingerprint string in the format "pdftract-v1:\<hex\>"
|
||
pub fn compute_pdf_fingerprint(pdf_path: &std::path::Path) -> Result<String> {
|
||
let (fingerprint, _catalog, _pages, _resolver) = parse_pdf_file(pdf_path)?;
|
||
Ok(fingerprint)
|
||
}
|
||
|
||
/// A lazy PDF page extractor that yields pages one at a time.
|
||
///
|
||
/// This struct provides memory-efficient extraction for large PDFs by:
|
||
/// - Materializing only the current page's data
|
||
/// - Decoding content streams on-demand per page
|
||
/// - Dropping decoded data immediately after use
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```ignore
|
||
/// let extractor = PdfExtractor::open("document.pdf")?;
|
||
/// for page_result in extractor.pages() {
|
||
/// let page = page_result?;
|
||
/// // Process page without holding all pages in memory
|
||
/// }
|
||
/// ```
|
||
/// PDF document extractor with lazy page iteration.
|
||
///
|
||
/// This struct provides on-demand access to PDF pages without materializing
|
||
/// the entire page tree in memory. Use it for memory-efficient extraction
|
||
/// from large documents or when you need random access to specific pages.
|
||
///
|
||
/// # Examples
|
||
///
|
||
/// Open a PDF and iterate over pages lazily:
|
||
///
|
||
/// ```rust,no_run
|
||
/// use pdftract_core::document::PdfExtractor;
|
||
///
|
||
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||
/// let extractor = PdfExtractor::open("document.pdf")?;
|
||
/// println!("Fingerprint: {}", extractor.fingerprint());
|
||
/// println!("Total pages: {}", extractor.catalog().page_count.unwrap_or(0));
|
||
/// # Ok(())
|
||
/// # }
|
||
/// ```
|
||
///
|
||
/// Memory-bounded extraction of specific pages:
|
||
///
|
||
/// ```rust,no_run
|
||
/// use pdftract_core::document::PdfExtractor;
|
||
///
|
||
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||
/// let extractor = PdfExtractor::open("large.pdf")?;
|
||
///
|
||
/// // Only pages 5-10 are materialized, not the entire document
|
||
/// for page_result in extractor.pages()?.take(10) {
|
||
/// let page = page_result?;
|
||
/// println!("Page {} has {} spans", page.index, page.spans.len());
|
||
/// }
|
||
/// # Ok(())
|
||
/// # }
|
||
/// ```
|
||
pub struct PdfExtractor {
|
||
/// The PDF file source
|
||
source: FileSource,
|
||
/// The xref resolver for indirect object lookup
|
||
resolver: XrefResolver,
|
||
/// The parsed catalog
|
||
catalog: Catalog,
|
||
/// The fingerprint of the document
|
||
fingerprint: String,
|
||
/// Pre-flattened pages (for non-streaming extraction)
|
||
pages: Option<Vec<PageDict>>,
|
||
}
|
||
|
||
impl PdfExtractor {
|
||
/// Open a PDF file for lazy extraction.
|
||
///
|
||
/// This parses the xref table and catalog but does NOT materialize
|
||
/// the page tree. Pages are resolved on-demand from the iterator.
|
||
pub fn open<P: AsRef<Path>>(pdf_path: P) -> Result<Self> {
|
||
let path = pdf_path.as_ref();
|
||
|
||
// Open the PDF file
|
||
let source = FileSource::open(path).context("Failed to open PDF file")?;
|
||
|
||
// Find the startxref offset
|
||
let startxref_offset =
|
||
find_startxref(&source).context("Failed to find startxref offset")?;
|
||
|
||
// Load the xref table
|
||
let xref_section = load_xref_with_prev_chain(&source, startxref_offset);
|
||
|
||
// Create resolver from xref section
|
||
let resolver = XrefResolver::from_section(xref_section.clone());
|
||
|
||
// Get the root reference from trailer
|
||
let root_ref = xref_section
|
||
.trailer
|
||
.as_ref()
|
||
.and_then(|trailer| trailer.get("Root"))
|
||
.and_then(|obj| obj.as_ref())
|
||
.ok_or_else(|| anyhow!("No /Root reference in trailer"))?;
|
||
|
||
// Parse the catalog
|
||
let catalog = parse_catalog(&resolver, root_ref, Some(&source as &dyn ParserPdfSource)).map_err(
|
||
|diagnostics| {
|
||
let msg = diagnostics
|
||
.first()
|
||
.map(|d| d.message.as_ref())
|
||
.unwrap_or("unknown error");
|
||
anyhow!("Failed to parse catalog: {}", msg)
|
||
},
|
||
)?;
|
||
|
||
// Resolve AcroForm dictionary if present (for XFA detection)
|
||
let acroform = catalog.acroform_ref
|
||
.and_then(|r| resolver.resolve(r).ok())
|
||
.and_then(|o| o.as_dict().map(|d| d.clone()));
|
||
|
||
// Build fingerprint input (without full page tree for lazy extraction)
|
||
let fingerprint = compute_fingerprint_lazy(&catalog, &resolver, &acroform);
|
||
|
||
Ok(Self {
|
||
source,
|
||
resolver,
|
||
catalog,
|
||
fingerprint,
|
||
pages: None,
|
||
})
|
||
}
|
||
|
||
/// Get the document fingerprint.
|
||
pub fn fingerprint(&self) -> &str {
|
||
&self.fingerprint
|
||
}
|
||
|
||
/// Get the catalog.
|
||
pub fn catalog(&self) -> &Catalog {
|
||
&self.catalog
|
||
}
|
||
|
||
/// Get the total page count.
|
||
///
|
||
/// This walks the page tree to count pages without materializing PageDict objects.
|
||
/// Uses O(depth) memory, making it safe for large documents.
|
||
pub fn page_count(&self) -> Result<usize> {
|
||
if let Some(ref pages) = self.pages {
|
||
return Ok(pages.len());
|
||
}
|
||
|
||
// Use lazy counting that doesn't materialize all pages
|
||
use crate::parser::pages::count_pages_tree;
|
||
count_pages_tree(&self.resolver, self.catalog.pages_ref)
|
||
.map_err(|e| anyhow!("Failed to count pages: {:?}", e))
|
||
}
|
||
|
||
/// Materialize all pages (for non-streaming extraction).
|
||
///
|
||
/// This caches the flattened page tree for repeated access.
|
||
///
|
||
/// # WARNING: Memory Implications
|
||
///
|
||
/// This function materializes ALL pages in memory, which defeats lazy loading
|
||
/// and can consume significant memory for large documents (1000+ pages).
|
||
/// Use this ONLY when you need repeated random access to pages.
|
||
///
|
||
/// For streaming extraction or one-time sequential access, use the `pages()`
|
||
/// method instead, which returns a lazy `PageIter` that never materializes
|
||
/// all pages at once.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```ignore
|
||
/// // BAD: Materializes all pages in memory
|
||
/// extractor.materialize_pages()?;
|
||
/// for page in extractor.pages.unwrap() { ... }
|
||
///
|
||
/// // GOOD: Lazy iteration, one page at a time
|
||
/// for page_result in extractor.pages() {
|
||
/// let page = page_result?;
|
||
/// // Process page - it will be dropped after loop iteration
|
||
/// }
|
||
/// ```
|
||
pub fn materialize_pages(&mut self) -> Result<&[PageDict]> {
|
||
if self.pages.is_none() {
|
||
let pages = flatten_page_tree(&self.resolver, self.catalog.pages_ref)
|
||
.map_err(|e| anyhow!("Failed to flatten page tree: {:?}", e))?;
|
||
self.pages = Some(pages);
|
||
}
|
||
Ok(self.pages.as_ref().unwrap())
|
||
}
|
||
|
||
/// Get a lazy iterator over pages.
|
||
///
|
||
/// The iterator yields pages one at a time, decoding each page's
|
||
/// content streams on-demand and dropping them after use.
|
||
///
|
||
/// # Memory Behavior
|
||
///
|
||
/// This uses LazyPageIter which walks the page tree depth-first,
|
||
/// materializing only the current path from root to leaf (max ~16 nodes).
|
||
/// Each yielded PageDict is standalone and can be dropped after use.
|
||
/// Peak RSS stays O(depth) not O(pages).
|
||
///
|
||
/// # Preferred Streaming Approach
|
||
///
|
||
/// This is the RECOMMENDED way to iterate over pages for large documents,
|
||
/// as it never materializes all pages in memory. Use `materialize_pages()`
|
||
/// ONLY when you need repeated random access to pages.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```ignore
|
||
/// // GOOD: Lazy iteration, one page at a time
|
||
/// for page_result in extractor.pages() {
|
||
/// let page = page_result?;
|
||
/// // Process page - it will be dropped after loop iteration
|
||
/// }
|
||
///
|
||
/// // BAD: Materializes all pages in memory (avoid for large documents)
|
||
/// extractor.materialize_pages()?;
|
||
/// for page in extractor.pages.unwrap() { ... }
|
||
/// ```
|
||
pub fn pages(&self) -> PageIter<'_> {
|
||
PageIter {
|
||
lazy_iter: None,
|
||
catalog: &self.catalog,
|
||
resolver: &self.resolver,
|
||
source: Some(&self.source as &dyn ParserPdfSource),
|
||
index: 0,
|
||
}
|
||
}
|
||
|
||
/// Extract a single page by index.
|
||
///
|
||
/// This method extracts one page without materializing the entire document.
|
||
/// Content streams are decoded and the result is returned.
|
||
pub fn extract_page(&self, page_index: usize) -> Result<PageExtraction> {
|
||
let pages = self
|
||
.pages
|
||
.as_ref()
|
||
.ok_or_else(|| anyhow!("Pages not materialized. Call materialize_pages() first."))?;
|
||
|
||
if page_index >= pages.len() {
|
||
return Err(anyhow!(
|
||
"Page index {} out of bounds (document has {} pages)",
|
||
page_index,
|
||
pages.len()
|
||
));
|
||
}
|
||
|
||
let page = &pages[page_index];
|
||
|
||
// For now, return a placeholder extraction
|
||
// The full implementation would decode content streams here
|
||
let [x0, y0, x1, y1] = page.media_box;
|
||
|
||
Ok(PageExtraction {
|
||
index: page_index,
|
||
width: x1 - x0,
|
||
height: y1 - y0,
|
||
rotation: page.rotate,
|
||
spans: vec![],
|
||
blocks: vec![],
|
||
})
|
||
}
|
||
}
|
||
|
||
/// Result of extracting a single page.
|
||
///
|
||
/// This struct contains the minimal data needed for one page,
|
||
/// designed to be dropped immediately after serialization.
|
||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
pub struct PageExtraction {
|
||
/// 0-based page index
|
||
pub index: usize,
|
||
/// Page width in points
|
||
pub width: f64,
|
||
/// Page height in points
|
||
pub height: f64,
|
||
/// Page rotation in degrees
|
||
pub rotation: i32,
|
||
/// Extracted text spans
|
||
pub spans: Vec<SpanData>,
|
||
/// Extracted blocks
|
||
pub blocks: Vec<BlockData>,
|
||
}
|
||
|
||
/// Block data for extracted content.
|
||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
pub struct BlockData {
|
||
/// Block kind (paragraph, heading, etc.)
|
||
pub kind: String,
|
||
/// Block text
|
||
pub text: String,
|
||
}
|
||
|
||
/// Lazy iterator over PDF pages.
|
||
///
|
||
/// Compute fingerprint without full page materialization.
|
||
///
|
||
/// This is a simplified version that uses only catalog-level data.
|
||
/// The full fingerprint computation requires page content streams.
|
||
pub(crate) fn compute_fingerprint_lazy(
|
||
catalog: &Catalog,
|
||
resolver: &XrefResolver,
|
||
acroform: &Option<PdfDict>,
|
||
) -> String {
|
||
// For lazy extraction, use a simpler fingerprint based on catalog data
|
||
// The full implementation would incrementally hash pages as they're extracted
|
||
use crate::fingerprint::FingerprintInput;
|
||
|
||
// Detect JavaScript and XFA presence (no pages available in lazy mode)
|
||
let contains_javascript = if catalog.open_action.is_some() || catalog.aa.is_some() {
|
||
true
|
||
} else {
|
||
// For catalog-level checks, use simple detection
|
||
// Full page/annotation walk requires materialized pages
|
||
false
|
||
};
|
||
let contains_xfa = detect_xfa(acroform);
|
||
|
||
let fingerprint_input = FingerprintInput {
|
||
page_count: 0, // Will be updated when pages are extracted
|
||
pages: vec![],
|
||
struct_tree_root_ref: catalog.struct_tree_root_ref,
|
||
is_tagged: catalog.mark_info.is_tagged,
|
||
catalog_flags: CatalogFlags {
|
||
is_encrypted: false,
|
||
contains_javascript,
|
||
contains_xfa,
|
||
ocg_present: catalog
|
||
.oc_properties
|
||
.as_ref()
|
||
.map(|props| props.present)
|
||
.unwrap_or(false),
|
||
},
|
||
};
|
||
|
||
compute_fingerprint(&fingerprint_input, resolver, None)
|
||
}
|
||
|
||
/// A parsed PDF document that can be from either local or remote sources.
|
||
///
|
||
/// This type provides a unified interface for working with PDFs regardless
|
||
/// of their source (local file, HTTP/HTTPS URL, memory buffer). It holds
|
||
/// the parsed catalog, xref resolver, and lazy page iterator.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```ignore
|
||
/// use pdftract_core::document::Document;
|
||
///
|
||
/// // Open from local file
|
||
/// let doc = Document::open("document.pdf")?;
|
||
///
|
||
/// // Open from remote URL
|
||
/// let doc = Document::open_remote("https://example.com/doc.pdf", &RemoteOpts::new())?;
|
||
///
|
||
/// // Get page count
|
||
/// let count = doc.page_count()?;
|
||
///
|
||
/// // Iterate pages lazily
|
||
/// for page_result in doc.pages() {
|
||
/// let page = page_result?;
|
||
/// println!("Page {}: {}x{}", page.index, page.width, page.height);
|
||
/// }
|
||
/// ```
|
||
pub struct Document {
|
||
/// The parsed catalog
|
||
catalog: Catalog,
|
||
/// The xref resolver for object resolution
|
||
resolver: XrefResolver,
|
||
/// The PDF source (file, HTTP, memory)
|
||
source: Option<Box<dyn ParserPdfSource>>,
|
||
/// The document fingerprint
|
||
fingerprint: String,
|
||
/// Whether this is a remote document
|
||
is_remote: bool,
|
||
}
|
||
|
||
impl Document {
|
||
/// Open a PDF from a local file path.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `path` - Path to the PDF file
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A parsed Document ready for extraction.
|
||
///
|
||
/// # Errors
|
||
///
|
||
/// Returns an error if:
|
||
/// - The file cannot be opened
|
||
/// - The PDF is malformed
|
||
/// - The xref table cannot be parsed
|
||
pub fn open<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
|
||
let path = path.as_ref();
|
||
let parser_source = ParserFileSource::open(path).context("Failed to open PDF file")?;
|
||
Self::from_source(Box::new(parser_source), false)
|
||
}
|
||
|
||
/// Open a PDF from a remote HTTP/HTTPS URL.
|
||
///
|
||
/// This performs the HTTP fetch sequence:
|
||
/// 1. HEAD request to verify Range support and get Content-Length
|
||
/// 2. Tail Range fetch (last 16 KB, progressive up to 1 MB) for startxref
|
||
/// 3. Xref parsing with forward-scan disabled (no full file fetch)
|
||
/// 4. Returns a parsed Document
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `url` - HTTP/HTTPS URL to the PDF file
|
||
/// * `opts` - Remote options (headers, credentials, etc.)
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A parsed Document ready for extraction.
|
||
///
|
||
/// # Errors
|
||
///
|
||
/// Returns an error if:
|
||
/// - URL is invalid or DNS fails
|
||
/// - TLS handshake fails
|
||
/// - Server returns 401/403
|
||
/// - Server doesn't support Range requests
|
||
/// - No Content-Length header
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```ignore
|
||
/// use pdftract_core::{Document, source::RemoteOpts};
|
||
///
|
||
/// let opts = RemoteOpts::new()
|
||
/// .with_header("Authorization", "Bearer token");
|
||
///
|
||
/// let doc = Document::open_remote("https://example.com/doc.pdf", &opts)?;
|
||
/// ```
|
||
#[cfg(feature = "remote")]
|
||
pub fn open_remote(url: &str, opts: &RemoteOpts) -> Result<Self> {
|
||
use crate::parser::stream::SourceAdapter;
|
||
use crate::source::open_remote as open_remote_source;
|
||
let source = open_remote_source(url, opts, None).context("Failed to open remote PDF source")?;
|
||
let adapted = Box::new(SourceAdapter::new(source)) as Box<dyn ParserPdfSource>;
|
||
Self::from_source(adapted, true)
|
||
}
|
||
|
||
/// Create a Document from a generic PdfSource.
|
||
///
|
||
/// This is used internally by both `open` and `open_remote`.
|
||
fn from_source(source: Box<dyn ParserPdfSource>, is_remote: bool) -> Result<Self> {
|
||
// Find the startxref offset
|
||
let startxref_offset = find_startxref(&*source).context("Failed to find startxref offset")?;
|
||
|
||
// Load the xref table (forward-scan is disabled for remote sources automatically)
|
||
let xref_section = load_xref_with_prev_chain(&*source, startxref_offset);
|
||
|
||
// Create resolver from xref section
|
||
let resolver = XrefResolver::from_section(xref_section.clone());
|
||
|
||
// Get the root reference from trailer
|
||
let root_ref = xref_section
|
||
.trailer
|
||
.as_ref()
|
||
.and_then(|trailer| trailer.get("Root"))
|
||
.and_then(|obj| obj.as_ref())
|
||
.ok_or_else(|| anyhow!("No /Root reference in trailer"))?;
|
||
|
||
// Parse the catalog
|
||
let catalog = parse_catalog(&resolver, root_ref, Some(&*source)).map_err(|diagnostics| {
|
||
let msg = diagnostics
|
||
.first()
|
||
.map(|d| d.message.as_ref())
|
||
.unwrap_or("unknown error");
|
||
anyhow!("Failed to parse catalog: {}", msg)
|
||
})?;
|
||
|
||
// Resolve AcroForm dictionary if present (for XFA detection)
|
||
let acroform = catalog
|
||
.acroform_ref
|
||
.and_then(|r| resolver.resolve(r).ok())
|
||
.and_then(|o| o.as_dict().map(|d| d.clone()));
|
||
|
||
// Build fingerprint (lazy version without full page tree)
|
||
let fingerprint = compute_fingerprint_lazy(&catalog, &resolver, &acroform);
|
||
|
||
Ok(Self {
|
||
catalog,
|
||
resolver,
|
||
source: Some(source),
|
||
fingerprint,
|
||
is_remote,
|
||
})
|
||
}
|
||
|
||
/// Get the document fingerprint.
|
||
pub fn fingerprint(&self) -> &str {
|
||
&self.fingerprint
|
||
}
|
||
|
||
/// Get the catalog.
|
||
pub fn catalog(&self) -> &Catalog {
|
||
&self.catalog
|
||
}
|
||
|
||
/// Check if this is a remote document.
|
||
pub fn is_remote(&self) -> bool {
|
||
self.is_remote
|
||
}
|
||
|
||
/// Get the total page count.
|
||
///
|
||
/// This walks the page tree to count pages without materializing PageDict objects.
|
||
/// Uses O(depth) memory, making it safe for large documents.
|
||
pub fn page_count(&self) -> Result<usize> {
|
||
use crate::parser::pages::count_pages_tree;
|
||
count_pages_tree(&self.resolver, self.catalog.pages_ref)
|
||
.map_err(|e| anyhow!("Failed to count pages: {:?}", e))
|
||
}
|
||
|
||
/// Get a lazy iterator over pages.
|
||
///
|
||
/// The iterator yields pages one at a time, decoding each page's
|
||
/// content streams on-demand and dropping them after use.
|
||
///
|
||
/// # Memory Behavior
|
||
///
|
||
/// This uses LazyPageIter which walks the page tree depth-first,
|
||
/// materializing only the current path from root to leaf (max ~16 nodes).
|
||
/// Each yielded PageExtraction contains the extracted data for one page,
|
||
/// and all intermediate data is dropped before yielding the next page.
|
||
pub fn pages(&self) -> PageIter<'_> {
|
||
PageIter {
|
||
lazy_iter: None,
|
||
catalog: &self.catalog,
|
||
resolver: &self.resolver,
|
||
source: self.source.as_ref().map(|s| s.as_ref()),
|
||
index: 0,
|
||
}
|
||
}
|
||
|
||
/// Get the xref resolver.
|
||
pub fn resolver(&self) -> &XrefResolver {
|
||
&self.resolver
|
||
}
|
||
|
||
/// Get the underlying source if available.
|
||
pub fn source(&self) -> Option<&dyn ParserPdfSource> {
|
||
self.source.as_ref().map(|s| s.as_ref())
|
||
}
|
||
}
|
||
|
||
/// Lazy iterator over PDF pages.
|
||
///
|
||
/// This iterator yields pages one at a time without materializing
|
||
/// the entire document model in memory.
|
||
///
|
||
/// # Memory Behavior
|
||
///
|
||
/// Uses LazyPageIter internally, which walks the page tree depth-first
|
||
/// and materializes only the current path from root to leaf (max ~16 nodes).
|
||
/// Each yielded PageExtraction contains the extracted data for one page,
|
||
/// and all intermediate data is dropped before yielding the next page.
|
||
///
|
||
/// # Examples
|
||
///
|
||
/// Iterate over pages with bounded memory:
|
||
///
|
||
/// ```rust,no_run
|
||
/// use pdftract_core::document::Document;
|
||
///
|
||
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||
/// let doc = Document::open("large_document.pdf")?;
|
||
///
|
||
/// // Memory stays O(depth × per-page), not O(pages × per-page)
|
||
/// for page_result in doc.pages() {
|
||
/// let page = page_result?;
|
||
/// println!("Page {}: {}x{}", page.index, page.width, page.height);
|
||
/// // PageExtraction is dropped after each iteration
|
||
/// }
|
||
/// # Ok(())
|
||
/// # }
|
||
/// ```
|
||
pub struct PageIter<'a> {
|
||
/// Lazy page iterator from the parser
|
||
lazy_iter: Option<LazyPageIter<'a>>,
|
||
/// Reference to the catalog for page tree root
|
||
catalog: &'a Catalog,
|
||
/// Reference to the resolver for object resolution
|
||
resolver: &'a XrefResolver,
|
||
/// Reference to the source for stream reading
|
||
source: Option<&'a dyn ParserPdfSource>,
|
||
/// Current page index
|
||
index: usize,
|
||
}
|
||
|
||
impl<'a> Iterator for PageIter<'a> {
|
||
type Item = Result<PageExtraction>;
|
||
|
||
fn next(&mut self) -> Option<Self::Item> {
|
||
// Initialize lazy iterator on first use
|
||
if self.lazy_iter.is_none() {
|
||
match LazyPageIter::new(self.resolver, self.catalog.pages_ref) {
|
||
Ok(iter) => self.lazy_iter = Some(iter),
|
||
Err(diagnostics) => {
|
||
let msg = diagnostics
|
||
.first()
|
||
.map(|d| d.message.as_ref())
|
||
.unwrap_or("unknown error");
|
||
return Some(Err(anyhow!("Failed to create lazy page iterator: {}", msg)));
|
||
}
|
||
}
|
||
}
|
||
|
||
let iter = self.lazy_iter.as_mut()?;
|
||
|
||
match iter.next() {
|
||
Some(Ok(page_dict)) => {
|
||
let [x0, y0, x1, y1] = page_dict.media_box;
|
||
let result = Ok(PageExtraction {
|
||
index: self.index,
|
||
width: x1 - x0,
|
||
height: y1 - y0,
|
||
rotation: page_dict.rotate,
|
||
spans: vec![],
|
||
blocks: vec![],
|
||
});
|
||
self.index += 1;
|
||
|
||
// Explicitly drop page_dict to ensure memory is freed
|
||
drop(page_dict);
|
||
|
||
Some(result)
|
||
}
|
||
Some(Err(diagnostics)) => {
|
||
let msg = diagnostics
|
||
.first()
|
||
.map(|d| d.message.as_ref())
|
||
.unwrap_or("unknown error");
|
||
self.index += 1;
|
||
Some(Err(anyhow!(
|
||
"Error extracting page {}: {}",
|
||
self.index - 1,
|
||
msg
|
||
)))
|
||
}
|
||
None => None,
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Open a PDF served over HTTP/HTTPS using default remote options.
///
/// Thin wrapper: calls `crate::source::open_remote` with `RemoteOpts::new()`
/// and `None` as the third argument, and returns its result unchanged. This
/// function does not itself produce a catalog, resolver or fingerprint.
///
/// The fetch sequence described for remote sources is:
/// 1. HEAD request to verify Range support and get Content-Length
///    (if HEAD is answered with 405, a GET with `Range: bytes=0-0` is used
///    instead)
/// 2. Tail Range fetch (last 16 KB) to parse startxref and trailer
/// 3. Xref parsing with forward-scan disabled for remote sources
///
/// NOTE(review): this sequence is carried over from the previous docs and is
/// implemented outside this function — confirm against
/// `crate::source::open_remote`.
///
/// # Arguments
///
/// * `url` - HTTP/HTTPS URL to the PDF file
///
/// # Returns
///
/// A `Box<dyn PdfSource>` that can be used for PDF parsing.
///
/// # Errors
///
/// Returns an error if:
/// - URL is invalid or DNS fails
/// - TLS handshake fails
/// - Server returns 401/403
/// - Server doesn't support Range
/// - No Content-Length is reported
///
/// # Example
///
/// ```ignore
/// use pdftract_core::document::open_remote_url;
///
/// let source = open_remote_url("https://example.com/doc.pdf")?;
/// // `source` is a `Box<dyn PdfSource>` ready for further parsing
/// ```
#[cfg(feature = "remote")]
pub fn open_remote_url(url: &str) -> std::io::Result<Box<dyn PdfSource>> {
    let default_opts = RemoteOpts::new();
    crate::source::open_remote(url, &default_opts, None)
}
|
||
|
||
/// Open a PDF served over HTTP/HTTPS with caller-supplied options.
///
/// Thin wrapper: forwards `url` and `opts` to `crate::source::open_remote`
/// with `None` as the third argument, and returns its result unchanged.
/// Use it when the request needs custom headers or credentials.
///
/// # Arguments
///
/// * `url` - HTTP/HTTPS URL to the PDF file
/// * `opts` - Remote options (headers, credentials, etc.)
///
/// # Returns
///
/// A `Box<dyn PdfSource>` that can be used for PDF parsing.
///
/// # Errors
///
/// Returns an error if:
/// - URL is invalid or DNS fails → std::io::Error with kind `NotFound`
/// - TLS handshake fails → std::io::Error with kind `PermissionDenied`
/// - Server returns 401/403 → std::io::Error with kind `PermissionDenied`
/// - Server doesn't support Range → std::io::Error with kind `Unsupported`
/// - No Content-Length → Returns error with kind `Other`
///
/// A HEAD request answered with 405 is not an error by itself: the fetch
/// falls back to GET with `Range: bytes=0-0`.
///
/// # Example
///
/// ```ignore
/// use pdftract_core::document::open_remote_url_with_opts;
/// use pdftract_core::source::RemoteOpts;
///
/// let opts = RemoteOpts::new()
///     .with_header("Authorization", "Bearer token");
///
/// let source = open_remote_url_with_opts("https://example.com/doc.pdf", &opts)?;
/// ```
#[cfg(feature = "remote")]
pub fn open_remote_url_with_opts(url: &str, opts: &RemoteOpts) -> std::io::Result<Box<dyn PdfSource>> {
    crate::source::open_remote(url, opts, None)
}
|
||
|
||
#[cfg(test)]
|
||
mod tests {
|
||
use super::*;
|
||
use std::fs::File;
|
||
use std::io::Write;
|
||
|
||
/// Create a minimal valid PDF for testing.
///
/// Writes a fixed byte-string to `path`, creating or truncating the file.
/// The literal describes a single-page document: a catalog, a page tree
/// with one kid, a page with `/MediaBox [0 0 612 792]` and a Helvetica
/// font resource, and one content stream that shows `(Test)`.
///
/// The xref offsets and the `startxref` value (403) are hard-coded in the
/// literal. `test_find_startxref` asserts on 403 and `test_parse_pdf_file`
/// asserts on the media box, so any edit to the bytes must keep both in step.
///
/// Errors from `File::create` and `write_all` are propagated with `?`.
|
||
fn create_minimal_pdf(path: &std::path::Path) -> Result<()> {
|
||
let pdf_data = br#"%PDF-1.4
|
||
1 0 obj
|
||
<<
|
||
/Type /Catalog
|
||
/Pages 2 0 R
|
||
>>
|
||
endobj
|
||
2 0 obj
|
||
<<
|
||
/Type /Pages
|
||
/Kids [3 0 R]
|
||
/Count 1
|
||
>>
|
||
endobj
|
||
3 0 obj
|
||
<<
|
||
/Type /Page
|
||
/Parent 2 0 R
|
||
/MediaBox [0 0 612 792]
|
||
/Contents 4 0 R
|
||
/Resources <<
|
||
/Font <<
|
||
/F1 <<
|
||
/Type /Font
|
||
/Subtype /Type1
|
||
/BaseFont /Helvetica
|
||
>>
|
||
>>
|
||
>>
|
||
>>
|
||
endobj
|
||
4 0 obj
|
||
<<
|
||
/Length 44
|
||
>>
|
||
stream
|
||
BT
|
||
/F1 12 Tf
|
||
100 700 Td
|
||
(Test) Tj
|
||
ET
|
||
endstream
|
||
endobj
|
||
xref
|
||
0 5
|
||
0000000000 65535 f
|
||
0000000009 00000 n
|
||
0000000058 00000 n
|
||
0000000115 00000 n
|
||
0000000298 00000 n
|
||
trailer
|
||
<<
|
||
/Size 5
|
||
/Root 1 0 R
|
||
>>
|
||
startxref
|
||
403
|
||
%%EOF
|
||
"#;
|
||
|
||
// Persist the fixture; any I/O failure is returned to the caller.
let mut file = File::create(path)?;
|
||
file.write_all(pdf_data)?;
|
||
Ok(())
|
||
}
|
||
|
||
/// The fixture's trailer declares `startxref 403`; the scanner must return it.
#[test]
fn test_find_startxref() {
    let dir = tempfile::tempdir().unwrap();
    let fixture = dir.path().join("test.pdf");
    create_minimal_pdf(&fixture).unwrap();

    let source = FileSource::open(&fixture).unwrap();
    assert_eq!(find_startxref(&source).unwrap(), 403);
}
|
||
|
||
/// Parsing the fixture yields a versioned fingerprint, exactly one page with
/// the fixture's media box and no rotation, and a non-empty resolver.
#[test]
fn test_parse_pdf_file() {
    let temp_dir = tempfile::tempdir().unwrap();
    let pdf_path = temp_dir.path().join("test.pdf");
    create_minimal_pdf(&pdf_path).unwrap();

    // The catalog is not inspected here; the underscore prefix silences the
    // unused-variable warning while keeping the value alive for the test.
    let (fingerprint, _catalog, pages, resolver) = parse_pdf_file(&pdf_path).unwrap();

    assert!(fingerprint.starts_with("pdftract-v1:"));
    assert_eq!(pages.len(), 1);
    assert_eq!(pages[0].media_box, [0.0, 0.0, 612.0, 792.0]);
    assert_eq!(pages[0].rotate, 0);

    // Verify resolver has entries
    assert!(resolver.len() > 0);
}
|
||
|
||
/// The fingerprint is the `pdftract-v1:` prefix followed by 64 hex digits.
#[test]
fn test_compute_pdf_fingerprint() {
    const PREFIX: &str = "pdftract-v1:";

    let dir = tempfile::tempdir().unwrap();
    let fixture = dir.path().join("test.pdf");
    create_minimal_pdf(&fixture).unwrap();

    let fingerprint = compute_pdf_fingerprint(&fixture).unwrap();

    assert!(fingerprint.starts_with(PREFIX));
    assert_eq!(fingerprint.len(), PREFIX.len() + 64);

    // Everything after the prefix must be hexadecimal.
    let digest = &fingerprint[PREFIX.len()..];
    assert!(digest.chars().all(|ch| ch.is_ascii_hexdigit()));
}
|
||
|
||
/// Page 0 of the fixture produces at least one span whose text is non-empty
/// and whose bounding box equals the page's media box.
#[test]
fn test_extract_spans_from_page() {
    let dir = tempfile::tempdir().unwrap();
    let fixture = dir.path().join("test.pdf");
    create_minimal_pdf(&fixture).unwrap();

    let spans = extract_spans_from_page(&fixture, 0).unwrap();

    // At least one span is expected (a placeholder for now).
    assert!(!spans.is_empty());

    // Inspect the first span's shape.
    let first = &spans[0];
    assert!(!first.text.is_empty());
    assert_eq!(first.bbox, [0.0, 0.0, 612.0, 792.0]);
}
|
||
|
||
/// Asking for page 10 of the one-page fixture must fail rather than panic.
#[test]
fn test_extract_spans_out_of_bounds() {
    let dir = tempfile::tempdir().unwrap();
    let fixture = dir.path().join("test.pdf");
    create_minimal_pdf(&fixture).unwrap();

    assert!(extract_spans_from_page(&fixture, 10).is_err());
}
|
||
}
|