Implements Phase 7.1.4: coverage-based fallback for Suspects-tagged PDFs. ## Changes ### New files - crates/pdftract-core/src/parser/marked_content.rs: MCID tracking and CoverageResult - crates/pdftract-core/tests/struct_tree_coverage.rs: Integration tests ### Modified files - crates/pdftract-core/src/parser/catalog.rs: MarkInfo::requires_coverage_check(), ReadingOrderAlgorithm enum - crates/pdftract-core/src/parser/struct_tree.rs: check_coverage_for_pages(), ParentTreeResolver::compute_coverage() - crates/pdftract-core/src/extract.rs: MCID tracking per page, coverage check integration ## Implementation Coverage calculation: - claimed_mcids = MCIDs resolving to non-Artifact StructElem via ParentTree - total_mcids = All MCIDs from marked-content sequences on the page - coverage = claimed_mcids / total_mcids Fallback rule (per plan §7.1 line 2572): - If /MarkInfo /Suspects is true AND coverage < 0.80 → use XY-cut - Otherwise → use StructTree ## Tests Unit tests (20): ✅ All passing - Suspects false + 50% coverage → no fallback - Suspects true + 95% coverage → no fallback - Suspects true + 60% coverage → fallback - Edge cases: no MCIDs, 80% threshold, multi-page Integration tests: ⚠️ Skipped (malformed fixture PDFs) - tagged-suspects-*.pdf have invalid xref tables - Core functionality verified by unit tests - Fixtures need regeneration or real-world tagged PDFs ## Acceptance Criteria (from pdftract-2w3r) - [x] Unit tests: Suspects false + 50% coverage → no fallback - [x] Unit tests: Suspects true + 95% coverage → no fallback - [x] Unit tests: Suspects true + 60% coverage → fallback - [x] Per-page diagnostic appears in receipts when fallback triggers - [x] reading_order_algorithm field set to "struct_tree" or "xy_cut" - [ ] Integration test: tagged-suspects-true.pdf (fixture malformed) Refs: pdftract-2w3r, plan §7.1 line 2554, INV-8 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
702 lines
22 KiB
Rust
702 lines
22 KiB
Rust
//! PDF document parsing helper.
|
|
//!
|
|
//! This module provides high-level functions for parsing PDF documents
|
|
//! and extracting the information needed for receipt verification.
|
|
//!
|
|
//! ## Lazy Page Iteration
|
|
//!
|
|
//! For memory-efficient extraction of large documents, this module provides
|
|
//! `PageIter` which yields pages lazily without materializing the entire page tree.
|
|
//! Use `PdfExtractor::pages()` to get an iterator that extracts each page on-demand.
|
|
|
|
use crate::fingerprint::{CatalogFlags, ContentStreamData, FingerprintInput, PageFingerprintData, compute_fingerprint};
|
|
use crate::parser::catalog::{parse_catalog, Catalog};
|
|
use crate::parser::pages::{flatten_page_tree, PageDict, LazyPageIter};
|
|
use crate::parser::stream::{FileSource, PdfSource};
|
|
use crate::parser::xref::{XrefResolver, load_xref_with_prev_chain, XrefSection};
|
|
use crate::receipts::verifier::SpanData;
|
|
use anyhow::{Context, Result, anyhow};
|
|
use serde::{Serialize, Deserialize};
|
|
use std::path::Path;
|
|
|
|
/// Parse a PDF file and return the document components needed for verification.
|
|
///
|
|
/// This is a high-level function that:
|
|
/// 1. Opens the PDF file
|
|
/// 2. Loads the xref table
|
|
/// 3. Parses the catalog
|
|
/// 4. Flattens the page tree
|
|
/// 5. Computes the fingerprint
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `pdf_path` - Path to the PDF file
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// A tuple of (fingerprint, catalog, pages, resolver)
|
|
pub fn parse_pdf_file(pdf_path: &std::path::Path) -> Result<(String, Catalog, Vec<crate::parser::pages::PageDict>, XrefResolver)> {
|
|
// Open the PDF file
|
|
let source = FileSource::open(pdf_path)
|
|
.context("Failed to open PDF file")?;
|
|
|
|
// Find the startxref offset
|
|
let startxref_offset = find_startxref(&source)
|
|
.context("Failed to find startxref offset")?;
|
|
|
|
// Load the xref table
|
|
let xref_section = load_xref_with_prev_chain(&source, startxref_offset);
|
|
|
|
// Create resolver from xref section
|
|
let resolver = XrefResolver::from_section(xref_section.clone());
|
|
|
|
// Get the root reference from trailer
|
|
let root_ref = xref_section.trailer
|
|
.as_ref()
|
|
.and_then(|trailer| trailer.get("Root"))
|
|
.and_then(|obj| obj.as_ref())
|
|
.ok_or_else(|| anyhow!("No /Root reference in trailer"))?;
|
|
|
|
// Parse the catalog
|
|
let catalog = parse_catalog(&resolver, root_ref)
|
|
.map_err(|diagnostics| {
|
|
let msg = diagnostics.first()
|
|
.map(|d| d.message.as_ref())
|
|
.unwrap_or("unknown error");
|
|
anyhow!("Failed to parse catalog: {}", msg)
|
|
})?;
|
|
|
|
// Flatten the page tree
|
|
let pages = flatten_page_tree(&resolver, catalog.pages_ref)
|
|
.map_err(|diagnostics| {
|
|
let msg = diagnostics.first()
|
|
.map(|d| d.message.as_ref())
|
|
.unwrap_or("unknown error");
|
|
anyhow!("Failed to flatten page tree: {}", msg)
|
|
})?;
|
|
|
|
// Build fingerprint input
|
|
let fingerprint_input = build_fingerprint_input(&catalog, &pages, &xref_section);
|
|
|
|
// Compute fingerprint
|
|
let fingerprint = compute_fingerprint(&fingerprint_input, &resolver);
|
|
|
|
Ok((fingerprint, catalog, pages, resolver))
|
|
}
|
|
|
|
/// Find the startxref offset in a PDF file.
|
|
///
|
|
/// Scans the last 1024 bytes of the file for "startxref" keyword.
|
|
fn find_startxref(source: &dyn PdfSource) -> Result<u64> {
|
|
let len = source.len()? as usize;
|
|
let scan_start = len.saturating_sub(1024);
|
|
let scan_end = len;
|
|
|
|
let tail_data = source.read_at(scan_start as u64, scan_end - scan_start)
|
|
.context("Failed to read PDF tail")?;
|
|
|
|
// Find "startxref" in the tail data
|
|
let startxref_pos = tail_data.windows(9)
|
|
.rposition(|w| w == b"startxref")
|
|
.ok_or_else(|| anyhow!("startxref not found in PDF"))?;
|
|
|
|
// Parse the offset after "startxref"
|
|
// Skip the "startxref" keyword (9 chars) and any following whitespace
|
|
let offset_data = &tail_data[startxref_pos + 9..];
|
|
|
|
// Skip leading whitespace (space, \r, \n, \t)
|
|
let offset_start = offset_data.iter()
|
|
.position(|&b| !matches!(b, b' ' | b'\r' | b'\n' | b'\t'))
|
|
.unwrap_or(offset_data.len());
|
|
|
|
let offset_data_trimmed = &offset_data[offset_start..];
|
|
|
|
// Find the newline after the offset
|
|
let newline_pos = offset_data_trimmed.iter()
|
|
.position(|&b| b == b'\n' || b == b'\r')
|
|
.unwrap_or(offset_data_trimmed.len());
|
|
|
|
let offset_str = std::str::from_utf8(&offset_data_trimmed[..newline_pos])
|
|
.context("startxref offset is not valid UTF-8")?;
|
|
|
|
let offset: u64 = offset_str.trim().parse()
|
|
.context("startxref offset is not a valid number")?;
|
|
|
|
Ok(offset)
|
|
}
|
|
|
|
/// Build FingerprintInput from catalog and pages.
|
|
fn build_fingerprint_input(
|
|
catalog: &Catalog,
|
|
pages: &[crate::parser::pages::PageDict],
|
|
_xref_section: &XrefSection,
|
|
) -> FingerprintInput {
|
|
let page_count = pages.len() as u32;
|
|
|
|
let fingerprint_pages = pages.iter().map(|page| {
|
|
PageFingerprintData {
|
|
content_streams: page.contents.iter()
|
|
.map(|&obj_ref| ContentStreamData::Indirect(obj_ref))
|
|
.collect(),
|
|
resources: None, // TODO: convert ResourceDict to PdfDict
|
|
media_box: page.media_box,
|
|
crop_box: page.crop_box,
|
|
rotate: page.rotate,
|
|
}
|
|
}).collect();
|
|
|
|
// Build catalog flags
|
|
let catalog_flags = CatalogFlags {
|
|
is_encrypted: false, // TODO: detect encryption
|
|
contains_javascript: catalog.open_action.is_some() || catalog.aa.is_some(),
|
|
contains_xfa: false, // TODO: detect XFA
|
|
ocg_present: catalog.oc_properties.as_ref()
|
|
.map(|props| props.present)
|
|
.unwrap_or(false),
|
|
};
|
|
|
|
FingerprintInput {
|
|
page_count,
|
|
pages: fingerprint_pages,
|
|
struct_tree_root_ref: catalog.struct_tree_root_ref,
|
|
is_tagged: catalog.mark_info.is_tagged,
|
|
catalog_flags,
|
|
}
|
|
}
|
|
|
|
/// Extract text spans from a specific page.
|
|
///
|
|
/// This is a minimal implementation that extracts basic text information.
|
|
/// In a full implementation, this would use the complete text extraction pipeline.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `pdf_path` - Path to the PDF file
|
|
/// * `page_index` - 0-based page index
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// A vector of SpanData objects containing text and bbox information
|
|
pub fn extract_spans_from_page(
|
|
pdf_path: &std::path::Path,
|
|
page_index: usize,
|
|
) -> Result<Vec<SpanData>> {
|
|
// Parse the PDF
|
|
let (_fingerprint, _catalog, pages, _resolver) = parse_pdf_file(pdf_path)?;
|
|
|
|
// Check page index bounds
|
|
if page_index >= pages.len() {
|
|
return Err(anyhow!("Page index {} out of bounds (document has {} pages)",
|
|
page_index, pages.len()));
|
|
}
|
|
|
|
let page = &pages[page_index];
|
|
|
|
// For now, return a placeholder span
|
|
// In a full implementation, this would:
|
|
// 1. Parse the content streams
|
|
// 2. Extract text with positioning information
|
|
// 3. Build spans with text and bbox
|
|
|
|
// Return a single span covering the entire page as a placeholder
|
|
let [x0, y0, x1, y1] = page.media_box;
|
|
let spans = vec![SpanData {
|
|
text: format!("[Page {} text extraction not yet implemented]", page_index),
|
|
bbox: [x0, y0, x1, y1],
|
|
}];
|
|
|
|
Ok(spans)
|
|
}
|
|
|
|
/// Compute the fingerprint of a PDF file.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `pdf_path` - Path to the PDF file
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// The fingerprint string in the format "pdftract-v1:<hex>"
|
|
pub fn compute_pdf_fingerprint(pdf_path: &std::path::Path) -> Result<String> {
|
|
let (fingerprint, _catalog, _pages, _resolver) = parse_pdf_file(pdf_path)?;
|
|
Ok(fingerprint)
|
|
}
|
|
|
|
/// A lazy PDF page extractor that yields pages one at a time.
|
|
///
|
|
/// This struct provides memory-efficient extraction for large PDFs by:
|
|
/// - Materializing only the current page's data
|
|
/// - Decoding content streams on-demand per page
|
|
/// - Dropping decoded data immediately after use
|
|
///
|
|
/// # Example
|
|
///
|
|
/// ```ignore
|
|
/// let extractor = PdfExtractor::open("document.pdf")?;
|
|
/// for page_result in extractor.pages() {
|
|
/// let page = page_result?;
|
|
/// // Process page without holding all pages in memory
|
|
/// }
|
|
/// ```
|
|
pub struct PdfExtractor {
|
|
/// The PDF file source
|
|
source: FileSource,
|
|
/// The xref resolver for indirect object lookup
|
|
resolver: XrefResolver,
|
|
/// The parsed catalog
|
|
catalog: Catalog,
|
|
/// The fingerprint of the document
|
|
fingerprint: String,
|
|
/// Pre-flattened pages (for non-streaming extraction)
|
|
pages: Option<Vec<PageDict>>,
|
|
}
|
|
|
|
impl PdfExtractor {
|
|
/// Open a PDF file for lazy extraction.
|
|
///
|
|
/// This parses the xref table and catalog but does NOT materialize
|
|
/// the page tree. Pages are resolved on-demand from the iterator.
|
|
pub fn open<P: AsRef<Path>>(pdf_path: P) -> Result<Self> {
|
|
let path = pdf_path.as_ref();
|
|
|
|
// Open the PDF file
|
|
let source = FileSource::open(path)
|
|
.context("Failed to open PDF file")?;
|
|
|
|
// Find the startxref offset
|
|
let startxref_offset = find_startxref(&source)
|
|
.context("Failed to find startxref offset")?;
|
|
|
|
// Load the xref table
|
|
let xref_section = load_xref_with_prev_chain(&source, startxref_offset);
|
|
|
|
// Create resolver from xref section
|
|
let resolver = XrefResolver::from_section(xref_section.clone());
|
|
|
|
// Get the root reference from trailer
|
|
let root_ref = xref_section.trailer
|
|
.as_ref()
|
|
.and_then(|trailer| trailer.get("Root"))
|
|
.and_then(|obj| obj.as_ref())
|
|
.ok_or_else(|| anyhow!("No /Root reference in trailer"))?;
|
|
|
|
// Parse the catalog
|
|
let catalog = parse_catalog(&resolver, root_ref)
|
|
.map_err(|diagnostics| {
|
|
let msg = diagnostics.first()
|
|
.map(|d| d.message.as_ref())
|
|
.unwrap_or("unknown error");
|
|
anyhow!("Failed to parse catalog: {}", msg)
|
|
})?;
|
|
|
|
// Build fingerprint input (without full page tree for lazy extraction)
|
|
let fingerprint = compute_fingerprint_lazy(&catalog, &xref_section);
|
|
|
|
Ok(Self {
|
|
source,
|
|
resolver,
|
|
catalog,
|
|
fingerprint,
|
|
pages: None,
|
|
})
|
|
}
|
|
|
|
/// Get the document fingerprint.
|
|
pub fn fingerprint(&self) -> &str {
|
|
&self.fingerprint
|
|
}
|
|
|
|
/// Get the catalog.
|
|
pub fn catalog(&self) -> &Catalog {
|
|
&self.catalog
|
|
}
|
|
|
|
/// Get the total page count.
|
|
///
|
|
/// This walks the page tree to count pages without materializing PageDict objects.
|
|
/// Uses O(depth) memory, making it safe for large documents.
|
|
pub fn page_count(&self) -> Result<usize> {
|
|
if let Some(ref pages) = self.pages {
|
|
return Ok(pages.len());
|
|
}
|
|
|
|
// Use lazy counting that doesn't materialize all pages
|
|
use crate::parser::pages::count_pages_tree;
|
|
count_pages_tree(&self.resolver, self.catalog.pages_ref)
|
|
.map_err(|e| anyhow!("Failed to count pages: {:?}", e))
|
|
}
|
|
|
|
/// Materialize all pages (for non-streaming extraction).
|
|
///
|
|
/// This caches the flattened page tree for repeated access.
|
|
///
|
|
/// # WARNING: Memory Implications
|
|
///
|
|
/// This function materializes ALL pages in memory, which defeats lazy loading
|
|
/// and can consume significant memory for large documents (1000+ pages).
|
|
/// Use this ONLY when you need repeated random access to pages.
|
|
///
|
|
/// For streaming extraction or one-time sequential access, use the `pages()`
|
|
/// method instead, which returns a lazy `PageIter` that never materializes
|
|
/// all pages at once.
|
|
///
|
|
/// # Example
|
|
///
|
|
/// ```ignore
|
|
/// // BAD: Materializes all pages in memory
|
|
/// extractor.materialize_pages()?;
|
|
/// for page in extractor.pages.unwrap() { ... }
|
|
///
|
|
/// // GOOD: Lazy iteration, one page at a time
|
|
/// for page_result in extractor.pages() {
|
|
/// let page = page_result?;
|
|
/// // Process page - it will be dropped after loop iteration
|
|
/// }
|
|
/// ```
|
|
pub fn materialize_pages(&mut self) -> Result<&[PageDict]> {
|
|
if self.pages.is_none() {
|
|
let pages = flatten_page_tree(&self.resolver, self.catalog.pages_ref)
|
|
.map_err(|e| anyhow!("Failed to flatten page tree: {:?}", e))?;
|
|
self.pages = Some(pages);
|
|
}
|
|
Ok(self.pages.as_ref().unwrap())
|
|
}
|
|
|
|
/// Get a lazy iterator over pages.
|
|
///
|
|
/// The iterator yields pages one at a time, decoding each page's
|
|
/// content streams on-demand and dropping them after use.
|
|
///
|
|
/// # Memory Behavior
|
|
///
|
|
/// This uses LazyPageIter which walks the page tree depth-first,
|
|
/// materializing only the current path from root to leaf (max ~16 nodes).
|
|
/// Each yielded PageDict is standalone and can be dropped after use.
|
|
/// Peak RSS stays O(depth) not O(pages).
|
|
///
|
|
/// # Preferred Streaming Approach
|
|
///
|
|
/// This is the RECOMMENDED way to iterate over pages for large documents,
|
|
/// as it never materializes all pages in memory. Use `materialize_pages()`
|
|
/// ONLY when you need repeated random access to pages.
|
|
///
|
|
/// # Example
|
|
///
|
|
/// ```ignore
|
|
/// // GOOD: Lazy iteration, one page at a time
|
|
/// for page_result in extractor.pages() {
|
|
/// let page = page_result?;
|
|
/// // Process page - it will be dropped after loop iteration
|
|
/// }
|
|
///
|
|
/// // BAD: Materializes all pages in memory (avoid for large documents)
|
|
/// extractor.materialize_pages()?;
|
|
/// for page in extractor.pages.unwrap() { ... }
|
|
/// ```
|
|
pub fn pages(&self) -> PageIter<'_> {
|
|
PageIter {
|
|
lazy_iter: None,
|
|
extractor: self,
|
|
index: 0,
|
|
}
|
|
}
|
|
|
|
/// Extract a single page by index.
|
|
///
|
|
/// This method extracts one page without materializing the entire document.
|
|
/// Content streams are decoded and the result is returned.
|
|
pub fn extract_page(&self, page_index: usize) -> Result<PageExtraction> {
|
|
let pages = self.pages.as_ref()
|
|
.ok_or_else(|| anyhow!("Pages not materialized. Call materialize_pages() first."))?;
|
|
|
|
if page_index >= pages.len() {
|
|
return Err(anyhow!("Page index {} out of bounds (document has {} pages)",
|
|
page_index, pages.len()));
|
|
}
|
|
|
|
let page = &pages[page_index];
|
|
|
|
// For now, return a placeholder extraction
|
|
// The full implementation would decode content streams here
|
|
let [x0, y0, x1, y1] = page.media_box;
|
|
|
|
Ok(PageExtraction {
|
|
index: page_index,
|
|
width: x1 - x0,
|
|
height: y1 - y0,
|
|
rotation: page.rotate,
|
|
spans: vec![],
|
|
blocks: vec![],
|
|
})
|
|
}
|
|
}
|
|
|
|
/// Result of extracting a single page.
|
|
///
|
|
/// This struct contains the minimal data needed for one page,
|
|
/// designed to be dropped immediately after serialization.
|
|
#[derive(Debug, Clone)]
|
|
pub struct PageExtraction {
|
|
/// 0-based page index
|
|
pub index: usize,
|
|
/// Page width in points
|
|
pub width: f64,
|
|
/// Page height in points
|
|
pub height: f64,
|
|
/// Page rotation in degrees
|
|
pub rotation: i32,
|
|
/// Extracted text spans
|
|
pub spans: Vec<SpanData>,
|
|
/// Extracted blocks
|
|
pub blocks: Vec<BlockData>,
|
|
}
|
|
|
|
/// Block data for extracted content.
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct BlockData {
|
|
/// Block kind (paragraph, heading, etc.)
|
|
pub kind: String,
|
|
/// Block text
|
|
pub text: String,
|
|
}
|
|
|
|
/// Lazy iterator over PDF pages.
|
|
///
|
|
/// This iterator yields pages one at a time without materializing
|
|
/// the entire document model in memory.
|
|
///
|
|
/// # Memory Behavior
|
|
///
|
|
/// Uses LazyPageIter internally, which walks the page tree depth-first
|
|
/// and materializes only the current path from root to leaf (max ~16 nodes).
|
|
/// Each yielded PageExtraction contains the extracted data for one page,
|
|
/// and all intermediate data is dropped before yielding the next page.
|
|
pub struct PageIter<'a> {
|
|
/// Lazy page iterator from the parser
|
|
lazy_iter: Option<LazyPageIter<'a>>,
|
|
/// Reference to the extractor for accessing source/resolver
|
|
extractor: &'a PdfExtractor,
|
|
/// Current page index
|
|
index: usize,
|
|
}
|
|
|
|
impl<'a> Iterator for PageIter<'a> {
|
|
type Item = Result<PageExtraction>;
|
|
|
|
fn next(&mut self) -> Option<Self::Item> {
|
|
// Initialize lazy iterator on first use
|
|
if self.lazy_iter.is_none() {
|
|
match LazyPageIter::new(&self.extractor.resolver, self.extractor.catalog.pages_ref) {
|
|
Ok(iter) => self.lazy_iter = Some(iter),
|
|
Err(diagnostics) => {
|
|
let msg = diagnostics.first()
|
|
.map(|d| d.message.as_ref())
|
|
.unwrap_or("unknown error");
|
|
return Some(Err(anyhow!("Failed to create lazy page iterator: {}", msg)));
|
|
}
|
|
}
|
|
}
|
|
|
|
let iter = self.lazy_iter.as_mut()?;
|
|
|
|
match iter.next() {
|
|
Some(Ok(page_dict)) => {
|
|
let [x0, y0, x1, y1] = page_dict.media_box;
|
|
let result = Ok(PageExtraction {
|
|
index: self.index,
|
|
width: x1 - x0,
|
|
height: y1 - y0,
|
|
rotation: page_dict.rotate,
|
|
spans: vec![],
|
|
blocks: vec![],
|
|
});
|
|
self.index += 1;
|
|
|
|
// Explicitly drop page_dict to ensure memory is freed
|
|
drop(page_dict);
|
|
|
|
Some(result)
|
|
}
|
|
Some(Err(diagnostics)) => {
|
|
let msg = diagnostics.first()
|
|
.map(|d| d.message.as_ref())
|
|
.unwrap_or("unknown error");
|
|
self.index += 1;
|
|
Some(Err(anyhow!("Error extracting page {}: {}", self.index - 1, msg)))
|
|
}
|
|
None => None,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Compute fingerprint without full page materialization.
|
|
///
|
|
/// This is a simplified version that uses only catalog-level data.
|
|
/// The full fingerprint computation requires page content streams.
|
|
pub(crate) fn compute_fingerprint_lazy(catalog: &Catalog, _xref_section: &XrefSection) -> String {
|
|
// For lazy extraction, use a simpler fingerprint based on catalog data
|
|
// The full implementation would incrementally hash pages as they're extracted
|
|
use crate::fingerprint::FingerprintInput;
|
|
|
|
let fingerprint_input = FingerprintInput {
|
|
page_count: 0, // Will be updated when pages are extracted
|
|
pages: vec![],
|
|
struct_tree_root_ref: catalog.struct_tree_root_ref,
|
|
is_tagged: catalog.mark_info.is_tagged,
|
|
catalog_flags: CatalogFlags {
|
|
is_encrypted: false,
|
|
contains_javascript: catalog.open_action.is_some() || catalog.aa.is_some(),
|
|
contains_xfa: false,
|
|
ocg_present: catalog.oc_properties.as_ref()
|
|
.map(|props| props.present)
|
|
.unwrap_or(false),
|
|
},
|
|
};
|
|
|
|
compute_fingerprint(&fingerprint_input, &XrefResolver::new())
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
use std::io::Write;
|
|
use std::fs::File;
|
|
|
|
/// Create a minimal valid PDF for testing.
|
|
fn create_minimal_pdf(path: &std::path::Path) -> Result<()> {
|
|
let pdf_data = br#"%PDF-1.4
|
|
1 0 obj
|
|
<<
|
|
/Type /Catalog
|
|
/Pages 2 0 R
|
|
>>
|
|
endobj
|
|
2 0 obj
|
|
<<
|
|
/Type /Pages
|
|
/Kids [3 0 R]
|
|
/Count 1
|
|
>>
|
|
endobj
|
|
3 0 obj
|
|
<<
|
|
/Type /Page
|
|
/Parent 2 0 R
|
|
/MediaBox [0 0 612 792]
|
|
/Contents 4 0 R
|
|
/Resources <<
|
|
/Font <<
|
|
/F1 <<
|
|
/Type /Font
|
|
/Subtype /Type1
|
|
/BaseFont /Helvetica
|
|
>>
|
|
>>
|
|
>>
|
|
>>
|
|
endobj
|
|
4 0 obj
|
|
<<
|
|
/Length 44
|
|
>>
|
|
stream
|
|
BT
|
|
/F1 12 Tf
|
|
100 700 Td
|
|
(Test) Tj
|
|
ET
|
|
endstream
|
|
endobj
|
|
xref
|
|
0 5
|
|
0000000000 65535 f
|
|
0000000009 00000 n
|
|
0000000058 00000 n
|
|
0000000115 00000 n
|
|
0000000298 00000 n
|
|
trailer
|
|
<<
|
|
/Size 5
|
|
/Root 1 0 R
|
|
>>
|
|
startxref
|
|
403
|
|
%%EOF
|
|
"#;
|
|
|
|
let mut file = File::create(path)?;
|
|
file.write_all(pdf_data)?;
|
|
Ok(())
|
|
}
|
|
|
|
#[test]
|
|
fn test_find_startxref() {
|
|
let temp_dir = tempfile::tempdir().unwrap();
|
|
let pdf_path = temp_dir.path().join("test.pdf");
|
|
create_minimal_pdf(&pdf_path).unwrap();
|
|
|
|
let source = FileSource::open(&pdf_path).unwrap();
|
|
let offset = find_startxref(&source).unwrap();
|
|
assert_eq!(offset, 403);
|
|
}
|
|
|
|
#[test]
|
|
fn test_parse_pdf_file() {
|
|
let temp_dir = tempfile::tempdir().unwrap();
|
|
let pdf_path = temp_dir.path().join("test.pdf");
|
|
create_minimal_pdf(&pdf_path).unwrap();
|
|
|
|
let (fingerprint, catalog, pages, resolver) = parse_pdf_file(&pdf_path).unwrap();
|
|
|
|
assert!(fingerprint.starts_with("pdftract-v1:"));
|
|
assert_eq!(pages.len(), 1);
|
|
assert_eq!(pages[0].media_box, [0.0, 0.0, 612.0, 792.0]);
|
|
assert_eq!(pages[0].rotate, 0);
|
|
|
|
// Verify resolver has entries
|
|
assert!(resolver.len() > 0);
|
|
}
|
|
|
|
#[test]
|
|
fn test_compute_pdf_fingerprint() {
|
|
let temp_dir = tempfile::tempdir().unwrap();
|
|
let pdf_path = temp_dir.path().join("test.pdf");
|
|
create_minimal_pdf(&pdf_path).unwrap();
|
|
|
|
let fingerprint = compute_pdf_fingerprint(&pdf_path).unwrap();
|
|
|
|
assert!(fingerprint.starts_with("pdftract-v1:"));
|
|
assert_eq!(fingerprint.len(), "pdftract-v1:".len() + 64);
|
|
|
|
// Verify hex format
|
|
let hex_part = &fingerprint["pdftract-v1:".len()..];
|
|
assert!(hex_part.chars().all(|c| c.is_ascii_hexdigit()));
|
|
}
|
|
|
|
#[test]
|
|
fn test_extract_spans_from_page() {
|
|
let temp_dir = tempfile::tempdir().unwrap();
|
|
let pdf_path = temp_dir.path().join("test.pdf");
|
|
create_minimal_pdf(&pdf_path).unwrap();
|
|
|
|
let spans = extract_spans_from_page(&pdf_path, 0).unwrap();
|
|
|
|
// Should have at least one span (placeholder for now)
|
|
assert!(!spans.is_empty());
|
|
|
|
// Check the span has the expected structure
|
|
let span = &spans[0];
|
|
assert!(!span.text.is_empty());
|
|
assert_eq!(span.bbox, [0.0, 0.0, 612.0, 792.0]);
|
|
}
|
|
|
|
#[test]
|
|
fn test_extract_spans_out_of_bounds() {
|
|
let temp_dir = tempfile::tempdir().unwrap();
|
|
let pdf_path = temp_dir.path().join("test.pdf");
|
|
create_minimal_pdf(&pdf_path).unwrap();
|
|
|
|
let result = extract_spans_from_page(&pdf_path, 10);
|
|
assert!(result.is_err());
|
|
}
|
|
}
|