- Add profile source modules: apply_profile, extraction, extraction_loader, field_extractor, match_eval - Add profiles CLI subcommand (profiles_cmd.rs) - Update all 9 built-in profile YAMLs (invoice, receipt, contract, scientific_paper, slide_deck, form, bank_statement, legal_filing, book_chapter) - Add 50 invoice fixture PDFs - Add 2 receipt fixture PDFs Part of: pdftract-3a310 (Phase 7.10 coordinator)
2733 lines
95 KiB
Rust
2733 lines
95 KiB
Rust
//! PDF text extraction with receipt generation.
|
||
//!
|
||
//! This module provides the main extraction pipeline that processes PDFs
|
||
//! and generates spans and blocks with optional cryptographic receipts.
|
||
//!
|
||
//! Page extraction runs in parallel using rayon, with the number of
|
||
//! simultaneously-resident pages capped by a semaphore to keep memory
|
||
//! bounded regardless of core count.
|
||
//!
|
||
//! ## Lazy Stream Decoding
|
||
//!
|
||
//! Content streams are decoded lazily per page and dropped immediately after
|
||
//! processing. This ensures peak RSS stays flat across page count, even for
|
||
//! large documents with 10,000+ pages.
|
||
|
||
use crate::annotation::{dispatch_annotations, json as annotation_json};
|
||
use crate::attachment::associated_files::walk_af_array;
|
||
use crate::attachment::filespec::extract_one;
|
||
use crate::diagnostics::{DiagCode, Diagnostic};
|
||
use crate::document::compute_fingerprint_lazy;
|
||
use secrecy::ExposeSecret;
|
||
use crate::forms::{
|
||
acro_field_to_value, combine, walk_acroform_fields, AcroFormField, FormFieldValue,
|
||
};
|
||
use crate::options::{ExtractionOptions, ReceiptsMode};
|
||
use crate::parser::catalog::ReadingOrderAlgorithm;
|
||
use crate::parser::marked_content::{track_mcids_from_content_stream, McidTracker};
|
||
use crate::parser::stream::DEFAULT_MAX_DECOMPRESS_BYTES;
|
||
use crate::source::FileSource;
|
||
// Import both PdfSource traits with aliases to avoid ambiguity
|
||
use crate::source::PdfSource as SourcePdfSource;
|
||
use crate::parser::stream::PdfSource as ParserPdfSource;
|
||
use crate::parser::struct_tree::{check_coverage_for_pages, parse_struct_tree};
|
||
use crate::receipts::Receipt;
|
||
use crate::schema::{
|
||
AnnotationJson, AttachmentJson, BlockJson, ChoiceValueJson, FormFieldJson, FormFieldTypeJson,
|
||
FormFieldValueJson, JavascriptActionJson, LinkJson, SignatureJson, SpanJson, TableJson,
|
||
ThreadJson,
|
||
};
|
||
use crate::semaphore::{Semaphore, SemaphoreExt};
|
||
use crate::signature::{discover, extract_signatures};
|
||
use crate::table::{
|
||
detect_two_page_tables, grid_to_table_json, GridCandidate, PageContext, TableDetector,
|
||
};
|
||
use crate::table::{TableCell as Cell, TableSpan};
|
||
use anyhow::{Context, Result};
|
||
use rayon::prelude::*;
|
||
#[cfg(feature = "schemars")]
|
||
use schemars::JsonSchema;
|
||
use serde::{Deserialize, Serialize};
|
||
use serde_json::json;
|
||
use std::cmp::Ordering;
|
||
use std::sync::Arc;
|
||
|
||
#[cfg(feature = "receipts")]
|
||
use crate::receipts::svg::GlyphList;
|
||
|
||
/// Decode content streams for a page, returning the concatenated decoded bytes.
|
||
///
|
||
/// This function decodes all content streams for a page lazily and drops them
|
||
/// immediately after returning. The decoded bytes are scoped to ensure they're
|
||
/// freed before processing the next page.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `page` - The page dictionary containing content stream references
|
||
/// * `resolver` - The xref resolver for resolving indirect references
|
||
/// * `source` - The PDF source for reading stream data
|
||
/// * `max_decompress_bytes` - Maximum decompressed bytes allowed (bomb limit)
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// The decoded content stream bytes, or an empty Vec if decoding fails.
|
||
///
|
||
/// # Memory Behavior
|
||
///
|
||
/// This function ensures decoded streams are dropped immediately after use:
|
||
/// - Each stream is decoded and returned as Vec<u8>
|
||
/// - The caller must drop the Vec before processing the next page
|
||
/// - No decoded data is held across page boundaries
|
||
fn decode_page_content_streams(
|
||
page: &crate::parser::pages::PageDict,
|
||
resolver: &crate::parser::xref::XrefResolver,
|
||
source: &dyn crate::parser::stream::PdfSource,
|
||
max_decompress_bytes: u64,
|
||
) -> Vec<u8> {
|
||
use crate::parser::stream::{decode_stream, ExtractionOptions as StreamExtractionOptions};
|
||
|
||
// Create stream extraction options with the bomb limit
|
||
let stream_opts = StreamExtractionOptions {
|
||
max_decompress_bytes,
|
||
password: None, // No password support for content streams yet
|
||
};
|
||
|
||
let mut all_decoded = Vec::new();
|
||
let mut doc_counter = 0u64;
|
||
|
||
for stream_ref in &page.contents {
|
||
match resolver.resolve(*stream_ref) {
|
||
Ok(obj) => {
|
||
if let Some(stream) = obj.as_stream() {
|
||
// Decode this stream - it will be dropped after this iteration
|
||
let decoded = decode_stream(stream, source, &stream_opts, &mut doc_counter);
|
||
|
||
// Extend the accumulated content
|
||
all_decoded.extend_from_slice(&decoded);
|
||
|
||
// Explicitly drop decoded to free memory before next iteration
|
||
drop(decoded);
|
||
}
|
||
}
|
||
Err(_) => {
|
||
// Failed to resolve stream - skip it
|
||
continue;
|
||
}
|
||
}
|
||
}
|
||
|
||
all_decoded
|
||
}
|
||
|
||
/// Result of a PDF extraction operation.
|
||
///
|
||
/// Contains the extracted pages, spans, blocks, and metadata.
|
||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
|
||
pub struct ExtractionResult {
|
||
/// The PDF fingerprint (for receipt generation).
|
||
pub fingerprint: String,
|
||
/// Extracted pages, each containing spans and blocks.
|
||
pub pages: Vec<PageResult>,
|
||
/// Metadata about the extraction.
|
||
pub metadata: ExtractionMetadata,
|
||
/// Digital signatures extracted from the document.
|
||
///
|
||
/// This array contains all signature fields discovered in the AcroForm,
|
||
/// including both signed and unsigned (blank) signature fields.
|
||
/// Empty when the PDF has no signature fields.
|
||
pub signatures: Vec<SignatureJson>,
|
||
/// Interactive form fields extracted from the document.
|
||
///
|
||
/// This array contains all form fields from the AcroForm and/or XFA data.
|
||
/// Fields are sorted alphabetically by name. When both AcroForm and XFA
|
||
/// are present, XFA values take precedence on collision.
|
||
/// Empty when the PDF has no form fields.
|
||
pub form_fields: Vec<FormFieldJson>,
|
||
/// Document-scoped hyperlinks extracted from the document.
|
||
///
|
||
/// This array contains all link annotations (URI and internal destination links)
|
||
/// extracted from all pages. Links are sorted by (page_index, rect.y0 desc, rect.x0).
|
||
/// Empty when the PDF has no link annotations.
|
||
pub links: Vec<LinkJson>,
|
||
/// Embedded file attachments extracted from the document.
|
||
///
|
||
/// This array contains all embedded files from the PDF's `/EmbeddedFiles`
|
||
/// name tree or `/AF` (Associated Files) array. Attachments exceeding
|
||
/// 50 MB are truncated (metadata only, `data: null`, `truncated: true`).
|
||
/// Empty when the PDF has no embedded files.
|
||
pub attachments: Vec<AttachmentJson>,
|
||
/// Article thread chains extracted from the document.
|
||
///
|
||
/// This array contains all article threads from the PDF's `/Threads` array.
|
||
/// Each thread includes metadata from the thread info dict (/I) and the
|
||
/// complete bead chain walked from the first bead. Empty when the PDF has
|
||
/// no article threads.
|
||
pub threads: Vec<ThreadJson>,
|
||
/// JavaScript actions detected in the document.
|
||
///
|
||
/// Per TH-04, this array contains all discovered JavaScript actions
|
||
/// with their location and code excerpt. pdftract NEVER executes
|
||
/// embedded JavaScript; this is for downstream security review.
|
||
/// Empty when no JavaScript is present.
|
||
#[serde(default)]
|
||
pub javascript_actions: Vec<JavascriptActionJson>,
|
||
}
|
||
|
||
/// Result for a single page.
|
||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
|
||
pub struct PageResult {
|
||
/// 0-based page index.
|
||
pub index: usize,
|
||
/// 1-based page number (= index + 1).
|
||
///
|
||
/// Emitted as a convenience for human-facing display. For programmatic
|
||
/// access, use index instead.
|
||
pub page_number: u32,
|
||
/// Human-readable label from PDF /PageLabels number tree.
|
||
///
|
||
/// Examples: "iv", "A-3", "1". Null if the PDF defines no page labels.
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub page_label: Option<String>,
|
||
/// Page width in points (1/72 inch).
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub width: Option<f32>,
|
||
/// Page height in points (1/72 inch).
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub height: Option<f32>,
|
||
/// Page rotation in degrees clockwise (0, 90, 180, or 270).
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub rotation: Option<u16>,
|
||
/// Page classification from the page classifier.
|
||
///
|
||
/// One of: "text", "scanned", "mixed", "broken_vector", "blank", "figure_only".
|
||
#[serde(rename = "type")]
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub page_type: Option<String>,
|
||
/// Extracted spans (text fragments with consistent styling).
|
||
pub spans: Vec<SpanJson>,
|
||
/// Extracted blocks (semantic units like paragraphs, headings).
|
||
pub blocks: Vec<BlockJson>,
|
||
/// Extracted tables (cell-level structure).
|
||
///
|
||
/// This array provides detailed table structure with rows and cells.
|
||
/// Table blocks in the `blocks` array reference entries here via `table_index`.
|
||
pub tables: Vec<TableJson>,
|
||
/// Page-level annotations (highlights, stamps, notes, etc.).
|
||
///
|
||
/// This array contains all non-link annotations on this page.
|
||
/// Annotations are sorted by (rect.y0 desc, rect.x0) for deterministic output.
|
||
/// Empty when the page has no annotations.
|
||
#[serde(default)]
|
||
pub annotations: Vec<AnnotationJson>,
|
||
/// Error message if extraction failed for this page.
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub error: Option<String>,
|
||
}
|
||
|
||
/// Temporary structure holding both TableJson and GridCandidate during extraction.
|
||
///
|
||
/// This is used to preserve GridCandidate information for two-page table detection,
|
||
/// which runs after all pages have been extracted. After detection, only the
|
||
/// TableJson is retained in the final output.
|
||
#[derive(Debug, Clone)]
|
||
struct TableWithGrid {
|
||
/// The JSON output structure for this table.
|
||
json: TableJson,
|
||
/// The grid candidate used for two-page detection.
|
||
grid: GridCandidate,
|
||
}
|
||
|
||
/// Internal page result that includes grid information for two-page detection.
|
||
///
|
||
/// This is used during extraction to preserve GridCandidate information.
|
||
/// After two-page detection, this is converted to the public PageResult.
|
||
#[derive(Debug, Clone)]
|
||
struct PageResultInternal {
|
||
/// 0-based page index.
|
||
pub index: usize,
|
||
/// Extracted spans (text fragments with consistent styling).
|
||
pub spans: Vec<SpanJson>,
|
||
/// Extracted blocks (semantic units like paragraphs, headings).
|
||
pub blocks: Vec<BlockJson>,
|
||
/// Extracted tables with grid information.
|
||
pub tables: Vec<TableWithGrid>,
|
||
/// Page-level annotations (highlights, stamps, notes, etc.).
|
||
pub annotations: Vec<AnnotationJson>,
|
||
/// Error message if extraction failed for this page.
|
||
pub error: Option<String>,
|
||
/// Page media box height for two-page detection.
|
||
pub page_height: f64,
|
||
}
|
||
|
||
impl From<PageResultInternal> for PageResult {
|
||
fn from(internal: PageResultInternal) -> Self {
|
||
PageResult {
|
||
index: internal.index,
|
||
page_number: (internal.index + 1) as u32,
|
||
page_label: None,
|
||
width: None,
|
||
height: None,
|
||
rotation: None,
|
||
page_type: None,
|
||
spans: internal.spans,
|
||
blocks: internal.blocks,
|
||
tables: internal.tables.into_iter().map(|t| t.json).collect(),
|
||
annotations: internal.annotations,
|
||
error: internal.error,
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Metadata about the extraction process.
|
||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
|
||
pub struct ExtractionMetadata {
|
||
/// Total number of pages in the document.
|
||
pub page_count: usize,
|
||
/// Receipts mode used for this extraction.
|
||
pub receipts_mode: ReceiptsMode,
|
||
/// Number of spans extracted.
|
||
pub span_count: usize,
|
||
/// Number of blocks extracted.
|
||
pub block_count: usize,
|
||
/// Cache status: "hit", "miss", or "skipped"
|
||
pub cache_status: Option<String>,
|
||
/// Cache entry age in seconds (only present when cache_status == "hit")
|
||
pub cache_age_seconds: Option<u64>,
|
||
/// Number of pages that failed to extract.
|
||
pub error_count: usize,
|
||
/// Reading order algorithm used for this extraction.
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub reading_order_algorithm: Option<String>,
|
||
/// Diagnostics emitted during extraction (coverage warnings, etc.)
|
||
#[serde(skip_serializing_if = "Vec::is_empty")]
|
||
pub diagnostics: Vec<String>,
|
||
/// Profile name if a profile was applied (Phase 7.10)
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub profile_name: Option<String>,
|
||
/// Profile version if a profile was applied (Phase 7.10)
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub profile_version: Option<String>,
|
||
/// Extracted fields from profile if a profile was applied (Phase 7.10)
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub profile_fields: Option<serde_json::Value>,
|
||
}
|
||
|
||
/// Extract text and structure from a PDF file.
|
||
///
|
||
/// This is the main entry point for PDF extraction. It:
|
||
/// 1. Parses the PDF and computes its fingerprint
|
||
/// 2. Extracts spans and blocks from each page in parallel (bounded by semaphore)
|
||
/// 3. Generates receipts if requested
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `pdf_path` - Path to the PDF file
|
||
/// * `options` - Extraction options controlling receipt generation and parallelism
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// An `ExtractionResult` containing pages with spans and blocks.
|
||
///
|
||
/// # Memory Bounding
|
||
///
|
||
/// The number of simultaneously-resident pages is capped by `max_parallel_pages`
|
||
/// in the options. This ensures document-wide peak RSS stays under the memory
|
||
/// ceiling regardless of core count. Each page extraction acquires a semaphore
|
||
/// permit before allocating its working buffers and releases it when done.
|
||
///
|
||
/// # Streaming/Lazy Decode
|
||
///
|
||
/// This function uses lazy page iteration via LazyPageIter, which walks the page
|
||
/// tree depth-first and materializes only the current path from root to leaf
|
||
/// (max ~16 nodes). Pages are processed sequentially but extracted in parallel
|
||
/// with semaphore bounding. Decoded content streams are dropped immediately after
|
||
/// each page is processed, ensuring peak RSS stays O(depth × per-page) not O(pages × per-page).
|
||
///
|
||
/// # WARNING: Accumulates All Results
|
||
///
|
||
/// This function accumulates all extracted pages in memory before returning.
|
||
/// For large documents (1000+ pages), this can consume significant memory.
|
||
/// Use `extract_pdf_ndjson` for true streaming extraction that never accumulates
|
||
/// all pages in memory.
|
||
///
|
||
/// # Examples
|
||
///
|
||
/// ```rust,no_run
|
||
/// use pdftract_core::{extract_pdf, ExtractionOptions, OutputOptions};
|
||
/// use std::path::Path;
|
||
///
|
||
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||
/// // Extract text from a PDF file with default options
|
||
/// let result = extract_pdf(
|
||
/// Path::new("document.pdf"),
|
||
/// &ExtractionOptions::default()
|
||
/// )?;
|
||
///
|
||
/// // Access extracted text per page
|
||
/// for page_result in &result.pages {
///     println!(
///         "Page {}: {} spans, {} blocks",
///         page_result.page_number,
///         page_result.spans.len(),
///         page_result.blocks.len()
///     );
/// }
|
||
/// # Ok(())
|
||
/// # }
|
||
/// ```
|
||
///
|
||
/// # Errors
|
||
///
|
||
/// Returns an error if:
|
||
/// - The PDF file cannot be opened or read
|
||
/// - The PDF structure is invalid or corrupted
|
||
/// - Decryption fails (for encrypted PDFs)
|
||
/// - Content stream decoding exceeds bomb limits
///
/// Extract text, tables, and metadata from a PDF file.
|
||
///
|
||
/// This is the main entry point for PDF extraction. It processes the entire
|
||
/// document and returns structured data including text spans, blocks, tables,
|
||
/// form fields, links, and more.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `pdf_path` - Path to the PDF file to extract from
|
||
/// * `options` - Extraction options controlling OCR, DPI, page limits, etc.
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A [`ExtractionResult`] containing:
|
||
/// - `fingerprint` - Cryptographic hash of the PDF for receipt verification
|
||
/// - `pages` - Array of extracted pages with spans, blocks, and tables
|
||
/// - `signatures` - Digital signature information
|
||
/// - `form_fields` - Interactive form field values
|
||
/// - `links` - Hyperlinks and internal destinations
|
||
/// - `attachments` - Embedded file attachments
|
||
/// - `threads` - Article thread chains
|
||
///
|
||
/// # Errors
|
||
///
|
||
/// Returns an error if:
|
||
/// - The PDF file cannot be opened or read
|
||
/// - The PDF is malformed or corrupted
|
||
/// - The PDF is encrypted and no password is provided
|
||
/// - Decompression bomb limits are exceeded
|
||
///
|
||
/// # Examples
|
||
///
|
||
/// Basic extraction with default options:
|
||
///
|
||
/// ```rust,no_run
|
||
/// use pdftract_core::{extract_pdf, ExtractionOptions};
|
||
///
|
||
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||
/// let result = extract_pdf(
|
||
///     std::path::Path::new("document.pdf"),
|
||
/// &ExtractionOptions::default()
|
||
/// )?;
|
||
///
|
||
/// println!("Extracted {} pages", result.pages.len());
|
||
/// println!("Fingerprint: {}", result.fingerprint);
|
||
/// # Ok(())
|
||
/// # }
|
||
/// ```
|
||
///
|
||
/// Extraction with OCR for scanned documents:
|
||
///
|
||
/// ```rust,no_run
|
||
/// use pdftract_core::{extract_pdf, ExtractionOptions};
|
||
///
|
||
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||
/// # #[cfg(feature = "ocr")]
|
||
/// let result = extract_pdf(
|
||
///     std::path::Path::new("scanned.pdf"),
|
||
/// &ExtractionOptions {
|
||
/// ocr_languages: vec!["eng".to_string()],
|
||
/// ..Default::default()
|
||
/// }
|
||
/// )?;
|
||
/// # Ok(())
|
||
/// # }
|
||
/// ```
|
||
///
|
||
/// Extraction with page limit for large files:
|
||
///
|
||
/// ```rust,no_run
|
||
/// use pdftract_core::{extract_pdf, ExtractionOptions};
|
||
///
|
||
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||
/// let result = extract_pdf(
|
||
///     std::path::Path::new("large_document.pdf"),
|
||
/// &ExtractionOptions {
|
||
/// max_pages: Some(10),
|
||
/// ..Default::default()
|
||
/// }
|
||
/// )?;
|
||
///
|
||
/// println!("First 10 pages extracted");
|
||
/// # Ok(())
|
||
/// # }
|
||
/// ```
|
||
pub fn extract_pdf(
|
||
pdf_path: &std::path::Path,
|
||
options: &ExtractionOptions,
|
||
) -> Result<ExtractionResult> {
|
||
use crate::parser::catalog::parse_catalog;
|
||
use crate::parser::pages::LazyPageIter;
|
||
use crate::parser::xref::{load_xref_with_prev_chain, XrefResolver};
|
||
|
||
// Open the PDF file
|
||
let source = FileSource::open(pdf_path).context("Failed to open PDF file")?;
|
||
|
||
// Find the startxref offset
|
||
let startxref_offset = find_startxref(&source).context("Failed to find startxref offset")?;
|
||
|
||
// Load the xref table
|
||
let xref_section = load_xref_with_prev_chain(&source, startxref_offset);
|
||
|
||
// Create resolver from xref section
|
||
let resolver = XrefResolver::from_section(xref_section.clone());
|
||
|
||
// Detect and handle encryption (Phase 1.4)
|
||
#[cfg(feature = "decrypt")]
|
||
let decryption_context = {
|
||
use crate::encryption::decrypt_with_password;
|
||
|
||
// Get the trailer for encryption detection
|
||
let trailer_dict = xref_section.trailer.as_ref().cloned();
|
||
|
||
let mut diagnostics = Vec::new();
|
||
let password = options.password.as_ref().map(|p| p.expose_secret());
|
||
|
||
if let Some(trailer) = trailer_dict {
|
||
match decrypt_with_password(&trailer, &resolver, password, &mut diagnostics) {
|
||
Ok(ctx_opt) => ctx_opt,
|
||
Err(e) => {
|
||
// Emit diagnostic and return error
|
||
let diag = e.to_diagnostic();
|
||
return Err(anyhow::anyhow!("PDF decryption failed: {}", diag.message));
|
||
}
|
||
}
|
||
} else {
|
||
None
|
||
}
|
||
};
|
||
|
||
#[cfg(not(feature = "decrypt"))]
|
||
let decryption_context = Option::<crate::encryption::decryptor::DecryptionContext>::None;
|
||
|
||
// Get the root reference from trailer
|
||
let root_ref = xref_section
|
||
.trailer
|
||
.as_ref()
|
||
.and_then(|trailer| trailer.get("Root"))
|
||
.and_then(|obj| obj.as_ref())
|
||
.ok_or_else(|| anyhow::anyhow!("No /Root reference in trailer"))?;
|
||
|
||
// Parse the catalog
|
||
let catalog = parse_catalog(&resolver, root_ref, Some(&source as &dyn ParserPdfSource)).map_err(
|
||
|diagnostics| {
|
||
let msg = diagnostics
|
||
.first()
|
||
.map(|d| d.message.as_ref())
|
||
.unwrap_or("unknown error");
|
||
anyhow::anyhow!("Failed to parse catalog: {}", msg)
|
||
},
|
||
)?;
|
||
|
||
// Resolve AcroForm if present for fingerprint computation
|
||
let acroform = catalog.acroform_ref.and_then(|ref_| {
|
||
resolver.resolve(ref_).ok().and_then(|obj| obj.as_dict().cloned())
|
||
});
|
||
|
||
// Build fingerprint input (without full page tree for lazy extraction)
|
||
let fingerprint = compute_fingerprint_lazy(&catalog, &resolver, &acroform);
|
||
|
||
// Wrap resolver in Arc for sharing across threads
|
||
let resolver_arc = Arc::new(resolver);
|
||
|
||
// Create lazy page iterator - this walks the tree on-demand
|
||
let mut page_iter =
|
||
LazyPageIter::new(&resolver_arc, catalog.pages_ref).map_err(|diagnostics| {
|
||
let msg = diagnostics
|
||
.first()
|
||
.map(|d| d.message.as_ref())
|
||
.unwrap_or("unknown error");
|
||
anyhow::anyhow!("Failed to create lazy page iterator: {}", msg)
|
||
})?;
|
||
|
||
// Phase 4.5: Determine reading order algorithm
|
||
// For v0.1.0-v0.3.0: Tagged PDFs emit TAGGED_PDF_STRUCT_TREE_DEFERRED and use XY-cut
|
||
// Phase 7.1 will replace this with real StructTree traversal
|
||
let (reading_order_algorithm, struct_tree, deferred_diagnostic) = if catalog.mark_info.is_tagged
|
||
{
|
||
// Tagged PDF: emit diagnostic once per document and use XY-cut
|
||
let diagnostic = Diagnostic::with_static_no_offset(
|
||
DiagCode::LayoutTaggedPdfDeferred,
|
||
"Tagged PDF detected; StructTree traversal deferred to Phase 7.1, using XY-cut for now",
|
||
);
|
||
(ReadingOrderAlgorithm::XyCut, None, Some(diagnostic))
|
||
} else {
|
||
// Untagged PDF: use XY-cut
|
||
(ReadingOrderAlgorithm::XyCut, None, None)
|
||
};
|
||
|
||
// Wrap options in Arc for sharing across threads
|
||
let fingerprint_arc = Arc::new(fingerprint.clone());
|
||
let options_arc = Arc::new(options.clone());
|
||
|
||
// Create a semaphore to bound the number of in-flight pages
|
||
let semaphore = Arc::new(Semaphore::new(options.max_parallel_pages));
|
||
|
||
// First, collect all PageDict objects for annotation extraction
|
||
// We need these before extracting content so we can dispatch annotations once
|
||
let mut all_pages: Vec<crate::parser::pages::PageDict> = Vec::new();
|
||
loop {
|
||
match page_iter.next() {
|
||
Some(Ok(page_dict)) => {
|
||
all_pages.push(page_dict);
|
||
}
|
||
Some(Err(_)) | None => {
|
||
// End of pages or error - stop collecting
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
|
||
// Parse page range if specified
|
||
let mut page_count = all_pages.len();
|
||
let mut page_range_diagnostics = Vec::new();
|
||
let page_filter: Option<std::collections::BTreeSet<usize>> = if let Some(ref range_str) = options.pages {
|
||
Some(crate::pages::parse_pages(range_str, page_count, &mut page_range_diagnostics)?)
|
||
} else {
|
||
None
|
||
};
|
||
|
||
// Phase 1.8: Hint stream prefetch for linearized PDFs
|
||
// If the PDF is linearized and has a hint stream, prefetch the pages
|
||
// that will be extracted. This reduces latency by pipelining HTTP requests.
|
||
if let Some(ref page_filter) = page_filter {
|
||
use crate::parser::xref::detect_linearization;
|
||
use crate::parser::hint_stream::prefetch_from_hint_stream;
|
||
|
||
let mut prefetch_diagnostics = Vec::new();
|
||
if let Some(lin_info) = detect_linearization(&source) {
|
||
if let (Some(hint_offset), Some(hint_length)) = (lin_info.hint_stream_offset, lin_info.hint_stream_length) {
|
||
// Prefetch the pages that will be extracted
|
||
// page_filter contains 0-based page indices
|
||
prefetch_from_hint_stream(
|
||
&source,
|
||
hint_offset,
|
||
hint_length,
|
||
page_filter.iter().copied(),
|
||
&mut prefetch_diagnostics,
|
||
);
|
||
}
|
||
}
|
||
}
|
||
|
||
// Phase 7.6: Extract annotations and links from all pages
|
||
// Walk all pages and extract annotations by subtype
|
||
//
|
||
// Note: For now, we pass None for dests_dict and names_dests_ref.
|
||
// A full implementation would resolve /Catalog /Dests and /Catalog /Names /Dests
|
||
// to support named destination resolution. This is sufficient for URI links
|
||
// and explicit destination arrays.
|
||
let (link_annotations, annotations) = dispatch_annotations(
|
||
&resolver_arc,
|
||
&all_pages,
|
||
None, // dests_dict
|
||
None, // names_dests_ref
|
||
);
|
||
|
||
// Convert links to JSON format and sort by (page_index, rect.y0 desc, rect.x0)
|
||
let mut links_json: Vec<LinkJson> = link_annotations
|
||
.iter()
|
||
.map(|link| annotation_json::link_to_json(link, &None))
|
||
.collect();
|
||
annotation_json::sort_links(&mut links_json);
|
||
|
||
// Convert annotations to JSON format and group by page
|
||
let mut annotations_by_page: std::collections::HashMap<usize, Vec<AnnotationJson>> =
|
||
std::collections::HashMap::new();
|
||
|
||
for annot in &annotations {
|
||
let json = annotation_json::annotation_to_json(annot);
|
||
let page_idx = annot.common.page_index;
|
||
annotations_by_page
|
||
.entry(page_idx)
|
||
.or_insert_with(Vec::new)
|
||
.push(json);
|
||
}
|
||
|
||
// Sort annotations within each page by (rect.y0 desc, rect.x0)
|
||
for page_annotations in annotations_by_page.values_mut() {
|
||
annotation_json::sort_annotations(page_annotations);
|
||
}
|
||
|
||
// Now process pages for content extraction (re-using the collected pages)
|
||
let mut extracted_pages = Vec::new();
|
||
let mut total_spans = 0;
|
||
let mut total_blocks = 0;
|
||
let mut error_count = 0;
|
||
let mut page_count = 0;
|
||
let mut page_heights = Vec::new(); // Track page heights for two-page table detection
|
||
|
||
// Phase 7.1.4: Collect page data for coverage check
|
||
// Track MCIDs and struct_parents for each page
|
||
let mut pages_with_mcids: Vec<(usize, Option<i32>, std::collections::HashSet<u32>)> =
|
||
Vec::new();
|
||
let needs_coverage_check = catalog.mark_info.requires_coverage_check() && struct_tree.is_some();
|
||
|
||
// Save a clone of pages for JavaScript detection later
|
||
// We need to clone because all_pages will be consumed in the loop
|
||
let pages_for_js_detection = all_pages.clone();
|
||
|
||
// Process pages for content extraction
|
||
for (page_index, page_dict) in all_pages.into_iter().enumerate() {
|
||
// Skip pages not in the selected range (if --pages was specified)
|
||
if let Some(ref filter) = page_filter {
|
||
if !filter.contains(&page_index) {
|
||
continue;
|
||
}
|
||
}
|
||
|
||
// Get page height for two-page table detection
|
||
let [_x0, _y0, _x1, y1] = page_dict.media_box;
|
||
let page_height = (y1 - page_dict.media_box[1]).max(0.0);
|
||
page_heights.push(page_height);
|
||
|
||
// Track MCIDs for this page if coverage check is needed
|
||
if needs_coverage_check {
|
||
// Decode content streams and track MCIDs
|
||
let decoded_streams = decode_page_content_streams(
|
||
&page_dict,
|
||
&resolver_arc,
|
||
&source,
|
||
options.max_decompress_bytes,
|
||
);
|
||
|
||
let mut tracker = McidTracker::new();
|
||
track_mcids_from_content_stream(&decoded_streams, &mut tracker);
|
||
|
||
// Get the struct_parents value for this page
|
||
let struct_parents = page_dict.struct_parents();
|
||
|
||
// Record page data for coverage check
|
||
let mcid_set = tracker.mcid_set().clone();
|
||
pages_with_mcids.push((page_index, struct_parents, mcid_set));
|
||
|
||
// Drop decoded_streams and tracker to free memory
|
||
drop(decoded_streams);
|
||
// tracker dropped implicitly
|
||
}
|
||
|
||
// Get the annotations for this page (already sorted)
|
||
let page_annotations = annotations_by_page.remove(&page_index).unwrap_or_default();
|
||
|
||
// Extract this page with lazy stream decoding.
|
||
// Content streams are decoded, processed, and dropped immediately.
|
||
let extract_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
|
||
extract_page_from_dict(
|
||
&fingerprint_arc,
|
||
page_index,
|
||
&page_dict,
|
||
&options_arc,
|
||
Some(&source),
|
||
Some(&resolver_arc),
|
||
)
|
||
}));
|
||
|
||
match extract_result {
|
||
Ok(Ok(mut page)) => {
|
||
total_spans += page.spans.len();
|
||
total_blocks += page.blocks.len();
|
||
page.annotations = page_annotations;
|
||
extracted_pages.push(page);
|
||
}
|
||
Ok(Err(e)) => {
|
||
error_count += 1;
|
||
extracted_pages.push(PageResultInternal {
|
||
index: page_index,
|
||
spans: vec![],
|
||
blocks: vec![],
|
||
tables: vec![],
|
||
annotations: page_annotations,
|
||
error: Some(e.to_string()),
|
||
page_height,
|
||
});
|
||
}
|
||
Err(_) => {
|
||
error_count += 1;
|
||
extracted_pages.push(PageResultInternal {
|
||
index: page_index,
|
||
spans: vec![],
|
||
blocks: vec![],
|
||
tables: vec![],
|
||
annotations: page_annotations,
|
||
error: Some(format!("Page {} extraction panicked", page_index)),
|
||
page_height,
|
||
});
|
||
}
|
||
}
|
||
|
||
// Explicitly drop page_dict to ensure memory is freed before next iteration
|
||
drop(page_dict);
|
||
page_count += 1;
|
||
}
|
||
|
||
// Phase 7.1.4: Perform coverage check if Suspects is true
|
||
// This must happen after we've collected MCID data from all pages
|
||
let (final_reading_order_algorithm, coverage_diagnostics) = if needs_coverage_check {
|
||
if let Some(ref tree) = struct_tree {
|
||
let coverage_result =
|
||
check_coverage_for_pages(tree, &catalog.mark_info, &pages_with_mcids);
|
||
let diagnostics: Vec<String> = coverage_result
|
||
.diagnostics
|
||
.iter()
|
||
.map(|d| d.message.as_ref().to_string())
|
||
.collect();
|
||
(coverage_result.reading_order_algorithm, diagnostics)
|
||
} else {
|
||
// Shouldn't happen due to the needs_coverage_check condition
|
||
(reading_order_algorithm, Vec::new())
|
||
}
|
||
} else {
|
||
(reading_order_algorithm, Vec::new())
|
||
};
|
||
|
||
// Add the tagged PDF deferred diagnostic if present
|
||
let mut all_diagnostics = coverage_diagnostics;
|
||
if let Some(ref deferred) = deferred_diagnostic {
|
||
all_diagnostics.push(deferred.message.as_ref().to_string());
|
||
}
|
||
|
||
// Phase 7.2.6: Detect two-page table continuation
|
||
// This must happen after all pages have been extracted so we can compare
|
||
// tables on adjacent pages
|
||
let extracted_pages = apply_two_page_table_detection(extracted_pages, &page_heights);
|
||
|
||
// Convert PageResultInternal to PageResult for final output
|
||
let extracted_pages: Vec<PageResult> = extracted_pages.into_iter().map(Into::into).collect();
|
||
|
||
// Phase 7.3: Extract digital signature metadata
|
||
// Discover signature fields and extract metadata from them
|
||
let sig_fields = discover(&resolver_arc, &catalog);
|
||
let file_size = Some(SourcePdfSource::len(&source));
|
||
let signatures_core = extract_signatures(&sig_fields, &resolver_arc, file_size);
|
||
let signatures: Vec<SignatureJson> = signatures_core.into_iter().map(|s| s.into()).collect();
|
||
|
||
// Phase 7.5: Extract embedded file attachments from /EmbeddedFiles and /AF
|
||
let attachments = match resolver_arc.resolve(root_ref) {
|
||
Ok(catalog_obj) => match catalog_obj.as_dict() {
|
||
Some(catalog_dict) => extract_attachments(&resolver_arc, catalog_dict, Some(&source as &dyn ParserPdfSource)),
|
||
None => Vec::new(),
|
||
},
|
||
Err(_) => Vec::new(),
|
||
};
|
||
|
||
// Phase 7.4: Extract form fields from AcroForm and XFA
|
||
// Walk AcroForm fields and convert to FormFieldValue
|
||
let acro_fields = walk_acroform_fields(&resolver_arc, &catalog, None);
|
||
let mut acro_fields_typed: Vec<(String, FormFieldValue)> = Vec::new();
|
||
for field in acro_fields {
|
||
let field_value = acro_field_to_value(&field);
|
||
acro_fields_typed.push((field.full_name.clone(), field_value));
|
||
}
|
||
|
||
// Extract XFA fields if present (requires re-opening the source for stream access)
|
||
let xfa_fields = if catalog.acroform_ref.is_some() {
|
||
// Resolve the AcroForm dictionary
|
||
use crate::parser::xref::XrefResolver;
|
||
let acroform_ref = catalog.acroform_ref.unwrap();
|
||
if let Ok(acroform_obj) = resolver_arc.resolve(acroform_ref) {
|
||
if let Some(acroform_dict) = acroform_obj.as_dict() {
|
||
// Create extraction options for stream decoding
|
||
use crate::parser::stream::ExtractionOptions as StreamExtractionOptions;
|
||
let stream_opts = StreamExtractionOptions {
|
||
max_decompress_bytes: DEFAULT_MAX_DECOMPRESS_BYTES,
|
||
password: None,
|
||
};
|
||
use crate::forms::extract_xfa_fields;
|
||
let xfa_extracted =
|
||
extract_xfa_fields(&resolver_arc, acroform_dict, &source, &stream_opts);
|
||
xfa_extracted
|
||
.into_iter()
|
||
.filter_map(|f| f.value.map(|v| (f.full_name, v)))
|
||
.collect()
|
||
} else {
|
||
Vec::new()
|
||
}
|
||
} else {
|
||
Vec::new()
|
||
}
|
||
} else {
|
||
Vec::new()
|
||
};
|
||
|
||
// Combine AcroForm and XFA fields (XFA wins on collision)
|
||
let (combined_fields, _form_diagnostics) = combine(acro_fields_typed, xfa_fields);
|
||
|
||
// Convert to FormFieldJson
|
||
let form_fields: Vec<FormFieldJson> = combined_fields
|
||
.into_iter()
|
||
.map(|(name, value)| convert_form_field_to_json(name, value, &resolver_arc, &catalog))
|
||
.collect();
|
||
|
||
// Phase 7.7: Extract article thread chains
|
||
// Discover thread headers from /Threads array and walk bead chains
|
||
use crate::parser::pages::build_page_ref_to_index;
|
||
use crate::threads::{discover as discover_threads, thread_to_json, walk_beads};
|
||
|
||
// Build page ref to index map for bead chain walking
|
||
let page_ref_to_index = build_page_ref_to_index(&catalog, &resolver_arc);
|
||
|
||
// Discover thread headers from /Threads array
|
||
let thread_headers = match discover_threads(&catalog, &resolver_arc) {
|
||
Ok(headers) => headers,
|
||
Err(_) => Vec::new(), // Return empty on error
|
||
};
|
||
|
||
// Walk bead chains for each thread and convert to JSON
|
||
let mut threads_json = Vec::new();
|
||
for header in &thread_headers {
|
||
match walk_beads(header, &resolver_arc, &page_ref_to_index) {
|
||
Ok(beads) => {
|
||
threads_json.push(thread_to_json(header, &beads));
|
||
}
|
||
Err(_) => {
|
||
// Skip threads with malformed bead chains but continue processing others
|
||
continue;
|
||
}
|
||
}
|
||
}
|
||
|
||
// TH-04: Detect JavaScript actions in the document
|
||
// This checks /OpenAction, /AA, page /AA, and annotation /A entries
|
||
use crate::javascript::detect_javascript;
|
||
let (js_actions, js_diagnostics) =
|
||
detect_javascript(&catalog, &pages_for_js_detection, &resolver_arc);
|
||
|
||
// Convert JavascriptAction to JavascriptActionJson
|
||
let javascript_actions: Vec<JavascriptActionJson> = js_actions
|
||
.into_iter()
|
||
.map(|action| JavascriptActionJson {
|
||
location: action.location,
|
||
code_excerpt: action.code_excerpt,
|
||
})
|
||
.collect();
|
||
|
||
// Add JavaScript detection diagnostics to the error list
|
||
let mut all_diagnostics_with_js = all_diagnostics;
|
||
for diag in js_diagnostics {
|
||
all_diagnostics_with_js.push(diag.message.as_ref().to_string());
|
||
}
|
||
|
||
// Add page range diagnostics (PAGE_OUT_OF_RANGE warnings)
|
||
for diag in page_range_diagnostics {
|
||
all_diagnostics_with_js.push(diag.message.as_ref().to_string());
|
||
}
|
||
|
||
Ok(ExtractionResult {
|
||
fingerprint,
|
||
pages: extracted_pages,
|
||
metadata: ExtractionMetadata {
|
||
page_count,
|
||
receipts_mode: options.receipts,
|
||
span_count: total_spans,
|
||
block_count: total_blocks,
|
||
cache_status: None,
|
||
cache_age_seconds: None,
|
||
error_count,
|
||
reading_order_algorithm: Some(final_reading_order_algorithm.as_str().to_string()),
|
||
diagnostics: all_diagnostics_with_js,
|
||
profile_name: None,
|
||
profile_version: None,
|
||
profile_fields: None,
|
||
},
|
||
signatures,
|
||
form_fields,
|
||
links: links_json,
|
||
attachments,
|
||
threads: threads_json,
|
||
javascript_actions,
|
||
})
|
||
}
|
||
|
||
/// Apply two-page table detection flags to extracted pages.
|
||
///
|
||
/// This function examines tables on adjacent pages and sets the
|
||
/// `continued` and `continued_from_prev` flags where appropriate.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `pages` - Pages with internal table information (grids preserved)
|
||
/// * `page_heights` - Page heights in points for edge detection
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// Pages with table continuation flags applied.
|
||
fn apply_two_page_table_detection(
|
||
mut pages: Vec<PageResultInternal>,
|
||
page_heights: &[f64],
|
||
) -> Vec<PageResultInternal> {
|
||
// Collect all GridCandidates by page
|
||
let all_grids: Vec<Vec<GridCandidate>> = pages
|
||
.iter()
|
||
.map(|p| p.tables.iter().map(|t| t.grid.clone()).collect())
|
||
.collect();
|
||
|
||
// Run two-page detection
|
||
let continuation_flags = detect_two_page_tables(&all_grids, page_heights);
|
||
|
||
// Apply flags to the tables
|
||
for (page_idx, page) in pages.iter_mut().enumerate() {
|
||
if let Some(page_flags) = continuation_flags.get(page_idx) {
|
||
for (table_idx, table) in page.tables.iter_mut().enumerate() {
|
||
if let Some(&(continued, continued_from_prev)) = page_flags.get(table_idx) {
|
||
table.json.continued = continued;
|
||
table.json.continued_from_prev = continued_from_prev;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
pages
|
||
}
|
||
|
||
/// Convert a FormFieldValue to FormFieldJson for serialization.
|
||
///
|
||
/// This helper function converts the internal FormFieldValue representation
|
||
/// to the JSON-serializable FormFieldJson structure.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `name` - The field name
|
||
/// * `value` - The FormFieldValue to convert
|
||
/// * `resolver` - Xref resolver (for looking up field metadata)
|
||
/// * `catalog` - Document catalog (for accessing AcroForm)
|
||
fn convert_form_field_to_json(
|
||
name: String,
|
||
value: FormFieldValue,
|
||
resolver: &crate::parser::xref::XrefResolver,
|
||
catalog: &crate::parser::catalog::Catalog,
|
||
) -> FormFieldJson {
|
||
match value {
|
||
FormFieldValue::Text {
|
||
value,
|
||
default,
|
||
multiline,
|
||
max_length,
|
||
} => FormFieldJson {
|
||
name,
|
||
field_type: FormFieldTypeJson::Text,
|
||
value: FormFieldValueJson::Text(value),
|
||
default: default.map(|v| FormFieldValueJson::Text(Some(v))),
|
||
page_index: None,
|
||
rect: None,
|
||
required: false,
|
||
read_only: false,
|
||
multiline: Some(multiline),
|
||
max_length,
|
||
options: None,
|
||
multi_select: None,
|
||
selected: None,
|
||
state_name: None,
|
||
pushbutton: None,
|
||
radio: None,
|
||
},
|
||
|
||
FormFieldValue::Button {
|
||
kind,
|
||
selected,
|
||
state_name,
|
||
default_selected,
|
||
pushbutton,
|
||
radio,
|
||
} => FormFieldJson {
|
||
name,
|
||
field_type: FormFieldTypeJson::Button,
|
||
value: FormFieldValueJson::Button(selected),
|
||
default: default_selected.map(FormFieldValueJson::Button),
|
||
page_index: None,
|
||
rect: None,
|
||
required: false,
|
||
read_only: false,
|
||
multiline: None,
|
||
max_length: None,
|
||
options: None,
|
||
multi_select: None,
|
||
selected: Some(selected),
|
||
state_name,
|
||
pushbutton: Some(pushbutton),
|
||
radio: Some(radio),
|
||
},
|
||
|
||
FormFieldValue::Choice {
|
||
value,
|
||
default,
|
||
options,
|
||
is_combo,
|
||
is_multi_select,
|
||
} => {
|
||
let json_value = match value {
|
||
crate::forms::ChoiceValue::Single(s) => {
|
||
FormFieldValueJson::Choice(ChoiceValueJson::Single(s))
|
||
}
|
||
crate::forms::ChoiceValue::Multiple(vec) => {
|
||
FormFieldValueJson::Choice(ChoiceValueJson::Multiple(vec))
|
||
}
|
||
};
|
||
|
||
let json_default = default.map(|dv| match dv {
|
||
crate::forms::ChoiceValue::Single(s) => {
|
||
FormFieldValueJson::Choice(ChoiceValueJson::Single(s))
|
||
}
|
||
crate::forms::ChoiceValue::Multiple(vec) => {
|
||
FormFieldValueJson::Choice(ChoiceValueJson::Multiple(vec))
|
||
}
|
||
});
|
||
|
||
let json_options: Vec<[String; 2]> = options
|
||
.into_iter()
|
||
.map(|(export, display)| [export, display])
|
||
.collect();
|
||
|
||
FormFieldJson {
|
||
name,
|
||
field_type: FormFieldTypeJson::Choice,
|
||
value: json_value,
|
||
default: json_default,
|
||
page_index: None,
|
||
rect: None,
|
||
required: false,
|
||
read_only: false,
|
||
multiline: None,
|
||
max_length: None,
|
||
options: Some(json_options),
|
||
multi_select: Some(is_multi_select),
|
||
selected: None,
|
||
state_name: None,
|
||
pushbutton: None,
|
||
radio: None,
|
||
}
|
||
}
|
||
|
||
FormFieldValue::Signature { signature_ref } => FormFieldJson {
|
||
name,
|
||
field_type: FormFieldTypeJson::Signature,
|
||
value: FormFieldValueJson::Signature(signature_ref),
|
||
default: None,
|
||
page_index: None,
|
||
rect: None,
|
||
required: false,
|
||
read_only: false,
|
||
multiline: None,
|
||
max_length: None,
|
||
options: None,
|
||
multi_select: None,
|
||
selected: None,
|
||
state_name: None,
|
||
pushbutton: None,
|
||
radio: None,
|
||
},
|
||
}
|
||
}
|
||
|
||
/// Extract embedded file attachments from the PDF.
|
||
///
|
||
/// This function walks both the /EmbeddedFiles name tree and the /AF (Associated Files)
|
||
/// array to extract all embedded file attachments. It handles PDF 1.7 /EmbeddedFiles
|
||
/// and PDF 2.0 /AF sources, deduplicating by Filespec reference.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `resolver` - The xref resolver for resolving indirect references
|
||
/// * `catalog_dict` - The raw catalog dictionary (PdfDict)
|
||
/// * `source` - Optional PDF source for reading stream data (None for metadata-only extraction)
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A `Vec<AttachmentJson>` containing all extracted attachments, sorted by name
|
||
/// for deterministic output.
|
||
fn extract_attachments(
|
||
resolver: &Arc<crate::parser::xref::XrefResolver>,
|
||
catalog_dict: &crate::parser::object::PdfDict,
|
||
source: Option<&dyn crate::parser::stream::PdfSource>,
|
||
) -> Vec<AttachmentJson> {
|
||
use crate::parser::object::ObjRef;
|
||
use std::collections::HashSet;
|
||
|
||
let mut attachments = Vec::new();
|
||
let mut seen_refs: HashSet<ObjRef> = HashSet::new();
|
||
|
||
// Walk /AF array from the catalog
|
||
let af_entries = match walk_af_array(resolver, catalog_dict) {
|
||
Ok(entries) => entries,
|
||
Err(_) => return Vec::new(), // Return empty if /AF walk fails
|
||
};
|
||
for entry in af_entries {
|
||
if seen_refs.contains(&entry.filespec_ref) {
|
||
continue; // Skip duplicates
|
||
}
|
||
seen_refs.insert(entry.filespec_ref);
|
||
|
||
// Extract the attachment
|
||
match extract_one(resolver, entry.filespec_ref, source) {
|
||
Ok(attachment) => {
|
||
attachments.push(attachment.into_json());
|
||
}
|
||
Err(_) => {
|
||
// Skip failed attachments but continue with others
|
||
continue;
|
||
}
|
||
}
|
||
}
|
||
|
||
// TODO: Also walk /EmbeddedFiles name tree for PDF 1.7 compatibility
|
||
// This requires implementing a name tree walker for /EmbeddedFiles
|
||
|
||
// Sort by name for deterministic output
|
||
attachments.sort_by(|a, b| a.name.cmp(&b.name));
|
||
|
||
attachments
|
||
}
|
||
|
||
/// Extract content from a single page.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `fingerprint` - The PDF fingerprint for receipt generation
|
||
/// * `page_index` - 0-based page index
|
||
/// * `page` - The page dictionary from the PDF
|
||
/// * `options` - Extraction options
|
||
fn extract_page(
|
||
fingerprint: &str,
|
||
page_index: usize,
|
||
page: &crate::parser::pages::PageDict,
|
||
options: &ExtractionOptions,
|
||
) -> Result<PageResult> {
|
||
// For now, create placeholder spans based on the page media box
|
||
// In a full implementation, this would parse the content streams
|
||
// and extract actual text with positioning information
|
||
|
||
let [x0, y0, x1, y1] = page.media_box;
|
||
|
||
// Create a placeholder span for the entire page
|
||
// This is a minimal implementation - the full Phase 3 pipeline
|
||
// would extract actual text from content streams
|
||
let span_text = format!("[Page {} text extraction]", page_index);
|
||
let span_bbox = [x0, y0, x1, y1];
|
||
|
||
// Generate receipt if requested
|
||
let receipt = generate_receipt(
|
||
fingerprint,
|
||
page_index,
|
||
span_bbox,
|
||
&span_text,
|
||
options.receipts,
|
||
#[cfg(feature = "receipts")]
|
||
None,
|
||
)?;
|
||
|
||
let span = SpanJson {
|
||
text: span_text,
|
||
bbox: span_bbox,
|
||
font: "Unknown".to_string(),
|
||
size: 12.0,
|
||
color: None,
|
||
rendering_mode: None,
|
||
confidence: None,
|
||
confidence_source: None,
|
||
lang: None,
|
||
flags: vec![],
|
||
receipt,
|
||
column: None,
|
||
};
|
||
|
||
// Create a block containing the span
|
||
let block_text = span.text.clone();
|
||
let block_bbox = span_bbox;
|
||
let block_receipt = generate_receipt(
|
||
fingerprint,
|
||
page_index,
|
||
block_bbox,
|
||
&block_text,
|
||
options.receipts,
|
||
#[cfg(feature = "receipts")]
|
||
None,
|
||
)?;
|
||
|
||
let block = BlockJson {
|
||
kind: "paragraph".to_string(),
|
||
text: block_text,
|
||
bbox: block_bbox,
|
||
level: None,
|
||
table_index: None,
|
||
spans: vec![],
|
||
receipt: block_receipt,
|
||
};
|
||
|
||
Ok(PageResult {
|
||
index: page_index,
|
||
page_number: (page_index + 1) as u32,
|
||
page_label: None,
|
||
width: None,
|
||
height: None,
|
||
rotation: None,
|
||
page_type: None,
|
||
spans: vec![span],
|
||
blocks: vec![block],
|
||
tables: vec![],
|
||
annotations: vec![],
|
||
error: None,
|
||
})
|
||
}
|
||
|
||
/// Generate a receipt for a span or block.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `fingerprint` - The PDF fingerprint
|
||
/// * `page_index` - 0-based page index
|
||
/// * `bbox` - Bounding box in PDF points
|
||
/// * `text` - The text content
|
||
/// * `mode` - Receipt generation mode
|
||
/// * `glyph_list` - Optional glyph list for SVG generation (only used with receipts feature)
|
||
fn generate_receipt(
|
||
fingerprint: &str,
|
||
page_index: usize,
|
||
bbox: [f64; 4],
|
||
text: &str,
|
||
mode: ReceiptsMode,
|
||
#[cfg(feature = "receipts")] glyph_list: Option<&GlyphList>,
|
||
) -> Result<Option<Receipt>> {
|
||
match mode {
|
||
ReceiptsMode::Off => Ok(None),
|
||
ReceiptsMode::Lite => Ok(Some(Receipt::lite(
|
||
fingerprint.to_string(),
|
||
page_index,
|
||
bbox,
|
||
text,
|
||
))),
|
||
#[cfg(feature = "receipts")]
|
||
ReceiptsMode::SvgClip => {
|
||
// For SVG mode, we need a glyph list to generate the SVG clip
|
||
// In this minimal implementation, we fall back to lite mode
|
||
// if no glyph list is provided
|
||
if let Some(glyphs) = glyph_list {
|
||
let svg_gen = crate::receipts::svg::SvgGenerator::new(glyphs.clone());
|
||
let svg_clip = svg_gen.generate(bbox);
|
||
Ok(Some(Receipt::with_svg(
|
||
fingerprint.to_string(),
|
||
page_index,
|
||
bbox,
|
||
text,
|
||
svg_clip,
|
||
)))
|
||
} else {
|
||
// No glyph data available - fall back to lite mode
|
||
Ok(Some(Receipt::lite(
|
||
fingerprint.to_string(),
|
||
page_index,
|
||
bbox,
|
||
text,
|
||
)))
|
||
}
|
||
}
|
||
#[cfg(not(feature = "receipts"))]
|
||
ReceiptsMode::SvgClip => {
|
||
// Receipts feature not enabled - fall back to lite mode
|
||
Ok(Some(Receipt::lite(
|
||
fingerprint.to_string(),
|
||
page_index,
|
||
bbox,
|
||
text,
|
||
)))
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Convert an ExtractionResult to JSON format.
|
||
///
|
||
/// This produces the JSON output format expected by the CLI and API.
|
||
///
|
||
/// # Examples
|
||
///
|
||
/// ```rust,no_run
|
||
/// use pdftract_core::{extract_pdf, ExtractionOptions, result_to_json};
|
||
///
|
||
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||
/// let result = extract_pdf(
|
||
/// "document.pdf",
|
||
/// &ExtractionOptions::default()
|
||
/// )?;
|
||
///
|
||
/// // Convert to JSON for API output
|
||
/// let json_value = result_to_json(&result);
|
||
/// println!("{}", json_value.to_string());
|
||
/// # Ok(())
|
||
/// # }
|
||
/// ```
|
||
pub fn result_to_json(result: &ExtractionResult) -> serde_json::Value {
|
||
let pages: Vec<serde_json::Value> = result
|
||
.pages
|
||
.iter()
|
||
.map(|page| {
|
||
json!({
|
||
"index": page.index,
|
||
"spans": page.spans,
|
||
"blocks": page.blocks,
|
||
"tables": page.tables,
|
||
})
|
||
})
|
||
.collect();
|
||
|
||
let mut metadata_obj = json!({
|
||
"page_count": result.metadata.page_count,
|
||
"span_count": result.metadata.span_count,
|
||
"block_count": result.metadata.block_count,
|
||
"cache_status": result.metadata.cache_status,
|
||
"cache_age_seconds": result.metadata.cache_age_seconds,
|
||
});
|
||
|
||
// Add reading_order_algorithm if present
|
||
if let Some(ref algo) = result.metadata.reading_order_algorithm {
|
||
metadata_obj["reading_order_algorithm"] = json!(algo);
|
||
}
|
||
|
||
// Add diagnostics if present
|
||
if !result.metadata.diagnostics.is_empty() {
|
||
metadata_obj["diagnostics"] = json!(result.metadata.diagnostics);
|
||
}
|
||
|
||
json!({
|
||
"fingerprint": result.fingerprint,
|
||
"schema_version": "1.0",
|
||
"pages": pages,
|
||
"metadata": metadata_obj,
|
||
"signatures": result.signatures,
|
||
"form_fields": result.form_fields,
|
||
"links": result.links,
|
||
"attachments": result.attachments,
|
||
"threads": result.threads,
|
||
"javascript_actions": result.javascript_actions
|
||
})
|
||
}
|
||
|
||
/// Extract plain text from a PDF file.
|
||
///
|
||
/// This is a convenience function that extracts text from a PDF and returns
|
||
/// it as a single string, with span texts concatenated in reading order.
|
||
/// Each span's text is followed by a newline, matching the CLI `--text` format.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `pdf_path` - Path to the PDF file
|
||
/// * `options` - Extraction options controlling page range, password, etc.
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A `String` containing all extracted text from the PDF.
|
||
///
|
||
/// # Examples
|
||
///
|
||
/// ```rust,no_run
|
||
/// use pdftract_core::{extract_text, ExtractionOptions};
|
||
/// use std::path::Path;
|
||
///
|
||
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||
/// let text = extract_text(
|
||
/// Path::new("document.pdf"),
|
||
/// &ExtractionOptions::default()
|
||
/// )?;
|
||
/// println!("Extracted {} characters", text.len());
|
||
/// # Ok(())
|
||
/// # }
|
||
/// ```
|
||
///
|
||
/// # Text Format
|
||
///
|
||
/// - Spans are emitted in reading order (as ordered in the spans array)
|
||
/// - Each span's text is followed by a newline
|
||
/// - Pages are concatenated without separator
|
||
/// - Invisible text (rendering_mode=3) is excluded unless `include_invisible` is set
|
||
pub fn extract_text(
|
||
pdf_path: &std::path::Path,
|
||
options: &ExtractionOptions,
|
||
) -> Result<String> {
|
||
let result = extract_pdf(pdf_path, options)?;
|
||
|
||
let mut text = String::new();
|
||
for page in &result.pages {
|
||
for span in &page.spans {
|
||
// Filter invisible text based on include_invisible option
|
||
if !options.output.include_invisible {
|
||
if let Some(mode) = span.rendering_mode {
|
||
if mode >= 3 {
|
||
continue;
|
||
}
|
||
}
|
||
}
|
||
text.push_str(&span.text);
|
||
text.push('\n');
|
||
}
|
||
}
|
||
|
||
Ok(text)
|
||
}
|
||
|
||
/// Extract text and structure from a PDF file, writing NDJSON output.
|
||
///
|
||
/// This is the streaming variant of `extract_pdf` that writes each page
|
||
/// as a newline-delimited JSON object immediately after extraction.
|
||
/// This keeps memory usage bounded regardless of document size.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `pdf_path` - Path to the PDF file
|
||
/// * `options` - Extraction options controlling receipt generation and parallelism
|
||
/// * `writer` - Any type implementing `std::io::Write` to receive NDJSON output
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// An `ExtractionMetadata` containing summary statistics (pages, spans, blocks extracted).
|
||
///
|
||
/// # Memory Bounding
|
||
///
|
||
/// Unlike `extract_pdf`, this function never accumulates all pages in memory.
|
||
/// Pages are iterated lazily via LazyPageIter, which walks the page tree depth-first
|
||
/// and materializes only the current path from root to leaf (max ~16 nodes).
|
||
/// Each page is serialized to NDJSON and written immediately, then dropped.
|
||
/// Peak RSS stays O(depth × per-page) not O(pages × per-page).
|
||
///
|
||
/// # Output Format
|
||
///
|
||
/// Each line is a JSON object representing one page:
|
||
/// ```json
|
||
/// {"index": 0, "spans": [...], "blocks": [...]}
|
||
/// {"index": 1, "spans": [...], "blocks": [...]}
|
||
/// ```
|
||
///
|
||
/// # Examples
|
||
///
|
||
/// ```rust,no_run
|
||
/// use pdftract_core::{extract_pdf_ndjson, ExtractionOptions};
|
||
/// use std::fs::File;
|
||
/// use std::path::Path;
|
||
///
|
||
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||
/// // Stream extraction to NDJSON file (memory-efficient for large PDFs)
|
||
/// let output = File::create("output.ndjson")?;
|
||
/// let metadata = extract_pdf_ndjson(
|
||
/// Path::new("large_document.pdf"),
|
||
/// &ExtractionOptions::default(),
|
||
/// output
|
||
/// )?;
|
||
///
|
||
/// println!("Extracted {} pages", metadata.total_pages);
|
||
/// println!("Total spans: {}", metadata.total_spans);
|
||
/// # Ok(())
|
||
/// # }
|
||
/// ```
|
||
///
|
||
/// # Errors
|
||
///
|
||
/// Returns an error if:
|
||
/// - The PDF file cannot be opened or read
|
||
/// - The PDF structure is invalid or corrupted
|
||
/// - Writing to the output fails
|
||
pub fn extract_pdf_ndjson<W: std::io::Write>(
|
||
pdf_path: &std::path::Path,
|
||
options: &ExtractionOptions,
|
||
mut writer: W,
|
||
) -> Result<ExtractionMetadata> {
|
||
use crate::parser::catalog::parse_catalog;
|
||
use crate::parser::pages::LazyPageIter;
|
||
use crate::parser::xref::{load_xref_with_prev_chain, XrefResolver};
|
||
use std::io::Write;
|
||
|
||
// Open the PDF file
|
||
let source = FileSource::open(pdf_path).context("Failed to open PDF file")?;
|
||
|
||
// Find the startxref offset
|
||
let startxref_offset = find_startxref(&source).context("Failed to find startxref offset")?;
|
||
|
||
// Load the xref table
|
||
let xref_section = load_xref_with_prev_chain(&source, startxref_offset);
|
||
|
||
// Create resolver from xref section
|
||
let resolver = XrefResolver::from_section(xref_section.clone());
|
||
|
||
// Get the root reference from trailer
|
||
let root_ref = xref_section
|
||
.trailer
|
||
.as_ref()
|
||
.and_then(|trailer| trailer.get("Root"))
|
||
.and_then(|obj| obj.as_ref())
|
||
.ok_or_else(|| anyhow::anyhow!("No /Root reference in trailer"))?;
|
||
|
||
// Parse the catalog
|
||
let catalog = parse_catalog(&resolver, root_ref, Some(&source as &dyn ParserPdfSource)).map_err(
|
||
|diagnostics| {
|
||
let msg = diagnostics
|
||
.first()
|
||
.map(|d| d.message.as_ref())
|
||
.unwrap_or("unknown error");
|
||
anyhow::anyhow!("Failed to parse catalog: {}", msg)
|
||
},
|
||
)?;
|
||
|
||
// Phase 4.5: Determine reading order algorithm
|
||
// For v0.1.0-v0.3.0: Tagged PDFs emit TAGGED_PDF_STRUCT_TREE_DEFERRED and use XY-cut
|
||
// Phase 7.1 will replace this with real StructTree traversal
|
||
let resolver_arc = Arc::new(resolver);
|
||
|
||
let (reading_order_algorithm, struct_tree, deferred_diagnostic) = if catalog.mark_info.is_tagged
|
||
{
|
||
// Tagged PDF: emit diagnostic once per document and use XY-cut
|
||
let diagnostic = Diagnostic::with_static_no_offset(
|
||
DiagCode::LayoutTaggedPdfDeferred,
|
||
"Tagged PDF detected; StructTree traversal deferred to Phase 7.1, using XY-cut for now",
|
||
);
|
||
(ReadingOrderAlgorithm::XyCut, None, Some(diagnostic))
|
||
} else {
|
||
// Untagged PDF: use XY-cut
|
||
(ReadingOrderAlgorithm::XyCut, None, None)
|
||
};
|
||
|
||
// For lazy extraction, use a placeholder fingerprint
|
||
// The full fingerprint would require walking all pages, which defeats the purpose
|
||
let fingerprint = format!(
|
||
"pdftract-v1:lazy{:016x}",
|
||
std::time::SystemTime::now()
|
||
.duration_since(std::time::UNIX_EPOCH)
|
||
.unwrap()
|
||
.as_nanos()
|
||
);
|
||
|
||
// Create lazy page iterator - this walks the tree on-demand
|
||
let mut page_iter =
|
||
LazyPageIter::new(&resolver_arc, catalog.pages_ref).map_err(|diagnostics| {
|
||
let msg = diagnostics
|
||
.first()
|
||
.map(|d| d.message.as_ref())
|
||
.unwrap_or("unknown error");
|
||
anyhow::anyhow!("Failed to create lazy page iterator: {}", msg)
|
||
})?;
|
||
|
||
// Wrap options in Arc for sharing across threads
|
||
let fingerprint_arc = Arc::new(fingerprint.clone());
|
||
let options_arc = Arc::new(options.clone());
|
||
|
||
// Track metadata across all pages
|
||
let mut total_spans = 0u64;
|
||
let mut total_blocks = 0u64;
|
||
let mut error_count = 0u64;
|
||
let mut page_count = 0usize;
|
||
|
||
// Phase 7.1.4: Collect page data for coverage check
|
||
// Track MCIDs and struct_parents for each page
|
||
let mut pages_with_mcids: Vec<(usize, Option<i32>, std::collections::HashSet<u32>)> =
|
||
Vec::new();
|
||
let needs_coverage_check = catalog.mark_info.requires_coverage_check() && struct_tree.is_some();
|
||
|
||
// Create a semaphore to bound the number of in-flight pages
|
||
let semaphore = Arc::new(Semaphore::new(options.max_parallel_pages));
|
||
|
||
// First, collect all pages to get the page count for range parsing
|
||
// This is necessary because the page range needs to know the total count
|
||
let mut all_pages: Vec<crate::parser::pages::PageDict> = Vec::new();
|
||
let mut page_diagnostics: Vec<Diagnostic> = Vec::new();
|
||
loop {
|
||
match page_iter.next() {
|
||
Some(Ok(page_dict)) => {
|
||
all_pages.push(page_dict);
|
||
}
|
||
Some(Err(diags)) => {
|
||
page_diagnostics.extend(diags);
|
||
break;
|
||
}
|
||
None => break,
|
||
}
|
||
}
|
||
|
||
// Parse page range if specified
|
||
let mut page_count = all_pages.len();
|
||
let mut page_range_diagnostics = Vec::new();
|
||
let page_filter: Option<std::collections::BTreeSet<usize>> = if let Some(ref range_str) = options.pages {
|
||
Some(crate::pages::parse_pages(range_str, page_count, &mut page_range_diagnostics)?)
|
||
} else {
|
||
None
|
||
};
|
||
|
||
// Phase 1.8: Hint stream prefetch for linearized PDFs
|
||
// If the PDF is linearized and has a hint stream, prefetch the pages
|
||
// that will be extracted. This reduces latency by pipelining HTTP requests.
|
||
if let Some(ref page_filter) = page_filter {
|
||
use crate::parser::xref::detect_linearization;
|
||
use crate::parser::hint_stream::prefetch_from_hint_stream;
|
||
|
||
let mut prefetch_diagnostics = Vec::new();
|
||
if let Some(lin_info) = detect_linearization(&source) {
|
||
if let (Some(hint_offset), Some(hint_length)) = (lin_info.hint_stream_offset, lin_info.hint_stream_length) {
|
||
// Prefetch the pages that will be extracted
|
||
// page_filter contains 0-based page indices
|
||
prefetch_from_hint_stream(
|
||
&source,
|
||
hint_offset,
|
||
hint_length,
|
||
page_filter.iter().copied(),
|
||
&mut prefetch_diagnostics,
|
||
);
|
||
}
|
||
}
|
||
}
|
||
|
||
// Process pages sequentially from the collected pages
|
||
for (page_index, page_dict) in all_pages.into_iter().enumerate() {
|
||
// Skip pages not in the selected range (if --pages was specified)
|
||
if let Some(ref filter) = page_filter {
|
||
if !filter.contains(&page_index) {
|
||
continue;
|
||
}
|
||
}
|
||
|
||
// Track MCIDs for this page if coverage check is needed
|
||
if needs_coverage_check {
|
||
// Decode content streams and track MCIDs
|
||
let decoded_streams = decode_page_content_streams(
|
||
&page_dict,
|
||
&resolver_arc,
|
||
&source,
|
||
options.max_decompress_bytes,
|
||
);
|
||
|
||
let mut tracker = McidTracker::new();
|
||
track_mcids_from_content_stream(&decoded_streams, &mut tracker);
|
||
|
||
// Get the struct_parents value for this page
|
||
let struct_parents = page_dict.struct_parents();
|
||
|
||
// Record page data for coverage check
|
||
let mcid_set = tracker.mcid_set().clone();
|
||
pages_with_mcids.push((page_index, struct_parents, mcid_set));
|
||
|
||
// Drop decoded_streams and tracker to free memory
|
||
drop(decoded_streams);
|
||
// tracker dropped implicitly
|
||
}
|
||
|
||
// Extract this page with lazy stream decoding.
|
||
// Content streams are decoded, processed, and dropped immediately.
|
||
let _permit = semaphore.acquire_guard();
|
||
|
||
let extract_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
|
||
extract_page_from_dict(
|
||
&fingerprint_arc,
|
||
page_index,
|
||
&page_dict,
|
||
&options_arc,
|
||
Some(&source),
|
||
Some(&resolver_arc),
|
||
)
|
||
}));
|
||
|
||
match extract_result {
|
||
Ok(Ok(page)) => {
|
||
total_spans += page.spans.len() as u64;
|
||
total_blocks += page.blocks.len() as u64;
|
||
|
||
// Serialize and write this page immediately
|
||
// Extract TableJson from TableWithGrid for serialization
|
||
let tables_json: Vec<_> = page.tables.into_iter().map(|t| t.json).collect();
|
||
let page_json = json!({
|
||
"index": page.index,
|
||
"spans": page.spans,
|
||
"blocks": page.blocks,
|
||
"tables": tables_json,
|
||
});
|
||
|
||
serde_json::to_writer(&mut writer, &page_json).context("Failed to write NDJSON")?;
|
||
writeln!(writer).context("Failed to write newline")?;
|
||
writer.flush().context("Failed to flush output")?;
|
||
}
|
||
Ok(Err(e)) => {
|
||
error_count += 1;
|
||
// Write error page to maintain page ordering
|
||
let error_json = json!({
|
||
"index": page_index,
|
||
"error": e.to_string(),
|
||
"spans": [],
|
||
"blocks": [],
|
||
"tables": [],
|
||
});
|
||
|
||
serde_json::to_writer(&mut writer, &error_json)
|
||
.context("Failed to write NDJSON")?;
|
||
writeln!(writer).context("Failed to write newline")?;
|
||
writer.flush().context("Failed to flush output")?;
|
||
}
|
||
Err(_) => {
|
||
error_count += 1;
|
||
let error_json = json!({
|
||
"index": page_index,
|
||
"error": format!("Page {} extraction panicked", page_index),
|
||
"spans": [],
|
||
"blocks": [],
|
||
"tables": [],
|
||
});
|
||
|
||
serde_json::to_writer(&mut writer, &error_json)
|
||
.context("Failed to write NDJSON")?;
|
||
writeln!(writer).context("Failed to write newline")?;
|
||
writer.flush().context("Failed to flush output")?;
|
||
}
|
||
}
|
||
|
||
// Drop page_dict explicitly to ensure memory is freed before next iteration
|
||
drop(page_dict);
|
||
}
|
||
|
||
// Phase 7.1.4: Perform coverage check if Suspects is true
|
||
// This must happen after we've collected MCID data from all pages
|
||
let (final_reading_order_algorithm, coverage_diagnostics) = if needs_coverage_check {
|
||
if let Some(ref tree) = struct_tree {
|
||
let coverage_result =
|
||
check_coverage_for_pages(tree, &catalog.mark_info, &pages_with_mcids);
|
||
let diagnostics: Vec<String> = coverage_result
|
||
.diagnostics
|
||
.iter()
|
||
.map(|d| d.message.as_ref().to_string())
|
||
.collect();
|
||
(coverage_result.reading_order_algorithm, diagnostics)
|
||
} else {
|
||
// Shouldn't happen due to the needs_coverage_check condition
|
||
(reading_order_algorithm, Vec::new())
|
||
}
|
||
} else {
|
||
(reading_order_algorithm, Vec::new())
|
||
};
|
||
|
||
// Add the tagged PDF deferred diagnostic if present
|
||
let mut all_diagnostics = coverage_diagnostics;
|
||
if let Some(ref deferred) = deferred_diagnostic {
|
||
all_diagnostics.push(deferred.message.as_ref().to_string());
|
||
}
|
||
|
||
Ok(ExtractionMetadata {
|
||
page_count,
|
||
receipts_mode: options.receipts,
|
||
span_count: total_spans as usize,
|
||
block_count: total_blocks as usize,
|
||
cache_status: None,
|
||
cache_age_seconds: None,
|
||
error_count: error_count as usize,
|
||
reading_order_algorithm: Some(final_reading_order_algorithm.as_str().to_string()),
|
||
diagnostics: all_diagnostics,
|
||
profile_name: None,
|
||
profile_version: None,
|
||
profile_fields: None,
|
||
})
|
||
}
|
||
|
||
/// Extract text and structure from a PDF file, invoking a callback for each page.
|
||
///
|
||
/// This is the callback-based streaming variant of `extract_pdf`. Each page
|
||
/// is extracted and passed to the callback immediately after extraction,
|
||
/// then dropped from memory. This keeps memory usage bounded regardless of
|
||
/// document size.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `pdf_path` - Path to the PDF file
|
||
/// * `options` - Extraction options controlling receipt generation and parallelism
|
||
/// * `callback` - Function called with each PageResult as it completes
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// An `ExtractionMetadata` containing summary statistics.
|
||
///
|
||
/// # Memory Bounding
|
||
///
|
||
/// This function never accumulates all pages in memory. Pages are iterated
|
||
/// lazily via LazyPageIter, extracted one at a time, and passed to the callback.
|
||
/// Peak RSS stays O(depth × per-page) not O(pages × per-page).
|
||
///
|
||
/// # Callback Contract
|
||
///
|
||
/// The callback is invoked from the extraction thread with a reference to each
|
||
/// PageResult. If the callback returns `false`, extraction stops early.
|
||
///
|
||
/// # Examples
|
||
///
|
||
/// ```rust,no_run
|
||
/// use pdftract_core::{extract_pdf_streaming, ExtractionOptions};
|
||
/// use std::path::Path;
|
||
///
|
||
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||
/// // Process a large PDF one page at a time with bounded memory
|
||
/// let mut page_count = 0;
|
||
/// let metadata = extract_pdf_streaming(
|
||
/// Path::new("large_document.pdf"),
|
||
/// &ExtractionOptions::default(),
|
||
/// |page_result| {
|
||
/// page_count += 1;
|
||
/// println!("Page {}: {} spans", page_count, page_result.spans.len());
|
||
/// // Return true to continue, false to stop early
|
||
/// page_count < 10 // Only process first 10 pages
|
||
/// }
|
||
/// )?;
|
||
///
|
||
/// println!("Processed {} pages", metadata.total_pages);
|
||
/// # Ok(())
|
||
/// # }
|
||
/// ```
|
||
pub fn extract_pdf_streaming<F>(
|
||
pdf_path: &std::path::Path,
|
||
options: &ExtractionOptions,
|
||
mut callback: F,
|
||
) -> Result<ExtractionMetadata>
|
||
where
|
||
F: FnMut(&PageResult) -> bool,
|
||
{
|
||
use crate::parser::catalog::parse_catalog;
|
||
use crate::parser::pages::LazyPageIter;
|
||
use crate::parser::xref::{load_xref_with_prev_chain, XrefResolver};
|
||
|
||
// Open the PDF file
|
||
let source = FileSource::open(pdf_path).context("Failed to open PDF file")?;
|
||
|
||
// Find the startxref offset
|
||
let startxref_offset = find_startxref(&source).context("Failed to find startxref offset")?;
|
||
|
||
// Load the xref table
|
||
let xref_section = load_xref_with_prev_chain(&source, startxref_offset);
|
||
|
||
// Create resolver from xref section
|
||
let resolver = XrefResolver::from_section(xref_section.clone());
|
||
|
||
// Get the root reference from trailer
|
||
let root_ref = xref_section
|
||
.trailer
|
||
.as_ref()
|
||
.and_then(|trailer| trailer.get("Root"))
|
||
.and_then(|obj| obj.as_ref())
|
||
.ok_or_else(|| anyhow::anyhow!("No /Root reference in trailer"))?;
|
||
|
||
// Parse the catalog
|
||
let catalog = parse_catalog(&resolver, root_ref, Some(&source as &dyn ParserPdfSource)).map_err(
|
||
|diagnostics| {
|
||
let msg = diagnostics
|
||
.first()
|
||
.map(|d| d.message.as_ref())
|
||
.unwrap_or("unknown error");
|
||
anyhow::anyhow!("Failed to parse catalog: {}", msg)
|
||
},
|
||
)?;
|
||
|
||
// Resolve AcroForm if present for fingerprint computation
|
||
let acroform = catalog.acroform_ref.and_then(|ref_| {
|
||
resolver.resolve(ref_).ok().and_then(|obj| obj.as_dict().cloned())
|
||
});
|
||
|
||
// Wrap resolver in Arc for sharing across threads
|
||
let resolver_arc = Arc::new(resolver);
|
||
|
||
// Phase 4.5: Determine reading order algorithm
|
||
// For v0.1.0-v0.3.0: Tagged PDFs emit TAGGED_PDF_STRUCT_TREE_DEFERRED and use XY-cut
|
||
// Phase 7.1 will replace this with real StructTree traversal
|
||
let (reading_order_algorithm, struct_tree, deferred_diagnostic) = if catalog.mark_info.is_tagged
|
||
{
|
||
// Tagged PDF: emit diagnostic once per document and use XY-cut
|
||
let diagnostic = Diagnostic::with_static_no_offset(
|
||
DiagCode::LayoutTaggedPdfDeferred,
|
||
"Tagged PDF detected; StructTree traversal deferred to Phase 7.1, using XY-cut for now",
|
||
);
|
||
(ReadingOrderAlgorithm::XyCut, None, Some(diagnostic))
|
||
} else {
|
||
// Untagged PDF: use XY-cut
|
||
(ReadingOrderAlgorithm::XyCut, None, None)
|
||
};
|
||
|
||
// Build fingerprint
|
||
let fingerprint = compute_fingerprint_lazy(&catalog, &resolver_arc, &acroform);
|
||
|
||
// Wrap options in Arc for sharing across threads
|
||
let fingerprint_arc = Arc::new(fingerprint.clone());
|
||
let options_arc = Arc::new(options.clone());
|
||
|
||
// Create lazy page iterator
|
||
let mut page_iter =
|
||
LazyPageIter::new(&resolver_arc, catalog.pages_ref).map_err(|diagnostics| {
|
||
let msg = diagnostics
|
||
.first()
|
||
.map(|d| d.message.as_ref())
|
||
.unwrap_or("unknown error");
|
||
anyhow::anyhow!("Failed to create lazy page iterator: {}", msg)
|
||
})?;
|
||
|
||
// Create a semaphore to bound the number of in-flight pages
|
||
let semaphore = Arc::new(Semaphore::new(options.max_parallel_pages));
|
||
|
||
// Track metadata across all pages
|
||
let mut total_spans = 0;
|
||
let mut total_blocks = 0;
|
||
let mut error_count = 0;
|
||
let mut page_count = 0;
|
||
|
||
// Phase 7.1.4: Collect page data for coverage check
|
||
let mut pages_with_mcids: Vec<(usize, Option<i32>, std::collections::HashSet<u32>)> =
|
||
Vec::new();
|
||
let needs_coverage_check = catalog.mark_info.requires_coverage_check() && struct_tree.is_some();
|
||
|
||
while let Some(page_result) = page_iter.next() {
|
||
let page_dict = match page_result {
|
||
Ok(p) => p,
|
||
Err(diagnostics) => {
|
||
let msg = diagnostics
|
||
.first()
|
||
.map(|d| d.message.as_ref())
|
||
.unwrap_or("unknown error");
|
||
error_count += 1;
|
||
let error_page = PageResult {
|
||
index: page_count,
|
||
page_number: (page_count + 1) as u32,
|
||
page_label: None,
|
||
width: None,
|
||
height: None,
|
||
rotation: None,
|
||
page_type: None,
|
||
spans: vec![],
|
||
blocks: vec![],
|
||
tables: vec![],
|
||
annotations: vec![],
|
||
error: Some(msg.to_string()),
|
||
};
|
||
if !callback(&error_page) {
|
||
break;
|
||
}
|
||
if needs_coverage_check {
|
||
pages_with_mcids.push((page_count, None, std::collections::HashSet::new()));
|
||
}
|
||
page_count += 1;
|
||
continue;
|
||
}
|
||
};
|
||
|
||
// Track MCIDs for this page if coverage check is needed
|
||
if needs_coverage_check {
|
||
let decoded_streams = decode_page_content_streams(
|
||
&page_dict,
|
||
&resolver_arc,
|
||
&source,
|
||
DEFAULT_MAX_DECOMPRESS_BYTES,
|
||
);
|
||
|
||
let mut tracker = McidTracker::new();
|
||
track_mcids_from_content_stream(&decoded_streams, &mut tracker);
|
||
|
||
let struct_parents = page_dict.struct_parents();
|
||
let mcid_set = tracker.mcid_set().clone();
|
||
pages_with_mcids.push((page_count, struct_parents, mcid_set));
|
||
|
||
drop(decoded_streams);
|
||
}
|
||
|
||
// Extract this page
|
||
let _permit = semaphore.acquire_guard();
|
||
let extract_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
|
||
extract_page_from_dict(
|
||
&fingerprint_arc,
|
||
page_count,
|
||
&page_dict,
|
||
&options_arc,
|
||
Some(&source),
|
||
Some(&resolver_arc),
|
||
)
|
||
}));
|
||
|
||
let page_result = match extract_result {
|
||
Ok(Ok(internal_page)) => {
|
||
total_spans += internal_page.spans.len();
|
||
total_blocks += internal_page.blocks.len();
|
||
PageResult::from(internal_page)
|
||
}
|
||
Ok(Err(e)) => {
|
||
error_count += 1;
|
||
PageResult {
|
||
index: page_count,
|
||
page_number: (page_count + 1) as u32,
|
||
page_label: None,
|
||
width: None,
|
||
height: None,
|
||
rotation: None,
|
||
page_type: None,
|
||
spans: vec![],
|
||
blocks: vec![],
|
||
tables: vec![],
|
||
annotations: vec![],
|
||
error: Some(e.to_string()),
|
||
}
|
||
}
|
||
Err(_) => {
|
||
error_count += 1;
|
||
PageResult {
|
||
index: page_count,
|
||
page_number: (page_count + 1) as u32,
|
||
page_label: None,
|
||
width: None,
|
||
height: None,
|
||
rotation: None,
|
||
page_type: None,
|
||
spans: vec![],
|
||
blocks: vec![],
|
||
tables: vec![],
|
||
annotations: vec![],
|
||
error: Some(format!("Page {} extraction panicked", page_count)),
|
||
}
|
||
}
|
||
};
|
||
|
||
// Invoke callback with this page
|
||
if !callback(&page_result) {
|
||
// Caller requested early termination
|
||
break;
|
||
}
|
||
|
||
drop(page_dict);
|
||
page_count += 1;
|
||
}
|
||
|
||
// Phase 7.1.4: Perform coverage check if Suspects is true
|
||
let (final_reading_order_algorithm, coverage_diagnostics) = if needs_coverage_check {
|
||
if let Some(ref tree) = struct_tree {
|
||
let coverage_result =
|
||
check_coverage_for_pages(tree, &catalog.mark_info, &pages_with_mcids);
|
||
let diagnostics: Vec<String> = coverage_result
|
||
.diagnostics
|
||
.iter()
|
||
.map(|d| d.message.as_ref().to_string())
|
||
.collect();
|
||
(coverage_result.reading_order_algorithm, diagnostics)
|
||
} else {
|
||
(reading_order_algorithm, Vec::new())
|
||
}
|
||
} else {
|
||
(reading_order_algorithm, Vec::new())
|
||
};
|
||
|
||
// Add the tagged PDF deferred diagnostic if present
|
||
let mut all_diagnostics = coverage_diagnostics;
|
||
if let Some(ref deferred) = deferred_diagnostic {
|
||
all_diagnostics.push(deferred.message.as_ref().to_string());
|
||
}
|
||
|
||
Ok(ExtractionMetadata {
|
||
page_count,
|
||
receipts_mode: options.receipts,
|
||
span_count: total_spans,
|
||
block_count: total_blocks,
|
||
cache_status: None,
|
||
cache_age_seconds: None,
|
||
error_count,
|
||
reading_order_algorithm: Some(final_reading_order_algorithm.as_str().to_string()),
|
||
diagnostics: all_diagnostics,
|
||
profile_name: None,
|
||
profile_version: None,
|
||
profile_fields: None,
|
||
})
|
||
}
|
||
|
||
/// Find the startxref offset in a PDF file.
|
||
///
|
||
/// Scans the last 1024 bytes of the file for "startxref" keyword.
|
||
fn find_startxref(source: &FileSource) -> anyhow::Result<u64> {
|
||
let len = SourcePdfSource::len(source) as usize;
|
||
let scan_start = len.saturating_sub(1024);
|
||
let scan_end = len;
|
||
|
||
let tail_data = source
|
||
.read_at(scan_start as u64, scan_end - scan_start)
|
||
.context("Failed to read PDF tail")?;
|
||
|
||
// Find "startxref" in the tail data
|
||
let startxref_pos = tail_data
|
||
.windows(9)
|
||
.rposition(|w| w == b"startxref")
|
||
.ok_or_else(|| anyhow::anyhow!("startxref not found in PDF"))?;
|
||
|
||
// Parse the offset after "startxref"
|
||
let offset_data = &tail_data[startxref_pos + 9..];
|
||
|
||
// Skip leading whitespace (space, \r, \n, \t)
|
||
let offset_start = offset_data
|
||
.iter()
|
||
.position(|&b| !matches!(b, b' ' | b'\r' | b'\n' | b'\t'))
|
||
.unwrap_or(offset_data.len());
|
||
|
||
let offset_data_trimmed = &offset_data[offset_start..];
|
||
|
||
// Find the newline after the offset
|
||
let newline_pos = offset_data_trimmed
|
||
.iter()
|
||
.position(|&b| b == b'\n' || b == b'\r')
|
||
.unwrap_or(offset_data_trimmed.len());
|
||
|
||
let offset_str = std::str::from_utf8(&offset_data_trimmed[..newline_pos])
|
||
.context("startxref offset is not valid UTF-8")?;
|
||
|
||
let offset: u64 = offset_str
|
||
.trim()
|
||
.parse()
|
||
.context("startxref offset is not a valid number")?;
|
||
|
||
Ok(offset)
|
||
}
|
||
|
||
/// Extract content from a single page dict.
|
||
///
|
||
/// This function extracts content from a page using lazy stream decoding:
|
||
/// 1. Content streams are decoded only for this page (not pre-fetched)
|
||
/// 2. Decoded bytes are dropped immediately after processing
|
||
/// 3. No state is held across page boundaries
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `fingerprint` - The PDF fingerprint for receipt generation
|
||
/// * `page_index` - 0-based page index
|
||
/// * `page` - The page dictionary from the PDF
|
||
/// * `options` - Extraction options
|
||
/// * `source` - The PDF source for reading stream data (optional, for lazy decode)
|
||
/// * `resolver` - The xref resolver (optional, for lazy decode)
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A `PageResultInternal` with grid information preserved for two-page detection.
|
||
fn extract_page_from_dict(
|
||
fingerprint: &str,
|
||
page_index: usize,
|
||
page: &crate::parser::pages::PageDict,
|
||
options: &ExtractionOptions,
|
||
source: Option<&dyn crate::parser::stream::PdfSource>,
|
||
resolver: Option<&crate::parser::xref::XrefResolver>,
|
||
) -> Result<PageResultInternal> {
|
||
let [x0, y0, x1, y1] = page.media_box;
|
||
let page_height = y1 - y0;
|
||
|
||
// Lazy decode content streams if source and resolver are provided
|
||
let decoded_streams = if let (Some(src), Some(res)) = (source, resolver) {
|
||
Some(decode_page_content_streams(
|
||
page,
|
||
res,
|
||
src,
|
||
DEFAULT_MAX_DECOMPRESS_BYTES,
|
||
))
|
||
} else {
|
||
None
|
||
};
|
||
|
||
// Detect tables using line-based and borderless detection
|
||
let tables = if let Some(ref content_bytes) = decoded_streams {
|
||
detect_tables_on_page(page, content_bytes, page_index)?
|
||
} else {
|
||
Vec::new()
|
||
};
|
||
|
||
// Create a placeholder span for the entire page
|
||
// This is a minimal implementation - the full Phase 3 pipeline
|
||
// would extract actual text from the decoded content streams
|
||
let span_text = format!("[Page {} text extraction]", page_index);
|
||
let span_bbox = [x0, y0, x1, y1];
|
||
|
||
// Generate receipt if requested
|
||
let receipt = generate_receipt(
|
||
fingerprint,
|
||
page_index,
|
||
span_bbox,
|
||
&span_text,
|
||
options.receipts,
|
||
#[cfg(feature = "receipts")]
|
||
None,
|
||
)?;
|
||
|
||
let span = SpanJson {
|
||
text: span_text,
|
||
bbox: span_bbox,
|
||
font: "Unknown".to_string(),
|
||
size: 12.0,
|
||
color: None,
|
||
rendering_mode: None,
|
||
confidence: None,
|
||
confidence_source: None,
|
||
lang: None,
|
||
flags: vec![],
|
||
receipt,
|
||
column: None,
|
||
};
|
||
|
||
// Create blocks including table blocks
|
||
let mut blocks = Vec::new();
|
||
|
||
// Add table blocks
|
||
for (table_idx, table) in tables.iter().enumerate() {
|
||
// Use the grid's bbox for the block, not a placeholder
|
||
let table_bbox = [
|
||
table.grid.bbox[0] as f64,
|
||
table.grid.bbox[1] as f64,
|
||
table.grid.bbox[2] as f64,
|
||
table.grid.bbox[3] as f64,
|
||
];
|
||
|
||
let table_receipt = generate_receipt(
|
||
fingerprint,
|
||
page_index,
|
||
table_bbox,
|
||
"table",
|
||
options.receipts,
|
||
#[cfg(feature = "receipts")]
|
||
None,
|
||
)?;
|
||
|
||
blocks.push(BlockJson {
|
||
kind: "table".to_string(),
|
||
text: format!("Table {}", table_idx),
|
||
bbox: table_bbox,
|
||
level: None,
|
||
table_index: Some(table_idx),
|
||
spans: vec![],
|
||
receipt: table_receipt,
|
||
});
|
||
}
|
||
|
||
// Add a placeholder paragraph block
|
||
let block_text = span.text.clone();
|
||
let block_bbox = span_bbox;
|
||
let block_receipt = generate_receipt(
|
||
fingerprint,
|
||
page_index,
|
||
block_bbox,
|
||
&block_text,
|
||
options.receipts,
|
||
#[cfg(feature = "receipts")]
|
||
None,
|
||
)?;
|
||
|
||
blocks.push(BlockJson {
|
||
kind: "paragraph".to_string(),
|
||
text: block_text,
|
||
bbox: block_bbox,
|
||
level: None,
|
||
table_index: None,
|
||
spans: vec![],
|
||
receipt: block_receipt,
|
||
});
|
||
|
||
Ok(PageResultInternal {
|
||
index: page_index,
|
||
spans: vec![span],
|
||
blocks,
|
||
tables,
|
||
annotations: vec![],
|
||
error: None,
|
||
page_height,
|
||
})
|
||
}
|
||
|
||
/// Detect tables on a page using line-based and borderless detection.
|
||
///
|
||
/// This function runs both detection methods and combines the results,
|
||
/// preferring line-based detection when both find tables in similar positions.
|
||
///
|
||
/// Returns `Vec<TableWithGrid>` to preserve grid information for two-page detection.
|
||
fn detect_tables_on_page(
|
||
page: &crate::parser::pages::PageDict,
|
||
content_bytes: &[u8],
|
||
page_index: usize,
|
||
) -> Result<Vec<TableWithGrid>> {
|
||
use crate::table::PageContext;
|
||
|
||
let ctx = PageContext::new(page, content_bytes);
|
||
let detector = TableDetector::new();
|
||
|
||
// Try line-based detection first
|
||
let line_based_grids = detector.detect_line_based(&ctx);
|
||
|
||
// If no tables found, try borderless detection
|
||
let grids = if line_based_grids.is_empty() {
|
||
detector.detect_borderless(&ctx)
|
||
} else {
|
||
line_based_grids
|
||
};
|
||
|
||
// Convert grids to TableWithGrid
|
||
let mut tables = Vec::new();
|
||
for grid in grids {
|
||
// Create empty cells (no span assignment yet - that requires full text extraction)
|
||
let cells = create_empty_cells(&grid);
|
||
|
||
let detection_method = if grid.segments.is_empty() {
|
||
"borderless"
|
||
} else {
|
||
"line_based"
|
||
};
|
||
|
||
let table_json = grid_to_table_json(
|
||
&grid,
|
||
&cells,
|
||
page_index,
|
||
detection_method,
|
||
false, // continued - will be set by two-page detection
|
||
false, // continued_from_prev - will be set by two-page detection
|
||
);
|
||
|
||
tables.push(TableWithGrid {
|
||
json: table_json,
|
||
grid,
|
||
});
|
||
}
|
||
|
||
Ok(tables)
|
||
}
|
||
|
||
/// Create empty cells for a grid (placeholder for when text extraction is not available).
|
||
fn create_empty_cells(grid: &crate::table::GridCandidate) -> Vec<Cell> {
|
||
let mut cells = Vec::new();
|
||
|
||
for row in 0..grid.row_count() {
|
||
for col in 0..grid.col_count() {
|
||
if let Some(bbox) = grid.cell_bbox(row, col) {
|
||
cells.push(Cell::new(bbox, row, col));
|
||
}
|
||
}
|
||
}
|
||
|
||
cells
|
||
}
|
||
|
||
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::path::Path;

    /// Create a minimal valid PDF for testing.
    fn create_minimal_pdf(path: &Path) -> Result<()> {
        let pdf_data = br#"%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj
2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]/Resources<</Font<</F1<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>>>>>>>>>endobj

xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000052 00000 n
0000000101 00000 n
trailer<</Size 4/Root 1 0 R>>
startxref
239
%%EOF
"#;
        fs::write(path, pdf_data)?;
        Ok(())
    }

    /// Write the minimal test PDF into a fresh temporary directory.
    ///
    /// Every caller gets its own copy, so tests running in parallel never
    /// observe a half-written file and nothing is left behind in the working
    /// directory. The returned guard removes the directory when dropped; bind
    /// it to a named variable (not `_`) so it lives for the whole test.
    fn ensure_test_pdf() -> (tempfile::TempDir, std::path::PathBuf) {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("minimal.pdf");
        create_minimal_pdf(&path).unwrap();
        (dir, path)
    }

    #[test]
    fn test_extract_pdf_with_receipts_off() {
        let (_pdf_dir, pdf_path) = ensure_test_pdf();

        let options = ExtractionOptions::default();
        let result = extract_pdf(&pdf_path, &options).unwrap();

        assert!(!result.pages.is_empty());
        assert_eq!(result.metadata.receipts_mode, ReceiptsMode::Off);

        let page = &result.pages[0];
        assert!(!page.spans.is_empty());

        // Receipts should be None when mode is Off
        for span in &page.spans {
            assert!(span.receipt.is_none());
        }
        for block in &page.blocks {
            assert!(block.receipt.is_none());
        }
    }

    #[test]
    fn test_extract_pdf_with_receipts_lite() {
        let (_pdf_dir, pdf_path) = ensure_test_pdf();

        let options = ExtractionOptions::with_receipts(ReceiptsMode::Lite);
        let result = extract_pdf(&pdf_path, &options).unwrap();

        assert_eq!(result.metadata.receipts_mode, ReceiptsMode::Lite);

        let page = &result.pages[0];
        assert!(!page.spans.is_empty());

        // Receipts should be present in lite mode
        for span in &page.spans {
            assert!(span.receipt.is_some());
            let receipt = span.receipt.as_ref().unwrap();
            assert_eq!(receipt.pdf_fingerprint, result.fingerprint);
            assert!(receipt.svg_clip.is_none());
        }

        for block in &page.blocks {
            assert!(block.receipt.is_some());
            let receipt = block.receipt.as_ref().unwrap();
            assert_eq!(receipt.pdf_fingerprint, result.fingerprint);
            assert!(receipt.svg_clip.is_none());
        }
    }

    #[test]
    fn test_extract_pdf_with_receipts_svg() {
        let (_pdf_dir, pdf_path) = ensure_test_pdf();

        let options = ExtractionOptions::with_receipts(ReceiptsMode::SvgClip);
        let result = extract_pdf(&pdf_path, &options).unwrap();

        assert_eq!(result.metadata.receipts_mode, ReceiptsMode::SvgClip);

        let page = &result.pages[0];
        assert!(!page.spans.is_empty());

        // Receipts should be present
        // Note: In this minimal implementation without glyph data,
        // SVG mode falls back to lite mode (svg_clip is None)
        for span in &page.spans {
            assert!(span.receipt.is_some());
            let receipt = span.receipt.as_ref().unwrap();
            assert_eq!(receipt.pdf_fingerprint, result.fingerprint);
        }
    }

    #[test]
    fn test_result_to_json_format() {
        let (_pdf_dir, pdf_path) = ensure_test_pdf();

        let options = ExtractionOptions::default();
        let result = extract_pdf(&pdf_path, &options).unwrap();
        let json = result_to_json(&result);

        assert!(json.is_object());
        assert!(json.get("fingerprint").is_some());
        assert!(json.get("schema_version").is_some());
        assert!(json.get("pages").is_some());
        assert!(json.get("metadata").is_some());

        let pages = json.get("pages").and_then(|v| v.as_array()).unwrap();
        assert_eq!(pages.len(), 1);

        let page = &pages[0];
        assert!(page.get("index").is_some());
        assert!(page.get("spans").is_some());
        assert!(page.get("blocks").is_some());
    }

    #[test]
    fn test_result_to_json_with_receipts() {
        let (_pdf_dir, pdf_path) = ensure_test_pdf();

        let options = ExtractionOptions::with_receipts(ReceiptsMode::Lite);
        let result = extract_pdf(&pdf_path, &options).unwrap();
        let json = result_to_json(&result);

        let pages = json.get("pages").and_then(|v| v.as_array()).unwrap();
        let page = &pages[0];
        let spans = page.get("spans").and_then(|v| v.as_array()).unwrap();
        let span = &spans[0];

        // Span should have receipt field
        assert!(span.get("receipt").is_some());

        let receipt = span.get("receipt").unwrap();
        assert!(receipt.get("pdf_fingerprint").is_some());
        assert!(receipt.get("page_index").is_some());
        assert!(receipt.get("bbox").is_some());
        assert!(receipt.get("content_hash").is_some());
        assert!(receipt.get("extraction_version").is_some());

        // svg_clip should not be present in lite mode
        assert!(receipt.get("svg_clip").is_none());
    }

    #[test]
    fn test_extraction_metadata() {
        let (_pdf_dir, pdf_path) = ensure_test_pdf();

        let options = ExtractionOptions::with_receipts(ReceiptsMode::Lite);
        let result = extract_pdf(&pdf_path, &options).unwrap();

        assert!(result.metadata.page_count >= 1);
        assert!(result.metadata.span_count > 0);
        assert!(result.metadata.block_count > 0);
        assert_eq!(result.metadata.receipts_mode, ReceiptsMode::Lite);
    }

    #[test]
    fn test_result_to_json_includes_signatures() {
        // Test that result_to_json includes the signatures array
        let (_pdf_dir, pdf_path) = ensure_test_pdf();

        let options = ExtractionOptions::default();
        let result = extract_pdf(&pdf_path, &options).unwrap();

        let json = result_to_json(&result);

        // Verify signatures key exists
        assert!(json.get("signatures").is_some());

        // Verify signatures is an array
        assert!(json["signatures"].is_array());

        // For most test PDFs, signatures will be empty (no signature fields)
        // But the array should always be present
    }

    #[test]
    fn test_signatures_always_not_checked() {
        // Test that all signatures have validation_status == "not_checked"
        // This is required by the plan - cryptographic verification is out of scope for v1
        let (_pdf_dir, pdf_path) = ensure_test_pdf();

        let options = ExtractionOptions::default();
        let result = extract_pdf(&pdf_path, &options).unwrap();

        for sig in &result.signatures {
            assert_eq!(sig.validation_status, "not_checked");
        }
    }

    #[test]
    fn test_signature_json_schema_round_trip() {
        // Test that SignatureJson round-trips through JSON correctly
        use crate::schema::SignatureJson;

        let sig = SignatureJson {
            field_name: "test_sig".to_string(),
            signer_name: "John Doe".to_string(),
            signing_date: Some("2023-01-15T14:30:45Z".to_string()),
            reason: Some("Test".to_string()),
            location: Some("Test Location".to_string()),
            sub_filter: Some("adbe.pkcs7.detached".to_string()),
            byte_range: Some(vec![0, 1000, 2000, 500]),
            coverage_fraction: Some(0.5),
            validation_status: "not_checked".to_string(),
        };

        let json_str = serde_json::to_string(&sig).unwrap();
        let deserialized: SignatureJson = serde_json::from_str(&json_str).unwrap();

        assert_eq!(deserialized, sig);
    }

    #[test]
    fn test_signature_json_validation_status_enum() {
        // Test that a "not_checked" validation_status survives serialization
        use crate::schema::SignatureJson;

        let sig_valid = SignatureJson {
            field_name: "test".to_string(),
            signer_name: String::new(),
            signing_date: None,
            reason: None,
            location: None,
            sub_filter: None,
            byte_range: None,
            coverage_fraction: None,
            validation_status: "not_checked".to_string(),
        };

        // Should serialize correctly
        let json = serde_json::to_string(&sig_valid).unwrap();
        assert!(json.contains("not_checked"));
    }

    #[test]
    fn test_tagged_pdf_emits_deferred_diagnostic() {
        // Test that tagged PDFs emit TAGGED_PDF_STRUCT_TREE_DEFERRED diagnostic
        let temp_dir = tempfile::tempdir().unwrap();
        let pdf_path = temp_dir.path().join("tagged_test.pdf");

        // Create a minimal tagged PDF (with /MarkInfo /Marked true)
        let pdf_data = br#"%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R/MarkInfo<</Marked true>>>>endobj
2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj
3 0 obj<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]/Resources<</Font<</F1<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>>>>>>>>>endobj

xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000096 00000 n
0000000145 00000 n
trailer<</Size 4/Root 1 0 R>>
startxref
283
%%EOF
"#;
        fs::write(&pdf_path, pdf_data).unwrap();

        let options = ExtractionOptions::default();
        let result = extract_pdf(&pdf_path, &options).unwrap();

        // Verify the tagged PDF diagnostic is emitted
        assert!(!result.metadata.diagnostics.is_empty());
        assert!(
            result
                .metadata
                .diagnostics
                .iter()
                .any(|d| d.contains("TAGGED_PDF_STRUCT_TREE_DEFERRED")),
            "TAGGED_PDF_STRUCT_TREE_DEFERRED diagnostic should be emitted for tagged PDFs"
        );

        // Verify the reading order algorithm is xy_cut
        assert_eq!(
            result.metadata.reading_order_algorithm,
            Some("xy_cut".to_string()),
            "Tagged PDFs should use xy_cut algorithm in v0.1.0-v0.3.0"
        );
    }

    #[test]
    fn test_untagged_pdf_no_deferred_diagnostic() {
        // Test that untagged PDFs do NOT emit TAGGED_PDF_STRUCT_TREE_DEFERRED
        let (_pdf_dir, pdf_path) = ensure_test_pdf();

        let options = ExtractionOptions::default();
        let result = extract_pdf(&pdf_path, &options).unwrap();

        // Verify NO tagged PDF diagnostic is emitted
        let has_deferred_diag = result
            .metadata
            .diagnostics
            .iter()
            .any(|d| d.contains("TAGGED_PDF_STRUCT_TREE_DEFERRED"));

        assert!(
            !has_deferred_diag,
            "Untagged PDFs should NOT emit TAGGED_PDF_STRUCT_TREE_DEFERRED diagnostic"
        );
    }
}
|