diff --git a/crates/pdftract-cli/src/mcp/tools/registry.rs b/crates/pdftract-cli/src/mcp/tools/registry.rs index f12578f..558638c 100644 --- a/crates/pdftract-cli/src/mcp/tools/registry.rs +++ b/crates/pdftract-cli/src/mcp/tools/registry.rs @@ -281,7 +281,7 @@ fn open_pdf( let resolver = parser::xref::XrefResolver::from_section(xref_section.clone()); // Try to parse the catalog - let catalog_result = catalog::parse_catalog(&resolver, *root_ref); + let catalog_result = catalog::parse_catalog(&resolver, *root_ref, Some(&source as &dyn pdftract_core::parser::stream::PdfSource)); match catalog_result { Ok(catalog) => { diff --git a/crates/pdftract-core/src/document.rs b/crates/pdftract-core/src/document.rs index 266cc0a..ab9d577 100644 --- a/crates/pdftract-core/src/document.rs +++ b/crates/pdftract-core/src/document.rs @@ -66,7 +66,7 @@ pub fn parse_pdf_file( .ok_or_else(|| anyhow!("No /Root reference in trailer"))?; // Parse the catalog - let catalog = parse_catalog(&resolver, root_ref).map_err(|diagnostics| { + let catalog = parse_catalog(&resolver, root_ref, Some(&source as &dyn PdfSource)).map_err(|diagnostics| { let msg = diagnostics .first() .map(|d| d.message.as_ref()) @@ -305,7 +305,7 @@ impl PdfExtractor { .ok_or_else(|| anyhow!("No /Root reference in trailer"))?; // Parse the catalog - let catalog = parse_catalog(&resolver, root_ref).map_err(|diagnostics| { + let catalog = parse_catalog(&resolver, root_ref, Some(&source as &dyn PdfSource)).map_err(|diagnostics| { let msg = diagnostics .first() .map(|d| d.message.as_ref()) diff --git a/crates/pdftract-core/src/extract.rs b/crates/pdftract-core/src/extract.rs index 6ec51d8..dd83a1d 100644 --- a/crates/pdftract-core/src/extract.rs +++ b/crates/pdftract-core/src/extract.rs @@ -24,7 +24,7 @@ use crate::forms::{ use crate::options::{ExtractionOptions, ReceiptsMode}; use crate::parser::catalog::ReadingOrderAlgorithm; use crate::parser::marked_content::{track_mcids_from_content_stream, McidTracker}; -use crate::parser::stream::FileSource; +use crate::parser::stream::{FileSource, PdfSource}; use crate::parser::stream::DEFAULT_MAX_DECOMPRESS_BYTES; use crate::parser::struct_tree::{check_coverage_for_pages, parse_struct_tree}; use crate::receipts::Receipt; @@ -368,7 +368,7 @@ pub fn extract_pdf( .ok_or_else(|| anyhow::anyhow!("No /Root reference in trailer"))?; // Parse the catalog - let catalog = parse_catalog(&resolver, root_ref).map_err(|diagnostics| { + let catalog = parse_catalog(&resolver, root_ref, Some(&source as &dyn PdfSource)).map_err(|diagnostics| { let msg = diagnostics .first() .map(|d| d.message.as_ref()) @@ -1249,7 +1249,7 @@ pub fn extract_pdf_ndjson( .ok_or_else(|| anyhow::anyhow!("No /Root reference in trailer"))?; // Parse the catalog - let catalog = parse_catalog(&resolver, root_ref).map_err(|diagnostics| { + let catalog = parse_catalog(&resolver, root_ref, Some(&source as &dyn PdfSource)).map_err(|diagnostics| { let msg = diagnostics .first() .map(|d| d.message.as_ref()) @@ -1544,7 +1544,7 @@ where .ok_or_else(|| anyhow::anyhow!("No /Root reference in trailer"))?; // Parse the catalog - let catalog = parse_catalog(&resolver, root_ref).map_err(|diagnostics| { + let catalog = parse_catalog(&resolver, root_ref, Some(&source as &dyn PdfSource)).map_err(|diagnostics| { let msg = diagnostics .first() .map(|d| d.message.as_ref()) diff --git a/crates/pdftract-core/src/parser/catalog.rs b/crates/pdftract-core/src/parser/catalog.rs index d528d99..d6bc06a 100644 --- a/crates/pdftract-core/src/parser/catalog.rs +++ b/crates/pdftract-core/src/parser/catalog.rs @@ -6,6 +6,7 @@ use crate::diagnostics::{DiagCode, Diagnostic}; use crate::parser::object::{intern, ObjRef, PdfObject}; +use crate::parser::stream::PdfSource; use crate::parser::ocg::{parse_oc_properties, OcProperties}; use crate::parser::xref::XrefResolver; @@ -451,6 +452,7 @@ impl Default for Catalog { /// # Arguments /// * `resolver` - The xref resolver for resolving indirect references /// * `root_ref` - The object reference to the catalog (/Root in trailer) +/// * `source` - Optional PDF source for reading indirect objects. If None, uses cached objects only. /// /// # Returns /// A `Result` containing the parsed catalog or a list of diagnostics. @@ -459,12 +461,20 @@ impl Default for Catalog { /// - If /Pages is missing, emits STRUCT_MISSING_KEY and returns an empty catalog /// - All other entries are optional; missing entries are None/defaults /// - Never panics; all errors become diagnostics -pub fn parse_catalog(resolver: &XrefResolver, root_ref: ObjRef) -> Result { +pub fn parse_catalog( + resolver: &XrefResolver, + root_ref: ObjRef, + source: Option<&dyn PdfSource>, +) -> Result { let mut catalog = Catalog::default(); let mut diagnostics = Vec::new(); - // Resolve the root object - let root_obj = match resolver.resolve(root_ref) { + // Resolve the root object using source if available, otherwise use cache-only resolve + let root_obj = match source { + Some(src) => resolver.resolve_with_source(root_ref, src), + None => resolver.resolve(root_ref), + }; + let root_obj = match root_obj { Ok(obj) => obj, Err(e) => { diagnostics.push(Diagnostic::with_dynamic_no_offset( @@ -824,7 +834,7 @@ mod tests { let catalog_obj = make_test_catalog_dict(); resolver.cache_object(root_ref, catalog_obj); - let result = parse_catalog(&resolver, root_ref); + let result = parse_catalog(&resolver, root_ref, None); assert!(result.is_ok()); let catalog = result.unwrap(); @@ -846,7 +856,7 @@ mod tests { let catalog_obj = PdfObject::Dict(Box::new(dict)); resolver.cache_object(root_ref, catalog_obj); - let result = parse_catalog(&resolver, root_ref); + let result = parse_catalog(&resolver, root_ref, None); assert!(result.is_ok()); let catalog = result.unwrap(); @@ -867,7 +877,7 @@ mod tests { // Cache a non-dict object resolver.cache_object(root_ref, PdfObject::Integer(42)); - let result = parse_catalog(&resolver, root_ref); + let result = parse_catalog(&resolver, root_ref, None); assert!(result.is_err()); } @@ -877,7 +887,7 @@ mod tests { let root_ref = ObjRef::new(999, 0); // Don't cache anything; resolve will fail - let result = parse_catalog(&resolver, root_ref); + let result = parse_catalog(&resolver, root_ref, None); assert!(result.is_err()); } @@ -892,7 +902,7 @@ mod tests { let catalog_obj = PdfObject::Dict(Box::new(dict)); resolver.cache_object(root_ref, catalog_obj); - let result = parse_catalog(&resolver, root_ref); + let result = parse_catalog(&resolver, root_ref, None); assert!(result.is_ok()); let catalog = result.unwrap(); @@ -926,7 +936,7 @@ mod tests { let catalog_obj = PdfObject::Dict(Box::new(dict)); resolver.cache_object(root_ref, catalog_obj); - let catalog = parse_catalog(&resolver, root_ref).unwrap(); + let catalog = parse_catalog(&resolver, root_ref, None).unwrap(); assert!(catalog.mark_info.is_tagged); } @@ -941,7 +951,7 @@ mod tests { let catalog_obj = PdfObject::Dict(Box::new(dict)); resolver.cache_object(root_ref, catalog_obj); - let catalog = parse_catalog(&resolver, root_ref).unwrap(); + let catalog = parse_catalog(&resolver, root_ref, None).unwrap(); assert_eq!(catalog.version, Some("2.0".to_string())); } @@ -1178,7 +1188,7 @@ mod proptests { resolver.cache_object(root_ref, catalog_obj); // This should never panic - it should always return Ok or Err with diagnostics - let result = parse_catalog(&resolver, root_ref); + let result = parse_catalog(&resolver, root_ref, None); // If we get Ok, verify the catalog is structurally valid // If we get Err, verify diagnostics are present