fix(bf-3gmkz): resolve the catalog root via resolve_with_source in parse_catalog (works around the stubbed XrefResolver::resolve)

The XrefResolver::resolve method was a stub returning Null, causing
parse_catalog to fail with '/Root is not a dictionary (type: null)'.

Changes:
- Added a `source: Option<&dyn PdfSource>` parameter to `parse_catalog`
- `parse_catalog` now calls `resolve_with_source` when `source` is `Some`, and falls back to the cache-only `resolve` when it is `None`
- Updated all callers (document.rs, extract.rs, CLI registry.rs) to pass `Some(&source)`
- Tests continue to pass `None` and rely on objects already cached in the resolver

Fixes: bf-3gmkz

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
jedarden 2026-05-26 01:31:57 -04:00
parent d48c6856fb
commit 9889b96aca
4 changed files with 28 additions and 18 deletions

View file

@ -281,7 +281,7 @@ fn open_pdf(
let resolver = parser::xref::XrefResolver::from_section(xref_section.clone());
// Try to parse the catalog
let catalog_result = catalog::parse_catalog(&resolver, *root_ref);
let catalog_result = catalog::parse_catalog(&resolver, *root_ref, Some(&source as &dyn pdftract_core::parser::stream::PdfSource));
match catalog_result {
Ok(catalog) => {

View file

@ -66,7 +66,7 @@ pub fn parse_pdf_file(
.ok_or_else(|| anyhow!("No /Root reference in trailer"))?;
// Parse the catalog
let catalog = parse_catalog(&resolver, root_ref).map_err(|diagnostics| {
let catalog = parse_catalog(&resolver, root_ref, Some(&source as &dyn PdfSource)).map_err(|diagnostics| {
let msg = diagnostics
.first()
.map(|d| d.message.as_ref())
@ -305,7 +305,7 @@ impl PdfExtractor {
.ok_or_else(|| anyhow!("No /Root reference in trailer"))?;
// Parse the catalog
let catalog = parse_catalog(&resolver, root_ref).map_err(|diagnostics| {
let catalog = parse_catalog(&resolver, root_ref, Some(&source as &dyn PdfSource)).map_err(|diagnostics| {
let msg = diagnostics
.first()
.map(|d| d.message.as_ref())

View file

@ -24,7 +24,7 @@ use crate::forms::{
use crate::options::{ExtractionOptions, ReceiptsMode};
use crate::parser::catalog::ReadingOrderAlgorithm;
use crate::parser::marked_content::{track_mcids_from_content_stream, McidTracker};
use crate::parser::stream::FileSource;
use crate::parser::stream::{FileSource, PdfSource};
use crate::parser::stream::DEFAULT_MAX_DECOMPRESS_BYTES;
use crate::parser::struct_tree::{check_coverage_for_pages, parse_struct_tree};
use crate::receipts::Receipt;
@ -368,7 +368,7 @@ pub fn extract_pdf(
.ok_or_else(|| anyhow::anyhow!("No /Root reference in trailer"))?;
// Parse the catalog
let catalog = parse_catalog(&resolver, root_ref).map_err(|diagnostics| {
let catalog = parse_catalog(&resolver, root_ref, Some(&source as &dyn PdfSource)).map_err(|diagnostics| {
let msg = diagnostics
.first()
.map(|d| d.message.as_ref())
@ -1249,7 +1249,7 @@ pub fn extract_pdf_ndjson<W: std::io::Write>(
.ok_or_else(|| anyhow::anyhow!("No /Root reference in trailer"))?;
// Parse the catalog
let catalog = parse_catalog(&resolver, root_ref).map_err(|diagnostics| {
let catalog = parse_catalog(&resolver, root_ref, Some(&source as &dyn PdfSource)).map_err(|diagnostics| {
let msg = diagnostics
.first()
.map(|d| d.message.as_ref())
@ -1544,7 +1544,7 @@ where
.ok_or_else(|| anyhow::anyhow!("No /Root reference in trailer"))?;
// Parse the catalog
let catalog = parse_catalog(&resolver, root_ref).map_err(|diagnostics| {
let catalog = parse_catalog(&resolver, root_ref, Some(&source as &dyn PdfSource)).map_err(|diagnostics| {
let msg = diagnostics
.first()
.map(|d| d.message.as_ref())

View file

@ -6,6 +6,7 @@
use crate::diagnostics::{DiagCode, Diagnostic};
use crate::parser::object::{intern, ObjRef, PdfObject};
use crate::parser::stream::PdfSource;
use crate::parser::ocg::{parse_oc_properties, OcProperties};
use crate::parser::xref::XrefResolver;
@ -451,6 +452,7 @@ impl Default for Catalog {
/// # Arguments
/// * `resolver` - The xref resolver for resolving indirect references
/// * `root_ref` - The object reference to the catalog (/Root in trailer)
/// * `source` - Optional PDF source for reading indirect objects. If None, uses cached objects only.
///
/// # Returns
/// A `Result<Catalog>` containing the parsed catalog or a list of diagnostics.
@ -459,12 +461,20 @@ impl Default for Catalog {
/// - If /Pages is missing, emits STRUCT_MISSING_KEY and returns an empty catalog
/// - All other entries are optional; missing entries are None/defaults
/// - Never panics; all errors become diagnostics
pub fn parse_catalog(resolver: &XrefResolver, root_ref: ObjRef) -> Result<Catalog> {
pub fn parse_catalog(
resolver: &XrefResolver,
root_ref: ObjRef,
source: Option<&dyn PdfSource>,
) -> Result<Catalog> {
let mut catalog = Catalog::default();
let mut diagnostics = Vec::new();
// Resolve the root object
let root_obj = match resolver.resolve(root_ref) {
// Resolve the root object using source if available, otherwise use cache-only resolve
let root_obj = match source {
Some(src) => resolver.resolve_with_source(root_ref, src),
None => resolver.resolve(root_ref),
};
let root_obj = match root_obj {
Ok(obj) => obj,
Err(e) => {
diagnostics.push(Diagnostic::with_dynamic_no_offset(
@ -824,7 +834,7 @@ mod tests {
let catalog_obj = make_test_catalog_dict();
resolver.cache_object(root_ref, catalog_obj);
let result = parse_catalog(&resolver, root_ref);
let result = parse_catalog(&resolver, root_ref, None);
assert!(result.is_ok());
let catalog = result.unwrap();
@ -846,7 +856,7 @@ mod tests {
let catalog_obj = PdfObject::Dict(Box::new(dict));
resolver.cache_object(root_ref, catalog_obj);
let result = parse_catalog(&resolver, root_ref);
let result = parse_catalog(&resolver, root_ref, None);
assert!(result.is_ok());
let catalog = result.unwrap();
@ -867,7 +877,7 @@ mod tests {
// Cache a non-dict object
resolver.cache_object(root_ref, PdfObject::Integer(42));
let result = parse_catalog(&resolver, root_ref);
let result = parse_catalog(&resolver, root_ref, None);
assert!(result.is_err());
}
@ -877,7 +887,7 @@ mod tests {
let root_ref = ObjRef::new(999, 0);
// Don't cache anything; resolve will fail
let result = parse_catalog(&resolver, root_ref);
let result = parse_catalog(&resolver, root_ref, None);
assert!(result.is_err());
}
@ -892,7 +902,7 @@ mod tests {
let catalog_obj = PdfObject::Dict(Box::new(dict));
resolver.cache_object(root_ref, catalog_obj);
let result = parse_catalog(&resolver, root_ref);
let result = parse_catalog(&resolver, root_ref, None);
assert!(result.is_ok());
let catalog = result.unwrap();
@ -926,7 +936,7 @@ mod tests {
let catalog_obj = PdfObject::Dict(Box::new(dict));
resolver.cache_object(root_ref, catalog_obj);
let catalog = parse_catalog(&resolver, root_ref).unwrap();
let catalog = parse_catalog(&resolver, root_ref, None).unwrap();
assert!(catalog.mark_info.is_tagged);
}
@ -941,7 +951,7 @@ mod tests {
let catalog_obj = PdfObject::Dict(Box::new(dict));
resolver.cache_object(root_ref, catalog_obj);
let catalog = parse_catalog(&resolver, root_ref).unwrap();
let catalog = parse_catalog(&resolver, root_ref, None).unwrap();
assert_eq!(catalog.version, Some("2.0".to_string()));
}
@ -1178,7 +1188,7 @@ mod proptests {
resolver.cache_object(root_ref, catalog_obj);
// This should never panic - it should always return Ok or Err with diagnostics
let result = parse_catalog(&resolver, root_ref);
let result = parse_catalog(&resolver, root_ref, None);
// If we get Ok, verify the catalog is structurally valid
// If we get Err, verify diagnostics are present