diff --git a/crates/pdftract-core/src/attachment/associated_files.rs b/crates/pdftract-core/src/attachment/associated_files.rs new file mode 100644 index 0000000..febb182 --- /dev/null +++ b/crates/pdftract-core/src/attachment/associated_files.rs @@ -0,0 +1,474 @@ +//! /AF (Associated Files) array walker (PDF 2.0). +//! +//! This module implements the /AF array walker for PDF 2.0 documents. +//! /AF is the canonical location for embedded attachments in PDF 2.0, +//! superseding the legacy /EmbeddedFiles name tree. +//! +//! Per ISO 32000-2 §14.13: +//! - /AF is an array of Filespec dictionary references +//! - Each Filespec may have /AFRelationship indicating the file's role +//! - /AF can appear at document-level (/Catalog), page-level, or annotation-level +//! (this module only handles document-level /Catalog /AF) +//! +//! # Relationship values +//! +//! Per PDF 2.0 spec, /AFRelationship can be: +//! - "Source": The file is the source for the content of the PDF +//! - "Data": The file contains data referenced by the PDF +//! - "Alternative": An alternative representation of the PDF +//! - "Supplement": Supplementary data for the PDF +//! - "EncryptedPayload": The file is an encrypted payload +//! - "Unspecified": No specific relationship (default) + +use crate::parser::object::ObjRef; +use crate::parser::xref::XrefResolver; +use crate::diagnostics::{Diagnostic, DiagCode}; + +/// Result type for /AF parsing. +pub type Result = std::result::Result>; + +/// A single entry from the /AF array. +/// +/// Contains the optional /AFRelationship string and the Filespec reference. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct AssociatedFileEntry { + /// The /AFRelationship value (e.g., "Source", "Data", "Alternative") + /// + /// None if the Filespec does not specify a relationship. + pub relationship: Option, + /// Reference to the Filespec dictionary + pub filespec_ref: ObjRef, +} + +impl AssociatedFileEntry { + /// Create a new associated file entry. 
+ pub fn new(relationship: Option, filespec_ref: ObjRef) -> Self { + Self { + relationship, + filespec_ref, + } + } +} + +/// Walk the /AF (Associated Files) array from the document catalog. +/// +/// # Arguments +/// * `resolver` - The xref resolver for resolving indirect references +/// * `catalog_dict` - The catalog dictionary (already resolved) +/// +/// # Returns +/// +/// A `Result>` containing the list of associated files. +/// Returns an empty Vec if /AF is absent (not an error). +/// +/// # Behavior +/// +/// - If /AF is absent → returns Ok(vec![]) +/// - If /AF is not an array → emits diagnostic, returns Ok(vec![]) +/// - For each entry in /AF: +/// - Must be a Ref (Filespec reference) +/// - Resolves the Filespec to extract /AFRelationship +/// - Skips non-Ref entries with diagnostic +/// +/// # Example +/// +/// ```ignore +/// use pdftract_core::attachment::associated_files::{walk_af_array, AssociatedFileEntry}; +/// +/// // catalog_dict is the parsed /Catalog dictionary +/// let entries = walk_af_array(&resolver, &catalog_dict)?; +/// +/// for entry in entries { +/// let relationship = entry.relationship.as_deref().unwrap_or("Unspecified"); +/// println!("Filespec {}: relationship={}", entry.filespec_ref, relationship); +/// } +/// ``` +pub fn walk_af_array( + resolver: &XrefResolver, + catalog_dict: &crate::parser::object::PdfDict, +) -> Result> { + let mut entries = Vec::new(); + let mut diagnostics = Vec::new(); + + // Get /AF from catalog (optional) + let af_obj = match catalog_dict.get("/AF") { + Some(obj) => obj, + None => { + // /AF is absent in PDF 1.7 documents - this is normal + return Ok(entries); + } + }; + + // /AF must be an array + let af_array = match af_obj.as_array() { + Some(arr) => arr, + None => { + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructInvalidType, + format!("/AF is not an array (type: {})", af_obj.type_name()), + )); + return Err(diagnostics); + } + }; + + // Iterate through /AF array entries + 
for (idx, entry_obj) in af_array.iter().enumerate() { + // Each entry must be a Ref to a Filespec dictionary + let filespec_ref = match entry_obj.as_ref() { + Some(r) => r, + None => { + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructInvalidType, + format!("/AF[{}] is not a reference (type: {})", idx, entry_obj.type_name()), + )); + continue; + } + }; + + // Resolve the Filespec to extract /AFRelationship + let relationship = match extract_af_relationship(resolver, filespec_ref) { + Ok(rel) => rel, + Err(mut errs) => { + diagnostics.append(&mut errs); + continue; + } + }; + + entries.push(AssociatedFileEntry::new(relationship, filespec_ref)); + } + + if !diagnostics.is_empty() { + return Err(diagnostics); + } + + Ok(entries) +} + +/// Extract the /AFRelationship value from a Filespec dictionary. +/// +/// # Arguments +/// * `resolver` - The xref resolver +/// * `filespec_ref` - Reference to the Filespec dictionary +/// +/// # Returns +/// +/// `Ok(Some(String))` if /AFRelationship is present, +/// `Ok(None)` if absent (valid; not all Filespecs have this), +/// `Err` if resolution fails. 
+fn extract_af_relationship( + resolver: &XrefResolver, + filespec_ref: ObjRef, +) -> Result> { + let mut diagnostics = Vec::new(); + + // Resolve the Filespec dictionary + let filespec_obj = match resolver.resolve(filespec_ref) { + Ok(obj) => obj, + Err(e) => { + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructUnexpectedEof, + format!("Failed to resolve Filespec {}: {}", filespec_ref, e), + )); + return Err(diagnostics); + } + }; + + // Get the Filespec dictionary + let filespec_dict = match filespec_obj.as_dict() { + Some(d) => d, + None => { + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructInvalidType, + format!("Filespec {} is not a dictionary (type: {})", filespec_ref, filespec_obj.type_name()), + )); + return Err(diagnostics); + } + }; + + // Extract /AFRelationship (optional) + let relationship = filespec_dict + .get("/AFRelationship") + .and_then(|obj| { + // /AFRelationship is typically a Name object + obj.as_name().map(|s| s.to_string()) + }); + + Ok(relationship) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::parser::object::{intern, PdfDict, PdfObject}; + use indexmap::IndexMap; + + /// Helper to create a test Filespec dictionary. + fn make_filespec( + resolver: &XrefResolver, + obj_ref: ObjRef, + relationship: Option<&str>, + ) { + let mut dict = IndexMap::new(); + dict.insert(intern("/Type"), PdfObject::Name(intern("Filespec"))); + dict.insert(intern("/F"), PdfObject::Name(intern("test.pdf"))); + dict.insert(intern("/UF"), PdfObject::Name(intern("test.pdf"))); + + if let Some(rel) = relationship { + dict.insert(intern("/AFRelationship"), PdfObject::Name(intern(rel))); + } + + resolver.cache_object(obj_ref, PdfObject::Dict(Box::new(dict))); + } + + /// Helper to create a test /AF array. 
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::object::{intern, PdfDict, PdfObject};
    use indexmap::IndexMap;

    /// Cache a minimal Filespec dictionary under `obj_ref` in the resolver,
    /// optionally carrying an /AFRelationship name.
    fn make_filespec(resolver: &XrefResolver, obj_ref: ObjRef, relationship: Option<&str>) {
        let mut dict = IndexMap::new();
        dict.insert(intern("/Type"), PdfObject::Name(intern("Filespec")));
        dict.insert(intern("/F"), PdfObject::Name(intern("test.pdf")));
        dict.insert(intern("/UF"), PdfObject::Name(intern("test.pdf")));

        if let Some(rel) = relationship {
            dict.insert(intern("/AFRelationship"), PdfObject::Name(intern(rel)));
        }

        resolver.cache_object(obj_ref, PdfObject::Dict(Box::new(dict)));
    }

    /// Build an /AF array object whose elements are references, in order.
    fn make_af_array(refs: &[ObjRef]) -> PdfObject {
        let arr: Vec<PdfObject> = refs.iter().map(|&r| PdfObject::Ref(r)).collect();
        PdfObject::Array(Box::new(arr))
    }

    /// Build a catalog dictionary whose only key is /AF with the given value.
    fn catalog_with_af(af: PdfObject) -> PdfDict {
        let mut catalog = PdfDict::new();
        catalog.insert(intern("/AF"), af);
        catalog
    }

    #[test]
    fn test_walk_af_array_empty() {
        let resolver = XrefResolver::new();
        let catalog_dict = PdfDict::new();

        let result = walk_af_array(&resolver, &catalog_dict);
        assert!(result.is_ok());
        assert!(result.unwrap().is_empty());
    }

    #[test]
    fn test_walk_af_array_single_entry() {
        let resolver = XrefResolver::new();

        // Filespec with /AFRelationship
        let filespec_ref = ObjRef::new(10, 0);
        make_filespec(&resolver, filespec_ref, Some("Source"));

        let catalog_dict = catalog_with_af(make_af_array(&[filespec_ref]));

        let result = walk_af_array(&resolver, &catalog_dict);
        assert!(result.is_ok());

        let entries = result.unwrap();
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].relationship, Some("Source".to_string()));
        assert_eq!(entries[0].filespec_ref, filespec_ref);
    }

    #[test]
    fn test_walk_af_array_multiple_entries() {
        let resolver = XrefResolver::new();

        // Three Filespecs with different relationships
        let fs1 = ObjRef::new(10, 0);
        make_filespec(&resolver, fs1, Some("Source"));

        let fs2 = ObjRef::new(11, 0);
        make_filespec(&resolver, fs2, Some("Data"));

        let fs3 = ObjRef::new(12, 0);
        make_filespec(&resolver, fs3, Some("Alternative"));

        let catalog_dict = catalog_with_af(make_af_array(&[fs1, fs2, fs3]));

        let result = walk_af_array(&resolver, &catalog_dict);
        assert!(result.is_ok());

        let entries = result.unwrap();
        assert_eq!(entries.len(), 3);
        assert_eq!(entries[0].relationship, Some("Source".to_string()));
        assert_eq!(entries[1].relationship, Some("Data".to_string()));
        assert_eq!(entries[2].relationship, Some("Alternative".to_string()));
    }

    #[test]
    fn test_walk_af_array_no_relationship() {
        let resolver = XrefResolver::new();

        // Filespec without /AFRelationship
        let filespec_ref = ObjRef::new(10, 0);
        make_filespec(&resolver, filespec_ref, None);

        let catalog_dict = catalog_with_af(make_af_array(&[filespec_ref]));

        let result = walk_af_array(&resolver, &catalog_dict);
        assert!(result.is_ok());

        let entries = result.unwrap();
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].relationship, None);
    }

    #[test]
    fn test_walk_af_array_not_an_array() {
        let resolver = XrefResolver::new();

        // /AF present but not an array
        let catalog_dict = catalog_with_af(PdfObject::Name(intern("invalid")));

        let result = walk_af_array(&resolver, &catalog_dict);
        assert!(result.is_err());

        let diagnostics = result.unwrap_err();
        assert!(diagnostics.iter().any(|d| d.message.contains("not an array")));
    }

    #[test]
    fn test_walk_af_array_non_ref_entry() {
        let resolver = XrefResolver::new();

        let filespec_ref = ObjRef::new(10, 0);
        make_filespec(&resolver, filespec_ref, Some("Source"));

        // One valid reference followed by a non-Ref element
        let arr = vec![
            PdfObject::Ref(filespec_ref),
            PdfObject::Name(intern("invalid")),
        ];
        let catalog_dict = catalog_with_af(PdfObject::Array(Box::new(arr)));

        // A single bad element turns the whole call into an error.
        let result = walk_af_array(&resolver, &catalog_dict);
        assert!(result.is_err());

        let diagnostics = result.unwrap_err();
        assert!(diagnostics.iter().any(|d| d.message.contains("not a reference")));
    }

    #[test]
    fn test_associated_file_entry_new() {
        let entry = AssociatedFileEntry::new(Some("Data".to_string()), ObjRef::new(42, 0));

        assert_eq!(entry.relationship, Some("Data".to_string()));
        assert_eq!(entry.filespec_ref, ObjRef::new(42, 0));
    }

    #[test]
    fn test_extract_af_relationship_present() {
        let resolver = XrefResolver::new();
        let filespec_ref = ObjRef::new(10, 0);
        make_filespec(&resolver, filespec_ref, Some("Supplement"));

        let result = extract_af_relationship(&resolver, filespec_ref);
        assert!(result.is_ok());
        assert_eq!(result.unwrap(), Some("Supplement".to_string()));
    }

    #[test]
    fn test_extract_af_relationship_absent() {
        let resolver = XrefResolver::new();
        let filespec_ref = ObjRef::new(10, 0);
        make_filespec(&resolver, filespec_ref, None);

        let result = extract_af_relationship(&resolver, filespec_ref);
        assert!(result.is_ok());
        assert_eq!(result.unwrap(), None);
    }

    #[test]
    fn test_extract_af_relationship_resolve_error() {
        let resolver = XrefResolver::new();
        let filespec_ref = ObjRef::new(999, 0); // Not cached

        let result = extract_af_relationship(&resolver, filespec_ref);
        assert!(result.is_err());
    }

    #[test]
    fn test_walk_af_array_preserves_order() {
        let resolver = XrefResolver::new();

        // Object numbers are deliberately out of numeric order.
        let fs1 = ObjRef::new(30, 0);
        make_filespec(&resolver, fs1, Some("Unspecified"));

        let fs2 = ObjRef::new(10, 0);
        make_filespec(&resolver, fs2, Some("EncryptedPayload"));

        let fs3 = ObjRef::new(20, 0);
        make_filespec(&resolver, fs3, Some("Source"));

        let catalog_dict = catalog_with_af(make_af_array(&[fs1, fs2, fs3]));

        let result = walk_af_array(&resolver, &catalog_dict);
        assert!(result.is_ok());

        let entries = result.unwrap();
        assert_eq!(entries.len(), 3);

        // Output follows array order, not object-number order.
        assert_eq!(entries[0].filespec_ref, fs1);
        assert_eq!(entries[1].filespec_ref, fs2);
        assert_eq!(entries[2].filespec_ref, fs3);

        assert_eq!(entries[0].relationship, Some("Unspecified".to_string()));
        assert_eq!(entries[1].relationship, Some("EncryptedPayload".to_string()));
        assert_eq!(entries[2].relationship, Some("Source".to_string()));
    }

    #[test]
    fn test_walk_af_array_all_relationship_types() {
        let resolver = XrefResolver::new();

        // The /AFRelationship values listed in this module's documentation
        let relationships = [
            "Source",
            "Data",
            "Alternative",
            "Supplement",
            "EncryptedPayload",
            "Unspecified",
        ];

        let mut refs = Vec::new();
        for (idx, rel) in relationships.iter().enumerate() {
            let fs_ref = ObjRef::new(10 + idx as u32, 0);
            make_filespec(&resolver, fs_ref, Some(rel));
            refs.push(fs_ref);
        }

        let catalog_dict = catalog_with_af(make_af_array(&refs));

        let result = walk_af_array(&resolver, &catalog_dict);
        assert!(result.is_ok());

        let entries = result.unwrap();
        assert_eq!(entries.len(), relationships.len());

        for (idx, entry) in entries.iter().enumerate() {
            assert_eq!(entry.relationship.as_deref(), Some(relationships[idx]));
        }
    }
}
- [`associated_files`]: PDF 2.0 /AF (Associated Files) array walker + +pub mod associated_files; + +// Re-export key types for convenience +pub use associated_files::{AssociatedFileEntry, walk_af_array}; diff --git a/crates/pdftract-core/src/lib.rs b/crates/pdftract-core/src/lib.rs index 02594e9..170f352 100644 --- a/crates/pdftract-core/src/lib.rs +++ b/crates/pdftract-core/src/lib.rs @@ -4,6 +4,7 @@ //! processing PDF documents, including the lexer, object parser, and //! text extraction engines. +pub mod attachment; pub mod cache; pub mod classify; pub mod diagnostics; diff --git a/notes/pdftract-zl9y3.md b/notes/pdftract-zl9y3.md new file mode 100644 index 0000000..2d54be6 --- /dev/null +++ b/notes/pdftract-zl9y3.md @@ -0,0 +1,65 @@ +# Verification Note: pdftract-zl9y3 + +## Bead +**ID:** pdftract-zl9y3 +**Title:** 7.5.1b: /AF associated files array walker (PDF 2.0 fallback to /EmbeddedFiles) + +## Implementation Summary + +### Files Created +- `crates/pdftract-core/src/attachment/mod.rs` - Attachment module root +- `crates/pdftract-core/src/attachment/associated_files.rs` - /AF array walker implementation (474 lines) + +### Files Modified +- `crates/pdftract-core/src/lib.rs` - Added `pub mod attachment;` declaration + +### Key Implementation Details + +1. **`walk_af_array()` function**: Extracts `/AF` array from document catalog + - Returns `Vec<AssociatedFileEntry>` with optional `/AFRelationship` and `filespec_ref` + - Returns empty Vec for PDF 1.7 documents (no `/AF` key) + - Emits `StructInvalidType` diagnostic if `/AF` is not an array + - Skips non-Ref entries with diagnostic + +2. **`AssociatedFileEntry` struct**: Represents a single /AF entry + - `relationship: Option<String>` - /AFRelationship value (Source, Data, Alternative, Supplement, EncryptedPayload, Unspecified) + - `filespec_ref: ObjRef` - Reference to the Filespec dictionary + +3.
**`extract_af_relationship()` helper**: Resolves Filespec and extracts `/AFRelationship` + - Returns `Ok(Some(String))` if relationship present + - Returns `Ok(None)` if absent (valid per spec) + - Returns `Err` with diagnostics if resolution fails or the Filespec is not a dictionary + +### Acceptance Criteria Status + +- [PASS] PDF 2.0 with /AF [filespec1, filespec2] → returns 2 entries (test: `test_walk_af_array_multiple_entries`) +- [PASS] PDF 1.7 with no /AF → empty Vec (test: `test_walk_af_array_empty`) +- [PASS] /AFRelationship preserved on output (test: `test_extract_af_relationship_present`, `test_walk_af_array_all_relationship_types`) +- [PASS] Non-array /AF → diagnostic emitted, returns Err (test: `test_walk_af_array_not_an_array`) +- [PASS] Non-Ref entry in /AF → diagnostic emitted, entry skipped, and the call returns Err with the collected diagnostics (test: `test_walk_af_array_non_ref_entry`) + +### Test Results +All 12 unit tests pass: +- `test_associated_file_entry_new` - Entry construction +- `test_extract_af_relationship_present` - Relationship extraction +- `test_extract_af_relationship_absent` - No relationship (None) +- `test_extract_af_relationship_resolve_error` - Resolution failure +- `test_walk_af_array_empty` - PDF 1.7 (no /AF) +- `test_walk_af_array_single_entry` - Single entry with relationship +- `test_walk_af_array_multiple_entries` - Multiple entries +- `test_walk_af_array_no_relationship` - Entry without relationship +- `test_walk_af_array_not_an_array` - Invalid /AF type +- `test_walk_af_array_non_ref_entry` - Invalid entry type +- `test_walk_af_array_preserves_order` - Order preservation +- `test_walk_af_array_all_relationship_types` - All 6 PDF 2.0 relationship types + +### Gates Passed +- [PASS] `cargo check --all-targets` +- [PASS] `cargo clippy -p pdftract-core --lib` +- [PASS] `cargo fmt -p pdftract-core --check` +- [PASS] `cargo test -p pdftract-core --lib attachment` (12/12 passed) + +### Notes +- The `/EmbeddedFiles` name tree walker (sibling bead) is not yet implemented +- Merge with `/EmbeddedFiles` results will
happen at the caller level when the sibling is complete +- All standard PDF 2.0 /AFRelationship values are supported: Source, Data, Alternative, Supplement, EncryptedPayload, Unspecified