diff --git a/crates/pdftract-core/src/conformance.rs b/crates/pdftract-core/src/conformance.rs index a5b16de..343f813 100644 --- a/crates/pdftract-core/src/conformance.rs +++ b/crates/pdftract-core/src/conformance.rs @@ -13,6 +13,7 @@ //! The conformance information is stored in the document's /Metadata //! stream as XMP XML with the pdfaid namespace. +use crate::diagnostics::{DiagCode, Diagnostic}; use crate::parser::stream::PdfSource; use crate::parser::xref::XrefResolver; use crate::parser::object::PdfObject; @@ -62,10 +63,42 @@ use anyhow::Result; /// assert_eq!(result, Some("PDF/A-1b".to_string())); /// ``` pub fn detect_conformance(metadata_stream: Option<&[u8]>) -> Option { + detect_conformance_impl(metadata_stream, &mut Vec::new()).0 +} + +/// Detect PDF/A conformance from an XMP metadata stream with diagnostics. +/// +/// Same as `detect_conformance` but emits diagnostics when XMP parsing fails. +/// +/// # Arguments +/// +/// * `metadata_stream` - Optional byte slice containing the XMP metadata stream +/// * `diagnostics` - Mutable diagnostics vector (required, not optional) that receives an entry when the XMP XML is malformed +/// +/// # Returns +/// +/// * `Some(String)` - PDF/A conformance string if detected (e.g., "PDF/A-1b") +/// * `None` - No PDF/A conformance detected or malformed XML +pub fn detect_conformance_with_diagnostics( + metadata_stream: Option<&[u8]>, + diagnostics: &mut Vec, +) -> Option { + detect_conformance_impl(metadata_stream, diagnostics).0 +} + +/// Internal implementation of conformance detection. Returns the detected conformance string paired with a flag that is `true` only when the XML reader reported an error.
+fn detect_conformance_impl( + metadata_stream: Option<&[u8]>, + diagnostics: &mut Vec, +) -> (Option, bool) { use quick_xml::events::Event; use quick_xml::reader::Reader; - let xml = metadata_stream?; + let xml = match metadata_stream { + Some(x) => x, + None => return (None, false), + }; + let mut reader = Reader::from_reader(xml); let mut part: Option = None; let mut conf: Option = None; @@ -97,17 +130,26 @@ pub fn detect_conformance(metadata_stream: Option<&[u8]>) -> Option { current_tag = None; } Ok(Event::Eof) => break, - Err(_) => return None, // Malformed XML - graceful failure + Err(_) => { + // Malformed XML: emit diagnostic and return (None, true) to flag the parse failure + diagnostics.push(Diagnostic::with_static_no_offset( + DiagCode::StructInvalidXmp, + "Malformed XMP metadata in /Metadata stream; unable to parse PDF/A conformance", + )); + return (None, true); + } _ => {} } buf.clear(); } - match (part, conf) { + let result = match (part, conf) { (Some(p), Some(c)) => Some(format!("PDF/A-{}{}", p, c)), (Some(p), None) => Some(format!("PDF/A-{}", p)), _ => None, - } + }; + + (result, false) } /// Detect PDF/A conformance from a catalog's metadata reference. 
diff --git a/crates/pdftract-core/src/detection.rs b/crates/pdftract-core/src/detection.rs index 73ab1a0..1fae26a 100644 --- a/crates/pdftract-core/src/detection.rs +++ b/crates/pdftract-core/src/detection.rs @@ -402,6 +402,78 @@ mod tests { assert!(!detect_javascript(&catalog, &pages, &acroform, &resolver)); } + #[test] + fn test_detect_javascript_with_annotation_js() { + let resolver = XrefResolver::new(); + let catalog = Catalog::new(ObjRef::new(1, 0)); + + // Create a page with an annotation that has JavaScript in /A + let mut page = PageDict::default(); + page.obj_ref = ObjRef::new(2, 0); + + // Create an annotation whose /A entry is a JavaScript action, cached in the resolver under annot_ref + let annot_ref = ObjRef::new(10, 0); + let mut annot_dict = PdfDict::new(); + let mut js_dict = PdfDict::new(); + js_dict.insert(Arc::from("S"), PdfObject::Name(Arc::from("JavaScript"))); + js_dict.insert(Arc::from("JS"), PdfObject::String(Box::new(b"app.alert('annot')".to_vec()))); + annot_dict.insert(Arc::from("A"), PdfObject::Dict(Box::new(js_dict))); + resolver.cache_object(annot_ref, PdfObject::Dict(Box::new(annot_dict))); + + page.annots.push(annot_ref); + let pages = vec![page]; + + let acroform = None; + + assert!(detect_javascript(&catalog, &pages, &acroform, &resolver)); + } + + #[test] + fn test_detect_javascript_with_page_aa_js() { + let resolver = XrefResolver::new(); + let catalog = Catalog::new(ObjRef::new(1, 0)); + + // Create a page whose /AA dictionary holds a JavaScript action under the /O key + let mut page = PageDict::default(); + page.obj_ref = ObjRef::new(2, 0); + + let mut aa_dict = PdfDict::new(); + let mut js_dict = PdfDict::new(); + js_dict.insert(Arc::from("S"), PdfObject::Name(Arc::from("JavaScript"))); + js_dict.insert(Arc::from("JS"), PdfObject::String(Box::new(b"app.alert('page')".to_vec()))); + aa_dict.insert(Arc::from("O"), PdfObject::Dict(Box::new(js_dict))); + page.aa = Some(PdfObject::Dict(Box::new(aa_dict))); + + let pages = vec![page]; + let acroform = None; + + assert!(detect_javascript(&catalog, &pages, &acroform, 
&resolver)); + } + + #[test] + fn test_detect_javascript_with_acroform_field_js() { + let resolver = XrefResolver::new(); + let catalog = Catalog::new(ObjRef::new(1, 0)); + + let page = PageDict::default(); + let pages = vec![page]; + + // Create AcroForm whose /Fields array holds one inline field dict with a JavaScript action in /AA under the /C key + let mut acroform = PdfDict::new(); + let mut field_dict = PdfDict::new(); + let mut aa_dict = PdfDict::new(); + let mut js_dict = PdfDict::new(); + js_dict.insert(Arc::from("S"), PdfObject::Name(Arc::from("JavaScript"))); + js_dict.insert(Arc::from("JS"), PdfObject::String(Box::new(b"app.alert('field')".to_vec()))); + aa_dict.insert(Arc::from("C"), PdfObject::Dict(Box::new(js_dict))); + field_dict.insert(Arc::from("AA"), PdfObject::Dict(Box::new(aa_dict))); + + let fields = vec![PdfObject::Dict(Box::new(field_dict))]; + acroform.insert(Arc::from("Fields"), PdfObject::Array(Box::new(fields))); + + assert!(detect_javascript(&catalog, &pages, &Some(acroform), &resolver)); + } + #[test] fn test_has_js_action_with_s_javascript() { let resolver = XrefResolver::new();