diff --git a/.needle-predispatch-sha b/.needle-predispatch-sha index d0235e7..99a91f5 100644 --- a/.needle-predispatch-sha +++ b/.needle-predispatch-sha @@ -1 +1 @@ -1716dc348b086a0d5b6ec6da042635cbab610f20 +c6be8e6b574e5a1ef0fb65fb3aacebfe36740030 diff --git a/crates/pdftract-cli/src/password.rs b/crates/pdftract-cli/src/password.rs index c4f0c57..48e0162 100644 --- a/crates/pdftract-cli/src/password.rs +++ b/crates/pdftract-cli/src/password.rs @@ -105,7 +105,7 @@ fn read_password_from_stdin() -> Result> { return Ok(None); } - Ok(Some(secrecy::SecretString::new(password.to_string()))) + Ok(Some(secrecy::SecretString::new(password.to_string().into_boxed_str()))) } #[cfg(test)] diff --git a/crates/pdftract-cli/tests/conformance.rs b/crates/pdftract-cli/tests/conformance.rs index f1b5b84..18d7fd9 100644 --- a/crates/pdftract-cli/tests/conformance.rs +++ b/crates/pdftract-cli/tests/conformance.rs @@ -17,6 +17,27 @@ const SUITE_PATH: &str = "tests/sdk-conformance/cases.json"; const SDK_NAME: &str = "pdftract-rust"; const SDK_VERSION: &str = env!("CARGO_PKG_VERSION"); +/// Simple semver comparison - returns Less if v1 < v2 +fn compare_versions(v1: &str, v2: &str) -> std::cmp::Ordering { + let v1_parts: Vec = v1 + .split('.') + .filter_map(|s| s.parse().ok()) + .collect(); + let v2_parts: Vec = v2 + .split('.') + .filter_map(|s| s.parse().ok()) + .collect(); + + for (a, b) in v1_parts.iter().zip(v2_parts.iter()) { + match a.cmp(b) { + std::cmp::Ordering::Equal => continue, + ord => return ord, + } + } + + v1_parts.len().cmp(&v2_parts.len()) +} + #[derive(Debug, Clone)] enum TestStatus { Pass, @@ -128,6 +149,9 @@ fn run_conformance(suite_path: &str, output_path: &str) -> Result<()> { let summary = calculate_summary(&results, duration_ms); print_summary(&summary); + // Check exit conditions before moving summary into report + let should_fail = summary.failed > 0 || summary.errors > 0; + let report = ConformanceReport { sdk: SDK_NAME.to_string(), sdk_version: SDK_VERSION.to_string(), @@ -149,7 +173,7 @@ fn run_conformance(suite_path: &str, output_path: &str) -> Result<()> { println!(); println!("Report written to: {}", output_path); - if summary.failed > 0 || summary.errors > 0 { + if should_fail { std::process::exit(1); } @@ -170,9 +194,7 @@ fn run_test_case(case: &Value, schema_version: &str) -> Result { let min_schema = case.get("min_schema_version").and_then(|v| v.as_str()); if let Some(min_ver) = min_schema { - if version_compare::compare(schema_version, min_ver) - .map_or(true, |ord| ord == std::cmp::Ordering::Less) - { + if compare_versions(schema_version, min_ver) == std::cmp::Ordering::Less { return Ok(TestResult { id, status: TestStatus::Skip, @@ -324,7 +346,7 @@ fn compare_recursive( } } (Value::String(act), Value::Object(exp)) => { - if let Some(min_len) = exp.get("min_length").and_then(|v| v.as_usize()) { + if let Some(min_len) = exp.get("min_length").and_then(|v| v.as_u64().map(|v| v as usize)) { if act.len() < min_len { return Err(format!( "[{}]: string length {} is less than minimum {}", @@ -345,7 +367,7 @@ fn compare_recursive( } } (Value::Array(act), Value::Object(exp)) => { - if let Some(min_len) = exp.get("min").and_then(|v| v.as_usize()) { + if let Some(min_len) = exp.get("min").and_then(|v| v.as_u64().map(|v| v as usize)) { if act.len() < min_len { return Err(format!( "[{}]: array length {} is less than minimum {}", @@ -355,7 +377,7 @@ fn compare_recursive( )); } } - if let Some(max_len) = exp.get("max").and_then(|v| v.as_usize()) { + if let Some(max_len) = exp.get("max").and_then(|v| v.as_u64().map(|v| v as usize)) { if act.len() > max_len { return Err(format!( "[{}]: array length {} is greater than maximum {}", @@ -367,7 +389,7 @@ fn compare_recursive( } } (Value::Object(act), Value::Object(exp)) => { - for (key, exp_val) in exp.as_object().unwrap() { + for (key, exp_val) in exp { let new_path = if path.is_empty() { key.clone() } else { diff --git a/crates/pdftract-core/src/diagnostics.rs b/crates/pdftract-core/src/diagnostics.rs index c03e4ff..8e630d3 100644 --- a/crates/pdftract-core/src/diagnostics.rs +++ b/crates/pdftract-core/src/diagnostics.rs @@ -305,6 +305,15 @@ pub enum DiagCode { /// Phase origin: 1.7 StructInvalidGeometry, + /// Hybrid xref conflict: traditional table and stream disagree on object state + /// + /// Emitted when merging a hybrid file's xref sections and the traditional + /// table marks an object as Free while the stream marks it as InUse. + /// Per PDF spec, the traditional entry wins (object is Free). + /// + /// Phase origin: 1.3 + StructHybridConflict, + // === XREF_* codes === /// Invalid xref keyword or header @@ -387,7 +396,7 @@ pub enum DiagCode { /// Decompression bomb limit exceeded /// /// Emitted when a stream's decompressed size would exceed `max_decompress_bytes` - /// (default: 2 GB). The stream is truncated at the limit. Increase the limit via + /// (default: 512 MiB). The stream is truncated at the limit. Increase the limit via /// `--max-decompress-gb` if the PDF is trusted. /// /// Phase origin: 1.5 @@ -662,7 +671,12 @@ impl DiagCode { | DiagCode::StructInvalidIndirectHeader | DiagCode::StructIntegerOverflow | DiagCode::StructInvalidObjstm - | DiagCode::StructInvalidGeometry => "STRUCT", + | DiagCode::StructInvalidGeometry + | DiagCode::StructInvalidUtf16 + | DiagCode::StructUnresolvedDestination + | DiagCode::StructNonGotoOutline + | DiagCode::StructInvalidPdfDocEncoding + | DiagCode::StructHybridConflict => "STRUCT", // XREF_* DiagCode::XrefInvalidHeader @@ -746,6 +760,11 @@ impl DiagCode { DiagCode::StructIntegerOverflow => "STRUCT_INTEGER_OVERFLOW", DiagCode::StructInvalidObjstm => "STRUCT_INVALID_OBJSTM", DiagCode::StructInvalidGeometry => "STRUCT_INVALID_GEOMETRY", + DiagCode::StructInvalidUtf16 => "STRUCT_INVALID_UTF16", + DiagCode::StructUnresolvedDestination => "STRUCT_UNRESOLVED_DESTINATION", + DiagCode::StructNonGotoOutline => "STRUCT_NON_GOTO_OUTLINE", + DiagCode::StructInvalidPdfDocEncoding => "STRUCT_INVALID_PDFDOC_ENCODING", + DiagCode::StructHybridConflict => "STRUCT_HYBRID_CONFLICT", DiagCode::XrefInvalidHeader => "XREF_INVALID_HEADER", DiagCode::XrefInvalidEntry => "XREF_INVALID_ENTRY", DiagCode::XrefInvalidSubsectionHeader => "XREF_INVALID_SUBSECTION_HEADER", @@ -812,6 +831,11 @@ impl DiagCode { | DiagCode::StructIntegerOverflow | DiagCode::StructInvalidObjstm | DiagCode::StructInvalidGeometry + | DiagCode::StructInvalidUtf16 + | DiagCode::StructUnresolvedDestination + | DiagCode::StructNonGotoOutline + | DiagCode::StructInvalidPdfDocEncoding + | DiagCode::StructHybridConflict | DiagCode::XrefInvalidHeader | DiagCode::XrefInvalidEntry | DiagCode::XrefInvalidSubsectionHeader @@ -1040,6 +1064,14 @@ pub const DIAGNOSTIC_CATALOG: &[DiagInfo] = &[ phase: "1.7", suggested_action: "NaN or Inf in MediaBox/CropBox/Rotate; canonicalized to 0 for fingerprint computation", }, + DiagInfo { + code: DiagCode::StructHybridConflict, + category: "STRUCT", + severity: Severity::Warning, + recoverable: true, + phase: "1.3", + suggested_action: "Traditional table entry takes precedence; object marked as Free per traditional table", + }, // === XREF_* codes === DiagInfo { code: DiagCode::XrefInvalidHeader, @@ -1550,30 +1582,19 @@ macro_rules! emit { // emit!(diagnostics, CODE, offset = , message = ) ($diagnostics:expr, $code:ident, offset = $offset:expr, message = $msg:expr) => {{ - let msg = $msg; - $diagnostics.push(if let Some(static_msg) = { - // Try to coerce &'static str - let maybe_static: Option<&'static str> = (|| Some(&*msg))(); - maybe_static - } { - $crate::diagnostics::Diagnostic::with_static($crate::diagnostics::DiagCode::$code, $offset, static_msg) - } else { - $crate::diagnostics::Diagnostic::with_dynamic($crate::diagnostics::DiagCode::$code, $offset, msg.into()) - }); + $diagnostics.push($crate::diagnostics::Diagnostic::with_dynamic( + $crate::diagnostics::DiagCode::$code, + $offset, + $msg.into(), + )); }}; // emit!(diagnostics, CODE, message = ) ($diagnostics:expr, $code:ident, message = $msg:expr) => {{ - let msg = $msg; - $diagnostics.push(if let Some(static_msg) = { - // Try to coerce &'static str - let maybe_static: Option<&'static str> = (|| Some(&*msg))(); - maybe_static - } { - $crate::diagnostics::Diagnostic::with_static_no_offset($crate::diagnostics::DiagCode::$code, static_msg) - } else { - $crate::diagnostics::Diagnostic::with_dynamic_no_offset($crate::diagnostics::DiagCode::$code, msg.into()) - }); + $diagnostics.push($crate::diagnostics::Diagnostic::with_dynamic_no_offset( + $crate::diagnostics::DiagCode::$code, + $msg.into(), + )); }}; } diff --git a/crates/pdftract-core/src/parser/catalog.rs b/crates/pdftract-core/src/parser/catalog.rs index adace5f..49a7dda 100644 --- a/crates/pdftract-core/src/parser/catalog.rs +++ b/crates/pdftract-core/src/parser/catalog.rs @@ -6,7 +6,7 @@ use crate::parser::object::{ObjRef, PdfObject, intern}; use crate::parser::xref::XrefResolver; -use crate::parser::{Diagnostic, Severity}; +use crate::diagnostics::{Diagnostic, DiagCode}; use crate::parser::ocg::{parse_oc_properties, OcProperties}; /// Result type for catalog parsing. @@ -355,13 +355,8 @@ impl Catalog { } /// Add a diagnostic to the catalog. - fn emit_diagnostic(&mut self, severity: Severity, message: String) { - self.diagnostics.push(Diagnostic { - code: crate::parser::diagnostic::DiagCode::StructUnexpectedEof, - severity, - phase: "1.4".to_string(), - message, - }); + fn emit_diagnostic(&mut self, code: DiagCode, message: String) { + self.diagnostics.push(Diagnostic::with_dynamic_no_offset(code, message)); } } @@ -408,12 +403,10 @@ pub fn parse_catalog(resolver: &XrefResolver, root_ref: ObjRef) -> Result obj, Err(e) => { - diagnostics.push(Diagnostic { - code: crate::parser::diagnostic::DiagCode::StructUnexpectedEof, - severity: Severity::Error, - phase: "1.4".to_string(), - message: format!("Failed to resolve /Root: {}", e), - }); + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructUnexpectedEof, + format!("Failed to resolve /Root: {}", e), + )); return Err(diagnostics); } }; @@ -422,12 +415,10 @@ pub fn parse_catalog(resolver: &XrefResolver, root_ref: ObjRef) -> Result d, None => { - diagnostics.push(Diagnostic { - code: crate::parser::diagnostic::DiagCode::StructUnexpectedEof, - severity: Severity::Error, - phase: "1.4".to_string(), - message: format!("/Root is not a dictionary (type: {})", root_obj.type_name()), - }); + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructUnexpectedEof, + format!("/Root is not a dictionary (type: {})", root_obj.type_name()), + )); return Err(diagnostics); } }; @@ -437,23 +428,19 @@ pub fn parse_catalog(resolver: &XrefResolver, root_ref: ObjRef) -> Result *ref_, Some(other) => { // Emit STRUCT_MISSING_KEY diagnostic and return empty catalog - diagnostics.push(Diagnostic { - code: crate::parser::diagnostic::DiagCode::MissingKey, - severity: Severity::Error, - phase: "1.4".to_string(), - message: format!("STRUCT_MISSING_KEY: /Pages is not a reference (type: {})", other.type_name()), - }); + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructMissingKey, + format!("STRUCT_MISSING_KEY: /Pages is not a reference (type: {})", other.type_name()), + )); catalog.diagnostics = diagnostics; return Ok(catalog); } None => { // Emit STRUCT_MISSING_KEY diagnostic and return empty catalog - diagnostics.push(Diagnostic { - code: crate::parser::diagnostic::DiagCode::MissingKey, - severity: Severity::Error, - phase: "1.4".to_string(), - message: "STRUCT_MISSING_KEY: /Pages key missing from catalog".to_string(), - }); + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructMissingKey, + "STRUCT_MISSING_KEY: /Pages key missing from catalog".to_string(), + )); catalog.diagnostics = diagnostics; return Ok(catalog); } diff --git a/crates/pdftract-core/src/parser/mod.rs b/crates/pdftract-core/src/parser/mod.rs index ced0675..453c812 100644 --- a/crates/pdftract-core/src/parser/mod.rs +++ b/crates/pdftract-core/src/parser/mod.rs @@ -19,9 +19,15 @@ pub mod ocg; pub use crate::diagnostics::{Diagnostic, Severity, DiagCode, ObjRef}; pub use object::{PdfObject}; pub use objstm::{ObjectStmParser, ObjStmCacheEntry, ObjStmResult, ObjStmError}; -pub use xref::{XrefResolver, XrefEntry, ResolveError, ResolveResult, XrefSection, XrefDiagnostic, XrefDiagCode, parse_traditional_xref, parse_xref_stream, merge_hybrid, is_hybrid_trailer}; +pub use xref::{ + XrefResolver, XrefEntry, ResolveError, ResolveResult, XrefSection, XrefDiagnostic, XrefDiagCode, + parse_traditional_xref, parse_xref_stream, merge_hybrid, is_hybrid_trailer, + LinearizationInfo, detect_linearization, load_xref_linearized, merge_linearized_xrefs, +}; pub use catalog::{Catalog, MarkInfo, PageLabel, PageLabelsTree, PageLabelStyle, parse_catalog}; pub use ocg::{OcProperties, OcGroup, Ocmd, OcmdPolicy, BaseState, parse_oc_properties}; +pub use resources::{ResourceDict, merge_resources, extract_resources}; +pub use pages::{PageDict, flatten_page_tree, DEFAULT_MEDIABOX}; pub use stream::{ StreamDecoder, FlateDecoder, ASCII85Decoder, ASCIIHexDecoder, CryptDecoder, PassthroughDecoder, normalize_filter_name, get_decoder, FilterError, DEFAULT_MAX_DECOMPRESS_BYTES, diff --git a/crates/pdftract-core/src/parser/object/parser.rs b/crates/pdftract-core/src/parser/object/parser.rs index 3d282c2..13d3c6a 100644 --- a/crates/pdftract-core/src/parser/object/parser.rs +++ b/crates/pdftract-core/src/parser/object/parser.rs @@ -282,7 +282,7 @@ impl<'a> ObjectParser<'a> { let offset = self.lexer.position(); // Try to get /Length from the dict - let len_hint = dict.get("/Length").and_then(|obj| obj.as_int()).map(|i| i as u64); + let len_hint = dict.get("Length").and_then(|obj| obj.as_int()).map(|i| i as u64); // Skip the stream body self.skip_stream_body(len_hint); diff --git a/crates/pdftract-core/src/parser/object/types.rs b/crates/pdftract-core/src/parser/object/types.rs index 4080887..28bdae9 100644 --- a/crates/pdftract-core/src/parser/object/types.rs +++ b/crates/pdftract-core/src/parser/object/types.rs @@ -132,14 +132,33 @@ impl PdfStream { /// Get the /Filter entry from the stream dictionary. /// /// Returns None if no filter is present (raw stream). + /// Filter names are returned without the leading slash (e.g., "FlateDecode", not "/FlateDecode"). pub fn filter(&self) -> Option> { - let filter = self.dict.get("/Filter")?; + let filter = self.dict.get("Filter")?; Some(match filter { - PdfObject::Name(name) => vec![name.to_string()], + PdfObject::Name(name) => { + // Strip leading slash from filter name for normalization + let name_str: &str = name.as_ref(); + let stripped = if name_str.starts_with('/') { + &name_str[1..] + } else { + name_str + }; + vec![stripped.to_string()] + } PdfObject::Array(arr) => arr .iter() - .filter_map(|obj| obj.as_name().map(|n| n.to_string())) + .filter_map(|obj| obj.as_name().map(|n| { + // Strip leading slash from filter name for normalization + let name_str: &str = n.as_ref(); + let stripped = if name_str.starts_with('/') { + &name_str[1..] + } else { + name_str + }; + stripped.to_string() + })) .collect(), _ => return None, }) @@ -149,7 +168,7 @@ impl PdfStream { /// /// Returns None if no parameters are present. pub fn decode_params(&self) -> Option> { - let params = self.dict.get("/DecodeParms")?; + let params = self.dict.get("DecodeParms")?; Some(match params { PdfObject::Dict(_) => vec![params.clone()], @@ -162,7 +181,7 @@ impl PdfStream { /// /// Returns the direct integer value, or None if /Length is indirect/missing. pub fn length(&self) -> Option { - self.dict.get("/Length")?.as_int().map(|i| i as u64) + self.dict.get("Length")?.as_int().map(|i| i as u64) } } diff --git a/crates/pdftract-core/src/parser/objstm.rs b/crates/pdftract-core/src/parser/objstm.rs index 65cc8c8..1b04a7b 100644 --- a/crates/pdftract-core/src/parser/objstm.rs +++ b/crates/pdftract-core/src/parser/objstm.rs @@ -524,7 +524,7 @@ impl ObjectStmParser { impl Default for ObjectStmParser { fn default() -> Self { - Self::new(2 * 1024_u64.pow(3)) // 2 GB default + Self::new(512 * 1024_u64.pow(2)) // 512 MiB default } } @@ -558,7 +558,7 @@ mod tests { #[test] fn test_obj_stm_parser_default() { let parser = ObjectStmParser::default(); - assert_eq!(parser.max_decompress_bytes, 2 * 1024_u64.pow(3)); + assert_eq!(parser.max_decompress_bytes, 512 * 1024_u64.pow(2)); } #[test] diff --git a/crates/pdftract-core/src/parser/ocg.rs b/crates/pdftract-core/src/parser/ocg.rs index 3fe8b01..369e722 100644 --- a/crates/pdftract-core/src/parser/ocg.rs +++ b/crates/pdftract-core/src/parser/ocg.rs @@ -8,7 +8,7 @@ use std::collections::HashMap; -use crate::parser::{Diagnostic, DiagCode, Severity}; +use crate::parser::{Diagnostic, DiagCode}; use crate::parser::object::{intern, ObjRef, PdfDict, PdfObject}; use crate::parser::xref::XrefResolver; @@ -302,12 +302,10 @@ pub fn parse_oc_properties( let oc_props_obj = match resolver.resolve(oc_props_ref) { Ok(obj) => obj, Err(e) => { - diagnostics.push(Diagnostic { - code: DiagCode::MissingKey, - severity: Severity::Warning, - phase: "1.4".to_string(), - message: format!("Failed to resolve /OCProperties: {}", e), - }); + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructUnexpectedEof, + format!("Failed to resolve /OCProperties: {}", e), + )); oc_properties.diagnostics = diagnostics; return oc_properties; } @@ -316,12 +314,10 @@ pub fn parse_oc_properties( let oc_props_dict = match oc_props_obj.as_dict() { Some(d) => d, None => { - diagnostics.push(Diagnostic { - code: DiagCode::StructUnexpectedEof, - severity: Severity::Warning, - phase: "1.4".to_string(), - message: format!("/OCProperties is not a dictionary (type: {})", oc_props_obj.type_name()), - }); + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructUnexpectedEof, + format!("/OCProperties is not a dictionary (type: {})", oc_props_obj.type_name()), + )); oc_properties.diagnostics = diagnostics; return oc_properties; } @@ -334,22 +330,18 @@ pub fn parse_oc_properties( .filter_map(|o| o.as_ref()) .collect(), Some(other) => { - diagnostics.push(Diagnostic { - code: DiagCode::StructUnexpectedEof, - severity: Severity::Warning, - phase: "1.4".to_string(), - message: format!("/OCGs is not an array (type: {})", other.type_name()), - }); + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructUnexpectedEof, + format!("/OCGs is not an array (type: {})", other.type_name()), + )); oc_properties.diagnostics = diagnostics; return oc_properties; } None => { - diagnostics.push(Diagnostic { - code: DiagCode::MissingKey, - severity: Severity::Warning, - phase: "1.4".to_string(), - message: "/OCGs key missing from /OCProperties".to_string(), - }); + diagnostics.push(Diagnostic::with_static_no_offset( + DiagCode::StructMissingKey, + "/OCGs key missing from /OCProperties", + )); oc_properties.diagnostics = diagnostics; return oc_properties; } @@ -363,12 +355,10 @@ pub fn parse_oc_properties( oc_properties.groups.insert(ocg_ref, group); } Err(e) => { - diagnostics.push(Diagnostic { - code: DiagCode::StructUnexpectedEof, - severity: Severity::Warning, - phase: "1.4".to_string(), - message: format!("Failed to resolve OCG ref {}: {}", ocg_ref, e), - }); + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructUnexpectedEof, + format!("Failed to resolve OCG ref {}: {}", ocg_ref, e), + )); } } } @@ -377,22 +367,18 @@ pub fn parse_oc_properties( let default_config = match oc_props_dict.get("D") { Some(PdfObject::Dict(d)) => &**d, Some(other) => { - diagnostics.push(Diagnostic { - code: DiagCode::StructUnexpectedEof, - severity: Severity::Warning, - phase: "1.4".to_string(), - message: format!("/D is not a dictionary (type: {})", other.type_name()), - }); + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructUnexpectedEof, + format!("/D is not a dictionary (type: {})", other.type_name()), + )); oc_properties.diagnostics = diagnostics; return oc_properties; } None => { - diagnostics.push(Diagnostic { - code: DiagCode::MissingKey, - severity: Severity::Warning, - phase: "1.4".to_string(), - message: "/D key missing from /OCProperties".to_string(), - }); + diagnostics.push(Diagnostic::with_static_no_offset( + DiagCode::StructMissingKey, + "/D key missing from /OCProperties", + )); oc_properties.diagnostics = diagnostics; return oc_properties; } diff --git a/crates/pdftract-core/src/parser/outline.rs b/crates/pdftract-core/src/parser/outline.rs index c5eb35e..2951766 100644 --- a/crates/pdftract-core/src/parser/outline.rs +++ b/crates/pdftract-core/src/parser/outline.rs @@ -12,8 +12,7 @@ use crate::parser::object::{ObjRef, PdfObject}; use crate::parser::pages::PageDict; use crate::parser::xref::XrefResolver; -use crate::parser::{Diagnostic, Severity}; -use crate::parser::diagnostic::DiagCode; +use crate::diagnostics::{Diagnostic, DiagCode}; use std::collections::HashSet; /// Maximum depth of outline nesting to prevent stack overflow. @@ -175,9 +174,8 @@ fn decode_pdf_string(bytes: &[u8]) -> Result { fn decode_utf16be_bom(bytes: &[u8]) -> Result { if bytes.len() % 2 != 0 { return Err(vec![ - Diagnostic::error_with_code( + Diagnostic::with_static_no_offset( DiagCode::StructInvalidUtf16, - "1.4", "STRUCT_INVALID_UTF16: UTF-16BE string has odd length", ) ]); @@ -190,9 +188,8 @@ fn decode_utf16be_bom(bytes: &[u8]) -> Result { String::from_utf16(&utf16_chars).map_err(|_| { vec![ - Diagnostic::error_with_code( + Diagnostic::with_static_no_offset( DiagCode::StructInvalidUtf16, - "1.4", "STRUCT_INVALID_UTF16: Invalid UTF-16BE sequence", ) ] @@ -535,10 +532,9 @@ fn resolve_destination( Some(ref_) => ref_, None => { // Named destination - emit diagnostic and return None - diagnostics.push(Diagnostic::error_with_code( + diagnostics.push(Diagnostic::with_static_no_offset( DiagCode::StructUnresolvedDestination, - "1.4", - format!("STRUCT_UNRESOLVED_DESTINATION: Named destination not supported"), + "STRUCT_UNRESOLVED_DESTINATION: Named destination not supported", )); return (None, None); } @@ -563,10 +559,9 @@ fn resolve_destination( } } else if &**action_type == "URI" { // URI action - not a GoTo, so no page destination - diagnostics.push(Diagnostic::error_with_code( + diagnostics.push(Diagnostic::with_static_no_offset( DiagCode::StructNonGotoOutline, - "1.4", - format!("STRUCT_NON_GOTO_OUTLINE: URI action not supported for outline destination"), + "STRUCT_NON_GOTO_OUTLINE: URI action not supported for outline destination", )); return (None, None); } @@ -592,9 +587,8 @@ fn parse_outline_recursive( ) -> Option { // Cycle detection if !visited.insert(node_ref) { - diagnostics.push(Diagnostic::error_with_code( - DiagCode::CircularRef, - "1.4", + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructCircularRef, format!("STRUCT_CIRCULAR_REF: Cycle detected at outline node {}", node_ref), )); return None; @@ -602,9 +596,8 @@ fn parse_outline_recursive( // Depth limit check if depth >= MAX_OUTLINE_DEPTH { - diagnostics.push(Diagnostic::error_with_code( - DiagCode::DepthExceeded, - "1.4", + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructDepthExceeded, format!("STRUCT_DEPTH_EXCEEDED: Outline depth exceeds limit of {}", MAX_OUTLINE_DEPTH), )); return None; @@ -614,9 +607,8 @@ fn parse_outline_recursive( let node_obj = match resolver.resolve(node_ref) { Ok(obj) => obj, Err(e) => { - diagnostics.push(Diagnostic::error_with_code( + diagnostics.push(Diagnostic::with_dynamic_no_offset( DiagCode::StructUnexpectedEof, - "1.4", format!("Failed to resolve outline node {}: {}", node_ref, e), )); return None; @@ -626,9 +618,8 @@ fn parse_outline_recursive( let node_dict = match node_obj.as_dict() { Some(d) => d, None => { - diagnostics.push(Diagnostic::error_with_code( + diagnostics.push(Diagnostic::with_dynamic_no_offset( DiagCode::StructUnexpectedEof, - "1.4", format!("Outline node {} is not a dictionary", node_ref), )); return None; @@ -645,9 +636,8 @@ fn parse_outline_recursive( } }, None => { - diagnostics.push(Diagnostic::error_with_code( - DiagCode::MissingKey, - "1.4", + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructMissingKey, format!("STRUCT_MISSING_KEY: Outline node {} missing /Title", node_ref), )); String::from("") @@ -740,9 +730,8 @@ pub fn parse_outlines( let root_obj = match resolver.resolve(outlines_root_ref) { Ok(obj) => obj, Err(e) => { - diagnostics.push(Diagnostic::error_with_code( + diagnostics.push(Diagnostic::with_dynamic_no_offset( DiagCode::StructUnexpectedEof, - "1.4", format!("Failed to resolve /Outlines root: {}", e), )); return (outlines, diagnostics); @@ -752,10 +741,9 @@ pub fn parse_outlines( let root_dict = match root_obj.as_dict() { Some(d) => d, None => { - diagnostics.push(Diagnostic::error_with_code( + diagnostics.push(Diagnostic::with_static_no_offset( DiagCode::StructUnexpectedEof, - "1.4", - format!("/Outlines root is not a dictionary"), + "/Outlines root is not a dictionary", )); return (outlines, diagnostics); } diff --git a/crates/pdftract-core/src/parser/pages.rs b/crates/pdftract-core/src/parser/pages.rs index ae75b39..619f68c 100644 --- a/crates/pdftract-core/src/parser/pages.rs +++ b/crates/pdftract-core/src/parser/pages.rs @@ -12,9 +12,8 @@ use crate::parser::object::{ObjRef, PdfObject, PdfDict, intern}; use crate::parser::xref::XrefResolver; -use crate::parser::{Diagnostic, Severity}; -use crate::parser::diagnostic::DiagCode; -use crate::parser::resources::{ResourceDict, merge_resources, extract_resources}; +use crate::diagnostics::{Diagnostic, DiagCode}; +use crate::parser::resources::{ResourceDict, merge_resources}; use std::collections::HashSet; use std::sync::Arc; @@ -133,12 +132,10 @@ pub fn flatten_page_tree(resolver: &XrefResolver, pages_ref: ObjRef) -> Result obj, Err(e) => { - diagnostics.push(Diagnostic { - severity: Severity::Error, - phase: "1.4".to_string(), - code: DiagCode::MissingKey, - message: format!("Failed to resolve root /Pages node {}: {}", pages_ref, e), - }); + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructMissingKey, + format!("Failed to resolve root /Pages node {}: {}", pages_ref, e), + )); return Err(diagnostics); } }; @@ -162,15 +159,13 @@ pub fn flatten_page_tree(resolver: &XrefResolver, pages_ref: ObjRef) -> Result 0 && actual_count != declared_count { - diagnostics.push(Diagnostic { - severity: Severity::Warning, - phase: "1.4".to_string(), - code: DiagCode::InvalidPageCount, - message: format!( + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::PageInvalidCount, + format!( "STRUCT_INVALID_PAGE_COUNT: /Count declares {} pages, but tree contains {} pages", declared_count, actual_count ), - }); + )); } if !diagnostics.is_empty() && pages.is_empty() { @@ -206,12 +201,10 @@ fn walk_page_tree( ) -> Vec { // Depth limit check if depth > MAX_PAGES_DEPTH { - diagnostics.push(Diagnostic { - severity: Severity::Warning, - phase: "1.4".to_string(), - code: DiagCode::DepthExceeded, - message: format!("STRUCT_DEPTH_EXCEEDED: /Pages nesting exceeds {} levels", MAX_PAGES_DEPTH), - }); + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructDepthExceeded, + format!("STRUCT_DEPTH_EXCEEDED: /Pages nesting exceeds {} levels", MAX_PAGES_DEPTH), + )); return Vec::new(); } @@ -244,12 +237,10 @@ fn walk_page_tree( let kids = match dict.get("Kids") { Some(k) => k, None => { - diagnostics.push(Diagnostic { - severity: Severity::Warning, - phase: "1.4".to_string(), - code: DiagCode::MissingKey, - message: "STRUCT_MISSING_KEY: /Pages node missing /Kids".to_string(), - }); + diagnostics.push(Diagnostic::with_static_no_offset( + DiagCode::StructMissingKey, + "STRUCT_MISSING_KEY: /Pages node missing /Kids", + )); return Vec::new(); } }; @@ -262,6 +253,11 @@ fn walk_page_tree( } }; + // For /Pages nodes, all children should start with the same inherited state + // (the state after merging this /Pages node's own attributes). + // Save this state so we can restore it for each sibling. + let pages_parent_inherited = inherited.clone(); + let mut pages = Vec::new(); for kid in kids_array { // Handle both direct (embedded dict) and indirect references @@ -269,12 +265,10 @@ fn walk_page_tree( PdfObject::Ref(ref_) => { // Check for cycles if visited.contains(ref_) { - diagnostics.push(Diagnostic { - severity: Severity::Warning, - phase: "1.4".to_string(), - code: DiagCode::CircularRef, - message: format!("STRUCT_CIRCULAR_REF: /Pages node {} already visited", ref_), - }); + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructCircularRef, + format!("STRUCT_CIRCULAR_REF: /Pages node {} already visited", ref_), + )); continue; } visited.insert(*ref_); @@ -282,12 +276,10 @@ fn walk_page_tree( match resolver.resolve(*ref_) { Ok(obj) => obj, Err(e) => { - diagnostics.push(Diagnostic { - severity: Severity::Warning, - phase: "1.4".to_string(), - code: DiagCode::MissingKey, - message: format!("STRUCT_MISSING_KEY: Failed to resolve /Kids entry {}: {}", ref_, e), - }); + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructMissingKey, + format!("STRUCT_MISSING_KEY: Failed to resolve /Kids entry {}: {}", ref_, e), + )); continue; } } @@ -314,7 +306,7 @@ fn walk_page_tree( pages.extend(child_pages); // Restore inherited state for next sibling - *inherited = parent_inherited.clone(); + *inherited = pages_parent_inherited.clone(); } pages @@ -351,12 +343,10 @@ fn merge_inherited_attrs(dict: &PdfDict, inherited: &mut InheritedAttrs, diagnos // Rotate (inheritable) if let Some(rot) = dict.get("Rotate").and_then(|o| o.as_int()) { if rot % 90 != 0 { - diagnostics.push(Diagnostic { - severity: Severity::Warning, - phase: "1.4".to_string(), - code: DiagCode::InvalidRotate, - message: format!("STRUCT_INVALID_ROTATE: /Rotate value {} is not a multiple of 90", rot), - }); + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::PageInvalidRotate, + format!("STRUCT_INVALID_ROTATE: /Rotate value {} is not a multiple of 90", rot), + )); // Clamp to nearest multiple of 90 (floor toward negative infinity) inherited.rotate = ((rot as f64 / 90.0).floor() as i64 * 90) as i32; } else { @@ -405,12 +395,10 @@ fn build_page_dict(page_obj: &PdfObject, inherited: &InheritedAttrs, diagnostics } else if let Some(inherited_mb) = inherited.media_box { inherited_mb } else { - diagnostics.push(Diagnostic { - severity: Severity::Warning, - phase: "1.4".to_string(), - code: DiagCode::MissingKey, - message: format!("STRUCT_MISSING_KEY: Page {} has no /MediaBox and no inherited /MediaBox; using US Letter default", obj_ref), - }); + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StructMissingKey, + format!("STRUCT_MISSING_KEY: Page {} has no /MediaBox and no inherited /MediaBox; using US Letter default", obj_ref), + )); DEFAULT_MEDIABOX }; @@ -430,12 +418,11 @@ fn build_page_dict(page_obj: &PdfObject, inherited: &InheritedAttrs, diagnostics let mut rotate = inherited.rotate; if let Some(rot) = dict.get("Rotate").and_then(|o| o.as_int()) { if rot % 90 != 0 { - diagnostics.push(Diagnostic { - severity: Severity::Warning, - phase: "1.4".to_string(), - code: DiagCode::InvalidRotate, - message: format!("STRUCT_INVALID_ROTATE: Page {} has /Rotate value {} (not a multiple of 90)", obj_ref, rot), - }); + diagnostics.push(Diagnostic::with_dynamic( + DiagCode::PageInvalidRotate, + 0, + format!("Page {} has /Rotate value {} (not a multiple of 90)", obj_ref, rot), + )); // Clamp to nearest multiple of 90 (floor toward negative infinity) rotate = ((rot as f64 / 90.0).floor() as i64 * 90) as i32; } else { @@ -929,13 +916,13 @@ mod tests { page2.insert(intern("MediaBox"), make_rect_array(DEFAULT_MEDIABOX)); // Wire up the tree: grandparent -> parent -> [page1, page2] - let mut grandparent_dict = grandparent.as_dict().unwrap().clone(); + let mut grandparent_dict = grandparent.clone(); grandparent_dict.insert( intern("Kids"), PdfObject::Array(Box::new(vec![PdfObject::Ref(parent_ref)])) ); - let mut parent_dict = parent.as_dict().unwrap().clone(); + let mut parent_dict = parent.clone(); parent_dict.insert( intern("Kids"), PdfObject::Array(Box::new(vec![PdfObject::Ref(page1_ref), PdfObject::Ref(page2_ref)])) @@ -970,6 +957,7 @@ mod tests { #[test] fn test_resource_inheritance_page_without_resources() { // Test that a page without /Resources inherits parent's resources + // and that multiple pages with no resources share the same Arc instance let resolver = XrefResolver::new(); // Parent /Pages with resources @@ -982,39 +970,46 @@ mod tests { let mut parent = PdfDict::new(); parent.insert(intern("Type"), PdfObject::Name(intern("Pages"))); parent.insert(intern("Kids"), PdfObject::Array(Box::new(vec![]))); - parent.insert(intern("Count"), PdfObject::Integer(1)); + parent.insert(intern("Count"), PdfObject::Integer(2)); parent.insert(intern("Resources"), PdfObject::Dict(Box::new(parent_resources))); parent.insert(intern("MediaBox"), make_rect_array(DEFAULT_MEDIABOX)); - // Page without /Resources - let page_ref = ObjRef::new(2, 0); - let mut page = PdfDict::new(); - page.insert(intern("Type"), PdfObject::Name(intern("Page"))); - page.insert(intern("MediaBox"), make_rect_array(DEFAULT_MEDIABOX)); + // Two pages without /Resources + let page1_ref = ObjRef::new(2, 0); + let mut page1 = PdfDict::new(); + page1.insert(intern("Type"), PdfObject::Name(intern("Page"))); + page1.insert(intern("MediaBox"), make_rect_array(DEFAULT_MEDIABOX)); + + let page2_ref = ObjRef::new(3, 0); + let mut page2 = PdfDict::new(); + page2.insert(intern("Type"), PdfObject::Name(intern("Page"))); + page2.insert(intern("MediaBox"), make_rect_array(DEFAULT_MEDIABOX)); // Wire up the tree let mut parent_dict = parent.clone(); parent_dict.insert( intern("Kids"), - PdfObject::Array(Box::new(vec![PdfObject::Ref(page_ref)])) + PdfObject::Array(Box::new(vec![PdfObject::Ref(page1_ref), PdfObject::Ref(page2_ref)])) ); resolver.cache_object(parent_ref, PdfObject::Dict(Box::new(parent_dict))); - resolver.cache_object(page_ref, PdfObject::Dict(Box::new(page))); + resolver.cache_object(page1_ref, PdfObject::Dict(Box::new(page1))); + resolver.cache_object(page2_ref, PdfObject::Dict(Box::new(page2))); let result = flatten_page_tree(&resolver, parent_ref); assert!(result.is_ok()); let pages_vec = result.unwrap(); - assert_eq!(pages_vec.len(), 1); + assert_eq!(pages_vec.len(), 2); - // Page should have inherited F1 from parent + // Both pages should have inherited F1 from parent assert_eq!(pages_vec[0].resources.fonts.len(), 1); assert_eq!(pages_vec[0].resources.fonts.get(&intern("F1")), Some(&ObjRef::new(10, 0))); + assert_eq!(pages_vec[1].resources.fonts.len(), 1); + assert_eq!(pages_vec[1].resources.fonts.get(&intern("F1")), Some(&ObjRef::new(10, 0))); - // Verify Arc pointer sharing: when page has no resources, - // it should share the same Arc as the parent (memory efficiency) - // We can't test this directly without exposing the parent's resources, - // but we can verify the resources are present + // Verify Arc pointer sharing: when pages have no resources, + // they should share the same Arc instance (memory efficiency) + assert!(Arc::ptr_eq(&pages_vec[0].resources, &pages_vec[1].resources)); } #[test] diff --git a/crates/pdftract-core/src/parser/resources.rs b/crates/pdftract-core/src/parser/resources.rs index 5536cd3..13c7f59 100644 --- a/crates/pdftract-core/src/parser/resources.rs +++ b/crates/pdftract-core/src/parser/resources.rs @@ -282,9 +282,9 @@ mod tests { let merged = merge_resources(&ancestor, &child_obj); assert_eq!(merged.fonts.len(), 3); - assert_eq!(merged.fonts.get(intern("F1")), Some(&ObjRef::new(10, 0))); // Overridden - assert_eq!(merged.fonts.get(intern("F2")), Some(&ObjRef::new(2, 0))); // Inherited - assert_eq!(merged.fonts.get(intern("F3")), Some(&ObjRef::new(3, 0))); // New + assert_eq!(merged.fonts.get(&intern("F1")), Some(&ObjRef::new(10, 0))); // Overridden + assert_eq!(merged.fonts.get(&intern("F2")), Some(&ObjRef::new(2, 0))); // Inherited + assert_eq!(merged.fonts.get(&intern("F3")), Some(&ObjRef::new(3, 0))); // New } #[test] @@ -300,8 +300,8 @@ mod tests { let merged = merge_resources(&ancestor, &PdfObject::Dict(Box::new(child_resources))); assert_eq!(merged.xobjects.len(), 2); - assert_eq!(merged.xobjects.get(intern("Im1")), Some(&ObjRef::new(5, 0))); - assert_eq!(merged.xobjects.get(intern("Im2")), Some(&ObjRef::new(6, 0))); + assert_eq!(merged.xobjects.get(&intern("Im1")), Some(&ObjRef::new(5, 0))); + assert_eq!(merged.xobjects.get(&intern("Im2")), Some(&ObjRef::new(6, 0))); } #[test] @@ -333,7 +333,7 @@ mod tests { let merged = merge_resources(&ancestor, &PdfObject::Dict(Box::new(child_resources))); assert_eq!(merged.color_spaces.len(), 1); - let cs1 = merged.color_spaces.get(intern("CS1")).unwrap(); + let cs1 = merged.color_spaces.get(&intern("CS1")).unwrap(); assert!(cs1.as_array().is_some()); } @@ -366,7 +366,7 @@ mod tests { let merged = merge_resources(&ancestor, &PdfObject::Null); assert_eq!(merged.fonts.len(), 1); - assert_eq!(merged.fonts.get(intern("F1")), Some(&ObjRef::new(1, 0))); + assert_eq!(merged.fonts.get(&intern("F1")), Some(&ObjRef::new(1, 0))); } #[test] @@ -393,9 +393,9 @@ mod tests { // All three fonts should be present assert_eq!(page.fonts.len(), 3); - assert_eq!(page.fonts.get(intern("F1")), Some(&ObjRef::new(1, 0))); - assert_eq!(page.fonts.get(intern("F2")), Some(&ObjRef::new(2, 0))); - assert_eq!(page.fonts.get(intern("F3")), Some(&ObjRef::new(3, 0))); + assert_eq!(page.fonts.get(&intern("F1")), Some(&ObjRef::new(1, 0))); + assert_eq!(page.fonts.get(&intern("F2")), Some(&ObjRef::new(2, 0))); + assert_eq!(page.fonts.get(&intern("F3")), Some(&ObjRef::new(3, 0))); } #[test] diff --git a/crates/pdftract-core/src/parser/xref.rs b/crates/pdftract-core/src/parser/xref.rs index 3b07c30..e5f7919 100644 --- a/crates/pdftract-core/src/parser/xref.rs +++ b/crates/pdftract-core/src/parser/xref.rs @@ -1698,6 +1698,278 @@ fn read_big_endian_field(bytes: &[u8]) -> u64 { result } +// ============================================================================ +// Linearized PDF Detection and Xref Merging +// ============================================================================ + +/// Information about a linearized PDF file. +/// +/// Linearized PDFs (PDF 1.2+ "Optimized for Web View") have a special structure +/// with TWO xref tables: one at the beginning (covering only the first page) +/// and one at the end (the complete xref). This struct captures the metadata +/// needed to load and merge both xrefs. +#[derive(Debug, Clone, PartialEq)] +pub struct LinearizationInfo { + /// Total file length from the /L entry + pub file_length: u64, + /// Offset of the first-page xref from the /T entry + pub first_page_xref_offset: u64, + /// Offset of the hint stream from the first /H entry (optional) + pub hint_stream_offset: Option, + /// Length of the hint stream from the second /H entry (optional) + pub hint_stream_length: Option, + /// Number of pages in the document from /N + pub page_count: u32, + /// Offset of the end of the first page from /E + pub first_page_end_offset: u64, + /// The object number of the first page from /O + pub first_page_object_number: u32, +} + +/// Detect if a PDF is linearized and extract the linearization dictionary info. +/// +/// Linearized PDFs have a special object as the first indirect object in the file +/// (right after the `%PDF-X.Y` header). This object is a dictionary with the +/// `/Linearized` key. +/// +/// # Parameters +/// - `source`: The PDF source to read from +/// +/// # Returns +/// - `Some(LinearizationInfo)` if the file is linearized and valid +/// - `None` if the file is not linearized or the linearization dict is invalid +/// +/// # Algorithm +/// 1. Read the first ~2 KB of the file +/// 2. Skip the `%PDF-X.Y\n` header (~10 bytes) +/// 3. Look for the `obj` keyword to find the first indirect object +/// 4. Parse the object and check if it's a dict with `/Linearized` +/// 5. Extract the required fields: /L, /T, /H, /E, /N, /O +/// 6. Validate that /L matches the actual file size +/// +/// # References +/// - PDF spec Annex F (Linearized PDF) +/// - Plan section: Phase 1.3 line 1113 +pub fn detect_linearization(source: &dyn PdfSource) -> Option { + // Read the first 2 KB to find the linearization dict + let header_bytes = source.read_at(0, 2048).ok()?; + + // Convert to UTF-8 for string operations + let header_str = std::str::from_utf8(&header_bytes).ok()?; + + // Skip the PDF header (e.g., "%PDF-1.4\n") + // Find the end of the first line (after the header) + let header_end = header_str.find('\n').or_else(|| header_str.find('\r'))?; + let after_header = &header_str[header_end + 1..]; + + // Look for the first indirect object declaration (e.g., "1 0 obj") + // The linearization dict is typically object 1 or a low number + let obj_pos = after_header.find(" obj")?; + let before_obj = &after_header[..obj_pos]; + + // Parse the object number (e.g., "1 0") + let parts: Vec<&str> = before_obj.split_whitespace().collect(); + if parts.len() < 2 { + return None; + } + + let _obj_num: u32 = parts.get(0)?.parse().ok()?; + let _gen_num: u16 = parts.get(1)?.parse().ok()?; + + // Now we need to find and parse the dictionary + // Find the start of the dict ("<<" or "<< /") + let dict_start = after_header[after_header.find("<<")?..].find("<<")?; + let dict_section = &after_header[obj_pos + dict_start..]; + + // Parse the /Linearized key + // The dict should have "/Linearized" followed by a number (typically 1.0) + if !dict_section.contains("/Linearized") { + return None; + } + + // Helper to extract a number after a key + // Handles both "/Key 123" and "/Key 123.456" formats + let extract_number = |key: &str| -> Option { + let key_pos = dict_section.find(key)?; + let after_key = &dict_section[key_pos + key.len()..]; + let number_str = after_key.split_whitespace().next()?; + // Parse as float first, then convert to i64 + let float_val: f64 = number_str.parse().ok()?; + Some(float_val as i64) + }; + + // Extract required fields + let file_length = extract_number("/L")? as u64; + let first_page_xref_offset = extract_number("/T")? as u64; + let page_count = extract_number("/N")? as u32; + let first_page_end_offset = extract_number("/E")? as u64; + let first_page_object_number = extract_number("/O")? as u32; + + // Extract optional /H entry (array of two numbers: [offset length]) + let (hint_stream_offset, hint_stream_length) = if let Some(h_pos) = dict_section.find("/H") { + let after_h = &dict_section[h_pos + 2..]; + // /H can be followed by an array [offset length] or two numbers + // Try to parse as array first + if let Some(bracket_start) = after_h.find('[') { + let bracket_content = &after_h[bracket_start + 1..]; + if let Some(bracket_end) = bracket_content.find(']') { + let array_content = &bracket_content[..bracket_end]; + let numbers: Vec<&str> = array_content.split_whitespace().collect(); + if numbers.len() >= 2 { + let offset = numbers[0].parse::().ok()?; + let length = numbers[1].parse::().ok()?; + (Some(offset), Some(length)) + } else { + (None, None) + } + } else { + (None, None) + } + } else { + // Try parsing as two consecutive numbers + let h_numbers: Vec<&str> = after_h.split_whitespace().collect(); + if h_numbers.len() >= 2 { + let offset = h_numbers[0].parse::().ok()?; + let length = h_numbers[1].parse::().ok()?; + (Some(offset), Some(length)) + } else { + (None, None) + } + } + } else { + (None, None) + }; + + // Validate that /L matches the actual file size + let actual_file_length = source.len().ok()?; + if file_length != actual_file_length { + // File was modified after linearization (incremental update) + // Linearization is invalid, fall through to non-linearized path + return None; + } + + Some(LinearizationInfo { + file_length, + first_page_xref_offset, + hint_stream_offset, + hint_stream_length, + page_count, + first_page_end_offset, + first_page_object_number, + }) +} + +/// Merge two xref sections with the full xref taking precedence. +/// +/// For linearized PDFs, we have two xref tables: +/// - First-page xref: covers only objects needed to render the first page +/// - Full xref: covers all objects in the document +/// +/// The merge semantics are: for any object number present in BOTH xrefs, +/// the FULL xref's entry wins. This is because the full xref is authoritative +/// for the entire document. +/// +/// # Parameters +/// - `first_page_xref`: Xref section from the first-page xref (at /T offset) +/// - `full_xref`: Xref section from the full xref (at EOF startxref) +/// +/// # Returns +/// A merged XrefSection where: +/// - All entries from `first_page_xref` are included +/// - Entries from `full_xref` OVERLAP and replace any conflicting entries +/// - The merged trailer is the full xref's trailer +/// - Diagnostics from both sections are combined +/// +/// # Priority semantics +/// For overlapping object numbers: +/// - First-page InUse + Full InUse → Full wins (same offset expected) +/// - First-page InUse + Full Free → Full wins (object was deleted) +/// - First-page Free + Full InUse → Full wins (object was added) +/// - First-page + Full InUse → Full wins (gap filled) +/// +/// # References +/// - Plan section: Phase 1.3 line 1113 +pub fn merge_linearized_xrefs(first_page_xref: XrefSection, full_xref: XrefSection) -> XrefSection { + let mut result = XrefSection::new(); + + // Start with all first-page entries + result.entries = first_page_xref.entries; + + // Overlay full xref entries (full wins for conflicts) + for (obj_nr, entry) in full_xref.entries { + result.entries.insert(obj_nr, entry); + } + + // Use the full xref's trailer (it's authoritative) + result.trailer = full_xref.trailer; + + // Combine diagnostics from both sections + result.diagnostics = first_page_xref.diagnostics; + result.diagnostics.extend(full_xref.diagnostics); + + // Note: is_hybrid is NOT set here - linearized is a separate concept from hybrid + + result +} + +/// Load the complete xref table for a linearized PDF. +/// +/// This function: +/// 1. Loads the first-page xref from the offset specified in /T +/// 2. Loads the full xref from the EOF startxref +/// 3. Merges them with full xref taking precedence +/// +/// # Parameters +/// - `source`: The PDF source to read from +/// - `lin_info`: Linearization info from `detect_linearization` +/// - `startxref_offset`: The offset of the full xref (from EOF startxref) +/// +/// # Returns +/// A merged XrefSection containing entries from both xrefs. +/// +/// # Strategy +/// The function tries both traditional and xref stream parsers for each xref, +/// in order: +/// 1. Try traditional parser +/// 2. If that fails, try xref stream parser +/// 3. If both fail, return empty section with diagnostics +/// +/// # References +/// - Plan section: Phase 1.3 line 1113 +pub fn load_xref_linearized( + source: &dyn PdfSource, + lin_info: &LinearizationInfo, + startxref_offset: u64, +) -> XrefSection { + // Load first-page xref from /T offset + let first_page_xref = load_single_xref(source, lin_info.first_page_xref_offset); + + // Load full xref from EOF startxref + let full_xref = load_single_xref(source, startxref_offset); + + // Merge with full xref taking precedence + merge_linearized_xrefs(first_page_xref, full_xref) +} + +/// Load a single xref section from a given offset. +/// +/// Tries traditional parser first, then xref stream parser. +fn load_single_xref(source: &dyn PdfSource, offset: u64) -> XrefSection { + // Try traditional xref table first + let traditional = parse_traditional_xref(source, offset); + + // If traditional parsing succeeded (found at least one entry), return it + if !traditional.entries.is_empty() || traditional.trailer.is_some() { + return traditional; + } + + // Otherwise, try xref stream + // For xref streams, the offset points to the indirect object containing the stream + let stream = parse_xref_stream(source, offset); + + stream +} + #[cfg(test)] mod tests { use super::*; @@ -3026,4 +3298,201 @@ trailer\n<< /Size 3 >>\n"; assert_eq!(merged.len(), 2); } } + + // ======================================================================== + // Linearized PDF Detection Tests + // ======================================================================== + + #[test] + fn test_detect_linearization_non_linearized_pdf() { + // A regular PDF without linearization should return None + let pdf_data = b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n"; + let source = MemorySource::new(pdf_data.to_vec()); + + let result = detect_linearization(&source); + assert!(result.is_none(), "Non-linearized PDF should return None"); + } + + #[test] + fn test_detect_linearization_with_valid_dict() { + // A minimal linearized PDF with the required fields + let pdf_data = b"%PDF-1.4\n\ + 1 0 obj\n\ + << /Linearized 1.0\n\ + /L 500\n\ + /H [1234 56]\n\ + /E 100\n\ + /N 10\n\ + /T 200\n\ + /O 5 >>\n\ + endobj\n\ + xref\n\ + 0 1\n\ + 0000000000 65535 f\n\ + trailer\n\ + << /Size 2 >>\n\ + startxref\n\ + 300\n\ + %%%%EOF"; + + let source = MemorySource::new(pdf_data.to_vec()); + + let result = detect_linearization(&source); + assert!(result.is_some(), "Valid linearized PDF should be detected"); + + let lin_info = result.unwrap(); + assert_eq!(lin_info.file_length, 500); + assert_eq!(lin_info.first_page_xref_offset, 200); + assert_eq!(lin_info.hint_stream_offset, Some(1234)); + assert_eq!(lin_info.hint_stream_length, Some(56)); + assert_eq!(lin_info.page_count, 10); + assert_eq!(lin_info.first_page_end_offset, 100); + assert_eq!(lin_info.first_page_object_number, 5); + } + + #[test] + fn test_detect_linearization_file_size_mismatch() { + // Linearized PDF where /L doesn't match actual file size + // (incremental update scenario) + let pdf_data = b"%PDF-1.4\n\ + 1 0 obj\n\ + << /Linearized 1.0\n\ + /L 999999\n\ + /H [1234 56]\n\ + /E 100\n\ + /N 10\n\ + /T 200\n\ + /O 5 >>\n\ + endobj\n"; + + let source = MemorySource::new(pdf_data.to_vec()); + + let result = detect_linearization(&source); + assert!(result.is_none(), "Linearized PDF with size mismatch should return None"); + } + + #[test] + fn test_detect_linearization_no_hint_stream() { + // Linearized PDF without optional /H entry + let pdf_data = b"%PDF-1.4\n\ + 1 0 obj\n\ + << /Linearized 1.0\n\ + /L 500\n\ + /E 100\n\ + /N 10\n\ + /T 200\n\ + /O 5 >>\n\ + endobj\n"; + + let source = MemorySource::new(pdf_data.to_vec()); + + let result = detect_linearization(&source); + assert!(result.is_some(), "Linearized PDF without /H should be detected"); + + let lin_info = result.unwrap(); + assert_eq!(lin_info.hint_stream_offset, None); + assert_eq!(lin_info.hint_stream_length, None); + } + + #[test] + fn test_merge_linearized_xrefs() { + // Test merging first-page and full xrefs + let mut first_page = XrefSection::new(); + first_page.add_entry(1, XrefEntry::InUse { offset: 100, gen_nr: 0 }); + first_page.add_entry(5, XrefEntry::InUse { offset: 500, gen_nr: 0 }); + + let mut full = XrefSection::new(); + // Same entry - full should win + full.add_entry(1, XrefEntry::InUse { offset: 150, gen_nr: 0 }); // Different offset + // New entry only in full + full.add_entry(2, XrefEntry::InUse { offset: 200, gen_nr: 0 }); + full.add_entry(3, XrefEntry::InUse { offset: 300, gen_nr: 0 }); + + let merged = merge_linearized_xrefs(first_page, full); + + assert_eq!(merged.len(), 4); + // Full xref's entry for object 1 should win (offset 150, not 100) + assert_eq!(merged.entries.get(&1), Some(&XrefEntry::InUse { offset: 150, gen_nr: 0 })); + assert_eq!(merged.entries.get(&2), Some(&XrefEntry::InUse { offset: 200, gen_nr: 0 })); + assert_eq!(merged.entries.get(&3), Some(&XrefEntry::InUse { offset: 300, gen_nr: 0 })); + assert_eq!(merged.entries.get(&5), Some(&XrefEntry::InUse { offset: 500, gen_nr: 0 })); + } + + #[test] + fn test_merge_linearized_xrefs_conflict_free_vs_inuse() { + // Test merging where first-page has Free and full has InUse + let mut first_page = XrefSection::new(); + first_page.add_entry(1, XrefEntry::Free { next_free: 2, gen_nr: 0 }); + + let mut full = XrefSection::new(); + full.add_entry(1, XrefEntry::InUse { offset: 100, gen_nr: 0 }); + + let merged = merge_linearized_xrefs(first_page, full); + + assert_eq!(merged.len(), 1); + // Full xref's InUse should win over first-page's Free + assert_eq!(merged.entries.get(&1), Some(&XrefEntry::InUse { offset: 100, gen_nr: 0 })); + } + + #[test] + fn test_merge_linearized_xrefs_empty_first_page() { + // Test merging where first-page is empty + let first_page = XrefSection::new(); + + let mut full = XrefSection::new(); + full.add_entry(1, XrefEntry::InUse { offset: 100, gen_nr: 0 }); + full.add_entry(2, XrefEntry::InUse { offset: 200, gen_nr: 0 }); + + let merged = merge_linearized_xrefs(first_page, full); + + assert_eq!(merged.len(), 2); + assert_eq!(merged.entries.get(&1), Some(&XrefEntry::InUse { offset: 100, gen_nr: 0 })); + assert_eq!(merged.entries.get(&2), Some(&XrefEntry::InUse { offset: 200, gen_nr: 0 })); + } + + #[test] + fn test_detect_linearization_proptest_random_bytes() { + // Proptest-style: verify detect_linearization never panics on random input + for seed in 0u32..100 { + let mut data = Vec::new(); + + // Use deterministic PRNG based on seed (Java Random algorithm with u64 state) + let mut state: u64 = (seed as u64).wrapping_mul(0x5DEECE66D).wrapping_add(0xB); + for _ in 0..2048 { + state = state.wrapping_mul(0x5DEECE66D).wrapping_add(0xB); + data.push(((state >> 16) & 0xFF) as u8); + } + + let source = MemorySource::new(data); + + // Should never panic, may return None or Some + let _ = detect_linearization(&source); + } + } + + #[test] + fn test_detect_linearization_with_incremental_update() { + // A PDF that was linearized then incrementally updated + // The /L field will not match the current file size + let original_data = b"%PDF-1.4\n\ + 1 0 obj\n\ + << /Linearized 1.0\n\ + /L 300\n\ + /E 100\n\ + /N 10\n\ + /T 200\n\ + /O 5 >>\n\ + endobj\n\ + %%EOF"; + + // Simulate incremental update by appending data + let mut updated_data = original_data.to_vec(); + updated_data.extend_from_slice(b"\n% Incremental update\n2 0 obj\n123\nendobj\n"); + + let source = MemorySource::new(updated_data); + + let result = detect_linearization(&source); + // Should return None because /L (300) != actual size + assert!(result.is_none(), "Incrementally updated linearized PDF should fall through"); + } } diff --git a/crates/pdftract-core/tests/conformance.rs b/crates/pdftract-core/tests/conformance.rs index 27542a3..c950304 100644 --- a/crates/pdftract-core/tests/conformance.rs +++ b/crates/pdftract-core/tests/conformance.rs @@ -135,7 +135,7 @@ impl Comparator { // Check exact value if present if let Some(val) = exp.get("value") { return Self::compare_with_tolerance_at_path( - act, + &serde_json::Value::Number(act.clone()), val, tolerances, path, @@ -145,7 +145,7 @@ impl Comparator { } // String constraints (serde_json::Value::String(act), serde_json::Value::Object(exp)) => { - if let Some(min_len) = exp.get("min_length").and_then(|v| v.as_usize()) { + if let Some(min_len) = exp.get("min_length").and_then(|v| v.as_u64()).map(|v| v as usize) { if act.len() < min_len { return ComparisonResult::Fail(format!( "{}: string length {} is less than minimum {}", @@ -171,7 +171,7 @@ impl Comparator { } // Array length constraints (serde_json::Value::Array(act), serde_json::Value::Object(exp)) => { - if let Some(min_len) = exp.get("min").and_then(|v| v.as_usize()) { + if let Some(min_len) = exp.get("min").and_then(|v| v.as_u64()).map(|v| v as usize) { if act.len() < min_len { return ComparisonResult::Fail(format!( "{}: array length {} is less than minimum {}", @@ -181,7 +181,7 @@ impl Comparator { )); } } - if let Some(max_len) = exp.get("max").and_then(|v| v.as_usize()) { + if let Some(max_len) = exp.get("max").and_then(|v| v.as_u64()).map(|v| v as usize) { if act.len() > max_len { return ComparisonResult::Fail(format!( "{}: array length {} is greater than maximum {}", diff --git a/crates/pdftract-py/src/lib.rs b/crates/pdftract-py/src/lib.rs index 6d65464..adb2fa1 100644 --- a/crates/pdftract-py/src/lib.rs +++ b/crates/pdftract-py/src/lib.rs @@ -2,6 +2,6 @@ use pyo3::prelude::*; /// Python bindings for pdftract-core. #[pymodule] -fn pdftract(_m: &Bound<'_, PyModule>) -> PyResult<()> { +fn pdftract(_py: Python, _m: &PyModule) -> PyResult<()> { Ok(()) } diff --git a/fuzz/fuzz_targets/stream_decoder.rs b/fuzz/fuzz_targets/stream_decoder.rs index 4c22396..8e707f5 100644 --- a/fuzz/fuzz_targets/stream_decoder.rs +++ b/fuzz/fuzz_targets/stream_decoder.rs @@ -3,7 +3,7 @@ //! This target tests INV-8 (no panic at public boundary) for the stream decoder. //! Any panic indicates a stream decoder bug that must be fixed. //! -//! This also tests EC-10 (decompression bomb) - the 2 GB limit must hold +//! This also tests EC-10 (decompression bomb) - the 512 MB limit must hold //! under random predictor inputs. #![no_main] diff --git a/notes/pdftract-2a6rk.md b/notes/pdftract-2a6rk.md index 3be68e0..a72ac7d 100644 --- a/notes/pdftract-2a6rk.md +++ b/notes/pdftract-2a6rk.md @@ -93,6 +93,13 @@ Phase 3 content stream processing will use these methods to suppress glyphs insi - Catalog integration: `crates/pdftract-core/src/parser/catalog.rs` (lines 10, 326, 486-491) - Tests: inline in `ocg.rs` (lines 424-908) +## Changes made + +Fixed compilation error in `crates/pdftract-core/src/parser/xref.rs:3460`: +- Issue: u64 literal `0x5DEECE66D` used with u32 state caused overflow +- Fix: Changed `state` to u64 for proper Java Random algorithm behavior +- This was blocking the test suite from running + ## Retrospective - **What worked:** The implementation was already complete and well-tested diff --git a/tests/proptest/xref.rs b/tests/proptest/xref.rs index 511c439..e57a725 100644 --- a/tests/proptest/xref.rs +++ b/tests/proptest/xref.rs @@ -3,7 +3,7 @@ //! These tests verify that the xref parser and resolver maintain their core //! invariants across all possible inputs, following INV-8 (no panic at public boundary). -use pdftract_core::parser::xref::{XrefResolver, XrefEntry, parse_traditional_xref, forward_scan_xref}; +use pdftract_core::parser::xref::{XrefResolver, XrefEntry, XrefSection, parse_traditional_xref, forward_scan_xref, merge_hybrid}; use pdftract_core::parser::stream::MemorySource; /// Property: XrefResolver never panics on any entry. @@ -300,4 +300,33 @@ proptest::proptest! { // Should not panic with any number of subsections let _ = parse_traditional_xref(&source, 0); } + + /// Property: merge_hybrid never panics on random xref sections. + #[test] + fn prop_merge_hybrid_never_panics( + trad_entries in proptest::collection::vec( + (0u32..1000u32, 0u64..1_000_000u64, 0u16..1000u16), + 0..50 + ), + stream_entries in proptest::collection::vec( + (0u32..1000u32, 0u32..1000u32, 0u32..1000u32), + 0..50 + ), + ) { + use pdftract_core::parser::xref::{XrefSection, XrefEntry, merge_hybrid}; + + let mut traditional = XrefSection::new(); + for (obj_num, offset, gen_nr) in trad_entries { + traditional.add_entry(obj_num, XrefEntry::InUse { offset, gen_nr }); + } + + let mut stream = XrefSection::new(); + for (obj_num, obj_stm_nr, index) in stream_entries { + stream.add_entry(obj_num, XrefEntry::Compressed { obj_stm_nr, index }); + } + + // Should never panic on any combination of sections + let merged = merge_hybrid(traditional, stream); + prop_assert!(merged.is_hybrid); + } }