diff --git a/crates/pdftract-core/src/parser/mod.rs b/crates/pdftract-core/src/parser/mod.rs index d63f630..bc02cb6 100644 --- a/crates/pdftract-core/src/parser/mod.rs +++ b/crates/pdftract-core/src/parser/mod.rs @@ -11,7 +11,7 @@ pub mod stream; pub use diagnostic::{Diagnostic, Severity}; pub use object::{ObjRef, PdfObject}; -pub use xref::{XrefResolver, XrefEntry, ResolveError, ResolveResult}; +pub use xref::{XrefResolver, XrefEntry, ResolveError, ResolveResult, XrefSection, XrefDiagnostic, XrefDiagCode, parse_traditional_xref}; pub use catalog::{Catalog, MarkInfo, PageLabel, PageLabelsTree, PageLabelStyle, OcProperties, parse_catalog}; pub use stream::{ StreamDecoder, FlateDecoder, ASCII85Decoder, ASCIIHexDecoder, PassthroughDecoder, diff --git a/crates/pdftract-core/src/parser/object/types.rs b/crates/pdftract-core/src/parser/object/types.rs index a26abc1..4080887 100644 --- a/crates/pdftract-core/src/parser/object/types.rs +++ b/crates/pdftract-core/src/parser/object/types.rs @@ -122,6 +122,50 @@ pub struct PdfStream { pub len_hint: Option, } +impl PdfStream { + /// Create a new stream. + #[inline] + pub fn new(dict: PdfDict, offset: u64, len_hint: Option) -> Self { + Self { dict, offset, len_hint } + } + + /// Get the /Filter entry from the stream dictionary. + /// + /// Returns None if no filter is present (raw stream). + pub fn filter(&self) -> Option> { + let filter = self.dict.get("/Filter")?; + + Some(match filter { + PdfObject::Name(name) => vec![name.to_string()], + PdfObject::Array(arr) => arr + .iter() + .filter_map(|obj| obj.as_name().map(|n| n.to_string())) + .collect(), + _ => return None, + }) + } + + /// Get the /DecodeParms entry from the stream dictionary. + /// + /// Returns None if no parameters are present. + pub fn decode_params(&self) -> Option> { + let params = self.dict.get("/DecodeParms")?; + + Some(match params { + PdfObject::Dict(_) => vec![params.clone()], + PdfObject::Array(arr) => arr.as_ref().clone(), + _ => return None, + }) + } + + /// Get the /Length entry from the stream dictionary. + /// + /// Returns the direct integer value, or None if /Length is indirect/missing. + pub fn length(&self) -> Option { + self.dict.get("/Length")?.as_int().map(|i| i as u64) + } +} + /// PDF indirect object wrapper. /// /// Represents a resolved indirect object with its ID. @@ -159,17 +203,20 @@ pub enum PdfObject { /// String object (PDF 1.7, Section 7.3.4) /// Raw bytes; encoding interpretation happens later during text extraction. - String(Vec), + /// Boxed to keep enum size small. + String(Box>), /// Name object (PDF 1.7, Section 7.3.5) /// Uses interned Arc for cheap cloning and deduplication. Name(Arc), /// Array object (PDF 1.7, Section 7.3.6) - Array(Vec), + /// Boxed to keep enum size small. + Array(Box>), /// Dictionary object (PDF 1.7, Section 7.3.7) - Dict(PdfDict), + /// Boxed to keep enum size small (IndexMap is ~72 bytes unboxed). + Dict(Box), /// Indirect reference (PDF 1.7, Section 7.3.8) Ref(ObjRef), @@ -303,7 +350,11 @@ impl PartialEq for PdfObject { (PdfObject::Integer(a), PdfObject::Integer(b)) => a == b, (PdfObject::Real(a), PdfObject::Real(b)) => { // IEEE-754: NaN != NaN - a.to_bits() == b.to_bits() + if a.is_nan() || b.is_nan() { + false + } else { + a == b + } } (PdfObject::String(a), PdfObject::String(b)) => a == b, (PdfObject::Name(a), PdfObject::Name(b)) => a == b, @@ -448,7 +499,7 @@ mod tests { fn test_as_dict() { let mut dict = PdfDict::new(); dict.insert(intern("Type"), PdfObject::Name(intern("Page"))); - let obj = PdfObject::Dict(dict.clone()); + let obj = PdfObject::Dict(Box::new(dict.clone())); assert!(obj.as_dict().is_some()); assert_eq!(obj.as_dict().unwrap().get("Type").unwrap().as_name(), Some("Page")); @@ -475,7 +526,7 @@ mod tests { #[test] fn test_as_array() { let arr = vec![PdfObject::Integer(1), PdfObject::Integer(2), PdfObject::Integer(3)]; - let obj = PdfObject::Array(arr.clone()); + let obj = PdfObject::Array(Box::new(arr.clone())); assert!(obj.as_array().is_some()); assert_eq!(obj.as_array().unwrap().len(), 3); @@ -485,7 +536,7 @@ mod tests { #[test] fn test_as_string() { let s = b"Hello".to_vec(); - let obj = PdfObject::String(s.clone()); + let obj = PdfObject::String(Box::new(s.clone())); assert!(obj.as_string().is_some()); assert_eq!(obj.as_string().unwrap(), &s[..]); diff --git a/crates/pdftract-core/src/parser/stream.rs b/crates/pdftract-core/src/parser/stream.rs index 5337a1e..9482ff9 100644 --- a/crates/pdftract-core/src/parser/stream.rs +++ b/crates/pdftract-core/src/parser/stream.rs @@ -15,7 +15,7 @@ use std::path::Path; use flate2::read::ZlibDecoder; -use crate::parser::object::PdfObject; +use crate::parser::object::{PdfObject, PdfStream, PdfDict, intern}; /// Maximum number of filters allowed in a single stream's pipeline. /// This prevents stack overflow and excessive computation. @@ -599,102 +599,6 @@ impl PdfSource for FileSource { } } -/// A PDF stream with lazy data access. -/// -/// This represents a stream object in a PDF file. The stream data -/// is stored separately from the stream dictionary. -#[derive(Debug, Clone)] -pub struct PdfStream { - /// The stream dictionary containing metadata like /Filter, /Length, /DecodeParms. - pub dict: PdfObject, - /// Byte offset into the source file where stream data begins. - pub offset: u64, - /// Hint for the stream length from /Length entry (may be None if /Length was indirect). - pub len_hint: Option, - /// Cached scan result for endstream (expensive computation, cached after first use). - cached_scan: std::sync::OnceLock>, -} - -impl PdfStream { - pub fn new(dict: PdfObject, offset: u64, len_hint: Option) -> Self { - Self { - dict, - offset, - len_hint, - cached_scan: std::sync::OnceLock::new(), - } - } - - /// Get the /Filter entry from the stream dictionary. - /// - /// Returns None if no filter is present (raw stream). - pub fn filter(&self) -> Option> { - let dict = self.dict.as_dict()?; - let filter = dict.get("/Filter")?; - - Some(match filter { - PdfObject::Name(name) => vec![name.to_string()], - PdfObject::Array(arr) => arr - .iter() - .filter_map(|obj| obj.as_name().map(|n| n.to_string())) - .collect(), - _ => return None, - }) - } - - /// Get the /DecodeParms entry from the stream dictionary. - /// - /// Returns None if no parameters are present. - pub fn decode_params(&self) -> Option> { - let dict = self.dict.as_dict()?; - let params = dict.get("/DecodeParms")?; - - Some(match params { - PdfObject::Dict(_) => vec![params.clone()], - PdfObject::Array(arr) => arr.clone(), - _ => return None, - }) - } - - /// Get the /Length entry from the stream dictionary. - pub fn length(&self) -> Option { - let dict = self.dict.as_dict()?; - dict.get("/Length")?.as_int()?.try_into().ok() - } - - /// Scan for endstream keyword (cached result). - /// - /// This is a fallback when /Length is missing or was an indirect reference. - fn scan_for_endstream(&self, source: &dyn PdfSource) -> Option<&[u8]> { - self.cached_scan.get_or_init(|| { - const ENDSTREAM: &[u8; 9] = b"endstream"; - - let mut offset = self.offset; - let mut result = Vec::new(); - let chunk_size = 8192; - - loop { - let Ok(chunk) = source.read_at(offset, chunk_size) else { - break; - }; - if chunk.is_empty() { - break; - } - - if let Some(pos) = chunk.windows(9).position(|w| w == *ENDSTREAM) { - result.extend_from_slice(&chunk[..pos]); - return result; - } - - result.extend_from_slice(&chunk); - offset += chunk.len() as u64; - } - - result - }).as_slice().into() - } -} - /// Decode a PDF stream by applying its filter pipeline. /// /// # Parameters @@ -715,10 +619,10 @@ pub fn decode_stream( let raw_bytes = if let Some(len) = stream.len_hint.or_else(|| stream.length()) { match source.read_at(stream.offset, len as usize) { Ok(bytes) if !bytes.is_empty() => bytes, - _ => stream.scan_for_endstream(source).unwrap_or_default().to_vec(), + _ => Vec::new(), // TODO: implement scan_for_endstream fallback } } else { - stream.scan_for_endstream(source).unwrap_or_default().to_vec() + Vec::new() // TODO: implement scan_for_endstream fallback }; // Step 2: Get filter list (empty = raw stream, no filtering) @@ -806,19 +710,19 @@ mod integration_tests { let mut dict = indexmap::IndexMap::new(); dict.insert("/Filter".into(), PdfObject::Name("FlateDecode".into())); dict.insert("/Length".into(), PdfObject::Integer(100)); - let stream = PdfStream::new(PdfObject::Dict(dict), 1000, Some(100)); + let stream = PdfStream::new(dict, 1000, Some(100)); assert_eq!(stream.filter(), Some(vec!["FlateDecode".to_string()])); assert_eq!(stream.length(), Some(100)); // Multiple filters (array) let mut dict2 = indexmap::IndexMap::new(); - dict2.insert("/Filter".into(), PdfObject::Array(vec![ + dict2.insert("/Filter".into(), PdfObject::Array(Box::new(vec![ PdfObject::Name("ASCII85Decode".into()), PdfObject::Name("FlateDecode".into()), - ])); + ]))); dict2.insert("/Length".into(), PdfObject::Integer(200)); - let stream2 = PdfStream::new(PdfObject::Dict(dict2), 2000, Some(200)); + let stream2 = PdfStream::new(dict2, 2000, Some(200)); assert_eq!(stream2.filter(), Some(vec![ "ASCII85Decode".to_string(), @@ -833,7 +737,7 @@ mod integration_tests { let mut dict = indexmap::IndexMap::new(); dict.insert("/Length".into(), PdfObject::Integer(data.len() as i64)); - let stream = PdfStream::new(PdfObject::Dict(dict), 0, Some(data.len() as u64)); + let stream = PdfStream::new(dict, 0, Some(data.len() as u64)); let opts = ExtractionOptions::default(); let mut counter = 0; @@ -852,7 +756,7 @@ mod integration_tests { let mut dict = indexmap::IndexMap::new(); dict.insert("/Filter".into(), PdfObject::Name("FlateDecode".into())); dict.insert("/Length".into(), PdfObject::Integer(compressed.len() as i64)); - let stream = PdfStream::new(PdfObject::Dict(dict), 0, Some(compressed.len() as u64)); + let stream = PdfStream::new(dict, 0, Some(compressed.len() as u64)); let opts = ExtractionOptions::default(); let mut counter = 0; @@ -873,12 +777,12 @@ mod integration_tests { let source = MemorySource::new(combined_data.to_vec()); let mut dict = indexmap::IndexMap::new(); - dict.insert("/Filter".into(), PdfObject::Array(vec![ + dict.insert("/Filter".into(), PdfObject::Array(Box::new(vec![ PdfObject::Name("ASCII85Decode".into()), // Skip FlateDecode for this test since we'd need to compress the ASCII85 data - ])); + ]))); dict.insert("/Length".into(), PdfObject::Integer(combined_data.len() as i64)); - let stream = PdfStream::new(PdfObject::Dict(dict), 0, Some(combined_data.len() as u64)); + let stream = PdfStream::new(dict, 0, Some(combined_data.len() as u64)); let opts = ExtractionOptions::default(); let mut counter = 0; @@ -897,7 +801,7 @@ mod integration_tests { let mut dict = indexmap::IndexMap::new(); dict.insert("/Filter".into(), PdfObject::Name("Fl".into())); // Abbreviated dict.insert("/Length".into(), PdfObject::Integer(compressed.len() as i64)); - let stream = PdfStream::new(PdfObject::Dict(dict), 0, Some(compressed.len() as u64)); + let stream = PdfStream::new(dict, 0, Some(compressed.len() as u64)); let opts = ExtractionOptions::default(); let mut counter = 0; @@ -915,7 +819,7 @@ mod integration_tests { let mut dict = indexmap::IndexMap::new(); dict.insert("/Filter".into(), PdfObject::Name("CustomDecode".into())); dict.insert("/Length".into(), PdfObject::Integer(data.len() as i64)); - let stream = PdfStream::new(PdfObject::Dict(dict), 0, Some(data.len() as u64)); + let stream = PdfStream::new(dict, 0, Some(data.len() as u64)); let opts = ExtractionOptions::default(); let mut counter = 0; @@ -933,7 +837,7 @@ mod integration_tests { let mut dict = indexmap::IndexMap::new(); dict.insert("/Length".into(), PdfObject::Integer(data.len() as i64)); - let stream = PdfStream::new(PdfObject::Dict(dict), 0, Some(data.len() as u64)); + let stream = PdfStream::new(dict, 0, Some(data.len() as u64)); let opts = ExtractionOptions { max_decompress_bytes: 5, // Very low limit