pdftract/crates/pdftract-core/src/parser/xref.rs
jedarden 6a35bdd869 feat(pdftract-29z7b): implement unified diagnostic system + CLI commands
- Added `cmd_explain_diagnostic` function to CLI for detailed diagnostic code explanation
- Added `--list-diagnostics` and `--explain-diagnostic <code>` CLI commands
- Verified all Phase 1.1-1.5 modules use unified DiagCode (lexer, parser, xref, stream, catalog, outline, pages)
- DIAGNOSTIC_CATALOG provides metadata for all 61 diagnostic codes
- Diagnostic struct size: 56 bytes (within 48-64 target range)
- emit! macro provides ergonomic diagnostic emission
- INV-8 maintained: no panics in error paths

All diagnostic codes follow naming convention:
- STRUCT_*: PDF structure errors
- STREAM_*: Stream decoder errors
- XREF_*: Cross-reference table errors
- ENCRYPTION_*: Encryption-related errors
- OCR_*: OCR pipeline errors
- REMOTE_*: Remote source errors
- PAGE_*: Page-level errors
- FONT_*: Font pipeline errors
- GSTATE_*: Graphics state errors
- LAYOUT_*: Layout and reading order errors
- MCP_*: MCP server errors
- CACHE_*: Cache errors

References: Phase 1.6 (error recovery), INV-8, Phase 0.4 (clippy enforces doc comments)
2026-05-22 22:38:31 -04:00

4270 lines
154 KiB
Rust

//! Cross-reference table resolver and traditional xref parser.
//!
//! This module provides:
//! - Traditional xref table parser (20-byte fixed-width entries)
//! - Xref resolver for indirect object resolution
//! - Handling of object streams and circular reference detection
use std::collections::{HashMap, HashSet};
use std::sync::{Arc, RwLock};
use crate::parser::object::{ObjRef, PdfObject, PdfDict, PdfStream, ObjectParser};
use crate::parser::stream::{PdfSource, MemorySource};
use crate::diagnostics::{Diagnostic as Diag, DiagCode};
// Use memchr for SIMD-accelerated byte searching in forward_scan_xref
use memchr::{memchr, memchr_iter};
/// Error type for xref resolution.
///
/// Returned (wrapped in [`ResolveResult`]) by `XrefResolver::resolve`.
#[derive(Debug, Clone)]
pub enum ResolveError {
/// Object not found in xref table
///
/// Carries the reference whose object number had no xref entry.
NotFound(ObjRef),
/// Circular reference detected
///
/// Carries the reference that was already marked as "being resolved"
/// when a second resolution of it was attempted.
CircularRef(ObjRef),
/// I/O error
///
/// Carries a human-readable message; it is rendered as
/// `"I/O error: <message>"` by the `Display` implementation.
Io(String),
}
impl std::fmt::Display for ResolveError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ResolveError::NotFound(obj_ref) => write!(f, "object {} not found", obj_ref),
ResolveError::CircularRef(obj_ref) => write!(f, "circular reference at {}", obj_ref),
ResolveError::Io(msg) => write!(f, "I/O error: {}", msg),
}
}
}
// Marker impl: `ResolveError` relies on the default `std::error::Error`
// methods (no `source()` chaining); `Debug` and `Display` are provided above.
impl std::error::Error for ResolveError {}
/// Result type for resolution operations.
///
/// Shorthand for `Result<T, ResolveError>`.
pub type ResolveResult<T> = Result<T, ResolveError>;
/// Cross-reference table entry.
///
/// One entry describes the state of a single object number.
#[derive(Debug, Clone, PartialEq)]
pub enum XrefEntry {
/// Free entry (available for reuse)
///
/// For traditional tables, `next_free` is the first (10-digit) field of an
/// `f` line and `gen_nr` is the second (5-digit) field.
Free { next_free: u32, gen_nr: u16 },
/// In-use entry at a specific byte offset
///
/// For traditional tables, `offset` is the first field of an `n` line and
/// `gen_nr` is the second field.
InUse { offset: u64, gen_nr: u16 },
/// Compressed object in an object stream
///
/// NOTE(review): presumably `obj_stm_nr` is the object number of the
/// containing object stream and `index` the position inside it — confirm
/// against the xref-stream parser.
Compressed { obj_stm_nr: u32, index: u32 },
}
/// Result of parsing a traditional xref table.
///
/// Contains the parsed xref entries and the trailer dictionary.
/// The same type is also returned by `merge_hybrid` and the forward-scan
/// recovery functions in this module.
#[derive(Debug, Clone)]
pub struct XrefSection {
/// Map from object number to xref entry
pub entries: HashMap<u32, XrefEntry>,
/// The trailer dictionary
///
/// `None` when no trailer was found or it could not be parsed.
pub trailer: Option<PdfDict>,
/// Diagnostics emitted during parsing
pub diagnostics: Vec<Diag>,
/// Whether this xref section is from a hybrid file (traditional + stream merged)
///
/// `false` for freshly created sections; set to `true` by `merge_hybrid`.
pub is_hybrid: bool,
}
impl XrefSection {
/// Create a new empty xref section.
pub fn new() -> Self {
XrefSection {
entries: HashMap::new(),
trailer: None,
diagnostics: Vec::new(),
is_hybrid: false,
}
}
/// Add an entry to the xref section.
pub fn add_entry(&mut self, obj_nr: u32, entry: XrefEntry) {
self.entries.insert(obj_nr, entry);
}
/// Get the number of entries.
pub fn len(&self) -> usize {
self.entries.len()
}
/// Check if the xref section is empty.
pub fn is_empty(&self) -> bool {
self.entries.is_empty()
}
}
impl Default for XrefSection {
fn default() -> Self {
Self::new()
}
}
/// Merge a hybrid xref file's traditional table and xref stream.
///
/// Hybrid files have BOTH a traditional xref table at `startxref` AND a
/// supplementary xref stream pointed to by `/XRefStm` in the trailer.
/// Per PDF spec, the traditional table is AUTHORITATIVE for objects it
/// covers; the stream's type-2 entries (compressed-in-ObjStm) fill gaps.
///
/// # Parameters
/// - `traditional`: Xref section from the traditional table (authoritative)
/// - `stream`: Xref section from the xref stream (supplementary)
///
/// # Returns
/// A merged XrefSection where:
/// - All entries from `traditional` are preserved (even type-1 Free entries)
/// - Entries from `stream` are added ONLY if not present in `traditional`
/// - The merged trailer is the traditional one (with `/XRefStm` key removed)
/// - `is_hybrid` is set to true
/// - `STRUCT_HYBRID_CONFLICT` diagnostics emitted for Free/InUse conflicts
///
/// # Priority semantics
/// For overlapping object numbers:
/// - Traditional Free + Stream Free → Free (no conflict, both agree)
/// - Traditional Free + Stream InUse → Free (CONFLICT, traditional wins)
/// - Traditional InUse + Stream Free → InUse (CONFLICT, traditional wins)
/// - Traditional InUse + Stream InUse → InUse (no conflict, both agree)
/// - Traditional InUse + Stream Compressed → InUse (traditional wins)
/// - Traditional <absent> + Stream Compressed → Compressed (gap fill)
///
/// # Example
/// ```rust
/// let merged = merge_hybrid(traditional_section, stream_section);
/// assert!(merged.is_hybrid);
/// ```
pub fn merge_hybrid(traditional: XrefSection, stream: XrefSection) -> XrefSection {
let mut result = XrefSection {
entries: HashMap::new(),
trailer: None,
diagnostics: Vec::new(),
is_hybrid: true,
};
// Start with all traditional entries
for (obj_nr, entry) in &traditional.entries {
result.entries.insert(*obj_nr, entry.clone());
}
// Merge stream entries: only add if not in traditional
for (obj_nr, stream_entry) in stream.entries {
if let Some(trad_entry) = traditional.entries.get(&obj_nr) {
// Conflict: both tables have this object
// Check for Free/InUse conflict and emit diagnostic
let trad_is_free = matches!(trad_entry, XrefEntry::Free { .. });
let stream_is_inuse = matches!(stream_entry, XrefEntry::InUse { .. } | XrefEntry::Compressed { .. });
if trad_is_free && stream_is_inuse {
result.diagnostics.push(Diag::with_dynamic(
DiagCode::StructHybridConflict,
0,
format!(
"Object {}: traditional table marks as Free, stream marks as InUse; traditional wins (object is Free)",
obj_nr
),
));
}
// Traditional wins - don't insert stream entry
} else {
// Gap fill: object not in traditional, add from stream
result.entries.insert(obj_nr, stream_entry);
}
}
// Merge diagnostics from both sections
result.diagnostics.extend(traditional.diagnostics);
result.diagnostics.extend(stream.diagnostics);
// Use traditional trailer, removing /XRefStm key if present
if let Some(mut trad_trailer) = traditional.trailer {
trad_trailer.swap_remove("XRefStm");
result.trailer = Some(trad_trailer);
} else {
result.trailer = stream.trailer;
}
result
}
/// Report whether a trailer dictionary marks the file as hybrid.
///
/// A hybrid file carries a `/XRefStm` key in its trailer dictionary,
/// pointing to the offset of a supplementary xref stream.
///
/// # Parameters
/// - `trailer`: The trailer dictionary to inspect (may be None)
///
/// # Returns
/// true if a trailer is present and has a `/XRefStm` key, false otherwise
pub fn is_hybrid_trailer(trailer: Option<&PdfDict>) -> bool {
    trailer.map_or(false, |dict| dict.contains_key("XRefStm"))
}
/// Cross-reference resolver.
///
/// This resolver tracks the mapping from object numbers to their file locations
/// and handles resolution through object streams. It also detects circular
/// references to prevent infinite loops.
pub struct XrefResolver {
/// Map from object number to xref entry
entries: HashMap<u32, XrefEntry>,
/// Cache of resolved objects (for object streams)
///
/// Shared behind `Arc<RwLock<..>>`; filled through `cache_object` and
/// consulted by `resolve`.
cache: Arc<RwLock<HashMap<ObjRef, PdfObject>>>,
/// Set of references whose resolution is currently in progress, used for
/// circular reference detection.
///
/// This is a single lock-protected set shared through the `Arc`, not
/// thread-local state.
resolving: Arc<RwLock<HashSet<ObjRef>>>,
}
impl XrefResolver {
    /// Create a new xref resolver with no entries and an empty cache.
    pub fn new() -> Self {
        XrefResolver {
            entries: HashMap::new(),
            cache: Arc::new(RwLock::new(HashMap::new())),
            resolving: Arc::new(RwLock::new(HashSet::new())),
        }
    }

    /// Create a new xref resolver from an XrefSection.
    ///
    /// Only the section's entries are taken over; its trailer and
    /// diagnostics are dropped.
    pub fn from_section(section: XrefSection) -> Self {
        XrefResolver {
            entries: section.entries,
            cache: Arc::new(RwLock::new(HashMap::new())),
            resolving: Arc::new(RwLock::new(HashSet::new())),
        }
    }

    /// Add an xref entry, replacing any previous entry for `obj_nr`.
    pub fn add_entry(&mut self, obj_nr: u32, entry: XrefEntry) {
        self.entries.insert(obj_nr, entry);
    }

    /// Get the xref entry for an object number.
    pub fn get_entry(&self, obj_nr: u32) -> Option<&XrefEntry> {
        self.entries.get(&obj_nr)
    }

    /// Check if a resolution is in progress (for circular reference detection).
    ///
    /// Returns `false` when the lock is poisoned.
    pub fn is_resolving(&self, obj_ref: ObjRef) -> bool {
        self.resolving
            .read()
            .map(|guard| guard.contains(&obj_ref))
            .unwrap_or(false)
    }

    /// Mark an object as being resolved.
    ///
    /// Returns `true` when the reference was newly marked, `false` when it
    /// was already marked (a cycle) or when the lock is poisoned.
    pub fn start_resolving(&self, obj_ref: ObjRef) -> bool {
        match self.resolving.write() {
            // `HashSet::insert` returns `false` if the value was already present.
            Ok(mut resolving) => resolving.insert(obj_ref),
            Err(_) => false, // Lock poisoned - treat as failed to start
        }
    }

    /// Mark an object as finished resolving.
    pub fn finish_resolving(&self, obj_ref: ObjRef) {
        if let Ok(mut resolving) = self.resolving.write() {
            resolving.remove(&obj_ref);
        }
        // If lock is poisoned, ignore - cleanup is optional
    }

    /// Resolve an object reference to its value.
    ///
    /// This is a stub implementation that returns Null for every object that
    /// has an xref entry and is not cached. The full implementation
    /// (Phase 1.3) will:
    /// - Look up the xref entry
    /// - Read and parse the object from its offset
    /// - Handle object streams
    /// - Cache resolved objects
    ///
    /// # Errors
    /// - [`ResolveError::CircularRef`] if `obj_ref` is already being resolved
    /// - [`ResolveError::NotFound`] if there is no xref entry for the object
    ///
    /// The in-progress mark is always removed before returning, including on
    /// the error paths, so a failed lookup does not make later calls for the
    /// same reference look like a cycle.
    pub fn resolve(&self, obj_ref: ObjRef) -> ResolveResult<PdfObject> {
        if !self.start_resolving(obj_ref) {
            return Err(ResolveError::CircularRef(obj_ref));
        }
        let outcome = self.resolve_marked(obj_ref);
        self.finish_resolving(obj_ref);
        outcome
    }

    /// Body of [`resolve`](Self::resolve); the caller owns the in-progress mark.
    fn resolve_marked(&self, obj_ref: ObjRef) -> ResolveResult<PdfObject> {
        // Check cache first. A poisoned cache lock is skipped: the cache is
        // optional, so resolution proceeds without it.
        if let Ok(cache) = self.cache.read() {
            if let Some(obj) = cache.get(&obj_ref) {
                return Ok(obj.clone());
            }
        }

        // Look up the xref entry
        if !self.entries.contains_key(&obj_ref.object) {
            return Err(ResolveError::NotFound(obj_ref));
        }

        // Stub: return Null for now
        // Full implementation will read from file offset and parse
        Ok(PdfObject::Null)
    }

    /// Cache a resolved object.
    pub fn cache_object(&self, obj_ref: ObjRef, obj: PdfObject) {
        if let Ok(mut cache) = self.cache.write() {
            cache.insert(obj_ref, obj);
        }
        // If lock is poisoned, ignore - caching is optional
    }

    /// Get the number of entries in the xref table.
    pub fn len(&self) -> usize {
        self.entries.len()
    }

    /// Check if the xref table is empty.
    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }
}
impl Default for XrefResolver {
fn default() -> Self {
Self::new()
}
}
/// Parse a traditional PDF xref table starting from the given offset.
///
/// # Parameters
/// - `source`: The PDF source to read bytes from
/// - `start_offset`: The byte offset where the xref table begins (from `startxref`)
///
/// # Returns
/// An `XrefSection` containing the parsed entries and trailer dictionary.
/// Problems are reported through `XrefSection::diagnostics`; the function
/// never fails outright.
///
/// # Format
/// The xref table has the following format:
/// ```text
/// xref
/// 0 6
/// 0000000003 65535 f
/// 0000000017 00000 n
/// ...
/// trailer
/// << /Size 6 /Root 1 0 R >>
/// ```
///
/// Each entry is exactly 20 bytes:
/// - 10 digits: byte offset (for `n`) or next-free-object number (for `f`)
/// - 1 space
/// - 5 digits: generation number
/// - 1 space
/// - 1 byte: `n` (in use) or `f` (free)
/// - 2 bytes: line ending (`\r\n` or ` \n`)
///
/// Some buggy producers use `\n` alone (19 bytes). The width is detected per
/// entry from where the end-of-line byte sits, and any whitespace left over
/// before an entry is skipped, so the parser stays aligned with 19-, 20- and
/// 21-byte lines.
///
/// Keywords and whitespace are matched on raw bytes, so unrelated binary data
/// that happens to fall inside a look-ahead window does not abort parsing.
pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> XrefSection {
    let mut result = XrefSection::new();
    let mut pos = start_offset;

    // Read initial chunk to look for xref keyword
    let header_bytes = match source.read_at(pos, 1024) {
        Ok(bytes) if !bytes.is_empty() => bytes,
        _ => {
            result.diagnostics.push(Diag::with_static(
                DiagCode::XrefTruncated,
                pos,
                "Failed to read xref header",
            ));
            return result;
        }
    };

    // Look for xref keyword (case-sensitive per PDF spec), allowing leading
    // whitespace.
    let keyword_at = trad_xref_leading_ws(&header_bytes);
    if !header_bytes[keyword_at..].starts_with(b"xref") {
        result.diagnostics.push(Diag::with_static(
            DiagCode::XrefInvalidHeader,
            pos,
            "xref keyword not found",
        ));
        return result;
    }

    // Advance past "xref" (4 bytes) and the line ending that follows it, if
    // any (\n, \r\n or \r).
    pos += keyword_at as u64 + 4;
    pos += trad_xref_line_ending_len(source, pos);

    // Track whether we found the trailer keyword
    let mut trailer_found = false;

    // Parse subsections until we hit "trailer"
    loop {
        let chunk = match source.read_at(pos, 100) {
            Ok(bytes) if !bytes.is_empty() => bytes,
            // EOF or error - we're done
            _ => break,
        };

        // Re-anchor on the first non-whitespace byte so the next token always
        // starts at the beginning of the look-ahead window (a keyword can then
        // never be cut in half by the window edge).
        let skipped = trad_xref_leading_ws(&chunk);
        if skipped > 0 {
            pos += skipped as u64;
            continue;
        }

        // Check for trailer keyword
        if chunk.starts_with(b"trailer") {
            trailer_found = true;
            pos += 7; // Skip "trailer"
            result.trailer = parse_trailer_dict(source, &mut pos, &mut result.diagnostics);
            break;
        }

        // Otherwise, expect subsection header: "obj_start obj_count"
        let subsection_start = pos;
        let header_line = match read_line_at(source, subsection_start) {
            Some(line) => line,
            None => {
                result.diagnostics.push(Diag::with_static(
                    DiagCode::XrefInvalidSubsectionHeader,
                    subsection_start,
                    "Failed to read subsection header",
                ));
                break;
            }
        };

        // Offset of the first byte after the header line and its terminator.
        // At least one byte is always consumed so a malformed line can never
        // stall the loop.
        let header_end = subsection_start + header_line.len() as u64;
        let after_header = header_end + trad_xref_line_ending_len(source, header_end).max(1);

        let mut fields = header_line.split_whitespace();
        let (start_field, count_field) = match (fields.next(), fields.next(), fields.next()) {
            (Some(first), Some(second), None) => (first, second),
            _ => {
                result.diagnostics.push(Diag::with_dynamic(
                    DiagCode::XrefInvalidSubsectionHeader,
                    subsection_start,
                    format!("Invalid subsection header: {}", header_line),
                ));
                // Skip this line and try to continue
                pos = after_header;
                continue;
            }
        };

        let obj_start: u32 = match start_field.parse() {
            Ok(n) => n,
            Err(_) => {
                result.diagnostics.push(Diag::with_dynamic(
                    DiagCode::XrefInvalidSubsectionHeader,
                    subsection_start,
                    format!("Invalid subsection start: {}", start_field),
                ));
                pos = after_header;
                continue;
            }
        };
        let obj_count: u32 = match count_field.parse() {
            Ok(n) => n,
            Err(_) => {
                result.diagnostics.push(Diag::with_dynamic(
                    DiagCode::XrefInvalidSubsectionHeader,
                    subsection_start,
                    format!("Invalid subsection count: {}", count_field),
                ));
                pos = after_header;
                continue;
            }
        };

        pos = after_header;

        // Parse subsection entries
        let mut entries_parsed = 0u32;
        while entries_parsed < obj_count {
            // Try to read a full 20-byte entry; a shorter read is handled below.
            let entry_bytes = match source.read_at(pos, 20) {
                Ok(bytes) => bytes,
                _ => {
                    result.diagnostics.push(Diag::with_static(
                        DiagCode::XrefTruncated,
                        pos,
                        "Failed to read xref entry",
                    ));
                    break;
                }
            };

            // An entry starts with a digit. Leftover whitespace (for example
            // the `\n` of a 21-byte " \r\n" line) is skipped so the next read
            // is aligned on the entry itself.
            let lead = trad_xref_leading_ws(&entry_bytes);
            if lead > 0 {
                pos += lead as u64;
                continue;
            }

            // The subsection header promised more entries than exist: stop
            // here and let the outer loop see the trailer.
            if entry_bytes.starts_with(b"trailer") {
                result.diagnostics.push(Diag::with_static(
                    DiagCode::XrefInvalidSubsectionHeader,
                    subsection_start,
                    "Subsection contains fewer entries than its header declares",
                ));
                break;
            }

            if entry_bytes.len() < 19 {
                // Definitely truncated
                result.diagnostics.push(Diag::with_static(
                    DiagCode::XrefTruncated,
                    pos,
                    "Xref entry truncated (< 19 bytes)",
                ));
                break;
            }

            // Object numbers come from untrusted input; never let the sum wrap.
            let obj_nr = match obj_start.checked_add(entries_parsed) {
                Some(n) => n,
                None => {
                    result.diagnostics.push(Diag::with_dynamic(
                        DiagCode::XrefInvalidSubsectionHeader,
                        subsection_start,
                        format!(
                            "Subsection {} {} exceeds the maximum object number",
                            obj_start, obj_count
                        ),
                    ));
                    break;
                }
            };

            let entry_start = pos;
            let stride = trad_xref_entry_stride(&entry_bytes);
            let parsed = parse_xref_entry(
                &entry_bytes[..stride],
                obj_nr,
                entry_start,
                stride,
                &mut result.diagnostics,
            );

            if let Some((parsed_nr, entry)) = parsed {
                // Object 0 must be free (PDF spec requirement)
                if parsed_nr == 0 && matches!(entry, XrefEntry::InUse { .. }) {
                    result.diagnostics.push(Diag::with_static(
                        DiagCode::XrefObjectZeroNotFree,
                        entry_start,
                        "Object 0 is not free (violates PDF spec)",
                    ));
                }
                // Add all entries to the result (both InUse and Free)
                // Free entries are needed for /Prev chain merge semantics to track object lifecycle
                result.add_entry(parsed_nr, entry);
            }
            // On failure `parse_xref_entry` has already reported the problem;
            // the entry is skipped and parsing moves on to the next one.
            pos += stride as u64;
            entries_parsed += 1;
        }
    }

    // If we exited the loop without finding a trailer, emit a diagnostic
    if !trailer_found {
        result.diagnostics.push(Diag::with_static(
            DiagCode::XrefTrailerNotFound,
            pos,
            "Trailer dictionary not found (xref table may be truncated)",
        ));
    }
    result
}

/// Count the PDF whitespace bytes (NUL, HT, LF, FF, CR, SP) at the start of `bytes`.
fn trad_xref_leading_ws(bytes: &[u8]) -> usize {
    bytes
        .iter()
        .take_while(|b| b.is_ascii_whitespace() || **b == 0)
        .count()
}

/// Length of the line terminator at `at`: 2 for CRLF, 1 for a lone CR or LF,
/// 0 if there is none or the read fails.
fn trad_xref_line_ending_len(source: &dyn PdfSource, at: u64) -> u64 {
    match source.read_at(at, 2) {
        Ok(bytes) => match (bytes.first(), bytes.get(1)) {
            (Some(&b'\r'), Some(&b'\n')) => 2,
            (Some(&b'\r'), _) | (Some(&b'\n'), _) => 1,
            _ => 0,
        },
        Err(_) => 0,
    }
}

/// Decide whether the entry at the start of `entry` (at least 19 bytes) is
/// 20 or 19 bytes wide, based on where its end-of-line byte sits.
fn trad_xref_entry_stride(entry: &[u8]) -> usize {
    let is_eol = |b: &u8| *b == b'\n' || *b == b'\r';
    match entry.get(19) {
        // Standard entry: the 20th byte closes the line (" \n", "\r\n", " \r").
        Some(last) if is_eol(last) => 20,
        // The line already ended at the 19th byte: short entry.
        Some(_) if entry.get(18).map_or(false, is_eol) => 19,
        // No recognisable line ending: assume the standard width.
        Some(_) => 20,
        // Only 19 bytes were available.
        None => 19,
    }
}
/// Parse a single xref entry.
///
/// # Parameters
/// - `bytes`: the raw entry, exactly `stride` bytes long
/// - `obj_nr`: object number this entry describes (returned unchanged)
/// - `offset`: file offset of the entry, used only for diagnostics
/// - `stride`: expected entry width in bytes
/// - `diagnostics`: sink for `XREF_INVALID_ENTRY` diagnostics
///
/// # Returns
/// `Some((obj_nr, entry))` on success, `None` on failure. A length mismatch
/// returns `None` silently; every other failure also pushes a diagnostic.
///
/// A free entry whose next-free object number does not fit in `u32` is still
/// returned as `Free` (so the object stays marked as free), with `next_free`
/// set to 0 and a diagnostic recorded, instead of being silently truncated.
fn parse_xref_entry(
    bytes: &[u8],
    obj_nr: u32,
    offset: u64,
    stride: usize,
    diagnostics: &mut Vec<Diag>,
) -> Option<(u32, XrefEntry)> {
    if bytes.len() != stride {
        return None;
    }

    // Convert to string for parsing
    let entry_str = match std::str::from_utf8(bytes) {
        Ok(s) => s,
        Err(_) => {
            diagnostics.push(Diag::with_static(
                DiagCode::XrefInvalidEntry,
                offset,
                "Invalid UTF-8 in xref entry",
            ));
            return None;
        }
    };

    // Entry format: "offset/next_free generation f/n" with line ending.
    // Anything after the third field is ignored.
    let mut fields = entry_str.split_whitespace();
    let (first_str, gen_str, type_str) = match (fields.next(), fields.next(), fields.next()) {
        (Some(first), Some(gen), Some(kind)) => (first, gen, kind),
        _ => {
            diagnostics.push(Diag::with_dynamic(
                DiagCode::XrefInvalidEntry,
                offset,
                format!("Malformed xref entry: {}", entry_str.trim()),
            ));
            return None;
        }
    };

    let first_field: u64 = match first_str.parse() {
        Ok(n) => n,
        Err(_) => {
            diagnostics.push(Diag::with_dynamic(
                DiagCode::XrefInvalidEntry,
                offset,
                format!("Invalid offset/next_free: {}", first_str),
            ));
            return None;
        }
    };
    let gen_nr: u16 = match gen_str.parse() {
        Ok(n) => n,
        Err(_) => {
            diagnostics.push(Diag::with_dynamic(
                DiagCode::XrefInvalidEntry,
                offset,
                format!("Invalid generation: {}", gen_str),
            ));
            return None;
        }
    };

    // Only the first character of the type field is significant; upper-case
    // letters are tolerated.
    match type_str.chars().next() {
        Some('n') | Some('N') => Some((
            obj_nr,
            XrefEntry::InUse {
                offset: first_field,
                gen_nr,
            },
        )),
        Some('f') | Some('F') => {
            // Ten decimal digits can exceed u32::MAX; a plain `as` cast would
            // silently wrap to an unrelated object number.
            let next_free = if first_field > u64::from(u32::MAX) {
                diagnostics.push(Diag::with_dynamic(
                    DiagCode::XrefInvalidEntry,
                    offset,
                    format!("Next-free object number out of range: {}", first_field),
                ));
                0
            } else {
                first_field as u32
            };
            Some((obj_nr, XrefEntry::Free { next_free, gen_nr }))
        }
        _ => {
            diagnostics.push(Diag::with_dynamic(
                DiagCode::XrefInvalidEntry,
                offset,
                format!("Invalid entry type: {}", type_str),
            ));
            None
        }
    }
}
/// Fetch the text of the line beginning at `pos`, leaving the caller's
/// position untouched.
///
/// The returned string excludes the terminator (`\r`, `\n` or `\r\n`).
/// Data is pulled in 256-byte chunks; scanning gives up once more than
/// 10000 bytes have been consumed without meeting a terminator.
///
/// Returns None on a read error, on invalid UTF-8, or when nothing at all
/// could be read and no terminator was found.
fn read_line_at(source: &dyn PdfSource, pos: u64) -> Option<String> {
    const CHUNK_LEN: usize = 256;
    const SCAN_LIMIT: u64 = 10000;

    let mut line = String::new();
    let mut consumed = 0u64;
    loop {
        let chunk = source.read_at(pos + consumed, CHUNK_LEN).ok()?;
        if chunk.is_empty() {
            break;
        }

        // CR, LF and CRLF all end the line at the first terminator byte.
        if let Some(eol) = chunk.iter().position(|&b| b == b'\r' || b == b'\n') {
            line.push_str(std::str::from_utf8(&chunk[..eol]).ok()?);
            return Some(line);
        }

        // No terminator in this chunk: keep all of it and read on.
        line.push_str(std::str::from_utf8(&chunk).ok()?);
        consumed += chunk.len() as u64;

        // Safety: don't read forever
        if consumed > SCAN_LIMIT {
            break;
        }
    }

    Some(line).filter(|text| !text.is_empty())
}
/// Read the line at `*pos` and move `*pos` past it.
///
/// The position is advanced over the line's text plus its terminator
/// (2 bytes for CRLF, 1 for a lone CR or LF, 0 if none is present).
/// `diagnostics` is accepted for signature compatibility and is not written to.
///
/// Returns None on EOF or error, in which case `*pos` is left unchanged.
fn read_line(
    source: &dyn PdfSource,
    pos: &mut u64,
    diagnostics: &mut Vec<Diag>,
) -> Option<String> {
    let line = read_line_at(source, *pos)?;
    let text_len = line.len();

    // Re-read the line plus two extra bytes to see which terminator follows.
    let window = source.read_at(*pos, text_len + 2).ok()?;
    let terminator_len: u64 = match (window.get(text_len), window.get(text_len + 1)) {
        (Some(&b'\r'), Some(&b'\n')) => 2, // CRLF
        (Some(&b'\r'), _) => 1,            // CR alone
        (Some(&b'\n'), _) => 1,            // LF alone
        _ => 0,                            // No line ending found (shouldn't happen)
    };

    *pos += text_len as u64 + terminator_len;
    Some(line)
}
/// Parse the trailer dictionary from the xref trailer section.
///
/// Starting at `*pos` (just after the `trailer` keyword), this locates the
/// first `<< ... >>` dictionary, reads its bytes and parses them with
/// `ObjectParser` to get the actual key-value pairs.
///
/// The extent scan:
/// - works across read-chunk boundaries (the dictionary may be larger than
///   one 4096-byte read, and `<<`/`>>` may straddle two reads),
/// - consumes `<<` and `>>` as two-byte tokens, so `>>>>` closes two
///   dictionaries and `<hex>>>` is a hex string followed by `>>`,
/// - skips literal strings `( ... )`, hex strings `< ... >` and `%` comments,
///   so delimiter bytes inside them do not affect nesting.
///
/// # Parameters
/// - `source`: the PDF source to read from
/// - `pos`: in: offset to start scanning; out: offset just past the closing
///   `>>` (only updated on success)
/// - `diagnostics`: sink for `XREF_TRAILER_NOT_FOUND` diagnostics
///
/// # Returns
/// The parsed dictionary, or `None` (with a diagnostic) if no complete
/// dictionary is found within roughly 100000 bytes or it fails to parse.
fn parse_trailer_dict(
    source: &dyn PdfSource,
    pos: &mut u64,
    diagnostics: &mut Vec<Diag>,
) -> Option<PdfDict> {
    const READ_CHUNK: usize = 4096;
    const MAX_SCAN: usize = 100_000;

    let dict_start_offset = *pos;

    // Accumulate the bytes after `trailer` until a complete dictionary is
    // visible. The window is bounded by MAX_SCAN (+ one chunk), so rescanning
    // it after every read stays cheap.
    let mut window: Vec<u8> = Vec::new();
    let dict_len = loop {
        let chunk = match source.read_at(dict_start_offset + window.len() as u64, READ_CHUNK) {
            Ok(bytes) => bytes,
            Err(_) => {
                diagnostics.push(Diag::with_static(
                    DiagCode::XrefTrailerNotFound,
                    dict_start_offset,
                    "I/O error reading trailer",
                ));
                return None;
            }
        };
        if chunk.is_empty() {
            // EOF before a complete dictionary was seen.
            diagnostics.push(Diag::with_static(
                DiagCode::XrefTrailerNotFound,
                dict_start_offset,
                "Trailer dictionary not found (no << >> markers)",
            ));
            return None;
        }
        window.extend_from_slice(&chunk);

        if let Some(end) = trailer_dict_extent(&window) {
            break end;
        }

        // Safety limit
        if window.len() > MAX_SCAN {
            diagnostics.push(Diag::with_static(
                DiagCode::XrefTrailerNotFound,
                dict_start_offset,
                "Trailer dictionary too large or unterminated",
            ));
            return None;
        }
    };
    let dict_end_offset = dict_start_offset + dict_len as u64;

    // Read the full dict bytes and parse them
    let dict_bytes = match source.read_at(dict_start_offset, dict_len) {
        Ok(bytes) => bytes,
        Err(_) => {
            diagnostics.push(Diag::with_static(
                DiagCode::XrefTrailerNotFound,
                dict_start_offset,
                "Failed to read trailer dictionary bytes",
            ));
            return None;
        }
    };

    // Parse the dict using ObjectParser
    let mut parser = ObjectParser::new(&dict_bytes);
    if let Some(PdfObject::Dict(dict)) = parser.parse_direct_object() {
        // Update pos to after the dict
        *pos = dict_end_offset;
        // Transfer any diagnostics from the parser
        for diag in parser.take_diagnostics() {
            diagnostics.push(Diag::with_dynamic(
                DiagCode::XrefTrailerNotFound,
                dict_start_offset,
                diag.message.into_owned(),
            ));
        }
        Some(*dict)
    } else {
        diagnostics.push(Diag::with_static(
            DiagCode::XrefTrailerNotFound,
            dict_start_offset,
            "Failed to parse trailer dictionary as a dict object",
        ));
        None
    }
}

/// Find the end of the first `<< ... >>` dictionary in `buf`.
///
/// Returns the index one past the closing `>>`, or `None` when `buf` does not
/// (yet) contain a complete dictionary.
fn trailer_dict_extent(buf: &[u8]) -> Option<usize> {
    let open = buf.windows(2).position(|pair| pair == b"<<")?;
    let mut depth = 1usize;
    let mut i = open + 2;

    while i < buf.len() {
        match buf[i] {
            // Literal string: balanced parentheses, backslash escapes one byte.
            b'(' => {
                let mut nesting = 1usize;
                i += 1;
                while i < buf.len() && nesting > 0 {
                    match buf[i] {
                        b'\\' => i += 1,
                        b'(' => nesting += 1,
                        b')' => nesting -= 1,
                        _ => {}
                    }
                    i += 1;
                }
                if nesting > 0 {
                    return None; // string continues past the window
                }
            }
            // Comment: runs to the end of the line.
            b'%' => {
                while i < buf.len() && buf[i] != b'\n' && buf[i] != b'\r' {
                    i += 1;
                }
            }
            b'<' => match buf.get(i + 1) {
                None => return None, // cannot classify without the next byte
                Some(&b'<') => {
                    depth += 1;
                    i += 2;
                }
                Some(_) => {
                    // Hex string: hex digits / whitespace up to a single '>'.
                    let body = &buf[i + 1..];
                    let hex_len = body
                        .iter()
                        .take_while(|b| b.is_ascii_hexdigit() || b.is_ascii_whitespace())
                        .count();
                    match body.get(hex_len) {
                        Some(&b'>') => i += hex_len + 2,
                        Some(_) => i += 1, // stray '<': ignore it
                        None => return None,
                    }
                }
            },
            b'>' => match buf.get(i + 1) {
                None => return None,
                Some(&b'>') => {
                    depth -= 1;
                    i += 2;
                    if depth == 0 {
                        return Some(i);
                    }
                }
                Some(_) => i += 1, // stray '>': ignore it
            },
            _ => i += 1,
        }
    }
    None
}
/// Parse a direct PDF object (for trailer dictionary parsing).
///
/// This is a stub implementation that will be completed in Phase 1.2.
/// For now, it returns null for all inputs.
///
/// Both parameters are currently ignored: nothing is read from `_source`
/// and `_pos` is not advanced.
// Kept only as a placeholder, hence the dead_code allowance.
#[allow(dead_code)]
fn parse_direct_object(_source: &dyn PdfSource, _pos: &mut u64) -> Option<PdfObject> {
// Stub: return null for now
// Full implementation will parse the actual PDF object
Some(PdfObject::Null)
}
/// Perform a forward-scan xref recovery (strategy 4 - last resort).
///
/// When all other xref strategies fail, this scans the entire file
/// looking for indirect-object header patterns (`N G obj`) and builds an xref
/// map from those discoveries.
///
/// # Parameters
/// - `source`: The PDF source to scan
/// - `is_linearized`: If true, forward scan is disabled for linearized files
///
/// # Returns
/// An `XrefSection` containing recovered entries and diagnostics.
///
/// # DISABLED CONDITIONS
/// - **Linearized files**: Would find the partial first-page xref and incorrectly
///   stop. Returns empty XrefSection with `LINEARIZED_NO_FORWARD_SCAN` diagnostic.
/// - **Remote sources**: not implemented yet (see the TODO in the body); every
///   source is currently scanned as if it were local.
///
/// # Algorithm
/// 1. Use SIMD-optimized search (via `memchr`) to find spaces followed by `obj`
/// 2. Require whitespace after the keyword
/// 3. Verify the preceding bytes match `\d+ \d+ ` and parse N (object number)
///    and G (generation number)
/// 4. Record `XrefEntry::InUse` whose offset is the first byte of the
///    `N G obj` header
/// 5. Forward-scan for the `trailer` keyword and parse the following dict
///    (for both the in-memory and the chunked path)
/// 6. Emit `XREF_REPAIRED` diagnostic with count of recovered objects
///
/// # Performance
/// - O(file_size) time complexity
/// - Files up to 1 MB are read in one piece; larger files are scanned in
///   256 KB chunks that overlap by 3 bytes so a ` obj` token cut by a chunk
///   boundary is still seen
///
/// # Multi-revision handling
/// - For each object number, the LAST occurrence in the file wins (highest offset)
pub fn forward_scan_xref(source: &dyn PdfSource, is_linearized: bool) -> XrefSection {
    let mut result = XrefSection::new();

    // Check for linearized file
    if is_linearized {
        result.diagnostics.push(Diag::with_static(
            DiagCode::XrefLinearizedNoForwardScan,
            0,
            "Forward scan disabled for linearized PDF (partial leading xref would cause false results)",
        ));
        return result;
    }

    // TODO: Check for remote source (HttpRangeSource) when implemented
    // For now, MemorySource and FileSource are both local sources
    // Once HttpRangeSource exists, add a trait method like `is_remote()` to PdfSource
    let source_len = match source.len() {
        Ok(len) if len > 0 => len,
        _ => {
            result.diagnostics.push(Diag::with_static(
                DiagCode::XrefTruncated,
                0,
                "Unable to determine source length for forward scan",
            ));
            return result;
        }
    };

    // Small files are read entirely into memory and scanned there.
    const SMALL_FILE_THRESHOLD: u64 = 1024 * 1024; // 1 MB
    if source_len <= SMALL_FILE_THRESHOLD {
        if let Ok(full_data) = source.read_at(0, source_len as usize) {
            let mut section = forward_scan_memory(&full_data, source_len);
            // The in-memory scan only collects object headers; recover the
            // trailer here so small files get one too.
            if section.trailer.is_none() {
                section.trailer = forward_scan_trailer(source);
            }
            return section;
        }
    }

    // Large file: scan in chunks using memchr for efficient space searching
    const CHUNK_SIZE: usize = 256 * 1024; // 256 KB chunks
    const OVERLAP: u64 = 3; // enough to re-see a " obj" token cut by a chunk edge

    let mut pos = 0u64;
    while pos < source_len {
        let to_read = (source_len - pos).min(CHUNK_SIZE as u64) as usize;
        let chunk = match source.read_at(pos, to_read) {
            Ok(bytes) if !bytes.is_empty() => bytes,
            _ => break, // read error or unexpected empty chunk
        };

        for space_idx in memchr_iter(b' ', &chunk) {
            // The keyword must start on the byte right after the space.
            let kw_idx = space_idx + 1;
            if !chunk[kw_idx..].starts_with(b"obj") {
                continue;
            }

            // "obj" must be followed by whitespace.
            let kw_end = kw_idx + 3;
            let has_trailing_ws = match chunk.get(kw_end) {
                Some(&next) => next == b'\n' || next == b'\r' || next == b' ' || next == b'\t',
                // Keyword ends exactly at the chunk edge: look at the file.
                None => check_trailing_whitespace(source, pos + kw_end as u64, source_len),
            };
            if !has_trailing_ws {
                continue;
            }

            // `parse_obj_header_at` reads the bytes that END at the given
            // offset and expects the last of them to be the space before
            // "obj", so it must be handed the offset of the keyword itself.
            let kw_offset = pos + kw_idx as u64;
            let header = parse_obj_header_at(source, kw_offset);
            let start = forward_scan_header_start(source, kw_offset);
            if let (Some((obj_num, gen_num)), Some(obj_offset)) = (header, start) {
                // Later occurrences overwrite earlier ones (last revision wins).
                result.entries.insert(
                    obj_num,
                    XrefEntry::InUse {
                        offset: obj_offset,
                        gen_nr: gen_num,
                    },
                );
            }
        }

        let chunk_end = pos + chunk.len() as u64;
        if chunk_end >= source_len {
            break; // last chunk processed; do not slide back again
        }
        // Slide back to catch " obj" spanning chunk boundaries, but always
        // make forward progress even after a very short read.
        pos = chunk_end.saturating_sub(OVERLAP).max(pos + 1);
    }

    // Forward-scan for the trailer dictionary
    if let Some(trailer) = forward_scan_trailer(source) {
        result.trailer = Some(trailer);
    }

    // Emit XREF_REPAIRED diagnostic with count
    result.diagnostics.push(Diag::with_dynamic(
        DiagCode::XrefRepaired,
        0,
        format!(
            "Forward scan recovered {} object entries",
            result.entries.len()
        ),
    ));
    result
}

/// Find the offset of the first byte of the `N G obj` header whose `obj`
/// keyword starts at `kw_offset`, or `None` if the preceding bytes do not
/// have the form `<digits> <digits> `.
fn forward_scan_header_start(source: &dyn PdfSource, kw_offset: u64) -> Option<u64> {
    const LOOKBACK: u64 = 30;
    let window_start = kw_offset.saturating_sub(LOOKBACK);
    let window = source
        .read_at(window_start, (kw_offset - window_start) as usize)
        .ok()?;

    let mut idx = window.len();
    // Space between the generation number and "obj".
    if idx == 0 || window[idx - 1] != b' ' {
        return None;
    }
    idx -= 1;
    // Generation number.
    let gen_end = idx;
    while idx > 0 && window[idx - 1].is_ascii_digit() {
        idx -= 1;
    }
    if idx == gen_end || idx == 0 || window[idx - 1] != b' ' {
        return None;
    }
    idx -= 1;
    // Object number.
    let num_end = idx;
    while idx > 0 && window[idx - 1].is_ascii_digit() {
        idx -= 1;
    }
    if idx == num_end {
        return None;
    }
    Some(window_start + idx as u64)
}
/// Report whether the byte at `offset` is whitespace (`\n`, `\r`, space or tab).
///
/// Used when "obj" sits right at a chunk boundary and the byte that follows
/// it has to be fetched from the source. Returns false when `offset` is at or
/// past `source_len`, or when the byte cannot be read.
fn check_trailing_whitespace(source: &dyn PdfSource, offset: u64, source_len: u64) -> bool {
    if offset >= source_len {
        return false;
    }
    source
        .read_at(offset, 1)
        .ok()
        .and_then(|bytes| bytes.first().copied())
        .map_or(false, |next| matches!(next, b'\n' | b'\r' | b' ' | b'\t'))
}
/// Forward-scan a memory buffer for xref entries.
///
/// This is a specialized version for small files that can be entirely
/// loaded into memory. Uses memchr for efficient scanning.
///
/// Every space that is immediately followed by `obj` plus whitespace (or the
/// end of the buffer) is treated as a candidate; a candidate is accepted when
/// the bytes before it form `<digits> <digits> `. Accepted headers are
/// recorded as `XrefEntry::InUse` with the offset of the header's first
/// byte; for repeated object numbers the last occurrence wins.
///
/// # Parameters
/// - `data`: the complete file contents
/// - `source_len`: length of the source; unused, `data.len()` is authoritative
///
/// # Returns
/// An `XrefSection` with the recovered entries and one `XREF_REPAIRED`
/// diagnostic. No trailer is recovered here.
fn forward_scan_memory(data: &[u8], source_len: u64) -> XrefSection {
    // Kept for signature compatibility only.
    let _ = source_len;

    let mut result = XrefSection::new();

    // Use memchr_iter for SIMD-accelerated space search
    for space_idx in memchr_iter(b' ', data) {
        // The keyword must start on the byte right after the space.
        let kw_idx = space_idx + 1;
        if !data[kw_idx..].starts_with(b"obj") {
            continue;
        }

        // Verify whitespace after "obj"; the keyword ending exactly at EOF is
        // still accepted.
        let followed_by_ws = data
            .get(kw_idx + 3)
            .map_or(true, |&next| matches!(next, b'\n' | b'\r' | b' ' | b'\t'));
        if !followed_by_ws {
            continue;
        }

        let header_start = match forward_scan_header_start_in(data, kw_idx) {
            Some(start) => start,
            None => continue,
        };

        // NOTE(review): the keyword offset is passed on the assumption that
        // `parse_obj_header_at_memory` mirrors `parse_obj_header_at`, which
        // expects the byte just before the given offset to be the space in
        // front of "obj" — confirm against its implementation.
        if let Some((obj_num, gen_num)) = parse_obj_header_at_memory(data, kw_idx as u64) {
            result.entries.insert(
                obj_num,
                XrefEntry::InUse {
                    offset: header_start as u64,
                    gen_nr: gen_num,
                },
            );
        }
    }

    // Emit XREF_REPAIRED diagnostic with count
    result.diagnostics.push(Diag::with_dynamic(
        DiagCode::XrefRepaired,
        0,
        format!(
            "Forward scan recovered {} object entries",
            result.entries.len()
        ),
    ));
    result
}

/// Find the index of the first byte of the `N G obj` header whose `obj`
/// keyword starts at `kw_idx`, or `None` if the preceding bytes do not have
/// the form `<digits> <digits> `.
fn forward_scan_header_start_in(data: &[u8], kw_idx: usize) -> Option<usize> {
    let mut idx = kw_idx.min(data.len());
    // Space between the generation number and "obj".
    if idx == 0 || data[idx - 1] != b' ' {
        return None;
    }
    idx -= 1;
    // Generation number.
    let gen_end = idx;
    while idx > 0 && data[idx - 1].is_ascii_digit() {
        idx -= 1;
    }
    if idx == gen_end || idx == 0 || data[idx - 1] != b' ' {
        return None;
    }
    idx -= 1;
    // Object number.
    let num_end = idx;
    while idx > 0 && data[idx - 1].is_ascii_digit() {
        idx -= 1;
    }
    if idx == num_end {
        return None;
    }
    Some(idx)
}
/// Parse the object and generation numbers that precede an `obj` keyword.
///
/// Reads up to 30 bytes that end immediately before `obj_offset` and matches
/// them, right to left, against `<digits> <space> <digits> <space>`.
///
/// # Parameters
/// - `source`: the PDF source to read from
/// - `obj_offset`: offset of the first byte AFTER the separator space, i.e.
///   the position where the `obj` keyword starts. The byte at
///   `obj_offset - 1` must be that space.
///
/// # Returns
/// `Some((object_number, generation_number))` when the pattern matches, the
/// numbers fit `u32` / `u16`, and the object number starts at a token
/// boundary. A token boundary is the real start of the source, or a preceding
/// byte that is ASCII whitespace, `%`, `(` or `<`. Returns `None` otherwise,
/// including when the read fails.
fn parse_obj_header_at(source: &dyn PdfSource, obj_offset: u64) -> Option<(u32, u16)> {
    // The longest header with in-range numbers, "4294967295 65535 ", is 17
    // bytes; 30 leaves slack for zero-padded numbers.
    const MAX_LOOKBACK: usize = 30;

    // Walks left from `end` over ASCII digits and returns where the run starts.
    fn digit_run_start(bytes: &[u8], end: usize) -> usize {
        let mut start = end;
        while start > 0 && bytes[start - 1].is_ascii_digit() {
            start -= 1;
        }
        start
    }

    let lookback_start = obj_offset.saturating_sub(MAX_LOOKBACK as u64);
    let lookback_len = (obj_offset - lookback_start) as usize;
    let chunk = source.read_at(lookback_start, lookback_len).ok()?;

    // The window must end with the space that separates the header from "obj".
    let separator = chunk.len().checked_sub(1)?;
    if chunk[separator] != b' ' {
        return None;
    }

    // Generation number: the digit run directly left of the separator.
    let gen_start = digit_run_start(&chunk, separator);
    if gen_start == separator {
        return None; // No digits found
    }
    let gen_num: u16 = std::str::from_utf8(&chunk[gen_start..separator])
        .ok()?
        .parse()
        .ok()?;

    // Exactly one space between the two numbers.
    if gen_start == 0 || chunk[gen_start - 1] != b' ' {
        return None;
    }
    let obj_end = gen_start - 1;

    // Object number: the digit run left of that space.
    let obj_start = digit_run_start(&chunk, obj_end);
    if obj_start == obj_end {
        return None; // No digits found
    }
    let obj_num: u32 = std::str::from_utf8(&chunk[obj_start..obj_end])
        .ok()?
        .parse()
        .ok()?;

    // The object number has to begin at a token boundary.
    match obj_start.checked_sub(1) {
        Some(prev_idx) => {
            let prev = chunk[prev_idx];
            if !prev.is_ascii_whitespace() && prev != b'%' && prev != b'(' && prev != b'<' {
                return None;
            }
        }
        // The digits run into the edge of the lookback window although the
        // source continues to the left: the boundary cannot be verified and
        // the number may be longer than what was read.
        None if lookback_start > 0 => return None,
        // Genuine start of the source.
        None => {}
    }

    Some((obj_num, gen_num))
}
/// Parse the object and generation numbers that precede an `obj` keyword,
/// working directly on a byte slice.
///
/// Slice-based counterpart of `parse_obj_header_at`. It inspects up to 30
/// bytes that end immediately before `obj_offset` and matches them, right to
/// left, against `<digits> <space> <digits> <space>`.
///
/// # Parameters
/// - `data`: the complete buffer
/// - `obj_offset`: index of the first byte AFTER the separator space, i.e.
///   where the `obj` keyword starts. `data[obj_offset - 1]` must be that space.
///
/// # Returns
/// `Some((object_number, generation_number))` when the pattern matches, the
/// numbers fit `u32` / `u16`, and the object number starts at a token
/// boundary. A token boundary is the real start of `data`, or a preceding
/// byte that is ASCII whitespace, `%`, `(` or `<`. Returns `None` otherwise,
/// including when `obj_offset` lies outside `data`.
fn parse_obj_header_at_memory(data: &[u8], obj_offset: u64) -> Option<(u32, u16)> {
    // The longest header with in-range numbers, "4294967295 65535 ", is 17
    // bytes; 30 leaves slack for zero-padded numbers.
    const MAX_LOOKBACK: usize = 30;

    // Walks left from `end` over ASCII digits and returns where the run starts.
    fn digit_run_start(bytes: &[u8], end: usize) -> usize {
        let mut start = end;
        while start > 0 && bytes[start - 1].is_ascii_digit() {
            start -= 1;
        }
        start
    }

    // Offsets that do not fit in `usize` cannot index the slice.
    if obj_offset > usize::MAX as u64 {
        return None;
    }
    let window_end = obj_offset as usize;
    let window_start = window_end.saturating_sub(MAX_LOOKBACK);
    let chunk = data.get(window_start..window_end)?;

    // The window must end with the space that separates the header from "obj".
    let separator = chunk.len().checked_sub(1)?;
    if chunk[separator] != b' ' {
        return None;
    }

    // Generation number: the digit run directly left of the separator.
    let gen_start = digit_run_start(chunk, separator);
    if gen_start == separator {
        return None; // No digits found
    }
    let gen_num: u16 = std::str::from_utf8(&chunk[gen_start..separator])
        .ok()?
        .parse()
        .ok()?;

    // Exactly one space between the two numbers.
    if gen_start == 0 || chunk[gen_start - 1] != b' ' {
        return None;
    }
    let obj_end = gen_start - 1;

    // Object number: the digit run left of that space.
    let obj_start = digit_run_start(chunk, obj_end);
    if obj_start == obj_end {
        return None; // No digits found
    }
    let obj_num: u32 = std::str::from_utf8(&chunk[obj_start..obj_end])
        .ok()?
        .parse()
        .ok()?;

    // The object number has to begin at a token boundary.
    match obj_start.checked_sub(1) {
        Some(prev_idx) => {
            let prev = chunk[prev_idx];
            if !prev.is_ascii_whitespace() && prev != b'%' && prev != b'(' && prev != b'<' {
                return None;
            }
        }
        // The digits run into the edge of the lookback window although the
        // buffer continues to the left: the boundary cannot be verified and
        // the number may be longer than what was inspected.
        None if window_start > 0 => return None,
        // Genuine start of the buffer.
        None => {}
    }

    Some((obj_num, gen_num))
}
/// Forward-scan the tail of the file for the `trailer` keyword.
///
/// The last 64 KiB of the source are read in 4 KiB chunks. Consecutive chunks
/// overlap by `len("trailer") - 1` bytes so that a keyword straddling a chunk
/// edge is still seen in full. A match counts only when it is preceded by
/// ASCII whitespace, or when it sits at the very first byte of the scanned
/// region. `trailer<<` (no space after the keyword) is matched like any other
/// occurrence because nothing is required of the byte that follows.
///
/// # Returns
/// - `Some(PdfDict)` as soon as a valid keyword is found. The dictionary that
///   follows the keyword is NOT parsed yet: the returned dictionary is always
///   empty and only signals that a trailer exists.
/// - `None` when the source length cannot be determined, a chunk read fails,
///   or no valid keyword is present in the scanned region.
fn forward_scan_trailer(source: &dyn PdfSource) -> Option<PdfDict> {
    const TRAILER_KEYWORD: &[u8] = b"trailer";
    const CHUNK_SIZE: u64 = 4096;
    const TAIL_WINDOW: u64 = 64 * 1024;
    // Bytes re-read at the start of each chunk to catch straddling matches.
    const OVERLAP: u64 = (TRAILER_KEYWORD.len() - 1) as u64;

    let source_len = source.len().ok()?;

    // The trailer is usually near the end, so only the tail is examined.
    let scan_start = source_len.saturating_sub(TAIL_WINDOW);
    let mut pos = scan_start;

    while pos < source_len {
        let to_read = (source_len - pos).min(CHUNK_SIZE) as usize;
        let chunk = source.read_at(pos, to_read).ok()?;

        // Examine every occurrence in the chunk, not just the first one: an
        // earlier occurrence may fail the boundary test while a later one passes.
        for (idx, window) in chunk.windows(TRAILER_KEYWORD.len()).enumerate() {
            if window != TRAILER_KEYWORD {
                continue;
            }

            let at_token_boundary = if idx > 0 {
                chunk[idx - 1].is_ascii_whitespace()
            } else {
                // The preceding byte is only known to be absent or irrelevant
                // at the very start of the scanned region. For later chunks the
                // overlap guarantees this occurrence was already examined, with
                // its preceding byte, in the previous chunk.
                pos == scan_start
            };

            if at_token_boundary {
                // TODO: parse the dictionary that follows the keyword with the
                // object parser; until then an empty dictionary is returned.
                return Some(PdfDict::new());
            }
        }

        let chunk_end = pos + to_read as u64;
        if chunk_end >= source_len {
            // Final chunk processed. Sliding back from here would re-read the
            // same tail bytes forever.
            break;
        }
        // A non-final chunk is always CHUNK_SIZE bytes, so this advances `pos`.
        pos = chunk_end - OVERLAP;
    }

    None
}
/// Parse a PDF 1.5+ cross-reference stream.
///
/// Xref streams are an alternative to the traditional table format that supports
/// compression and the type-2 (compressed-in-ObjStm) entry.
///
/// # Parameters
/// - `source`: The PDF source to read bytes from
/// - `stream_obj_offset`: The byte offset of the xref stream indirect object
///
/// # Returns
/// An `XrefSection` containing the parsed in-use and compressed entries and a
/// trailer dictionary built from the stream dictionary's `/Root`, `/Info`,
/// `/ID`, `/Encrypt` and `/Prev` keys. Free entries are not stored. Every
/// failure is reported through `diagnostics`; the function never panics on
/// malformed input. When a failure is detected before the trailer has been
/// built, the returned section has no trailer.
///
/// # Format
/// An xref stream is an indirect object with `/Type /XRef`:
/// ```text
/// N G obj
/// << /Type /XRef /Size N /W [type_w obj_w gen_w] /Index [first count ...] >>
/// stream
/// <compressed entry data>
/// endstream
/// endobj
/// ```
///
/// Each entry in the decompressed data has (type_w + obj_w + gen_w) bytes:
/// - Type 0 (free): field 2 = next free object number, field 3 = generation
/// - Type 1 (in-use): field 2 = byte offset, field 3 = generation
/// - Type 2 (compressed): field 2 = ObjStm object number, field 3 = index in ObjStm
///
/// # Multi-byte field encoding
/// All multi-byte fields are BIG-ENDIAN per PDF spec.
/// A zero-width type field defaults to type 1 (in-use), as the PDF spec
/// requires; zero-width second and third fields default to 0.
pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> XrefSection {
    use crate::parser::object::ObjectParser;
    use crate::parser::stream::{decode_stream, ExtractionOptions};

    let mut result = XrefSection::new();

    // Read the indirect object at the given offset.
    // NOTE(review): only the first 4096 bytes are requested; an xref stream
    // object larger than that presumably fails to parse or decodes truncated —
    // confirm against `ObjectParser` / `decode_stream`.
    let obj_bytes = match source.read_at(stream_obj_offset, 4096) {
        Ok(bytes) if !bytes.is_empty() => bytes,
        _ => {
            result.diagnostics.push(Diag::with_static(
                DiagCode::XrefInvalidStreamFormat,
                stream_obj_offset,
                "Failed to read xref stream object",
            ));
            return result;
        }
    };

    let mut parser = ObjectParser::new(&obj_bytes);
    let indirect = match parser.parse_indirect_object() {
        Some(i) => i,
        None => {
            result.diagnostics.push(Diag::with_static(
                DiagCode::XrefInvalidStreamFormat,
                stream_obj_offset,
                "Failed to parse xref stream as indirect object",
            ));
            return result;
        }
    };

    // The object must be a stream
    let stream = match indirect.obj {
        PdfObject::Stream(s) => s,
        _ => {
            result.diagnostics.push(Diag::with_static(
                DiagCode::XrefInvalidStreamFormat,
                stream_obj_offset,
                "Xref stream object is not a stream",
            ));
            return result;
        }
    };

    // A /Type other than /XRef is reported but does not stop parsing
    if let Some(PdfObject::Name(type_name)) = stream.dict.get("Type") {
        if type_name.as_ref() != "/XRef" && type_name.as_ref() != "XRef" {
            result.diagnostics.push(Diag::with_static(
                DiagCode::XrefInvalidStreamFormat,
                stream_obj_offset,
                "Stream /Type is not /XRef",
            ));
        }
    }

    // Extract /Size (total object count, required)
    let size = match stream.dict.get("Size") {
        Some(PdfObject::Integer(n)) if *n >= 0 => *n as u32,
        _ => {
            result.diagnostics.push(Diag::with_static(
                DiagCode::XrefInvalidStreamFormat,
                stream_obj_offset,
                "Missing or invalid /Size in xref stream",
            ));
            return result;
        }
    };

    // Extract /W [type_w obj_w gen_w] (required)
    let field_widths = match stream.dict.get("W") {
        Some(PdfObject::Array(arr)) => {
            let widths: Vec<i64> = arr.iter().filter_map(|o| o.as_int()).collect();
            if widths.len() != 3 {
                result.diagnostics.push(Diag::with_dynamic(
                    DiagCode::XrefInvalidStreamFormat,
                    stream_obj_offset,
                    format!("/W array must have 3 elements, got {}", widths.len()),
                ));
                return result;
            }
            // Widths can be 0, but negative is invalid
            if widths.iter().any(|&w| w < 0) {
                result.diagnostics.push(Diag::with_static(
                    DiagCode::XrefInvalidStreamFormat,
                    stream_obj_offset,
                    "/W array contains negative values",
                ));
                return result;
            }
            widths
        }
        _ => {
            result.diagnostics.push(Diag::with_static(
                DiagCode::XrefInvalidStreamFormat,
                stream_obj_offset,
                "Missing or invalid /W in xref stream",
            ));
            return result;
        }
    };
    let type_w = field_widths[0] as usize;
    let obj_w = field_widths[1] as usize;
    let gen_w = field_widths[2] as usize;

    // Attacker-controlled widths must not overflow the stride, and a zero
    // stride would make the entry loop spin over up to `u32::MAX` entries
    // without consuming any data.
    let entry_stride = match type_w
        .checked_add(obj_w)
        .and_then(|sum| sum.checked_add(gen_w))
    {
        Some(stride) if stride > 0 => stride,
        _ => {
            result.diagnostics.push(Diag::with_static(
                DiagCode::XrefInvalidStreamFormat,
                stream_obj_offset,
                "/W field widths sum to zero or overflow",
            ));
            return result;
        }
    };

    // Extract /Index [first_1 count_1 first_2 count_2 ...] (optional)
    // Default is [0 size] if absent
    let subsections = match stream.dict.get("Index") {
        Some(PdfObject::Array(arr)) => {
            let mut pairs = Vec::new();
            let mut iter = arr.iter().peekable();
            while let Some(first_obj) = iter.next() {
                let first = match first_obj.as_int() {
                    Some(n) if n >= 0 => n as u32,
                    _ => {
                        result.diagnostics.push(Diag::with_static(
                            DiagCode::XrefInvalidStreamFormat,
                            stream_obj_offset,
                            "Invalid /Index first value",
                        ));
                        return result;
                    }
                };
                // A trailing `first` without a matching count is rejected here
                let count = match iter.peek() {
                    Some(PdfObject::Integer(n)) if *n >= 0 => *n as u32,
                    _ => {
                        result.diagnostics.push(Diag::with_static(
                            DiagCode::XrefInvalidStreamFormat,
                            stream_obj_offset,
                            "Invalid /Index count value",
                        ));
                        return result;
                    }
                };
                let _ = iter.next(); // consume count
                pairs.push((first, count));
            }
            if pairs.is_empty() {
                result.diagnostics.push(Diag::with_static(
                    DiagCode::XrefInvalidStreamFormat,
                    stream_obj_offset,
                    "/Index array is empty",
                ));
                return result;
            }
            pairs
        }
        None => vec![(0, size)],
        _ => {
            result.diagnostics.push(Diag::with_static(
                DiagCode::XrefInvalidStreamFormat,
                stream_obj_offset,
                "Invalid /Index in xref stream (not an array)",
            ));
            return result;
        }
    };

    // The trailer dict is the stream's dict itself (minus xref-specific keys)
    // Copy relevant trailer keys: /Root, /Info, /ID, /Encrypt, /Prev
    let mut trailer = PdfDict::new();
    for (key, value) in &stream.dict {
        let key_str = key.as_ref();
        if matches!(key_str, "Root" | "Info" | "ID" | "Encrypt" | "Prev") {
            trailer.insert(key.clone(), value.clone());
        }
    }
    result.trailer = Some(trailer);

    // Decompress the stream body
    // The stream's offset is relative to obj_bytes, so we create a MemorySource
    // from those bytes to decode the stream data correctly.
    use crate::parser::stream::MemorySource;
    let local_source = MemorySource::new(obj_bytes);
    let decoded = decode_stream(
        &stream,
        &local_source,
        &ExtractionOptions::default(),
        &mut 0,
    );
    if decoded.is_empty() {
        // Empty output is treated as a decode failure; a legitimately empty
        // xref stream would be unusual.
        result.diagnostics.push(Diag::with_static(
            DiagCode::StreamDecodeError,
            stream_obj_offset,
            "Xref stream decompression produced empty output",
        ));
        return result;
    }

    // Parse entries from decompressed data
    // Each subsection has (count) entries of (entry_stride) bytes
    let mut data_pos = 0usize;
    'subsections: for (subsection_first, subsection_count) in subsections {
        for i in 0..subsection_count {
            let obj_nr = subsection_first.saturating_add(i);

            // `data_pos` never exceeds `decoded.len()`, so the subtraction is safe.
            if decoded.len() - data_pos < entry_stride {
                result.diagnostics.push(Diag::with_dynamic(
                    DiagCode::XrefInvalidStreamEntry,
                    stream_obj_offset,
                    format!("Xref stream truncated at object {}", obj_nr),
                ));
                // Entries are laid out back to back, so nothing after this
                // point can be decoded either.
                break 'subsections;
            }
            let entry_data = &decoded[data_pos..data_pos + entry_stride];
            let (type_bytes, rest) = entry_data.split_at(type_w);
            let (second_bytes, third_bytes) = rest.split_at(obj_w);

            // Parse the entry fields (big-endian)
            let entry_type = if type_w > 0 {
                read_big_endian_field(type_bytes)
            } else {
                1 // An absent type field defaults to type 1 (in-use) per PDF spec
            };
            let second_field = if obj_w > 0 {
                read_big_endian_field(second_bytes)
            } else {
                0
            };
            // Kept at full width: for type 2 it is an index that may exceed
            // the 16-bit generation range.
            let third_field = if gen_w > 0 {
                read_big_endian_field(third_bytes)
            } else {
                0
            };

            // Dispatch on entry type
            let entry = match entry_type {
                0 => {
                    // Type 0: free entry
                    // second = next free object number, third = generation
                    XrefEntry::Free {
                        next_free: second_field as u32,
                        gen_nr: third_field as u16,
                    }
                }
                1 => {
                    // Type 1: in-use, uncompressed
                    // second = byte offset, third = generation
                    XrefEntry::InUse {
                        offset: second_field,
                        gen_nr: third_field as u16,
                    }
                }
                2 => {
                    // Type 2: compressed in ObjStm
                    // second = host ObjStm object number, third = index in ObjStm
                    XrefEntry::Compressed {
                        obj_stm_nr: second_field as u32,
                        index: third_field as u32,
                    }
                }
                _ => {
                    // Unknown type - emit diagnostic and treat as free
                    result.diagnostics.push(Diag::with_dynamic(
                        DiagCode::XrefInvalidStreamEntry,
                        stream_obj_offset,
                        format!("Invalid xref entry type {} for object {}", entry_type, obj_nr),
                    ));
                    XrefEntry::Free {
                        next_free: 0,
                        gen_nr: 0,
                    }
                }
            };

            // Only add in-use and compressed entries to the result
            // Free entries are ignored per pdftract spec
            if matches!(entry, XrefEntry::InUse { .. } | XrefEntry::Compressed { .. }) {
                result.add_entry(obj_nr, entry);
            }
            data_pos += entry_stride;
        }
    }

    result
}
/// Decode a variable-width big-endian unsigned integer.
///
/// The first byte of `bytes` is the most significant one. Slices of up to
/// eight bytes are decoded; an empty slice yields 0. A slice longer than
/// eight bytes does not fit in a `u64` and also yields 0 instead of a
/// truncated value.
fn read_big_endian_field(bytes: &[u8]) -> u64 {
    if bytes.len() > 8 {
        // Does not fit in 64 bits; report 0 rather than dropping high bytes.
        return 0;
    }

    // With at most eight bytes the shift never discards a set bit.
    bytes
        .iter()
        .fold(0u64, |acc, &byte| (acc << 8) | u64::from(byte))
}
// ============================================================================
// Linearized PDF Detection and Xref Merging
// ============================================================================
/// Information about a linearized PDF file.
///
/// Linearized PDFs (PDF 1.2+ "Optimized for Web View") have a special structure
/// with TWO xref tables: one at the beginning (covering only the first page)
/// and one at the end (the complete xref). This struct captures the metadata
/// needed to load and merge both xrefs.
///
/// Values are produced by `detect_linearization` from the keys of the
/// linearization dictionary and consumed by `load_xref_linearized`.
#[derive(Debug, Clone, PartialEq)]
pub struct LinearizationInfo {
/// Total file length in bytes, from the /L entry. Detection only succeeds
/// when this equals the actual length of the source.
pub file_length: u64,
/// Offset taken from the /T entry; `load_xref_linearized` parses an xref
/// section at this offset and treats it as the first-page xref.
// NOTE(review): PDF spec Annex F appears to describe /T as pointing at the
// first entry of the MAIN cross-reference table rather than at the
// first-page xref — confirm that field name and usage match the spec.
pub first_page_xref_offset: u64,
/// Offset of the hint stream: first number of the /H entry. `None` when no
/// usable /H entry was found.
pub hint_stream_offset: Option<u64>,
/// Length of the hint stream: second number of the /H entry. `None` when no
/// usable /H entry was found.
pub hint_stream_length: Option<u64>,
/// Number of pages in the document, from /N
pub page_count: u32,
/// Offset of the end of the first page, from /E
pub first_page_end_offset: u64,
/// The object number of the first page, from /O
pub first_page_object_number: u32,
}
/// Detect if a PDF is linearized and extract the linearization dictionary info.
///
/// Linearized PDFs have a special object as the first indirect object in the file
/// (right after the `%PDF-X.Y` header). This object is a dictionary with the
/// `/Linearized` key.
///
/// # Parameters
/// - `source`: The PDF source to read from
///
/// # Returns
/// - `Some(LinearizationInfo)` if the file is linearized and valid
/// - `None` if the file is not linearized or the linearization dict is invalid
///
/// # Algorithm
/// 1. Read the first 2 KB of the file and decode it lossily, so the binary
///    comment line that commonly follows the header does not abort detection
/// 2. Skip the `%PDF-X.Y` header line
/// 3. Locate the first ` obj` keyword; the two tokens directly before it must
///    be an object number and a generation number
/// 4. Require `<<` to follow the keyword and take the text up to the first
///    `>>` as the dictionary (a linearization dictionary has no nested dicts)
/// 5. Require `/Linearized`, then extract /L, /T, /N, /E, /O as non-negative
///    numbers and the optional /H pair
/// 6. Validate that /L matches the actual file size
///
/// A missing or malformed /H entry leaves both hint fields `None` and does
/// not reject the file.
///
/// # References
/// - PDF spec Annex F (Linearized PDF)
/// - Plan section: Phase 1.3 line 1113
pub fn detect_linearization(source: &dyn PdfSource) -> Option<LinearizationInfo> {
    const OBJ_KEYWORD: &str = " obj";

    // Returns the text that follows the first standalone occurrence of `key`.
    // An occurrence directly followed by a letter or digit is a longer name
    // (e.g. "/L" inside "/Linearized") and is skipped.
    fn value_after_key<'a>(dict: &'a str, key: &str) -> Option<&'a str> {
        let mut search_start = 0;
        while let Some(rel) = dict[search_start..].find(key) {
            let value_start = search_start + rel + key.len();
            let after_key = &dict[value_start..];
            match after_key.chars().next() {
                Some(c) if c.is_alphanumeric() => search_start = value_start,
                _ => return Some(after_key),
            }
        }
        None
    }

    // Parses the numeric token at the start of `text` (after whitespace). The
    // token ends at the first character that cannot be part of a PDF number,
    // so "1234/T" and "1234]" still yield 1234.
    fn leading_number(text: &str) -> Option<f64> {
        let text = text.trim_start();
        let end = text
            .find(|c: char| !(c.is_ascii_digit() || c == '+' || c == '-' || c == '.'))
            .unwrap_or(text.len());
        text[..end].parse::<f64>().ok()
    }

    // Value of `key` as a non-negative integer. Reals such as "1.0" are
    // accepted and truncated; negative or non-finite values are rejected.
    fn non_negative_number(dict: &str, key: &str) -> Option<u64> {
        let value = leading_number(value_after_key(dict, key)?)?;
        if value.is_finite() && value >= 0.0 {
            Some(value as u64)
        } else {
            None
        }
    }

    // Narrows to `u32`, rejecting values that do not fit.
    fn to_u32(value: u64) -> Option<u32> {
        if value <= u64::from(u32::MAX) {
            Some(value as u32)
        } else {
            None
        }
    }

    // Parses the /H value: `[offset length ...]`, or two bare numbers.
    fn hint_pair(after_h: &str) -> Option<(u64, u64)> {
        let body = match after_h.find('[') {
            Some(open) => {
                let inner = &after_h[open + 1..];
                &inner[..inner.find(']')?]
            }
            None => after_h,
        };
        let mut numbers = body.split_whitespace();
        let offset = numbers.next()?.parse::<u64>().ok()?;
        let length = numbers.next()?.parse::<u64>().ok()?;
        Some((offset, length))
    }

    // Read the first 2 KB to find the linearization dict. The bytes are
    // decoded lossily: only ASCII tokens are inspected and no position in this
    // text is ever used as a file offset.
    let header_bytes = source.read_at(0, 2048).ok()?;
    let header_text = String::from_utf8_lossy(&header_bytes);

    // Skip the PDF header line (e.g., "%PDF-1.4\n"); LF, CR and CRLF all work.
    let header_end = header_text.find(|c: char| c == '\n' || c == '\r')?;
    let after_header = &header_text[header_end + 1..];

    // First indirect object declaration (e.g., "1 0 obj"). Comment lines may
    // sit between the header and the object, so only the two tokens directly
    // before the keyword are taken as object and generation number.
    let obj_pos = after_header.find(OBJ_KEYWORD)?;
    let mut header_tokens = after_header[..obj_pos].split_whitespace().rev();
    let _gen_num: u16 = header_tokens.next()?.parse().ok()?;
    let _obj_num: u32 = header_tokens.next()?.parse().ok()?;

    // The object must be a dictionary; bound it at the first ">>" so keys of
    // later objects in the 2 KB window are never picked up.
    let dict_body = after_header[obj_pos + OBJ_KEYWORD.len()..].trim_start();
    if !dict_body.starts_with("<<") {
        return None;
    }
    let dict_section = match dict_body.find(">>") {
        Some(end) => &dict_body[..end],
        None => dict_body,
    };

    // The dict must carry the /Linearized key
    if value_after_key(dict_section, "/Linearized").is_none() {
        return None;
    }

    // Extract required fields
    let file_length = non_negative_number(dict_section, "/L")?;
    let first_page_xref_offset = non_negative_number(dict_section, "/T")?;
    let page_count = to_u32(non_negative_number(dict_section, "/N")?)?;
    let first_page_end_offset = non_negative_number(dict_section, "/E")?;
    let first_page_object_number = to_u32(non_negative_number(dict_section, "/O")?)?;

    // Extract optional /H entry (array of two numbers: [offset length])
    let (hint_stream_offset, hint_stream_length) =
        match value_after_key(dict_section, "/H").and_then(hint_pair) {
            Some((offset, length)) => (Some(offset), Some(length)),
            None => (None, None),
        };

    // Validate that /L matches the actual file size
    let actual_file_length = source.len().ok()?;
    if file_length != actual_file_length {
        // File was modified after linearization (incremental update)
        // Linearization is invalid, fall through to non-linearized path
        return None;
    }

    Some(LinearizationInfo {
        file_length,
        first_page_xref_offset,
        hint_stream_offset,
        hint_stream_length,
        page_count,
        first_page_end_offset,
        first_page_object_number,
    })
}
/// Combine the first-page xref and the full xref of a linearized PDF.
///
/// A linearized file carries two cross-reference sections:
/// - the first-page xref, covering only what the first page needs
/// - the full xref, covering the whole document
///
/// The full xref is authoritative: whenever both sections contain the same
/// object number, the entry from `full_xref` replaces the one from
/// `first_page_xref`. Object numbers present in only one section are kept
/// as they are.
///
/// # Parameters
/// - `first_page_xref`: section loaded from the first-page xref
/// - `full_xref`: section loaded from the EOF `startxref`
///
/// # Returns
/// A new `XrefSection` with
/// - the union of both entry sets, `full_xref` winning every conflict
/// - the trailer of `full_xref`
/// - the diagnostics of `first_page_xref` followed by those of `full_xref`
///
/// `is_hybrid` is left at the value `XrefSection::new()` gives it;
/// linearization and hybrid-reference files are unrelated concepts.
///
/// # References
/// - Plan section: Phase 1.3 line 1113
pub fn merge_linearized_xrefs(first_page_xref: XrefSection, full_xref: XrefSection) -> XrefSection {
    let mut merged = XrefSection::new();

    // First-page entries form the base layer; extending with the full xref
    // replaces the value of every object number that already exists.
    merged.entries = first_page_xref.entries;
    merged.entries.extend(full_xref.entries);

    // The full xref's trailer is the authoritative one.
    merged.trailer = full_xref.trailer;

    // First-page diagnostics come first, then the full xref's.
    merged.diagnostics = first_page_xref.diagnostics;
    merged.diagnostics.extend(full_xref.diagnostics);

    merged
}
/// Load and merge both cross-reference sections of a linearized PDF.
///
/// Two sections are loaded with `load_single_xref`, first the one at
/// `lin_info.first_page_xref_offset`, then the one at `startxref_offset`.
/// They are combined by `merge_linearized_xrefs`, in which the section
/// loaded from `startxref_offset` takes precedence.
///
/// # Parameters
/// - `source`: The PDF source to read from
/// - `lin_info`: Linearization info from `detect_linearization`
/// - `startxref_offset`: The offset of the full xref (from EOF startxref)
///
/// # Returns
/// The merged `XrefSection`. Parsing problems in either section surface as
/// diagnostics on the result rather than as an error.
///
/// # References
/// - Plan section: Phase 1.3 line 1113
pub fn load_xref_linearized(
    source: &dyn PdfSource,
    lin_info: &LinearizationInfo,
    startxref_offset: u64,
) -> XrefSection {
    // Arguments are evaluated left to right: first-page xref, then full xref.
    merge_linearized_xrefs(
        load_single_xref(source, lin_info.first_page_xref_offset),
        load_single_xref(source, startxref_offset),
    )
}
/// Load one cross-reference section located at `offset`.
///
/// The traditional table parser always runs first. Its result decides which
/// of three shapes the section has:
/// 1. Hybrid: the traditional trailer is recognised by `is_hybrid_trailer`
///    and holds a non-negative integer `/XRefStm`; the xref stream at that
///    offset is parsed and merged via `merge_hybrid`.
/// 2. Pure traditional: the traditional parse produced at least one entry or
///    a trailer; it is returned unchanged.
/// 3. Pure stream: the traditional parse produced nothing; `offset` is parsed
///    as an xref stream object instead.
fn load_single_xref(source: &dyn PdfSource, offset: u64) -> XrefSection {
    let traditional = parse_traditional_xref(source, offset);

    if is_hybrid_trailer(traditional.trailer.as_ref()) {
        // Only a non-negative integer is a usable /XRefStm offset.
        let xrefstm_offset = match traditional
            .trailer
            .as_ref()
            .and_then(|trailer| trailer.get("XRefStm"))
        {
            Some(PdfObject::Integer(n)) if *n >= 0 => Some(*n as u64),
            _ => None,
        };

        if let Some(stream_offset) = xrefstm_offset {
            let supplementary = parse_xref_stream(source, stream_offset);
            return merge_hybrid(traditional, supplementary);
        }
        // Unusable /XRefStm: continue with the traditional result alone.
    }

    let traditional_found = !traditional.entries.is_empty() || traditional.trailer.is_some();
    if traditional_found {
        traditional
    } else {
        // For xref streams, `offset` points at the indirect object that
        // contains the stream.
        parse_xref_stream(source, offset)
    }
}
/// Upper bound on the number of `/Prev` links that are followed.
///
/// Incremental updates chain xref sections together through `/Prev`. The
/// traversal is recursive, so this cap keeps adversarial inputs from driving
/// the recursion arbitrarily deep.
const MAX_PREV_DEPTH: u32 = 32;

/// Load the xref at `start_offset` and every older revision reachable via `/Prev`.
///
/// Each incremental save appends a new xref section whose trailer points at
/// the previous one through `/Prev`. This function follows those links and
/// folds all revisions into a single section.
///
/// # Parameters
/// - `source`: PDF data source
/// - `start_offset`: offset of the newest xref section (typically `startxref`)
///
/// # Returns
/// A merged `XrefSection` in which
/// - entries of every visited revision are present
/// - for an object number found in several revisions, the newest entry wins
/// - the trailer is the one of the newest revision
/// - `is_hybrid` is set when any visited revision is hybrid
///
/// # Chain traversal
/// 1. Load the section at the current offset with `load_single_xref`
/// 2. Read `/Prev` from its trailer; only a positive integer counts
/// 3. Recurse into the older revision, then overlay the current entries
/// 4. Stop at the revision whose trailer has no usable `/Prev`
///
/// # Error handling
/// - `/Prev` of 0, negative, or not an integer: treated as "no previous revision"
/// - `/Prev` greater than the file size: `StructInvalidPrevOffset` is emitted,
///   the key is removed from the trailer, and the chain ends
/// - file size unavailable: `StructInvalidPrevOffset` is emitted and the chain ends
/// - offset already visited: `StructCircularRef` is emitted and the chain ends
/// - more than 32 revisions: `StructDepthExceeded` is emitted and the chain ends
///
/// # Example
/// ```rust,ignore
/// let merged = load_xref_with_prev_chain(&source, startxref_offset);
/// // merged.entries contains the objects of every revision
/// // merged.trailer is the trailer of the latest revision
/// ```
///
/// # References
/// - Plan section: Phase 1.3 line 1093 (/Prev chain)
/// - PDF spec 7.5.6 (Incremental Updates)
pub fn load_xref_with_prev_chain(source: &dyn PdfSource, start_offset: u64) -> XrefSection {
    // Recursive worker. `visited` and `diagnostics` are shared by all levels;
    // whichever level returns a real section drains the pending diagnostics
    // into it.
    fn walk_chain(
        source: &dyn PdfSource,
        offset: u64,
        visited: &mut HashSet<u64>,
        depth: u32,
        diagnostics: &mut Vec<Diag>,
    ) -> XrefSection {
        // `insert` reports false when the offset was already recorded: a cycle.
        if !visited.insert(offset) {
            diagnostics.push(Diag::with_static(
                DiagCode::StructCircularRef,
                offset,
                "Circular /Prev reference detected; stopping chain traversal",
            ));
            return XrefSection::new();
        }

        if depth >= MAX_PREV_DEPTH {
            diagnostics.push(Diag::with_dynamic(
                DiagCode::StructDepthExceeded,
                offset,
                format!("/Prev chain depth exceeded maximum of {}", MAX_PREV_DEPTH).into(),
            ));
            return XrefSection::new();
        }

        let mut current = load_single_xref(source, offset);

        // Only a strictly positive integer /Prev names an older revision.
        let prev_offset = match current
            .trailer
            .as_ref()
            .and_then(|trailer| trailer.get("Prev"))
        {
            Some(PdfObject::Integer(n)) if *n > 0 => Some(*n as u64),
            _ => None,
        };

        let prev = match prev_offset {
            Some(prev) => prev,
            None => {
                // Baseline revision: nothing older to merge.
                current.diagnostics.extend(diagnostics.drain(..));
                return current;
            }
        };

        let file_size = match source.len() {
            Ok(file_size) => file_size,
            Err(_) => {
                // Without a known size the offset cannot be validated, so it
                // is not followed.
                diagnostics.push(Diag::with_static(
                    DiagCode::StructInvalidPrevOffset,
                    offset,
                    "Cannot determine file size; ignoring /Prev key",
                ));
                current.diagnostics.extend(diagnostics.drain(..));
                return current;
            }
        };

        if prev > file_size {
            diagnostics.push(Diag::with_dynamic(
                DiagCode::StructInvalidPrevOffset,
                offset,
                format!("/Prev offset {} exceeds file size {}; ignoring /Prev key", prev, file_size).into(),
            ));
            // Drop the bogus key so later consumers do not follow it either.
            if let Some(trailer) = current.trailer.as_mut() {
                trailer.shift_remove("Prev");
            }
            current.diagnostics.extend(diagnostics.drain(..));
            return current;
        }

        // Older revisions form the base; the current revision overrides them.
        let mut merged = walk_chain(source, prev, visited, depth + 1, diagnostics);
        merged.entries.extend(current.entries);
        merged.trailer = current.trailer;
        merged.diagnostics.extend(current.diagnostics);
        merged.is_hybrid |= current.is_hybrid;
        merged.diagnostics.extend(diagnostics.drain(..));
        merged
    }

    let mut visited = HashSet::new();
    let mut diagnostics = Vec::new();
    walk_chain(source, start_offset, &mut visited, 0, &mut diagnostics)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_obj_ref() {
let obj_ref = ObjRef::new(1, 0);
assert_eq!(obj_ref.object, 1);
assert_eq!(obj_ref.generation, 0);
}
#[test]
fn test_xref_resolver_new() {
    // A resolver built with `new` starts out with no entries.
    let fresh = XrefResolver::new();
    assert_eq!(fresh.len(), 0);
    assert!(fresh.is_empty());
}
#[test]
fn test_add_entry() {
    // Adding a single entry bumps the reported length to one.
    let in_use = XrefEntry::InUse { offset: 100, gen_nr: 0 };
    let mut resolver = XrefResolver::new();
    resolver.add_entry(1, in_use);
    assert_eq!(resolver.len(), 1);
}
#[test]
fn test_get_entry() {
    // An entry stored under object number 1 must be returned unchanged.
    let expected = XrefEntry::InUse { offset: 100, gen_nr: 0 };
    let mut resolver = XrefResolver::new();
    resolver.add_entry(1, expected.clone());

    let found = resolver.get_entry(1);
    assert_eq!(found, Some(&expected));
}
#[test]
fn test_circular_ref_detection() {
    // Walk one reference through the claim / re-claim / release / claim cycle.
    let tracker = XrefResolver::new();
    let target = ObjRef::new(1, 0);

    // The first claim succeeds and is observable.
    let first_claim = tracker.start_resolving(target);
    assert!(first_claim);
    assert!(tracker.is_resolving(target));

    // A second claim while the first is outstanding is refused.
    let nested_claim = tracker.start_resolving(target);
    assert!(!nested_claim);

    // Releasing clears the mark, after which the reference can be claimed again.
    tracker.finish_resolving(target);
    assert!(!tracker.is_resolving(target));
    assert!(tracker.start_resolving(target));
}
#[test]
fn test_resolve_not_found() {
    // Resolving a reference that was never registered reports `NotFound`.
    let empty = XrefResolver::new();
    let outcome = empty.resolve(ObjRef::new(999, 0));
    assert!(matches!(outcome, Err(ResolveError::NotFound(_))));
}
#[test]
fn test_cache_object() {
    // An object placed in the cache is what `resolve` hands back,
    // even though no xref entry exists for it.
    let key = ObjRef::new(1, 0);
    let resolver = XrefResolver::new();
    resolver.cache_object(key, PdfObject::Integer(42));

    let hit = resolver.resolve(key).unwrap();
    assert!(matches!(hit, PdfObject::Integer(42)));
}
// Traditional xref parsing tests
#[test]
fn test_xref_section_new() {
    // A freshly constructed section has no entries, trailer, or diagnostics.
    let fresh = XrefSection::new();
    assert_eq!(fresh.len(), 0);
    assert!(fresh.is_empty());
    assert!(fresh.diagnostics.is_empty());
    assert!(fresh.trailer.is_none());
}
#[test]
fn test_xref_section_add_entry() {
    // An added entry is visible both through `len` and the `entries` map.
    let in_use = XrefEntry::InUse { offset: 100, gen_nr: 0 };
    let mut section = XrefSection::new();
    section.add_entry(1, in_use);

    assert!(section.entries.contains_key(&1));
    assert_eq!(section.len(), 1);
}
#[test]
fn test_xref_section_default() {
    // `Default` yields the same empty state that `new` is tested for.
    let blank = XrefSection::default();
    assert!(blank.diagnostics.is_empty());
    assert!(blank.trailer.is_none());
    assert!(blank.is_empty());
}
#[test]
fn test_xref_entry_in_use() {
    // The in-use variant keeps the offset and generation it was built with.
    let in_use = XrefEntry::InUse { offset: 1000, gen_nr: 5 };
    let shape_ok = matches!(in_use, XrefEntry::InUse { offset: 1000, gen_nr: 5 });
    assert!(shape_ok);
}
#[test]
fn test_xref_entry_free() {
    // The free variant keeps its next-free link and generation.
    let free = XrefEntry::Free { next_free: 42, gen_nr: 1 };
    let shape_ok = matches!(free, XrefEntry::Free { next_free: 42, gen_nr: 1 });
    assert!(shape_ok);
}
#[test]
fn test_xref_entry_compressed() {
    // The compressed variant keeps its object-stream number and index.
    let compressed = XrefEntry::Compressed { obj_stm_nr: 10, index: 5 };
    let shape_ok = matches!(compressed, XrefEntry::Compressed { obj_stm_nr: 10, index: 5 });
    assert!(shape_ok);
}
#[test]
fn test_xref_resolver_from_section() {
    // Every entry of the source section must be reachable through the resolver.
    let mut section = XrefSection::new();
    for (obj_nr, offset) in [(1, 100), (2, 200)] {
        section.add_entry(obj_nr, XrefEntry::InUse { offset, gen_nr: 0 });
    }

    let resolver = XrefResolver::from_section(section);
    assert_eq!(resolver.len(), 2);
    for (obj_nr, offset) in [(1, 100), (2, 200)] {
        let expected = XrefEntry::InUse { offset, gen_nr: 0 };
        assert_eq!(resolver.get_entry(obj_nr), Some(&expected));
    }
}
#[test]
fn test_xref_diagnostic_static() {
    // A diagnostic built from a static message exposes code, offset, and text.
    let diag = Diag::with_static(DiagCode::XrefInvalidHeader, 100, "test message");

    assert!(matches!(diag.code, DiagCode::XrefInvalidHeader));
    assert_eq!(diag.byte_offset, Some(100));
    assert_eq!(diag.message.as_ref(), "test message");
}
#[test]
fn test_xref_diagnostic_dynamic() {
    // A diagnostic built from an owned message exposes code, offset, and text.
    let text = String::from("dynamic message");
    let diag = Diag::with_dynamic(DiagCode::XrefInvalidEntry, 200, text);

    assert!(matches!(diag.code, DiagCode::XrefInvalidEntry));
    assert_eq!(diag.byte_offset, Some(200));
    assert_eq!(diag.message.as_ref(), "dynamic message");
}
#[test]
fn test_parse_simple_xref_space_newline() {
    // Well-formed table: one subsection, six 20-byte entries ending in " \n".
    let xref_data = b"xref\n0 6\n\
        0000000000 65535 f \n\
        0000000017 00000 n \n\
        0000000081 00000 n \n\
        0000000000 00007 f \n\
        0000000331 00000 n \n\
        0000000409 00000 n \n\
        trailer\n<< /Size 6 >>\n";
    let source = MemorySource::new(xref_data.to_vec());
    let result = parse_traditional_xref(&source, 0);

    // Objects 0-5 are all recorded; the free ones (0 and 3) are kept too.
    let expected = [
        (0, XrefEntry::Free { next_free: 0, gen_nr: 65535 }),
        (1, XrefEntry::InUse { offset: 17, gen_nr: 0 }),
        (2, XrefEntry::InUse { offset: 81, gen_nr: 0 }),
        (3, XrefEntry::Free { next_free: 0, gen_nr: 7 }),
        (4, XrefEntry::InUse { offset: 331, gen_nr: 0 }),
        (5, XrefEntry::InUse { offset: 409, gen_nr: 0 }),
    ];
    assert_eq!(result.len(), 6);
    for (obj_nr, entry) in &expected {
        assert_eq!(result.entries.get(obj_nr), Some(entry));
    }

    // The trailer dictionary following the table must have been picked up.
    assert!(result.trailer.is_some());
}
#[test]
fn test_parse_xref_carriage_return_newline() {
    // Same layout as the simple case, but every line ends in "\r\n"
    // (entries are still 20 bytes: 18 bytes of fields plus CR LF).
    let xref_data = b"xref\r\n0 3\r\n\
        0000000000 65535 f\r\n\
        0000000015 00000 n\r\n\
        0000000078 00000 n\r\n\
        trailer\r\n<< /Size 3 >>\r\n";
    let source = MemorySource::new(xref_data.to_vec());
    let result = parse_traditional_xref(&source, 0);

    // Objects 0-2 are all recorded, including the free entry for object 0.
    let expected = [
        (0, XrefEntry::Free { next_free: 0, gen_nr: 65535 }),
        (1, XrefEntry::InUse { offset: 15, gen_nr: 0 }),
        (2, XrefEntry::InUse { offset: 78, gen_nr: 0 }),
    ];
    assert_eq!(result.len(), 3);
    for (obj_nr, entry) in &expected {
        assert_eq!(result.entries.get(obj_nr), Some(entry));
    }
}
#[test]
fn test_parse_xref_lf_only_19_byte_entries() {
    // Entries terminated by a bare "\n" are only 19 bytes long (buggy producer);
    // the parser is expected to cope with the shorter stride.
    let xref_data = b"xref\n0 3\n\
        0000000000 65535 f\n\
        0000000015 00000 n\n\
        0000000078 00000 n\n\
        trailer\n<< /Size 3 >>\n";
    let source = MemorySource::new(xref_data.to_vec());
    let result = parse_traditional_xref(&source, 0);

    // All three objects (0-2) are recorded, including the free entry for
    // object 0. A leftover `assert_eq!(result.len(), 2)` contradicted this
    // count and made the test impossible to pass; it has been removed.
    assert_eq!(result.len(), 3);
    assert_eq!(result.entries.get(&0), Some(&XrefEntry::Free { next_free: 0, gen_nr: 65535 }));
    assert_eq!(result.entries.get(&1), Some(&XrefEntry::InUse { offset: 15, gen_nr: 0 }));
    assert_eq!(result.entries.get(&2), Some(&XrefEntry::InUse { offset: 78, gen_nr: 0 }));
}
#[test]
fn test_parse_multi_subsection_xref() {
    // Two subsections: "0 3" (objects 0-2) and "100 2" (objects 100-101).
    let xref_data = b"xref\n0 3\n\
        0000000000 65535 f \n\
        0000000015 00000 n \n\
        0000000078 00000 n \n\
        100 2\n\
        0000000200 00000 n \n\
        0000000300 00000 n \n\
        trailer\n<< /Size 102 >>\n";
    let source = MemorySource::new(xref_data.to_vec());
    let result = parse_traditional_xref(&source, 0);

    // Five entries in total: the free entry for object 0 plus the four in-use
    // objects. The sibling traditional-xref tests all count free entries, so
    // the previous expectation of 4 was inconsistent with them.
    assert_eq!(result.len(), 5);
    for obj_nr in [0, 1, 2, 100, 101] {
        assert!(result.entries.contains_key(&obj_nr));
    }

    // Entries of the second subsection are numbered from its start object.
    assert_eq!(result.entries.get(&100), Some(&XrefEntry::InUse { offset: 200, gen_nr: 0 }));
    assert_eq!(result.entries.get(&101), Some(&XrefEntry::InUse { offset: 300, gen_nr: 0 }));
}
#[test]
fn test_parse_xref_with_malformed_entry() {
    // The third entry (object 2) is garbage; its neighbours are well-formed.
    let xref_data = b"xref\n0 4\n\
        0000000000 65535 f \n\
        0000000015 00000 n \n\
        BAD_ENTRY_BAD n \n\
        0000000078 00000 n \n\
        trailer\n<< /Size 4 >>\n";
    let source = MemorySource::new(xref_data.to_vec());
    let result = parse_traditional_xref(&source, 0);

    // The entry preceding the bad one must survive.
    assert!(result.len() >= 1);
    let object_one = result.entries.get(&1);
    assert_eq!(object_one, Some(&XrefEntry::InUse { offset: 15, gen_nr: 0 }));

    // The bad entry must be reported rather than silently dropped.
    assert!(!result.diagnostics.is_empty());
    let reported = result
        .diagnostics
        .iter()
        .any(|d| d.code == DiagCode::XrefInvalidEntry);
    assert!(reported);
}
#[test]
fn test_parse_xref_object_zero_not_free() {
    // Object 0 is marked in-use ('n') here instead of free ('f').
    let xref_data = b"xref\n0 3\n\
        0000000015 00000 n \n\
        0000000015 00000 n \n\
        0000000078 00000 n \n\
        trailer\n<< /Size 3 >>\n";
    let source = MemorySource::new(xref_data.to_vec());
    let result = parse_traditional_xref(&source, 0);

    // The violation must surface as a dedicated diagnostic.
    let flagged = result
        .diagnostics
        .iter()
        .any(|d| d.code == DiagCode::XrefObjectZeroNotFree);
    assert!(flagged);
}
#[test]
fn test_parse_xref_missing_trailer() {
    // The data stops right after the entries: there is no `trailer` keyword.
    let xref_data = b"xref\n0 2\n\
        0000000000 65535 f \n\
        0000000015 00000 n \n";
    let source = MemorySource::new(xref_data.to_vec());
    let result = parse_traditional_xref(&source, 0);

    // Both entries (free object 0 included) are still recorded.
    assert_eq!(result.len(), 2);

    // No trailer is available, and its absence is reported.
    assert!(result.trailer.is_none());
    let reported = result
        .diagnostics
        .iter()
        .any(|d| d.code == DiagCode::XrefTrailerNotFound);
    assert!(reported);
}
#[test]
fn test_read_line_simple() {
    // Two lines separated by "\n"; the second one has no terminator.
    let source = MemorySource::new(b"Hello World\nNext line".to_vec());
    let mut pos = 0;
    let mut diagnostics = Vec::new();

    // Successive calls continue from the cursor left by the previous call.
    for expected in ["Hello World", "Next line"] {
        let line = read_line(&source, &mut pos, &mut diagnostics).unwrap();
        assert_eq!(line, expected);
    }
}
#[test]
fn test_read_line_with_crlf() {
    // Two lines separated by "\r\n"; neither CR nor LF may leak into the text.
    let source = MemorySource::new(b"Hello World\r\nNext line".to_vec());
    let mut pos = 0;
    let mut diagnostics = Vec::new();

    // Successive calls continue from the cursor left by the previous call.
    for expected in ["Hello World", "Next line"] {
        let line = read_line(&source, &mut pos, &mut diagnostics).unwrap();
        assert_eq!(line, expected);
    }
}
#[test]
fn test_parse_xref_entry_20_byte() {
    // A standard 20-byte in-use entry parses cleanly, with no diagnostics.
    let entry = b"0000000015 00000 n \n";
    let mut diagnostics = Vec::new();

    let parsed = parse_xref_entry(entry, 1, 100, 20, &mut diagnostics);

    let expected = (1, XrefEntry::InUse { offset: 15, gen_nr: 0 });
    assert_eq!(parsed, Some(expected));
    assert!(diagnostics.is_empty());
}
#[test]
fn test_parse_xref_entry_free() {
    // A standard 20-byte free entry parses cleanly, with no diagnostics.
    let entry = b"0000000000 65535 f \n";
    let mut diagnostics = Vec::new();

    let parsed = parse_xref_entry(entry, 0, 100, 20, &mut diagnostics);

    let expected = (0, XrefEntry::Free { next_free: 0, gen_nr: 65535 });
    assert_eq!(parsed, Some(expected));
    assert!(diagnostics.is_empty());
}
#[test]
fn test_parse_xref_entry_malformed() {
    // 19 bytes whose offset field is not numeric; the stride argument (19)
    // is chosen to match the actual length of the entry.
    let entry = b"BADENTRIES 00000 n\n";
    let mut diagnostics = Vec::new();

    let parsed = parse_xref_entry(entry, 1, 100, 19, &mut diagnostics);

    // Nothing is produced, and the failure is reported.
    assert!(parsed.is_none());
    assert!(!diagnostics.is_empty());
}
// proptest for random byte sequences - never panic
mod proptest_tests {
use super::*;
use proptest::prelude::*;
proptest! {
#[test]
fn proptest_random_bytes_no_panic(data in any::<Vec<u8>>()) {
// Any random byte sequence should not panic
let source = MemorySource::new(data.clone());
let _ = parse_traditional_xref(&source, 0);
// If we get here without panic, the test passes
}
#[test]
fn proptest_random_offset_no_panic(
data in any::<Vec<u8>>(),
offset in any::<u64>()
) {
// Any random offset should not panic
let source = MemorySource::new(data);
let _ = parse_traditional_xref(&source, offset);
// If we get here without panic, the test passes
}
#[test]
fn proptest_forward_scan_no_panic(data in any::<Vec<u8>>()) {
// Random byte sequences should never panic forward_scan_xref
let source = MemorySource::new(data);
let _ = forward_scan_xref(&source, false);
// If we get here without panic, the test passes
}
#[test]
fn proptest_forward_scan_linearized_no_panic(data in any::<Vec<u8>>()) {
// Random byte sequences with linearized flag should never panic
let source = MemorySource::new(data);
let _ = forward_scan_xref(&source, true);
// If we get here without panic, the test passes
}
#[test]
fn proptest_parse_xref_stream_no_panic(data in any::<Vec<u8>>()) {
// Any random byte sequence should not panic
let source = MemorySource::new(data);
let _ = parse_xref_stream(&source, 0);
// If we get here without panic, the test passes
}
#[test]
fn proptest_parse_xref_stream_random_offset_no_panic(
data in any::<Vec<u8>>(),
offset in any::<u64>()
) {
// Any random offset should not panic
let source = MemorySource::new(data);
let _ = parse_xref_stream(&source, offset);
// If we get here without panic, the test passes
}
#[test]
fn proptest_merge_hybrid_no_panic(
trad_entries in prop::collection::hash_map(any::<u32>(), any::<u64>(), 0..20),
stream_entries in prop::collection::hash_map(any::<u32>(), any::<u64>(), 0..20)
) {
// Random combinations of traditional and stream sections should never panic
let mut traditional = XrefSection::new();
for (obj_nr, &offset) in &trad_entries {
let entry_type = offset % 3;
let entry = match entry_type {
0 => XrefEntry::InUse { offset, gen_nr: (offset % 100) as u16 },
1 => XrefEntry::Free { next_free: *obj_nr, gen_nr: (offset % 100) as u16 },
_ => XrefEntry::Compressed { obj_stm_nr: (offset % 1000) as u32, index: *obj_nr },
};
traditional.add_entry(*obj_nr, entry);
}
let mut stream = XrefSection::new();
for (obj_nr, &offset) in &stream_entries {
let entry_type = offset % 3;
let entry = match entry_type {
0 => XrefEntry::InUse { offset, gen_nr: (offset % 100) as u16 },
1 => XrefEntry::Free { next_free: *obj_nr, gen_nr: (offset % 100) as u16 },
_ => XrefEntry::Compressed { obj_stm_nr: (offset % 1000) as u32, index: *obj_nr },
};
stream.add_entry(*obj_nr, entry);
}
// If we get here without panic, the test passes
let _merged = merge_hybrid(traditional, stream);
// Verify the merged section is marked as hybrid
// assert!(merged.is_hybrid);
}
}
}
// Forward scan tests
#[test]
fn test_forward_scan_simple() {
    // Three back-to-back indirect objects and nothing else.
    let pdf_data = b"1 0 obj\n<< /Type /Catalog >>\nendobj\n\
        2 0 obj\n<< /Type /Pages >>\nendobj\n\
        3 0 obj\n<< /Type /Page >>\nendobj\n";
    let source = MemorySource::new(pdf_data.to_vec());
    let scan = forward_scan_xref(&source, false);

    // Each object header must have been discovered.
    assert_eq!(scan.len(), 3);
    for obj_nr in [1, 2, 3] {
        assert!(scan.entries.contains_key(&obj_nr));
    }

    // A table rebuilt by scanning is flagged as repaired.
    let repaired = scan
        .diagnostics
        .iter()
        .any(|d| d.code == DiagCode::XrefRepaired);
    assert!(repaired);
}
#[test]
fn test_forward_scan_with_generations() {
    // Three objects with generation numbers 0, 5, and 65535.
    // Byte layout of the fixture:
    //   object 1: 8 ("1 0 obj\n") + 21 (dict line) + 7 ("endobj\n") = 36 bytes
    //   object 2: 8 ("2 5 obj\n") + 19 (dict line) + 7 ("endobj\n") = 34 bytes
    // so the three headers start at offsets 0, 36, and 70.
    let pdf_data = b"1 0 obj\n<< /Type /Catalog >>\nendobj\n\
        2 5 obj\n<< /Type /Pages >>\nendobj\n\
        3 65535 obj\n<< /Type /Page >>\nendobj\n";
    let source = MemorySource::new(pdf_data.to_vec());
    let result = forward_scan_xref(&source, false);
    assert_eq!(result.len(), 3);

    // Each entry records the header start offset and the parsed generation.
    // Object 2 was previously expected at offset 35, which is the newline
    // ending object 1 and disagrees with the offsets pinned for objects 1 and 3.
    assert_eq!(result.entries.get(&1), Some(&XrefEntry::InUse { offset: 0, gen_nr: 0 }));
    assert_eq!(result.entries.get(&2), Some(&XrefEntry::InUse { offset: 36, gen_nr: 5 }));
    assert_eq!(result.entries.get(&3), Some(&XrefEntry::InUse { offset: 70, gen_nr: 65535 }));
}
#[test]
fn test_forward_scan_linearized_disabled() {
    // With `is_linearized = true` the scan must refuse to run, even though
    // the data contains a perfectly scannable object.
    let pdf_data = b"1 0 obj\n<< /Type /Catalog >>\nendobj\n";
    let source = MemorySource::new(pdf_data.to_vec());
    let scan = forward_scan_xref(&source, true);

    // Nothing is collected ...
    assert_eq!(scan.len(), 0);

    // ... and the refusal is explained by a diagnostic.
    let refused = scan
        .diagnostics
        .iter()
        .any(|d| d.code == DiagCode::XrefLinearizedNoForwardScan);
    assert!(refused);
}
#[test]
fn test_forward_scan_truncated_file() {
    // Three objects, then an xref table / trailer / startxref / %%EOF block,
    // then a fourth object placed after the %%EOF marker. The scan has to
    // pick up objects on both sides of the xref section.
    let pdf_data = b"1 0 obj\n<< /Type /Catalog >>\nendobj\n\
        2 0 obj\n<< /Type /Pages >>\nendobj\n\
        3 0 obj\n<< /Type /Page >>\nendobj\n\
        xref\n\
        0 4\n\
        0000000000 65535 f \n\
        0000000009 00000 n \n\
        0000000045 00000 n \n\
        0000000081 00000 n \n\
        trailer\n\
        << /Size 4 >>\n\
        startxref\n\
        117\n\
        %%EOF\n\
        4 0 obj\n\
        << /Type /Outlines >>\n\
        endobj\n";
    let source = MemorySource::new(pdf_data.to_vec());
    let scan = forward_scan_xref(&source, false);

    // All four objects are found, including the one after %%EOF.
    assert_eq!(scan.len(), 4);
    for obj_nr in [1, 2, 3, 4] {
        assert!(scan.entries.contains_key(&obj_nr));
    }
}
#[test]
fn test_forward_scan_with_trailer() {
    // A `trailer` keyword with its dictionary sits between objects 2 and 3.
    let pdf_data = b"1 0 obj\n<< /Type /Catalog >>\nendobj\n\
        2 0 obj\n<< /Type /Pages >>\nendobj\n\
        trailer\n\
        << /Size 3 >>\n\
        3 0 obj\n\
        << /Type /Page >>\nendobj\n";
    let source = MemorySource::new(pdf_data.to_vec());
    let scan = forward_scan_xref(&source, false);

    // The trailer is captured, and it does not hide the object that follows it.
    assert!(scan.trailer.is_some());
    assert_eq!(scan.len(), 3);
}
#[test]
fn test_forward_scan_multi_revision() {
    // Object 1 is defined twice; the later definition has to win.
    // The first two objects occupy 41 and 34 bytes, so the second
    // "1 0 obj" header starts at byte 75 of the fixture.
    let pdf_data = b"1 0 obj\n<< /Type /Catalog /V 1 >>\nendobj\n\
        2 0 obj\n<< /Type /Pages >>\nendobj\n\
        1 0 obj\n<< /Type /Catalog /V 2 >>\nendobj\n";
    let source = MemorySource::new(pdf_data.to_vec());
    let scan = forward_scan_xref(&source, false);

    // Two distinct object numbers despite three headers.
    assert_eq!(scan.len(), 2);

    // Object 1 must point past the first definition (any offset above 50
    // can only be the second occurrence).
    let latest = scan.entries.get(&1);
    assert!(latest.is_some());
    match latest {
        Some(XrefEntry::InUse { offset, .. }) => assert!(*offset > 50),
        _ => panic!("Expected InUse entry"),
    }
}
#[test]
fn test_forward_scan_false_positive_handling() {
    // Object 1 contains the text "5 0 obj" inside a string literal. Whether
    // the scan reports that fake header is deliberately left unspecified;
    // the test only requires that scanning completes and finds real objects.
    let pdf_data = b"1 0 obj\n<</Contents (5 0 obj fake)>>\nendobj\n\
        2 0 obj\n<</Type /Pages>>\nendobj\n";
    let source = MemorySource::new(pdf_data.to_vec());
    let scan = forward_scan_xref(&source, false);

    // At least one object is found (and we got here without panicking).
    assert!(scan.len() >= 1);
}
#[test]
fn test_forward_scan_empty_file() {
    // Zero bytes of input: the scan must return normally with no entries.
    let source = MemorySource::new(b"".to_vec());
    let scan = forward_scan_xref(&source, false);
    assert_eq!(scan.len(), 0);
}
#[test]
fn test_forward_scan_no_objects() {
    // Header, a comment line, and %%EOF: no indirect object headers at all.
    let pdf_data = b"%PDF-1.4\n\
        % Some random content\n\
        %%EOF\n";
    let source = MemorySource::new(pdf_data.to_vec());
    let scan = forward_scan_xref(&source, false);
    assert_eq!(scan.len(), 0);
}
#[test]
fn test_parse_obj_header_at_valid() {
    // In "1 0 obj", byte 4 is the 'o' of `obj` (the space before it is byte 3).
    // Probing at 4 must yield object number 1, generation 0.
    let pdf_data = b"1 0 obj\n<< /Type /Catalog >>\nendobj\n";
    let source = MemorySource::new(pdf_data.to_vec());

    let header = parse_obj_header_at(&source, 4);
    assert_eq!(header, Some((1, 0)));
}
#[test]
fn test_parse_obj_header_at_with_generation() {
    // In "42 5 obj", byte 5 is the 'o' of `obj` (the space before it is byte 4).
    // Probing at 5 must yield object number 42, generation 5.
    let pdf_data = b"42 5 obj\n<< /Type /Catalog >>\nendobj\n";
    let source = MemorySource::new(pdf_data.to_vec());

    let header = parse_obj_header_at(&source, 5);
    assert_eq!(header, Some((42, 5)));
}
#[test]
fn test_parse_obj_header_at_invalid() {
    // The `obj` keyword is missing entirely: byte 3 is the newline after "1 0".
    let pdf_data = b"1 0\n<< /Type /Catalog >>\nendobj\n";
    let source = MemorySource::new(pdf_data.to_vec());

    let header = parse_obj_header_at(&source, 3);
    assert_eq!(header, None);
}
#[test]
fn test_forward_scan_carriage_return() {
    // Every line ends in a bare "\r" instead of "\n".
    let pdf_data = b"1 0 obj\r<< /Type /Catalog >>\rendobj\r\
        2 0 obj\r<< /Type /Pages >>\rendobj\r";
    let source = MemorySource::new(pdf_data.to_vec());
    let scan = forward_scan_xref(&source, false);

    // Both objects are still found.
    assert_eq!(scan.len(), 2);
}
#[test]
fn test_forward_scan_trailer_no_space() {
    // "trailer<<" with no whitespace between the keyword and the dictionary.
    let pdf_data = b"1 0 obj\n<< /Type /Catalog >>\nendobj\n\
        trailer<<\n/Size 2\n>>\n";
    let source = MemorySource::new(pdf_data.to_vec());
    let scan = forward_scan_xref(&source, false);

    // The trailer is recognised, alongside the single object.
    assert!(scan.trailer.is_some());
    assert_eq!(scan.len(), 1);
}
// Xref stream tests (PDF 1.5+)
#[test]
fn test_parse_xref_stream_simple() {
    // Xref stream with /W [1 4 2] and /Index [0 6].
    // Each row is 7 bytes: type (1) + offset (4, big-endian) + generation (2).
    // Type 0 marks a free entry, type 1 an in-use entry.
    let rows: [[u8; 7]; 6] = [
        [0, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF], // obj 0: free, next_free=0, gen=65535
        [1, 0x00, 0x00, 0x03, 0xE8, 0x00, 0x00], // obj 1: offset=1000, gen=0
        [1, 0x00, 0x00, 0x07, 0xD0, 0x00, 0x00], // obj 2: offset=2000, gen=0
        [1, 0x00, 0x00, 0x0B, 0xB8, 0x00, 0x00], // obj 3: offset=3000, gen=0
        [1, 0x00, 0x00, 0x0F, 0xA0, 0x00, 0x00], // obj 4: offset=4000, gen=0
        [1, 0x00, 0x00, 0x13, 0x88, 0x00, 0x00], // obj 5: offset=5000, gen=0
    ];
    let row_slices: Vec<&[u8]> = rows.iter().map(|row| &row[..]).collect();
    let xref_stream_data = build_xref_stream_fixture(
        &[1, 4, 2],    // /W
        6,             // /Size
        Some(&[0, 6]), // /Index
        &row_slices,
    );
    let source = MemorySource::new(xref_stream_data);
    let result = parse_xref_stream(&source, 0);

    // Dump the parser state before the assertion below fails.
    if result.len() != 5 {
        eprintln!("Test failed. Diagnostics: {:?}", result.diagnostics);
        eprintln!("Entries: {:?}", result.entries);
    }

    // Five in-use entries; the free object 0 is not counted here.
    assert_eq!(result.len(), 5);
    for (obj_nr, offset) in [(1, 1000), (2, 2000), (3, 3000), (4, 4000), (5, 5000)] {
        let expected = XrefEntry::InUse { offset, gen_nr: 0 };
        assert_eq!(result.entries.get(&obj_nr), Some(&expected));
    }

    // The stream dictionary doubles as the trailer.
    assert!(result.trailer.is_some());
}
#[test]
fn test_parse_xref_stream_multi_subsection() {
    // /Index [0 3 100 2]: rows 0-2 describe objects 0-2,
    // rows 3-4 describe objects 100-101.
    let entries: [&[u8]; 5] = [
        &[0, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF], // obj 0: free
        &[1, 0x00, 0x00, 0x03, 0xE8, 0x00, 0x00], // obj 1: offset=1000
        &[1, 0x00, 0x00, 0x07, 0xD0, 0x00, 0x00], // obj 2: offset=2000
        &[1, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00], // obj 100: offset=65536
        &[1, 0x00, 0x01, 0x00, 0x01, 0x00, 0x00], // obj 101: offset=65537
    ];
    let xref_stream_data = build_xref_stream_fixture(
        &[1, 4, 2],            // /W
        102,                   // /Size (highest object number + 1)
        Some(&[0, 3, 100, 2]), // /Index
        &entries,
    );
    let source = MemorySource::new(xref_stream_data);
    let result = parse_xref_stream(&source, 0);

    // Four in-use entries, numbered per subsection start.
    assert_eq!(result.len(), 4);
    for obj_nr in [1, 2, 100, 101] {
        assert!(result.entries.contains_key(&obj_nr));
    }

    // Spot-check one offset from each subsection.
    assert_eq!(result.entries.get(&1), Some(&XrefEntry::InUse { offset: 1000, gen_nr: 0 }));
    assert_eq!(result.entries.get(&100), Some(&XrefEntry::InUse { offset: 65536, gen_nr: 0 }));
}
#[test]
fn test_parse_xref_stream_field_width_zero_gen() {
    // /W [1 4 0]: the third field is absent, so rows shrink to 5 bytes
    // (type + 4-byte offset) and the generation has to default to 0.
    let entries: [&[u8]; 3] = [
        &[0, 0x00, 0x00, 0x00, 0x00], // obj 0: type=0, offset=0
        &[1, 0x00, 0x00, 0x03, 0xE8], // obj 1: type=1, offset=1000
        &[1, 0x00, 0x00, 0x07, 0xD0], // obj 2: type=1, offset=2000
    ];
    let xref_stream_data = build_xref_stream_fixture(
        &[1, 4, 0], // /W (generation width = 0)
        3,          // /Size
        None,       // /Index omitted
        &entries,
    );
    let source = MemorySource::new(xref_stream_data);
    let result = parse_xref_stream(&source, 0);

    // Two in-use entries, each with the defaulted generation 0.
    assert_eq!(result.len(), 2);
    for (obj_nr, offset) in [(1, 1000), (2, 2000)] {
        let expected = XrefEntry::InUse { offset, gen_nr: 0 };
        assert_eq!(result.entries.get(&obj_nr), Some(&expected));
    }
}
#[test]
fn test_parse_xref_stream_type2_compressed() {
    // Type-2 rows describe objects stored inside an object stream:
    // field 2 is the object stream's number, field 3 the index within it.
    let entries: [&[u8]; 4] = [
        &[0, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF], // obj 0: free
        &[1, 0x00, 0x00, 0x03, 0xE8, 0x00, 0x00], // obj 1: type=1, offset=1000
        &[2, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x05], // obj 2: type=2, obj_stm=10, index=5
        &[2, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x0A], // obj 3: type=2, obj_stm=11, index=10
    ];
    let xref_stream_data = build_xref_stream_fixture(
        &[1, 4, 2], // /W
        4,          // /Size
        None,       // /Index omitted
        &entries,
    );
    let source = MemorySource::new(xref_stream_data);
    let result = parse_xref_stream(&source, 0);

    // One type-1 entry plus two type-2 entries.
    let expected = [
        (1, XrefEntry::InUse { offset: 1000, gen_nr: 0 }),
        (2, XrefEntry::Compressed { obj_stm_nr: 10, index: 5 }),
        (3, XrefEntry::Compressed { obj_stm_nr: 11, index: 10 }),
    ];
    assert_eq!(result.len(), 3);
    for (obj_nr, entry) in &expected {
        assert_eq!(result.entries.get(obj_nr), Some(entry));
    }
}
#[test]
fn test_parse_xref_stream_with_predictor() {
    // Fixture produced by the predictor-enabled builder. Decoding may or may
    // not succeed; what matters is that parsing finishes and either yields
    // entries or explains itself through a diagnostic.
    let entries: [&[u8]; 3] = [
        &[0, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF], // obj 0: type=0 (free)
        &[1, 0x00, 0x00, 0x03, 0xE8, 0x00, 0x00], // obj 1: type=1, offset=1000
        &[1, 0x00, 0x00, 0x07, 0xD0, 0x00, 0x00], // obj 2: type=1, offset=2000
    ];
    let xref_stream_data = build_xref_stream_fixture_with_predictor(
        &[1, 4, 2], // /W
        3,          // /Size
        &entries,
    );
    let source = MemorySource::new(xref_stream_data);
    let result = parse_xref_stream(&source, 0);

    let reported_problem = !result.diagnostics.is_empty();
    let produced_entries = result.len() > 0;
    assert!(reported_problem || produced_entries);
}
#[test]
fn test_parse_xref_stream_invalid_entry_type() {
    // Object 1 carries entry type 5, which is none of the defined 0, 1, or 2.
    let entries: [&[u8]; 3] = [
        &[0, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF], // obj 0: type=0 (free)
        &[5, 0x00, 0x00, 0x03, 0xE8, 0x00, 0x00], // obj 1: type=5 (invalid)
        &[1, 0x00, 0x00, 0x07, 0xD0, 0x00, 0x00], // obj 2: type=1 (valid)
    ];
    let xref_stream_data = build_xref_stream_fixture(
        &[1, 4, 2], // /W
        3,          // /Size
        None,       // /Index omitted
        &entries,
    );
    let source = MemorySource::new(xref_stream_data);
    let result = parse_xref_stream(&source, 0);

    // Only the valid in-use entry (object 2) is kept.
    assert_eq!(result.len(), 1);
    assert_eq!(result.entries.get(&2), Some(&XrefEntry::InUse { offset: 2000, gen_nr: 0 }));

    // The invalid type is reported.
    let flagged = result
        .diagnostics
        .iter()
        .any(|d| d.code == DiagCode::XrefInvalidStreamEntry);
    assert!(flagged);
}
#[test]
fn test_parse_xref_stream_missing_size() {
    // The stream dictionary has /W but no /Size key.
    let xref_stream_data = build_xref_stream_fixture_missing_size(&[1, 4, 2]);
    let source = MemorySource::new(xref_stream_data);
    let result = parse_xref_stream(&source, 0);

    // The missing key is reported as a stream-format problem.
    let flagged = result
        .diagnostics
        .iter()
        .any(|d| d.code == DiagCode::XrefInvalidStreamFormat);
    assert!(flagged);
}
#[test]
fn test_parse_xref_stream_invalid_w_array() {
    // /W has only two elements here instead of the three the rows are built for.
    let entries: [&[u8]; 3] = [
        &[0, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF],
        &[1, 0x00, 0x00, 0x03, 0xE8, 0x00, 0x00],
        &[1, 0x00, 0x00, 0x07, 0xD0, 0x00, 0x00],
    ];
    let xref_stream_data = build_xref_stream_fixture(
        &[1, 4], // /W (two elements: invalid)
        3,       // /Size
        None,    // /Index omitted
        &entries,
    );
    let source = MemorySource::new(xref_stream_data);
    let result = parse_xref_stream(&source, 0);

    // The bad /W array is reported as a stream-format problem.
    let flagged = result
        .diagnostics
        .iter()
        .any(|d| d.code == DiagCode::XrefInvalidStreamFormat);
    assert!(flagged);
}
#[test]
fn test_read_big_endian_field() {
    // Table of (input bytes, expected big-endian value).
    let cases: [(&[u8], _); 7] = [
        // Widths of one to four bytes
        (&[0x12], 0x12),
        (&[0x12, 0x34], 0x1234),
        (&[0x12, 0x34, 0x56], 0x123456),
        (&[0x12, 0x34, 0x56, 0x78], 0x12345678),
        // A zero-width field reads as 0
        (&[], 0),
        // Values as they appear in the xref stream fixtures
        (&[0x00, 0x00, 0x03, 0xE8], 1000),
        (&[0xFF, 0xFF], 65535),
    ];
    for (bytes, expected) in cases {
        assert_eq!(read_big_endian_field(bytes), expected);
    }
}
#[test]
fn test_debug_xref_stream_parsing() {
// Debugging aid rather than a real check: it contains no assertions and only
// prints intermediate results to stderr, so it passes whenever nothing panics.
// Two 7-byte rows for /W [1 4 2]: a free entry followed by an in-use entry
// with offset 1000 (0x03E8).
let raw_entries: Vec<u8> = vec![
0, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
1, 0x00, 0x00, 0x03, 0xE8, 0x00, 0x00,
];
let xref_stream_data = build_xref_stream_fixture(
&[1, 4, 2],
2,
Some(&[0, 2]),
&[&raw_entries[0..7], &raw_entries[7..14]],
);
// Print what we built (lossy: the compressed payload is not valid UTF-8 text)
eprintln!("Built xref stream data:");
eprintln!("{}", String::from_utf8_lossy(&xref_stream_data));
// Try to parse it with ObjectParser
use crate::parser::object::ObjectParser;
let mut parser = ObjectParser::new(&xref_stream_data);
let indirect = parser.parse_indirect_object();
eprintln!("Parsed indirect object: {:?}", indirect);
// Now try to decode the stream; skipped silently when parsing did not
// produce an indirect object wrapping a stream.
if let Some(ind) = &indirect {
if let PdfObject::Stream(stream) = &ind.obj {
use crate::parser::stream::{decode_stream, ExtractionOptions};
// The fixture buffer is moved into the source here, so it cannot be used
// afterwards.
let source = MemorySource::new(xref_stream_data);
let decoded = decode_stream(&stream, &source, &ExtractionOptions::default(), &mut 0);
eprintln!("Decoded stream data ({} bytes): {:?}", decoded.len(), decoded);
}
}
}
/// Builds the bytes of an indirect object holding an xref stream, with no
/// trailing padding.
///
/// Convenience wrapper: every argument is forwarded unchanged to
/// `build_xref_stream_fixture_with_padding`, with a padding of zero bytes.
fn build_xref_stream_fixture(
    field_widths: &[i64],
    size: u32,
    index: Option<&[u32]>,
    entries: &[&[u8]],
) -> Vec<u8> {
    const NO_PADDING: usize = 0;
    build_xref_stream_fixture_with_padding(field_widths, size, index, entries, NO_PADDING)
}
/// Helper function to build a minimal xref stream fixture with padding.
///
/// Produces the bytes of indirect object `1 0` whose stream dictionary has
/// `/Type /XRef`, `/Size`, `/W`, optionally `/Index`, `/Filter /FlateDecode`,
/// and a `/Length` equal to the compressed payload size.
///
/// * `field_widths` - values written into the `/W` array.
/// * `size` - value written for `/Size`.
/// * `index` - values for the `/Index` array; the key is omitted when `None`.
/// * `entries` - raw rows, concatenated in order and zlib-compressed to form
///   the stream payload.
/// * `padding` - number of space bytes appended after `endobj`, so that the
///   ObjectParser has enough bytes to read the full object.
///
/// Panics if the in-memory compression fails.
fn build_xref_stream_fixture_with_padding(
    field_widths: &[i64],
    size: u32,
    index: Option<&[u32]>,
    entries: &[&[u8]],
    padding: usize,
) -> Vec<u8> {
    use flate2::write::ZlibEncoder;
    use flate2::Compression;
    use std::io::Write;

    /// Renders numbers as the space-separated body of a PDF array, e.g. `1 4 2`.
    fn join_numbers<T: ToString>(values: &[T]) -> String {
        values
            .iter()
            .map(ToString::to_string)
            .collect::<Vec<_>>()
            .join(" ")
    }

    // Compress the concatenated rows with zlib (FlateDecode).
    let raw_data = entries.concat();
    let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
    encoder.write_all(&raw_data).unwrap();
    let compressed = encoder.finish().unwrap();

    // Object header and stream dictionary; every key is followed by a space.
    let mut dict = format!(
        "1 0 obj\n<</Type /XRef /Size {} /W [{}] ",
        size,
        join_numbers(field_widths)
    );
    if let Some(idx) = index {
        dict.push_str(&format!("/Index [{}] ", join_numbers(idx)));
    }
    dict.push_str(&format!(
        "/Filter /FlateDecode /Length {} >>\nstream\n",
        compressed.len()
    ));

    // Header, payload, terminators, then the optional run of padding spaces.
    let mut result = dict.into_bytes();
    result.extend_from_slice(&compressed);
    result.extend_from_slice(b"\nendstream\nendobj\n");
    result.resize(result.len() + padding, b' ');
    result
}
/// Builds an xref-stream fixture whose dictionary deliberately omits `/Size`.
///
/// The stream body is seven zero bytes, zlib-compressed. The dictionary holds
/// `/Type /XRef`, the given `/W` widths, `/Filter /FlateDecode` and `/Length`.
fn build_xref_stream_fixture_missing_size(field_widths: &[i64]) -> Vec<u8> {
    use flate2::write::ZlibEncoder;
    use flate2::Compression;
    use std::io::Write;

    // Minimal dummy payload.
    let mut deflater = ZlibEncoder::new(Vec::new(), Compression::default());
    deflater.write_all(&[0u8; 7]).unwrap();
    let payload = deflater.finish().unwrap();

    let widths = field_widths
        .iter()
        .map(|w| w.to_string())
        .collect::<Vec<_>>()
        .join(" ");

    // Dictionary keys in order: /Type, /W, /Filter, /Length -- and no /Size.
    let header = format!(
        "1 0 obj\n<</Type /XRef /W [{}] /Filter /FlateDecode /Length {} >>\nstream\n",
        widths,
        payload.len()
    );

    let mut fixture = header.into_bytes();
    fixture.extend_from_slice(&payload);
    fixture.extend_from_slice(b"\nendstream\nendobj\n");
    fixture
}
/// Builds an xref-stream fixture whose dictionary declares a PNG predictor.
///
/// The dictionary holds `/Type /XRef`, `/Size`, `/W`,
/// `/DecodeParms << /Predictor 12 /Columns 7 >>`, `/Filter /FlateDecode` and
/// `/Length`. The stream body is the concatenation of `entries`,
/// zlib-compressed exactly as supplied.
///
/// NOTE(review): `/Columns` is fixed at 7 regardless of `field_widths`, and no
/// predictor encoding is applied to the rows here -- presumably callers pass
/// rows that already carry the PNG filter byte; confirm against call sites.
fn build_xref_stream_fixture_with_predictor(
    field_widths: &[i64],
    size: u32,
    entries: &[&[u8]],
) -> Vec<u8> {
    use flate2::write::ZlibEncoder;
    use flate2::Compression;
    use std::io::Write;

    // Flatten the rows into one buffer and deflate it in a single write.
    let rows: Vec<u8> = entries
        .iter()
        .flat_map(|row| row.iter().copied())
        .collect();
    let mut deflater = ZlibEncoder::new(Vec::new(), Compression::default());
    deflater.write_all(&rows).unwrap();
    let payload = deflater.finish().unwrap();

    // Assemble the object header and stream dictionary piece by piece.
    let mut dict = String::from("1 0 obj\n<<");
    dict += "/Type /XRef ";
    dict += &format!("/Size {} ", size);
    dict += "/W [";
    let mut first = true;
    for width in field_widths {
        if !first {
            dict.push(' ');
        }
        first = false;
        dict += &width.to_string();
    }
    dict += "] ";
    // PNG predictor parameters.
    dict += "/DecodeParms << /Predictor 12 /Columns 7 >> ";
    dict += "/Filter /FlateDecode ";
    dict += &format!("/Length {} ", payload.len());
    dict += ">>\nstream\n";

    let mut fixture = dict.into_bytes();
    fixture.extend_from_slice(&payload);
    fixture.extend_from_slice(b"\nendstream\nendobj\n");
    fixture
}
// Hybrid file merge tests
#[test]
fn test_merge_hybrid_traditional_priority() {
    // Core precedence rule: when both sections describe the same object
    // number, the traditional table's entry is the one that survives.
    let mut stream_section = XrefSection::new();
    // Conflicting offset for object 1 -- expected to lose to the traditional entry.
    stream_section.add_entry(1, XrefEntry::InUse { offset: 9999, gen_nr: 0 });
    // Object 3 exists only in the stream, so it should be carried over.
    stream_section.add_entry(3, XrefEntry::Compressed { obj_stm_nr: 10, index: 5 });

    let mut table_section = XrefSection::new();
    table_section.add_entry(1, XrefEntry::InUse { offset: 1000, gen_nr: 0 });
    table_section.add_entry(2, XrefEntry::InUse { offset: 2000, gen_nr: 0 });

    let merged = merge_hybrid(table_section, stream_section);

    assert!(merged.is_hybrid);
    assert_eq!(merged.len(), 3);

    // Object 1 keeps the traditional offset.
    let expected_obj1 = XrefEntry::InUse { offset: 1000, gen_nr: 0 };
    assert_eq!(merged.entries.get(&1), Some(&expected_obj1));

    // Object 3 comes from the stream section.
    let expected_obj3 = XrefEntry::Compressed { obj_stm_nr: 10, index: 5 };
    assert_eq!(merged.entries.get(&3), Some(&expected_obj3));
}
#[test]
fn test_merge_hybrid_free_inuse_conflict() {
    // Object 1 is Free in the traditional table but InUse in the stream;
    // the merge must keep the traditional Free entry and report the conflict.
    let mut stream_section = XrefSection::new();
    stream_section.add_entry(1, XrefEntry::InUse { offset: 5000, gen_nr: 0 });

    let mut table_section = XrefSection::new();
    table_section.add_entry(1, XrefEntry::Free { next_free: 0, gen_nr: 65535 });

    let merged = merge_hybrid(table_section, stream_section);
    assert!(merged.is_hybrid);

    // A STRUCT_HYBRID_CONFLICT diagnostic must have been recorded.
    let conflict_reported = merged
        .diagnostics
        .iter()
        .any(|d| matches!(d.code, DiagCode::StructHybridConflict));
    assert!(conflict_reported);

    // The traditional Free entry wins.
    let expected = XrefEntry::Free { next_free: 0, gen_nr: 65535 };
    assert_eq!(merged.entries.get(&1), Some(&expected));
}
#[test]
fn test_merge_hybrid_gap_fill() {
    // The traditional table covers objects 1 and 5; the stream supplies
    // compressed entries for 2, 3 and 4, which should fill the gap.
    let mut table_section = XrefSection::new();
    table_section.add_entry(1, XrefEntry::InUse { offset: 1000, gen_nr: 0 });
    table_section.add_entry(5, XrefEntry::InUse { offset: 5000, gen_nr: 0 });

    let mut stream_section = XrefSection::new();
    stream_section.add_entry(2, XrefEntry::Compressed { obj_stm_nr: 10, index: 0 });
    stream_section.add_entry(3, XrefEntry::Compressed { obj_stm_nr: 10, index: 1 });
    stream_section.add_entry(4, XrefEntry::Compressed { obj_stm_nr: 10, index: 2 });

    let merged = merge_hybrid(table_section, stream_section);

    assert!(merged.is_hybrid);
    assert_eq!(merged.len(), 5);

    // Every stream-only object must be present after the merge.
    for obj_nr in [2, 3, 4] {
        assert!(merged.entries.contains_key(&obj_nr));
    }
    let expected_obj2 = XrefEntry::Compressed { obj_stm_nr: 10, index: 0 };
    assert_eq!(merged.entries.get(&2), Some(&expected_obj2));
}
#[test]
fn test_merge_hybrid_trailer_xrefstm_removed() {
    use crate::parser::object::intern;

    // Traditional trailer carrying /Size, /XRefStm and /Root.
    let mut trailer_dict = PdfDict::new();
    trailer_dict.insert(intern("Size"), PdfObject::Integer(10));
    trailer_dict.insert(intern("XRefStm"), PdfObject::Integer(12345));
    trailer_dict.insert(intern("Root"), PdfObject::Ref(ObjRef::new(1, 0)));

    let mut table_section = XrefSection::new();
    table_section.trailer = Some(trailer_dict);

    let merged = merge_hybrid(table_section, XrefSection::new());
    assert!(merged.is_hybrid);

    let merged_trailer = merged.trailer.expect("Should have trailer");
    // The merge strips /XRefStm ...
    assert!(!merged_trailer.contains_key("XRefStm"));
    // ... and leaves the remaining keys in place.
    assert!(merged_trailer.contains_key("Size"));
    assert!(merged_trailer.contains_key("Root"));
}
#[test]
fn test_is_hybrid_trailer_detection() {
    use crate::parser::object::intern;

    // No trailer at all: not hybrid.
    assert!(!is_hybrid_trailer(None));

    // A trailer with only /Size: not hybrid.
    let mut plain = PdfDict::new();
    plain.insert(intern("Size"), PdfObject::Integer(10));

    // A trailer that also carries /XRefStm: hybrid.
    let mut with_xrefstm = PdfDict::new();
    with_xrefstm.insert(intern("Size"), PdfObject::Integer(10));
    with_xrefstm.insert(intern("XRefStm"), PdfObject::Integer(12345));

    assert!(is_hybrid_trailer(Some(&with_xrefstm)));
    assert!(!is_hybrid_trailer(Some(&plain)));
}
#[test]
fn test_merge_hybrid_empty_sections() {
    // Degenerate input: both sections are empty. The merge should still
    // succeed, flag the result as hybrid and contain no entries.
    let merged = merge_hybrid(XrefSection::new(), XrefSection::new());

    assert!(merged.is_hybrid);
    assert_eq!(merged.len(), 0);
}
#[test]
fn test_merge_hybrid_stream_only() {
    // The traditional section is empty; both entries come from the stream.
    let mut stream_section = XrefSection::new();
    stream_section.add_entry(1, XrefEntry::Compressed { obj_stm_nr: 10, index: 0 });
    stream_section.add_entry(2, XrefEntry::Compressed { obj_stm_nr: 10, index: 1 });

    let merged = merge_hybrid(XrefSection::new(), stream_section);

    assert!(merged.is_hybrid);
    assert_eq!(merged.len(), 2);
    for obj_nr in [1, 2] {
        assert!(merged.entries.contains_key(&obj_nr));
    }
}
#[test]
fn test_merge_hybrid_traditional_only() {
    // The stream section is empty; the single traditional entry must survive.
    let mut table_section = XrefSection::new();
    table_section.add_entry(1, XrefEntry::InUse { offset: 1000, gen_nr: 0 });

    let merged = merge_hybrid(table_section, XrefSection::new());

    assert!(merged.is_hybrid);
    assert_eq!(merged.len(), 1);
    let expected = XrefEntry::InUse { offset: 1000, gen_nr: 0 };
    assert_eq!(merged.entries.get(&1), Some(&expected));
}
#[test]
fn test_merge_hybrid_proptest_simple() {
    // Hand-rolled sweep in the spirit of a property test: for ten object
    // numbers, merge one traditional entry with one non-overlapping stream
    // entry and check the merge completes with both entries present.
    (0u32..10).for_each(|obj_nr| {
        let mut table_section = XrefSection::new();
        let table_entry = XrefEntry::InUse { offset: obj_nr as u64 * 100, gen_nr: 0 };
        table_section.add_entry(obj_nr, table_entry);

        let mut stream_section = XrefSection::new();
        let stream_entry = XrefEntry::Compressed { obj_stm_nr: 10, index: obj_nr };
        stream_section.add_entry(obj_nr + 100, stream_entry);

        let merged = merge_hybrid(table_section, stream_section);
        assert!(merged.is_hybrid);
        assert_eq!(merged.len(), 2);
    });
}
// ========================================================================
// Linearized PDF Detection Tests
// ========================================================================
#[test]
fn test_detect_linearization_non_linearized_pdf() {
    // The first object is an ordinary catalog with no /Linearized key,
    // so detection must report nothing.
    let bytes = b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n".to_vec();
    let source = MemorySource::new(bytes);

    let detected = detect_linearization(&source);

    assert!(detected.is_none(), "Non-linearized PDF should return None");
}
#[test]
fn test_detect_linearization_with_valid_dict() {
    // Minimal linearized file carrying every field this test reads back.
    // The /L value (162) has to equal the literal's byte length, which the
    // first assertion below pins down.
    let pdf_data = b"%PDF-1.4\n1 0 obj\n<< /Linearized 1.0\n/L 162\n/H [1234 56]\n/E 100\n/N 10\n/T 200\n/O 5 >>\nendobj\nxref\n0 1\n0000000000 65535 f\ntrailer\n<< /Size 2 >>\nstartxref\n300\n%%%%EOF";
    let actual_len = pdf_data.len() as u64;
    assert_eq!(actual_len, 162, "Test data /L value should match actual length");

    let source = MemorySource::new(pdf_data.to_vec());
    let detected = detect_linearization(&source);
    assert!(detected.is_some(), "Valid linearized PDF should be detected");

    // Each parsed field should mirror the dictionary above.
    let info = detected.unwrap();
    assert_eq!(info.file_length, 162);
    assert_eq!(info.first_page_xref_offset, 200);
    assert_eq!(info.hint_stream_offset, Some(1234));
    assert_eq!(info.hint_stream_length, Some(56));
    assert_eq!(info.page_count, 10);
    assert_eq!(info.first_page_end_offset, 100);
    assert_eq!(info.first_page_object_number, 5);
}
#[test]
fn test_detect_linearization_file_size_mismatch() {
    // The dictionary claims /L 999999, far larger than the actual data
    // (the situation after an incremental update), so detection must
    // decline to treat the file as linearized.
    let pdf_data = b"%PDF-1.4\n1 0 obj\n<< /Linearized 1.0\n/L 999999\n/H [1234 56]\n/E 100\n/N 10\n/T 200\n/O 5 >>\nendobj\n";
    let source = MemorySource::new(pdf_data.to_vec());

    let detected = detect_linearization(&source);

    assert!(detected.is_none(), "Linearized PDF with size mismatch should return None");
}
#[test]
fn test_detect_linearization_no_hint_stream() {
    // Same shape as the valid-dictionary case but without the optional /H
    // entry. /L (77) must equal the literal's byte length.
    let pdf_data = b"%PDF-1.4\n1 0 obj\n<< /Linearized 1.0\n/L 77\n/E 100\n/N 10\n/T 200\n/O 5 >>\nendobj\n";
    let actual_len = pdf_data.len() as u64;
    assert_eq!(actual_len, 77, "Test data /L value should match actual length");

    let source = MemorySource::new(pdf_data.to_vec());
    let detected = detect_linearization(&source);
    assert!(detected.is_some(), "Linearized PDF without /H should be detected");

    // With no /H array, both hint-stream fields stay unset.
    let info = detected.unwrap();
    assert_eq!(info.hint_stream_offset, None);
    assert_eq!(info.hint_stream_length, None);
}
#[test]
fn test_merge_linearized_xrefs() {
    // The full xref overlaps the first-page xref on object 1 and adds
    // objects 2 and 3; object 5 is known only to the first-page xref.
    let mut full_section = XrefSection::new();
    full_section.add_entry(1, XrefEntry::InUse { offset: 150, gen_nr: 0 }); // differs from first-page
    full_section.add_entry(2, XrefEntry::InUse { offset: 200, gen_nr: 0 });
    full_section.add_entry(3, XrefEntry::InUse { offset: 300, gen_nr: 0 });

    let mut first_page_section = XrefSection::new();
    first_page_section.add_entry(1, XrefEntry::InUse { offset: 100, gen_nr: 0 });
    first_page_section.add_entry(5, XrefEntry::InUse { offset: 500, gen_nr: 0 });

    let merged = merge_linearized_xrefs(first_page_section, full_section);
    assert_eq!(merged.len(), 4);

    // Object 1 takes the full xref's offset (150, not 100); the rest are
    // each defined by exactly one side.
    let in_use = |offset| XrefEntry::InUse { offset, gen_nr: 0 };
    assert_eq!(merged.entries.get(&1), Some(&in_use(150)));
    assert_eq!(merged.entries.get(&2), Some(&in_use(200)));
    assert_eq!(merged.entries.get(&3), Some(&in_use(300)));
    assert_eq!(merged.entries.get(&5), Some(&in_use(500)));
}
#[test]
fn test_merge_linearized_xrefs_conflict_free_vs_inuse() {
    // Object 1 is Free in the first-page xref but InUse in the full xref;
    // the full xref's view is expected to prevail.
    let mut full_section = XrefSection::new();
    full_section.add_entry(1, XrefEntry::InUse { offset: 100, gen_nr: 0 });

    let mut first_page_section = XrefSection::new();
    first_page_section.add_entry(1, XrefEntry::Free { next_free: 2, gen_nr: 0 });

    let merged = merge_linearized_xrefs(first_page_section, full_section);

    assert_eq!(merged.len(), 1);
    let expected = XrefEntry::InUse { offset: 100, gen_nr: 0 };
    assert_eq!(merged.entries.get(&1), Some(&expected));
}
#[test]
fn test_merge_linearized_xrefs_empty_first_page() {
    // With an empty first-page xref, the merge result should simply be
    // the two entries of the full xref.
    let mut full_section = XrefSection::new();
    full_section.add_entry(1, XrefEntry::InUse { offset: 100, gen_nr: 0 });
    full_section.add_entry(2, XrefEntry::InUse { offset: 200, gen_nr: 0 });

    let merged = merge_linearized_xrefs(XrefSection::new(), full_section);

    assert_eq!(merged.len(), 2);
    let expected_obj1 = XrefEntry::InUse { offset: 100, gen_nr: 0 };
    let expected_obj2 = XrefEntry::InUse { offset: 200, gen_nr: 0 };
    assert_eq!(merged.entries.get(&1), Some(&expected_obj1));
    assert_eq!(merged.entries.get(&2), Some(&expected_obj2));
}
#[test]
fn test_detect_linearization_proptest_random_bytes() {
    // Fuzz-style sweep: feed 100 deterministic pseudo-random 2048-byte
    // buffers to detect_linearization and require only that it returns.
    for seed in 0u32..100 {
        // Linear congruential generator using the java.util.Random
        // multiplier/increment on a u64 state, seeded from `seed`.
        let mut state: u64 = (seed as u64).wrapping_mul(0x5DEECE66D).wrapping_add(0xB);
        let data: Vec<u8> = (0..2048)
            .map(|_| {
                state = state.wrapping_mul(0x5DEECE66D).wrapping_add(0xB);
                ((state >> 16) & 0xFF) as u8
            })
            .collect();

        let source = MemorySource::new(data);
        // Either outcome (None or Some) is acceptable; a panic is not.
        let _ = detect_linearization(&source);
    }
}
#[test]
fn test_detect_linearization_with_incremental_update() {
    // A linearized file that later received an incremental update: the
    // stored /L (300) no longer equals the size of the data on disk.
    let original_data = b"%PDF-1.4\n1 0 obj\n<< /Linearized 1.0\n/L 300\n/E 100\n/N 10\n/T 200\n/O 5 >>\nendobj\n%%EOF";
    let appended_update = b"\n% Incremental update\n2 0 obj\n123\nendobj\n";

    // Original bytes followed by the appended revision.
    let updated_data = [&original_data[..], &appended_update[..]].concat();
    let source = MemorySource::new(updated_data);

    let detected = detect_linearization(&source);

    // /L (300) != actual size, so detection must decline.
    assert!(detected.is_none(), "Incrementally updated linearized PDF should fall through");
}
// /Prev chain tests
/// Test 3-revision /Prev chain - latest value wins.
///
/// This is the critical test from the plan: verify that when an object
/// appears in multiple revisions, the LATEST revision's value wins.
///
/// Three traditional xref sections are placed at fixed offsets (1000, 2000,
/// 3000) and linked newest-to-oldest through `/Prev`. Loading starts at the
/// newest section.
#[test]
fn test_prev_chain_three_revisions_latest_wins() {
    // Fixed offsets keep the /Prev values predictable.
    let rev1_offset = 1000u64;
    let rev2_offset = 2000u64;
    let rev3_offset = 3000u64;

    // Revision 1 (baseline): objects 0-3, no /Prev.
    let rev1 = "xref\n0 4\n\
                0000000000 65535 f \n\
                0000000100 00000 n \n\
                0000000200 00000 n \n\
                0000000300 00000 n \n\
                trailer\n<< /Size 4 >>\n";
    // Revision 2: updates object 2, adds object 4.
    let rev2 = format!(
        "xref\n2 1\n\
         0000000250 00001 n \n\
         4 1\n\
         0000000400 00000 n \n\
         trailer\n<< /Size 5 /Prev {} >>\n",
        rev1_offset
    );
    // Revision 3 (latest): updates object 3, adds object 5.
    let rev3 = format!(
        "xref\n3 1\n\
         0000000350 00002 n \n\
         5 1\n\
         0000000500 00000 n \n\
         trailer\n<< /Size 6 /Prev {} >>\n",
        rev2_offset
    );

    // Lay the revisions out at their exact offsets, space-padding the gaps.
    let mut file_data = b"%PDF-1.4\n".to_vec();
    let layout = [
        (rev1_offset, rev1),
        (rev2_offset, rev2.as_str()),
        (rev3_offset, rev3.as_str()),
    ];
    for (offset, revision) in layout {
        let start = offset as usize;
        // Guard against a revision overrunning its slot, which would
        // silently put the next one at the wrong offset.
        assert!(
            file_data.len() <= start,
            "revision at offset {} overlaps preceding data",
            offset
        );
        file_data.resize(start, b' ');
        file_data.extend_from_slice(revision.as_bytes());
    }

    let source = MemorySource::new(file_data);
    // Load from the latest revision.
    let result = load_xref_with_prev_chain(&source, rev3_offset);

    // All 6 entries are expected (including object 0).
    assert_eq!(result.len(), 6, "Should have entries for objects 0-5, got {}", result.len());

    // Object 1: unchanged from rev1 (offset 100).
    assert_eq!(result.entries.get(&1), Some(&XrefEntry::InUse { offset: 100, gen_nr: 0 }));
    // Object 2: rev2 value (offset 250) overrides rev1 (offset 200).
    assert_eq!(result.entries.get(&2), Some(&XrefEntry::InUse { offset: 250, gen_nr: 1 }));
    // Object 3: rev3 value (offset 350) overrides rev1 (offset 300).
    assert_eq!(result.entries.get(&3), Some(&XrefEntry::InUse { offset: 350, gen_nr: 2 }));
    // Object 4: added in rev2 (offset 400).
    assert_eq!(result.entries.get(&4), Some(&XrefEntry::InUse { offset: 400, gen_nr: 0 }));
    // Object 5: added in rev3 (offset 500).
    assert_eq!(result.entries.get(&5), Some(&XrefEntry::InUse { offset: 500, gen_nr: 0 }));

    // A trailer must be present on the merged result.
    assert!(result.trailer.is_some());
}
/// Test object lifecycle: added in rev2, modified in rev3, freed in rev4.
///
/// Four revisions are appended back to back; each later revision's
/// `/Prev 0` placeholder is patched to the offset of the revision before it.
#[test]
fn test_prev_chain_object_add_modify_free() {
    // Rev1: object 7 doesn't exist.
    let rev1 = "xref\n0 2\n\
                0000000000 65535 f \n\
                0000000100 00000 n \n\
                trailer\n<< /Size 2 >>\n";
    // Rev2: add object 7 as InUse.
    let rev2 = "xref\n7 1\n\
                0000000700 00000 n \n\
                trailer\n<< /Size 8 /Prev 0 >>\n";
    // Rev3: modify object 7 (new generation).
    let rev3 = "xref\n7 1\n\
                0000000750 00001 n \n\
                trailer\n<< /Size 8 /Prev 0 >>\n";
    // Rev4: free object 7.
    let rev4 = "xref\n7 1\n\
                0000000000 00002 f \n\
                trailer\n<< /Size 8 /Prev 0 >>\n";

    let mut file_data = b"%PDF-1.4\n".to_vec();
    file_data.extend_from_slice(&[b' '; 100]);

    // Revision 1 goes in verbatim; remember where it starts.
    let mut latest_offset = file_data.len() as u64;
    file_data.extend_from_slice(rev1.as_bytes());

    // Each later revision points its /Prev at the revision written just before it.
    for template in [rev2, rev3, rev4] {
        let offset = file_data.len() as u64;
        let revision = template.replace("/Prev 0", &format!("/Prev {}", latest_offset));
        file_data.extend_from_slice(revision.as_bytes());
        latest_offset = offset;
    }
    // After the loop this is the start of revision 4 (latest).
    let rev4_offset = latest_offset;

    let source = MemorySource::new(file_data);
    let result = load_xref_with_prev_chain(&source, rev4_offset);

    // Object 7 should be Free (freed in rev4).
    assert_eq!(result.entries.get(&7), Some(&XrefEntry::Free { next_free: 0, gen_nr: 2 }));
}
/// Test object added only in latest revision.
///
/// The baseline revision knows objects 0-1; the latest revision adds only
/// object 99 and links back to the baseline through `/Prev`.
#[test]
fn test_prev_chain_object_added_only_in_latest() {
    // Rev1: baseline.
    let rev1 = "xref\n0 2\n\
                0000000000 65535 f \n\
                0000000100 00000 n \n\
                trailer\n<< /Size 2 >>\n";
    // Rev2 (latest): add object 99; the /Prev placeholder is patched below.
    let rev2_template = "xref\n99 1\n\
                         0000009900 00000 n \n\
                         trailer\n<< /Size 100 /Prev 0 >>\n";

    let mut file_data = b"%PDF-1.4\n".to_vec();
    file_data.extend_from_slice(&[b' '; 100]);

    let rev1_offset = file_data.len() as u64;
    file_data.extend_from_slice(rev1.as_bytes());

    let rev2_offset = file_data.len() as u64;
    let rev2 = rev2_template.replace("/Prev 0", &format!("/Prev {}", rev1_offset));
    file_data.extend_from_slice(rev2.as_bytes());

    let source = MemorySource::new(file_data);
    let result = load_xref_with_prev_chain(&source, rev2_offset);

    // Object 99 should be present (added in rev2).
    assert_eq!(result.entries.get(&99), Some(&XrefEntry::InUse { offset: 9900, gen_nr: 0 }));
}
/// Test that trailer is from latest revision.
///
/// Two revisions carry different `/Root` references; the merged result must
/// expose the newer revision's trailer (`/Root 2 0 R` plus `/Info`).
#[test]
fn test_prev_chain_trailer_from_latest() {
    // Rev1: trailer with /Root 1 0 R.
    let rev1 = "xref\n0 1\n\
                0000000000 65535 f \n\
                trailer\n<< /Size 1 /Root 1 0 R >>\n";
    // Rev2 (latest): trailer with /Root 2 0 R and /Info; /Prev patched below.
    let rev2_template = "xref\n0 1\n\
                         0000000000 65535 f \n\
                         trailer\n<< /Size 2 /Root 2 0 R /Info 3 0 R /Prev 0 >>\n";

    let mut file_data = b"%PDF-1.4\n".to_vec();
    file_data.extend_from_slice(&[b' '; 100]);

    let rev1_offset = file_data.len() as u64;
    file_data.extend_from_slice(rev1.as_bytes());

    let rev2_offset = file_data.len() as u64;
    let rev2 = rev2_template.replace("/Prev 0", &format!("/Prev {}", rev1_offset));
    file_data.extend_from_slice(rev2.as_bytes());

    let source = MemorySource::new(file_data);
    let result = load_xref_with_prev_chain(&source, rev2_offset);

    // Trailer should be from rev2 (latest).
    assert!(result.trailer.is_some());
    let trailer = result.trailer.as_ref().unwrap();

    // /Root must be rev2's 2 0 R, not rev1's 1 0 R.
    let root = trailer.get("Root");
    assert!(root.is_some());
    match root {
        Some(PdfObject::Ref(obj_ref)) => {
            assert_eq!(obj_ref.object, 2);
            assert_eq!(obj_ref.generation, 0);
        }
        _ => panic!("Expected /Root to be an indirect reference 2 0 R"),
    }

    // /Info exists only in rev2's trailer.
    assert!(trailer.contains_key("Info"));
}
/// Test /Prev cycle detection.
///
/// Three revisions at offsets 200, 300 and 400 form the loop
/// rev3 -> rev2 -> rev1 -> rev3. Loading from rev3 must report the cycle.
#[test]
fn test_prev_chain_cycle_detection() {
    let rev1_offset = 200u64;
    let rev2_offset = 300u64;
    let rev3_offset = 400u64;

    // Every revision has the same single free entry; only /Prev differs.
    let revision_with_prev = |prev: u64| {
        format!(
            "xref\n0 1\n\
             0000000000 65535 f \n\
             trailer\n<< /Size 1 /Prev {} >>\n",
            prev
        )
    };
    // Rev1: /Prev points to rev3 (closing the cycle).
    let rev1 = revision_with_prev(rev3_offset);
    // Rev2: /Prev points to rev1.
    let rev2 = revision_with_prev(rev1_offset);
    // Rev3 (start): /Prev points to rev2.
    let rev3 = revision_with_prev(rev2_offset);

    let mut file_data = b"%PDF-1.4\n".to_vec();
    file_data.extend_from_slice(&[b' '; 100]);

    // Place each revision at its exact offset, space-padding the gaps.
    for (offset, revision) in [(rev1_offset, &rev1), (rev2_offset, &rev2), (rev3_offset, &rev3)] {
        let start = offset as usize;
        // A revision spilling past the next slot would break the /Prev targets.
        assert!(
            file_data.len() <= start,
            "revision at offset {} overlaps preceding data",
            offset
        );
        file_data.resize(start, b' ');
        file_data.extend_from_slice(revision.as_bytes());
    }

    let source = MemorySource::new(file_data);
    let result = load_xref_with_prev_chain(&source, rev3_offset);

    // Should emit STRUCT_CIRCULAR_REF diagnostic.
    assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::StructCircularRef));
}
/// Test depth limit enforcement.
///
/// Builds a 50-revision /Prev chain (more than the MAX_PREV_DEPTH of 32
/// this test is written against) and expects a depth-exceeded diagnostic.
#[test]
fn test_prev_chain_depth_limit() {
    // Revision template; `{prev}` is substituted textually per revision.
    let base_xref = b"xref\n0 1\n\
                      0000000000 65535 f \n\
                      trailer\n<< /Size 1 /Prev {prev} >>\n";
    let template = String::from_utf8_lossy(base_xref);

    // 50 revision offsets, 200 bytes apart, starting at 1000.
    let offsets: Vec<u64> = (0..50).map(|i| 1000 + i * 200).collect();

    let mut file_data = b"%PDF-1.4\n".to_vec();
    // The oldest revision gets /Prev 0; each later one points at its predecessor.
    let mut prev_offset = 0;
    for &offset in &offsets {
        // Space-pad up to this revision's offset.
        let gap = (offset as usize).saturating_sub(file_data.len());
        file_data.extend(std::iter::repeat(b' ').take(gap));

        let revision = template.replace("{prev}", &prev_offset.to_string());
        file_data.extend_from_slice(revision.as_bytes());
        prev_offset = offset;
    }

    let source = MemorySource::new(file_data);
    // Start from the newest revision.
    let start_offset = *offsets.last().unwrap();
    let result = load_xref_with_prev_chain(&source, start_offset);

    // Should emit STRUCT_DEPTH_EXCEEDED diagnostic.
    let depth_reported = result
        .diagnostics
        .iter()
        .any(|d| d.code == DiagCode::StructDepthExceeded);
    assert!(depth_reported);
}
/// Test /Prev offset pointing beyond file size.
///
/// The latest revision's `/Prev` (999999) lies past the end of the data.
/// The loader is expected to report it and drop `/Prev` from the trailer.
#[test]
fn test_prev_chain_invalid_offset() {
    let rev1 = b"xref\n0 1\n\
                 0000000000 65535 f \n\
                 trailer\n<< /Size 1 >>\n";
    // /Prev points beyond the end of the file.
    let rev2 = b"xref\n0 1\n\
                 0000000000 65535 f \n\
                 trailer\n<< /Size 1 /Prev 999999 >>\n";

    let mut file_data = b"%PDF-1.4\n".to_vec();
    file_data.extend_from_slice(&[b' '; 100]);
    // Rev1 is present in the file but nothing links to it: rev2's /Prev is bogus.
    file_data.extend_from_slice(rev1);

    let rev2_offset = file_data.len() as u64;
    file_data.extend_from_slice(rev2);

    let source = MemorySource::new(file_data);
    let result = load_xref_with_prev_chain(&source, rev2_offset);

    // Should emit STRUCT_INVALID_PREV_OFFSET diagnostic.
    assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::StructInvalidPrevOffset));

    // /Prev should be removed from trailer.
    let trailer = result.trailer.as_ref().unwrap();
    assert!(!trailer.contains_key("Prev"));
}
/// Test /Prev of 0 treated as "no previous revision".
///
/// A single revision whose trailer says `/Prev 0` must load without an
/// invalid-prev-offset diagnostic.
#[test]
fn test_prev_chain_zero_prev_is_absent() {
    // /Prev 0 means "no previous".
    let revision = b"xref\n0 1\n\
                     0000000000 65535 f \n\
                     trailer\n<< /Size 1 /Prev 0 >>\n";

    let mut file_data = b"%PDF-1.4\n".to_vec();
    file_data.extend_from_slice(&[b' '; 100]);
    let revision_offset = file_data.len() as u64;
    file_data.extend_from_slice(revision);

    let source = MemorySource::new(file_data);
    let result = load_xref_with_prev_chain(&source, revision_offset);

    // /Prev 0 must not be followed, so no invalid-offset diagnostic appears.
    let invalid_prev_reported = result
        .diagnostics
        .iter()
        .any(|d| d.code == DiagCode::StructInvalidPrevOffset);
    assert!(!invalid_prev_reported);
}
/// Test negative /Prev treated as "no previous revision".
///
/// A single revision whose trailer says `/Prev -5` must load without an
/// invalid-prev-offset diagnostic.
#[test]
fn test_prev_chain_negative_prev_is_absent() {
    // Negative /Prev value.
    let revision = b"xref\n0 1\n\
                     0000000000 65535 f \n\
                     trailer\n<< /Size 1 /Prev -5 >>\n";

    let mut file_data = b"%PDF-1.4\n".to_vec();
    file_data.extend_from_slice(&[b' '; 100]);
    let revision_offset = file_data.len() as u64;
    file_data.extend_from_slice(revision);

    let source = MemorySource::new(file_data);
    let result = load_xref_with_prev_chain(&source, revision_offset);

    // A negative /Prev must not be followed.
    let invalid_prev_reported = result
        .diagnostics
        .iter()
        .any(|d| d.code == DiagCode::StructInvalidPrevOffset);
    assert!(!invalid_prev_reported);
}
/// Test hybrid file in /Prev chain.
///
/// The latest revision is a traditional xref whose trailer carries
/// `/XRefStm 500`; a dummy xref stream object is placed at byte 500. Only the
/// hybrid flag on the result is checked.
#[test]
fn test_prev_chain_hybrid_file() {
    // Byte offset named by /XRefStm in the rev2 trailer below.
    const XREF_STREAM_OFFSET: usize = 500;

    // Rev1: traditional xref.
    let rev1 = "xref\n0 2\n\
                0000000000 65535 f \n\
                0000000100 00000 n \n\
                trailer\n<< /Size 2 >>\n";
    // Rev2: hybrid (traditional + /XRefStm); the /Prev placeholder is patched below.
    let rev2_template = "xref\n0 2\n\
                         0000000000 65535 f \n\
                         0000000200 00001 n \n\
                         trailer\n<< /Size 2 /XRefStm 500 /Prev 0 >>\n";

    let mut file_data = b"%PDF-1.4\n".to_vec();
    file_data.extend_from_slice(&[b' '; 100]);

    let rev1_offset = file_data.len() as u64;
    file_data.extend_from_slice(rev1.as_bytes());

    let rev2_offset = file_data.len() as u64;
    let rev2 = rev2_template.replace("/Prev 0", &format!("/Prev {}", rev1_offset));
    file_data.extend_from_slice(rev2.as_bytes());

    // Pad so the dummy xref stream starts exactly where /XRefStm says it does.
    assert!(
        file_data.len() <= XREF_STREAM_OFFSET,
        "revisions overrun the /XRefStm offset"
    );
    file_data.resize(XREF_STREAM_OFFSET, b' ');
    // Minimal xref stream (won't parse correctly but tests hybrid detection).
    file_data.extend_from_slice(b"1 0 obj\n<< /Type /XRef /Size 2 /W [1 1 1] >>\nstream\n\x00\x00\x00\nendstream\nendobj\n");

    let source = MemorySource::new(file_data);
    let result = load_xref_with_prev_chain(&source, rev2_offset);

    // Should be marked as hybrid.
    assert!(result.is_hybrid);
}
// proptest for /Prev chain
mod proptest_prev_chain_tests {
    use super::*;
    use proptest::prelude::*;

    proptest! {
        /// Property: /Prev chain with random configurations never panics.
        ///
        /// Each generated tuple is `(obj_num, offset, gen_nr, has_prev)` and
        /// becomes one single-entry xref section placed 500 bytes after the
        /// previous one, starting at byte 1000. When `has_prev` is set and an
        /// earlier section exists, the trailer links back to it via `/Prev`,
        /// so the loader walks a real chain.
        #[test]
        fn prop_prev_chain_random_no_panic(
            revisions in prop::collection::vec(
                (0u32..20u32, 0u64..1000u64, 0u16..10u16, any::<bool>()),
                0..10
            )
        ) {
            let mut file_data = Vec::new();
            file_data.extend_from_slice(b"%PDF-1.4\n");

            let mut offsets: Vec<u64> = Vec::new();
            for (i, (obj_num, offset, gen_nr, has_prev)) in revisions.iter().enumerate() {
                let pos = 1000u64 + (i as u64 * 500);

                // Pad to position.
                while file_data.len() < pos as usize {
                    file_data.push(b' ');
                }

                // Link to the preceding section only when requested and one exists.
                let prev_entry = match offsets.last() {
                    Some(prev) if *has_prev => format!(" /Prev {}", prev),
                    _ => String::new(),
                };
                offsets.push(pos);

                // Single-entry xref section for this object.
                let xref = format!(
                    "xref\n{} 1\n\
                     {:010} {:05} n \n\
                     trailer\n<< /Size {}{} >>\n",
                    obj_num, offset, gen_nr, obj_num + 1, prev_entry
                );
                file_data.extend_from_slice(xref.as_bytes());
            }

            let source = MemorySource::new(file_data);

            // Loading from the newest section should not panic.
            if let Some(&start_offset) = offsets.last() {
                let _ = load_xref_with_prev_chain(&source, start_offset);
            }
        }

        /// Property: Random /Prev offsets never panic.
        ///
        /// The file is a header, 10000 spaces and one valid xref section;
        /// loading is attempted from each generated offset in turn.
        #[test]
        fn prop_prev_chain_random_offsets_no_panic(
            offsets in prop::collection::vec(0u64..10000u64, 0..20)
        ) {
            let mut file_data = Vec::new();
            file_data.extend_from_slice(b"%PDF-1.4\n");
            file_data.extend_from_slice(&vec![b' '; 10000]);

            // Add a base xref.
            file_data.extend_from_slice(b"xref\n0 1\n0000000000 65535 f \ntrailer\n<< /Size 1 >>\n");

            let source = MemorySource::new(file_data);

            // Loading from any random offset should not panic.
            for offset in offsets {
                let _ = load_xref_with_prev_chain(&source, offset);
            }
        }
    }
}
}