pdftract/crates/pdftract-core/src/parser/pages.rs
jedarden b8a1b8f193
Some checks are pending
Schema Generation Validation / Validate JSON Schema (push) Waiting to run
Schema Generation Validation / Validate JSON Syntax (push) Waiting to run
fix(pdftract-2sswr): add Default impl for PageDict to fix JBIG2 compilation
This commit fixes a compilation error in the javascript tests that were
using PageDict::default(). The JBIG2 decoder module was already fully
implemented; this change only enables the tests to compile and run.

Changes:
- Add Default impl for PageDict in parser/pages.rs
- Verify all 11 JBIG2-related tests pass

The JBIG2Decode passthrough filter implementation is complete:
- Passthrough of raw JBIG2 bytes
- /JBIG2Globals reference recording for downstream consumers
- OCR_JBIG2_UNSUPPORTED diagnostic emission when full-render disabled

Co-Authored-By: Claude Code <noreply@anthropic.com>
2026-05-28 04:44:45 -04:00

1780 lines
64 KiB
Rust

//! Page tree flattening with inherited attribute resolution.
//!
//! This module implements the page tree walker that resolves inherited attributes
//! (MediaBox, CropBox, Resources, Rotate) across the /Pages subtree and produces
//! a flat Vec<PageDict> suitable for downstream extraction phases.
//!
//! Per PDF 1.7 spec section 7.7.3.4 "Page Tree":
//! - /MediaBox, /CropBox, /Resources, /Rotate are inheritable from ancestor /Pages nodes
//! - /BleedBox, /TrimBox, /ArtBox, /Contents, /Annots are not inheritable
//! - Inheritance is "last-write-wins" at each level (child overrides parent)
//! - If a required inheritable attribute is missing and not inherited, use a safe default
use crate::diagnostics::{DiagCode, Diagnostic};
use crate::parser::object::{intern, ObjRef, PdfDict, PdfObject};
use crate::parser::resources::{merge_resources, ResourceDict};
use crate::parser::xref::XrefResolver;
use std::collections::HashSet;
use std::sync::Arc;
/// Default MediaBox when none is specified (US Letter: 612 x 792 points).
///
/// The four values are stored in the same `[x1, y1, x2, y2]` order that
/// `parse_rect` produces.
///
/// Per EC-09: Page with no MediaBox and no inherited MediaBox substitutes
/// US Letter dimensions and emits STRUCT_MISSING_KEY diagnostic.
pub const DEFAULT_MEDIABOX: [f64; 4] = [0.0, 0.0, 612.0, 792.0];
/// Maximum depth of /Pages nesting to prevent stack overflow.
///
/// The tree walkers start the root at depth 0 and prune a subtree (emitting
/// STRUCT_DEPTH_EXCEEDED) once `depth > MAX_PAGES_DEPTH`, so depths
/// 0 through 16 inclusive are accepted.
///
/// Real-world PDFs rarely exceed 5 levels; 16 is very generous.
const MAX_PAGES_DEPTH: u8 = 16;
/// A fully resolved page dictionary with all inherited attributes merged.
///
/// This is the output of the page tree flattening process. Each PageDict
/// represents a leaf /Page node with all inheritable attributes from its
/// ancestor /Pages nodes resolved.
///
/// Optional boxes are stored as `None` when absent; this struct does not
/// apply the fallback chain itself, so consumers must substitute the
/// fallback box named on each field.
#[derive(Debug, Clone)]
pub struct PageDict {
/// The page's own indirect reference.
/// `build_page_dict` falls back to `ObjRef::new(0, 0)` when the page object
/// it receives is not a `PdfObject::Indirect`.
pub obj_ref: ObjRef,
/// REQUIRED; inherited if missing on this page. Default: [0, 0, 612, 792]
pub media_box: [f64; 4],
/// Optional (page's own or inherited); `None` means "use media_box"
pub crop_box: Option<[f64; 4]>,
/// Optional, read from the page itself only; `None` means "use crop_box"
pub bleed_box: Option<[f64; 4]>,
/// Optional, read from the page itself only; `None` means "use crop_box"
pub trim_box: Option<[f64; 4]>,
/// Optional, read from the page itself only; `None` means "use crop_box"
pub art_box: Option<[f64; 4]>,
/// Page rotation in degrees; must be a multiple of 90 (0, 90, 180, 270)
pub rotate: i32,
/// Merged resource dict containing all inherited resources
/// Wrapped in Arc for memory efficiency when multiple pages share the same resources
pub resources: Arc<ResourceDict>,
/// List of content stream references (in order); empty when /Contents is
/// missing or is not a reference / array of references
pub contents: Vec<ObjRef>,
/// Annotation array references (non-reference array entries are dropped)
pub annots: Vec<ObjRef>,
/// ActualText from tagged PDF (if present and valid UTF-8)
pub actual_text: Option<String>,
/// Language identifier (if present and valid UTF-8)
pub lang: Option<String>,
/// Page-level additional actions (used by JS detection); a clone of the raw /AA object
pub aa: Option<PdfObject>,
/// /StructParents value for StructTree MCID resolution (Phase 7.1.4)
pub struct_parents: Option<i32>,
}
impl Default for PageDict {
fn default() -> Self {
Self {
obj_ref: ObjRef::new(0, 0),
media_box: DEFAULT_MEDIABOX,
crop_box: None,
bleed_box: None,
trim_box: None,
art_box: None,
rotate: 0,
resources: Arc::new(ResourceDict::new()),
contents: Vec::new(),
annots: Vec::new(),
actual_text: None,
lang: None,
aa: None,
struct_parents: None,
}
}
}
impl PageDict {
/// Get the /StructParents value for this page.
///
/// This value is used to resolve MCIDs to structure elements via the ParentTree.
/// Returns None if the page has no /StructParents entry.
pub fn struct_parents(&self) -> Option<i32> {
self.struct_parents
}
}
/// Inherited attributes accumulator for page tree traversal.
///
/// Tracks the current inherited values as we walk down the /Pages tree.
/// Each /Pages node may override these values; leaf /Page nodes read
/// the accumulated values.
///
/// The walker clones this value before descending into siblings and restores
/// it afterwards, so the struct is kept cheap to clone (the resources are
/// behind an `Arc`).
#[derive(Debug, Clone)]
struct InheritedAttrs {
/// Inherited MediaBox (required, but may be None -> use default)
media_box: Option<[f64; 4]>,
/// Inherited CropBox (optional)
crop_box: Option<[f64; 4]>,
/// Inherited merged resources (accumulated from all ancestors)
resources: Arc<ResourceDict>,
/// Inherited Rotate value in degrees (defaults to 0)
rotate: i32,
}
impl Default for InheritedAttrs {
    /// Starting state at the root of the tree: nothing inherited yet, an
    /// empty resource dictionary, and no rotation.
    fn default() -> Self {
        let no_resources = ResourceDict::new();
        Self {
            rotate: 0,
            resources: Arc::new(no_resources),
            crop_box: None,
            media_box: None,
        }
    }
}
/// Result type for page tree flattening.
///
/// The error side carries the full list of diagnostics collected during the
/// walk rather than a single error value.
pub type Result<T> = std::result::Result<T, Vec<Diagnostic>>;
/// Count the leaf /Page nodes of a page tree without building `PageDict`s.
///
/// The tree is walked depth-first from `pages_ref`. No `PageDict` is ever
/// constructed; the only bookkeeping is a set of the indirect references
/// already visited (used for cycle detection) and the diagnostics list.
///
/// # Arguments
/// * `resolver` - The xref resolver for resolving indirect references
/// * `pages_ref` - The object reference to the root /Pages dictionary
///
/// # Returns
/// `Ok(count)` when at least one page was counted or when nothing went
/// wrong; `Err(diagnostics)` only when the count is zero *and* at least one
/// diagnostic was recorded.
///
/// # Behavior
/// - Empty /Pages tree: returns 0
/// - Circular reference: detected, subtree pruned
/// - Depth exceeded: subtree pruned
///
/// # Example
/// ```ignore
/// let count = count_pages_tree(&resolver, catalog.pages_ref)?;
/// println!("Document has {} pages", count);
/// ```
pub fn count_pages_tree(resolver: &XrefResolver, pages_ref: ObjRef) -> Result<usize> {
    let mut seen = HashSet::new();
    let mut problems = Vec::new();
    let total = count_pages_walk(resolver, pages_ref, &mut seen, 0, &mut problems);

    // Diagnostics are fatal only when they left us with nothing to report.
    if total == 0 && !problems.is_empty() {
        return Err(problems);
    }
    Ok(total)
}
/// Recursive page tree counter.
///
/// Walks the /Pages subtree depth-first and counts leaf /Page nodes.
/// Uses O(depth) memory by tracking only the current path.
fn count_pages_walk(
resolver: &XrefResolver,
node_ref: ObjRef,
visited: &mut HashSet<ObjRef>,
depth: u8,
diagnostics: &mut Vec<Diagnostic>,
) -> usize {
// Depth limit check
if depth > MAX_PAGES_DEPTH {
diagnostics.push(Diagnostic::with_dynamic_no_offset(
DiagCode::StructDepthExceeded,
format!(
"STRUCT_DEPTH_EXCEEDED: /Pages nesting exceeds {} levels",
MAX_PAGES_DEPTH
),
));
return 0;
}
// Check for cycles
if visited.contains(&node_ref) {
diagnostics.push(Diagnostic::with_dynamic_no_offset(
DiagCode::StructCircularRef,
format!(
"STRUCT_CIRCULAR_REF: /Pages node {} already visited",
node_ref
),
));
return 0;
}
visited.insert(node_ref);
// Resolve the node
let node_obj = match resolver.resolve(node_ref) {
Ok(obj) => obj,
Err(e) => {
diagnostics.push(Diagnostic::with_dynamic_no_offset(
DiagCode::StructMissingKey,
format!("Failed to resolve /Pages node {}: {}", node_ref, e),
));
return 0;
}
};
let dict = match node_obj.as_dict() {
Some(d) => d,
None => {
return 0;
}
};
let node_type = dict.get("Type").and_then(|o| o.as_name()).unwrap_or("");
match node_type {
"Page" => {
// Leaf node: count it
1
}
"Pages" => {
// Internal node: recurse into /Kids
let kids = match dict.get("Kids") {
Some(k) => k,
None => {
diagnostics.push(Diagnostic::with_static_no_offset(
DiagCode::StructMissingKey,
"STRUCT_MISSING_KEY: /Pages node missing /Kids",
));
return 0;
}
};
let kids_array = match kids.as_array() {
Some(arr) => arr,
None => {
return 0;
}
};
// Sum the counts from all children
let mut total = 0;
for kid in kids_array {
let kid_ref = match kid {
PdfObject::Ref(ref_) => *ref_,
PdfObject::Dict(_) => {
// Direct dictionary - count as a page if it's a /Page
let kid_type = kid
.as_dict()
.and_then(|d| d.get("Type"))
.and_then(|o| o.as_name())
.unwrap_or("");
if kid_type == "Page" {
total += 1;
}
continue;
}
_ => continue,
};
total += count_pages_walk(resolver, kid_ref, visited, depth + 1, diagnostics);
}
total
}
_ => 0,
}
}
/// Flatten the page tree into a vector of fully resolved PageDict objects.
///
/// This function walks the /Pages subtree starting from the given /Pages reference,
/// resolves all inherited attributes, and returns a flat vector of leaf pages in
/// document order (left-to-right depth-first traversal).
///
/// # Arguments
/// * `resolver` - The xref resolver for resolving indirect references
/// * `pages_ref` - The object reference to the root /Pages dictionary
///
/// # Returns
/// `Ok(pages)` whenever at least one page was produced, or when the tree is
/// empty and nothing went wrong. `Err(diagnostics)` when the root cannot be
/// resolved, or when no page was produced and at least one diagnostic was
/// recorded. Diagnostics collected alongside a non-empty result are dropped.
///
/// # Behavior
/// - Empty /Pages tree: returns empty Vec (page_count = 0)
/// - Missing /MediaBox: substitutes DEFAULT_MEDIABOX, emits STRUCT_MISSING_KEY
/// - Invalid /Rotate: clamped to a multiple of 90, emits STRUCT_INVALID_ROTATE
/// - Circular reference: detected, subtree pruned, STRUCT_CIRCULAR_REF emitted.
///   This includes /Kids entries that point back at the root itself.
/// - Depth exceeded: subtree pruned, STRUCT_DEPTH_EXCEEDED emitted
/// - Page count mismatch: emits STRUCT_INVALID_PAGE_COUNT if a positive
///   /Count disagrees with the number of pages found
///
/// # Memory Usage
///
/// This function materializes all PageDict objects in memory. For large documents,
/// use `count_pages_tree()` when only the page count is needed.
///
/// # Example
/// ```ignore
/// let pages = flatten_page_tree(&resolver, catalog.pages_ref)?;
/// for (i, page) in pages.iter().enumerate() {
///     println!("Page {}: MediaBox {:?}", i, page.media_box);
/// }
/// ```
pub fn flatten_page_tree(resolver: &XrefResolver, pages_ref: ObjRef) -> Result<Vec<PageDict>> {
    let mut diagnostics = Vec::new();
    let mut visited = HashSet::new();
    let mut inherited = InheritedAttrs::default();

    // Resolve the root /Pages node
    let pages_obj = match resolver.resolve(pages_ref) {
        Ok(obj) => obj,
        Err(e) => {
            diagnostics.push(Diagnostic::with_dynamic_no_offset(
                DiagCode::StructMissingKey,
                format!("Failed to resolve root /Pages node {}: {}", pages_ref, e),
            ));
            return Err(diagnostics);
        }
    };

    // Mark the root as visited before walking. Without this, a descendant
    // whose /Kids points back at the root would make the walker process the
    // root a second time, emitting its remaining children from the wrong
    // position with the wrong inherited state. `count_pages_walk` already
    // records the root, so this also keeps the two walkers in agreement.
    visited.insert(pages_ref);

    // Extract /Count if present (for validation later); 0 disables the check
    let declared_count = pages_obj
        .as_dict()
        .and_then(|d| d.get("Count"))
        .and_then(|o| o.as_int())
        .unwrap_or(0);

    // Walk the tree starting from root /Pages
    let pages = walk_page_tree(
        resolver,
        &pages_obj,
        &mut inherited,
        &mut visited,
        0,
        &mut diagnostics,
    );

    // Validate page count against /Count
    let actual_count = pages.len() as i64;
    if declared_count > 0 && actual_count != declared_count {
        diagnostics.push(Diagnostic::with_dynamic_no_offset(
            DiagCode::PageInvalidCount,
            format!(
                "STRUCT_INVALID_PAGE_COUNT: /Count declares {} pages, but tree contains {} pages",
                declared_count, actual_count
            ),
        ));
    }

    if !diagnostics.is_empty() && pages.is_empty() {
        // Only return error if we have no pages at all
        Err(diagnostics)
    } else {
        Ok(pages)
    }
}
/// Recursive page tree walker.
///
/// Traverses the /Pages subtree depth-first, accumulating inherited attributes
/// and emitting PageDict objects for leaf /Page nodes.
///
/// Inheritable attributes are merged into `inherited` only for /Pages nodes.
/// A leaf /Page is handed straight to `build_page_dict`, which layers the
/// page's own MediaBox, CropBox, Rotate and Resources over the inherited
/// values. Merging the leaf here as well would process those entries twice
/// and report an invalid /Rotate twice.
///
/// # Arguments
/// * `resolver` - The xref resolver
/// * `node` - The current node (either /Pages or /Page)
/// * `inherited` - Current inherited attributes (mutated during traversal)
/// * `visited` - Set of visited object references for cycle detection
/// * `depth` - Current nesting depth
/// * `diagnostics` - Accumulator for diagnostics
///
/// # Returns
/// A vector of PageDict objects from this subtree, in document order.
/// Nodes that are not dictionaries, or whose /Type is neither /Page nor
/// /Pages, contribute nothing.
fn walk_page_tree(
    resolver: &XrefResolver,
    node: &PdfObject,
    inherited: &mut InheritedAttrs,
    visited: &mut HashSet<ObjRef>,
    depth: u8,
    diagnostics: &mut Vec<Diagnostic>,
) -> Vec<PageDict> {
    // Depth limit check
    if depth > MAX_PAGES_DEPTH {
        diagnostics.push(Diagnostic::with_dynamic_no_offset(
            DiagCode::StructDepthExceeded,
            format!(
                "STRUCT_DEPTH_EXCEEDED: /Pages nesting exceeds {} levels",
                MAX_PAGES_DEPTH
            ),
        ));
        return Vec::new();
    }

    let dict = match node.as_dict() {
        Some(d) => d,
        // Not a dictionary - skip this node
        None => return Vec::new(),
    };

    // Check /Type to determine if this is /Pages or /Page
    let node_type = dict.get("Type").and_then(|o| o.as_name()).unwrap_or("");

    match node_type {
        "Page" => {
            // Leaf node: emit a PageDict built from the ancestors' state
            vec![build_page_dict(node, inherited, diagnostics)]
        }
        "Pages" => {
            // Fold this node's inheritable attributes into the accumulator
            merge_inherited_attrs(dict, inherited, diagnostics);

            let kids = match dict.get("Kids") {
                Some(k) => k,
                None => {
                    diagnostics.push(Diagnostic::with_static_no_offset(
                        DiagCode::StructMissingKey,
                        "STRUCT_MISSING_KEY: /Pages node missing /Kids",
                    ));
                    return Vec::new();
                }
            };
            let kids_array = match kids.as_array() {
                Some(arr) => arr,
                // /Kids is not an array - skip
                None => return Vec::new(),
            };

            // Every child must start from the state produced by this node,
            // so keep a snapshot to restore after each child subtree.
            let level_inherited = inherited.clone();
            let mut pages = Vec::new();

            for kid in kids_array {
                // Handle both direct (embedded dict) and indirect references
                let kid_obj = match kid {
                    PdfObject::Ref(ref_) => {
                        // Check for cycles: `insert` returns false if already present
                        if !visited.insert(*ref_) {
                            diagnostics.push(Diagnostic::with_dynamic_no_offset(
                                DiagCode::StructCircularRef,
                                format!(
                                    "STRUCT_CIRCULAR_REF: /Pages node {} already visited",
                                    ref_
                                ),
                            ));
                            continue;
                        }
                        match resolver.resolve(*ref_) {
                            Ok(obj) => obj,
                            Err(e) => {
                                diagnostics.push(Diagnostic::with_dynamic_no_offset(
                                    DiagCode::StructMissingKey,
                                    format!(
                                        "STRUCT_MISSING_KEY: Failed to resolve /Kids entry {}: {}",
                                        ref_, e
                                    ),
                                ));
                                continue;
                            }
                        }
                    }
                    // Direct dictionary - uncommon but legal
                    PdfObject::Dict(_) => kid.clone(),
                    // Invalid /Kids entry - skip
                    _ => continue,
                };

                // Recurse into the child
                pages.extend(walk_page_tree(
                    resolver,
                    &kid_obj,
                    inherited,
                    visited,
                    depth + 1,
                    diagnostics,
                ));

                // Restore inherited state for next sibling
                *inherited = level_inherited.clone();
            }
            pages
        }
        // Unknown /Type - skip this node; `inherited` was never touched
        _ => Vec::new(),
    }
}
/// Merge inheritable attributes from a /Pages or /Page node into the accumulator.
///
/// Per PDF spec 7.7.3.4, only MediaBox, CropBox, Resources, and Rotate are inheritable.
/// This function updates the `inherited` accumulator with any values present in `dict`;
/// entries that are absent (or malformed, for the boxes) leave the accumulator untouched.
///
/// # Arguments
/// * `dict` - The node dictionary to read from
/// * `inherited` - Accumulator to update in place
/// * `diagnostics` - Receives STRUCT_INVALID_ROTATE when /Rotate is not a multiple of 90
///
/// /Rotate is always stored normalized to one of 0, 90, 180, 270: the value is
/// wrapped into `0..360` and then rounded down to a multiple of 90. Doing the
/// arithmetic on the original integer before narrowing also avoids truncating
/// out-of-range values.
fn merge_inherited_attrs(
    dict: &PdfDict,
    inherited: &mut InheritedAttrs,
    diagnostics: &mut Vec<Diagnostic>,
) {
    // MediaBox (inheritable)
    if let Some(mb) = parse_rect(dict.get("MediaBox")) {
        inherited.media_box = Some(mb);
    }

    // CropBox (inheritable)
    if let Some(cb) = parse_rect(dict.get("CropBox")) {
        inherited.crop_box = Some(cb);
    }

    // Resources (inheritable) - merge with existing resources
    if let Some(resources_obj) = dict.get("Resources") {
        let merged = merge_resources(&inherited.resources, resources_obj);
        inherited.resources = Arc::new(merged);
    }

    // Rotate (inheritable)
    if let Some(rot) = dict.get("Rotate").and_then(|o| o.as_int()) {
        if rot % 90 != 0 {
            diagnostics.push(Diagnostic::with_dynamic_no_offset(
                DiagCode::PageInvalidRotate,
                format!(
                    "STRUCT_INVALID_ROTATE: /Rotate value {} is not a multiple of 90",
                    rot
                ),
            ));
        }
        // Wrap into 0..360 first (rem_euclid never yields a negative result),
        // then drop the remainder so invalid values floor to a multiple of 90.
        // The result is in {0, 90, 180, 270}, so the narrowing cast is lossless.
        let wrapped = rot.rem_euclid(360);
        inherited.rotate = (wrapped - wrapped % 90) as i32;
    }
}
/// Build a PageDict from a leaf /Page node and accumulated inherited attributes.
///
/// This function extracts all page-level attributes, substituting defaults for
/// missing values and emitting diagnostics where appropriate.
///
/// # Arguments
/// * `page_obj` - The leaf page object
/// * `inherited` - Attributes accumulated from the ancestor /Pages nodes
/// * `diagnostics` - Receives STRUCT_MISSING_KEY (no MediaBox anywhere) and
///   the invalid-rotate diagnostic
///
/// # Returns
/// A fully populated `PageDict`. When `page_obj` is not a dictionary, a
/// minimal PageDict is returned carrying only the inherited rotate and
/// resources.
///
/// The resulting `rotate` is always one of 0, 90, 180, 270, whether it comes
/// from the page or from an ancestor. A /StructParents value that does not
/// fit in `i32` is treated as absent rather than silently wrapped.
fn build_page_dict(
    page_obj: &PdfObject,
    inherited: &InheritedAttrs,
    diagnostics: &mut Vec<Diagnostic>,
) -> PageDict {
    let dict = match page_obj.as_dict() {
        Some(d) => d,
        None => {
            // Not a dict - return a minimal PageDict with defaults
            return PageDict {
                obj_ref: ObjRef::new(0, 0),
                media_box: DEFAULT_MEDIABOX,
                crop_box: None,
                bleed_box: None,
                trim_box: None,
                art_box: None,
                rotate: inherited.rotate,
                resources: Arc::clone(&inherited.resources),
                contents: Vec::new(),
                annots: Vec::new(),
                actual_text: None,
                lang: None,
                aa: None,
                struct_parents: None,
            };
        }
    };

    // Get the page's object reference (if available as Indirect)
    let obj_ref = if let PdfObject::Indirect(ind) = page_obj {
        ind.id
    } else {
        ObjRef::new(0, 0)
    };

    // MediaBox: use page's own, or inherited, or default
    let media_box = if let Some(mb) = parse_rect(dict.get("MediaBox")) {
        mb
    } else if let Some(inherited_mb) = inherited.media_box {
        inherited_mb
    } else {
        diagnostics.push(Diagnostic::with_dynamic_no_offset(
            DiagCode::StructMissingKey,
            format!("STRUCT_MISSING_KEY: Page {} has no /MediaBox and no inherited /MediaBox; using US Letter default", obj_ref),
        ));
        DEFAULT_MEDIABOX
    };

    // CropBox: use page's own, or inherited; None means "same as media_box"
    let crop_box = parse_rect(dict.get("CropBox")).or(inherited.crop_box);

    // BleedBox, TrimBox, ArtBox: non-inheritable, must be on this page
    let bleed_box = parse_rect(dict.get("BleedBox"));
    let trim_box = parse_rect(dict.get("TrimBox"));
    let art_box = parse_rect(dict.get("ArtBox"));

    // Rotate: use page's own (with validation) or inherited
    let rotate = match dict.get("Rotate").and_then(|o| o.as_int()) {
        Some(rot) => {
            if rot % 90 != 0 {
                diagnostics.push(Diagnostic::with_dynamic(
                    DiagCode::PageInvalidRotate,
                    0,
                    format!(
                        "Page {} has /Rotate value {} (not a multiple of 90)",
                        obj_ref, rot
                    ),
                ));
            }
            // Wrap into 0..360, then floor to a multiple of 90. For valid
            // input the second step is a no-op. The result is in
            // {0, 90, 180, 270}, so the cast cannot truncate.
            let wrapped = rot.rem_euclid(360);
            (wrapped - wrapped % 90) as i32
        }
        // An ancestor may have supplied 360 or a negative angle; wrap it too
        None => inherited.rotate.rem_euclid(360),
    };

    // Resources: merge page's own resources with inherited resources
    let resources = if let Some(resources_obj) = dict.get("Resources") {
        let merged = merge_resources(&inherited.resources, resources_obj);
        Arc::new(merged)
    } else {
        // No resources on this page - use inherited resources as-is
        Arc::clone(&inherited.resources)
    };

    // Contents: normalize to Vec<ObjRef>
    let contents = parse_contents_array(dict.get("Contents"));

    // Annots: collect array of references
    let annots = if let Some(PdfObject::Array(arr)) = dict.get("Annots") {
        arr.iter().filter_map(|o| o.as_ref()).collect()
    } else {
        Vec::new()
    };

    // ActualText (from tagged PDF); dropped when the bytes are not valid UTF-8
    let actual_text = dict
        .get("ActualText")
        .and_then(|o| o.as_string())
        .and_then(|s| String::from_utf8(s.to_vec()).ok());

    // Lang (language identifier); dropped when the bytes are not valid UTF-8
    let lang = dict
        .get("Lang")
        .and_then(|o| o.as_string())
        .and_then(|s| String::from_utf8(s.to_vec()).ok());

    // AA (additional actions)
    let aa = dict.get("AA").cloned();

    // StructParents: for StructTree MCID resolution (Phase 7.1.4).
    // Reject values outside the i32 range instead of wrapping them into an
    // unrelated key.
    let struct_parents = dict
        .get("StructParents")
        .and_then(|o| o.as_int())
        .filter(|i| *i >= i64::from(i32::MIN) && *i <= i64::from(i32::MAX))
        .map(|i| i as i32);

    PageDict {
        obj_ref,
        media_box,
        crop_box,
        bleed_box,
        trim_box,
        art_box,
        rotate,
        resources,
        contents,
        annots,
        actual_text,
        lang,
        aa,
        struct_parents,
    }
}
/// Parse a rectangle array [x1 y1 x2 y2] from a PdfObject.
///
/// Returns None if the object is not a 4-element array of numbers.
fn parse_rect(obj: Option<&PdfObject>) -> Option<[f64; 4]> {
let arr = obj?.as_array()?;
if arr.len() != 4 {
return None;
}
let x1 = arr[0]
.as_int()
.map(|i| i as f64)
.or_else(|| arr[0].as_real())?;
let y1 = arr[1]
.as_int()
.map(|i| i as f64)
.or_else(|| arr[1].as_real())?;
let x2 = arr[2]
.as_int()
.map(|i| i as f64)
.or_else(|| arr[2].as_real())?;
let y2 = arr[3]
.as_int()
.map(|i| i as f64)
.or_else(|| arr[3].as_real())?;
Some([x1, y1, x2, y2])
}
/// Turn a page's /Contents entry into an ordered list of stream references.
///
/// - A single reference yields a one-element Vec
/// - An array yields its reference entries in order (other entries are dropped)
/// - Anything else - a missing entry, a direct stream (illegal; /Contents
///   streams must be indirect), or any other object - yields an empty Vec
///
/// No diagnostic is emitted from here.
fn parse_contents_array(obj: Option<&PdfObject>) -> Vec<ObjRef> {
    if let Some(PdfObject::Ref(single)) = obj {
        return vec![*single];
    }
    if let Some(PdfObject::Array(entries)) = obj {
        return entries.iter().filter_map(|o| o.as_ref()).collect();
    }
    // Missing, direct stream, or unexpected object type
    Vec::new()
}
/// Map every page's object reference to its 0-based position in the document.
///
/// The page tree is flattened with `flatten_page_tree` and each resulting
/// page's `obj_ref` is recorded against its index in document order
/// (left-to-right depth-first traversal). Callers such as thread bead chain
/// walking use this to turn a page reference into a page index.
///
/// # Arguments
///
/// * `catalog` - The document catalog containing the /Pages reference
/// * `resolver` - The xref resolver for resolving indirect references
///
/// # Returns
///
/// A HashMap<ObjRef, usize> mapping page references to their 0-based indices.
///
/// # Behavior
///
/// - Empty /Pages tree, or a flattening failure: returns an empty HashMap
/// - When two pages report the same `obj_ref`, the later index wins
/// - NOTE(review): pages whose `obj_ref` fell back to `ObjRef::new(0, 0)`
///   would all collide on one key - confirm the resolver yields
///   `PdfObject::Indirect` wrappers for pages
pub fn build_page_ref_to_index(
    catalog: &crate::parser::catalog::Catalog,
    resolver: &XrefResolver,
) -> std::collections::HashMap<ObjRef, usize> {
    use std::collections::HashMap;

    match flatten_page_tree(resolver, catalog.pages_ref) {
        Ok(flattened) => flattened
            .iter()
            .enumerate()
            .map(|(position, page)| (page.obj_ref, position))
            .collect(),
        // Flattening failed: there is nothing to index
        Err(_) => HashMap::new(),
    }
}
/// Test helper: builds a direct /Pages dictionary with the given /Kids and
/// /Count, plus a /MediaBox when one is supplied.
#[cfg(test)]
fn make_pages_dict(kids: Vec<PdfObject>, count: i64, media_box: Option<[f64; 4]>) -> PdfObject {
    let mut node = PdfDict::new();
    node.insert(intern("Type"), PdfObject::Name(intern("Pages")));
    node.insert(intern("Kids"), PdfObject::Array(Box::new(kids)));
    node.insert(intern("Count"), PdfObject::Integer(count));
    match media_box {
        Some(rect) => {
            node.insert(intern("MediaBox"), make_rect_array(rect));
        }
        None => {}
    }
    PdfObject::Dict(Box::new(node))
}
/// Test helper: builds a direct leaf /Page dictionary, optionally carrying
/// its own /MediaBox and /Rotate entries.
#[cfg(test)]
fn make_page_dict(media_box: Option<[f64; 4]>, rotate: Option<i64>) -> PdfObject {
    let mut leaf = PdfDict::new();
    leaf.insert(intern("Type"), PdfObject::Name(intern("Page")));
    match media_box {
        Some(rect) => {
            leaf.insert(intern("MediaBox"), make_rect_array(rect));
        }
        None => {}
    }
    match rotate {
        Some(degrees) => {
            leaf.insert(intern("Rotate"), PdfObject::Integer(degrees));
        }
        None => {}
    }
    PdfObject::Dict(Box::new(leaf))
}
/// Test helper: wraps four coordinates as a PDF array of reals, in order.
#[cfg(test)]
fn make_rect_array(rect: [f64; 4]) -> PdfObject {
    let coords = rect
        .iter()
        .map(|&value| PdfObject::Real(value))
        .collect::<Vec<_>>();
    PdfObject::Array(Box::new(coords))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_default_mediabox() {
assert_eq!(DEFAULT_MEDIABOX, [0.0, 0.0, 612.0, 792.0]);
}
#[test]
fn test_parse_rect_valid() {
let rect = PdfObject::Array(Box::new(vec![
PdfObject::Integer(0),
PdfObject::Integer(0),
PdfObject::Integer(612),
PdfObject::Integer(792),
]));
assert_eq!(parse_rect(Some(&rect)), Some([0.0, 0.0, 612.0, 792.0]));
}
#[test]
fn test_parse_rect_real() {
let rect = PdfObject::Array(Box::new(vec![
PdfObject::Real(0.0),
PdfObject::Real(0.0),
PdfObject::Real(612.5),
PdfObject::Real(792.5),
]));
assert_eq!(parse_rect(Some(&rect)), Some([0.0, 0.0, 612.5, 792.5]));
}
#[test]
fn test_parse_rect_invalid_length() {
let rect = PdfObject::Array(Box::new(vec![
PdfObject::Integer(0),
PdfObject::Integer(0),
PdfObject::Integer(612),
]));
assert_eq!(parse_rect(Some(&rect)), None);
}
#[test]
fn test_parse_rect_non_array() {
assert_eq!(parse_rect(Some(&PdfObject::Integer(42))), None);
}
#[test]
fn test_parse_contents_single_ref() {
let ref_obj = PdfObject::Ref(ObjRef::new(10, 0));
assert_eq!(
parse_contents_array(Some(&ref_obj)),
vec![ObjRef::new(10, 0)]
);
}
#[test]
fn test_parse_contents_array() {
let arr = PdfObject::Array(Box::new(vec![
PdfObject::Ref(ObjRef::new(10, 0)),
PdfObject::Ref(ObjRef::new(11, 0)),
]));
assert_eq!(
parse_contents_array(Some(&arr)),
vec![ObjRef::new(10, 0), ObjRef::new(11, 0),]
);
}
#[test]
fn test_parse_contents_none() {
assert_eq!(parse_contents_array(None), Vec::new());
}
#[test]
fn test_flatten_single_page() {
let resolver = XrefResolver::new();
let pages_ref = ObjRef::new(1, 0);
let page = make_page_dict(Some([0.0, 0.0, 612.0, 792.0]), None);
let pages = make_pages_dict(vec![page], 1, None);
resolver.cache_object(pages_ref, pages);
let result = flatten_page_tree(&resolver, pages_ref);
assert!(result.is_ok());
let pages_vec = result.unwrap();
assert_eq!(pages_vec.len(), 1);
assert_eq!(pages_vec[0].media_box, [0.0, 0.0, 612.0, 792.0]);
}
#[test]
fn test_flatten_three_level_inheritance() {
// Critical test: 3-level /Pages tree with MediaBox only on grandparent
let resolver = XrefResolver::new();
// Grandparent /Pages (has MediaBox)
let grandparent_ref = ObjRef::new(1, 0);
let grandparent = make_pages_dict(vec![], 2, Some([0.0, 0.0, 612.0, 792.0]));
// Parent /Pages (no MediaBox - inherits from grandparent)
let parent_ref = ObjRef::new(2, 0);
let parent = make_pages_dict(vec![], 1, None);
// Leaf pages (no MediaBox - inherits from grandparent via parent)
let page1_ref = ObjRef::new(3, 0);
let page1 = make_page_dict(None, None);
let page2_ref = ObjRef::new(4, 0);
let page2 = make_page_dict(None, None);
// Wire up the tree: grandparent -> parent -> [page1, page2]
let mut grandparent_dict = grandparent.as_dict().unwrap().clone();
grandparent_dict.insert(
intern("Kids"),
PdfObject::Array(Box::new(vec![PdfObject::Ref(parent_ref)])),
);
let mut parent_dict = parent.as_dict().unwrap().clone();
parent_dict.insert(
intern("Kids"),
PdfObject::Array(Box::new(vec![
PdfObject::Ref(page1_ref),
PdfObject::Ref(page2_ref),
])),
);
resolver.cache_object(grandparent_ref, PdfObject::Dict(Box::new(grandparent_dict)));
resolver.cache_object(parent_ref, PdfObject::Dict(Box::new(parent_dict)));
resolver.cache_object(page1_ref, page1);
resolver.cache_object(page2_ref, page2);
let result = flatten_page_tree(&resolver, grandparent_ref);
assert!(result.is_ok());
let pages_vec = result.unwrap();
assert_eq!(pages_vec.len(), 2);
// Both pages should inherit MediaBox from grandparent
assert_eq!(pages_vec[0].media_box, [0.0, 0.0, 612.0, 792.0]);
assert_eq!(pages_vec[1].media_box, [0.0, 0.0, 612.0, 792.0]);
}
#[test]
fn test_ec09_missing_mediabox_defaults_to_us_letter() {
// Critical test EC-09: page with no MediaBox anywhere
let resolver = XrefResolver::new();
let pages_ref = ObjRef::new(1, 0);
// /Pages with no MediaBox
let pages = make_pages_dict(vec![make_page_dict(None, None)], 1, None);
resolver.cache_object(pages_ref, pages);
let result = flatten_page_tree(&resolver, pages_ref);
assert!(result.is_ok());
let pages_vec = result.unwrap();
assert_eq!(pages_vec.len(), 1);
assert_eq!(pages_vec[0].media_box, DEFAULT_MEDIABOX);
}
#[test]
fn test_invalid_rotate_clamped() {
let resolver = XrefResolver::new();
let pages_ref = ObjRef::new(1, 0);
// /Rotate = 45 should be clamped to 0
let pages = make_pages_dict(
vec![make_page_dict(Some(DEFAULT_MEDIABOX), Some(45))],
1,
Some(DEFAULT_MEDIABOX),
);
resolver.cache_object(pages_ref, pages);
let result = flatten_page_tree(&resolver, pages_ref);
assert!(result.is_ok());
let pages_vec = result.unwrap();
assert_eq!(pages_vec[0].rotate, 0);
}
#[test]
fn test_invalid_rotate_135_clamped() {
let resolver = XrefResolver::new();
let pages_ref = ObjRef::new(1, 0);
// /Rotate = 135 should be clamped to 90
let pages = make_pages_dict(
vec![make_page_dict(Some(DEFAULT_MEDIABOX), Some(135))],
1,
Some(DEFAULT_MEDIABOX),
);
resolver.cache_object(pages_ref, pages);
let result = flatten_page_tree(&resolver, pages_ref);
assert!(result.is_ok());
let pages_vec = result.unwrap();
assert_eq!(pages_vec[0].rotate, 90);
}
#[test]
fn test_valid_rotate_values() {
for rot in [0, 90, 180, 270, 360, -90, -180] {
let resolver = XrefResolver::new();
let pages_ref = ObjRef::new(1, 0);
let pages = make_pages_dict(
vec![make_page_dict(Some(DEFAULT_MEDIABOX), Some(rot))],
1,
Some(DEFAULT_MEDIABOX),
);
resolver.cache_object(pages_ref, pages);
let result = flatten_page_tree(&resolver, pages_ref);
assert!(result.is_ok());
let pages_vec = result.unwrap();
// Normalize to 0-270 range
let expected = ((rot % 360 + 360) % 360) as i32;
assert_eq!(pages_vec[0].rotate, expected);
}
}
#[test]
fn test_empty_pages_tree() {
let resolver = XrefResolver::new();
let pages_ref = ObjRef::new(1, 0);
let pages = make_pages_dict(vec![], 0, None);
resolver.cache_object(pages_ref, pages);
let result = flatten_page_tree(&resolver, pages_ref);
assert!(result.is_ok());
assert_eq!(result.unwrap().len(), 0);
}
#[test]
fn test_page_count_mismatch() {
let resolver = XrefResolver::new();
let pages_ref = ObjRef::new(1, 0);
// /Count says 5, but we only have 1 page
let pages = make_pages_dict(
vec![make_page_dict(Some(DEFAULT_MEDIABOX), None)],
5, // Wrong count
Some(DEFAULT_MEDIABOX),
);
resolver.cache_object(pages_ref, pages);
let result = flatten_page_tree(&resolver, pages_ref);
assert!(result.is_ok());
let pages_vec = result.unwrap();
assert_eq!(pages_vec.len(), 1);
// The function should have emitted a diagnostic about count mismatch
// (we can't easily check this without exposing diagnostics from the public API)
}
#[test]
fn test_cycle_detection_in_page_tree() {
// Test that circular references in the page tree are detected and handled
let resolver = XrefResolver::new();
// Create a tree with a cycle: parent -> child1 -> child2 -> child1 (cycle)
let parent_ref = ObjRef::new(1, 0);
let child1_ref = ObjRef::new(2, 0);
let child2_ref = ObjRef::new(3, 0);
let page_ref = ObjRef::new(4, 0);
// Add a valid page first
let page = make_page_dict(Some(DEFAULT_MEDIABOX), None);
resolver.cache_object(page_ref, page);
// Create child2 with a valid page and a reference to child1 (creating cycle)
let mut child2_dict = PdfDict::new();
child2_dict.insert(intern("Type"), PdfObject::Name(intern("Pages")));
child2_dict.insert(
intern("Kids"),
PdfObject::Array(Box::new(vec![
PdfObject::Ref(page_ref),
PdfObject::Ref(child1_ref), // This will cause a cycle
])),
);
child2_dict.insert(intern("Count"), PdfObject::Integer(2));
// Create child1 that references child2 (the other half of the cycle)
let mut child1_dict = PdfDict::new();
child1_dict.insert(intern("Type"), PdfObject::Name(intern("Pages")));
child1_dict.insert(
intern("Kids"),
PdfObject::Array(Box::new(vec![PdfObject::Ref(child2_ref)])),
);
child1_dict.insert(intern("Count"), PdfObject::Integer(1));
// Create parent that references child1
let mut parent_dict = PdfDict::new();
parent_dict.insert(intern("Type"), PdfObject::Name(intern("Pages")));
parent_dict.insert(
intern("Kids"),
PdfObject::Array(Box::new(vec![PdfObject::Ref(child1_ref)])),
);
parent_dict.insert(intern("Count"), PdfObject::Integer(2));
parent_dict.insert(intern("MediaBox"), make_rect_array(DEFAULT_MEDIABOX));
resolver.cache_object(child1_ref, PdfObject::Dict(Box::new(child1_dict)));
resolver.cache_object(child2_ref, PdfObject::Dict(Box::new(child2_dict)));
resolver.cache_object(parent_ref, PdfObject::Dict(Box::new(parent_dict)));
let result = flatten_page_tree(&resolver, parent_ref);
// Should succeed and return the one valid page (the cycle is pruned)
assert!(result.is_ok());
let pages_vec = result.unwrap();
// We should get exactly 1 page (the valid one before the cycle)
assert_eq!(pages_vec.len(), 1);
assert_eq!(pages_vec[0].media_box, DEFAULT_MEDIABOX);
}
#[test]
fn test_resource_inheritance_three_level() {
    // Critical test: resources must accumulate across three levels
    // (grandparent /Pages -> parent /Pages -> /Page), and a name redefined
    // lower in the tree must shadow the inherited entry.
    let resolver = XrefResolver::new();

    // Object references for the four nodes of the tree.
    let root_id = ObjRef::new(1, 0);
    let middle_id = ObjRef::new(2, 0);
    let first_page_id = ObjRef::new(3, 0);
    let second_page_id = ObjRef::new(4, 0);

    // Local builders to keep the tree construction compact.
    let boxed = |entries: PdfDict| PdfObject::Dict(Box::new(entries));
    let kid_refs = |ids: &[ObjRef]| {
        PdfObject::Array(Box::new(
            ids.iter().map(|id| PdfObject::Ref(*id)).collect::<Vec<_>>(),
        ))
    };

    // Grandparent /Pages: provides font /F1 and XObject /Im1.
    let mut root_fonts = PdfDict::new();
    root_fonts.insert(intern("F1"), PdfObject::Ref(ObjRef::new(10, 0)));
    let mut root_xobjects = PdfDict::new();
    root_xobjects.insert(intern("Im1"), PdfObject::Ref(ObjRef::new(20, 0)));
    let mut root_resources = PdfDict::new();
    root_resources.insert(intern("Font"), boxed(root_fonts));
    root_resources.insert(intern("XObject"), boxed(root_xobjects));

    let mut root_node = PdfDict::new();
    root_node.insert(intern("Type"), PdfObject::Name(intern("Pages")));
    root_node.insert(intern("Kids"), kid_refs(&[middle_id]));
    root_node.insert(intern("Count"), PdfObject::Integer(2));
    root_node.insert(intern("Resources"), boxed(root_resources));
    root_node.insert(intern("MediaBox"), make_rect_array(DEFAULT_MEDIABOX));

    // Parent /Pages: contributes font /F2 on top of what it inherits.
    let mut middle_fonts = PdfDict::new();
    middle_fonts.insert(intern("F2"), PdfObject::Ref(ObjRef::new(11, 0)));
    let mut middle_resources = PdfDict::new();
    middle_resources.insert(intern("Font"), boxed(middle_fonts));

    let mut middle_node = PdfDict::new();
    middle_node.insert(intern("Type"), PdfObject::Name(intern("Pages")));
    middle_node.insert(intern("Kids"), kid_refs(&[first_page_id, second_page_id]));
    middle_node.insert(intern("Count"), PdfObject::Integer(2));
    middle_node.insert(intern("Resources"), boxed(middle_resources));

    // Page 1: overrides /F1 and introduces /F3.
    let mut first_page_fonts = PdfDict::new();
    first_page_fonts.insert(intern("F1"), PdfObject::Ref(ObjRef::new(15, 0))); // Override
    first_page_fonts.insert(intern("F3"), PdfObject::Ref(ObjRef::new(12, 0))); // New
    let mut first_page_resources = PdfDict::new();
    first_page_resources.insert(intern("Font"), boxed(first_page_fonts));

    let mut first_page = PdfDict::new();
    first_page.insert(intern("Type"), PdfObject::Name(intern("Page")));
    first_page.insert(intern("MediaBox"), make_rect_array(DEFAULT_MEDIABOX));
    first_page.insert(intern("Resources"), boxed(first_page_resources));

    // Page 2: no /Resources of its own, so everything must be inherited.
    let mut second_page = PdfDict::new();
    second_page.insert(intern("Type"), PdfObject::Name(intern("Page")));
    second_page.insert(intern("MediaBox"), make_rect_array(DEFAULT_MEDIABOX));

    // Register the tree: grandparent -> parent -> [page1, page2].
    resolver.cache_object(root_id, boxed(root_node));
    resolver.cache_object(middle_id, boxed(middle_node));
    resolver.cache_object(first_page_id, boxed(first_page));
    resolver.cache_object(second_page_id, boxed(second_page));

    let pages_vec = flatten_page_tree(&resolver, root_id)
        .expect("flattening a well-formed three-level tree should succeed");
    assert_eq!(pages_vec.len(), 2);

    // Page 1: F1 (overridden), F2 (from parent), F3 (own), Im1 (from grandparent).
    let first = &pages_vec[0].resources;
    assert_eq!(first.fonts.len(), 3);
    let first_expected_fonts = [
        ("F1", ObjRef::new(15, 0)), // Overridden on the page
        ("F2", ObjRef::new(11, 0)), // Inherited from parent
        ("F3", ObjRef::new(12, 0)), // New on page
    ];
    for (name, target) in first_expected_fonts.iter() {
        assert_eq!(
            first.fonts.get(&intern(*name)),
            Some(target),
            "page 1 font /{}",
            name
        );
    }
    assert_eq!(first.xobjects.len(), 1);
    assert_eq!(
        first.xobjects.get(&intern("Im1")),
        Some(&ObjRef::new(20, 0))
    ); // Inherited from grandparent

    // Page 2: purely inherited resources (fonts F1 and F2, XObject Im1).
    let second = &pages_vec[1].resources;
    assert_eq!(second.fonts.len(), 2);
    let second_expected_fonts = [
        ("F1", ObjRef::new(10, 0)), // From grandparent
        ("F2", ObjRef::new(11, 0)), // From parent
    ];
    for (name, target) in second_expected_fonts.iter() {
        assert_eq!(
            second.fonts.get(&intern(*name)),
            Some(target),
            "page 2 font /{}",
            name
        );
    }
    assert_eq!(second.xobjects.len(), 1);
    assert_eq!(
        second.xobjects.get(&intern("Im1")),
        Some(&ObjRef::new(20, 0))
    ); // From grandparent
}
#[test]
fn test_resource_inheritance_page_without_resources() {
    // Pages that carry no /Resources entry must pick up the parent's
    // resources, and sibling pages in that situation are expected to share
    // one Arc allocation rather than each holding a copy.
    let resolver = XrefResolver::new();

    let tree_root = ObjRef::new(1, 0);
    let leaf_ids = [ObjRef::new(2, 0), ObjRef::new(3, 0)];

    // Parent /Pages node: a single font /F1 in its /Resources.
    let mut fonts = PdfDict::new();
    fonts.insert(intern("F1"), PdfObject::Ref(ObjRef::new(10, 0)));
    let mut resources = PdfDict::new();
    resources.insert(intern("Font"), PdfObject::Dict(Box::new(fonts)));

    let mut pages_node = PdfDict::new();
    pages_node.insert(intern("Type"), PdfObject::Name(intern("Pages")));
    pages_node.insert(
        intern("Kids"),
        PdfObject::Array(Box::new(
            leaf_ids
                .iter()
                .map(|id| PdfObject::Ref(*id))
                .collect::<Vec<_>>(),
        )),
    );
    pages_node.insert(intern("Count"), PdfObject::Integer(2));
    pages_node.insert(intern("Resources"), PdfObject::Dict(Box::new(resources)));
    pages_node.insert(intern("MediaBox"), make_rect_array(DEFAULT_MEDIABOX));
    resolver.cache_object(tree_root, PdfObject::Dict(Box::new(pages_node)));

    // Two leaf pages, neither of which declares /Resources.
    for leaf_id in leaf_ids.iter() {
        let mut leaf = PdfDict::new();
        leaf.insert(intern("Type"), PdfObject::Name(intern("Page")));
        leaf.insert(intern("MediaBox"), make_rect_array(DEFAULT_MEDIABOX));
        resolver.cache_object(*leaf_id, PdfObject::Dict(Box::new(leaf)));
    }

    let pages_vec = flatten_page_tree(&resolver, tree_root)
        .expect("flattening a single-level tree should succeed");
    assert_eq!(pages_vec.len(), 2);

    // Every page sees exactly the parent's /F1.
    for flattened in pages_vec.iter() {
        assert_eq!(flattened.resources.fonts.len(), 1);
        assert_eq!(
            flattened.resources.fonts.get(&intern("F1")),
            Some(&ObjRef::new(10, 0))
        );
    }

    // Memory efficiency: pages without their own resources should point at
    // the very same Arc instance, not at equal-but-distinct copies.
    let shared = Arc::ptr_eq(&pages_vec[0].resources, &pages_vec[1].resources);
    assert!(shared);
}
#[test]
fn test_resource_inheritance_empty_root() {
    // An empty /Resources dictionary on the root /Pages node must propagate
    // to a page that declares no /Resources of its own.
    let resolver = XrefResolver::new();
    let root_ref = ObjRef::new(1, 0);
    let page_ref = ObjRef::new(2, 0);

    // Root /Pages with an empty /Resources dict. The dict is never mutated,
    // so it is built in place (no `mut` binding, no placeholder /Kids that is
    // later overwritten on a clone).
    let mut root = PdfDict::new();
    root.insert(intern("Type"), PdfObject::Name(intern("Pages")));
    root.insert(
        intern("Kids"),
        PdfObject::Array(Box::new(vec![PdfObject::Ref(page_ref)])),
    );
    root.insert(intern("Count"), PdfObject::Integer(1));
    root.insert(
        intern("Resources"),
        PdfObject::Dict(Box::new(PdfDict::new())),
    );
    root.insert(intern("MediaBox"), make_rect_array(DEFAULT_MEDIABOX));

    // Page without /Resources
    let mut page = PdfDict::new();
    page.insert(intern("Type"), PdfObject::Name(intern("Page")));
    page.insert(intern("MediaBox"), make_rect_array(DEFAULT_MEDIABOX));

    resolver.cache_object(root_ref, PdfObject::Dict(Box::new(root)));
    resolver.cache_object(page_ref, PdfObject::Dict(Box::new(page)));

    let result = flatten_page_tree(&resolver, root_ref);
    assert!(result.is_ok());
    let pages_vec = result.unwrap();
    assert_eq!(pages_vec.len(), 1);

    // Page should have empty resources
    assert!(pages_vec[0].resources.is_empty());
}
}
/// Lazy iterator over pages in a page tree.
///
/// This iterator walks the page tree depth-first and yields one `PageDict`
/// per `/Page` leaf, one at a time, instead of first collecting every page
/// into a `Vec`.
///
/// # Memory Behavior
///
/// - `stack` holds the node about to be processed plus one frame for each
///   ancestor that still has unvisited kids; frames popped while the stack is
///   deeper than `MAX_PAGES_DEPTH` are dropped with a diagnostic
/// - Each yielded `PageDict` is returned by value and can be dropped after use
/// - `visited` records the root and every indirect `/Kids` reference that was
///   followed and is never pruned, so it grows with the number of nodes
///   visited; `diagnostics` likewise accumulates for the whole traversal
///
/// # Example
///
/// ```ignore
/// // `new` is fallible: it returns the diagnostics if the root cannot be resolved.
/// let mut iter = LazyPageIter::new(&resolver, pages_ref)?;
/// for page in iter.by_ref() {
///     let page_dict = page?;
///     // Process page - it will be dropped after loop iteration
/// }
/// // Non-fatal problems found along the way are available afterwards.
/// let diagnostics = iter.into_diagnostics();
/// ```
pub struct LazyPageIter<'a> {
/// The xref resolver for resolving indirect references
resolver: &'a XrefResolver,
/// Stack of (node_obj, inherited_attrs, kid_index) for depth-first traversal.
/// Each element is a node still to be processed; `kid_index` is the index
/// into that node's /Kids array of the next kid to descend into.
stack: Vec<(PdfObject, InheritedAttrs, usize)>,
/// Set of visited object references for cycle detection
visited: HashSet<ObjRef>,
/// Diagnostics collected during traversal
diagnostics: Vec<Diagnostic>,
}
impl<'a> LazyPageIter<'a> {
    /// Build a lazy page iterator rooted at the given /Pages reference.
    ///
    /// The root node is resolved eagerly and becomes the only frame on the
    /// traversal stack, paired with default inherited attributes and a kid
    /// index of zero. The root reference is recorded as visited.
    ///
    /// # Errors
    ///
    /// Returns a one-element diagnostic list (`StructMissingKey`) when the
    /// root reference cannot be resolved.
    pub fn new(
        resolver: &'a XrefResolver,
        pages_ref: ObjRef,
    ) -> std::result::Result<Self, Vec<Diagnostic>> {
        let root_node = resolver.resolve(pages_ref).map_err(|e| {
            vec![Diagnostic::with_dynamic_no_offset(
                DiagCode::StructMissingKey,
                format!("Failed to resolve root /Pages node {}: {}", pages_ref, e),
            )]
        })?;

        // The root counts as visited so that a kid pointing back at it is
        // reported as a cycle.
        let mut visited = HashSet::new();
        visited.insert(pages_ref);

        Ok(Self {
            resolver,
            stack: vec![(root_node, InheritedAttrs::default(), 0)],
            visited,
            diagnostics: Vec::new(),
        })
    }

    /// Borrow the diagnostics gathered so far.
    pub fn diagnostics(&self) -> &[Diagnostic] {
        self.diagnostics.as_slice()
    }

    /// Consume the iterator, handing back every diagnostic it gathered.
    pub fn into_diagnostics(self) -> Vec<Diagnostic> {
        let Self { diagnostics, .. } = self;
        diagnostics
    }
}
impl<'a> Iterator for LazyPageIter<'a> {
    /// One flattened page. The current implementation only ever yields `Ok`;
    /// problems encountered while walking are recorded in the iterator's
    /// diagnostics and the offending node is skipped.
    type Item = std::result::Result<PageDict, Vec<Diagnostic>>;

    /// Advance the depth-first walk until the next `/Page` leaf is found.
    ///
    /// Each stack frame is `(node, inherited, kid_idx)`. A frame with
    /// `kid_idx == 0` is a node seen for the first time, so its inheritable
    /// attributes are merged into `inherited`. A frame with `kid_idx > 0` is a
    /// /Pages node being resumed for its next kid; its `inherited` already
    /// holds the merged state, so the merge is not repeated.
    ///
    /// Returns `None` once the stack is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        while let Some((node, mut inherited, kid_idx)) = self.stack.pop() {
            // Depth limit check.
            // NOTE(review): this counts pending frames, and a /Pages node that
            // is descending into its last kid leaves no frame behind, so a
            // chain of single-kid /Pages nodes is not counted toward the limit.
            if self.stack.len() > MAX_PAGES_DEPTH as usize {
                self.diagnostics.push(Diagnostic::with_dynamic_no_offset(
                    DiagCode::StructDepthExceeded,
                    format!(
                        "STRUCT_DEPTH_EXCEEDED: /Pages nesting exceeds {} levels",
                        MAX_PAGES_DEPTH
                    ),
                ));
                continue;
            }

            let dict = match node.as_dict() {
                Some(d) => d,
                // Not a dictionary - skip this node
                None => continue,
            };
            let node_type = dict.get("Type").and_then(|o| o.as_name()).unwrap_or("");

            // Merge this node's inheritable attributes exactly once, on the
            // first visit. Resumed frames carry the already-merged state, so
            // every sibling starts from the same inherited attributes.
            if kid_idx == 0 {
                merge_inherited_attrs(dict, &mut inherited, &mut self.diagnostics);
            }

            match node_type {
                "Page" => {
                    // Leaf node: emit a PageDict
                    let page_dict = build_page_dict(&node, &inherited, &mut self.diagnostics);
                    return Some(Ok(page_dict));
                }
                "Pages" => {
                    // Internal node: locate the /Kids array
                    let kids_array = match dict.get("Kids") {
                        Some(kids) => match kids.as_array() {
                            Some(arr) => arr,
                            // /Kids is not an array - skip
                            None => continue,
                        },
                        None => {
                            self.diagnostics.push(Diagnostic::with_static_no_offset(
                                DiagCode::StructMissingKey,
                                "STRUCT_MISSING_KEY: /Pages node missing /Kids",
                            ));
                            continue;
                        }
                    };

                    let has_more_siblings = kid_idx + 1 < kids_array.len();

                    // Obtain the kid at `kid_idx` as an owned object, or `None`
                    // when it must be skipped (out of range, cycle, unresolved
                    // reference, or not a reference/dictionary).
                    let next_kid = match kids_array.get(kid_idx) {
                        None => None,
                        Some(kid) => match kid {
                            PdfObject::Ref(ref_) => {
                                // `insert` returns false when the reference was
                                // already present, i.e. a cycle or repeat.
                                if !self.visited.insert(*ref_) {
                                    self.diagnostics.push(Diagnostic::with_dynamic_no_offset(
                                        DiagCode::StructCircularRef,
                                        format!(
                                            "STRUCT_CIRCULAR_REF: /Pages node {} already visited",
                                            ref_
                                        ),
                                    ));
                                    None
                                } else {
                                    match self.resolver.resolve(*ref_) {
                                        Ok(obj) => Some(obj),
                                        Err(e) => {
                                            self.diagnostics.push(Diagnostic::with_dynamic_no_offset(
                                                DiagCode::StructMissingKey,
                                                format!("STRUCT_MISSING_KEY: Failed to resolve /Kids entry {}: {}", ref_, e),
                                            ));
                                            None
                                        }
                                    }
                                }
                            }
                            // Direct dictionary - uncommon but legal
                            PdfObject::Dict(_) => Some(kid.clone()),
                            // Invalid /Kids entry - skip
                            _ => None,
                        },
                    };

                    // Resume frame for the remaining siblings goes on first so
                    // the kid (pushed last) is processed before them. The node
                    // is moved, not cloned: nothing borrows it past this point.
                    if has_more_siblings {
                        self.stack.push((node, inherited.clone(), kid_idx + 1));
                    }
                    if let Some(kid_obj) = next_kid {
                        self.stack.push((kid_obj, inherited, 0));
                    }
                }
                _ => {
                    // Unknown /Type - skip this node
                }
            }
        }
        None
    }
}
/// Property tests for page tree flattening fuzzing.
///
/// Per acceptance criteria: "proptest: random page-tree shapes never panic"
#[cfg(test)]
mod proptests {
    use super::*;
    use proptest::prelude::*;

    /// Encode a rectangle as a PDF array of four reals.
    fn rect_array(rect: [f64; 4]) -> PdfObject {
        PdfObject::Array(Box::new(
            rect.iter()
                .map(|&coord| PdfObject::Real(coord))
                .collect::<Vec<_>>(),
        ))
    }

    /// Helper to make a /Pages dict (duplicate from tests module).
    fn make_pages_dict(kids: Vec<PdfObject>, count: i64, media_box: Option<[f64; 4]>) -> PdfObject {
        let mut dict = PdfDict::new();
        dict.insert(intern("Type"), PdfObject::Name(intern("Pages")));
        dict.insert(intern("Kids"), PdfObject::Array(Box::new(kids)));
        dict.insert(intern("Count"), PdfObject::Integer(count));
        if let Some(mb) = media_box {
            dict.insert(intern("MediaBox"), rect_array(mb));
        }
        PdfObject::Dict(Box::new(dict))
    }

    /// Helper to make a /Page dict (duplicate from tests module).
    fn make_page_dict(media_box: Option<[f64; 4]>, rotate: Option<i64>) -> PdfObject {
        let mut dict = PdfDict::new();
        dict.insert(intern("Type"), PdfObject::Name(intern("Page")));
        if let Some(mb) = media_box {
            dict.insert(intern("MediaBox"), rect_array(mb));
        }
        if let Some(rot) = rotate {
            dict.insert(intern("Rotate"), PdfObject::Integer(rot));
        }
        PdfObject::Dict(Box::new(dict))
    }

    /// Strategy to generate arbitrary rectangle arrays.
    fn arb_rect() -> impl Strategy<Value = [f64; 4]> {
        prop::array::uniform4(-1000.0..1000.0)
    }

    /// Strategy to generate arbitrary page dictionaries.
    ///
    /// Every generated page has a /MediaBox; /Rotate, /CropBox and /BleedBox
    /// are each present or absent independently.
    fn arb_page_dict() -> impl Strategy<Value = PdfDict> {
        (
            arb_rect(),
            prop::option::of(-1000i64..1000),
            prop::option::of(arb_rect()),
            prop::option::of(arb_rect()),
        )
            .prop_map(|(media_box, rotate, crop_box, bleed_box)| {
                let mut dict = PdfDict::new();
                dict.insert(intern("Type"), PdfObject::Name(intern("Page")));
                dict.insert(intern("MediaBox"), rect_array(media_box));
                if let Some(rot) = rotate {
                    dict.insert(intern("Rotate"), PdfObject::Integer(rot));
                }
                if let Some(cb) = crop_box {
                    dict.insert(intern("CropBox"), rect_array(cb));
                }
                if let Some(bb) = bleed_box {
                    dict.insert(intern("BleedBox"), rect_array(bb));
                }
                dict
            })
    }

    /// Strategy to generate /Pages dictionaries with direct /Kids.
    ///
    /// Produces a /Pages node whose /Kids holds either zero or one direct
    /// (embedded) page dictionary, with /Count set to match. The depth
    /// argument is accepted for call-site compatibility but is not used:
    /// generated trees are always a single level deep.
    fn arb_pages_dict_with_direct_kids(_max_depth: u8) -> impl Strategy<Value = PdfDict> {
        prop::option::of(arb_page_dict()).prop_map(|maybe_page: Option<PdfDict>| {
            let kids: Vec<PdfObject> = maybe_page
                .into_iter()
                .map(|page| PdfObject::Dict(Box::new(page)))
                .collect();
            let count = kids.len() as i64;

            let mut dict = PdfDict::new();
            dict.insert(intern("Type"), PdfObject::Name(intern("Pages")));
            dict.insert(intern("Count"), PdfObject::Integer(count));
            dict.insert(intern("Kids"), PdfObject::Array(Box::new(kids)));
            dict
        })
    }

    proptest! {
        /// Test that parse_rect never panics on arbitrary arrays (INV-8).
        #[test]
        fn fuzz_parse_rect_no_panics(arr in prop::collection::vec(any::<f64>(), 0..10)) {
            // Non-finite inputs (NaN, infinities) are replaced by 0.0 before
            // being wrapped as PDF reals.
            let obj = PdfObject::Array(Box::new(
                arr.into_iter()
                    .map(|f| PdfObject::Real(if f.is_finite() { f } else { 0.0 }))
                    .collect::<Vec<_>>(),
            ));
            // This should never panic
            let _ = parse_rect(Some(&obj));
        }

        /// Test that build_page_dict never panics on arbitrary input.
        #[test]
        fn fuzz_build_page_dict_no_panics(page_dict in arb_page_dict()) {
            let inherited = InheritedAttrs::default();
            let mut diagnostics = Vec::new();
            let page_obj = PdfObject::Dict(Box::new(page_dict));
            // This should never panic
            let _ = build_page_dict(&page_obj, &inherited, &mut diagnostics);
        }

        /// Test that flatten_page_tree handles arbitrary /Pages structures without panicking.
        #[test]
        fn fuzz_flatten_page_tree_no_panics(pages_dict in arb_pages_dict_with_direct_kids(2)) {
            let resolver = XrefResolver::new();
            let pages_ref = ObjRef::new(1, 0);
            resolver.cache_object(pages_ref, PdfObject::Dict(Box::new(pages_dict)));
            // This should never panic - should always return Ok or Err with diagnostics
            let _ = flatten_page_tree(&resolver, pages_ref);
        }

        /// Test that arbitrary rotate values are handled without panicking.
        #[test]
        fn fuzz_rotate_clamping_no_panics(rot in any::<i64>()) {
            let resolver = XrefResolver::new();
            let pages_ref = ObjRef::new(1, 0);
            let pages = make_pages_dict(
                vec![make_page_dict(Some(DEFAULT_MEDIABOX), Some(rot))],
                1,
                Some(DEFAULT_MEDIABOX),
            );
            resolver.cache_object(pages_ref, pages);
            // This should never panic. Either outcome (Ok or Err) is
            // acceptable, so the result is discarded rather than asserted on.
            let _ = flatten_page_tree(&resolver, pages_ref);
        }
    }
}