pdftract/crates/pdftract-core/src/threads/mod.rs
jedarden 6000c654ce fix: resolve compilation errors across codebase
- Fixed missing fields in BlockJson, SpanJson, ExtractionOptions initializations
- Added feature gates to ocr_integration tests for conditional compilation
- Fixed McpServerState::new calls to include audit writer argument
- Fixed CCITTFaxDecoder::decode calls to use instance method
- Fixed type casts for ObjRef::new calls
- Fixed serde_json::Value method calls (is_some -> !is_null)
- Fixed ProfileType test feature gates
- Worked around lifetime issues in schema roundtrip tests

These changes fix numerous compilation errors that were blocking the
codebase from building. The main library and tests now compile successfully.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 08:38:04 -04:00

1482 lines
54 KiB
Rust

//! PDF article thread discovery and metadata extraction.
//!
//! This module implements Phases 7.7.1 and 7.7.2 of the plan: reading the
//! /Threads array from the document catalog, extracting the thread info
//! metadata (/I) for each thread, and walking each thread's bead chain.
//!
//! ## Architecture
//!
//! - **Discovery** (7.7.1): Read /Threads array from catalog, extract /F and /I
//! - **Bead chain walking** (7.7.2): Walk /N links from the first bead (`walk_beads`)
//!
//! ## PDF Thread Structure
//!
//! Per PDF 1.7 Section 12.4.3, an article thread consists of:
//! - `/Threads` array in catalog (optional)
//! - Each thread dict has:
//!   - `/F`: Indirect reference to first bead (required)
//!   - `/I`: Thread info dict (optional)
//!     - `/Title`: Thread title (PdfString, optional)
//!     - `/Author`: Thread author (PdfString, optional)
//!     - `/Subject`: Thread subject (PdfString, optional)
//!     - `/Keywords`: Thread keywords (PdfString, optional, comma-separated)
use crate::diagnostics::{DiagCode, Diagnostic};
use crate::parser::catalog::Catalog;
use crate::parser::object::{ObjRef, PdfDict, PdfObject};
use crate::parser::xref::XrefResolver;
/// Result type for thread operations.
///
/// The error side is a list of [`Diagnostic`] values rather than a single error,
/// so a failing operation can hand back every problem it recorded.
pub type Result<T> = std::result::Result<T, Vec<Diagnostic>>;
/// Metadata describing one article thread.
///
/// A header is built from a single entry of the catalog's /Threads array: the
/// mandatory /F reference plus whatever text the optional /I (thread info)
/// dictionary provides. Walking the bead chain itself is a separate step
/// (Phase 7.7.2).
///
/// All four text fields share one convention:
///
/// * `Some(text)` - the key was present in /I; an empty PDF string yields `Some("")`
/// * `None` - /I is missing or the key is absent from it
#[derive(Debug, Clone, PartialEq)]
pub struct ThreadHeader {
    /// Indirect reference to the first bead of the thread's chain.
    ///
    /// Always populated: a thread dictionary without a usable /F is skipped
    /// during discovery, so no header is ever created for it.
    pub first_bead_ref: ObjRef,
    /// Value of /I/Title.
    ///
    /// `Some("")` when the key holds an empty string, `None` when /I or
    /// /Title is absent.
    pub title: Option<String>,
    /// Value of /I/Author.
    ///
    /// `Some("")` when the key holds an empty string, `None` when /I or
    /// /Author is absent.
    pub author: Option<String>,
    /// Value of /I/Subject.
    ///
    /// `Some("")` when the key holds an empty string, `None` when /I or
    /// /Subject is absent.
    pub subject: Option<String>,
    /// Value of /I/Keywords.
    ///
    /// Kept as the single string found in the file; by convention it is a
    /// comma-separated list, not an array. `Some("")` when the key holds an
    /// empty string, `None` when /I or /Keywords is absent.
    pub keywords: Option<String>,
}

impl ThreadHeader {
    /// Build a header that knows only its first bead.
    ///
    /// Every metadata field starts out as `None`; callers fill in whatever
    /// the thread info dictionary provides.
    pub fn new(first_bead_ref: ObjRef) -> Self {
        Self {
            first_bead_ref,
            title: None,
            author: None,
            subject: None,
            keywords: None,
        }
    }
}
/// Discover article threads from the document catalog.
///
/// Reads the optional /Threads array referenced by the catalog and builds one
/// [`ThreadHeader`] per usable thread dictionary (metadata only; the bead
/// chain is walked separately).
///
/// # Arguments
///
/// * `catalog` - The document catalog; `threads_ref` is `None` when the
///   document has no /Threads entry
/// * `resolver` - The xref resolver used to resolve indirect references
///
/// # Returns
///
/// `Ok` with the discovered headers, in array order. The function currently
/// has no fatal error path, so it never returns `Err`.
///
/// # Behavior
///
/// - /Threads absent, unresolvable, or not an array: empty `Vec`
/// - Array entry that is not an indirect reference, cannot be resolved, or
///   does not resolve to a dictionary: entry skipped
/// - Thread dictionary whose /F is missing or not a reference: entry skipped
/// - /I may be a direct dictionary or an indirect reference to one; a
///   reference is resolved before the metadata is read
/// - /I missing, unresolvable, or not a dictionary: all four text fields stay
///   `None`
/// - Empty strings are reported as `Some("")` to distinguish them from absent
///   keys
/// - Threads sharing a /Title are all returned (no deduplication)
///
/// NOTE(review): the per-entry diagnostics built below are collected into a
/// local list that is dropped on return, because the signature offers no way
/// to hand back non-fatal diagnostics next to the `Ok` value.
pub fn discover(catalog: &Catalog, resolver: &XrefResolver) -> Result<Vec<ThreadHeader>> {
    let mut threads = Vec::new();
    // Collected for parity with the rest of the module; see the NOTE above.
    let mut diagnostics = Vec::new();

    // /Threads is optional; absent is not an error.
    let threads_ref = match catalog.threads_ref {
        Some(ref_) => ref_,
        None => return Ok(threads),
    };

    // An unresolvable /Threads is treated like an absent one (not fatal).
    let threads_obj = match resolver.resolve(threads_ref) {
        Ok(obj) => obj,
        Err(_) => return Ok(threads),
    };

    // /Threads exists but is not an array: nothing to discover.
    let threads_array = match threads_obj.as_array() {
        Some(arr) => arr,
        None => return Ok(threads),
    };

    for (idx, thread_entry) in threads_array.iter().enumerate() {
        // Each entry should be an indirect reference to a thread dictionary.
        let thread_ref = match thread_entry {
            PdfObject::Ref(ref_) => *ref_,
            _ => {
                diagnostics.push(Diagnostic::with_dynamic_no_offset(
                    DiagCode::StructUnexpectedEof,
                    format!(
                        "Thread entry {} is not an indirect reference (type: {})",
                        idx,
                        thread_entry.type_name()
                    ),
                ));
                continue;
            }
        };

        // Unresolvable or non-dictionary threads are skipped silently.
        let thread_obj = match resolver.resolve(thread_ref) {
            Ok(obj) => obj,
            Err(_) => continue,
        };
        let thread_dict = match thread_obj.as_dict() {
            Some(d) => d,
            None => continue,
        };

        // /F (first bead reference) is required; without it the thread is unusable.
        let first_bead_ref = match thread_dict.get("F") {
            Some(PdfObject::Ref(ref_)) => *ref_,
            Some(other) => {
                diagnostics.push(Diagnostic::with_dynamic_no_offset(
                    DiagCode::StructMissingKey,
                    format!(
                        "Thread {} has /F but it's not a reference (type: {})",
                        idx,
                        other.type_name()
                    ),
                ));
                continue;
            }
            None => {
                diagnostics.push(Diagnostic::with_dynamic_no_offset(
                    DiagCode::StructMissingKey,
                    format!("Thread {} is missing /F (first bead reference)", idx),
                ));
                continue;
            }
        };

        let mut header = ThreadHeader::new(first_bead_ref);

        // /I (thread info) is optional and may be stored inline or behind an
        // indirect reference. The resolved object is kept in its own binding
        // so the dictionary borrowed from it lives long enough.
        let info_entry = thread_dict.get("I");
        let resolved_info = match info_entry {
            Some(PdfObject::Ref(info_ref)) => resolver.resolve(*info_ref).ok(),
            _ => None,
        };
        let info_dict = match &resolved_info {
            Some(obj) => obj.as_dict(),
            None => info_entry.and_then(|obj| obj.as_dict()),
        };

        // If /I is absent or not a dictionary the header fields simply stay None.
        if let Some(info_dict) = info_dict {
            let text_field = |key: &str| {
                info_dict
                    .get(key)
                    .and_then(|obj| obj.as_string())
                    .and_then(|bytes| decode_pdf_string(bytes))
            };
            header.title = text_field("Title");
            header.author = text_field("Author");
            header.subject = text_field("Subject");
            header.keywords = text_field("Keywords");
        }

        threads.push(header);
    }

    // None of the recorded diagnostics is fatal, so the headers are always returned.
    Ok(threads)
}
/// One bead of an article thread: a rectangular region on a single page.
///
/// Beads are produced while walking a thread's bead chain; each one records
/// which page it sits on and the area of that page it covers.
#[derive(Debug, Clone, PartialEq)]
pub struct Bead {
    /// Index of the page that holds this bead, counted from zero.
    pub page_index: usize,
    /// Region covered by the bead as `[x0, y0, x1, y1]` in PDF user space.
    ///
    /// The coordinates keep PDF's bottom-left origin; they are NOT flipped
    /// into image-space (top-left origin) coordinates.
    pub rect: [f32; 4],
}

impl Bead {
    /// Bundle a page index and a rectangle into a `Bead`.
    pub fn new(page_index: usize, rect: [f32; 4]) -> Self {
        Self { page_index, rect }
    }
}
/// Walk the bead chain for a single thread.
///
/// Starting at `header.first_bead_ref`, follows each bead's `/N` (next bead)
/// link and collects one [`Bead`] per usable bead dictionary, in chain order.
///
/// # Arguments
///
/// * `header` - The thread header containing the first bead reference
/// * `resolver` - The xref resolver for resolving indirect references
/// * `page_ref_to_index` - Precomputed map from page `ObjRef` to 0-based page index
///
/// # Returns
///
/// `Ok` with the collected beads, or `Err` with the recorded diagnostics when
/// the walk hit the iteration cap, found a malformed cycle, or recorded any
/// `StructUnexpectedEof` diagnostic along the way. Diagnostics with other
/// codes are treated as non-fatal and the beads gathered so far are returned.
///
/// # Behavior
///
/// - The walk ends when `/N` points back to the first bead (the regular
///   circular end of a thread), or when `/N` is absent, null, or not a
///   reference
/// - Revisiting any bead other than the first is a malformed cycle and aborts
///   the walk
/// - A bead whose page reference is missing, is not a reference, or is not in
///   `page_ref_to_index`, or whose rectangle cannot be extracted, is left out
///   of the result; the walk then continues with its `/N`, subject to the
///   same termination and cycle checks as every other bead
/// - A bead that cannot be resolved or is not a dictionary stops the walk
/// - At most 10000 iterations are performed as a safety net
///
/// # PDF Spec Reference
///
/// Per PDF 1.7 Section 12.4.3 a bead dictionary holds `/T` (owning thread),
/// `/N` (next bead), `/V` (previous bead), `/P` (page object) and `/R`
/// (rectangle). For the page this function accepts a reference under `/R`
/// first and under `/P` second, so both the spec layout (where `/R` is an
/// array and is therefore passed over) and the `/R`-as-page layout used by
/// this crate's fixtures are understood. The rectangle is obtained through
/// `extract_bead_rect`.
pub fn walk_beads(
    header: &ThreadHeader,
    resolver: &XrefResolver,
    page_ref_to_index: &std::collections::HashMap<ObjRef, usize>,
) -> Result<Vec<Bead>> {
    // Safety net only: `visited` already guarantees termination, because every
    // step either stops or moves to a bead that was never seen before.
    const MAX_ITERATIONS: usize = 10000;

    let mut beads = Vec::new();
    let mut diagnostics = Vec::new();
    let mut visited = std::collections::HashSet::new();
    let first_ref = header.first_bead_ref;
    let mut current_ref = first_ref;
    let mut iterations = 0;
    visited.insert(current_ref);

    loop {
        iterations += 1;
        if iterations > MAX_ITERATIONS {
            diagnostics.push(Diagnostic::with_dynamic_no_offset(
                DiagCode::StructUnexpectedEof,
                format!(
                    "Thread bead chain exceeded maximum iteration count ({}); possible malformed chain",
                    MAX_ITERATIONS
                ),
            ));
            return Err(diagnostics);
        }

        // Without the bead dictionary there is no /N to follow, so stop here.
        let bead_obj = match resolver.resolve(current_ref) {
            Ok(obj) => obj,
            Err(_) => {
                diagnostics.push(Diagnostic::with_dynamic_no_offset(
                    DiagCode::StructMissingKey,
                    format!("Failed to resolve bead reference {:?}", current_ref),
                ));
                break;
            }
        };
        let bead_dict = match bead_obj.as_dict() {
            Some(d) => d,
            None => {
                diagnostics.push(Diagnostic::with_dynamic_no_offset(
                    DiagCode::StructUnexpectedEof,
                    format!("Bead {:?} is not a dictionary", current_ref),
                ));
                break;
            }
        };

        // Page reference: a reference under /R wins, then a reference under /P.
        let page_ref = match (bead_dict.get("R"), bead_dict.get("P")) {
            (Some(PdfObject::Ref(r)), _) => Some(*r),
            (_, Some(PdfObject::Ref(r))) => Some(*r),
            (Some(_), _) => {
                diagnostics.push(Diagnostic::with_dynamic_no_offset(
                    DiagCode::StructUnexpectedEof,
                    format!("Bead {:?} has /R but it's not a reference", current_ref,),
                ));
                None
            }
            (_, Some(_)) => {
                diagnostics.push(Diagnostic::with_dynamic_no_offset(
                    DiagCode::StructUnexpectedEof,
                    format!("Bead {:?} has /P but it's not a reference", current_ref,),
                ));
                None
            }
            (None, None) => {
                diagnostics.push(Diagnostic::with_dynamic_no_offset(
                    DiagCode::StructMissingKey,
                    format!(
                        "Bead {:?} is missing both /R and /P (page reference)",
                        current_ref
                    ),
                ));
                None
            }
        };

        let page_index = match page_ref {
            Some(ref_) => {
                let found = page_ref_to_index.get(&ref_).copied();
                if found.is_none() {
                    diagnostics.push(Diagnostic::with_dynamic_no_offset(
                        DiagCode::StructMissingKey,
                        format!(
                            "Bead {:?} page reference {:?} not found in document page tree",
                            current_ref, ref_
                        ),
                    ));
                }
                found
            }
            None => None,
        };

        // A bead is emitted only when both its page and its rectangle are
        // usable. An unusable bead is left out, but the walk still advances
        // through the shared step below, so skipping can never bypass the
        // termination and cycle checks.
        if let Some(page_index) = page_index {
            if let Some(rect) = extract_bead_rect(bead_dict, current_ref) {
                beads.push(Bead::new(page_index, rect));
            }
        }

        // Advance. Any failure to obtain /N ends the chain; the helper's own
        // diagnostics are intentionally not recorded, so an unusable /N does
        // not discard the beads gathered so far.
        let next_ref = match get_next_bead_ref(bead_dict, current_ref) {
            Ok(next) => next,
            Err(_) => break,
        };

        // Legitimate circular end: the last bead links back to the first.
        if next_ref == first_ref {
            break;
        }

        // `insert` returns false when the bead was already walked: a cycle
        // that does not close on the first bead.
        if !visited.insert(next_ref) {
            diagnostics.push(Diagnostic::with_dynamic_no_offset(
                DiagCode::StructUnexpectedEof,
                format!(
                    "Malformed bead chain: bead {:?} revisited (cycle doesn't return to first bead {:?})",
                    next_ref, first_ref
                ),
            ));
            return Err(diagnostics);
        }
        current_ref = next_ref;
    }

    // Any StructUnexpectedEof diagnostic makes the whole walk fail; diagnostics
    // with other codes are treated as warnings and the beads are returned.
    let has_fatal = diagnostics
        .iter()
        .any(|d| matches!(d.code, DiagCode::StructUnexpectedEof));
    if has_fatal {
        Err(diagnostics)
    } else {
        Ok(beads)
    }
}
/// Read the `/N` (next bead) link of a bead dictionary.
///
/// Returns `Ok` with the referenced bead when `/N` is an indirect reference.
/// Every other case is reported as `Err`:
///
/// - `/N` absent or explicitly null: end of thread, signalled by an empty
///   diagnostic list
/// - `/N` present with any other type: a single diagnostic naming
///   `current_ref`
fn get_next_bead_ref(
    bead_dict: &PdfDict,
    current_ref: ObjRef,
) -> std::result::Result<ObjRef, Vec<Diagnostic>> {
    let next_entry = bead_dict.get("N");

    // Happy path first: a proper indirect reference.
    if let Some(PdfObject::Ref(target)) = next_entry {
        return Ok(*target);
    }

    let problems = match next_entry {
        // End of thread is not an error, hence no diagnostic.
        None | Some(PdfObject::Null) => Vec::new(),
        Some(_) => vec![Diagnostic::with_dynamic_no_offset(
            DiagCode::StructUnexpectedEof,
            format!("Bead {:?} has /N but it's not a reference", current_ref,),
        )],
    };
    Err(problems)
}
/// Extract the bounding rectangle from a bead dictionary.
///
/// The rectangle is taken from the first of `/V`, `/R`, `/Rect` whose value is
/// an array:
///
/// - `/R` is where PDF 1.7 Section 12.4.3 stores a bead's rectangle
/// - `/V` is accepted for bead dictionaries that store the rectangle there;
///   it is tried first so such dictionaries keep decoding exactly as before.
///   In a spec-conformant bead `/V` is a reference to the previous bead, not
///   an array, so it never shadows `/R`
/// - `/Rect` is a lenient fallback
///
/// Returns `None` when no candidate key holds an array, when the array has
/// fewer than four elements, when one of the first four elements is not a
/// number, when a coordinate is not finite, or when the rectangle has zero
/// area.
///
/// The result is normalized to `[x0, y0, x1, y1]` with `x0 < x1` and
/// `y0 < y1`: PDF allows a rectangle to be written with any two diagonally
/// opposite corners, so corner order alone is not a reason to reject it.
///
/// The bead reference is accepted for signature compatibility only; no
/// diagnostic is produced here.
fn extract_bead_rect(bead_dict: &PdfDict, _current_ref: ObjRef) -> Option<[f32; 4]> {
    let rect_array = ["V", "R", "Rect"]
        .iter()
        .find_map(|key| bead_dict.get(*key).and_then(|obj| obj.as_array()))?;
    if rect_array.len() < 4 {
        return None;
    }

    // Only the first four elements are read; anything beyond them is ignored.
    let mut coords = [0.0f32; 4];
    for (slot, val) in coords.iter_mut().zip(rect_array.iter()) {
        let n = match val {
            PdfObject::Integer(n) => *n as f64,
            PdfObject::Real(n) => *n,
            _ => return None,
        };
        *slot = n as f32;
    }

    // NaN would slip through the ordering checks below and infinities make a
    // meaningless region, so both are rejected up front.
    if coords.iter().any(|c| !c.is_finite()) {
        return None;
    }

    // Normalize corner order, then require a non-zero area.
    let (x0, x1) = (coords[0].min(coords[2]), coords[0].max(coords[2]));
    let (y0, y1) = (coords[1].min(coords[3]), coords[1].max(coords[3]));
    if x0 >= x1 || y0 >= y1 {
        return None;
    }
    Some([x0, y0, x1, y1])
}
/// Decode a PDF text string to a Rust `String`.
///
/// The encoding is chosen from the leading byte-order mark:
///
/// - `FE FF`: UTF-16BE; the remainder is handed to `decode_utf16be`, whose
///   `None` (malformed UTF-16) is returned unchanged
/// - `EF BB BF`: UTF-8, the text-string form introduced by PDF 2.0; if the
///   remainder is not valid UTF-8 the whole input falls through to the
///   PDFDocEncoding path instead of being rejected
/// - anything else: PDFDocEncoding via `decode_pdfdocencoding`
///
/// The byte-order mark itself is never part of the returned text when the
/// corresponding Unicode decoding succeeds. An empty input yields `Some("")`.
fn decode_pdf_string(bytes: &[u8]) -> Option<String> {
    // UTF-16BE with BOM.
    if bytes.starts_with(&[0xFE, 0xFF]) {
        return decode_utf16be(&bytes[2..]);
    }

    // UTF-8 with BOM (PDF 2.0). Invalid payloads are not an error here: the
    // bytes are then decoded with the single-byte fallback below.
    if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
        if let Ok(text) = std::str::from_utf8(&bytes[3..]) {
            return Some(text.to_owned());
        }
    }

    // No (usable) Unicode marker: single-byte PDFDocEncoding.
    decode_pdfdocencoding(bytes)
}
/// Turn big-endian UTF-16 code units (byte-order mark already removed) into a `String`.
///
/// Returns `None` when the input has an odd number of bytes or contains an
/// unpaired surrogate; an empty input decodes to an empty string.
fn decode_utf16be(bytes: &[u8]) -> Option<String> {
    let pairs = bytes.chunks_exact(2);
    // A trailing single byte cannot form a code unit.
    if !pairs.remainder().is_empty() {
        return None;
    }

    let units = pairs.map(|pair| u16::from_be_bytes([pair[0], pair[1]]));
    // Collecting into a Result stops at the first unpaired surrogate.
    std::char::decode_utf16(units)
        .collect::<std::result::Result<String, _>>()
        .ok()
}
/// Decode PDFDocEncoding bytes to a `String`.
///
/// PDFDocEncoding is a single-byte encoding. It agrees with ASCII for the
/// printable range and with Latin-1 for 0xA1-0xFF, but assigns its own
/// characters to three ranges, which are translated through lookup tables:
///
/// - 0x18-0x1F: spacing diacritics (breve, caron, circumflex, ...)
/// - 0x80-0x9E: typographic characters (bullet, daggers, dashes, curly quotes,
///   trademark, ligatures, ...)
/// - 0xA0: euro sign
///
/// Bytes the encoding leaves undefined (for example 0x7F, 0x9F and 0xAD) are
/// passed through as the code point of the same value, so decoding never
/// fails and the result is always `Some`.
fn decode_pdfdocencoding(bytes: &[u8]) -> Option<String> {
    // Code points for bytes 0x18..=0x1F.
    const DIACRITICS: [char; 8] = [
        '\u{02D8}', '\u{02C7}', '\u{02C6}', '\u{02D9}', '\u{02DD}', '\u{02DB}', '\u{02DA}',
        '\u{02DC}',
    ];
    // Code points for bytes 0x80..=0xA0; 0x9F is undefined and kept as-is.
    const SPECIALS: [char; 33] = [
        '\u{2022}', '\u{2020}', '\u{2021}', '\u{2026}', '\u{2014}', '\u{2013}', '\u{0192}',
        '\u{2044}', '\u{2039}', '\u{203A}', '\u{2212}', '\u{2030}', '\u{201E}', '\u{201C}',
        '\u{201D}', '\u{2018}', '\u{2019}', '\u{201A}', '\u{2122}', '\u{FB01}', '\u{FB02}',
        '\u{0141}', '\u{0152}', '\u{0160}', '\u{0178}', '\u{017D}', '\u{0131}', '\u{0142}',
        '\u{0153}', '\u{0161}', '\u{017E}', '\u{009F}', '\u{20AC}',
    ];

    let decoded = bytes
        .iter()
        .map(|&b| match b {
            0x18..=0x1F => DIACRITICS[usize::from(b - 0x18)],
            0x80..=0xA0 => SPECIALS[usize::from(b - 0x80)],
            // Everything else coincides with Latin-1 (or is undefined).
            _ => char::from(b),
        })
        .collect();
    Some(decoded)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::parser::xref::XrefResolver;
#[test]
fn test_thread_header_new() {
let ref_ = ObjRef::new(1, 0);
let header = ThreadHeader::new(ref_);
assert_eq!(header.first_bead_ref, ref_);
assert!(header.title.is_none());
assert!(header.author.is_none());
assert!(header.subject.is_none());
assert!(header.keywords.is_none());
}
#[test]
fn test_thread_header_with_fields() {
let mut header = ThreadHeader::new(ObjRef::new(1, 0));
header.title = Some("Test Thread".to_string());
header.author = Some("John Doe".to_string());
assert_eq!(header.title, Some("Test Thread".to_string()));
assert_eq!(header.author, Some("John Doe".to_string()));
}
#[test]
fn test_decode_pdf_string_ascii() {
let bytes = b"Hello, World!";
assert_eq!(decode_pdf_string(bytes), Some("Hello, World!".to_string()));
}
#[test]
fn test_decode_pdf_string_utf16be_bom() {
// UTF-16BE with BOM: "Hello" in UTF-16BE
let bytes = &[
0xFE, 0xFF, 0x00, 0x48, 0x00, 0x65, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0x6F,
];
assert_eq!(decode_pdf_string(bytes), Some("Hello".to_string()));
}
#[test]
fn test_decode_pdf_string_empty() {
let bytes: &[u8] = b"";
assert_eq!(decode_pdf_string(bytes), Some("".to_string()));
}
#[test]
fn test_decode_pdf_string_latin1() {
// Latin-1 extended characters (á, é, ñ)
let bytes = &[0xE1, 0xE9, 0xF1]; // á, é, ñ in Latin-1
let result = decode_pdf_string(bytes);
assert!(result.is_some());
// Latin-1 maps directly to Unicode codepoints 0-255
assert_eq!(result.unwrap(), "áéñ");
}
#[test]
fn test_decode_utf16be_invalid_length() {
let bytes = &[0xFE, 0xFF, 0x00]; // Odd length after BOM
assert_eq!(decode_utf16be(&bytes[2..]), None);
}
#[test]
fn test_decode_pdfdocencoding_empty() {
assert_eq!(decode_pdfdocencoding(b""), Some("".to_string()));
}
#[test]
fn test_decode_pdfdocencoding_ascii() {
assert_eq!(decode_pdfdocencoding(b"ABC"), Some("ABC".to_string()));
}
/// Test: Thread with no /I info dict -> all fields null (per acceptance criteria)
#[test]
fn test_discover_thread_no_info_dict() {
let resolver = XrefResolver::new();
// Create a catalog with /Threads reference
let mut catalog = Catalog::new(ObjRef::new(1, 0));
catalog.threads_ref = Some(ObjRef::new(10, 0));
// Cache the /Threads array with one thread (has /F but no /I)
let thread_ref = ObjRef::new(11, 0);
let mut thread_dict = indexmap::IndexMap::new();
thread_dict.insert("F".into(), PdfObject::Ref(ObjRef::new(20, 0)));
// No /I dict - all fields should be None
let mut threads_array = Vec::new();
threads_array.push(PdfObject::Ref(thread_ref));
let mut threads_dict = indexmap::IndexMap::new();
threads_dict.insert("Threads".into(), PdfObject::Array(Box::new(threads_array)));
resolver.cache_object(ObjRef::new(10, 0), PdfObject::Dict(Box::new(threads_dict)));
resolver.cache_object(thread_ref, PdfObject::Dict(Box::new(thread_dict)));
let result = discover(&catalog, &resolver);
assert!(result.is_ok());
let threads = result.unwrap();
assert_eq!(threads.len(), 1);
let header = &threads[0];
assert_eq!(header.first_bead_ref, ObjRef::new(20, 0));
assert!(header.title.is_none());
assert!(header.author.is_none());
assert!(header.subject.is_none());
assert!(header.keywords.is_none());
}
/// Test: 3 threads with various info dict configurations
#[test]
fn test_discover_three_threads() {
let resolver = XrefResolver::new();
// Create a catalog with /Threads reference
let mut catalog = Catalog::new(ObjRef::new(1, 0));
catalog.threads_ref = Some(ObjRef::new(10, 0));
// Thread 1: full info dict
let thread1_ref = ObjRef::new(11, 0);
let mut thread1_dict = indexmap::IndexMap::new();
thread1_dict.insert("F".into(), PdfObject::Ref(ObjRef::new(20, 0)));
let mut info1 = indexmap::IndexMap::new();
info1.insert(
"Title".into(),
PdfObject::String(Box::new(b"Thread 1".to_vec())),
);
info1.insert(
"Author".into(),
PdfObject::String(Box::new(b"Author 1".to_vec())),
);
info1.insert(
"Subject".into(),
PdfObject::String(Box::new(b"Subject 1".to_vec())),
);
info1.insert(
"Keywords".into(),
PdfObject::String(Box::new(b"kw1,kw2".to_vec())),
);
thread1_dict.insert("I".into(), PdfObject::Dict(Box::new(info1)));
// Thread 2: no /Title but has other fields
let thread2_ref = ObjRef::new(12, 0);
let mut thread2_dict = indexmap::IndexMap::new();
thread2_dict.insert("F".into(), PdfObject::Ref(ObjRef::new(21, 0)));
let mut info2 = indexmap::IndexMap::new();
info2.insert(
"Author".into(),
PdfObject::String(Box::new(b"Author 2".to_vec())),
);
// No /Title
thread2_dict.insert("I".into(), PdfObject::Dict(Box::new(info2)));
// Thread 3: no /I dict at all
let thread3_ref = ObjRef::new(13, 0);
let mut thread3_dict = indexmap::IndexMap::new();
thread3_dict.insert("F".into(), PdfObject::Ref(ObjRef::new(22, 0)));
// No /I
let mut threads_array = Vec::new();
threads_array.push(PdfObject::Ref(thread1_ref));
threads_array.push(PdfObject::Ref(thread2_ref));
threads_array.push(PdfObject::Ref(thread3_ref));
let mut threads_dict = indexmap::IndexMap::new();
threads_dict.insert("Threads".into(), PdfObject::Array(Box::new(threads_array)));
resolver.cache_object(ObjRef::new(10, 0), PdfObject::Dict(Box::new(threads_dict)));
resolver.cache_object(thread1_ref, PdfObject::Dict(Box::new(thread1_dict)));
resolver.cache_object(thread2_ref, PdfObject::Dict(Box::new(thread2_dict)));
resolver.cache_object(thread3_ref, PdfObject::Dict(Box::new(thread3_dict)));
let result = discover(&catalog, &resolver);
assert!(result.is_ok());
let threads = result.unwrap();
assert_eq!(threads.len(), 3);
// Thread 1: all fields present
assert_eq!(threads[0].title, Some("Thread 1".to_string()));
assert_eq!(threads[0].author, Some("Author 1".to_string()));
assert_eq!(threads[0].subject, Some("Subject 1".to_string()));
assert_eq!(threads[0].keywords, Some("kw1,kw2".to_string()));
// Thread 2: no title
assert!(threads[1].title.is_none());
assert_eq!(threads[1].author, Some("Author 2".to_string()));
assert!(threads[1].subject.is_none());
assert!(threads[1].keywords.is_none());
// Thread 3: no info dict
assert!(threads[2].title.is_none());
assert!(threads[2].author.is_none());
assert!(threads[2].subject.is_none());
assert!(threads[2].keywords.is_none());
}
/// Test: Thread missing /F is skipped with diagnostic
#[test]
fn test_discover_thread_missing_f_skipped() {
let resolver = XrefResolver::new();
// Create a catalog with /Threads reference
let mut catalog = Catalog::new(ObjRef::new(1, 0));
catalog.threads_ref = Some(ObjRef::new(10, 0));
// Thread with no /F
let thread_ref = ObjRef::new(11, 0);
let mut thread_dict = indexmap::IndexMap::new();
// No /F - should be skipped
let mut info = indexmap::IndexMap::new();
info.insert(
"Title".into(),
PdfObject::String(Box::new(b"Orphan".to_vec())),
);
thread_dict.insert("I".into(), PdfObject::Dict(Box::new(info)));
// Valid thread
let thread2_ref = ObjRef::new(12, 0);
let mut thread2_dict = indexmap::IndexMap::new();
thread2_dict.insert("F".into(), PdfObject::Ref(ObjRef::new(20, 0)));
let mut threads_array = Vec::new();
threads_array.push(PdfObject::Ref(thread_ref));
threads_array.push(PdfObject::Ref(thread2_ref));
let mut threads_dict = indexmap::IndexMap::new();
threads_dict.insert("Threads".into(), PdfObject::Array(Box::new(threads_array)));
resolver.cache_object(ObjRef::new(10, 0), PdfObject::Dict(Box::new(threads_dict)));
resolver.cache_object(thread_ref, PdfObject::Dict(Box::new(thread_dict)));
resolver.cache_object(thread2_ref, PdfObject::Dict(Box::new(thread2_dict)));
let result = discover(&catalog, &resolver);
assert!(result.is_ok());
let threads = result.unwrap();
// Only the valid thread should be returned
assert_eq!(threads.len(), 1);
assert_eq!(threads[0].first_bead_ref, ObjRef::new(20, 0));
}
/// Test: UTF-16BE encoded title
#[test]
fn test_discover_thread_utf16_title() {
let resolver = XrefResolver::new();
// Create a catalog with /Threads reference
let mut catalog = Catalog::new(ObjRef::new(1, 0));
catalog.threads_ref = Some(ObjRef::new(10, 0));
// Thread with UTF-16BE title
let thread_ref = ObjRef::new(11, 0);
let mut thread_dict = indexmap::IndexMap::new();
thread_dict.insert("F".into(), PdfObject::Ref(ObjRef::new(20, 0)));
// UTF-16BE with BOM: "日本語" (Japanese)
let utf16_bytes = &[
0xFE, 0xFF, // BOM
0x65, 0xE5, // 日
0x67, 0x9C, // 本
0x9E, 0x8A, // 語
];
let mut info = indexmap::IndexMap::new();
info.insert(
"Title".into(),
PdfObject::String(Box::new(utf16_bytes.to_vec())),
);
thread_dict.insert("I".into(), PdfObject::Dict(Box::new(info)));
let mut threads_array = Vec::new();
threads_array.push(PdfObject::Ref(thread_ref));
let mut threads_dict = indexmap::IndexMap::new();
threads_dict.insert("Threads".into(), PdfObject::Array(Box::new(threads_array)));
resolver.cache_object(ObjRef::new(10, 0), PdfObject::Dict(Box::new(threads_dict)));
resolver.cache_object(thread_ref, PdfObject::Dict(Box::new(thread_dict)));
let result = discover(&catalog, &resolver);
assert!(result.is_ok());
let threads = result.unwrap();
assert_eq!(threads.len(), 1);
assert_eq!(threads[0].title, Some("日本語".to_string()));
}
/// Test: Empty /Threads returns empty Vec without diagnostic
#[test]
fn test_discover_empty_threads() {
let resolver = XrefResolver::new();
// Create a catalog with /Threads reference to empty array
let mut catalog = Catalog::new(ObjRef::new(1, 0));
catalog.threads_ref = Some(ObjRef::new(10, 0));
let empty_array = PdfObject::Array(Box::new(Vec::new()));
resolver.cache_object(ObjRef::new(10, 0), empty_array);
let result = discover(&catalog, &resolver);
assert!(result.is_ok());
assert!(result.unwrap().is_empty());
}
/// Test: /Threads absent returns empty Vec without diagnostic
#[test]
fn test_discover_no_threads_field() {
let resolver = XrefResolver::new();
// Create a catalog without /Threads
let catalog = Catalog::new(ObjRef::new(1, 0));
// threads_ref is None
let result = discover(&catalog, &resolver);
assert!(result.is_ok());
assert!(result.unwrap().is_empty());
}
/// Test: Empty string title is Some("") not None
#[test]
fn test_discover_thread_empty_title() {
let resolver = XrefResolver::new();
// Create a catalog with /Threads reference
let mut catalog = Catalog::new(ObjRef::new(1, 0));
catalog.threads_ref = Some(ObjRef::new(10, 0));
// Thread with empty title
let thread_ref = ObjRef::new(11, 0);
let mut thread_dict = indexmap::IndexMap::new();
thread_dict.insert("F".into(), PdfObject::Ref(ObjRef::new(20, 0)));
let mut info = indexmap::IndexMap::new();
info.insert("Title".into(), PdfObject::String(Box::new(Vec::new()))); // Empty string
thread_dict.insert("I".into(), PdfObject::Dict(Box::new(info)));
let mut threads_array = Vec::new();
threads_array.push(PdfObject::Ref(thread_ref));
let mut threads_dict = indexmap::IndexMap::new();
threads_dict.insert("Threads".into(), PdfObject::Array(Box::new(threads_array)));
resolver.cache_object(ObjRef::new(10, 0), PdfObject::Dict(Box::new(threads_dict)));
resolver.cache_object(thread_ref, PdfObject::Dict(Box::new(thread_dict)));
let result = discover(&catalog, &resolver);
assert!(result.is_ok());
let threads = result.unwrap();
assert_eq!(threads.len(), 1);
// Empty string should be Some("") not None
assert_eq!(threads[0].title, Some("".to_string()));
}
/// Test: Bead with /R and /V correctly extracted
#[test]
fn test_walk_beads_single_bead() {
let resolver = XrefResolver::new();
// Create page ref to index map
let mut page_ref_to_index = std::collections::HashMap::new();
let page_ref = ObjRef::new(100, 0);
page_ref_to_index.insert(page_ref, 0);
// Create thread header
let header = ThreadHeader::new(ObjRef::new(20, 0));
// Create bead dict with /R (page ref) and /V (rect)
let mut bead_dict = indexmap::IndexMap::new();
bead_dict.insert("R".into(), PdfObject::Ref(page_ref));
bead_dict.insert(
"V".into(),
PdfObject::Array(Box::new(vec![
PdfObject::Integer(100),
PdfObject::Integer(200),
PdfObject::Integer(300),
PdfObject::Integer(400),
])),
);
// /N points back to first (circular termination)
bead_dict.insert("N".into(), PdfObject::Ref(ObjRef::new(20, 0)));
resolver.cache_object(ObjRef::new(20, 0), PdfObject::Dict(Box::new(bead_dict)));
let result = walk_beads(&header, &resolver, &page_ref_to_index);
assert!(result.is_ok());
let beads = result.unwrap();
assert_eq!(beads.len(), 1);
assert_eq!(beads[0].page_index, 0);
assert_eq!(beads[0].rect, [100.0, 200.0, 300.0, 400.0]);
}
/// Test: Two article threads - both reconstructed with correct bead order
#[test]
fn test_walk_beads_two_threads() {
let resolver = XrefResolver::new();
// Create page ref to index map
let mut page_ref_to_index = std::collections::HashMap::new();
let page0_ref = ObjRef::new(100, 0);
let page1_ref = ObjRef::new(101, 0);
let page2_ref = ObjRef::new(102, 0);
page_ref_to_index.insert(page0_ref, 0);
page_ref_to_index.insert(page1_ref, 1);
page_ref_to_index.insert(page2_ref, 2);
// Thread 1: three beads across pages 0, 1, 2
let header1 = ThreadHeader::new(ObjRef::new(20, 0));
let mut bead1_dict = indexmap::IndexMap::new();
bead1_dict.insert("R".into(), PdfObject::Ref(page0_ref));
bead1_dict.insert(
"V".into(),
PdfObject::Array(Box::new(vec![
PdfObject::Integer(10),
PdfObject::Integer(20),
PdfObject::Integer(30),
PdfObject::Integer(40),
])),
);
bead1_dict.insert("N".into(), PdfObject::Ref(ObjRef::new(21, 0)));
let mut bead2_dict = indexmap::IndexMap::new();
bead2_dict.insert("R".into(), PdfObject::Ref(page1_ref));
bead2_dict.insert(
"V".into(),
PdfObject::Array(Box::new(vec![
PdfObject::Integer(50),
PdfObject::Integer(60),
PdfObject::Integer(70),
PdfObject::Integer(80),
])),
);
bead2_dict.insert("N".into(), PdfObject::Ref(ObjRef::new(22, 0)));
let mut bead3_dict = indexmap::IndexMap::new();
bead3_dict.insert("R".into(), PdfObject::Ref(page2_ref));
bead3_dict.insert(
"V".into(),
PdfObject::Array(Box::new(vec![
PdfObject::Integer(90),
PdfObject::Integer(100),
PdfObject::Integer(110),
PdfObject::Integer(120),
])),
);
bead3_dict.insert("N".into(), PdfObject::Ref(ObjRef::new(20, 0))); // Back to first
resolver.cache_object(ObjRef::new(20, 0), PdfObject::Dict(Box::new(bead1_dict)));
resolver.cache_object(ObjRef::new(21, 0), PdfObject::Dict(Box::new(bead2_dict)));
resolver.cache_object(ObjRef::new(22, 0), PdfObject::Dict(Box::new(bead3_dict)));
let result1 = walk_beads(&header1, &resolver, &page_ref_to_index);
assert!(result1.is_ok());
let beads1 = result1.unwrap();
assert_eq!(beads1.len(), 3);
assert_eq!(beads1[0].page_index, 0);
assert_eq!(beads1[0].rect, [10.0, 20.0, 30.0, 40.0]);
assert_eq!(beads1[1].page_index, 1);
assert_eq!(beads1[1].rect, [50.0, 60.0, 70.0, 80.0]);
assert_eq!(beads1[2].page_index, 2);
assert_eq!(beads1[2].rect, [90.0, 100.0, 110.0, 120.0]);
// Thread 2: single bead on page 1
let header2 = ThreadHeader::new(ObjRef::new(30, 0));
let mut bead4_dict = indexmap::IndexMap::new();
bead4_dict.insert("R".into(), PdfObject::Ref(page1_ref));
bead4_dict.insert(
"V".into(),
PdfObject::Array(Box::new(vec![
PdfObject::Integer(200),
PdfObject::Integer(300),
PdfObject::Integer(400),
PdfObject::Integer(500),
])),
);
bead4_dict.insert("N".into(), PdfObject::Ref(ObjRef::new(30, 0))); // Back to first
resolver.cache_object(ObjRef::new(30, 0), PdfObject::Dict(Box::new(bead4_dict)));
let result2 = walk_beads(&header2, &resolver, &page_ref_to_index);
assert!(result2.is_ok());
let beads2 = result2.unwrap();
assert_eq!(beads2.len(), 1);
assert_eq!(beads2[0].page_index, 1);
assert_eq!(beads2[0].rect, [200.0, 300.0, 400.0, 500.0]);
}
/// Test: a closed ring of beads (last /N points at the first bead) is walked
/// exactly once and the walk then stops instead of looping forever.
#[test]
fn test_walk_beads_circular_termination() {
    let page = ObjRef::new(100, 0);
    let mut pages = std::collections::HashMap::new();
    pages.insert(page, 0);
    let xref = XrefResolver::new();

    // Ring layout: 20 -> 21 -> 22 -> 20. Each row is (bead, x0, x1, successor);
    // all beads live on the same page and span 0..100 vertically.
    let ring = [
        (ObjRef::new(20, 0), 0, 100, ObjRef::new(21, 0)),
        (ObjRef::new(21, 0), 100, 200, ObjRef::new(22, 0)),
        (ObjRef::new(22, 0), 200, 300, ObjRef::new(20, 0)), // Back to first
    ];
    for (bead, x0, x1, successor) in ring {
        let mut dict = indexmap::IndexMap::new();
        dict.insert("R".into(), PdfObject::Ref(page));
        dict.insert(
            "V".into(),
            PdfObject::Array(Box::new(vec![
                PdfObject::Integer(x0),
                PdfObject::Integer(0),
                PdfObject::Integer(x1),
                PdfObject::Integer(100),
            ])),
        );
        dict.insert("N".into(), PdfObject::Ref(successor));
        xref.cache_object(bead, PdfObject::Dict(Box::new(dict)));
    }

    let thread = ThreadHeader::new(ObjRef::new(20, 0));
    let walked = walk_beads(&thread, &xref, &pages);
    assert!(walked.is_ok());
    // All three beads visited
    assert_eq!(walked.unwrap().len(), 3);
}
/// Test: a cycle that re-enters the chain at a bead other than the first one
/// makes the walk fail with a "Malformed bead chain" diagnostic.
#[test]
fn test_walk_beads_malformed_cycle() {
    let xref = XrefResolver::new();
    let page = ObjRef::new(100, 0);
    let pages = std::collections::HashMap::from([(page, 0)]);

    // Caches one bead on `page` with rect [x0 0 x1 100] and /N pointing at `next`.
    let store_bead = |bead: ObjRef, x0, x1, next: ObjRef| {
        let mut dict = indexmap::IndexMap::new();
        dict.insert("R".into(), PdfObject::Ref(page));
        dict.insert(
            "V".into(),
            PdfObject::Array(Box::new(vec![
                PdfObject::Integer(x0),
                PdfObject::Integer(0),
                PdfObject::Integer(x1),
                PdfObject::Integer(100),
            ])),
        );
        dict.insert("N".into(), PdfObject::Ref(next));
        xref.cache_object(bead, PdfObject::Dict(Box::new(dict)));
    };

    // Malformed chain: 20 -> 21 -> 22 -> 21 (the loop never returns to the first bead)
    let (first, second, third) = (ObjRef::new(20, 0), ObjRef::new(21, 0), ObjRef::new(22, 0));
    store_bead(first, 0, 100, second);
    store_bead(second, 100, 200, third);
    store_bead(third, 200, 300, second); // Back to 21, not 20

    let thread = ThreadHeader::new(first);
    let outcome = walk_beads(&thread, &xref, &pages);
    assert!(outcome.is_err());

    let diagnostics = outcome.unwrap_err();
    assert!(!diagnostics.is_empty());
    // Should contain a malformed cycle diagnostic
    let reports_cycle = diagnostics
        .iter()
        .any(|diag| diag.message.contains("Malformed bead chain"));
    assert!(reports_cycle);
}
/// Test: a bead that has no /N entry ends the walk after that bead.
#[test]
fn test_walk_beads_missing_next() {
    let first_bead = ObjRef::new(20, 0);
    let page = ObjRef::new(100, 0);

    // Lone bead: page ref and rect are present, but /N is deliberately left out.
    let rect = PdfObject::Array(Box::new(vec![
        PdfObject::Integer(0),
        PdfObject::Integer(0),
        PdfObject::Integer(100),
        PdfObject::Integer(100),
    ]));
    let mut lone_bead = indexmap::IndexMap::new();
    lone_bead.insert("R".into(), PdfObject::Ref(page));
    lone_bead.insert("V".into(), rect);

    let xref = XrefResolver::new();
    xref.cache_object(first_bead, PdfObject::Dict(Box::new(lone_bead)));

    let mut pages = std::collections::HashMap::new();
    pages.insert(page, 0);

    let thread = ThreadHeader::new(first_bead);
    let walked = walk_beads(&thread, &xref, &pages);
    assert!(walked.is_ok());
    assert_eq!(walked.unwrap().len(), 1);
}
/// Test: a bead carrying neither /R nor /P is left out of the result, while
/// the following bead in the chain is still collected.
#[test]
fn test_walk_beads_missing_page_ref() {
    let (pageless_ref, valid_ref) = (ObjRef::new(20, 0), ObjRef::new(21, 0));
    let page = ObjRef::new(100, 0);
    let pages = std::collections::HashMap::from([(page, 0)]);

    // Builds the /V array for a rect spanning x0..x1 horizontally and 0..100 vertically.
    let rect = |x0, x1| {
        PdfObject::Array(Box::new(vec![
            PdfObject::Integer(x0),
            PdfObject::Integer(0),
            PdfObject::Integer(x1),
            PdfObject::Integer(100),
        ]))
    };

    // Bead 20: rect and successor only, no page reference of any kind.
    let mut pageless = indexmap::IndexMap::new();
    pageless.insert("V".into(), rect(0, 100));
    pageless.insert("N".into(), PdfObject::Ref(valid_ref));

    // Bead 21: fully specified, links back to bead 20.
    let mut valid = indexmap::IndexMap::new();
    valid.insert("R".into(), PdfObject::Ref(page));
    valid.insert("V".into(), rect(100, 200));
    valid.insert("N".into(), PdfObject::Ref(pageless_ref));

    let xref = XrefResolver::new();
    xref.cache_object(pageless_ref, PdfObject::Dict(Box::new(pageless)));
    xref.cache_object(valid_ref, PdfObject::Dict(Box::new(valid)));

    let thread = ThreadHeader::new(pageless_ref);
    let walked = walk_beads(&thread, &xref, &pages);
    assert!(walked.is_ok());

    // First bead skipped, second bead included
    let collected = walked.unwrap();
    assert_eq!(collected.len(), 1);
    assert_eq!(collected[0].page_index, 0);
}
/// Test: /P fallback for page reference
///
/// The bead dictionary has no /R entry; the page is supplied under /P instead
/// and the bead is still expected to resolve to page index 0.
#[test]
fn test_walk_beads_pg_fallback() {
    // NOTE(review): PDF 32000-1 Table 161 lists /P as the bead's page, /R as its
    // rectangle and /V as the previous bead — confirm that the key roles used by
    // these tests (/R page, /V rect, /P fallback) are intentional.
    let bead_ref = ObjRef::new(20, 0);
    let page_ref = ObjRef::new(100, 0);

    let resolver = XrefResolver::new();
    let mut page_ref_to_index = std::collections::HashMap::new();
    page_ref_to_index.insert(page_ref, 0);
    let header = ThreadHeader::new(bead_ref);

    // Bead with /P instead of /R
    let mut bead_dict = indexmap::IndexMap::new();
    bead_dict.insert("P".into(), PdfObject::Ref(page_ref)); // /P instead of /R
    bead_dict.insert(
        "V".into(),
        PdfObject::Array(Box::new(vec![
            PdfObject::Integer(0),
            PdfObject::Integer(0),
            PdfObject::Integer(100),
            PdfObject::Integer(100),
        ])),
    );
    // Single-bead ring: /N points back at the bead itself.
    bead_dict.insert("N".into(), PdfObject::Ref(bead_ref));
    resolver.cache_object(bead_ref, PdfObject::Dict(Box::new(bead_dict)));

    let result = walk_beads(&header, &resolver, &page_ref_to_index);
    assert!(result.is_ok());
    let beads = result.unwrap();
    assert_eq!(beads.len(), 1);
    assert_eq!(beads[0].page_index, 0);
}
/// Test: a bead with no /V rect is left out of the result, while the next
/// bead in the chain is still collected.
#[test]
fn test_walk_beads_missing_rect() {
    let rectless_ref = ObjRef::new(20, 0);
    let complete_ref = ObjRef::new(21, 0);
    let page = ObjRef::new(100, 0);

    let pages: std::collections::HashMap<_, _> = std::iter::once((page, 0)).collect();
    let xref = XrefResolver::new();

    // Bead 20: page and successor only, no rect.
    let mut rectless = indexmap::IndexMap::new();
    rectless.insert("R".into(), PdfObject::Ref(page));
    rectless.insert("N".into(), PdfObject::Ref(complete_ref));
    xref.cache_object(rectless_ref, PdfObject::Dict(Box::new(rectless)));

    // Bead 21: has a valid rect and links back to bead 20.
    let unit_rect = vec![
        PdfObject::Integer(0),
        PdfObject::Integer(0),
        PdfObject::Integer(100),
        PdfObject::Integer(100),
    ];
    let mut complete = indexmap::IndexMap::new();
    complete.insert("R".into(), PdfObject::Ref(page));
    complete.insert("V".into(), PdfObject::Array(Box::new(unit_rect)));
    complete.insert("N".into(), PdfObject::Ref(rectless_ref));
    xref.cache_object(complete_ref, PdfObject::Dict(Box::new(complete)));

    let thread = ThreadHeader::new(rectless_ref);
    let walked = walk_beads(&thread, &xref, &pages);
    assert!(walked.is_ok());
    // First bead skipped (no rect), second bead included
    assert_eq!(walked.unwrap().len(), 1);
}
/// Test: a bead whose rect has x1 smaller than x0 is left out of the result.
#[test]
fn test_walk_beads_invalid_rect_shape() {
    let bead_ref = ObjRef::new(20, 0);
    let page = ObjRef::new(100, 0);

    // Rect with its horizontal extent inverted (x0 >= x1)
    let inverted_rect = vec![
        PdfObject::Integer(100), // x0
        PdfObject::Integer(0),   // y0
        PdfObject::Integer(50),  // x1 < x0 - invalid!
        PdfObject::Integer(100),
    ];

    let mut dict = indexmap::IndexMap::new();
    dict.insert("R".into(), PdfObject::Ref(page));
    dict.insert("V".into(), PdfObject::Array(Box::new(inverted_rect)));
    dict.insert("N".into(), PdfObject::Ref(bead_ref));

    let xref = XrefResolver::new();
    xref.cache_object(bead_ref, PdfObject::Dict(Box::new(dict)));

    let mut pages = std::collections::HashMap::new();
    pages.insert(page, 0);

    let thread = ThreadHeader::new(bead_ref);
    let walked = walk_beads(&thread, &xref, &pages);
    assert!(walked.is_ok());
    // Bead skipped due to invalid rect
    assert_eq!(walked.unwrap().len(), 0);
}
/// Test: a bead whose page reference is absent from the page-ref-to-index map
/// is left out of the result.
#[test]
fn test_walk_beads_page_ref_not_in_tree() {
    let bead_ref = ObjRef::new(20, 0);
    let known_page = ObjRef::new(100, 0);
    // Referenced by the bead but never registered in the page map.
    let unknown_page = ObjRef::new(999, 0);

    let pages = std::collections::HashMap::from([(known_page, 0)]);

    let rect = PdfObject::Array(Box::new(vec![
        PdfObject::Integer(0),
        PdfObject::Integer(0),
        PdfObject::Integer(100),
        PdfObject::Integer(100),
    ]));
    let mut dict = indexmap::IndexMap::new();
    dict.insert("R".into(), PdfObject::Ref(unknown_page));
    dict.insert("V".into(), rect);
    dict.insert("N".into(), PdfObject::Ref(bead_ref));

    let xref = XrefResolver::new();
    xref.cache_object(bead_ref, PdfObject::Dict(Box::new(dict)));

    let thread = ThreadHeader::new(bead_ref);
    let walked = walk_beads(&thread, &xref, &pages);
    assert!(walked.is_ok());
    // Bead skipped due to unknown page ref
    assert_eq!(walked.unwrap().len(), 0);
}
/// Test: `Bead::new` stores the page index and rect it was given.
#[test]
fn test_bead_new() {
    let rect = [10.0, 20.0, 30.0, 40.0];
    let bead = Bead::new(5, rect);
    assert_eq!(bead.page_index, 5);
    assert_eq!(bead.rect, rect);
}
/// Test: Maximum iteration cap enforced
///
/// Builds a very long bead chain and expects the walk to fail with an
/// "exceeded maximum iteration count" diagnostic instead of returning beads.
#[test]
fn test_walk_beads_max_iterations() {
    let resolver = XrefResolver::new();
    let mut page_ref_to_index = std::collections::HashMap::new();
    let page_ref = ObjRef::new(100, 0);
    page_ref_to_index.insert(page_ref, 0);
    let header = ThreadHeader::new(ObjRef::new(20, 0));

    // Index of the final bead. The chain holds 10051 beads stored as objects
    // 20..=10070 (20 -> 21 -> ... -> 10070 -> 20).
    // NOTE(review): assumes the walker's iteration cap is below 10051 — confirm
    // against the limit used by walk_beads.
    let last_index = 10050;
    for i in 0..=last_index {
        let mut bead_dict = indexmap::IndexMap::new();
        bead_dict.insert("R".into(), PdfObject::Ref(page_ref));
        bead_dict.insert(
            "V".into(),
            PdfObject::Array(Box::new(vec![
                PdfObject::Integer(i),
                PdfObject::Integer(0),
                PdfObject::Integer(i + 100),
                PdfObject::Integer(100),
            ])),
        );
        // Each bead points to the next, except the last which points back to first.
        // The `as u32` casts cannot truncate: the largest value is 20 + 10050 = 10070.
        let next_ref = if i < last_index {
            ObjRef::new((20 + i + 1) as u32, 0)
        } else {
            ObjRef::new(20, 0) // Would close the loop, but we hit max iterations first
        };
        bead_dict.insert("N".into(), PdfObject::Ref(next_ref));
        resolver.cache_object(
            ObjRef::new((20 + i) as u32, 0),
            PdfObject::Dict(Box::new(bead_dict)),
        );
    }

    let result = walk_beads(&header, &resolver, &page_ref_to_index);
    assert!(result.is_err());
    let diagnostics = result.unwrap_err();
    assert!(!diagnostics.is_empty());
    assert!(diagnostics
        .iter()
        .any(|d| d.message.contains("exceeded maximum iteration count")));
}
}