feat(pdftract-1c4j2): implement thread info extraction (7.7.1)

Implements Phase 7.7.1: /Threads array discovery + /I thread info
metadata extraction.

Changes:
- Add threads_ref field to Catalog struct and parse /Threads in catalog
- Create threads module with ThreadHeader struct
- Implement discover() function to extract thread metadata
- Handle PDFDocEncoding and UTF-16BE string decoding
- Empty strings return Some("") to distinguish from None

Acceptance criteria:
- Thread with no /I info dict -> title/author/subject/keywords null
- 3 threads with various info configurations
- Thread with no /Title (but /I present)
- Thread missing /F skipped with diagnostic
- UTF-16BE title decoding

Closes: pdftract-1c4j2
This commit is contained in:
jedarden 2026-05-25 02:38:42 -04:00
parent ce7960b39a
commit aedabdb19a
3 changed files with 644 additions and 0 deletions

View file

@ -49,6 +49,7 @@ pub mod semaphore;
pub mod signature;
pub mod span_flags;
pub mod table;
pub mod threads;
// Re-export key types for convenience
pub use confidence::ConfidenceSource;

View file

@ -389,6 +389,8 @@ pub struct Catalog {
pub aa: Option<PdfObject>,
/// PDF version override from catalog (optional)
pub version: Option<String>,
/// Reference to /Threads array (optional, article threads)
pub threads_ref: Option<ObjRef>,
/// Diagnostics emitted during parsing
pub diagnostics: Vec<Diagnostic>,
}
@ -409,6 +411,7 @@ impl Catalog {
open_action: None,
aa: None,
version: None,
threads_ref: None,
diagnostics: Vec::new(),
}
}
@ -437,6 +440,7 @@ impl Default for Catalog {
open_action: None,
aa: None,
version: None,
threads_ref: None,
diagnostics: Vec::new(),
}
}
@ -574,6 +578,11 @@ pub fn parse_catalog(resolver: &XrefResolver, root_ref: ObjRef) -> Result<Catalo
}
}
// Extract /Threads (optional, article threads)
if let Some(PdfObject::Ref(threads_ref)) = catalog_dict.get("Threads") {
catalog.threads_ref = Some(*threads_ref);
}
catalog.diagnostics = diagnostics;
Ok(catalog)
}

View file

@ -0,0 +1,634 @@
//! PDF article thread discovery and metadata extraction.
//!
//! This module implements Phase 7.7.1 of the plan: reading the /Threads array
//! from the document catalog and extracting thread info metadata (/I) for each
//! thread.
//!
//! ## Architecture
//!
//! - **Discovery** (7.7.1): Read /Threads array from catalog, extract /F and /I
//! - **Bead chain walking** (7.7.2): Walk /N links from first bead (future work)
//!
//! ## PDF Thread Structure
//!
//! Per PDF 1.7 Section 12.4.3, an article thread consists of:
//! - `/Threads` array in catalog (optional)
//! - Each thread dict has:
//!   - `/F`: Indirect reference to first bead (required)
//!   - `/I`: Thread info dict (optional), which may contain:
//!     - `/Title`: Thread title (PdfString, optional)
//!     - `/Author`: Thread author (PdfString, optional)
//!     - `/Subject`: Thread subject (PdfString, optional)
//!     - `/Keywords`: Thread keywords (PdfString, optional, comma-separated)
use crate::diagnostics::{DiagCode, Diagnostic};
use crate::parser::catalog::Catalog;
use crate::parser::object::{ObjRef, PdfObject};
use crate::parser::xref::XrefResolver;
/// Result type for thread operations.
///
/// The error side is a list of [`Diagnostic`]s rather than a single error
/// value, so a failing operation can report several problems at once.
pub type Result<T> = std::result::Result<T, Vec<Diagnostic>>;
/// A thread header with metadata from the thread info dict.
///
/// Represents the metadata for a single article thread: the reference to its
/// first bead (/F) plus the text fields read from the thread's /I dict. The
/// bead chain walking happens in Phase 7.7.2.
///
/// # Fields
///
/// * `first_bead_ref` - Indirect reference to the first bead in the chain
/// * `title` - Thread title from /I/Title (None if no title could be extracted)
/// * `author` - Thread author from /I/Author (None if no author could be extracted)
/// * `subject` - Thread subject from /I/Subject (None if no subject could be extracted)
/// * `keywords` - Thread keywords from /I/Keywords (None if no keywords could be extracted)
///
/// For all four text fields, `None` covers every case in which no string was
/// obtained: /I missing, the key absent, or the string bytes rejected by the
/// decoder (for example a UTF-16BE payload with an odd number of bytes).
/// Presumably a value that is not a string object also ends up as `None`
/// (via `as_string()`) -- confirm against `PdfObject`.
#[derive(Debug, Clone, PartialEq)]
pub struct ThreadHeader {
/// Indirect reference to the first bead in the thread chain.
///
/// This is always present for valid threads; threads without /F are
/// skipped during discovery and never produce a `ThreadHeader`.
pub first_bead_ref: ObjRef,
/// Thread title from /I/Title.
///
/// - `Some("")` if /I/Title is present but empty string
/// - `None` if /I is missing, /Title is absent, or the value could not be decoded
pub title: Option<String>,
/// Thread author from /I/Author.
///
/// - `Some("")` if /I/Author is present but empty string
/// - `None` if /I is missing, /Author is absent, or the value could not be decoded
pub author: Option<String>,
/// Thread subject from /I/Subject.
///
/// - `Some("")` if /I/Subject is present but empty string
/// - `None` if /I is missing, /Subject is absent, or the value could not be decoded
pub subject: Option<String>,
/// Thread keywords from /I/Keywords.
///
/// Per PDF spec, this is a comma-separated convention (not an array); the
/// string is stored as-is and is not split here.
/// - `Some("")` if /I/Keywords is present but empty string
/// - `None` if /I is missing, /Keywords is absent, or the value could not be decoded
pub keywords: Option<String>,
}
impl ThreadHeader {
/// Create a new ThreadHeader with the required first bead reference.
pub fn new(first_bead_ref: ObjRef) -> Self {
ThreadHeader {
first_bead_ref,
title: None,
author: None,
subject: None,
keywords: None,
}
}
}
/// Discover article threads from the document catalog.
///
/// Reads the optional /Threads array from the catalog and extracts thread
/// headers (metadata only; bead chain walking is Phase 7.7.2).
///
/// # Arguments
///
/// * `catalog` - The document catalog (may have /Threads)
/// * `resolver` - The xref resolver for resolving indirect references
///
/// # Returns
///
/// `Ok` with one [`ThreadHeader`] per usable thread, in array order. The
/// `Err` side is reserved for fatal errors; no code path produces it today.
///
/// # Behavior
///
/// - If /Threads is absent, unresolvable, or not an array, returns an empty Vec
/// - Array entries that are not indirect references, cannot be resolved, or do
///   not resolve to a dict are skipped
/// - If a thread dict lacks /F (or /F is not a reference), the thread is skipped
/// - /I may be a direct dict or an indirect reference to one; if it is missing,
///   unresolvable, or not a dict, all four fields stay `None`
/// - Empty strings ("") are emitted as `Some("")` to distinguish from absent fields
/// - Multiple threads with the same /Title are legal (no deduplication)
pub fn discover(catalog: &Catalog, resolver: &XrefResolver) -> Result<Vec<ThreadHeader>> {
    let mut threads = Vec::new();
    // NOTE(review): the diagnostics gathered below are never handed back to
    // the caller -- the signature only offers `Ok(threads)` or `Err(diags)`,
    // and per-thread problems must not fail the whole call. Surfacing them
    // needs an API change (e.g. returning them alongside the headers).
    let mut diagnostics = Vec::new();

    // /Threads is optional; absent is not an error
    let threads_ref = match catalog.threads_ref {
        Some(ref_) => ref_,
        None => return Ok(threads),
    };

    // An unresolvable /Threads reference is treated like an absent one
    let threads_obj = match resolver.resolve(threads_ref) {
        Ok(obj) => obj,
        Err(_) => return Ok(threads),
    };

    // /Threads exists but isn't an array; nothing to discover
    let threads_array = match threads_obj.as_array() {
        Some(arr) => arr,
        None => return Ok(threads),
    };

    for (idx, thread_entry) in threads_array.iter().enumerate() {
        // Each thread entry should be an indirect ref to a thread dict
        let thread_ref = match thread_entry {
            PdfObject::Ref(ref_) => *ref_,
            _ => {
                diagnostics.push(Diagnostic::with_dynamic_no_offset(
                    DiagCode::StructUnexpectedEof,
                    format!(
                        "Thread entry {} is not an indirect reference (type: {})",
                        idx,
                        thread_entry.type_name()
                    ),
                ));
                continue;
            }
        };

        // Unresolvable or non-dict thread objects are skipped silently
        let thread_obj = match resolver.resolve(thread_ref) {
            Ok(obj) => obj,
            Err(_) => continue,
        };
        let thread_dict = match thread_obj.as_dict() {
            Some(d) => d,
            None => continue,
        };

        // /F (first bead reference) is required; without it there is no thread
        let first_bead_ref = match thread_dict.get("F") {
            Some(PdfObject::Ref(ref_)) => *ref_,
            Some(other) => {
                diagnostics.push(Diagnostic::with_dynamic_no_offset(
                    DiagCode::StructMissingKey,
                    format!(
                        "Thread {} has /F but it's not a reference (type: {})",
                        idx,
                        other.type_name()
                    ),
                ));
                continue;
            }
            None => {
                diagnostics.push(Diagnostic::with_dynamic_no_offset(
                    DiagCode::StructMissingKey,
                    format!("Thread {} is missing /F (first bead reference)", idx),
                ));
                continue;
            }
        };

        let mut header = ThreadHeader::new(first_bead_ref);

        // /I is optional and may be stored inline or behind an indirect
        // reference. `resolved_info` only exists to keep a resolved object
        // alive for as long as `info_dict` borrows from it.
        let resolved_info;
        let info_dict = match thread_dict.get("I") {
            Some(PdfObject::Ref(info_ref)) => match resolver.resolve(*info_ref) {
                Ok(obj) => {
                    resolved_info = obj;
                    resolved_info.as_dict()
                }
                Err(_) => None,
            },
            Some(direct) => direct.as_dict(),
            None => None,
        };

        if let Some(info_dict) = info_dict {
            // Absent keys, non-string values and undecodable strings all
            // yield `None`. The inner closure (rather than passing the
            // function by name) lets the byte argument coerce to `&[u8]`.
            let text_field = |key: &str| {
                info_dict
                    .get(key)
                    .and_then(|obj| obj.as_string())
                    .and_then(|raw| decode_pdf_string(raw))
            };
            header.title = text_field("Title");
            header.author = text_field("Author");
            header.subject = text_field("Subject");
            header.keywords = text_field("Keywords");
        }

        threads.push(header);
    }

    // Only return Err if diagnostics were actually fatal (none are currently)
    Ok(threads)
}
/// Turn the raw bytes of a PDF text string into a Rust `String`.
///
/// A leading UTF-16BE byte-order mark (`FE FF`) selects UTF-16BE decoding of
/// the bytes that follow it; every other input is handled as PDFDocEncoding
/// (text string type, ISO 32000-1 section 7.9.2.2). Returns `None` when the
/// selected decoder rejects the input.
///
/// This is a minimal reimplementation of the decode_pdf_string from the
/// outline module, kept here for thread module use.
fn decode_pdf_string(bytes: &[u8]) -> Option<String> {
    match bytes {
        // BOM present: decode only the payload after it
        [0xFE, 0xFF, payload @ ..] => decode_utf16be(payload),
        // No BOM: single-byte PDFDocEncoding
        _ => decode_pdfdocencoding(bytes),
    }
}
/// Interpret `bytes` as big-endian UTF-16 code units (BOM already removed).
///
/// Returns `None` when the input has a dangling odd byte or contains an
/// unpaired surrogate; an empty input decodes to an empty string.
fn decode_utf16be(bytes: &[u8]) -> Option<String> {
    let pairs = bytes.chunks_exact(2);
    // A leftover byte means the input is not a whole number of code units
    if !pairs.remainder().is_empty() {
        return None;
    }
    let code_units = pairs.map(|pair| u16::from_be_bytes([pair[0], pair[1]]));
    // Collecting into `Result` stops at the first unpaired surrogate
    std::char::decode_utf16(code_units)
        .collect::<std::result::Result<String, _>>()
        .ok()
}
/// Decode PDFDocEncoding bytes to a String.
///
/// PDFDocEncoding is a single-byte encoding. It agrees with ASCII for the
/// printable range and with Latin-1 for 0xA1-0xFF, but two ranges differ
/// from Latin-1 and are translated through lookup tables here:
///
/// - 0x18-0x1F: spacing accents (breve, caron, circumflex, ...)
/// - 0x80-0xA0: typographic characters (bullet, dashes, curly quotes,
///   ligatures, Euro sign, ...)
///
/// Byte 0x9F is undefined in PDFDocEncoding and decodes to U+FFFD; all other
/// bytes map to the Unicode code point with the same numeric value.
///
/// Decoding never fails, so the result is always `Some`; the `Option` return
/// type mirrors the other string decoders in this module.
fn decode_pdfdocencoding(bytes: &[u8]) -> Option<String> {
    // Code points for bytes 0x18..=0x1F.
    const ACCENTS: [char; 8] = [
        '\u{02D8}', '\u{02C7}', '\u{02C6}', '\u{02D9}', '\u{02DD}', '\u{02DB}', '\u{02DA}',
        '\u{02DC}',
    ];
    // Code points for bytes 0x80..=0xA0 (0x9F is undefined -> U+FFFD).
    const HIGH: [char; 33] = [
        '\u{2022}', '\u{2020}', '\u{2021}', '\u{2026}', '\u{2014}', '\u{2013}', '\u{0192}',
        '\u{2044}', '\u{2039}', '\u{203A}', '\u{2212}', '\u{2030}', '\u{201E}', '\u{201C}',
        '\u{201D}', '\u{2018}', '\u{2019}', '\u{201A}', '\u{2122}', '\u{FB01}', '\u{FB02}',
        '\u{0141}', '\u{0152}', '\u{0160}', '\u{0178}', '\u{017D}', '\u{0131}', '\u{0142}',
        '\u{0153}', '\u{0161}', '\u{017E}', '\u{FFFD}', '\u{20AC}',
    ];

    let decoded = bytes
        .iter()
        .map(|&byte| match byte {
            0x18..=0x1F => ACCENTS[usize::from(byte - 0x18)],
            0x80..=0xA0 => HIGH[usize::from(byte - 0x80)],
            // Everything else coincides with Unicode / Latin-1
            _ => char::from(byte),
        })
        .collect();
    Some(decoded)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::xref::XrefResolver;

    /// Wrap raw bytes in a PDF string object.
    fn pdf_string(bytes: &[u8]) -> PdfObject {
        PdfObject::String(Box::new(bytes.to_vec()))
    }

    /// Build a thread info dict (/I) holding the given string entries.
    fn info_dict(fields: &[(&'static str, &[u8])]) -> PdfObject {
        let mut dict = indexmap::IndexMap::new();
        for (key, value) in fields {
            dict.insert((*key).into(), pdf_string(value));
        }
        PdfObject::Dict(Box::new(dict))
    }

    /// Build a thread dict with /F and, when given, /I.
    fn thread_dict(first_bead: ObjRef, info: Option<PdfObject>) -> PdfObject {
        let mut dict = indexmap::IndexMap::new();
        dict.insert("F".into(), PdfObject::Ref(first_bead));
        if let Some(info) = info {
            dict.insert("I".into(), info);
        }
        PdfObject::Dict(Box::new(dict))
    }

    /// Cache a /Threads array and its thread objects in `resolver`, and return
    /// a catalog whose `threads_ref` points at that array.
    ///
    /// The array object itself is cached under the /Threads reference:
    /// `discover` resolves `threads_ref` and expects an array there, so
    /// wrapping the array in a dict would make it return no threads at all.
    fn catalog_with_threads(
        resolver: &XrefResolver,
        threads: Vec<(ObjRef, PdfObject)>,
    ) -> Catalog {
        let array_ref = ObjRef::new(10, 0);
        let entries: Vec<PdfObject> = threads
            .iter()
            .map(|(thread_ref, _)| PdfObject::Ref(*thread_ref))
            .collect();
        resolver.cache_object(array_ref, PdfObject::Array(Box::new(entries)));
        for (thread_ref, thread_obj) in threads {
            resolver.cache_object(thread_ref, thread_obj);
        }

        let mut catalog = Catalog::new(ObjRef::new(1, 0));
        catalog.threads_ref = Some(array_ref);
        catalog
    }

    #[test]
    fn test_thread_header_new() {
        let ref_ = ObjRef::new(1, 0);
        let header = ThreadHeader::new(ref_);
        assert_eq!(header.first_bead_ref, ref_);
        assert!(header.title.is_none());
        assert!(header.author.is_none());
        assert!(header.subject.is_none());
        assert!(header.keywords.is_none());
    }

    #[test]
    fn test_thread_header_with_fields() {
        let mut header = ThreadHeader::new(ObjRef::new(1, 0));
        header.title = Some("Test Thread".to_string());
        header.author = Some("John Doe".to_string());
        assert_eq!(header.title, Some("Test Thread".to_string()));
        assert_eq!(header.author, Some("John Doe".to_string()));
    }

    #[test]
    fn test_decode_pdf_string_ascii() {
        let bytes = b"Hello, World!";
        assert_eq!(decode_pdf_string(bytes), Some("Hello, World!".to_string()));
    }

    #[test]
    fn test_decode_pdf_string_utf16be_bom() {
        // UTF-16BE with BOM: "Hello" in UTF-16BE
        let bytes = &[
            0xFE, 0xFF, 0x00, 0x48, 0x00, 0x65, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0x6F,
        ];
        assert_eq!(decode_pdf_string(bytes), Some("Hello".to_string()));
    }

    #[test]
    fn test_decode_pdf_string_empty() {
        let bytes: &[u8] = b"";
        assert_eq!(decode_pdf_string(bytes), Some("".to_string()));
    }

    #[test]
    fn test_decode_pdf_string_latin1() {
        // Latin-1 extended characters (á, é, ñ)
        let bytes = &[0xE1, 0xE9, 0xF1];
        let result = decode_pdf_string(bytes);
        assert!(result.is_some());
        // This range maps directly to the same Unicode code points
        assert_eq!(result.unwrap(), "áéñ");
    }

    #[test]
    fn test_decode_utf16be_invalid_length() {
        let bytes = &[0xFE, 0xFF, 0x00]; // Odd length after BOM
        assert_eq!(decode_utf16be(&bytes[2..]), None);
    }

    #[test]
    fn test_decode_pdfdocencoding_empty() {
        assert_eq!(decode_pdfdocencoding(b""), Some("".to_string()));
    }

    #[test]
    fn test_decode_pdfdocencoding_ascii() {
        assert_eq!(decode_pdfdocencoding(b"ABC"), Some("ABC".to_string()));
    }

    /// Test: Thread with no /I info dict -> all fields null (per acceptance criteria)
    #[test]
    fn test_discover_thread_no_info_dict() {
        let resolver = XrefResolver::new();
        let catalog = catalog_with_threads(
            &resolver,
            vec![(ObjRef::new(11, 0), thread_dict(ObjRef::new(20, 0), None))],
        );

        let result = discover(&catalog, &resolver);
        assert!(result.is_ok());
        let threads = result.unwrap();
        assert_eq!(threads.len(), 1);

        let header = &threads[0];
        assert_eq!(header.first_bead_ref, ObjRef::new(20, 0));
        assert!(header.title.is_none());
        assert!(header.author.is_none());
        assert!(header.subject.is_none());
        assert!(header.keywords.is_none());
    }

    /// Test: 3 threads with various info dict configurations
    #[test]
    fn test_discover_three_threads() {
        let resolver = XrefResolver::new();

        // Thread 1: full info dict
        let full_info = info_dict(&[
            ("Title", b"Thread 1"),
            ("Author", b"Author 1"),
            ("Subject", b"Subject 1"),
            ("Keywords", b"kw1,kw2"),
        ]);
        // Thread 2: /I present but without /Title
        let author_only = info_dict(&[("Author", b"Author 2")]);

        let catalog = catalog_with_threads(
            &resolver,
            vec![
                (
                    ObjRef::new(11, 0),
                    thread_dict(ObjRef::new(20, 0), Some(full_info)),
                ),
                (
                    ObjRef::new(12, 0),
                    thread_dict(ObjRef::new(21, 0), Some(author_only)),
                ),
                // Thread 3: no /I dict at all
                (ObjRef::new(13, 0), thread_dict(ObjRef::new(22, 0), None)),
            ],
        );

        let result = discover(&catalog, &resolver);
        assert!(result.is_ok());
        let threads = result.unwrap();
        assert_eq!(threads.len(), 3);

        // Thread 1: all fields present
        assert_eq!(threads[0].title, Some("Thread 1".to_string()));
        assert_eq!(threads[0].author, Some("Author 1".to_string()));
        assert_eq!(threads[0].subject, Some("Subject 1".to_string()));
        assert_eq!(threads[0].keywords, Some("kw1,kw2".to_string()));

        // Thread 2: no title
        assert!(threads[1].title.is_none());
        assert_eq!(threads[1].author, Some("Author 2".to_string()));
        assert!(threads[1].subject.is_none());
        assert!(threads[1].keywords.is_none());

        // Thread 3: no info dict
        assert!(threads[2].title.is_none());
        assert!(threads[2].author.is_none());
        assert!(threads[2].subject.is_none());
        assert!(threads[2].keywords.is_none());
    }

    /// Test: Thread missing /F is skipped while later threads are still returned
    #[test]
    fn test_discover_thread_missing_f_skipped() {
        let resolver = XrefResolver::new();

        // Thread dict with /I but no /F - should be skipped
        let mut orphan = indexmap::IndexMap::new();
        orphan.insert("I".into(), info_dict(&[("Title", b"Orphan")]));

        let catalog = catalog_with_threads(
            &resolver,
            vec![
                (ObjRef::new(11, 0), PdfObject::Dict(Box::new(orphan))),
                (ObjRef::new(12, 0), thread_dict(ObjRef::new(20, 0), None)),
            ],
        );

        let result = discover(&catalog, &resolver);
        assert!(result.is_ok());
        let threads = result.unwrap();
        // Only the valid thread should be returned
        assert_eq!(threads.len(), 1);
        assert_eq!(threads[0].first_bead_ref, ObjRef::new(20, 0));
    }

    /// Test: UTF-16BE encoded title
    #[test]
    fn test_discover_thread_utf16_title() {
        let resolver = XrefResolver::new();

        // UTF-16BE with BOM: "日本語" (Japanese)
        let utf16_bytes: &[u8] = &[
            0xFE, 0xFF, // BOM
            0x65, 0xE5, // 日
            0x67, 0x2C, // 本
            0x8A, 0x9E, // 語
        ];
        let catalog = catalog_with_threads(
            &resolver,
            vec![(
                ObjRef::new(11, 0),
                thread_dict(
                    ObjRef::new(20, 0),
                    Some(info_dict(&[("Title", utf16_bytes)])),
                ),
            )],
        );

        let result = discover(&catalog, &resolver);
        assert!(result.is_ok());
        let threads = result.unwrap();
        assert_eq!(threads.len(), 1);
        assert_eq!(threads[0].title, Some("日本語".to_string()));
    }

    /// Test: Empty /Threads returns empty Vec
    #[test]
    fn test_discover_empty_threads() {
        let resolver = XrefResolver::new();
        let catalog = catalog_with_threads(&resolver, Vec::new());

        let result = discover(&catalog, &resolver);
        assert!(result.is_ok());
        assert!(result.unwrap().is_empty());
    }

    /// Test: /Threads absent returns empty Vec
    #[test]
    fn test_discover_no_threads_field() {
        let resolver = XrefResolver::new();
        // Catalog without /Threads: threads_ref stays None
        let catalog = Catalog::new(ObjRef::new(1, 0));

        let result = discover(&catalog, &resolver);
        assert!(result.is_ok());
        assert!(result.unwrap().is_empty());
    }

    /// Test: Empty string title is Some("") not None
    #[test]
    fn test_discover_thread_empty_title() {
        let resolver = XrefResolver::new();
        let catalog = catalog_with_threads(
            &resolver,
            vec![(
                ObjRef::new(11, 0),
                thread_dict(ObjRef::new(20, 0), Some(info_dict(&[("Title", b"")]))),
            )],
        );

        let result = discover(&catalog, &resolver);
        assert!(result.is_ok());
        let threads = result.unwrap();
        assert_eq!(threads.len(), 1);
        // Empty string should be Some("") not None
        assert_eq!(threads[0].title, Some("".to_string()));
    }
}