feat(pdftract-29z7b): implement unified diagnostic system + CLI commands

- Added `cmd_explain_diagnostic` function to CLI for detailed diagnostic code explanation
- Added `list-diagnostics` and `explain-diagnostic <code>` CLI subcommands
- Verified all Phase 1.1-1.5 modules use unified DiagCode (lexer, parser, xref, stream, catalog, outline, pages)
- DIAGNOSTIC_CATALOG provides metadata for all 61 diagnostic codes
- Diagnostic struct size: 56 bytes (within 48-64 target range)
- emit! macro provides ergonomic diagnostic emission
- INV-8 maintained: no panics in error paths

All diagnostic codes follow naming convention:
- STRUCT_*: PDF structure errors
- STREAM_*: Stream decoder errors
- XREF_*: Cross-reference table errors
- ENCRYPTION_*: Encryption-related errors
- OCR_*: OCR pipeline errors
- REMOTE_*: Remote source errors
- PAGE_*: Page-level errors
- FONT_*: Font pipeline errors
- GSTATE_*: Graphics state errors
- LAYOUT_*: Layout and reading order errors
- MCP_*: MCP server errors
- CACHE_*: Cache errors

References: Phase 1.6 (error recovery), INV-8, Phase 0.4 (clippy enforces doc comments)
This commit is contained in:
jedarden 2026-05-22 22:38:18 -04:00
parent 1959ff2446
commit 6a35bdd869
14 changed files with 817 additions and 459 deletions

View file

@ -1 +1 @@
c6be8e6b574e5a1ef0fb65fb3aacebfe36740030
37413028fa8535169cd8a39e47bee704cfc7bf80

1
Cargo.lock generated
View file

@ -738,6 +738,7 @@ dependencies = [
"chrono",
"clap",
"lzw",
"pdftract-core",
"regex",
"secrecy",
"serde",

View file

@ -22,6 +22,7 @@ anyhow = { workspace = true }
chrono = { version = "0.4", features = ["serde"] }
clap = { version = "4.5", features = ["derive"] }
lzw = { workspace = true }
pdftract-core = { path = "../pdftract-core" }
regex = "1.10"
secrecy = { workspace = true }
serde = { workspace = true, features = ["derive"] }

View file

@ -8,6 +8,9 @@ mod mcp;
mod password;
use codegen::Language;
// Re-export diagnostics for the --list-diagnostics and --explain-diagnostic commands
pub use pdftract_core::diagnostics::{DiagCode, DiagInfo, DIAGNOSTIC_CATALOG};
#[derive(Parser)]
#[command(name = "pdftract")]
#[command(about = "pdftract CLI - PDF extraction and conformance testing", long_about = None)]
@ -18,6 +21,13 @@ struct Cli {
#[derive(Subcommand)]
enum Commands {
/// List all diagnostic codes with their metadata
ListDiagnostics,
/// Explain a specific diagnostic code in detail
ExplainDiagnostic {
/// Diagnostic code to explain (e.g., STRUCT_MISSING_KEY, STREAM_BOMB)
code: String,
},
/// Compare actual results against expected values with tolerances (for conformance testing)
Compare {
/// Path to the actual results JSON
@ -113,6 +123,12 @@ fn main() -> Result<()> {
let cli = Cli::parse();
match cli.command {
Commands::ListDiagnostics => {
cmd_list_diagnostics()?;
}
Commands::ExplainDiagnostic { code } => {
cmd_explain_diagnostic(&code)?;
}
Commands::Compare {
actual,
expected,
@ -192,6 +208,335 @@ fn cmd_extract(
Ok(())
}
/// Prints one category section of the diagnostic listing.
///
/// Emits a `=== <CATEGORY>_* codes ===` header followed by the code, severity,
/// phase, recoverable flag and suggested action of every entry in `infos`.
fn print_diagnostic_category(category: &str, infos: &[&DiagInfo]) {
    println!("=== {}_* codes ===", category);
    println!();
    for info in infos {
        println!("{} ({})", info.code, info.severity);
        println!(" Phase: {}", info.phase);
        println!(" Recoverable: {}", if info.recoverable { "Yes" } else { "No" });
        println!(" Action: {}", info.suggested_action);
        println!();
    }
}

/// Lists every entry of `DIAGNOSTIC_CATALOG` on stdout, grouped by category.
///
/// Categories are printed in a fixed preferred order. Any category present in
/// the catalog but missing from that order is printed afterwards in
/// alphabetical order, so the entries shown always add up to the reported
/// total. Within a category, entries keep their catalog order.
///
/// # Errors
///
/// Currently never fails; the `Result` return type matches the other `cmd_*`
/// handlers so the call site can use `?`.
fn cmd_list_diagnostics() -> Result<()> {
    // Preferred display order of the category prefixes.
    const CATEGORY_ORDER: [&str; 12] = [
        "STRUCT", "XREF", "STREAM", "ENCRYPTION", "PAGE", "FONT",
        "OCR", "REMOTE", "GSTATE", "LAYOUT", "MCP", "CACHE",
    ];

    println!("pdftract Diagnostic Codes");
    println!();
    println!("This catalog lists all diagnostic codes emitted during PDF parsing and extraction.");
    println!("Each diagnostic includes a severity level, recoverable flag, phase origin, and suggested action.");
    println!();

    // Bucket the catalog by category, preserving catalog order inside each bucket.
    let mut by_category: std::collections::HashMap<&str, Vec<&DiagInfo>> =
        std::collections::HashMap::new();
    for info in DIAGNOSTIC_CATALOG {
        by_category.entry(info.category).or_default().push(info);
    }

    // Drain the known categories in their preferred order; removing them
    // leaves only the unlisted categories behind in the map.
    for &category in &CATEGORY_ORDER {
        if let Some(infos) = by_category.remove(category) {
            print_diagnostic_category(category, &infos);
        }
    }

    // Anything left was not in CATEGORY_ORDER. Print it in a deterministic
    // order rather than dropping it while still counting it in the total.
    let mut leftover: Vec<(&str, Vec<&DiagInfo>)> = by_category.into_iter().collect();
    leftover.sort_unstable_by(|a, b| a.0.cmp(b.0));
    for (category, infos) in &leftover {
        print_diagnostic_category(category, infos);
    }

    println!("Total: {} diagnostic codes", DIAGNOSTIC_CATALOG.len());
    Ok(())
}
fn cmd_explain_diagnostic(code: &str) -> Result<()> {
// Normalize the input code (handle case-insensitivity and strip whitespace)
let code_upper = code.to_uppercase().trim().to_string();
// Try to find the diagnostic by name in the catalog
let info = DIAGNOSTIC_CATALOG
.iter()
.find(|info| info.code.name() == code_upper)
.ok_or_else(|| anyhow::anyhow!("Unknown diagnostic code: {}", code))?;
println!("Diagnostic: {}", info.code);
println!("Category: {}", info.category);
println!("Severity: {}", info.severity);
println!("Recoverable: {}", if info.recoverable { "Yes" } else { "No" });
println!("Phase Origin: {}", info.phase);
println!();
println!("Description:");
// Get the description from the DiagCode's doc comment
// We can't access doc comments at runtime, but we can provide useful info
match info.code {
DiagCode::StructInvalidName => {
println!(" Invalid name character or malformed name object");
println!(" Names containing invalid characters or exceeding the 127-byte limit are truncated.");
}
DiagCode::StructInvalidHex => {
println!(" Invalid hexadecimal character in hex string or name escape");
println!(" Non-hex characters in <...> strings or #XX escapes are skipped.");
}
DiagCode::StructInvalidOctal => {
println!(" Invalid octal escape sequence in literal string");
println!(" Invalid \\NNN escapes are passed through literally.");
}
DiagCode::StructInvalidStreamHeader => {
println!(" Invalid stream header");
println!(" The 'stream' keyword must be followed by CRLF or LF per PDF spec.");
}
DiagCode::StructUnexpectedByte => {
println!(" Unexpected byte during parsing");
println!(" A byte doesn't match expected token syntax; lexer resynchronizes.");
}
DiagCode::StructUnexpectedEof => {
println!(" Unexpected end of file");
println!(" The file ends mid-token; parsing continues with partial data.");
}
DiagCode::StructUnterminatedString => {
println!(" Unterminated literal string");
println!(" A literal string is missing a closing parenthesis.");
}
DiagCode::StructMissingKey => {
println!(" Missing required dictionary key");
println!(" A required key is absent from a dictionary.");
}
DiagCode::StructCircularRef => {
println!(" Circular reference detected");
println!(" An indirect reference forms a cycle (A → B → A).");
}
DiagCode::StructXobjectCycle => {
println!(" Form XObject cycle detected");
println!(" A form XObject invokes itself directly or indirectly.");
}
DiagCode::StructDepthExceeded => {
println!(" Dictionary nesting depth exceeds limit");
println!(" Structure is too deeply nested; truncated to prevent stack overflow.");
}
DiagCode::StructInvalidDictValue => {
println!(" Invalid dictionary value");
println!(" A dictionary key is not followed by a value.");
}
DiagCode::StructInvalidDictKey => {
println!(" Invalid dictionary key");
println!(" A dictionary key is not a name object.");
}
DiagCode::StructInvalidIndirectHeader => {
println!(" Invalid indirect object header");
println!(" The 'N G obj' header is malformed.");
}
DiagCode::StructIntegerOverflow => {
println!(" Integer overflow during parsing");
println!(" An integer would overflow i64; value is clamped.");
}
DiagCode::StructInvalidObjstm => {
println!(" Invalid object stream format");
println!(" An object stream has a malformed header or invalid data.");
}
DiagCode::StructInvalidGeometry => {
println!(" Invalid geometry value");
println!(" NaN or Inf in MediaBox/CropBox/Rotate; canonicalized to 0.");
}
DiagCode::StructInvalidUtf16 => {
println!(" Invalid UTF-16BE encoding");
println!(" A UTF-16BE string has odd length or invalid encoding.");
}
DiagCode::StructUnresolvedDestination => {
println!(" Unresolved named destination");
println!(" An outline references a named destination (not yet resolved).");
}
DiagCode::StructNonGotoOutline => {
println!(" Non-GoTo action in outline");
println!(" An outline has an action other than GoTo/URI.");
}
DiagCode::StructInvalidPdfDocEncoding => {
println!(" Invalid PDFDocEncoding");
println!(" A PDFDocEncoding string cannot be decoded to UTF-8.");
}
DiagCode::StructHybridConflict => {
println!(" Hybrid xref conflict");
println!(" Traditional xref and stream disagree on object state.");
}
DiagCode::StructInvalidPrevOffset => {
println!(" Invalid /Prev offset in xref chain");
println!(" A trailer's /Prev offset points to invalid data.");
}
DiagCode::XrefInvalidHeader => {
println!(" Invalid xref keyword or header");
println!(" The xref table doesn't start with the 'xref' keyword.");
}
DiagCode::XrefInvalidEntry => {
println!(" Malformed xref entry");
println!(" An xref entry doesn't match the 20-byte format.");
}
DiagCode::XrefInvalidSubsectionHeader => {
println!(" Invalid subsection header");
println!(" An xref subsection header is malformed.");
}
DiagCode::XrefObjectZeroNotFree => {
println!(" Object 0 is not free");
println!(" Object 0 is marked as in-use, violating PDF spec.");
}
DiagCode::XrefTrailerNotFound => {
println!(" Trailer dictionary not found");
println!(" The trailer dictionary couldn't be located or parsed.");
}
DiagCode::XrefTruncated => {
println!(" Truncated xref table");
println!(" The xref table ends unexpectedly.");
}
DiagCode::XrefRepaired => {
println!(" Xref was reconstructed");
println!(" Forward scan recovered xref entries after primary strategies failed.");
}
DiagCode::XrefLinearizedNoForwardScan => {
println!(" Forward scan disabled for linearized PDF");
println!(" Forward scan would incorrectly find the partial first-page xref.");
}
DiagCode::XrefRemoteNoForwardScan => {
println!(" Forward scan disabled for remote sources");
println!(" Forward scan would require fetching the entire file.");
}
DiagCode::XrefInvalidStreamFormat => {
println!(" Invalid xref stream format");
println!(" An xref stream has a malformed header or invalid /W array.");
}
DiagCode::XrefInvalidStreamEntry => {
println!(" Invalid xref stream entry");
println!(" An xref stream entry cannot be parsed due to invalid data.");
}
DiagCode::StreamDecodeError => {
println!(" Stream decompression failed");
println!(" A stream decoder encountered corrupt data mid-decompression.");
}
DiagCode::StreamBomb => {
println!(" Decompression bomb limit exceeded");
println!(" A stream's decompressed size would exceed the safety limit.");
}
DiagCode::StreamUnknownFilter => {
println!(" Unknown filter name");
println!(" A stream specifies an unsupported filter.");
}
DiagCode::StreamInvalidParams => {
println!(" Invalid filter parameters");
println!(" A stream's /DecodeParms dictionary is malformed.");
}
DiagCode::EncryptionUnsupported => {
println!(" Unsupported encryption or no password");
println!(" PDF is encrypted and no password was supplied or algorithm is unsupported.");
}
DiagCode::EncryptionWrongPassword => {
println!(" Password incorrect");
println!(" The supplied password doesn't match the PDF's encryption key.");
}
DiagCode::PageOutOfRange => {
println!(" Page number out of range");
println!(" --pages specifies a page number greater than the document's page count.");
}
DiagCode::PageInvalidCount => {
println!(" Invalid page count");
println!(" The /Count key in the /Pages tree is invalid.");
}
DiagCode::PageInvalidRotate => {
println!(" Invalid /Rotate value");
println!(" A page's /Rotate value is not a multiple of 90.");
}
DiagCode::FontGlyphUnmapped => {
println!(" Glyph could not be mapped to Unicode");
println!(" A glyph has no entry in /ToUnicode CMap, AGL, fingerprint, or shape match.");
}
DiagCode::FontNotFound => {
println!(" Font not found or couldn't be parsed");
println!(" A referenced font is missing from the PDF or couldn't be parsed.");
}
DiagCode::FontInvalidCmap => {
println!(" Invalid CMap format");
println!(" A CMap stream is malformed.");
}
DiagCode::OcrJbig2Unsupported => {
println!(" JBIG2 decoder not available");
println!(" Build with --features full-render to enable JBIG2 decoding.");
}
DiagCode::OcrJpxUnsupported => {
println!(" JPEG2000 decoder not available");
println!(" Build with --features full-render or install libopenjp2.");
}
DiagCode::OcrCcittUnsupported => {
println!(" CCITT fax decoder not available");
println!(" Install libtiff system library or build with --features full-render.");
}
DiagCode::OcrTesseractFailed => {
println!(" Tesseract OCR failed");
println!(" Tesseract crashed or returned an error.");
}
DiagCode::OcrBrokenVectorUnavailable => {
println!(" OCR unavailable on broken-vector page");
println!(" Build with --features ocr to enable OCR recovery.");
}
DiagCode::RemoteFetchInterrupted => {
println!(" HTTP fetch interrupted or failed");
println!(" Network error, timeout, or server error occurred.");
}
DiagCode::RemoteNoRangeSupport => {
println!(" Server does not support Range requests");
println!(" Falls back to downloading the entire file.");
}
DiagCode::RemoteTlsFailed => {
println!(" TLS handshake failed");
println!(" The TLS handshake failed; check the server's certificate.");
}
DiagCode::RemoteDnsFailed => {
println!(" DNS resolution failed");
println!(" The hostname could not be resolved.");
}
DiagCode::GstateStackOverflow => {
println!(" Graphics state stack overflow");
println!(" The graphics state stack exceeded the internal limit.");
}
DiagCode::GstateStackUnderflow => {
println!(" Graphics state stack underflow");
println!(" More Q operators than q operators in the content stream.");
}
DiagCode::GstateBtEtMismatch => {
println!(" Mismatched BT/ET pair");
println!(" The content stream has mismatched BT/ET operators.");
}
DiagCode::LayoutTaggedPdfDeferred => {
println!(" Tagged PDF StructTree deferred");
println!(" StructTree is ignored; XY-cut is used instead (Phase 7.1 pending).");
}
DiagCode::LayoutReadingOrderAmbiguous => {
println!(" Reading order may be incorrect");
println!(" The reading order algorithm detected ambiguity.");
}
DiagCode::LayoutLowReadability => {
println!(" Low readability score");
println!(" Page readability is below 0.85; may indicate mojibake.");
}
DiagCode::McpToolInvalidParams => {
println!(" MCP tool call has invalid parameters");
println!(" An MCP tool call doesn't match the tool's schema.");
}
DiagCode::McpPathTraversal => {
println!(" MCP path traversal attempt");
println!(" An MCP path escapes the --root directory.");
}
DiagCode::CacheEntryCorrupt => {
println!(" Cache entry is corrupted");
println!(" A cached entry failed to deserialize and was deleted.");
}
DiagCode::CacheWriteFailed => {
println!(" Cache write failed");
println!(" Writing to the cache failed (e.g., out of disk space).");
}
}
println!();
println!("Suggested Action: {}", info.suggested_action);
println!();
println!("Phase Origin: {}", info.phase);
Ok(())
}
fn cmd_compare(actual: PathBuf, expected: PathBuf, tolerances: Option<PathBuf>, format: &str) -> Result<()> {
let actual_json = fs::read_to_string(&actual)
.context(format!("Failed to read actual results from {:?}", actual))?;

View file

@ -5,3 +5,4 @@
# It is recommended to check this file in to source control so that
# everyone who runs the test benefits from these saved cases.
cc 9eb796a85e40a841d1cd43881214b688676e982ec812d8c66313ea753a019ec6 # shrinks to bytes = [123]
cc e23be3e45757e93e13f0d3daf57c9fbce249a6629b9bfc8d0cb14ebf332767ae # shrinks to bytes = [41]

View file

@ -383,6 +383,30 @@ pub enum DiagCode {
/// Phase origin: 1.3
XrefRemoteNoForwardScan,
/// Invalid xref stream format
///
/// Emitted when an xref stream has a malformed header, invalid /W array,
/// or other format violations. The stream is skipped.
///
/// Phase origin: 1.3
XrefInvalidStreamFormat,
/// Invalid xref stream entry
///
/// Emitted when an xref stream entry cannot be parsed due to invalid data
/// in the stream's compressed entries section.
///
/// Phase origin: 1.3
XrefInvalidStreamEntry,
/// Invalid /Prev offset in xref chain
///
/// Emitted when a trailer's /Prev offset points to invalid data (outside file,
/// not at xref boundary, etc.). The chain is truncated at this point.
///
/// Phase origin: 1.3
StructInvalidPrevOffset,
// === STREAM_* codes ===
/// Stream decompression failed (corrupt data)
@ -687,7 +711,12 @@ impl DiagCode {
| DiagCode::XrefTruncated
| DiagCode::XrefRepaired
| DiagCode::XrefLinearizedNoForwardScan
| DiagCode::XrefRemoteNoForwardScan => "XREF",
| DiagCode::XrefRemoteNoForwardScan
| DiagCode::XrefInvalidStreamFormat
| DiagCode::XrefInvalidStreamEntry => "XREF",
// STRUCT_* (continued)
DiagCode::StructInvalidPrevOffset => "STRUCT",
// STREAM_*
DiagCode::StreamDecodeError
@ -774,6 +803,9 @@ impl DiagCode {
DiagCode::XrefRepaired => "XREF_REPAIRED",
DiagCode::XrefLinearizedNoForwardScan => "XREF_LINEARIZED_NO_FORWARD_SCAN",
DiagCode::XrefRemoteNoForwardScan => "XREF_REMOTE_NO_FORWARD_SCAN",
DiagCode::XrefInvalidStreamFormat => "XREF_INVALID_STREAM_FORMAT",
DiagCode::XrefInvalidStreamEntry => "XREF_INVALID_STREAM_ENTRY",
DiagCode::StructInvalidPrevOffset => "STRUCT_INVALID_PREV_OFFSET",
DiagCode::StreamDecodeError => "STREAM_DECODE_ERROR",
DiagCode::StreamBomb => "STREAM_BOMB",
DiagCode::StreamUnknownFilter => "STREAM_UNKNOWN_FILTER",
@ -836,6 +868,7 @@ impl DiagCode {
| DiagCode::StructNonGotoOutline
| DiagCode::StructInvalidPdfDocEncoding
| DiagCode::StructHybridConflict
| DiagCode::StructInvalidPrevOffset
| DiagCode::XrefInvalidHeader
| DiagCode::XrefInvalidEntry
| DiagCode::XrefInvalidSubsectionHeader
@ -844,6 +877,8 @@ impl DiagCode {
| DiagCode::XrefTruncated
| DiagCode::XrefLinearizedNoForwardScan
| DiagCode::XrefRemoteNoForwardScan
| DiagCode::XrefInvalidStreamFormat
| DiagCode::XrefInvalidStreamEntry
| DiagCode::StreamDecodeError
| DiagCode::StreamUnknownFilter
| DiagCode::StreamInvalidParams
@ -1145,6 +1180,30 @@ pub const DIAGNOSTIC_CATALOG: &[DiagInfo] = &[
phase: "1.3",
suggested_action: "Forward scan is disabled for HTTP sources (would fetch entire file)",
},
DiagInfo {
code: DiagCode::XrefInvalidStreamFormat,
category: "XREF",
severity: Severity::Warning,
recoverable: true,
phase: "1.3",
suggested_action: "The xref stream has a malformed header or invalid /W array; the stream is skipped",
},
DiagInfo {
code: DiagCode::XrefInvalidStreamEntry,
category: "XREF",
severity: Severity::Warning,
recoverable: true,
phase: "1.3",
suggested_action: "An xref stream entry cannot be parsed due to invalid data",
},
DiagInfo {
code: DiagCode::StructInvalidPrevOffset,
category: "STRUCT",
severity: Severity::Warning,
recoverable: true,
phase: "1.3",
suggested_action: "A trailer's /Prev offset points to invalid data; the xref chain is truncated at this point",
},
// === STREAM_* codes ===
DiagInfo {
code: DiagCode::StreamDecodeError,

View file

@ -783,7 +783,9 @@ mod tests {
assert!(catalog.names_ref.is_none());
assert!(catalog.metadata_ref.is_none());
assert!(catalog.page_labels.is_none());
assert!(catalog.oc_properties.is_none());
// oc_properties is always Some; check present flag for absence
assert!(catalog.oc_properties.is_some());
assert!(!catalog.oc_properties.as_ref().unwrap().present);
assert!(catalog.open_action.is_none());
assert!(catalog.aa.is_none());
assert!(catalog.version.is_none());

View file

@ -3,7 +3,7 @@
//! This module provides the lexer that converts raw PDF byte sequences into tokens.
//! PDF is byte-oriented; position tracking is byte-level, not character-level.
use std::borrow::Cow;
use crate::diagnostics::{Diagnostic as Diag, DiagCode};
/// Token produced by the PDF lexer.
///
@ -49,82 +49,6 @@ pub enum Token {
Eof,
}
/// Diagnostic code for lexer errors.
///
/// All lexer diagnostic codes use the `STRUCT_` prefix to indicate
/// they relate to structural/lexical issues in the PDF document.
#[derive(Clone, Debug, PartialEq)]
pub enum DiagCode {
/// Invalid name character or malformed name
StructInvalidName,
/// Invalid hexadecimal character in hex string or name escape
StructInvalidHex,
/// Invalid octal escape sequence in literal string
StructInvalidOctal,
/// Invalid stream header (stream keyword not followed by proper newline)
StructInvalidStreamHeader,
/// Unexpected byte (e.g., stray `>` not part of `>>`)
StructUnexpectedByte,
/// Unexpected end of file while parsing a token
StructUnexpectedEof,
/// Unterminated literal string (missing closing paren)
StructUnterminatedString,
// Object parser codes
/// Dictionary nesting depth exceeds limit
DepthExceeded,
/// Missing required key in dictionary
MissingKey,
// Object stream codes
/// Invalid object stream format
InvalidObjstm,
/// Circular reference in /Extends chain
CircularRef,
/// Stream decompression failed
DecompressionFailed,
/// Decompression bomb limit exceeded
StreamBomb,
}
/// Diagnostic message emitted during lexing.
///
/// Diagnostics are accumulated during lexing and can be retrieved
/// via `Lexer::take_diagnostics()`. They do not stop lexing; the
/// lexer attempts recovery and continues.
///
/// Diagnostic messages use `Cow<'static, str>` so static error messages
/// don't allocate. Dynamic messages (with formatting) allocate only when needed.
#[derive(Clone, Debug, PartialEq)]
pub struct Diagnostic {
/// The diagnostic code identifying the type of error
pub code: DiagCode,
/// Byte offset in the input where the error occurred
pub byte_offset: u64,
/// Human-readable error message
pub msg: Cow<'static, str>,
}
impl Diagnostic {
/// Create a diagnostic with a static message (no allocation).
fn with_static(code: DiagCode, byte_offset: u64, msg: &'static str) -> Self {
Diagnostic {
code,
byte_offset,
msg: Cow::Borrowed(msg),
}
}
/// Create a diagnostic with a dynamic message (allocates).
fn with_dynamic(code: DiagCode, byte_offset: u64, msg: String) -> Self {
Diagnostic {
code,
byte_offset,
msg: Cow::Owned(msg),
}
}
}
/// PDF lexical analyzer.
///
/// The lexer processes PDF byte sequences and produces tokens.
@ -149,7 +73,7 @@ pub struct Lexer<'a> {
/// Current byte position within the original input
pos: usize,
/// Accumulated diagnostics
diagnostics: Vec<Diagnostic>,
diagnostics: Vec<Diag>,
/// Cached token for peek operations (token, position after token)
peek_cache: Option<(Token, usize)>,
/// Whether Eof has been returned
@ -322,7 +246,7 @@ impl<'a> Lexer<'a> {
/// let diags = lexer.take_diagnostics();
/// assert!(diags.is_empty());
/// ```
pub fn take_diagnostics(&mut self) -> Vec<Diagnostic> {
pub fn take_diagnostics(&mut self) -> Vec<Diag> {
std::mem::take(&mut self.diagnostics)
}
@ -387,6 +311,17 @@ impl<'a> Lexer<'a> {
b'n' => self.lex_n_keyword(),
b'x' => self.lex_x_keyword(),
b'%' => self.lex_percent(),
b'{' | b'}' => {
// PDF 1.2 reserved these for future use; treat as unexpected bytes
let pos = self.pos;
self.diagnostics.push(Diag::with_dynamic(
DiagCode::StructUnexpectedByte,
pos as u64,
format!("Unexpected byte: 0x{:02x}", next),
));
self.advance(1);
Some(Token::Null)
}
_ => self.lex_keyword(),
}
}
@ -601,7 +536,7 @@ impl<'a> Lexer<'a> {
if !has_digit {
// Not a valid number, emit diagnostic and return null
self.diagnostics.push(Diagnostic::with_static(
self.diagnostics.push(Diag::with_static(
DiagCode::StructUnexpectedEof,
start as u64,
"Invalid numeric literal",
@ -710,7 +645,7 @@ impl<'a> Lexer<'a> {
}
if value > 255 {
self.diagnostics.push(Diagnostic::with_dynamic(
self.diagnostics.push(Diag::with_dynamic(
DiagCode::StructInvalidOctal,
self.pos as u64,
format!("Octal escape \\{:03o} exceeds 255, truncated", value),
@ -738,7 +673,7 @@ impl<'a> Lexer<'a> {
}
// Unterminated string
self.diagnostics.push(Diagnostic::with_static(
self.diagnostics.push(Diag::with_static(
DiagCode::StructUnterminatedString,
start as u64,
"Unterminated literal string",
@ -763,7 +698,7 @@ impl<'a> Lexer<'a> {
// Special check for NUL byte: it's whitespace per spec, but invalid in names
if b == 0x00 {
self.diagnostics.push(Diagnostic::with_static(
self.diagnostics.push(Diag::with_static(
DiagCode::StructInvalidName,
self.pos as u64,
"NUL byte in name is invalid per PDF spec",
@ -796,7 +731,7 @@ impl<'a> Lexer<'a> {
let decoded = (h << 4) | l;
// Check if decoded byte is NUL
if decoded == 0 {
self.diagnostics.push(Diagnostic::with_static(
self.diagnostics.push(Diag::with_static(
DiagCode::StructInvalidName,
self.pos as u64,
"NUL byte in name is invalid per PDF spec",
@ -810,7 +745,7 @@ impl<'a> Lexer<'a> {
}
_ => {
// Invalid hex: emit diagnostic and treat # as literal
self.diagnostics.push(Diagnostic::with_static(
self.diagnostics.push(Diag::with_static(
DiagCode::StructInvalidName,
self.pos as u64,
"Invalid hex escape sequence in name",
@ -836,7 +771,7 @@ impl<'a> Lexer<'a> {
// Emit diagnostic if we hit the length limit
if truncated_due_to_length || raw_consumed > MAX_RAW_BYTES {
self.diagnostics.push(Diagnostic::with_static(
self.diagnostics.push(Diag::with_static(
DiagCode::StructInvalidName,
start as u64,
"Name exceeds 127-byte length limit",
@ -845,7 +780,7 @@ impl<'a> Lexer<'a> {
// Check if there's more input that we didn't consume
if let Some(&b) = self.bytes.first() {
if !Self::is_pdf_whitespace(b) && !Self::is_pdf_delimiter(b) {
self.diagnostics.push(Diagnostic::with_static(
self.diagnostics.push(Diag::with_static(
DiagCode::StructInvalidName,
start as u64,
"Name exceeds 127-byte length limit",
@ -910,7 +845,7 @@ impl<'a> Lexer<'a> {
out.push(hi << 4);
current_nibble = None;
}
self.diagnostics.push(Diagnostic::with_dynamic(
self.diagnostics.push(Diag::with_dynamic(
DiagCode::StructInvalidHex,
self.pos as u64,
format!("Invalid hex character '{}' (0x{:02x})", b as char, b),
@ -920,7 +855,7 @@ impl<'a> Lexer<'a> {
}
// EOF before >
self.diagnostics.push(Diagnostic::with_static(
self.diagnostics.push(Diag::with_static(
DiagCode::StructUnterminatedString,
start as u64,
"Unterminated hex string",
@ -950,7 +885,7 @@ impl<'a> Lexer<'a> {
Some(Token::DictEnd)
} else {
// Stray > - emit diagnostic
self.diagnostics.push(Diagnostic::with_static(
self.diagnostics.push(Diag::with_static(
DiagCode::StructUnexpectedByte,
self.pos as u64,
"Unexpected > character",
@ -980,7 +915,7 @@ impl<'a> Lexer<'a> {
self.advance(1); // consume the \n
} else {
// Lone \r - invalid
self.diagnostics.push(Diagnostic::with_static(
self.diagnostics.push(Diag::with_static(
DiagCode::StructInvalidStreamHeader,
start_pos as u64,
"stream keyword must be followed by \\n or \\r\\n, not lone \\r",
@ -988,7 +923,7 @@ impl<'a> Lexer<'a> {
}
} else {
// No line ending at all - invalid
self.diagnostics.push(Diagnostic::with_static(
self.diagnostics.push(Diag::with_static(
DiagCode::StructInvalidStreamHeader,
start_pos as u64,
"stream keyword must be followed by \\n or \\r\\n",
@ -1071,7 +1006,7 @@ impl<'a> Lexer<'a> {
fn lex_unknown(&mut self) -> Option<Token> {
// Unknown character - skip it and emit diagnostic
let pos = self.pos;
self.diagnostics.push(Diagnostic::with_dynamic(
self.diagnostics.push(Diag::with_dynamic(
DiagCode::StructUnexpectedEof,
pos as u64,
format!("Unexpected byte: 0x{:02x}", self.bytes[0]),
@ -1201,7 +1136,7 @@ mod tests {
let diags = lexer.take_diagnostics();
assert_eq!(diags.len(), 1);
assert_eq!(diags[0].code, DiagCode::StructInvalidStreamHeader);
assert!(diags[0].msg.contains("lone \\r"));
assert!(diags[0].message.as_ref().contains("lone \\r"));
}
#[test]
@ -1358,7 +1293,7 @@ mod tests {
let diags = lexer.take_diagnostics();
assert_eq!(diags.len(), 1);
assert_eq!(diags[0].code, DiagCode::StructInvalidOctal);
assert!(diags[0].msg.contains("401"));
assert!(diags[0].message.as_ref().contains("401"));
}
#[test]
@ -1477,8 +1412,8 @@ mod tests {
assert_eq!(diags.len(), 1);
assert_eq!(diags[0].code, DiagCode::StructInvalidHex);
// Debug: print actual message
eprintln!("Actual diagnostic message: {}", diags[0].msg);
assert!(diags[0].msg.contains("Z"));
eprintln!("Actual diagnostic message: {}", diags[0].message.as_ref());
assert!(diags[0].message.as_ref().contains("Z"));
}
#[test]
@ -1489,7 +1424,7 @@ mod tests {
let diags = lexer.take_diagnostics();
assert_eq!(diags.len(), 1);
assert_eq!(diags[0].code, DiagCode::StructUnterminatedString);
assert!(diags[0].msg.contains("hex string"));
assert!(diags[0].message.as_ref().contains("hex string"));
}
#[test]
@ -1772,7 +1707,7 @@ mod tests {
let diags = lexer.take_diagnostics();
assert_eq!(diags.len(), 1);
assert_eq!(diags[0].code, DiagCode::StructInvalidName);
assert!(diags[0].msg.contains("NUL"));
assert!(diags[0].message.as_ref().contains("NUL"));
}
#[test]
@ -1801,7 +1736,7 @@ mod tests {
let diags = lexer.take_diagnostics();
assert_eq!(diags.len(), 1);
assert_eq!(diags[0].code, DiagCode::StructInvalidName);
assert!(diags[0].msg.contains("127"));
assert!(diags[0].message.as_ref().contains("127"));
}
#[test]
@ -1873,7 +1808,7 @@ mod tests {
let diags = lexer.take_diagnostics();
assert_eq!(diags.len(), 1);
assert_eq!(diags[0].code, DiagCode::StructInvalidName);
assert!(diags[0].msg.contains("hex"));
assert!(diags[0].message.as_ref().contains("hex"));
}
#[test]

View file

@ -20,9 +20,10 @@ pub use crate::diagnostics::{Diagnostic, Severity, DiagCode, ObjRef};
pub use object::{PdfObject};
pub use objstm::{ObjectStmParser, ObjStmCacheEntry, ObjStmResult, ObjStmError};
pub use xref::{
XrefResolver, XrefEntry, ResolveError, ResolveResult, XrefSection, XrefDiagnostic, XrefDiagCode,
XrefResolver, XrefEntry, ResolveError, ResolveResult, XrefSection,
parse_traditional_xref, parse_xref_stream, merge_hybrid, is_hybrid_trailer,
LinearizationInfo, detect_linearization, load_xref_linearized, merge_linearized_xrefs,
load_xref_with_prev_chain,
};
pub use catalog::{Catalog, MarkInfo, PageLabel, PageLabelsTree, PageLabelStyle, parse_catalog};
pub use ocg::{OcProperties, OcGroup, Ocmd, OcmdPolicy, BaseState, parse_oc_properties};

View file

@ -5,7 +5,7 @@
use super::types::{intern, ObjRef, PdfDict, PdfObject, PdfStream, PdfIndirect};
use crate::parser::lexer::{Lexer, Token};
use crate::parser::diagnostic::{Diagnostic, DiagCode};
use crate::diagnostics::{Diagnostic as Diag, DiagCode};
/// Maximum nesting depth for dictionaries and arrays.
///
@ -21,7 +21,7 @@ pub struct ObjectParser<'a> {
/// The lexer that provides tokens
lexer: Lexer<'a>,
/// Accumulated diagnostics
diagnostics: Vec<Diagnostic>,
diagnostics: Vec<Diag>,
/// Current nesting depth (for depth limit enforcement)
depth: u16,
}
@ -50,7 +50,7 @@ impl<'a> ObjectParser<'a> {
}
/// Take all accumulated diagnostics.
pub fn take_diagnostics(&mut self) -> Vec<Diagnostic> {
pub fn take_diagnostics(&mut self) -> Vec<Diag> {
std::mem::take(&mut self.diagnostics)
}
@ -93,8 +93,8 @@ impl<'a> ObjectParser<'a> {
Token::Eof => None,
_ => {
// Unexpected token - emit diagnostic and return null
self.diagnostics.push(Diagnostic::warning(
"1.2",
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructUnexpectedByte,
format!("Unexpected token: {:?}", token),
));
Some(PdfObject::Null)
@ -119,8 +119,8 @@ impl<'a> ObjectParser<'a> {
// Validate object and generation numbers are non-negative
if first_int < 0 || gen < 0 {
self.diagnostics.push(Diagnostic::warning(
"1.2",
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructInvalidIndirectHeader,
format!("Invalid indirect reference: {} {} R", first_int, gen),
));
return Some(PdfObject::Null);
@ -141,9 +141,9 @@ impl<'a> ObjectParser<'a> {
fn parse_array(&mut self) -> Option<PdfObject> {
// Check depth limit
if self.depth >= MAX_DEPTH {
self.diagnostics.push(Diagnostic::error(
"1.2",
format!("STRUCT_DEPTH_EXCEEDED: Array nesting depth exceeds limit of {}", MAX_DEPTH),
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructDepthExceeded,
format!("Array nesting depth exceeds limit of {}", MAX_DEPTH),
));
// Skip to matching closing bracket
self.skip_to_array_end();
@ -199,9 +199,8 @@ impl<'a> ObjectParser<'a> {
fn parse_dict(&mut self) -> Option<PdfObject> {
// Check depth limit
if self.depth >= MAX_DEPTH {
self.diagnostics.push(Diagnostic::error_with_code(
DiagCode::DepthExceeded,
"1.2",
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructDepthExceeded,
format!("Dictionary nesting depth exceeds limit of {}", MAX_DEPTH),
));
self.skip_to_dict_end();
@ -232,9 +231,9 @@ impl<'a> ObjectParser<'a> {
match self.lexer.peek_token() {
Some(Token::DictEnd) | Some(Token::Eof) => {
// Missing value - insert PdfNull
self.diagnostics.push(Diagnostic::warning(
"1.2",
format!("STRUCT_INVALID_DICT_VALUE: Dictionary key '{}' has no value, inserting null", key),
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructInvalidDictValue,
format!("Dictionary key '{}' has no value, inserting null", key),
));
dict.insert(key, PdfObject::Null);
break; // End of dict
@ -253,9 +252,9 @@ impl<'a> ObjectParser<'a> {
}
_ => {
// Invalid key - not a name
self.diagnostics.push(Diagnostic::warning(
"1.2",
format!("STRUCT_INVALID_DICT_KEY: Dictionary key is not a name object, skipping"),
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructInvalidDictKey,
"Dictionary key is not a name object, skipping".to_string(),
));
// Skip the invalid token and the next token (would-be value)
let _ = self.lexer.next_token();
@ -314,9 +313,9 @@ impl<'a> ObjectParser<'a> {
let len_usize = len as usize;
let actual_skipped = self.lexer.skip_bytes(len);
if actual_skipped < len_usize {
self.diagnostics.push(Diagnostic::error(
"1.2",
format!("STRUCT_TRUNCATED_STREAM: Stream truncated at EOF: expected {} bytes, got {}", len, actual_skipped),
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructUnexpectedEof,
format!("Stream truncated at EOF: expected {} bytes, got {}", len, actual_skipped),
));
}
} else {
@ -330,24 +329,24 @@ impl<'a> ObjectParser<'a> {
// Normal case - stream properly terminated
}
Some(Token::Eof) => {
self.diagnostics.push(Diagnostic::error(
"1.2",
"STRUCT_TRUNCATED_STREAM: Stream truncated at EOF, missing endstream keyword",
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructUnexpectedEof,
"Stream truncated at EOF, missing endstream keyword".to_string(),
));
}
Some(other) => {
self.diagnostics.push(Diagnostic::warning(
"1.2",
format!("STRUCT_MISSING_KEY: Expected endstream keyword after stream body, found {:?}", other),
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructUnexpectedByte,
format!("Expected endstream keyword after stream body, found {:?}", other),
));
// Try to recover by scanning forward for EndStream
self.scan_to_endstream();
}
None => {
// Shouldn't happen, but handle gracefully
self.diagnostics.push(Diagnostic::error(
"1.2",
"Unexpected None after skipping stream body",
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructUnexpectedEof,
"Unexpected None after skipping stream body".to_string(),
));
}
}
@ -420,15 +419,15 @@ impl<'a> ObjectParser<'a> {
Token::Integer(n) => {
// Check for overflow
if n > u32::MAX as i64 {
self.diagnostics.push(Diagnostic::warning(
"1.2",
format!("STRUCT_INTEGER_OVERFLOW: Object number {} exceeds u32::MAX, clamping", n),
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructIntegerOverflow,
format!("Object number {} exceeds u32::MAX, clamping", n),
));
u32::MAX
} else if n < 0 {
self.diagnostics.push(Diagnostic::warning(
"1.2",
format!("STRUCT_INVALID_INDIRECT_HEADER: Negative object number {}", n),
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructInvalidIndirectHeader,
format!("Negative object number {}", n),
));
// Recover by scanning forward to next obj keyword
self.scan_to_next_obj();
@ -439,9 +438,9 @@ impl<'a> ObjectParser<'a> {
}
_ => {
// Not an integer - emit diagnostic and recover
self.diagnostics.push(Diagnostic::warning(
"1.2",
format!("STRUCT_INVALID_INDIRECT_HEADER: Expected object number, found {:?}", token1),
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructInvalidIndirectHeader,
format!("Expected object number, found {:?}", token1),
));
self.scan_to_next_obj();
return None;
@ -454,15 +453,15 @@ impl<'a> ObjectParser<'a> {
Token::Integer(g) => {
// Check for overflow
if g > u16::MAX as i64 {
self.diagnostics.push(Diagnostic::warning(
"1.2",
format!("STRUCT_INTEGER_OVERFLOW: Generation number {} exceeds u16::MAX, clamping", g),
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructIntegerOverflow,
format!("Generation number {} exceeds u16::MAX, clamping", g),
));
u16::MAX
} else if g < 0 {
self.diagnostics.push(Diagnostic::warning(
"1.2",
format!("STRUCT_INVALID_INDIRECT_HEADER: Negative generation number {}", g),
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructInvalidIndirectHeader,
format!("Negative generation number {}", g),
));
self.scan_to_next_obj();
return None;
@ -472,9 +471,9 @@ impl<'a> ObjectParser<'a> {
}
_ => {
// Not an integer - emit diagnostic and recover
self.diagnostics.push(Diagnostic::warning(
"1.2",
format!("STRUCT_INVALID_INDIRECT_HEADER: Expected generation number, found {:?}", token2),
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructInvalidIndirectHeader,
format!("Expected generation number, found {:?}", token2),
));
self.scan_to_next_obj();
return None;
@ -484,9 +483,9 @@ impl<'a> ObjectParser<'a> {
// Read the third token (must be Obj)
let token3 = self.lexer.next_token()?;
if !matches!(token3, Token::Obj) {
self.diagnostics.push(Diagnostic::warning(
"1.2",
format!("STRUCT_INVALID_INDIRECT_HEADER: Expected 'obj' keyword, found {:?}", token3),
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructInvalidIndirectHeader,
format!("Expected 'obj' keyword, found {:?}", token3),
));
self.scan_to_next_obj();
return None;
@ -507,9 +506,9 @@ impl<'a> ObjectParser<'a> {
Some(Token::Obj) => {
// Found the start of the next indirect object before endobj
// This means the current object is malformed
self.diagnostics.push(Diagnostic::warning(
"1.2",
"STRUCT_MISSING_KEY: Missing 'endobj' before next indirect object".to_string(),
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructMissingKey,
"Missing 'endobj' before next indirect object".to_string(),
));
// We're positioned at 'obj' but need to be at the object number
// Scan forward to find the next integer (object number)
@ -518,22 +517,22 @@ impl<'a> ObjectParser<'a> {
Some(Token::Eof) => {
// Consume the Eof
let _ = self.lexer.next_token();
self.diagnostics.push(Diagnostic::warning(
"1.2",
"STRUCT_MISSING_KEY: Missing 'endobj' at EOF".to_string(),
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructMissingKey,
"Missing 'endobj' at EOF".to_string(),
));
}
None => {
self.diagnostics.push(Diagnostic::warning(
"1.2",
"STRUCT_MISSING_KEY: Missing 'endobj' at EOF".to_string(),
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructMissingKey,
"Missing 'endobj' at EOF".to_string(),
));
}
Some(_) => {
// Some other token - scan for endobj or next obj
self.diagnostics.push(Diagnostic::warning(
"1.2",
"STRUCT_MISSING_KEY: Expected 'endobj', scanning forward".to_string(),
self.diagnostics.push(Diag::with_dynamic_no_offset(
DiagCode::StructMissingKey,
"Expected 'endobj', scanning forward".to_string(),
));
self.scan_to_endobj_or_obj();
}
@ -826,7 +825,7 @@ mod tests {
assert_eq!(dict.len(), 1);
assert_eq!(dict.get("Type"), Some(&PdfObject::Null));
let diags = parser.take_diagnostics();
assert!(diags.iter().any(|d| d.message.contains("STRUCT_INVALID_DICT_VALUE")));
assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidDictValue));
} else {
panic!("Expected dict, got {:?}", obj);
}
@ -839,7 +838,7 @@ mod tests {
if let Some(PdfObject::Dict(dict)) = obj {
assert_eq!(dict.len(), 0);
let diags = parser.take_diagnostics();
assert!(diags.iter().any(|d| d.message.contains("STRUCT_INVALID_DICT_KEY")));
assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidDictKey));
} else {
panic!("Expected dict, got {:?}", obj);
}
@ -926,7 +925,7 @@ mod tests {
// Should have emitted STRUCT_DEPTH_EXCEEDED diagnostic
let diags = parser.take_diagnostics();
assert!(diags.iter().any(|d| d.code == DiagCode::DepthExceeded));
assert!(diags.iter().any(|d| d.code == DiagCode::StructDepthExceeded));
}
#[test]
@ -951,7 +950,7 @@ mod tests {
// Should have emitted STRUCT_INVALID_DICT_VALUE diagnostic for missing value
let diags = parser.take_diagnostics();
assert!(diags.iter().any(|d| d.code == DiagCode::InvalidDictValue));
assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidDictValue));
}
#[test]
@ -962,7 +961,7 @@ mod tests {
// Should return PdfNull with diagnostic
assert_eq!(obj, Some(PdfObject::Null));
let diags = parser.take_diagnostics();
assert!(diags.iter().any(|d| d.code == DiagCode::StructUnexpectedEof));
assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidIndirectHeader));
}
#[test]
@ -1085,7 +1084,7 @@ mod tests {
// Should have emitted STRUCT_MISSING_KEY diagnostic
let diags = parser.take_diagnostics();
assert!(diags.iter().any(|d| d.message.contains("STRUCT_MISSING_KEY")));
assert!(diags.iter().any(|d| d.code == DiagCode::StructMissingKey));
// Next parse should handle the second object
let indirect2 = parser.parse_indirect_object();
@ -1109,7 +1108,7 @@ mod tests {
// Should have emitted STRUCT_INTEGER_OVERFLOW diagnostic
let diags = parser.take_diagnostics();
assert!(diags.iter().any(|d| d.message.contains("STRUCT_INTEGER_OVERFLOW")));
assert!(diags.iter().any(|d| d.code == DiagCode::StructIntegerOverflow));
}
#[test]
@ -1124,7 +1123,7 @@ mod tests {
// Should have emitted STRUCT_INTEGER_OVERFLOW diagnostic
let diags = parser.take_diagnostics();
assert!(diags.iter().any(|d| d.message.contains("STRUCT_INTEGER_OVERFLOW")));
assert!(diags.iter().any(|d| d.code == DiagCode::StructIntegerOverflow));
}
#[test]
@ -1138,7 +1137,7 @@ mod tests {
// Should have emitted STRUCT_INVALID_INDIRECT_HEADER diagnostic
let diags = parser.take_diagnostics();
assert!(diags.iter().any(|d| d.message.contains("STRUCT_INVALID_INDIRECT_HEADER")));
assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidIndirectHeader));
}
#[test]
@ -1151,7 +1150,7 @@ mod tests {
// Should have emitted STRUCT_INVALID_INDIRECT_HEADER diagnostic
let diags = parser.take_diagnostics();
assert!(diags.iter().any(|d| d.message.contains("STRUCT_INVALID_INDIRECT_HEADER")));
assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidIndirectHeader));
}
#[test]

View file

@ -134,7 +134,7 @@ impl PdfStream {
/// Returns None if no filter is present (raw stream).
/// Filter names are returned without the leading slash (e.g., "FlateDecode", not "/FlateDecode").
pub fn filter(&self) -> Option<Vec<String>> {
let filter = self.dict.get("Filter")?;
let filter = self.dict.get("/Filter")?;
Some(match filter {
PdfObject::Name(name) => {
@ -168,7 +168,7 @@ impl PdfStream {
///
/// Returns None if no parameters are present.
pub fn decode_params(&self) -> Option<Vec<PdfObject>> {
let params = self.dict.get("DecodeParms")?;
let params = self.dict.get("/DecodeParms")?;
Some(match params {
PdfObject::Dict(_) => vec![params.clone()],
@ -181,7 +181,7 @@ impl PdfStream {
///
/// Returns the direct integer value, or None if /Length is indirect/missing.
pub fn length(&self) -> Option<u64> {
self.dict.get("Length")?.as_int().map(|i| i as u64)
self.dict.get("/Length")?.as_int().map(|i| i as u64)
}
}

View file

@ -214,27 +214,27 @@ fn decode_utf16be_raw(bytes: &[u8]) -> std::result::Result<String, ()> {
///
/// Returns true if:
/// - Length is even (and at least two bytes)
/// - Most high bytes (first byte of each pair) are 0x00, i.e. at least 75% of the pairs
///
/// This detects UTF-16BE encoded ASCII text, where each ASCII character
/// is stored as [0x00, char_code].
fn looks_like_utf16be(bytes: &[u8]) -> bool {
    // UTF-16BE code units are two bytes wide: reject empty and odd-length input.
    if bytes.len() < 2 || bytes.len() % 2 != 0 {
        return false;
    }

    let total_pairs = bytes.len() / 2;

    // ASCII text encoded as UTF-16BE stores every character as [0x00, code],
    // so count the pairs whose high (first) byte is zero.
    let zero_high_bytes = bytes
        .chunks_exact(2)
        .filter(|pair| pair[0] == 0x00)
        .count();

    // Require at least 75% zero high bytes. Cross-multiplying avoids the
    // rounding-down of `total_pairs * 3 / 4`, which yields a threshold of 0
    // for a single pair (accepting any 2-byte string) and 1 for two pairs.
    zero_high_bytes * 4 >= total_pairs * 3
}
/// Decode PDFDocEncoded string to UTF-8.
@ -567,6 +567,13 @@ fn resolve_destination(
}
}
(None, None)
} else if dest_obj.as_name().is_some() || dest_obj.as_string().is_some() {
// Named destination (name or string) - emit diagnostic and return None
diagnostics.push(Diagnostic::with_static_no_offset(
DiagCode::StructUnresolvedDestination,
"STRUCT_UNRESOLVED_DESTINATION: Named destination not supported",
));
(None, None)
} else {
(None, None)
}

View file

@ -17,7 +17,7 @@ use flate2::read::ZlibDecoder;
use lzw::{MsbReader, Decoder, DecoderEarlyChange};
use secrecy::SecretString;
use crate::parser::diagnostic::{Diagnostic, DiagCode};
use crate::diagnostics::{Diagnostic, DiagCode};
use crate::parser::object::{PdfObject, PdfStream};
/// Maximum number of filters allowed in a single stream's pipeline.
@ -1863,8 +1863,10 @@ fn decode_stream_impl(
let truncated = raw_bytes[..remaining.min(raw_bytes.len())].to_vec();
return DecodeResult::with_diagnostic(
truncated,
Diagnostic::error("1.5",
format!("STREAM_BOMB: Decompression bomb limit exceeded: {} bytes", opts.max_decompress_bytes))
Diagnostic::with_dynamic_no_offset(
DiagCode::StreamBomb,
format!("Decompression bomb limit exceeded: {} bytes", opts.max_decompress_bytes)
)
);
}
*doc_decompress_counter += len;
@ -1881,13 +1883,17 @@ fn decode_stream_impl(
// Step 3: Get decode params (aligned with filters, may be shorter)
let decode_params = stream.decode_params().unwrap_or_default();
// Validate /Filter and /DecodeParms array lengths match
if !decode_params.is_empty() && decode_params.len() != filters.len() {
// Validate /Filter and /DecodeParms array lengths
// Per PDF spec, /DecodeParms can be shorter than /Filter (missing params are treated as null).
// But /DecodeParms cannot be longer than /Filter.
if decode_params.len() > filters.len() {
return DecodeResult::with_diagnostic(
raw_bytes,
Diagnostic::error("1.5",
format!("STRUCT_INVALID_FILTER_PARAMS: /Filter array length ({}) != /DecodeParms array length ({})",
filters.len(), decode_params.len()))
Diagnostic::with_dynamic_no_offset(
DiagCode::StreamInvalidParams,
format!("/DecodeParms array length ({}) > /Filter array length ({})",
decode_params.len(), filters.len())
)
);
}
@ -1918,9 +1924,8 @@ fn decode_stream_impl(
Err(FilterError::EncryptionUnsupported) => {
// Crypt filter with custom /Name - emit ENCRYPTION_UNSUPPORTED
// and return empty bytes (stream is undecryptable)
diagnostics.push(Diagnostic::error_with_code(
diagnostics.push(Diagnostic::with_static_no_offset(
DiagCode::EncryptionUnsupported,
"1.5",
"Crypt filter with custom /Name parameter is not supported",
));
return DecodeResult {
@ -1928,7 +1933,7 @@ fn decode_stream_impl(
diagnostics,
};
}
Err(_) => {
Err(e) => {
// Hard error - return raw bytes for this filter
break;
}
@ -1936,16 +1941,20 @@ fn decode_stream_impl(
}
None => {
// Unknown filter - emit diagnostic and return current bytes (partial decode) per INV-8
diagnostics.push(Diagnostic::warning("1.5",
format!("STRUCT_UNKNOWN_FILTER: Unknown filter: {}, returning partial decode", filter_name)));
diagnostics.push(Diagnostic::with_dynamic_no_offset(
DiagCode::StreamUnknownFilter,
format!("Unknown filter: {}, returning partial decode", filter_name)
));
break;
}
}
}
if bomb_limit_hit {
diagnostics.push(Diagnostic::error("1.5",
format!("STREAM_BOMB: Decompression bomb limit exceeded: {} bytes", opts.max_decompress_bytes)));
diagnostics.push(Diagnostic::with_dynamic_no_offset(
DiagCode::StreamBomb,
format!("Decompression bomb limit exceeded: {} bytes", opts.max_decompress_bytes)
));
}
DecodeResult {

View file

@ -7,9 +7,9 @@
use std::collections::{HashMap, HashSet};
use std::sync::{Arc, RwLock};
use std::borrow::Cow;
use crate::parser::object::{ObjRef, PdfObject, PdfDict, PdfStream};
use crate::parser::object::{ObjRef, PdfObject, PdfDict, PdfStream, ObjectParser};
use crate::parser::stream::{PdfSource, MemorySource};
use crate::diagnostics::{Diagnostic as Diag, DiagCode};
// Use memchr for SIMD-accelerated byte searching in forward_scan_xref
use memchr::{memchr, memchr_iter};
@ -51,74 +51,6 @@ pub enum XrefEntry {
Compressed { obj_stm_nr: u32, index: u32 },
}
/// Diagnostic codes for xref parsing.
///
/// Each variant classifies one kind of problem (or recovery event) reported
/// while reading cross-reference data. The code is carried by
/// [`XrefDiagnostic`] together with a byte offset and a message.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum XrefDiagCode {
/// Invalid xref keyword or header
/// (e.g. the header is not valid UTF-8 or the `xref` keyword is not found)
InvalidXrefHeader,
/// Malformed xref entry (not 20 bytes, bad format)
/// (also: unparsable offset/generation field or an entry type other than `n`/`f`)
InvalidXrefEntry,
/// Invalid subsection header (not "start count")
InvalidSubsectionHeader,
/// Object 0 is not free (violates PDF spec)
ObjectZeroNotFree,
/// Trailer dictionary not found or malformed
TrailerNotFound,
/// Truncated xref table (unexpected EOF)
/// (also reported when a read of the header, data chunk or an entry fails)
XrefTruncated,
/// Forward scan recovered xref entries (EC-07 recovery)
/// (the accompanying message reports how many object entries were recovered)
XrefRepaired,
/// Forward scan disabled for remote sources (would fetch entire file)
RemoteNoForwardScan,
/// Forward scan disabled for linearized files (has partial leading xref)
LinearizedNoForwardScan,
/// Invalid xref stream entry (unknown type, malformed data)
InvalidXrefStreamEntry,
/// Invalid xref stream format (missing required key, bad /W array)
/// (also: the object cannot be read/parsed, is not a stream, or /Type is not /XRef)
InvalidXrefStreamFormat,
/// Xref stream decompression failed
XrefStreamDecompressionFailed,
/// Hybrid xref conflict: traditional table and stream disagree on object state
/// (reported when the table marks an object Free and the stream marks it in use;
/// the traditional table wins)
StructHybridConflict,
/// Circular /Prev reference detected (incremental update cycle)
StructCircularRef,
/// /Prev chain depth exceeded (adversarial input or corrupted file)
StructDepthExceeded,
/// /Prev offset points beyond file size
StructInvalidPrevOffset,
}
/// A diagnostic message emitted during xref parsing.
///
/// Instances are accumulated in the `diagnostics` list of an `XrefSection`
/// rather than being returned as errors, so parsing can continue after a
/// problem has been recorded.
#[derive(Debug, Clone, PartialEq)]
pub struct XrefDiagnostic {
/// The diagnostic code
pub code: XrefDiagCode,
/// Byte offset in the input where the error occurred
/// (some callers pass 0 when the diagnostic is not tied to one position)
pub byte_offset: u64,
/// Human-readable error message
/// (borrowed for fixed texts, owned for messages formatted at runtime)
pub msg: Cow<'static, str>,
}
impl XrefDiagnostic {
/// Create a diagnostic with a static message.
fn with_static(code: XrefDiagCode, byte_offset: u64, msg: &'static str) -> Self {
XrefDiagnostic {
code,
byte_offset,
msg: Cow::Borrowed(msg),
}
}
/// Create a diagnostic with a dynamic message.
fn with_dynamic(code: XrefDiagCode, byte_offset: u64, msg: String) -> Self {
XrefDiagnostic {
code,
byte_offset,
msg: Cow::Owned(msg),
}
}
}
/// Result of parsing a traditional xref table.
///
/// Contains the parsed xref entries and the trailer dictionary.
@ -129,7 +61,7 @@ pub struct XrefSection {
/// The trailer dictionary
pub trailer: Option<PdfDict>,
/// Diagnostics emitted during parsing
pub diagnostics: Vec<XrefDiagnostic>,
pub diagnostics: Vec<Diag>,
/// Whether this xref section is from a hybrid file (traditional + stream merged)
pub is_hybrid: bool,
}
@ -222,8 +154,8 @@ pub fn merge_hybrid(traditional: XrefSection, stream: XrefSection) -> XrefSectio
let stream_is_inuse = matches!(stream_entry, XrefEntry::InUse { .. } | XrefEntry::Compressed { .. });
if trad_is_free && stream_is_inuse {
result.diagnostics.push(XrefDiagnostic::with_dynamic(
XrefDiagCode::StructHybridConflict,
result.diagnostics.push(Diag::with_dynamic(
DiagCode::StructHybridConflict,
0,
format!(
"Object {}: traditional table marks as Free, stream marks as InUse; traditional wins (object is Free)",
@ -446,8 +378,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
let header_bytes = match source.read_at(pos, 1024) {
Ok(bytes) if !bytes.is_empty() => bytes,
_ => {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::XrefTruncated,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefTruncated,
pos,
"Failed to read xref header",
));
@ -461,8 +393,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
let header_str = match std::str::from_utf8(&header_bytes) {
Ok(s) => s,
Err(_) => {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::InvalidXrefHeader,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefInvalidHeader,
pos,
"Invalid UTF-8 in xref header",
));
@ -478,8 +410,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
// Found it! ws_offset is the position of "xref" in header_bytes
break ws_offset;
} else {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::InvalidXrefHeader,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefInvalidHeader,
pos,
"xref keyword not found",
));
@ -522,8 +454,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
let chunk_str = match std::str::from_utf8(&chunk_bytes) {
Ok(s) => s,
Err(_) => {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::XrefTruncated,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefTruncated,
pos,
"Invalid UTF-8 in xref data",
));
@ -547,8 +479,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
let header_line = match read_line_at(source, subsection_start) {
Some(line) => line,
None => {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::InvalidSubsectionHeader,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefInvalidSubsectionHeader,
subsection_start,
"Failed to read subsection header",
));
@ -558,8 +490,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
let header_parts: Vec<&str> = header_line.split_whitespace().collect();
if header_parts.len() != 2 {
result.diagnostics.push(XrefDiagnostic::with_dynamic(
XrefDiagCode::InvalidSubsectionHeader,
result.diagnostics.push(Diag::with_dynamic(
DiagCode::XrefInvalidSubsectionHeader,
subsection_start,
format!("Invalid subsection header: {}", header_line),
));
@ -584,8 +516,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
let obj_start: u32 = match header_parts[0].parse() {
Ok(n) => n,
Err(_) => {
result.diagnostics.push(XrefDiagnostic::with_dynamic(
XrefDiagCode::InvalidSubsectionHeader,
result.diagnostics.push(Diag::with_dynamic(
DiagCode::XrefInvalidSubsectionHeader,
subsection_start,
format!("Invalid subsection start: {}", header_parts[0]),
));
@ -597,8 +529,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
let obj_count: u32 = match header_parts[1].parse() {
Ok(n) => n,
Err(_) => {
result.diagnostics.push(XrefDiagnostic::with_dynamic(
XrefDiagCode::InvalidSubsectionHeader,
result.diagnostics.push(Diag::with_dynamic(
DiagCode::XrefInvalidSubsectionHeader,
subsection_start,
format!("Invalid subsection count: {}", header_parts[1]),
));
@ -635,8 +567,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
let entry_bytes = match source.read_at(pos, 20) {
Ok(bytes) => bytes,
_ => {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::XrefTruncated,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefTruncated,
pos,
"Failed to read xref entry",
));
@ -646,8 +578,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
if entry_bytes.len() < 19 {
// Definitely truncated
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::XrefTruncated,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefTruncated,
pos,
"Xref entry truncated (< 19 bytes)",
));
@ -668,18 +600,16 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
// Object 0 must be free (PDF spec requirement)
if obj_nr == 0 {
if let XrefEntry::InUse { .. } = entry {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::ObjectZeroNotFree,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefObjectZeroNotFree,
entry_start,
"Object 0 is not free (violates PDF spec)",
));
}
}
// Only add in-use entries to the result
// Free entries are ignored per pdftract spec (they don't resolve to objects)
if matches!(entry, XrefEntry::InUse { .. }) {
// Add all entries to the result (both InUse and Free)
// Free entries are needed for /Prev chain merge semantics to track object lifecycle
result.add_entry(obj_nr, entry);
}
pos += stride as u64;
entries_parsed += 1;
}
@ -699,8 +629,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
// If we exited the loop without finding a trailer, emit a diagnostic
if !trailer_found {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::TrailerNotFound,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefTrailerNotFound,
pos,
"Trailer dictionary not found (xref table may be truncated)",
));
@ -717,7 +647,7 @@ fn parse_xref_entry(
obj_nr: u32,
offset: u64,
stride: usize,
diagnostics: &mut Vec<XrefDiagnostic>,
diagnostics: &mut Vec<Diag>,
) -> Option<(u32, XrefEntry)> {
if bytes.len() != stride {
return None;
@ -727,8 +657,8 @@ fn parse_xref_entry(
let entry_str = match std::str::from_utf8(bytes) {
Ok(s) => s,
Err(_) => {
diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::InvalidXrefEntry,
diagnostics.push(Diag::with_static(
DiagCode::XrefInvalidEntry,
offset,
"Invalid UTF-8 in xref entry",
));
@ -739,8 +669,8 @@ fn parse_xref_entry(
// Entry format: "offset/next_free generation f/n" with line ending
let parts: Vec<&str> = entry_str.split_whitespace().collect();
if parts.len() < 3 {
diagnostics.push(XrefDiagnostic::with_dynamic(
XrefDiagCode::InvalidXrefEntry,
diagnostics.push(Diag::with_dynamic(
DiagCode::XrefInvalidEntry,
offset,
format!("Malformed xref entry: {}", entry_str.trim()),
));
@ -750,8 +680,8 @@ fn parse_xref_entry(
let first_field: u64 = match parts[0].parse() {
Ok(n) => n,
Err(_) => {
diagnostics.push(XrefDiagnostic::with_dynamic(
XrefDiagCode::InvalidXrefEntry,
diagnostics.push(Diag::with_dynamic(
DiagCode::XrefInvalidEntry,
offset,
format!("Invalid offset/next_free: {}", parts[0]),
));
@ -762,8 +692,8 @@ fn parse_xref_entry(
let gen_nr: u16 = match parts[1].parse() {
Ok(n) => n,
Err(_) => {
diagnostics.push(XrefDiagnostic::with_dynamic(
XrefDiagCode::InvalidXrefEntry,
diagnostics.push(Diag::with_dynamic(
DiagCode::XrefInvalidEntry,
offset,
format!("Invalid generation: {}", parts[1]),
));
@ -776,8 +706,8 @@ fn parse_xref_entry(
Some('n') | Some('N') => Some((obj_nr, XrefEntry::InUse { offset: first_field, gen_nr })),
Some('f') | Some('F') => Some((obj_nr, XrefEntry::Free { next_free: first_field as u32, gen_nr })),
_ => {
diagnostics.push(XrefDiagnostic::with_dynamic(
XrefDiagCode::InvalidXrefEntry,
diagnostics.push(Diag::with_dynamic(
DiagCode::XrefInvalidEntry,
offset,
format!("Invalid entry type: {}", parts[2]),
));
@ -842,7 +772,7 @@ fn read_line_at(source: &dyn PdfSource, mut pos: u64) -> Option<String> {
fn read_line(
source: &dyn PdfSource,
pos: &mut u64,
diagnostics: &mut Vec<XrefDiagnostic>,
diagnostics: &mut Vec<Diag>,
) -> Option<String> {
let line = read_line_at(source, *pos)?;
// Advance position past the line (including line ending)
@ -865,26 +795,30 @@ fn read_line(
/// Parse the trailer dictionary from the xref trailer section.
///
/// This function first locates the extent of the dictionary (from `<<` to
/// the matching `>>`), then extracts those bytes and parses them using the
/// object parser to get the actual key-value pairs.
fn parse_trailer_dict(
source: &dyn PdfSource,
pos: &mut u64,
diagnostics: &mut Vec<XrefDiagnostic>,
diagnostics: &mut Vec<Diag>,
) -> Option<PdfDict> {
// Skip whitespace before <<
let mut seen_bracket = false;
let mut depth = 0;
let mut chunk_pos = 0u64;
let dict_start_offset = *pos;
let mut dict_end_offset = None;
// First, find the extent of the trailer dict (from << to >>)
loop {
let chunk = match source.read_at(*pos + chunk_pos, 1024) {
let chunk = match source.read_at(dict_start_offset + chunk_pos, 4096) {
Ok(bytes) => bytes,
Err(_) => {
diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::TrailerNotFound,
*pos,
diagnostics.push(Diag::with_static(
DiagCode::XrefTrailerNotFound,
dict_start_offset,
"I/O error reading trailer",
));
return None;
@ -914,8 +848,10 @@ fn parse_trailer_dict(
if j + 1 < remaining.len() && remaining[j + 1] == b'>' {
depth -= 1;
if depth == 0 {
*pos += chunk_pos + j as u64 + 2;
return Some(PdfDict::new());
// Found the end of the dict
let end_offset = dict_start_offset + chunk_pos + j as u64 + 2;
dict_end_offset = Some(end_offset);
break;
}
}
}
@ -927,26 +863,75 @@ fn parse_trailer_dict(
}
}
if dict_end_offset.is_some() {
break;
}
chunk_pos += chunk.len() as u64;
// Safety limit
if chunk_pos > 100000 {
diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::TrailerNotFound,
*pos,
diagnostics.push(Diag::with_static(
DiagCode::XrefTrailerNotFound,
dict_start_offset,
"Trailer dictionary too large or unterminated",
));
return None;
}
}
diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::TrailerNotFound,
*pos,
"Trailer dictionary not found",
// If we didn't find the end, return None
let dict_end_offset = match dict_end_offset {
Some(offset) => offset,
None => {
diagnostics.push(Diag::with_static(
DiagCode::XrefTrailerNotFound,
dict_start_offset,
"Trailer dictionary not found (no << >> markers)",
));
return None;
}
};
// Read the full dict bytes and parse them
let dict_len = (dict_end_offset - dict_start_offset) as usize;
let dict_bytes = match source.read_at(dict_start_offset, dict_len) {
Ok(bytes) => bytes,
Err(_) => {
diagnostics.push(Diag::with_static(
DiagCode::XrefTrailerNotFound,
dict_start_offset,
"Failed to read trailer dictionary bytes",
));
return None;
}
};
// Parse the dict using ObjectParser
let mut parser = ObjectParser::new(&dict_bytes);
if let Some(PdfObject::Dict(dict)) = parser.parse_direct_object() {
// Update pos to after the dict
*pos = dict_end_offset;
// Transfer any diagnostics from the parser
for diag in parser.take_diagnostics() {
diagnostics.push(Diag::with_dynamic(
DiagCode::XrefTrailerNotFound,
dict_start_offset,
diag.message.into_owned(),
));
}
Some(*dict)
} else {
diagnostics.push(Diag::with_static(
DiagCode::XrefTrailerNotFound,
dict_start_offset,
"Failed to parse trailer dictionary as a dict object",
));
None
}
}
/// Parse a direct PDF object (for trailer dictionary parsing).
///
@ -999,8 +984,8 @@ pub fn forward_scan_xref(source: &dyn PdfSource, is_linearized: bool) -> XrefSec
// Check for linearized file
if is_linearized {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::LinearizedNoForwardScan,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefLinearizedNoForwardScan,
0,
"Forward scan disabled for linearized PDF (partial leading xref would cause false results)",
));
@ -1014,8 +999,8 @@ pub fn forward_scan_xref(source: &dyn PdfSource, is_linearized: bool) -> XrefSec
let source_len = match source.len() {
Ok(len) if len > 0 => len,
_ => {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::XrefTruncated,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefTruncated,
0,
"Unable to determine source length for forward scan",
));
@ -1095,8 +1080,8 @@ pub fn forward_scan_xref(source: &dyn PdfSource, is_linearized: bool) -> XrefSec
}
// Emit XREF_REPAIRED diagnostic with count
result.diagnostics.push(XrefDiagnostic::with_dynamic(
XrefDiagCode::XrefRepaired,
result.diagnostics.push(Diag::with_dynamic(
DiagCode::XrefRepaired,
0,
format!("Forward scan recovered {} object entries", entries_found),
));
@ -1162,8 +1147,8 @@ fn forward_scan_memory(data: &[u8], source_len: u64) -> XrefSection {
}
// Emit XREF_REPAIRED diagnostic with count
result.diagnostics.push(XrefDiagnostic::with_dynamic(
XrefDiagCode::XrefRepaired,
result.diagnostics.push(Diag::with_dynamic(
DiagCode::XrefRepaired,
0,
format!("Forward scan recovered {} object entries", entries_found),
));
@ -1403,8 +1388,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
let obj_bytes = match source.read_at(stream_obj_offset, 4096) {
Ok(bytes) if !bytes.is_empty() => bytes,
_ => {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::InvalidXrefStreamFormat,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefInvalidStreamFormat,
stream_obj_offset,
"Failed to read xref stream object",
));
@ -1416,8 +1401,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
let indirect = match parser.parse_indirect_object() {
Some(i) => i,
None => {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::InvalidXrefStreamFormat,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefInvalidStreamFormat,
stream_obj_offset,
"Failed to parse xref stream as indirect object",
));
@ -1429,8 +1414,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
let stream = match indirect.obj {
PdfObject::Stream(s) => s,
_ => {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::InvalidXrefStreamFormat,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefInvalidStreamFormat,
stream_obj_offset,
"Xref stream object is not a stream",
));
@ -1441,8 +1426,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
// Check for /Type /XRef (optional per spec, but we validate it)
if let Some(PdfObject::Name(type_name)) = stream.dict.get("Type") {
if type_name.as_ref() != "/XRef" && type_name.as_ref() != "XRef" {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::InvalidXrefStreamFormat,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefInvalidStreamFormat,
stream_obj_offset,
"Stream /Type is not /XRef",
));
@ -1453,8 +1438,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
let size = match stream.dict.get("Size") {
Some(PdfObject::Integer(n)) if *n >= 0 => *n as u32,
_ => {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::InvalidXrefStreamFormat,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefInvalidStreamFormat,
stream_obj_offset,
"Missing or invalid /Size in xref stream",
));
@ -1469,8 +1454,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
.filter_map(|o| o.as_int())
.collect();
if widths.len() != 3 {
result.diagnostics.push(XrefDiagnostic::with_dynamic(
XrefDiagCode::InvalidXrefStreamFormat,
result.diagnostics.push(Diag::with_dynamic(
DiagCode::XrefInvalidStreamFormat,
stream_obj_offset,
format!("/W array must have 3 elements, got {}", widths.len()),
));
@ -1478,8 +1463,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
}
// Widths can be 0, but negative is invalid
if widths.iter().any(|&w| w < 0) {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::InvalidXrefStreamFormat,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefInvalidStreamFormat,
stream_obj_offset,
"/W array contains negative values",
));
@ -1488,8 +1473,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
widths
}
_ => {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::InvalidXrefStreamFormat,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefInvalidStreamFormat,
stream_obj_offset,
"Missing or invalid /W in xref stream",
));
@ -1512,8 +1497,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
let first = match first_obj.as_int() {
Some(n) if n >= 0 => n as u32,
_ => {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::InvalidXrefStreamFormat,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefInvalidStreamFormat,
stream_obj_offset,
"Invalid /Index first value",
));
@ -1523,8 +1508,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
let count = match iter.peek() {
Some(PdfObject::Integer(n)) if *n >= 0 => *n as u32,
_ => {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::InvalidXrefStreamFormat,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefInvalidStreamFormat,
stream_obj_offset,
"Invalid /Index count value",
));
@ -1535,8 +1520,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
pairs.push((first, count));
}
if pairs.is_empty() {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::InvalidXrefStreamFormat,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefInvalidStreamFormat,
stream_obj_offset,
"/Index array is empty",
));
@ -1546,8 +1531,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
}
None => vec![(0, size)],
_ => {
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::InvalidXrefStreamFormat,
result.diagnostics.push(Diag::with_static(
DiagCode::XrefInvalidStreamFormat,
stream_obj_offset,
"Invalid /Index in xref stream (not an array)",
));
@ -1582,8 +1567,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
if decoded.is_empty() {
// Check if this is a legitimate empty stream (no objects) or an error
// A valid xref stream with no objects would have /Size 0, which is unusual
result.diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::XrefStreamDecompressionFailed,
result.diagnostics.push(Diag::with_static(
DiagCode::StreamDecodeError,
stream_obj_offset,
"Xref stream decompression produced empty output",
));
@ -1600,8 +1585,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
// Check we have enough bytes for this entry
if data_pos + entry_stride > decoded.len() {
result.diagnostics.push(XrefDiagnostic::with_dynamic(
XrefDiagCode::InvalidXrefStreamEntry,
result.diagnostics.push(Diag::with_dynamic(
DiagCode::XrefInvalidStreamEntry,
stream_obj_offset,
format!("Xref stream truncated at object {}", obj_nr),
));
@ -1657,8 +1642,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
}
_ => {
// Unknown type - emit diagnostic and treat as free
result.diagnostics.push(XrefDiagnostic::with_dynamic(
XrefDiagCode::InvalidXrefStreamEntry,
result.diagnostics.push(Diag::with_dynamic(
DiagCode::XrefInvalidStreamEntry,
stream_obj_offset,
format!("Invalid xref entry type {} for object {}", entry_type, obj_nr),
));
@ -2105,12 +2090,12 @@ pub fn load_xref_with_prev_chain(source: &dyn PdfSource, start_offset: u64) -> X
offset: u64,
visited: &mut HashSet<u64>,
depth: u32,
diagnostics: &mut Vec<XrefDiagnostic>,
diagnostics: &mut Vec<Diag>,
) -> XrefSection {
// Cycle detection
if visited.contains(&offset) {
diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::StructCircularRef,
diagnostics.push(Diag::with_static(
DiagCode::StructCircularRef,
offset,
"Circular /Prev reference detected; stopping chain traversal",
));
@ -2121,8 +2106,8 @@ pub fn load_xref_with_prev_chain(source: &dyn PdfSource, start_offset: u64) -> X
// Depth limit check
if depth >= MAX_PREV_DEPTH {
diagnostics.push(XrefDiagnostic::with_dynamic(
XrefDiagCode::StructDepthExceeded,
diagnostics.push(Diag::with_dynamic(
DiagCode::StructDepthExceeded,
offset,
format!("/Prev chain depth exceeded maximum of {}", MAX_PREV_DEPTH).into(),
));
@ -2143,14 +2128,13 @@ pub fn load_xref_with_prev_chain(source: &dyn PdfSource, start_offset: u64) -> X
})
});
// Validate /Prev offset if present
let mut should_follow_prev = false;
// Validate /Prev offset and recursively load previous revision if present
if let Some(prev) = prev_offset {
match source.len() {
Ok(file_size) if prev > file_size => {
// /Prev points beyond file size - invalid
diagnostics.push(XrefDiagnostic::with_dynamic(
XrefDiagCode::StructInvalidPrevOffset,
diagnostics.push(Diag::with_dynamic(
DiagCode::StructInvalidPrevOffset,
offset,
format!("/Prev offset {} exceeds file size {}; ignoring /Prev key", prev, file_size).into(),
));
@ -2158,25 +2142,13 @@ pub fn load_xref_with_prev_chain(source: &dyn PdfSource, start_offset: u64) -> X
if let Some(ref mut trailer) = current.trailer {
trailer.shift_remove("Prev");
}
// Return current revision without following /Prev
let mut result = current;
result.diagnostics.extend(diagnostics.drain(..));
return result;
}
Ok(_) => {
// Valid /Prev offset
should_follow_prev = true;
}
Err(_) => {
// Can't determine file size - be conservative and don't follow
diagnostics.push(XrefDiagnostic::with_static(
XrefDiagCode::StructInvalidPrevOffset,
offset,
"Cannot determine file size; ignoring /Prev key",
));
}
}
}
// Recursively load previous revision if /Prev exists
if should_follow_prev {
let prev = prev_offset.unwrap(); // Safe because we checked should_follow_prev
// Valid /Prev offset - recursively load
let mut older = walk_chain(source, prev, visited, depth + 1, diagnostics);
// Merge: older entries first, then current (newer) entries override
@ -2200,10 +2172,26 @@ pub fn load_xref_with_prev_chain(source: &dyn PdfSource, start_offset: u64) -> X
older.diagnostics.extend(diagnostics.drain(..));
older
}
Err(_) => {
// Can't determine file size - be conservative and don't follow
diagnostics.push(Diag::with_static(
DiagCode::StructInvalidPrevOffset,
offset,
"Cannot determine file size; ignoring /Prev key",
));
// Return current revision without following /Prev
let mut result = current;
result.diagnostics.extend(diagnostics.drain(..));
result
}
}
} else {
// No /Prev - this is the baseline (original) revision
// Return current as-is
current
// Return current with any diagnostics from this level
let mut result = current;
result.diagnostics.extend(diagnostics.drain(..));
result
}
}
@ -2341,26 +2329,26 @@ mod tests {
/// Verifies that a diagnostic built with `with_static` exposes the code,
/// byte offset, and message text it was constructed from.
///
/// NOTE(review): this listing interleaves the removed `XrefDiagnostic` /
/// `XrefDiagCode` lines with their `Diag` / `DiagCode` replacements, so the
/// constructor call and each assertion appear in both forms. Presumably only
/// the `Diag` form is live after the change; confirm against the applied file.
#[test]
fn test_xref_diagnostic_static() {
let diag = XrefDiagnostic::with_static(
XrefDiagCode::InvalidXrefHeader,
let diag = Diag::with_static(
DiagCode::XrefInvalidHeader,
100,
"test message",
);
assert_eq!(diag.byte_offset, 100);
assert_eq!(diag.msg.as_ref(), "test message");
assert!(matches!(diag.code, XrefDiagCode::InvalidXrefHeader));
// In the `Diag` form the offset is compared against `Some(..)` and the text
// is read from the `message` field rather than `msg`.
assert_eq!(diag.byte_offset, Some(100));
assert_eq!(diag.message.as_ref(), "test message");
assert!(matches!(diag.code, DiagCode::XrefInvalidHeader));
}
/// Verifies that a diagnostic built with `with_dynamic` from an owned
/// `String` exposes the code, byte offset, and message text it was
/// constructed from.
///
/// NOTE(review): this listing interleaves the removed `XrefDiagnostic` /
/// `XrefDiagCode` lines with their `Diag` / `DiagCode` replacements, so the
/// constructor call and each assertion appear in both forms. Presumably only
/// the `Diag` form is live after the change; confirm against the applied file.
#[test]
fn test_xref_diagnostic_dynamic() {
let diag = XrefDiagnostic::with_dynamic(
XrefDiagCode::InvalidXrefEntry,
let diag = Diag::with_dynamic(
DiagCode::XrefInvalidEntry,
200,
"dynamic message".to_string(),
);
assert_eq!(diag.byte_offset, 200);
assert_eq!(diag.msg.as_ref(), "dynamic message");
assert!(matches!(diag.code, XrefDiagCode::InvalidXrefEntry));
// In the `Diag` form the offset is compared against `Some(..)` and the text
// is read from the `message` field rather than `msg`.
assert_eq!(diag.byte_offset, Some(200));
assert_eq!(diag.message.as_ref(), "dynamic message");
assert!(matches!(diag.code, DiagCode::XrefInvalidEntry));
}
#[test]
@ -2378,12 +2366,15 @@ trailer\n<< /Size 6 >>\n";
let source = MemorySource::new(xref_data.to_vec());
let result = parse_traditional_xref(&source, 0);
// Should have parsed 4 in-use entries (objects 0 and 3 are free and ignored)
assert_eq!(result.len(), 4);
// Should have parsed 6 entries (all objects 0-5, including free entries)
// Free entries are tracked for /Prev chain merge semantics
assert_eq!(result.len(), 6);
// Check specific entries
assert_eq!(result.entries.get(&0), Some(&XrefEntry::Free { next_free: 0, gen_nr: 65535 }));
assert_eq!(result.entries.get(&1), Some(&XrefEntry::InUse { offset: 17, gen_nr: 0 }));
assert_eq!(result.entries.get(&2), Some(&XrefEntry::InUse { offset: 81, gen_nr: 0 }));
assert_eq!(result.entries.get(&3), Some(&XrefEntry::Free { next_free: 0, gen_nr: 7 }));
assert_eq!(result.entries.get(&4), Some(&XrefEntry::InUse { offset: 331, gen_nr: 0 }));
assert_eq!(result.entries.get(&5), Some(&XrefEntry::InUse { offset: 409, gen_nr: 0 }));
@ -2403,8 +2394,10 @@ trailer\r\n<< /Size 3 >>\r\n";
let source = MemorySource::new(xref_data.to_vec());
let result = parse_traditional_xref(&source, 0);
// Should have parsed 2 in-use entries
assert_eq!(result.len(), 2);
// Should have parsed 3 entries (all objects 0-2, including free entry)
// Free entries are tracked for /Prev chain merge semantics
assert_eq!(result.len(), 3);
assert_eq!(result.entries.get(&0), Some(&XrefEntry::Free { next_free: 0, gen_nr: 65535 }));
assert_eq!(result.entries.get(&1), Some(&XrefEntry::InUse { offset: 15, gen_nr: 0 }));
assert_eq!(result.entries.get(&2), Some(&XrefEntry::InUse { offset: 78, gen_nr: 0 }));
}
@ -2421,7 +2414,10 @@ trailer\n<< /Size 3 >>\n";
let source = MemorySource::new(xref_data.to_vec());
let result = parse_traditional_xref(&source, 0);
// Should have parsed 2 in-use entries
// Should have parsed 3 entries (all objects 0-2, including free entry)
// Free entries are tracked for /Prev chain merge semantics
assert_eq!(result.len(), 3);
assert_eq!(result.entries.get(&0), Some(&XrefEntry::Free { next_free: 0, gen_nr: 65535 }));
assert_eq!(result.len(), 2);
assert_eq!(result.entries.get(&1), Some(&XrefEntry::InUse { offset: 15, gen_nr: 0 }));
assert_eq!(result.entries.get(&2), Some(&XrefEntry::InUse { offset: 78, gen_nr: 0 }));
@ -2473,7 +2469,7 @@ trailer\n<< /Size 4 >>\n";
// Should have emitted a diagnostic for the bad entry
assert!(!result.diagnostics.is_empty());
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::InvalidXrefEntry));
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefInvalidEntry));
}
#[test]
@ -2489,7 +2485,7 @@ trailer\n<< /Size 3 >>\n";
let result = parse_traditional_xref(&source, 0);
// Should emit diagnostic for object 0 not being free
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::ObjectZeroNotFree));
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefObjectZeroNotFree));
}
#[test]
@ -2502,12 +2498,13 @@ trailer\n<< /Size 3 >>\n";
let source = MemorySource::new(xref_data.to_vec());
let result = parse_traditional_xref(&source, 0);
// Should still parse the entry
assert_eq!(result.len(), 1);
// Should still parse both entries (including free entry)
// Free entries are tracked for /Prev chain merge semantics
assert_eq!(result.len(), 2);
assert!(result.trailer.is_none());
// Should emit diagnostic about missing trailer
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::TrailerNotFound));
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefTrailerNotFound));
}
#[test]
@ -2686,7 +2683,7 @@ trailer\n<< /Size 3 >>\n";
assert!(result.entries.contains_key(&3));
// Check for XREF_REPAIRED diagnostic
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::XrefRepaired));
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefRepaired));
}
#[test]
@ -2719,7 +2716,7 @@ trailer\n<< /Size 3 >>\n";
assert_eq!(result.len(), 0);
// Should have LINEARIZED_NO_FORWARD_SCAN diagnostic
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::LinearizedNoForwardScan));
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefLinearizedNoForwardScan));
}
#[test]
@ -3119,7 +3116,7 @@ trailer\n<< /Size 3 >>\n";
assert_eq!(result.entries.get(&2), Some(&XrefEntry::InUse { offset: 2000, gen_nr: 0 }));
// Should have emitted a diagnostic for invalid type
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::InvalidXrefStreamEntry));
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefInvalidStreamEntry));
}
#[test]
@ -3134,7 +3131,7 @@ trailer\n<< /Size 3 >>\n";
let result = parse_xref_stream(&source, 0);
// Should have emitted diagnostic about missing /Size
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::InvalidXrefStreamFormat));
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefInvalidStreamFormat));
}
#[test]
@ -3156,7 +3153,7 @@ trailer\n<< /Size 3 >>\n";
let result = parse_xref_stream(&source, 0);
// Should have emitted diagnostic about invalid /W
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::InvalidXrefStreamFormat));
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefInvalidStreamFormat));
}
#[test]
@ -3443,7 +3440,7 @@ trailer\n<< /Size 3 >>\n";
assert!(merged.is_hybrid);
// Should have emitted STRUCT_HYBRID_CONFLICT diagnostic
assert!(merged.diagnostics.iter().any(|d| matches!(d.code, XrefDiagCode::StructHybridConflict)));
assert!(merged.diagnostics.iter().any(|d| matches!(d.code, DiagCode::StructHybridConflict)));
// Traditional Free wins
assert_eq!(merged.entries.get(&1), Some(&XrefEntry::Free { next_free: 0, gen_nr: 65535 }));
}
@ -3829,8 +3826,8 @@ trailer\n<< /Size 3 >>\n";
// Load from the latest revision
let result = load_xref_with_prev_chain(&source, rev3_offset);
// Verify all 5 objects are present
assert_eq!(result.len(), 5, "Should have entries for objects 1-5, got {}", result.len());
// Verify all 6 entries are present (including object 0)
assert_eq!(result.len(), 6, "Should have entries for objects 0-5, got {}", result.len());
// Verify LATEST values win:
// Object 1: unchanged from rev1 (offset 100)
@ -3980,11 +3977,12 @@ trailer\n<< /Size 3 >>\n";
let root = trailer.get("Root");
assert!(root.is_some());
match root {
Some(PdfObject::Array(ref arr)) if arr.len() == 3 => {
// [2, 0, R] - object number 2
assert_eq!(arr[0], PdfObject::Integer(2));
Some(PdfObject::Ref(obj_ref)) => {
// 2 0 R - indirect reference to object 2
assert_eq!(obj_ref.object, 2);
assert_eq!(obj_ref.generation, 0);
}
_ => panic!("Expected /Root to be an array [2 0 R]"),
_ => panic!("Expected /Root to be an indirect reference 2 0 R"),
}
// Should have /Info from rev2
@ -4043,7 +4041,7 @@ trailer\n<< /Size 3 >>\n";
let result = load_xref_with_prev_chain(&source, rev3_offset);
// Should emit STRUCT_CIRCULAR_REF diagnostic
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::StructCircularRef));
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::StructCircularRef));
}
/// Test depth limit enforcement.
@ -4081,7 +4079,7 @@ trailer\n<< /Size 3 >>\n";
let result = load_xref_with_prev_chain(&source, start_offset);
// Should emit STRUCT_DEPTH_EXCEEDED diagnostic
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::StructDepthExceeded));
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::StructDepthExceeded));
}
/// Test /Prev offset pointing beyond file size.
@ -4109,7 +4107,7 @@ trailer\n<< /Size 3 >>\n";
let result = load_xref_with_prev_chain(&source, rev2_offset);
// Should emit STRUCT_INVALID_PREV_OFFSET diagnostic
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::StructInvalidPrevOffset));
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::StructInvalidPrevOffset));
// /Prev should be removed from trailer
let trailer = result.trailer.as_ref().unwrap();
@ -4134,7 +4132,7 @@ trailer\n<< /Size 3 >>\n";
let result = load_xref_with_prev_chain(&source, offset);
// Should not follow /Prev 0, should just return this single revision
assert!(!result.diagnostics.iter().any(|d| d.code == XrefDiagCode::StructInvalidPrevOffset));
assert!(!result.diagnostics.iter().any(|d| d.code == DiagCode::StructInvalidPrevOffset));
}
/// Test negative /Prev treated as "no previous revision".
@ -4155,7 +4153,7 @@ trailer\n<< /Size 3 >>\n";
let result = load_xref_with_prev_chain(&source, offset);
// Should not follow negative /Prev
assert!(!result.diagnostics.iter().any(|d| d.code == XrefDiagCode::StructInvalidPrevOffset));
assert!(!result.diagnostics.iter().any(|d| d.code == DiagCode::StructInvalidPrevOffset));
}
/// Test hybrid file in /Prev chain.