diff --git a/.needle-predispatch-sha b/.needle-predispatch-sha index 99a91f5..e6b6f58 100644 --- a/.needle-predispatch-sha +++ b/.needle-predispatch-sha @@ -1 +1 @@ -c6be8e6b574e5a1ef0fb65fb3aacebfe36740030 +37413028fa8535169cd8a39e47bee704cfc7bf80 diff --git a/Cargo.lock b/Cargo.lock index 1adb18b..9205645 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -738,6 +738,7 @@ dependencies = [ "chrono", "clap", "lzw", + "pdftract-core", "regex", "secrecy", "serde", diff --git a/crates/pdftract-cli/Cargo.toml b/crates/pdftract-cli/Cargo.toml index acd640c..9e49f23 100644 --- a/crates/pdftract-cli/Cargo.toml +++ b/crates/pdftract-cli/Cargo.toml @@ -22,6 +22,7 @@ anyhow = { workspace = true } chrono = { version = "0.4", features = ["serde"] } clap = { version = "4.5", features = ["derive"] } lzw = { workspace = true } +pdftract-core = { path = "../pdftract-core" } regex = "1.10" secrecy = { workspace = true } serde = { workspace = true, features = ["derive"] } diff --git a/crates/pdftract-cli/src/main.rs b/crates/pdftract-cli/src/main.rs index 8f28426..0db7c3a 100644 --- a/crates/pdftract-cli/src/main.rs +++ b/crates/pdftract-cli/src/main.rs @@ -8,6 +8,9 @@ mod mcp; mod password; use codegen::Language; +// Re-export diagnostics for the --list-diagnostics and --explain-diagnostic commands +pub use pdftract_core::diagnostics::{DiagCode, DiagInfo, DIAGNOSTIC_CATALOG}; + #[derive(Parser)] #[command(name = "pdftract")] #[command(about = "pdftract CLI - PDF extraction and conformance testing", long_about = None)] @@ -18,6 +21,13 @@ struct Cli { #[derive(Subcommand)] enum Commands { + /// List all diagnostic codes with their metadata + ListDiagnostics, + /// Explain a specific diagnostic code in detail + ExplainDiagnostic { + /// Diagnostic code to explain (e.g., STRUCT_MISSING_KEY, STREAM_BOMB) + code: String, + }, /// Compare actual results against expected values with tolerances (for conformance testing) Compare { /// Path to the actual results JSON @@ -113,6 +123,12 @@ fn main() -> Result<()> { let cli = Cli::parse(); match cli.command { + Commands::ListDiagnostics => { + cmd_list_diagnostics()?; + } + Commands::ExplainDiagnostic { code } => { + cmd_explain_diagnostic(&code)?; + } Commands::Compare { actual, expected, @@ -192,6 +208,335 @@ fn cmd_extract( Ok(()) } +fn cmd_list_diagnostics() -> Result<()> { + println!("pdftract Diagnostic Codes"); + println!(); + println!("This catalog lists all diagnostic codes emitted during PDF parsing and extraction."); + println!("Each diagnostic includes a severity level, recoverable flag, phase origin, and suggested action."); + println!(); + + // Group by category + let mut categories: std::collections::HashMap<&str, Vec<&DiagInfo>> = std::collections::HashMap::new(); + for info in DIAGNOSTIC_CATALOG { + categories.entry(info.category).or_default().push(info); + } + + // Define category order + let category_order = vec![ + "STRUCT", "XREF", "STREAM", "ENCRYPTION", "PAGE", "FONT", + "OCR", "REMOTE", "GSTATE", "LAYOUT", "MCP", "CACHE", + ]; + + for category in category_order { + if let Some(infos) = categories.get(category) { + println!("=== {}_* codes ===", category); + println!(); + + for info in infos { + println!("{} ({})", info.code, info.severity); + println!(" Phase: {}", info.phase); + println!(" Recoverable: {}", if info.recoverable { "Yes" } else { "No" }); + println!(" Action: {}", info.suggested_action); + println!(); + } + } + } + + println!("Total: {} diagnostic codes", DIAGNOSTIC_CATALOG.len()); + Ok(()) +} + +fn cmd_explain_diagnostic(code: &str) -> Result<()> { + // Normalize the input code (handle case-insensitivity and strip whitespace) + let code_upper = code.to_uppercase().trim().to_string(); + + // Try to find the diagnostic by name in the catalog + let info = DIAGNOSTIC_CATALOG + .iter() + .find(|info| info.code.name() == code_upper) + .ok_or_else(|| anyhow::anyhow!("Unknown diagnostic code: {}", code))?; + + println!("Diagnostic: {}", info.code); + println!("Category: {}", info.category); + println!("Severity: {}", info.severity); + println!("Recoverable: {}", if info.recoverable { "Yes" } else { "No" }); + println!("Phase Origin: {}", info.phase); + println!(); + println!("Description:"); + + // Get the description from the DiagCode's doc comment + // We can't access doc comments at runtime, but we can provide useful info + match info.code { + DiagCode::StructInvalidName => { + println!(" Invalid name character or malformed name object"); + println!(" Names containing invalid characters or exceeding the 127-byte limit are truncated."); + } + DiagCode::StructInvalidHex => { + println!(" Invalid hexadecimal character in hex string or name escape"); + println!(" Non-hex characters in <...> strings or #XX escapes are skipped."); + } + DiagCode::StructInvalidOctal => { + println!(" Invalid octal escape sequence in literal string"); + println!(" Invalid \\NNN escapes are passed through literally."); + } + DiagCode::StructInvalidStreamHeader => { + println!(" Invalid stream header"); + println!(" The 'stream' keyword must be followed by CRLF or LF per PDF spec."); + } + DiagCode::StructUnexpectedByte => { + println!(" Unexpected byte during parsing"); + println!(" A byte doesn't match expected token syntax; lexer resynchronizes."); + } + DiagCode::StructUnexpectedEof => { + println!(" Unexpected end of file"); + println!(" The file ends mid-token; parsing continues with partial data."); + } + DiagCode::StructUnterminatedString => { + println!(" Unterminated literal string"); + println!(" A literal string is missing a closing parenthesis."); + } + DiagCode::StructMissingKey => { + println!(" Missing required dictionary key"); + println!(" A required key is absent from a dictionary."); + } + DiagCode::StructCircularRef => { + println!(" Circular reference detected"); + println!(" An indirect reference forms a cycle (A → B → A)."); + } + DiagCode::StructXobjectCycle => { + println!(" Form XObject cycle detected"); + println!(" A form XObject invokes itself directly or indirectly."); + } + DiagCode::StructDepthExceeded => { + println!(" Dictionary nesting depth exceeds limit"); + println!(" Structure is too deeply nested; truncated to prevent stack overflow."); + } + DiagCode::StructInvalidDictValue => { + println!(" Invalid dictionary value"); + println!(" A dictionary key is not followed by a value."); + } + DiagCode::StructInvalidDictKey => { + println!(" Invalid dictionary key"); + println!(" A dictionary key is not a name object."); + } + DiagCode::StructInvalidIndirectHeader => { + println!(" Invalid indirect object header"); + println!(" The 'N G obj' header is malformed."); + } + DiagCode::StructIntegerOverflow => { + println!(" Integer overflow during parsing"); + println!(" An integer would overflow i64; value is clamped."); + } + DiagCode::StructInvalidObjstm => { + println!(" Invalid object stream format"); + println!(" An object stream has a malformed header or invalid data."); + } + DiagCode::StructInvalidGeometry => { + println!(" Invalid geometry value"); + println!(" NaN or Inf in MediaBox/CropBox/Rotate; canonicalized to 0."); + } + DiagCode::StructInvalidUtf16 => { + println!(" Invalid UTF-16BE encoding"); + println!(" A UTF-16BE string has odd length or invalid encoding."); + } + DiagCode::StructUnresolvedDestination => { + println!(" Unresolved named destination"); + println!(" An outline references a named destination (not yet resolved)."); + } + DiagCode::StructNonGotoOutline => { + println!(" Non-GoTo action in outline"); + println!(" An outline has an action other than GoTo/URI."); + } + DiagCode::StructInvalidPdfDocEncoding => { + println!(" Invalid PDFDocEncoding"); + println!(" A PDFDocEncoding string cannot be decoded to UTF-8."); + } + DiagCode::StructHybridConflict => { + println!(" Hybrid xref conflict"); + println!(" Traditional xref and stream disagree on object state."); + } + DiagCode::StructInvalidPrevOffset => { + println!(" Invalid /Prev offset in xref chain"); + println!(" A trailer's /Prev offset points to invalid data."); + } + DiagCode::XrefInvalidHeader => { + println!(" Invalid xref keyword or header"); + println!(" The xref table doesn't start with the 'xref' keyword."); + } + DiagCode::XrefInvalidEntry => { + println!(" Malformed xref entry"); + println!(" An xref entry doesn't match the 20-byte format."); + } + DiagCode::XrefInvalidSubsectionHeader => { + println!(" Invalid subsection header"); + println!(" An xref subsection header is malformed."); + } + DiagCode::XrefObjectZeroNotFree => { + println!(" Object 0 is not free"); + println!(" Object 0 is marked as in-use, violating PDF spec."); + } + DiagCode::XrefTrailerNotFound => { + println!(" Trailer dictionary not found"); + println!(" The trailer dictionary couldn't be located or parsed."); + } + DiagCode::XrefTruncated => { + println!(" Truncated xref table"); + println!(" The xref table ends unexpectedly."); + } + DiagCode::XrefRepaired => { + println!(" Xref was reconstructed"); + println!(" Forward scan recovered xref entries after primary strategies failed."); + } + DiagCode::XrefLinearizedNoForwardScan => { + println!(" Forward scan disabled for linearized PDF"); + println!(" Forward scan would incorrectly find the partial first-page xref."); + } + DiagCode::XrefRemoteNoForwardScan => { + println!(" Forward scan disabled for remote sources"); + println!(" Forward scan would require fetching the entire file."); + } + DiagCode::XrefInvalidStreamFormat => { + println!(" Invalid xref stream format"); + println!(" An xref stream has a malformed header or invalid /W array."); + } + DiagCode::XrefInvalidStreamEntry => { + println!(" Invalid xref stream entry"); + println!(" An xref stream entry cannot be parsed due to invalid data."); + } + DiagCode::StreamDecodeError => { + println!(" Stream decompression failed"); + println!(" A stream decoder encountered corrupt data mid-decompression."); + } + DiagCode::StreamBomb => { + println!(" Decompression bomb limit exceeded"); + println!(" A stream's decompressed size would exceed the safety limit."); + } + DiagCode::StreamUnknownFilter => { + println!(" Unknown filter name"); + println!(" A stream specifies an unsupported filter."); + } + DiagCode::StreamInvalidParams => { + println!(" Invalid filter parameters"); + println!(" A stream's /DecodeParms dictionary is malformed."); + } + DiagCode::EncryptionUnsupported => { + println!(" Unsupported encryption or no password"); + println!(" PDF is encrypted and no password was supplied or algorithm is unsupported."); + } + DiagCode::EncryptionWrongPassword => { + println!(" Password incorrect"); + println!(" The supplied password doesn't match the PDF's encryption key."); + } + DiagCode::PageOutOfRange => { + println!(" Page number out of range"); + println!(" --pages specifies a page number greater than the document's page count."); + } + DiagCode::PageInvalidCount => { + println!(" Invalid page count"); + println!(" The /Count key in the /Pages tree is invalid."); + } + DiagCode::PageInvalidRotate => { + println!(" Invalid /Rotate value"); + println!(" A page's /Rotate value is not a multiple of 90."); + } + DiagCode::FontGlyphUnmapped => { + println!(" Glyph could not be mapped to Unicode"); + println!(" A glyph has no entry in /ToUnicode CMap, AGL, fingerprint, or shape match."); + } + DiagCode::FontNotFound => { + println!(" Font not found or couldn't be parsed"); + println!(" A referenced font is missing from the PDF or couldn't be parsed."); + } + DiagCode::FontInvalidCmap => { + println!(" Invalid CMap format"); + println!(" A CMap stream is malformed."); + } + DiagCode::OcrJbig2Unsupported => { + println!(" JBIG2 decoder not available"); + println!(" Build with --features full-render to enable JBIG2 decoding."); + } + DiagCode::OcrJpxUnsupported => { + println!(" JPEG2000 decoder not available"); + println!(" Build with --features full-render or install libopenjp2."); + } + DiagCode::OcrCcittUnsupported => { + println!(" CCITT fax decoder not available"); + println!(" Install libtiff system library or build with --features full-render."); + } + DiagCode::OcrTesseractFailed => { + println!(" Tesseract OCR failed"); + println!(" Tesseract crashed or returned an error."); + } + DiagCode::OcrBrokenVectorUnavailable => { + println!(" OCR unavailable on broken-vector page"); + println!(" Build with --features ocr to enable OCR recovery."); + } + DiagCode::RemoteFetchInterrupted => { + println!(" HTTP fetch interrupted or failed"); + println!(" Network error, timeout, or server error occurred."); + } + DiagCode::RemoteNoRangeSupport => { + println!(" Server does not support Range requests"); + println!(" Falls back to downloading the entire file."); + } + DiagCode::RemoteTlsFailed => { + println!(" TLS handshake failed"); + println!(" The TLS handshake failed; check the server's certificate."); + } + DiagCode::RemoteDnsFailed => { + println!(" DNS resolution failed"); + println!(" The hostname could not be resolved."); + } + DiagCode::GstateStackOverflow => { + println!(" Graphics state stack overflow"); + println!(" The graphics state stack exceeded the internal limit."); + } + DiagCode::GstateStackUnderflow => { + println!(" Graphics state stack underflow"); + println!(" More Q operators than q operators in the content stream."); + } + DiagCode::GstateBtEtMismatch => { + println!(" Mismatched BT/ET pair"); + println!(" The content stream has mismatched BT/ET operators."); + } + DiagCode::LayoutTaggedPdfDeferred => { + println!(" Tagged PDF StructTree deferred"); + println!(" StructTree is ignored; XY-cut is used instead (Phase 7.1 pending)."); + } + DiagCode::LayoutReadingOrderAmbiguous => { + println!(" Reading order may be incorrect"); + println!(" The reading order algorithm detected ambiguity."); + } + DiagCode::LayoutLowReadability => { + println!(" Low readability score"); + println!(" Page readability is below 0.85; may indicate mojibake."); + } + DiagCode::McpToolInvalidParams => { + println!(" MCP tool call has invalid parameters"); + println!(" An MCP tool call doesn't match the tool's schema."); + } + DiagCode::McpPathTraversal => { + println!(" MCP path traversal attempt"); + println!(" An MCP path escapes the --root directory."); + } + DiagCode::CacheEntryCorrupt => { + println!(" Cache entry is corrupted"); + println!(" A cached entry failed to deserialize and was deleted."); + } + DiagCode::CacheWriteFailed => { + println!(" Cache write failed"); + println!(" Writing to the cache failed (e.g., out of disk space)."); + } + } + + println!(); + println!("Suggested Action: {}", info.suggested_action); + println!(); + println!("Phase Origin: {}", info.phase); + + Ok(()) +} + fn cmd_compare(actual: PathBuf, expected: PathBuf, tolerances: Option, format: &str) -> Result<()> { let actual_json = fs::read_to_string(&actual) .context(format!("Failed to read actual results from {:?}", actual))?; diff --git a/crates/pdftract-core/proptest-regressions/parser/lexer/mod.txt b/crates/pdftract-core/proptest-regressions/parser/lexer/mod.txt index 70e607a..8ae6d9a 100644 --- a/crates/pdftract-core/proptest-regressions/parser/lexer/mod.txt +++ b/crates/pdftract-core/proptest-regressions/parser/lexer/mod.txt @@ -5,3 +5,4 @@ # It is recommended to check this file in to source control so that # everyone who runs the test benefits from these saved cases. cc 9eb796a85e40a841d1cd43881214b688676e982ec812d8c66313ea753a019ec6 # shrinks to bytes = [123] +cc e23be3e45757e93e13f0d3daf57c9fbce249a6629b9bfc8d0cb14ebf332767ae # shrinks to bytes = [41] diff --git a/crates/pdftract-core/src/diagnostics.rs b/crates/pdftract-core/src/diagnostics.rs index 8e630d3..2be2eb9 100644 --- a/crates/pdftract-core/src/diagnostics.rs +++ b/crates/pdftract-core/src/diagnostics.rs @@ -383,6 +383,30 @@ pub enum DiagCode { /// Phase origin: 1.3 XrefRemoteNoForwardScan, + /// Invalid xref stream format + /// + /// Emitted when an xref stream has a malformed header, invalid /W array, + /// or other format violations. The stream is skipped. + /// + /// Phase origin: 1.3 + XrefInvalidStreamFormat, + + /// Invalid xref stream entry + /// + /// Emitted when an xref stream entry cannot be parsed due to invalid data + /// in the stream's compressed entries section. + /// + /// Phase origin: 1.3 + XrefInvalidStreamEntry, + + /// Invalid /Prev offset in xref chain + /// + /// Emitted when a trailer's /Prev offset points to invalid data (outside file, + /// not at xref boundary, etc.). The chain is truncated at this point. + /// + /// Phase origin: 1.3 + StructInvalidPrevOffset, + // === STREAM_* codes === /// Stream decompression failed (corrupt data) @@ -687,7 +711,12 @@ impl DiagCode { | DiagCode::XrefTruncated | DiagCode::XrefRepaired | DiagCode::XrefLinearizedNoForwardScan - | DiagCode::XrefRemoteNoForwardScan => "XREF", + | DiagCode::XrefRemoteNoForwardScan + | DiagCode::XrefInvalidStreamFormat + | DiagCode::XrefInvalidStreamEntry => "XREF", + + // STRUCT_* (continued) + DiagCode::StructInvalidPrevOffset => "STRUCT", // STREAM_* DiagCode::StreamDecodeError @@ -774,6 +803,9 @@ impl DiagCode { DiagCode::XrefRepaired => "XREF_REPAIRED", DiagCode::XrefLinearizedNoForwardScan => "XREF_LINEARIZED_NO_FORWARD_SCAN", DiagCode::XrefRemoteNoForwardScan => "XREF_REMOTE_NO_FORWARD_SCAN", + DiagCode::XrefInvalidStreamFormat => "XREF_INVALID_STREAM_FORMAT", + DiagCode::XrefInvalidStreamEntry => "XREF_INVALID_STREAM_ENTRY", + DiagCode::StructInvalidPrevOffset => "STRUCT_INVALID_PREV_OFFSET", DiagCode::StreamDecodeError => "STREAM_DECODE_ERROR", DiagCode::StreamBomb => "STREAM_BOMB", DiagCode::StreamUnknownFilter => "STREAM_UNKNOWN_FILTER", @@ -836,6 +868,7 @@ impl DiagCode { | DiagCode::StructNonGotoOutline | DiagCode::StructInvalidPdfDocEncoding | DiagCode::StructHybridConflict + | DiagCode::StructInvalidPrevOffset | DiagCode::XrefInvalidHeader | DiagCode::XrefInvalidEntry | DiagCode::XrefInvalidSubsectionHeader @@ -844,6 +877,8 @@ impl DiagCode { | DiagCode::XrefTruncated | DiagCode::XrefLinearizedNoForwardScan | DiagCode::XrefRemoteNoForwardScan + | DiagCode::XrefInvalidStreamFormat + | DiagCode::XrefInvalidStreamEntry | DiagCode::StreamDecodeError | DiagCode::StreamUnknownFilter | DiagCode::StreamInvalidParams @@ -1145,6 +1180,30 @@ pub const DIAGNOSTIC_CATALOG: &[DiagInfo] = &[ phase: "1.3", suggested_action: "Forward scan is disabled for HTTP sources (would fetch entire file)", }, + DiagInfo { + code: DiagCode::XrefInvalidStreamFormat, + category: "XREF", + severity: Severity::Warning, + recoverable: true, + phase: "1.3", + suggested_action: "The xref stream has a malformed header or invalid /W array; the stream is skipped", + }, + DiagInfo { + code: DiagCode::XrefInvalidStreamEntry, + category: "XREF", + severity: Severity::Warning, + recoverable: true, + phase: "1.3", + suggested_action: "An xref stream entry cannot be parsed due to invalid data", + }, + DiagInfo { + code: DiagCode::StructInvalidPrevOffset, + category: "STRUCT", + severity: Severity::Warning, + recoverable: true, + phase: "1.3", + suggested_action: "A trailer's /Prev offset points to invalid data; the xref chain is truncated at this point", + }, // === STREAM_* codes === DiagInfo { code: DiagCode::StreamDecodeError, diff --git a/crates/pdftract-core/src/parser/catalog.rs b/crates/pdftract-core/src/parser/catalog.rs index 49a7dda..552c529 100644 --- a/crates/pdftract-core/src/parser/catalog.rs +++ b/crates/pdftract-core/src/parser/catalog.rs @@ -783,7 +783,9 @@ mod tests { assert!(catalog.names_ref.is_none()); assert!(catalog.metadata_ref.is_none()); assert!(catalog.page_labels.is_none()); - assert!(catalog.oc_properties.is_none()); + // oc_properties is always Some; check present flag for absence + assert!(catalog.oc_properties.is_some()); + assert!(!catalog.oc_properties.as_ref().unwrap().present); assert!(catalog.open_action.is_none()); assert!(catalog.aa.is_none()); assert!(catalog.version.is_none()); diff --git a/crates/pdftract-core/src/parser/lexer/mod.rs b/crates/pdftract-core/src/parser/lexer/mod.rs index 4f927cc..cdd7e46 100644 --- a/crates/pdftract-core/src/parser/lexer/mod.rs +++ b/crates/pdftract-core/src/parser/lexer/mod.rs @@ -3,7 +3,7 @@ //! This module provides the lexer that converts raw PDF byte sequences into tokens. //! PDF is byte-oriented; position tracking is byte-level, not character-level. -use std::borrow::Cow; +use crate::diagnostics::{Diagnostic as Diag, DiagCode}; /// Token produced by the PDF lexer. /// @@ -49,82 +49,6 @@ pub enum Token { Eof, } -/// Diagnostic code for lexer errors. -/// -/// All lexer diagnostic codes use the `STRUCT_` prefix to indicate -/// they relate to structural/lexical issues in the PDF document. -#[derive(Clone, Debug, PartialEq)] -pub enum DiagCode { - /// Invalid name character or malformed name - StructInvalidName, - /// Invalid hexadecimal character in hex string or name escape - StructInvalidHex, - /// Invalid octal escape sequence in literal string - StructInvalidOctal, - /// Invalid stream header (stream keyword not followed by proper newline) - StructInvalidStreamHeader, - /// Unexpected byte (e.g., stray `>` not part of `>>`) - StructUnexpectedByte, - /// Unexpected end of file while parsing a token - StructUnexpectedEof, - /// Unterminated literal string (missing closing paren) - StructUnterminatedString, - - // Object parser codes - /// Dictionary nesting depth exceeds limit - DepthExceeded, - /// Missing required key in dictionary - MissingKey, - - // Object stream codes - /// Invalid object stream format - InvalidObjstm, - /// Circular reference in /Extends chain - CircularRef, - /// Stream decompression failed - DecompressionFailed, - /// Decompression bomb limit exceeded - StreamBomb, -} - -/// Diagnostic message emitted during lexing. -/// -/// Diagnostics are accumulated during lexing and can be retrieved -/// via `Lexer::take_diagnostics()`. They do not stop lexing; the -/// lexer attempts recovery and continues. -/// -/// Diagnostic messages use `Cow<'static, str>` so static error messages -/// don't allocate. Dynamic messages (with formatting) allocate only when needed. -#[derive(Clone, Debug, PartialEq)] -pub struct Diagnostic { - /// The diagnostic code identifying the type of error - pub code: DiagCode, - /// Byte offset in the input where the error occurred - pub byte_offset: u64, - /// Human-readable error message - pub msg: Cow<'static, str>, -} - -impl Diagnostic { - /// Create a diagnostic with a static message (no allocation). - fn with_static(code: DiagCode, byte_offset: u64, msg: &'static str) -> Self { - Diagnostic { - code, - byte_offset, - msg: Cow::Borrowed(msg), - } - } - - /// Create a diagnostic with a dynamic message (allocates). - fn with_dynamic(code: DiagCode, byte_offset: u64, msg: String) -> Self { - Diagnostic { - code, - byte_offset, - msg: Cow::Owned(msg), - } - } -} - /// PDF lexical analyzer. /// /// The lexer processes PDF byte sequences and produces tokens. @@ -149,7 +73,7 @@ pub struct Lexer<'a> { /// Current byte position within the original input pos: usize, /// Accumulated diagnostics - diagnostics: Vec, + diagnostics: Vec, /// Cached token for peek operations (token, position after token) peek_cache: Option<(Token, usize)>, /// Whether Eof has been returned @@ -322,7 +246,7 @@ impl<'a> Lexer<'a> { /// let diags = lexer.take_diagnostics(); /// assert!(diags.is_empty()); /// ``` - pub fn take_diagnostics(&mut self) -> Vec { + pub fn take_diagnostics(&mut self) -> Vec { std::mem::take(&mut self.diagnostics) } @@ -387,6 +311,17 @@ impl<'a> Lexer<'a> { b'n' => self.lex_n_keyword(), b'x' => self.lex_x_keyword(), b'%' => self.lex_percent(), + b'{' | b'}' => { + // PDF 1.2 reserved these for future use; treat as unexpected bytes + let pos = self.pos; + self.diagnostics.push(Diag::with_dynamic( + DiagCode::StructUnexpectedByte, + pos as u64, + format!("Unexpected byte: 0x{:02x}", next), + )); + self.advance(1); + Some(Token::Null) + } _ => self.lex_keyword(), } } @@ -601,7 +536,7 @@ impl<'a> Lexer<'a> { if !has_digit { // Not a valid number, emit diagnostic and return null - self.diagnostics.push(Diagnostic::with_static( + self.diagnostics.push(Diag::with_static( DiagCode::StructUnexpectedEof, start as u64, "Invalid numeric literal", @@ -710,7 +645,7 @@ impl<'a> Lexer<'a> { } if value > 255 { - self.diagnostics.push(Diagnostic::with_dynamic( + self.diagnostics.push(Diag::with_dynamic( DiagCode::StructInvalidOctal, self.pos as u64, format!("Octal escape \\{:03o} exceeds 255, truncated", value), @@ -738,7 +673,7 @@ impl<'a> Lexer<'a> { } // Unterminated string - self.diagnostics.push(Diagnostic::with_static( + self.diagnostics.push(Diag::with_static( DiagCode::StructUnterminatedString, start as u64, "Unterminated literal string", @@ -763,7 +698,7 @@ impl<'a> Lexer<'a> { // Special check for NUL byte: it's whitespace per spec, but invalid in names if b == 0x00 { - self.diagnostics.push(Diagnostic::with_static( + self.diagnostics.push(Diag::with_static( DiagCode::StructInvalidName, self.pos as u64, "NUL byte in name is invalid per PDF spec", @@ -796,7 +731,7 @@ impl<'a> Lexer<'a> { let decoded = (h << 4) | l; // Check if decoded byte is NUL if decoded == 0 { - self.diagnostics.push(Diagnostic::with_static( + self.diagnostics.push(Diag::with_static( DiagCode::StructInvalidName, self.pos as u64, "NUL byte in name is invalid per PDF spec", @@ -810,7 +745,7 @@ impl<'a> Lexer<'a> { } _ => { // Invalid hex: emit diagnostic and treat # as literal - self.diagnostics.push(Diagnostic::with_static( + self.diagnostics.push(Diag::with_static( DiagCode::StructInvalidName, self.pos as u64, "Invalid hex escape sequence in name", @@ -836,7 +771,7 @@ impl<'a> Lexer<'a> { // Emit diagnostic if we hit the length limit if truncated_due_to_length || raw_consumed > MAX_RAW_BYTES { - self.diagnostics.push(Diagnostic::with_static( + self.diagnostics.push(Diag::with_static( DiagCode::StructInvalidName, start as u64, "Name exceeds 127-byte length limit", @@ -845,7 +780,7 @@ impl<'a> Lexer<'a> { // Check if there's more input that we didn't consume if let Some(&b) = self.bytes.first() { if !Self::is_pdf_whitespace(b) && !Self::is_pdf_delimiter(b) { - self.diagnostics.push(Diagnostic::with_static( + self.diagnostics.push(Diag::with_static( DiagCode::StructInvalidName, start as u64, "Name exceeds 127-byte length limit", @@ -910,7 +845,7 @@ impl<'a> Lexer<'a> { out.push(hi << 4); current_nibble = None; } - self.diagnostics.push(Diagnostic::with_dynamic( + self.diagnostics.push(Diag::with_dynamic( DiagCode::StructInvalidHex, self.pos as u64, format!("Invalid hex character '{}' (0x{:02x})", b as char, b), @@ -920,7 +855,7 @@ impl<'a> Lexer<'a> { } // EOF before > - self.diagnostics.push(Diagnostic::with_static( + self.diagnostics.push(Diag::with_static( DiagCode::StructUnterminatedString, start as u64, "Unterminated hex string", @@ -950,7 +885,7 @@ impl<'a> Lexer<'a> { Some(Token::DictEnd) } else { // Stray > - emit diagnostic - self.diagnostics.push(Diagnostic::with_static( + self.diagnostics.push(Diag::with_static( DiagCode::StructUnexpectedByte, self.pos as u64, "Unexpected > character", @@ -980,7 +915,7 @@ impl<'a> Lexer<'a> { self.advance(1); // consume the \n } else { // Lone \r - invalid - self.diagnostics.push(Diagnostic::with_static( + self.diagnostics.push(Diag::with_static( DiagCode::StructInvalidStreamHeader, start_pos as u64, "stream keyword must be followed by \\n or \\r\\n, not lone \\r", @@ -988,7 +923,7 @@ impl<'a> Lexer<'a> { } } else { // No line ending at all - invalid - self.diagnostics.push(Diagnostic::with_static( + self.diagnostics.push(Diag::with_static( DiagCode::StructInvalidStreamHeader, start_pos as u64, "stream keyword must be followed by \\n or \\r\\n", @@ -1071,7 +1006,7 @@ impl<'a> Lexer<'a> { fn lex_unknown(&mut self) -> Option { // Unknown character - skip it and emit diagnostic let pos = self.pos; - self.diagnostics.push(Diagnostic::with_dynamic( + self.diagnostics.push(Diag::with_dynamic( DiagCode::StructUnexpectedEof, pos as u64, format!("Unexpected byte: 0x{:02x}", self.bytes[0]), @@ -1201,7 +1136,7 @@ mod tests { let diags = lexer.take_diagnostics(); assert_eq!(diags.len(), 1); assert_eq!(diags[0].code, DiagCode::StructInvalidStreamHeader); - assert!(diags[0].msg.contains("lone \\r")); + assert!(diags[0].message.as_ref().contains("lone \\r")); } #[test] @@ -1358,7 +1293,7 @@ mod tests { let diags = lexer.take_diagnostics(); assert_eq!(diags.len(), 1); assert_eq!(diags[0].code, DiagCode::StructInvalidOctal); - assert!(diags[0].msg.contains("401")); + assert!(diags[0].message.as_ref().contains("401")); } #[test] @@ -1477,8 +1412,8 @@ mod tests { assert_eq!(diags.len(), 1); assert_eq!(diags[0].code, DiagCode::StructInvalidHex); // Debug: print actual message - eprintln!("Actual diagnostic message: {}", diags[0].msg); - assert!(diags[0].msg.contains("Z")); + eprintln!("Actual diagnostic message: {}", diags[0].message.as_ref()); + assert!(diags[0].message.as_ref().contains("Z")); } #[test] @@ -1489,7 +1424,7 @@ mod tests { let diags = lexer.take_diagnostics(); assert_eq!(diags.len(), 1); assert_eq!(diags[0].code, DiagCode::StructUnterminatedString); - assert!(diags[0].msg.contains("hex string")); + assert!(diags[0].message.as_ref().contains("hex string")); } #[test] @@ -1772,7 +1707,7 @@ mod tests { let diags = lexer.take_diagnostics(); assert_eq!(diags.len(), 1); assert_eq!(diags[0].code, DiagCode::StructInvalidName); - assert!(diags[0].msg.contains("NUL")); + assert!(diags[0].message.as_ref().contains("NUL")); } #[test] @@ -1801,7 +1736,7 @@ mod tests { let diags = lexer.take_diagnostics(); assert_eq!(diags.len(), 1); assert_eq!(diags[0].code, DiagCode::StructInvalidName); - assert!(diags[0].msg.contains("127")); + assert!(diags[0].message.as_ref().contains("127")); } #[test] @@ -1873,7 +1808,7 @@ mod tests { let diags = lexer.take_diagnostics(); assert_eq!(diags.len(), 1); assert_eq!(diags[0].code, DiagCode::StructInvalidName); - assert!(diags[0].msg.contains("hex")); + assert!(diags[0].message.as_ref().contains("hex")); } #[test] diff --git a/crates/pdftract-core/src/parser/mod.rs b/crates/pdftract-core/src/parser/mod.rs index 453c812..6005f30 100644 --- a/crates/pdftract-core/src/parser/mod.rs +++ b/crates/pdftract-core/src/parser/mod.rs @@ -20,9 +20,10 @@ pub use crate::diagnostics::{Diagnostic, Severity, DiagCode, ObjRef}; pub use object::{PdfObject}; pub use objstm::{ObjectStmParser, ObjStmCacheEntry, ObjStmResult, ObjStmError}; pub use xref::{ - XrefResolver, XrefEntry, ResolveError, ResolveResult, XrefSection, XrefDiagnostic, XrefDiagCode, + XrefResolver, XrefEntry, ResolveError, ResolveResult, XrefSection, parse_traditional_xref, parse_xref_stream, merge_hybrid, is_hybrid_trailer, LinearizationInfo, detect_linearization, load_xref_linearized, merge_linearized_xrefs, + load_xref_with_prev_chain, }; pub use catalog::{Catalog, MarkInfo, PageLabel, PageLabelsTree, PageLabelStyle, parse_catalog}; pub use ocg::{OcProperties, OcGroup, Ocmd, OcmdPolicy, BaseState, parse_oc_properties}; diff --git a/crates/pdftract-core/src/parser/object/parser.rs b/crates/pdftract-core/src/parser/object/parser.rs index 13d3c6a..e6db40b 100644 --- a/crates/pdftract-core/src/parser/object/parser.rs +++ b/crates/pdftract-core/src/parser/object/parser.rs @@ -5,7 +5,7 @@ use super::types::{intern, ObjRef, PdfDict, PdfObject, PdfStream, PdfIndirect}; use crate::parser::lexer::{Lexer, Token}; -use crate::parser::diagnostic::{Diagnostic, DiagCode}; +use crate::diagnostics::{Diagnostic as Diag, DiagCode}; /// Maximum nesting depth for dictionaries and arrays. /// @@ -21,7 +21,7 @@ pub struct ObjectParser<'a> { /// The lexer that provides tokens lexer: Lexer<'a>, /// Accumulated diagnostics - diagnostics: Vec, + diagnostics: Vec, /// Current nesting depth (for depth limit enforcement) depth: u16, } @@ -50,7 +50,7 @@ impl<'a> ObjectParser<'a> { } /// Take all accumulated diagnostics. - pub fn take_diagnostics(&mut self) -> Vec { + pub fn take_diagnostics(&mut self) -> Vec { std::mem::take(&mut self.diagnostics) } @@ -93,8 +93,8 @@ impl<'a> ObjectParser<'a> { Token::Eof => None, _ => { // Unexpected token - emit diagnostic and return null - self.diagnostics.push(Diagnostic::warning( - "1.2", + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructUnexpectedByte, format!("Unexpected token: {:?}", token), )); Some(PdfObject::Null) @@ -119,8 +119,8 @@ impl<'a> ObjectParser<'a> { // Validate object and generation numbers are non-negative if first_int < 0 || gen < 0 { - self.diagnostics.push(Diagnostic::warning( - "1.2", + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructInvalidIndirectHeader, format!("Invalid indirect reference: {} {} R", first_int, gen), )); return Some(PdfObject::Null); @@ -141,9 +141,9 @@ impl<'a> ObjectParser<'a> { fn parse_array(&mut self) -> Option { // Check depth limit if self.depth >= MAX_DEPTH { - self.diagnostics.push(Diagnostic::error( - "1.2", - format!("STRUCT_DEPTH_EXCEEDED: Array nesting depth exceeds limit of {}", MAX_DEPTH), + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructDepthExceeded, + format!("Array nesting depth exceeds limit of {}", MAX_DEPTH), )); // Skip to matching closing bracket self.skip_to_array_end(); @@ -199,9 +199,8 @@ impl<'a> ObjectParser<'a> { fn parse_dict(&mut self) -> Option { // Check depth limit if self.depth >= MAX_DEPTH { - self.diagnostics.push(Diagnostic::error_with_code( - DiagCode::DepthExceeded, - "1.2", + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructDepthExceeded, format!("Dictionary nesting depth exceeds limit of {}", MAX_DEPTH), )); self.skip_to_dict_end(); @@ -232,9 +231,9 @@ impl<'a> ObjectParser<'a> { match self.lexer.peek_token() { Some(Token::DictEnd) | Some(Token::Eof) => { // Missing value - insert PdfNull - self.diagnostics.push(Diagnostic::warning( - "1.2", - format!("STRUCT_INVALID_DICT_VALUE: Dictionary key '{}' has no value, inserting null", key), + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructInvalidDictValue, + format!("Dictionary key '{}' has no value, inserting null", key), )); dict.insert(key, PdfObject::Null); break; // End of dict @@ -253,9 +252,9 @@ impl<'a> ObjectParser<'a> { } _ => { // Invalid key - not a name - self.diagnostics.push(Diagnostic::warning( - "1.2", - format!("STRUCT_INVALID_DICT_KEY: Dictionary key is not a name object, skipping"), + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructInvalidDictKey, + "Dictionary key is not a name object, skipping".to_string(), )); // Skip the invalid token and the next token (would-be value) let _ = self.lexer.next_token(); @@ -314,9 +313,9 @@ impl<'a> ObjectParser<'a> { let len_usize = len as usize; let actual_skipped = self.lexer.skip_bytes(len); if actual_skipped < len_usize { - self.diagnostics.push(Diagnostic::error( - "1.2", - format!("STRUCT_TRUNCATED_STREAM: Stream truncated at EOF: expected {} bytes, got {}", len, actual_skipped), + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructUnexpectedEof, + format!("Stream truncated at EOF: expected {} bytes, got {}", len, actual_skipped), )); } } else { @@ -330,24 +329,24 @@ impl<'a> ObjectParser<'a> { // Normal case - stream properly terminated } Some(Token::Eof) => { - self.diagnostics.push(Diagnostic::error( - "1.2", - "STRUCT_TRUNCATED_STREAM: Stream truncated at EOF, missing endstream keyword", + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructUnexpectedEof, + "Stream truncated at EOF, missing endstream keyword".to_string(), )); } Some(other) => { - self.diagnostics.push(Diagnostic::warning( - "1.2", - format!("STRUCT_MISSING_KEY: Expected endstream keyword after stream body, found {:?}", other), + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructUnexpectedByte, + format!("Expected endstream keyword after stream body, found {:?}", other), )); // Try to recover by scanning forward for EndStream self.scan_to_endstream(); } None => { // Shouldn't happen, but handle gracefully - self.diagnostics.push(Diagnostic::error( - "1.2", - "Unexpected None after skipping stream body", + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructUnexpectedEof, + "Unexpected None after skipping stream body".to_string(), )); } } @@ -420,15 +419,15 @@ impl<'a> ObjectParser<'a> { Token::Integer(n) => { // Check for overflow if n > u32::MAX as i64 { - self.diagnostics.push(Diagnostic::warning( - "1.2", - format!("STRUCT_INTEGER_OVERFLOW: Object number {} exceeds u32::MAX, clamping", n), + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructIntegerOverflow, + format!("Object number {} exceeds u32::MAX, clamping", n), )); u32::MAX } else if n < 0 { - self.diagnostics.push(Diagnostic::warning( - "1.2", - format!("STRUCT_INVALID_INDIRECT_HEADER: Negative object number {}", n), + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructInvalidIndirectHeader, + format!("Negative object number {}", n), )); // Recover by scanning forward to next obj keyword self.scan_to_next_obj(); @@ -439,9 +438,9 @@ impl<'a> ObjectParser<'a> { } _ => { // Not an integer - emit diagnostic and recover - self.diagnostics.push(Diagnostic::warning( - "1.2", - format!("STRUCT_INVALID_INDIRECT_HEADER: Expected object number, found {:?}", token1), + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructInvalidIndirectHeader, + format!("Expected object number, found {:?}", token1), )); self.scan_to_next_obj(); return None; @@ -454,15 +453,15 @@ impl<'a> ObjectParser<'a> { Token::Integer(g) => { // Check for overflow if g > u16::MAX as i64 { - self.diagnostics.push(Diagnostic::warning( - "1.2", - format!("STRUCT_INTEGER_OVERFLOW: Generation number {} exceeds u16::MAX, clamping", g), + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructIntegerOverflow, + format!("Generation number {} exceeds u16::MAX, clamping", g), )); u16::MAX } else if g < 0 { - self.diagnostics.push(Diagnostic::warning( - "1.2", - format!("STRUCT_INVALID_INDIRECT_HEADER: Negative generation number {}", g), + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructInvalidIndirectHeader, + format!("Negative generation number {}", g), )); self.scan_to_next_obj(); return None; @@ -472,9 +471,9 @@ impl<'a> ObjectParser<'a> { } _ => { // Not an integer - emit diagnostic and recover - self.diagnostics.push(Diagnostic::warning( - "1.2", - format!("STRUCT_INVALID_INDIRECT_HEADER: Expected generation number, found {:?}", token2), + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructInvalidIndirectHeader, + format!("Expected generation number, found {:?}", token2), )); self.scan_to_next_obj(); return None; @@ -484,9 +483,9 @@ impl<'a> ObjectParser<'a> { // Read the third token (must be Obj) let token3 = self.lexer.next_token()?; if !matches!(token3, Token::Obj) { - self.diagnostics.push(Diagnostic::warning( - "1.2", - format!("STRUCT_INVALID_INDIRECT_HEADER: Expected 'obj' keyword, found {:?}", token3), + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructInvalidIndirectHeader, + format!("Expected 'obj' keyword, found {:?}", token3), )); self.scan_to_next_obj(); return None; @@ -507,9 +506,9 @@ impl<'a> ObjectParser<'a> { Some(Token::Obj) => { // Found the start of the next indirect object before endobj // This means the current object is malformed - self.diagnostics.push(Diagnostic::warning( - "1.2", - "STRUCT_MISSING_KEY: Missing 'endobj' before next indirect object".to_string(), + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructMissingKey, + "Missing 'endobj' before next indirect object".to_string(), )); // We're positioned at 'obj' but need to be at the object number // Scan forward to find the next integer (object number) @@ -518,22 +517,22 @@ impl<'a> ObjectParser<'a> { Some(Token::Eof) => { // Consume the Eof let _ = self.lexer.next_token(); - self.diagnostics.push(Diagnostic::warning( - "1.2", - "STRUCT_MISSING_KEY: Missing 'endobj' at EOF".to_string(), + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructMissingKey, + "Missing 'endobj' at EOF".to_string(), )); } None => { - self.diagnostics.push(Diagnostic::warning( - "1.2", - "STRUCT_MISSING_KEY: Missing 'endobj' at EOF".to_string(), + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructMissingKey, + "Missing 'endobj' at EOF".to_string(), )); } Some(_) => { // Some other token - scan for endobj or next obj - self.diagnostics.push(Diagnostic::warning( - "1.2", - "STRUCT_MISSING_KEY: Expected 'endobj', scanning forward".to_string(), + self.diagnostics.push(Diag::with_dynamic_no_offset( + DiagCode::StructMissingKey, + "Expected 'endobj', scanning forward".to_string(), )); self.scan_to_endobj_or_obj(); } @@ -826,7 +825,7 @@ mod tests { assert_eq!(dict.len(), 1); assert_eq!(dict.get("Type"), Some(&PdfObject::Null)); let diags = parser.take_diagnostics(); - assert!(diags.iter().any(|d| d.message.contains("STRUCT_INVALID_DICT_VALUE"))); + assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidDictValue)); } else { panic!("Expected dict, got {:?}", obj); } @@ -839,7 +838,7 @@ mod tests { if let Some(PdfObject::Dict(dict)) = obj { assert_eq!(dict.len(), 0); let diags = parser.take_diagnostics(); - assert!(diags.iter().any(|d| d.message.contains("STRUCT_INVALID_DICT_KEY"))); + assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidDictKey)); } else { panic!("Expected dict, got {:?}", obj); } @@ -926,7 +925,7 @@ mod tests { // Should have emitted STRUCT_DEPTH_EXCEEDED diagnostic let diags = parser.take_diagnostics(); - assert!(diags.iter().any(|d| d.code == DiagCode::DepthExceeded)); + assert!(diags.iter().any(|d| d.code == DiagCode::StructDepthExceeded)); } #[test] @@ -951,7 +950,7 @@ mod tests { // Should have emitted STRUCT_INVALID_DICT_VALUE diagnostic for missing value let diags = parser.take_diagnostics(); - assert!(diags.iter().any(|d| d.code == DiagCode::InvalidDictValue)); + assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidDictValue)); } #[test] @@ -962,7 +961,7 @@ mod tests { // Should return PdfNull with diagnostic assert_eq!(obj, Some(PdfObject::Null)); let diags = parser.take_diagnostics(); - assert!(diags.iter().any(|d| d.code == DiagCode::StructUnexpectedEof)); + assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidIndirectHeader)); } #[test] @@ -1085,7 +1084,7 @@ mod tests { // Should have emitted STRUCT_MISSING_KEY diagnostic let diags = parser.take_diagnostics(); - assert!(diags.iter().any(|d| d.message.contains("STRUCT_MISSING_KEY"))); + assert!(diags.iter().any(|d| d.code == DiagCode::StructMissingKey)); // Next parse should handle the second object let indirect2 = parser.parse_indirect_object(); @@ -1109,7 +1108,7 @@ mod tests { // Should have emitted STRUCT_INTEGER_OVERFLOW diagnostic let diags = parser.take_diagnostics(); - assert!(diags.iter().any(|d| d.message.contains("STRUCT_INTEGER_OVERFLOW"))); + assert!(diags.iter().any(|d| d.code == DiagCode::StructIntegerOverflow)); } #[test] @@ -1124,7 +1123,7 @@ mod tests { // Should have emitted STRUCT_INTEGER_OVERFLOW diagnostic let diags = parser.take_diagnostics(); - assert!(diags.iter().any(|d| d.message.contains("STRUCT_INTEGER_OVERFLOW"))); + assert!(diags.iter().any(|d| d.code == DiagCode::StructIntegerOverflow)); } #[test] @@ -1138,7 +1137,7 @@ mod tests { // Should have emitted STRUCT_INVALID_INDIRECT_HEADER diagnostic let diags = parser.take_diagnostics(); - assert!(diags.iter().any(|d| d.message.contains("STRUCT_INVALID_INDIRECT_HEADER"))); + assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidIndirectHeader)); } #[test] @@ -1151,7 +1150,7 @@ mod tests { // Should have emitted STRUCT_INVALID_INDIRECT_HEADER diagnostic let diags = parser.take_diagnostics(); - assert!(diags.iter().any(|d| d.message.contains("STRUCT_INVALID_INDIRECT_HEADER"))); + assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidIndirectHeader)); } #[test] diff --git a/crates/pdftract-core/src/parser/object/types.rs b/crates/pdftract-core/src/parser/object/types.rs index 28bdae9..e9d3611 100644 --- a/crates/pdftract-core/src/parser/object/types.rs +++ b/crates/pdftract-core/src/parser/object/types.rs @@ -134,7 +134,7 @@ impl PdfStream { /// Returns None if no filter is present (raw stream). /// Filter names are returned without the leading slash (e.g., "FlateDecode", not "/FlateDecode"). pub fn filter(&self) -> Option> { - let filter = self.dict.get("Filter")?; + let filter = self.dict.get("/Filter")?; Some(match filter { PdfObject::Name(name) => { @@ -168,7 +168,7 @@ impl PdfStream { /// /// Returns None if no parameters are present. pub fn decode_params(&self) -> Option> { - let params = self.dict.get("DecodeParms")?; + let params = self.dict.get("/DecodeParms")?; Some(match params { PdfObject::Dict(_) => vec![params.clone()], @@ -181,7 +181,7 @@ impl PdfStream { /// /// Returns the direct integer value, or None if /Length is indirect/missing. pub fn length(&self) -> Option { - self.dict.get("Length")?.as_int().map(|i| i as u64) + self.dict.get("/Length")?.as_int().map(|i| i as u64) } } diff --git a/crates/pdftract-core/src/parser/outline.rs b/crates/pdftract-core/src/parser/outline.rs index 2951766..217cce3 100644 --- a/crates/pdftract-core/src/parser/outline.rs +++ b/crates/pdftract-core/src/parser/outline.rs @@ -214,27 +214,27 @@ fn decode_utf16be_raw(bytes: &[u8]) -> std::result::Result { /// /// Returns true if: /// - Length is even -/// - For any byte > 0x7F, the adjacent bytes are 0x00 +/// - Most high bytes (first byte of each pair) are 0x00 +/// +/// This detects UTF-16BE encoded ASCII text, where each ASCII character +/// is stored as [0x00, char_code]. fn looks_like_utf16be(bytes: &[u8]) -> bool { if bytes.len() < 2 || bytes.len() % 2 != 0 { return false; } - // Check if high bytes are mostly zero (indicative of UTF-16BE ASCII text) - let mut high_bytes_count = 0; - let mut high_bytes_zero = 0; + // Count how many high bytes are zero + let mut zero_high_bytes = 0; + let total_pairs = bytes.len() / 2; for chunk in bytes.chunks_exact(2) { - if chunk[0] > 0x7F || chunk[1] > 0x7F { - high_bytes_count += 1; - if chunk[0] == 0x00 { - high_bytes_zero += 1; - } + if chunk[0] == 0x00 { + zero_high_bytes += 1; } } - // If we have non-ASCII bytes and most high bytes are zero, likely UTF-16BE - high_bytes_count > 0 && high_bytes_zero >= high_bytes_count / 2 + // If most high bytes are zero (>= 75%), likely UTF-16BE + zero_high_bytes >= total_pairs * 3 / 4 } /// Decode PDFDocEncoded string to UTF-8. @@ -567,6 +567,13 @@ fn resolve_destination( } } (None, None) + } else if dest_obj.as_name().is_some() || dest_obj.as_string().is_some() { + // Named destination (name or string) - emit diagnostic and return None + diagnostics.push(Diagnostic::with_static_no_offset( + DiagCode::StructUnresolvedDestination, + "STRUCT_UNRESOLVED_DESTINATION: Named destination not supported", + )); + (None, None) } else { (None, None) } diff --git a/crates/pdftract-core/src/parser/stream.rs b/crates/pdftract-core/src/parser/stream.rs index 9c17bce..691893a 100644 --- a/crates/pdftract-core/src/parser/stream.rs +++ b/crates/pdftract-core/src/parser/stream.rs @@ -17,7 +17,7 @@ use flate2::read::ZlibDecoder; use lzw::{MsbReader, Decoder, DecoderEarlyChange}; use secrecy::SecretString; -use crate::parser::diagnostic::{Diagnostic, DiagCode}; +use crate::diagnostics::{Diagnostic, DiagCode}; use crate::parser::object::{PdfObject, PdfStream}; /// Maximum number of filters allowed in a single stream's pipeline. @@ -1863,8 +1863,10 @@ fn decode_stream_impl( let truncated = raw_bytes[..remaining.min(raw_bytes.len())].to_vec(); return DecodeResult::with_diagnostic( truncated, - Diagnostic::error("1.5", - format!("STREAM_BOMB: Decompression bomb limit exceeded: {} bytes", opts.max_decompress_bytes)) + Diagnostic::with_dynamic_no_offset( + DiagCode::StreamBomb, + format!("Decompression bomb limit exceeded: {} bytes", opts.max_decompress_bytes) + ) ); } *doc_decompress_counter += len; @@ -1881,13 +1883,17 @@ fn decode_stream_impl( // Step 3: Get decode params (aligned with filters, may be shorter) let decode_params = stream.decode_params().unwrap_or_default(); - // Validate /Filter and /DecodeParms array lengths match - if !decode_params.is_empty() && decode_params.len() != filters.len() { + // Validate /Filter and /DecodeParms array lengths + // Per PDF spec, /DecodeParms can be shorter than /Filter (missing params are treated as null). + // But /DecodeParms cannot be longer than /Filter. + if decode_params.len() > filters.len() { return DecodeResult::with_diagnostic( raw_bytes, - Diagnostic::error("1.5", - format!("STRUCT_INVALID_FILTER_PARAMS: /Filter array length ({}) != /DecodeParms array length ({})", - filters.len(), decode_params.len())) + Diagnostic::with_dynamic_no_offset( + DiagCode::StreamInvalidParams, + format!("/DecodeParms array length ({}) > /Filter array length ({})", + decode_params.len(), filters.len()) + ) ); } @@ -1918,9 +1924,8 @@ fn decode_stream_impl( Err(FilterError::EncryptionUnsupported) => { // Crypt filter with custom /Name - emit ENCRYPTION_UNSUPPORTED // and return empty bytes (stream is undecryptable) - diagnostics.push(Diagnostic::error_with_code( + diagnostics.push(Diagnostic::with_static_no_offset( DiagCode::EncryptionUnsupported, - "1.5", "Crypt filter with custom /Name parameter is not supported", )); return DecodeResult { @@ -1928,7 +1933,7 @@ fn decode_stream_impl( diagnostics, }; } - Err(_) => { + Err(e) => { // Hard error - return raw bytes for this filter break; } @@ -1936,16 +1941,20 @@ fn decode_stream_impl( } None => { // Unknown filter - emit diagnostic and return current bytes (partial decode) per INV-8 - diagnostics.push(Diagnostic::warning("1.5", - format!("STRUCT_UNKNOWN_FILTER: Unknown filter: {}, returning partial decode", filter_name))); + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StreamUnknownFilter, + format!("Unknown filter: {}, returning partial decode", filter_name) + )); break; } } } if bomb_limit_hit { - diagnostics.push(Diagnostic::error("1.5", - format!("STREAM_BOMB: Decompression bomb limit exceeded: {} bytes", opts.max_decompress_bytes))); + diagnostics.push(Diagnostic::with_dynamic_no_offset( + DiagCode::StreamBomb, + format!("Decompression bomb limit exceeded: {} bytes", opts.max_decompress_bytes) + )); } DecodeResult { diff --git a/crates/pdftract-core/src/parser/xref.rs b/crates/pdftract-core/src/parser/xref.rs index 6b7dbfb..3dad8de 100644 --- a/crates/pdftract-core/src/parser/xref.rs +++ b/crates/pdftract-core/src/parser/xref.rs @@ -7,9 +7,9 @@ use std::collections::{HashMap, HashSet}; use std::sync::{Arc, RwLock}; -use std::borrow::Cow; -use crate::parser::object::{ObjRef, PdfObject, PdfDict, PdfStream}; +use crate::parser::object::{ObjRef, PdfObject, PdfDict, PdfStream, ObjectParser}; use crate::parser::stream::{PdfSource, MemorySource}; +use crate::diagnostics::{Diagnostic as Diag, DiagCode}; // Use memchr for SIMD-accelerated byte searching in forward_scan_xref use memchr::{memchr, memchr_iter}; @@ -51,74 +51,6 @@ pub enum XrefEntry { Compressed { obj_stm_nr: u32, index: u32 }, } -/// Diagnostic codes for xref parsing. -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum XrefDiagCode { - /// Invalid xref keyword or header - InvalidXrefHeader, - /// Malformed xref entry (not 20 bytes, bad format) - InvalidXrefEntry, - /// Invalid subsection header (not "start count") - InvalidSubsectionHeader, - /// Object 0 is not free (violates PDF spec) - ObjectZeroNotFree, - /// Trailer dictionary not found or malformed - TrailerNotFound, - /// Truncated xref table (unexpected EOF) - XrefTruncated, - /// Forward scan recovered xref entries (EC-07 recovery) - XrefRepaired, - /// Forward scan disabled for remote sources (would fetch entire file) - RemoteNoForwardScan, - /// Forward scan disabled for linearized files (has partial leading xref) - LinearizedNoForwardScan, - /// Invalid xref stream entry (unknown type, malformed data) - InvalidXrefStreamEntry, - /// Invalid xref stream format (missing required key, bad /W array) - InvalidXrefStreamFormat, - /// Xref stream decompression failed - XrefStreamDecompressionFailed, - /// Hybrid xref conflict: traditional table and stream disagree on object state - StructHybridConflict, - /// Circular /Prev reference detected (incremental update cycle) - StructCircularRef, - /// /Prev chain depth exceeded (adversarial input or corrupted file) - StructDepthExceeded, - /// /Prev offset points beyond file size - StructInvalidPrevOffset, -} - -/// A diagnostic message emitted during xref parsing. -#[derive(Debug, Clone, PartialEq)] -pub struct XrefDiagnostic { - /// The diagnostic code - pub code: XrefDiagCode, - /// Byte offset in the input where the error occurred - pub byte_offset: u64, - /// Human-readable error message - pub msg: Cow<'static, str>, -} - -impl XrefDiagnostic { - /// Create a diagnostic with a static message. - fn with_static(code: XrefDiagCode, byte_offset: u64, msg: &'static str) -> Self { - XrefDiagnostic { - code, - byte_offset, - msg: Cow::Borrowed(msg), - } - } - - /// Create a diagnostic with a dynamic message. - fn with_dynamic(code: XrefDiagCode, byte_offset: u64, msg: String) -> Self { - XrefDiagnostic { - code, - byte_offset, - msg: Cow::Owned(msg), - } - } -} - /// Result of parsing a traditional xref table. /// /// Contains the parsed xref entries and the trailer dictionary. @@ -129,7 +61,7 @@ pub struct XrefSection { /// The trailer dictionary pub trailer: Option, /// Diagnostics emitted during parsing - pub diagnostics: Vec, + pub diagnostics: Vec, /// Whether this xref section is from a hybrid file (traditional + stream merged) pub is_hybrid: bool, } @@ -222,8 +154,8 @@ pub fn merge_hybrid(traditional: XrefSection, stream: XrefSection) -> XrefSectio let stream_is_inuse = matches!(stream_entry, XrefEntry::InUse { .. } | XrefEntry::Compressed { .. }); if trad_is_free && stream_is_inuse { - result.diagnostics.push(XrefDiagnostic::with_dynamic( - XrefDiagCode::StructHybridConflict, + result.diagnostics.push(Diag::with_dynamic( + DiagCode::StructHybridConflict, 0, format!( "Object {}: traditional table marks as Free, stream marks as InUse; traditional wins (object is Free)", @@ -446,8 +378,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref let header_bytes = match source.read_at(pos, 1024) { Ok(bytes) if !bytes.is_empty() => bytes, _ => { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::XrefTruncated, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefTruncated, pos, "Failed to read xref header", )); @@ -461,8 +393,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref let header_str = match std::str::from_utf8(&header_bytes) { Ok(s) => s, Err(_) => { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::InvalidXrefHeader, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefInvalidHeader, pos, "Invalid UTF-8 in xref header", )); @@ -478,8 +410,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref // Found it! ws_offset is the position of "xref" in header_bytes break ws_offset; } else { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::InvalidXrefHeader, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefInvalidHeader, pos, "xref keyword not found", )); @@ -522,8 +454,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref let chunk_str = match std::str::from_utf8(&chunk_bytes) { Ok(s) => s, Err(_) => { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::XrefTruncated, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefTruncated, pos, "Invalid UTF-8 in xref data", )); @@ -547,8 +479,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref let header_line = match read_line_at(source, subsection_start) { Some(line) => line, None => { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::InvalidSubsectionHeader, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefInvalidSubsectionHeader, subsection_start, "Failed to read subsection header", )); @@ -558,8 +490,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref let header_parts: Vec<&str> = header_line.split_whitespace().collect(); if header_parts.len() != 2 { - result.diagnostics.push(XrefDiagnostic::with_dynamic( - XrefDiagCode::InvalidSubsectionHeader, + result.diagnostics.push(Diag::with_dynamic( + DiagCode::XrefInvalidSubsectionHeader, subsection_start, format!("Invalid subsection header: {}", header_line), )); @@ -584,8 +516,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref let obj_start: u32 = match header_parts[0].parse() { Ok(n) => n, Err(_) => { - result.diagnostics.push(XrefDiagnostic::with_dynamic( - XrefDiagCode::InvalidSubsectionHeader, + result.diagnostics.push(Diag::with_dynamic( + DiagCode::XrefInvalidSubsectionHeader, subsection_start, format!("Invalid subsection start: {}", header_parts[0]), )); @@ -597,8 +529,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref let obj_count: u32 = match header_parts[1].parse() { Ok(n) => n, Err(_) => { - result.diagnostics.push(XrefDiagnostic::with_dynamic( - XrefDiagCode::InvalidSubsectionHeader, + result.diagnostics.push(Diag::with_dynamic( + DiagCode::XrefInvalidSubsectionHeader, subsection_start, format!("Invalid subsection count: {}", header_parts[1]), )); @@ -635,8 +567,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref let entry_bytes = match source.read_at(pos, 20) { Ok(bytes) => bytes, _ => { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::XrefTruncated, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefTruncated, pos, "Failed to read xref entry", )); @@ -646,8 +578,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref if entry_bytes.len() < 19 { // Definitely truncated - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::XrefTruncated, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefTruncated, pos, "Xref entry truncated (< 19 bytes)", )); @@ -668,18 +600,16 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref // Object 0 must be free (PDF spec requirement) if obj_nr == 0 { if let XrefEntry::InUse { .. } = entry { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::ObjectZeroNotFree, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefObjectZeroNotFree, entry_start, "Object 0 is not free (violates PDF spec)", )); } } - // Only add in-use entries to the result - // Free entries are ignored per pdftract spec (they don't resolve to objects) - if matches!(entry, XrefEntry::InUse { .. }) { - result.add_entry(obj_nr, entry); - } + // Add all entries to the result (both InUse and Free) + // Free entries are needed for /Prev chain merge semantics to track object lifecycle + result.add_entry(obj_nr, entry); pos += stride as u64; entries_parsed += 1; } @@ -699,8 +629,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref // If we exited the loop without finding a trailer, emit a diagnostic if !trailer_found { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::TrailerNotFound, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefTrailerNotFound, pos, "Trailer dictionary not found (xref table may be truncated)", )); @@ -717,7 +647,7 @@ fn parse_xref_entry( obj_nr: u32, offset: u64, stride: usize, - diagnostics: &mut Vec, + diagnostics: &mut Vec, ) -> Option<(u32, XrefEntry)> { if bytes.len() != stride { return None; @@ -727,8 +657,8 @@ fn parse_xref_entry( let entry_str = match std::str::from_utf8(bytes) { Ok(s) => s, Err(_) => { - diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::InvalidXrefEntry, + diagnostics.push(Diag::with_static( + DiagCode::XrefInvalidEntry, offset, "Invalid UTF-8 in xref entry", )); @@ -739,8 +669,8 @@ fn parse_xref_entry( // Entry format: "offset/next_free generation f/n" with line ending let parts: Vec<&str> = entry_str.split_whitespace().collect(); if parts.len() < 3 { - diagnostics.push(XrefDiagnostic::with_dynamic( - XrefDiagCode::InvalidXrefEntry, + diagnostics.push(Diag::with_dynamic( + DiagCode::XrefInvalidEntry, offset, format!("Malformed xref entry: {}", entry_str.trim()), )); @@ -750,8 +680,8 @@ fn parse_xref_entry( let first_field: u64 = match parts[0].parse() { Ok(n) => n, Err(_) => { - diagnostics.push(XrefDiagnostic::with_dynamic( - XrefDiagCode::InvalidXrefEntry, + diagnostics.push(Diag::with_dynamic( + DiagCode::XrefInvalidEntry, offset, format!("Invalid offset/next_free: {}", parts[0]), )); @@ -762,8 +692,8 @@ fn parse_xref_entry( let gen_nr: u16 = match parts[1].parse() { Ok(n) => n, Err(_) => { - diagnostics.push(XrefDiagnostic::with_dynamic( - XrefDiagCode::InvalidXrefEntry, + diagnostics.push(Diag::with_dynamic( + DiagCode::XrefInvalidEntry, offset, format!("Invalid generation: {}", parts[1]), )); @@ -776,8 +706,8 @@ fn parse_xref_entry( Some('n') | Some('N') => Some((obj_nr, XrefEntry::InUse { offset: first_field, gen_nr })), Some('f') | Some('F') => Some((obj_nr, XrefEntry::Free { next_free: first_field as u32, gen_nr })), _ => { - diagnostics.push(XrefDiagnostic::with_dynamic( - XrefDiagCode::InvalidXrefEntry, + diagnostics.push(Diag::with_dynamic( + DiagCode::XrefInvalidEntry, offset, format!("Invalid entry type: {}", parts[2]), )); @@ -842,7 +772,7 @@ fn read_line_at(source: &dyn PdfSource, mut pos: u64) -> Option { fn read_line( source: &dyn PdfSource, pos: &mut u64, - diagnostics: &mut Vec, + diagnostics: &mut Vec, ) -> Option { let line = read_line_at(source, *pos)?; // Advance position past the line (including line ending) @@ -865,26 +795,30 @@ fn read_line( /// Parse the trailer dictionary. /// -/// This is a simplified implementation that reads until the end of the -/// dictionary (>>) and returns a placeholder dict object. -/// The full implementation will use the object parser from Phase 1.2. +/// Parse the trailer dictionary from the xref trailer section. +/// +/// This function extracts the trailer dictionary bytes and parses them +/// using the object parser to get the actual key-value pairs. fn parse_trailer_dict( source: &dyn PdfSource, pos: &mut u64, - diagnostics: &mut Vec, + diagnostics: &mut Vec, ) -> Option { // Skip whitespace before << let mut seen_bracket = false; let mut depth = 0; let mut chunk_pos = 0u64; + let dict_start_offset = *pos; + let mut dict_end_offset = None; + // First, find the extent of the trailer dict (from << to >>) loop { - let chunk = match source.read_at(*pos + chunk_pos, 1024) { + let chunk = match source.read_at(dict_start_offset + chunk_pos, 4096) { Ok(bytes) => bytes, Err(_) => { - diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::TrailerNotFound, - *pos, + diagnostics.push(Diag::with_static( + DiagCode::XrefTrailerNotFound, + dict_start_offset, "I/O error reading trailer", )); return None; @@ -914,8 +848,10 @@ fn parse_trailer_dict( if j + 1 < remaining.len() && remaining[j + 1] == b'>' { depth -= 1; if depth == 0 { - *pos += chunk_pos + j as u64 + 2; - return Some(PdfDict::new()); + // Found the end of the dict + let end_offset = dict_start_offset + chunk_pos + j as u64 + 2; + dict_end_offset = Some(end_offset); + break; } } } @@ -927,25 +863,74 @@ fn parse_trailer_dict( } } + if dict_end_offset.is_some() { + break; + } + chunk_pos += chunk.len() as u64; // Safety limit if chunk_pos > 100000 { - diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::TrailerNotFound, - *pos, + diagnostics.push(Diag::with_static( + DiagCode::XrefTrailerNotFound, + dict_start_offset, "Trailer dictionary too large or unterminated", )); return None; } } - diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::TrailerNotFound, - *pos, - "Trailer dictionary not found", - )); - None + // If we didn't find the end, return None + let dict_end_offset = match dict_end_offset { + Some(offset) => offset, + None => { + diagnostics.push(Diag::with_static( + DiagCode::XrefTrailerNotFound, + dict_start_offset, + "Trailer dictionary not found (no << >> markers)", + )); + return None; + } + }; + + // Read the full dict bytes and parse them + let dict_len = (dict_end_offset - dict_start_offset) as usize; + let dict_bytes = match source.read_at(dict_start_offset, dict_len) { + Ok(bytes) => bytes, + Err(_) => { + diagnostics.push(Diag::with_static( + DiagCode::XrefTrailerNotFound, + dict_start_offset, + "Failed to read trailer dictionary bytes", + )); + return None; + } + }; + + // Parse the dict using ObjectParser + let mut parser = ObjectParser::new(&dict_bytes); + if let Some(PdfObject::Dict(dict)) = parser.parse_direct_object() { + // Update pos to after the dict + *pos = dict_end_offset; + + // Transfer any diagnostics from the parser + for diag in parser.take_diagnostics() { + diagnostics.push(Diag::with_dynamic( + DiagCode::XrefTrailerNotFound, + dict_start_offset, + diag.message.into_owned(), + )); + } + + Some(*dict) + } else { + diagnostics.push(Diag::with_static( + DiagCode::XrefTrailerNotFound, + dict_start_offset, + "Failed to parse trailer dictionary as a dict object", + )); + None + } } /// Parse a direct PDF object (for trailer dictionary parsing). @@ -999,8 +984,8 @@ pub fn forward_scan_xref(source: &dyn PdfSource, is_linearized: bool) -> XrefSec // Check for linearized file if is_linearized { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::LinearizedNoForwardScan, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefLinearizedNoForwardScan, 0, "Forward scan disabled for linearized PDF (partial leading xref would cause false results)", )); @@ -1014,8 +999,8 @@ pub fn forward_scan_xref(source: &dyn PdfSource, is_linearized: bool) -> XrefSec let source_len = match source.len() { Ok(len) if len > 0 => len, _ => { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::XrefTruncated, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefTruncated, 0, "Unable to determine source length for forward scan", )); @@ -1095,8 +1080,8 @@ pub fn forward_scan_xref(source: &dyn PdfSource, is_linearized: bool) -> XrefSec } // Emit XREF_REPAIRED diagnostic with count - result.diagnostics.push(XrefDiagnostic::with_dynamic( - XrefDiagCode::XrefRepaired, + result.diagnostics.push(Diag::with_dynamic( + DiagCode::XrefRepaired, 0, format!("Forward scan recovered {} object entries", entries_found), )); @@ -1162,8 +1147,8 @@ fn forward_scan_memory(data: &[u8], source_len: u64) -> XrefSection { } // Emit XREF_REPAIRED diagnostic with count - result.diagnostics.push(XrefDiagnostic::with_dynamic( - XrefDiagCode::XrefRepaired, + result.diagnostics.push(Diag::with_dynamic( + DiagCode::XrefRepaired, 0, format!("Forward scan recovered {} object entries", entries_found), )); @@ -1403,8 +1388,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref let obj_bytes = match source.read_at(stream_obj_offset, 4096) { Ok(bytes) if !bytes.is_empty() => bytes, _ => { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::InvalidXrefStreamFormat, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefInvalidStreamFormat, stream_obj_offset, "Failed to read xref stream object", )); @@ -1416,8 +1401,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref let indirect = match parser.parse_indirect_object() { Some(i) => i, None => { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::InvalidXrefStreamFormat, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefInvalidStreamFormat, stream_obj_offset, "Failed to parse xref stream as indirect object", )); @@ -1429,8 +1414,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref let stream = match indirect.obj { PdfObject::Stream(s) => s, _ => { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::InvalidXrefStreamFormat, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefInvalidStreamFormat, stream_obj_offset, "Xref stream object is not a stream", )); @@ -1441,8 +1426,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref // Check for /Type /XRef (optional per spec, but we validate it) if let Some(PdfObject::Name(type_name)) = stream.dict.get("Type") { if type_name.as_ref() != "/XRef" && type_name.as_ref() != "XRef" { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::InvalidXrefStreamFormat, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefInvalidStreamFormat, stream_obj_offset, "Stream /Type is not /XRef", )); @@ -1453,8 +1438,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref let size = match stream.dict.get("Size") { Some(PdfObject::Integer(n)) if *n >= 0 => *n as u32, _ => { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::InvalidXrefStreamFormat, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefInvalidStreamFormat, stream_obj_offset, "Missing or invalid /Size in xref stream", )); @@ -1469,8 +1454,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref .filter_map(|o| o.as_int()) .collect(); if widths.len() != 3 { - result.diagnostics.push(XrefDiagnostic::with_dynamic( - XrefDiagCode::InvalidXrefStreamFormat, + result.diagnostics.push(Diag::with_dynamic( + DiagCode::XrefInvalidStreamFormat, stream_obj_offset, format!("/W array must have 3 elements, got {}", widths.len()), )); @@ -1478,8 +1463,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref } // Widths can be 0, but negative is invalid if widths.iter().any(|&w| w < 0) { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::InvalidXrefStreamFormat, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefInvalidStreamFormat, stream_obj_offset, "/W array contains negative values", )); @@ -1488,8 +1473,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref widths } _ => { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::InvalidXrefStreamFormat, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefInvalidStreamFormat, stream_obj_offset, "Missing or invalid /W in xref stream", )); @@ -1512,8 +1497,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref let first = match first_obj.as_int() { Some(n) if n >= 0 => n as u32, _ => { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::InvalidXrefStreamFormat, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefInvalidStreamFormat, stream_obj_offset, "Invalid /Index first value", )); @@ -1523,8 +1508,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref let count = match iter.peek() { Some(PdfObject::Integer(n)) if *n >= 0 => *n as u32, _ => { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::InvalidXrefStreamFormat, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefInvalidStreamFormat, stream_obj_offset, "Invalid /Index count value", )); @@ -1535,8 +1520,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref pairs.push((first, count)); } if pairs.is_empty() { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::InvalidXrefStreamFormat, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefInvalidStreamFormat, stream_obj_offset, "/Index array is empty", )); @@ -1546,8 +1531,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref } None => vec![(0, size)], _ => { - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::InvalidXrefStreamFormat, + result.diagnostics.push(Diag::with_static( + DiagCode::XrefInvalidStreamFormat, stream_obj_offset, "Invalid /Index in xref stream (not an array)", )); @@ -1582,8 +1567,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref if decoded.is_empty() { // Check if this is a legitimate empty stream (no objects) or an error // A valid xref stream with no objects would have /Size 0, which is unusual - result.diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::XrefStreamDecompressionFailed, + result.diagnostics.push(Diag::with_static( + DiagCode::StreamDecodeError, stream_obj_offset, "Xref stream decompression produced empty output", )); @@ -1600,8 +1585,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref // Check we have enough bytes for this entry if data_pos + entry_stride > decoded.len() { - result.diagnostics.push(XrefDiagnostic::with_dynamic( - XrefDiagCode::InvalidXrefStreamEntry, + result.diagnostics.push(Diag::with_dynamic( + DiagCode::XrefInvalidStreamEntry, stream_obj_offset, format!("Xref stream truncated at object {}", obj_nr), )); @@ -1657,8 +1642,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref } _ => { // Unknown type - emit diagnostic and treat as free - result.diagnostics.push(XrefDiagnostic::with_dynamic( - XrefDiagCode::InvalidXrefStreamEntry, + result.diagnostics.push(Diag::with_dynamic( + DiagCode::XrefInvalidStreamEntry, stream_obj_offset, format!("Invalid xref entry type {} for object {}", entry_type, obj_nr), )); @@ -2105,12 +2090,12 @@ pub fn load_xref_with_prev_chain(source: &dyn PdfSource, start_offset: u64) -> X offset: u64, visited: &mut HashSet, depth: u32, - diagnostics: &mut Vec, + diagnostics: &mut Vec, ) -> XrefSection { // Cycle detection if visited.contains(&offset) { - diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::StructCircularRef, + diagnostics.push(Diag::with_static( + DiagCode::StructCircularRef, offset, "Circular /Prev reference detected; stopping chain traversal", )); @@ -2121,8 +2106,8 @@ pub fn load_xref_with_prev_chain(source: &dyn PdfSource, start_offset: u64) -> X // Depth limit check if depth >= MAX_PREV_DEPTH { - diagnostics.push(XrefDiagnostic::with_dynamic( - XrefDiagCode::StructDepthExceeded, + diagnostics.push(Diag::with_dynamic( + DiagCode::StructDepthExceeded, offset, format!("/Prev chain depth exceeded maximum of {}", MAX_PREV_DEPTH).into(), )); @@ -2143,14 +2128,13 @@ pub fn load_xref_with_prev_chain(source: &dyn PdfSource, start_offset: u64) -> X }) }); - // Validate /Prev offset if present - let mut should_follow_prev = false; + // Validate /Prev offset and recursively load previous revision if present if let Some(prev) = prev_offset { match source.len() { Ok(file_size) if prev > file_size => { // /Prev points beyond file size - invalid - diagnostics.push(XrefDiagnostic::with_dynamic( - XrefDiagCode::StructInvalidPrevOffset, + diagnostics.push(Diag::with_dynamic( + DiagCode::StructInvalidPrevOffset, offset, format!("/Prev offset {} exceeds file size {}; ignoring /Prev key", prev, file_size).into(), )); @@ -2158,52 +2142,56 @@ pub fn load_xref_with_prev_chain(source: &dyn PdfSource, start_offset: u64) -> X if let Some(ref mut trailer) = current.trailer { trailer.shift_remove("Prev"); } + // Return current revision without following /Prev + let mut result = current; + result.diagnostics.extend(diagnostics.drain(..)); + return result; } Ok(_) => { - // Valid /Prev offset - should_follow_prev = true; + // Valid /Prev offset - recursively load + let mut older = walk_chain(source, prev, visited, depth + 1, diagnostics); + + // Merge: older entries first, then current (newer) entries override + // This is the opposite of hybrid merge (where first parameter wins) + for (obj_nr, entry) in current.entries { + older.entries.insert(obj_nr, entry); + } + + // Preserve current (latest) trailer + older.trailer = current.trailer; + + // Merge diagnostics from current revision + older.diagnostics.extend(current.diagnostics); + + // Mark as hybrid if current revision is hybrid + if current.is_hybrid { + older.is_hybrid = true; + } + + // Add current's diagnostics to the merged result + older.diagnostics.extend(diagnostics.drain(..)); + + older } Err(_) => { // Can't determine file size - be conservative and don't follow - diagnostics.push(XrefDiagnostic::with_static( - XrefDiagCode::StructInvalidPrevOffset, + diagnostics.push(Diag::with_static( + DiagCode::StructInvalidPrevOffset, offset, "Cannot determine file size; ignoring /Prev key", )); + // Return current revision without following /Prev + let mut result = current; + result.diagnostics.extend(diagnostics.drain(..)); + result } } - } - - // Recursively load previous revision if /Prev exists - if should_follow_prev { - let prev = prev_offset.unwrap(); // Safe because we checked should_follow_prev - let mut older = walk_chain(source, prev, visited, depth + 1, diagnostics); - - // Merge: older entries first, then current (newer) entries override - // This is the opposite of hybrid merge (where first parameter wins) - for (obj_nr, entry) in current.entries { - older.entries.insert(obj_nr, entry); - } - - // Preserve current (latest) trailer - older.trailer = current.trailer; - - // Merge diagnostics from current revision - older.diagnostics.extend(current.diagnostics); - - // Mark as hybrid if current revision is hybrid - if current.is_hybrid { - older.is_hybrid = true; - } - - // Add current's diagnostics to the merged result - older.diagnostics.extend(diagnostics.drain(..)); - - older } else { // No /Prev - this is the baseline (original) revision - // Return current as-is - current + // Return current with any diagnostics from this level + let mut result = current; + result.diagnostics.extend(diagnostics.drain(..)); + result } } @@ -2341,26 +2329,26 @@ mod tests { #[test] fn test_xref_diagnostic_static() { - let diag = XrefDiagnostic::with_static( - XrefDiagCode::InvalidXrefHeader, + let diag = Diag::with_static( + DiagCode::XrefInvalidHeader, 100, "test message", ); - assert_eq!(diag.byte_offset, 100); - assert_eq!(diag.msg.as_ref(), "test message"); - assert!(matches!(diag.code, XrefDiagCode::InvalidXrefHeader)); + assert_eq!(diag.byte_offset, Some(100)); + assert_eq!(diag.message.as_ref(), "test message"); + assert!(matches!(diag.code, DiagCode::XrefInvalidHeader)); } #[test] fn test_xref_diagnostic_dynamic() { - let diag = XrefDiagnostic::with_dynamic( - XrefDiagCode::InvalidXrefEntry, + let diag = Diag::with_dynamic( + DiagCode::XrefInvalidEntry, 200, "dynamic message".to_string(), ); - assert_eq!(diag.byte_offset, 200); - assert_eq!(diag.msg.as_ref(), "dynamic message"); - assert!(matches!(diag.code, XrefDiagCode::InvalidXrefEntry)); + assert_eq!(diag.byte_offset, Some(200)); + assert_eq!(diag.message.as_ref(), "dynamic message"); + assert!(matches!(diag.code, DiagCode::XrefInvalidEntry)); } #[test] @@ -2378,12 +2366,15 @@ trailer\n<< /Size 6 >>\n"; let source = MemorySource::new(xref_data.to_vec()); let result = parse_traditional_xref(&source, 0); - // Should have parsed 4 in-use entries (objects 0 and 3 are free and ignored) - assert_eq!(result.len(), 4); + // Should have parsed 6 entries (all objects 0-5, including free entries) + // Free entries are tracked for /Prev chain merge semantics + assert_eq!(result.len(), 6); // Check specific entries + assert_eq!(result.entries.get(&0), Some(&XrefEntry::Free { next_free: 0, gen_nr: 65535 })); assert_eq!(result.entries.get(&1), Some(&XrefEntry::InUse { offset: 17, gen_nr: 0 })); assert_eq!(result.entries.get(&2), Some(&XrefEntry::InUse { offset: 81, gen_nr: 0 })); + assert_eq!(result.entries.get(&3), Some(&XrefEntry::Free { next_free: 0, gen_nr: 7 })); assert_eq!(result.entries.get(&4), Some(&XrefEntry::InUse { offset: 331, gen_nr: 0 })); assert_eq!(result.entries.get(&5), Some(&XrefEntry::InUse { offset: 409, gen_nr: 0 })); @@ -2403,8 +2394,10 @@ trailer\r\n<< /Size 3 >>\r\n"; let source = MemorySource::new(xref_data.to_vec()); let result = parse_traditional_xref(&source, 0); - // Should have parsed 2 in-use entries - assert_eq!(result.len(), 2); + // Should have parsed 3 entries (all objects 0-2, including free entry) + // Free entries are tracked for /Prev chain merge semantics + assert_eq!(result.len(), 3); + assert_eq!(result.entries.get(&0), Some(&XrefEntry::Free { next_free: 0, gen_nr: 65535 })); assert_eq!(result.entries.get(&1), Some(&XrefEntry::InUse { offset: 15, gen_nr: 0 })); assert_eq!(result.entries.get(&2), Some(&XrefEntry::InUse { offset: 78, gen_nr: 0 })); } @@ -2421,7 +2414,10 @@ trailer\n<< /Size 3 >>\n"; let source = MemorySource::new(xref_data.to_vec()); let result = parse_traditional_xref(&source, 0); - // Should have parsed 2 in-use entries + // Should have parsed 3 entries (all objects 0-2, including free entry) + // Free entries are tracked for /Prev chain merge semantics + assert_eq!(result.len(), 3); + assert_eq!(result.entries.get(&0), Some(&XrefEntry::Free { next_free: 0, gen_nr: 65535 })); assert_eq!(result.len(), 2); assert_eq!(result.entries.get(&1), Some(&XrefEntry::InUse { offset: 15, gen_nr: 0 })); assert_eq!(result.entries.get(&2), Some(&XrefEntry::InUse { offset: 78, gen_nr: 0 })); @@ -2473,7 +2469,7 @@ trailer\n<< /Size 4 >>\n"; // Should have emitted a diagnostic for the bad entry assert!(!result.diagnostics.is_empty()); - assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::InvalidXrefEntry)); + assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefInvalidEntry)); } #[test] @@ -2489,7 +2485,7 @@ trailer\n<< /Size 3 >>\n"; let result = parse_traditional_xref(&source, 0); // Should emit diagnostic for object 0 not being free - assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::ObjectZeroNotFree)); + assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefObjectZeroNotFree)); } #[test] @@ -2502,12 +2498,13 @@ trailer\n<< /Size 3 >>\n"; let source = MemorySource::new(xref_data.to_vec()); let result = parse_traditional_xref(&source, 0); - // Should still parse the entry - assert_eq!(result.len(), 1); + // Should still parse both entries (including free entry) + // Free entries are tracked for /Prev chain merge semantics + assert_eq!(result.len(), 2); assert!(result.trailer.is_none()); // Should emit diagnostic about missing trailer - assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::TrailerNotFound)); + assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefTrailerNotFound)); } #[test] @@ -2686,7 +2683,7 @@ trailer\n<< /Size 3 >>\n"; assert!(result.entries.contains_key(&3)); // Check for XREF_REPAIRED diagnostic - assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::XrefRepaired)); + assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefRepaired)); } #[test] @@ -2719,7 +2716,7 @@ trailer\n<< /Size 3 >>\n"; assert_eq!(result.len(), 0); // Should have LINEARIZED_NO_FORWARD_SCAN diagnostic - assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::LinearizedNoForwardScan)); + assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefLinearizedNoForwardScan)); } #[test] @@ -3119,7 +3116,7 @@ trailer\n<< /Size 3 >>\n"; assert_eq!(result.entries.get(&2), Some(&XrefEntry::InUse { offset: 2000, gen_nr: 0 })); // Should have emitted a diagnostic for invalid type - assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::InvalidXrefStreamEntry)); + assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefInvalidStreamEntry)); } #[test] @@ -3134,7 +3131,7 @@ trailer\n<< /Size 3 >>\n"; let result = parse_xref_stream(&source, 0); // Should have emitted diagnostic about missing /Size - assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::InvalidXrefStreamFormat)); + assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefInvalidStreamFormat)); } #[test] @@ -3156,7 +3153,7 @@ trailer\n<< /Size 3 >>\n"; let result = parse_xref_stream(&source, 0); // Should have emitted diagnostic about invalid /W - assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::InvalidXrefStreamFormat)); + assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefInvalidStreamFormat)); } #[test] @@ -3443,7 +3440,7 @@ trailer\n<< /Size 3 >>\n"; assert!(merged.is_hybrid); // Should have emitted STRUCT_HYBRID_CONFLICT diagnostic - assert!(merged.diagnostics.iter().any(|d| matches!(d.code, XrefDiagCode::StructHybridConflict))); + assert!(merged.diagnostics.iter().any(|d| matches!(d.code, DiagCode::StructHybridConflict))); // Traditional Free wins assert_eq!(merged.entries.get(&1), Some(&XrefEntry::Free { next_free: 0, gen_nr: 65535 })); } @@ -3829,8 +3826,8 @@ trailer\n<< /Size 3 >>\n"; // Load from the latest revision let result = load_xref_with_prev_chain(&source, rev3_offset); - // Verify all 5 objects are present - assert_eq!(result.len(), 5, "Should have entries for objects 1-5, got {}", result.len()); + // Verify all 6 entries are present (including object 0) + assert_eq!(result.len(), 6, "Should have entries for objects 0-5, got {}", result.len()); // Verify LATEST values win: // Object 1: unchanged from rev1 (offset 100) @@ -3980,11 +3977,12 @@ trailer\n<< /Size 3 >>\n"; let root = trailer.get("Root"); assert!(root.is_some()); match root { - Some(PdfObject::Array(ref arr)) if arr.len() == 3 => { - // [2, 0, R] - object number 2 - assert_eq!(arr[0], PdfObject::Integer(2)); + Some(PdfObject::Ref(obj_ref)) => { + // 2 0 R - indirect reference to object 2 + assert_eq!(obj_ref.object, 2); + assert_eq!(obj_ref.generation, 0); } - _ => panic!("Expected /Root to be an array [2 0 R]"), + _ => panic!("Expected /Root to be an indirect reference 2 0 R"), } // Should have /Info from rev2 @@ -4043,7 +4041,7 @@ trailer\n<< /Size 3 >>\n"; let result = load_xref_with_prev_chain(&source, rev3_offset); // Should emit STRUCT_CIRCULAR_REF diagnostic - assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::StructCircularRef)); + assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::StructCircularRef)); } /// Test depth limit enforcement. @@ -4081,7 +4079,7 @@ trailer\n<< /Size 3 >>\n"; let result = load_xref_with_prev_chain(&source, start_offset); // Should emit STRUCT_DEPTH_EXCEEDED diagnostic - assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::StructDepthExceeded)); + assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::StructDepthExceeded)); } /// Test /Prev offset pointing beyond file size. @@ -4109,7 +4107,7 @@ trailer\n<< /Size 3 >>\n"; let result = load_xref_with_prev_chain(&source, rev2_offset); // Should emit STRUCT_INVALID_PREV_OFFSET diagnostic - assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::StructInvalidPrevOffset)); + assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::StructInvalidPrevOffset)); // /Prev should be removed from trailer let trailer = result.trailer.as_ref().unwrap(); @@ -4134,7 +4132,7 @@ trailer\n<< /Size 3 >>\n"; let result = load_xref_with_prev_chain(&source, offset); // Should not follow /Prev 0, should just return this single revision - assert!(!result.diagnostics.iter().any(|d| d.code == XrefDiagCode::StructInvalidPrevOffset)); + assert!(!result.diagnostics.iter().any(|d| d.code == DiagCode::StructInvalidPrevOffset)); } /// Test negative /Prev treated as "no previous revision". @@ -4155,7 +4153,7 @@ trailer\n<< /Size 3 >>\n"; let result = load_xref_with_prev_chain(&source, offset); // Should not follow negative /Prev - assert!(!result.diagnostics.iter().any(|d| d.code == XrefDiagCode::StructInvalidPrevOffset)); + assert!(!result.diagnostics.iter().any(|d| d.code == DiagCode::StructInvalidPrevOffset)); } /// Test hybrid file in /Prev chain.