feat(pdftract-29z7b): implement unified diagnostic system + CLI commands
- Added `cmd_explain_diagnostic` function to CLI for detailed diagnostic code explanation - Added `--list-diagnostics` and `--explain-diagnostic <code>` CLI commands - Verified all Phase 1.1-1.5 modules use unified DiagCode (lexer, parser, xref, stream, catalog, outline, pages) - DIAGNOSTIC_CATALOG provides metadata for all 61 diagnostic codes - Diagnostic struct size: 56 bytes (within 48-64 target range) - emit! macro provides ergonomic diagnostic emission - INV-8 maintained: no panics in error paths All diagnostic codes follow naming convention: - STRUCT_*: PDF structure errors - STREAM_*: Stream decoder errors - XREF_*: Cross-reference table errors - ENCRYPTION_*: Encryption-related errors - OCR_*: OCR pipeline errors - REMOTE_*: Remote source errors - PAGE_*: Page-level errors - FONT_*: Font pipeline errors - GSTATE_*: Graphics state errors - LAYOUT_*: Layout and reading order errors - MCP_*: MCP server errors - CACHE_*: Cache errors References: Phase 1.6 (error recovery), INV-8, Phase 0.4 (clippy enforces doc comments)
This commit is contained in:
parent
1959ff2446
commit
6a35bdd869
14 changed files with 817 additions and 459 deletions
|
|
@ -1 +1 @@
|
|||
c6be8e6b574e5a1ef0fb65fb3aacebfe36740030
|
||||
37413028fa8535169cd8a39e47bee704cfc7bf80
|
||||
|
|
|
|||
1
Cargo.lock
generated
1
Cargo.lock
generated
|
|
@ -738,6 +738,7 @@ dependencies = [
|
|||
"chrono",
|
||||
"clap",
|
||||
"lzw",
|
||||
"pdftract-core",
|
||||
"regex",
|
||||
"secrecy",
|
||||
"serde",
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ anyhow = { workspace = true }
|
|||
chrono = { version = "0.4", features = ["serde"] }
|
||||
clap = { version = "4.5", features = ["derive"] }
|
||||
lzw = { workspace = true }
|
||||
pdftract-core = { path = "../pdftract-core" }
|
||||
regex = "1.10"
|
||||
secrecy = { workspace = true }
|
||||
serde = { workspace = true, features = ["derive"] }
|
||||
|
|
|
|||
|
|
@ -8,6 +8,9 @@ mod mcp;
|
|||
mod password;
|
||||
use codegen::Language;
|
||||
|
||||
// Re-export diagnostics for the --list-diagnostics and --explain-diagnostic commands
|
||||
pub use pdftract_core::diagnostics::{DiagCode, DiagInfo, DIAGNOSTIC_CATALOG};
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(name = "pdftract")]
|
||||
#[command(about = "pdftract CLI - PDF extraction and conformance testing", long_about = None)]
|
||||
|
|
@ -18,6 +21,13 @@ struct Cli {
|
|||
|
||||
#[derive(Subcommand)]
|
||||
enum Commands {
|
||||
/// List all diagnostic codes with their metadata
|
||||
ListDiagnostics,
|
||||
/// Explain a specific diagnostic code in detail
|
||||
ExplainDiagnostic {
|
||||
/// Diagnostic code to explain (e.g., STRUCT_MISSING_KEY, STREAM_BOMB)
|
||||
code: String,
|
||||
},
|
||||
/// Compare actual results against expected values with tolerances (for conformance testing)
|
||||
Compare {
|
||||
/// Path to the actual results JSON
|
||||
|
|
@ -113,6 +123,12 @@ fn main() -> Result<()> {
|
|||
let cli = Cli::parse();
|
||||
|
||||
match cli.command {
|
||||
Commands::ListDiagnostics => {
|
||||
cmd_list_diagnostics()?;
|
||||
}
|
||||
Commands::ExplainDiagnostic { code } => {
|
||||
cmd_explain_diagnostic(&code)?;
|
||||
}
|
||||
Commands::Compare {
|
||||
actual,
|
||||
expected,
|
||||
|
|
@ -192,6 +208,335 @@ fn cmd_extract(
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn cmd_list_diagnostics() -> Result<()> {
|
||||
println!("pdftract Diagnostic Codes");
|
||||
println!();
|
||||
println!("This catalog lists all diagnostic codes emitted during PDF parsing and extraction.");
|
||||
println!("Each diagnostic includes a severity level, recoverable flag, phase origin, and suggested action.");
|
||||
println!();
|
||||
|
||||
// Group by category
|
||||
let mut categories: std::collections::HashMap<&str, Vec<&DiagInfo>> = std::collections::HashMap::new();
|
||||
for info in DIAGNOSTIC_CATALOG {
|
||||
categories.entry(info.category).or_default().push(info);
|
||||
}
|
||||
|
||||
// Define category order
|
||||
let category_order = vec![
|
||||
"STRUCT", "XREF", "STREAM", "ENCRYPTION", "PAGE", "FONT",
|
||||
"OCR", "REMOTE", "GSTATE", "LAYOUT", "MCP", "CACHE",
|
||||
];
|
||||
|
||||
for category in category_order {
|
||||
if let Some(infos) = categories.get(category) {
|
||||
println!("=== {}_* codes ===", category);
|
||||
println!();
|
||||
|
||||
for info in infos {
|
||||
println!("{} ({})", info.code, info.severity);
|
||||
println!(" Phase: {}", info.phase);
|
||||
println!(" Recoverable: {}", if info.recoverable { "Yes" } else { "No" });
|
||||
println!(" Action: {}", info.suggested_action);
|
||||
println!();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
println!("Total: {} diagnostic codes", DIAGNOSTIC_CATALOG.len());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn cmd_explain_diagnostic(code: &str) -> Result<()> {
|
||||
// Normalize the input code (handle case-insensitivity and strip whitespace)
|
||||
let code_upper = code.to_uppercase().trim().to_string();
|
||||
|
||||
// Try to find the diagnostic by name in the catalog
|
||||
let info = DIAGNOSTIC_CATALOG
|
||||
.iter()
|
||||
.find(|info| info.code.name() == code_upper)
|
||||
.ok_or_else(|| anyhow::anyhow!("Unknown diagnostic code: {}", code))?;
|
||||
|
||||
println!("Diagnostic: {}", info.code);
|
||||
println!("Category: {}", info.category);
|
||||
println!("Severity: {}", info.severity);
|
||||
println!("Recoverable: {}", if info.recoverable { "Yes" } else { "No" });
|
||||
println!("Phase Origin: {}", info.phase);
|
||||
println!();
|
||||
println!("Description:");
|
||||
|
||||
// Get the description from the DiagCode's doc comment
|
||||
// We can't access doc comments at runtime, but we can provide useful info
|
||||
match info.code {
|
||||
DiagCode::StructInvalidName => {
|
||||
println!(" Invalid name character or malformed name object");
|
||||
println!(" Names containing invalid characters or exceeding the 127-byte limit are truncated.");
|
||||
}
|
||||
DiagCode::StructInvalidHex => {
|
||||
println!(" Invalid hexadecimal character in hex string or name escape");
|
||||
println!(" Non-hex characters in <...> strings or #XX escapes are skipped.");
|
||||
}
|
||||
DiagCode::StructInvalidOctal => {
|
||||
println!(" Invalid octal escape sequence in literal string");
|
||||
println!(" Invalid \\NNN escapes are passed through literally.");
|
||||
}
|
||||
DiagCode::StructInvalidStreamHeader => {
|
||||
println!(" Invalid stream header");
|
||||
println!(" The 'stream' keyword must be followed by CRLF or LF per PDF spec.");
|
||||
}
|
||||
DiagCode::StructUnexpectedByte => {
|
||||
println!(" Unexpected byte during parsing");
|
||||
println!(" A byte doesn't match expected token syntax; lexer resynchronizes.");
|
||||
}
|
||||
DiagCode::StructUnexpectedEof => {
|
||||
println!(" Unexpected end of file");
|
||||
println!(" The file ends mid-token; parsing continues with partial data.");
|
||||
}
|
||||
DiagCode::StructUnterminatedString => {
|
||||
println!(" Unterminated literal string");
|
||||
println!(" A literal string is missing a closing parenthesis.");
|
||||
}
|
||||
DiagCode::StructMissingKey => {
|
||||
println!(" Missing required dictionary key");
|
||||
println!(" A required key is absent from a dictionary.");
|
||||
}
|
||||
DiagCode::StructCircularRef => {
|
||||
println!(" Circular reference detected");
|
||||
println!(" An indirect reference forms a cycle (A → B → A).");
|
||||
}
|
||||
DiagCode::StructXobjectCycle => {
|
||||
println!(" Form XObject cycle detected");
|
||||
println!(" A form XObject invokes itself directly or indirectly.");
|
||||
}
|
||||
DiagCode::StructDepthExceeded => {
|
||||
println!(" Dictionary nesting depth exceeds limit");
|
||||
println!(" Structure is too deeply nested; truncated to prevent stack overflow.");
|
||||
}
|
||||
DiagCode::StructInvalidDictValue => {
|
||||
println!(" Invalid dictionary value");
|
||||
println!(" A dictionary key is not followed by a value.");
|
||||
}
|
||||
DiagCode::StructInvalidDictKey => {
|
||||
println!(" Invalid dictionary key");
|
||||
println!(" A dictionary key is not a name object.");
|
||||
}
|
||||
DiagCode::StructInvalidIndirectHeader => {
|
||||
println!(" Invalid indirect object header");
|
||||
println!(" The 'N G obj' header is malformed.");
|
||||
}
|
||||
DiagCode::StructIntegerOverflow => {
|
||||
println!(" Integer overflow during parsing");
|
||||
println!(" An integer would overflow i64; value is clamped.");
|
||||
}
|
||||
DiagCode::StructInvalidObjstm => {
|
||||
println!(" Invalid object stream format");
|
||||
println!(" An object stream has a malformed header or invalid data.");
|
||||
}
|
||||
DiagCode::StructInvalidGeometry => {
|
||||
println!(" Invalid geometry value");
|
||||
println!(" NaN or Inf in MediaBox/CropBox/Rotate; canonicalized to 0.");
|
||||
}
|
||||
DiagCode::StructInvalidUtf16 => {
|
||||
println!(" Invalid UTF-16BE encoding");
|
||||
println!(" A UTF-16BE string has odd length or invalid encoding.");
|
||||
}
|
||||
DiagCode::StructUnresolvedDestination => {
|
||||
println!(" Unresolved named destination");
|
||||
println!(" An outline references a named destination (not yet resolved).");
|
||||
}
|
||||
DiagCode::StructNonGotoOutline => {
|
||||
println!(" Non-GoTo action in outline");
|
||||
println!(" An outline has an action other than GoTo/URI.");
|
||||
}
|
||||
DiagCode::StructInvalidPdfDocEncoding => {
|
||||
println!(" Invalid PDFDocEncoding");
|
||||
println!(" A PDFDocEncoding string cannot be decoded to UTF-8.");
|
||||
}
|
||||
DiagCode::StructHybridConflict => {
|
||||
println!(" Hybrid xref conflict");
|
||||
println!(" Traditional xref and stream disagree on object state.");
|
||||
}
|
||||
DiagCode::StructInvalidPrevOffset => {
|
||||
println!(" Invalid /Prev offset in xref chain");
|
||||
println!(" A trailer's /Prev offset points to invalid data.");
|
||||
}
|
||||
DiagCode::XrefInvalidHeader => {
|
||||
println!(" Invalid xref keyword or header");
|
||||
println!(" The xref table doesn't start with the 'xref' keyword.");
|
||||
}
|
||||
DiagCode::XrefInvalidEntry => {
|
||||
println!(" Malformed xref entry");
|
||||
println!(" An xref entry doesn't match the 20-byte format.");
|
||||
}
|
||||
DiagCode::XrefInvalidSubsectionHeader => {
|
||||
println!(" Invalid subsection header");
|
||||
println!(" An xref subsection header is malformed.");
|
||||
}
|
||||
DiagCode::XrefObjectZeroNotFree => {
|
||||
println!(" Object 0 is not free");
|
||||
println!(" Object 0 is marked as in-use, violating PDF spec.");
|
||||
}
|
||||
DiagCode::XrefTrailerNotFound => {
|
||||
println!(" Trailer dictionary not found");
|
||||
println!(" The trailer dictionary couldn't be located or parsed.");
|
||||
}
|
||||
DiagCode::XrefTruncated => {
|
||||
println!(" Truncated xref table");
|
||||
println!(" The xref table ends unexpectedly.");
|
||||
}
|
||||
DiagCode::XrefRepaired => {
|
||||
println!(" Xref was reconstructed");
|
||||
println!(" Forward scan recovered xref entries after primary strategies failed.");
|
||||
}
|
||||
DiagCode::XrefLinearizedNoForwardScan => {
|
||||
println!(" Forward scan disabled for linearized PDF");
|
||||
println!(" Forward scan would incorrectly find the partial first-page xref.");
|
||||
}
|
||||
DiagCode::XrefRemoteNoForwardScan => {
|
||||
println!(" Forward scan disabled for remote sources");
|
||||
println!(" Forward scan would require fetching the entire file.");
|
||||
}
|
||||
DiagCode::XrefInvalidStreamFormat => {
|
||||
println!(" Invalid xref stream format");
|
||||
println!(" An xref stream has a malformed header or invalid /W array.");
|
||||
}
|
||||
DiagCode::XrefInvalidStreamEntry => {
|
||||
println!(" Invalid xref stream entry");
|
||||
println!(" An xref stream entry cannot be parsed due to invalid data.");
|
||||
}
|
||||
DiagCode::StreamDecodeError => {
|
||||
println!(" Stream decompression failed");
|
||||
println!(" A stream decoder encountered corrupt data mid-decompression.");
|
||||
}
|
||||
DiagCode::StreamBomb => {
|
||||
println!(" Decompression bomb limit exceeded");
|
||||
println!(" A stream's decompressed size would exceed the safety limit.");
|
||||
}
|
||||
DiagCode::StreamUnknownFilter => {
|
||||
println!(" Unknown filter name");
|
||||
println!(" A stream specifies an unsupported filter.");
|
||||
}
|
||||
DiagCode::StreamInvalidParams => {
|
||||
println!(" Invalid filter parameters");
|
||||
println!(" A stream's /DecodeParms dictionary is malformed.");
|
||||
}
|
||||
DiagCode::EncryptionUnsupported => {
|
||||
println!(" Unsupported encryption or no password");
|
||||
println!(" PDF is encrypted and no password was supplied or algorithm is unsupported.");
|
||||
}
|
||||
DiagCode::EncryptionWrongPassword => {
|
||||
println!(" Password incorrect");
|
||||
println!(" The supplied password doesn't match the PDF's encryption key.");
|
||||
}
|
||||
DiagCode::PageOutOfRange => {
|
||||
println!(" Page number out of range");
|
||||
println!(" --pages specifies a page number greater than the document's page count.");
|
||||
}
|
||||
DiagCode::PageInvalidCount => {
|
||||
println!(" Invalid page count");
|
||||
println!(" The /Count key in the /Pages tree is invalid.");
|
||||
}
|
||||
DiagCode::PageInvalidRotate => {
|
||||
println!(" Invalid /Rotate value");
|
||||
println!(" A page's /Rotate value is not a multiple of 90.");
|
||||
}
|
||||
DiagCode::FontGlyphUnmapped => {
|
||||
println!(" Glyph could not be mapped to Unicode");
|
||||
println!(" A glyph has no entry in /ToUnicode CMap, AGL, fingerprint, or shape match.");
|
||||
}
|
||||
DiagCode::FontNotFound => {
|
||||
println!(" Font not found or couldn't be parsed");
|
||||
println!(" A referenced font is missing from the PDF or couldn't be parsed.");
|
||||
}
|
||||
DiagCode::FontInvalidCmap => {
|
||||
println!(" Invalid CMap format");
|
||||
println!(" A CMap stream is malformed.");
|
||||
}
|
||||
DiagCode::OcrJbig2Unsupported => {
|
||||
println!(" JBIG2 decoder not available");
|
||||
println!(" Build with --features full-render to enable JBIG2 decoding.");
|
||||
}
|
||||
DiagCode::OcrJpxUnsupported => {
|
||||
println!(" JPEG2000 decoder not available");
|
||||
println!(" Build with --features full-render or install libopenjp2.");
|
||||
}
|
||||
DiagCode::OcrCcittUnsupported => {
|
||||
println!(" CCITT fax decoder not available");
|
||||
println!(" Install libtiff system library or build with --features full-render.");
|
||||
}
|
||||
DiagCode::OcrTesseractFailed => {
|
||||
println!(" Tesseract OCR failed");
|
||||
println!(" Tesseract crashed or returned an error.");
|
||||
}
|
||||
DiagCode::OcrBrokenVectorUnavailable => {
|
||||
println!(" OCR unavailable on broken-vector page");
|
||||
println!(" Build with --features ocr to enable OCR recovery.");
|
||||
}
|
||||
DiagCode::RemoteFetchInterrupted => {
|
||||
println!(" HTTP fetch interrupted or failed");
|
||||
println!(" Network error, timeout, or server error occurred.");
|
||||
}
|
||||
DiagCode::RemoteNoRangeSupport => {
|
||||
println!(" Server does not support Range requests");
|
||||
println!(" Falls back to downloading the entire file.");
|
||||
}
|
||||
DiagCode::RemoteTlsFailed => {
|
||||
println!(" TLS handshake failed");
|
||||
println!(" The TLS handshake failed; check the server's certificate.");
|
||||
}
|
||||
DiagCode::RemoteDnsFailed => {
|
||||
println!(" DNS resolution failed");
|
||||
println!(" The hostname could not be resolved.");
|
||||
}
|
||||
DiagCode::GstateStackOverflow => {
|
||||
println!(" Graphics state stack overflow");
|
||||
println!(" The graphics state stack exceeded the internal limit.");
|
||||
}
|
||||
DiagCode::GstateStackUnderflow => {
|
||||
println!(" Graphics state stack underflow");
|
||||
println!(" More Q operators than q operators in the content stream.");
|
||||
}
|
||||
DiagCode::GstateBtEtMismatch => {
|
||||
println!(" Mismatched BT/ET pair");
|
||||
println!(" The content stream has mismatched BT/ET operators.");
|
||||
}
|
||||
DiagCode::LayoutTaggedPdfDeferred => {
|
||||
println!(" Tagged PDF StructTree deferred");
|
||||
println!(" StructTree is ignored; XY-cut is used instead (Phase 7.1 pending).");
|
||||
}
|
||||
DiagCode::LayoutReadingOrderAmbiguous => {
|
||||
println!(" Reading order may be incorrect");
|
||||
println!(" The reading order algorithm detected ambiguity.");
|
||||
}
|
||||
DiagCode::LayoutLowReadability => {
|
||||
println!(" Low readability score");
|
||||
println!(" Page readability is below 0.85; may indicate mojibake.");
|
||||
}
|
||||
DiagCode::McpToolInvalidParams => {
|
||||
println!(" MCP tool call has invalid parameters");
|
||||
println!(" An MCP tool call doesn't match the tool's schema.");
|
||||
}
|
||||
DiagCode::McpPathTraversal => {
|
||||
println!(" MCP path traversal attempt");
|
||||
println!(" An MCP path escapes the --root directory.");
|
||||
}
|
||||
DiagCode::CacheEntryCorrupt => {
|
||||
println!(" Cache entry is corrupted");
|
||||
println!(" A cached entry failed to deserialize and was deleted.");
|
||||
}
|
||||
DiagCode::CacheWriteFailed => {
|
||||
println!(" Cache write failed");
|
||||
println!(" Writing to the cache failed (e.g., out of disk space).");
|
||||
}
|
||||
}
|
||||
|
||||
println!();
|
||||
println!("Suggested Action: {}", info.suggested_action);
|
||||
println!();
|
||||
println!("Phase Origin: {}", info.phase);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn cmd_compare(actual: PathBuf, expected: PathBuf, tolerances: Option<PathBuf>, format: &str) -> Result<()> {
|
||||
let actual_json = fs::read_to_string(&actual)
|
||||
.context(format!("Failed to read actual results from {:?}", actual))?;
|
||||
|
|
|
|||
|
|
@ -5,3 +5,4 @@
|
|||
# It is recommended to check this file in to source control so that
|
||||
# everyone who runs the test benefits from these saved cases.
|
||||
cc 9eb796a85e40a841d1cd43881214b688676e982ec812d8c66313ea753a019ec6 # shrinks to bytes = [123]
|
||||
cc e23be3e45757e93e13f0d3daf57c9fbce249a6629b9bfc8d0cb14ebf332767ae # shrinks to bytes = [41]
|
||||
|
|
|
|||
|
|
@ -383,6 +383,30 @@ pub enum DiagCode {
|
|||
/// Phase origin: 1.3
|
||||
XrefRemoteNoForwardScan,
|
||||
|
||||
/// Invalid xref stream format
|
||||
///
|
||||
/// Emitted when an xref stream has a malformed header, invalid /W array,
|
||||
/// or other format violations. The stream is skipped.
|
||||
///
|
||||
/// Phase origin: 1.3
|
||||
XrefInvalidStreamFormat,
|
||||
|
||||
/// Invalid xref stream entry
|
||||
///
|
||||
/// Emitted when an xref stream entry cannot be parsed due to invalid data
|
||||
/// in the stream's compressed entries section.
|
||||
///
|
||||
/// Phase origin: 1.3
|
||||
XrefInvalidStreamEntry,
|
||||
|
||||
/// Invalid /Prev offset in xref chain
|
||||
///
|
||||
/// Emitted when a trailer's /Prev offset points to invalid data (outside file,
|
||||
/// not at xref boundary, etc.). The chain is truncated at this point.
|
||||
///
|
||||
/// Phase origin: 1.3
|
||||
StructInvalidPrevOffset,
|
||||
|
||||
// === STREAM_* codes ===
|
||||
|
||||
/// Stream decompression failed (corrupt data)
|
||||
|
|
@ -687,7 +711,12 @@ impl DiagCode {
|
|||
| DiagCode::XrefTruncated
|
||||
| DiagCode::XrefRepaired
|
||||
| DiagCode::XrefLinearizedNoForwardScan
|
||||
| DiagCode::XrefRemoteNoForwardScan => "XREF",
|
||||
| DiagCode::XrefRemoteNoForwardScan
|
||||
| DiagCode::XrefInvalidStreamFormat
|
||||
| DiagCode::XrefInvalidStreamEntry => "XREF",
|
||||
|
||||
// STRUCT_* (continued)
|
||||
DiagCode::StructInvalidPrevOffset => "STRUCT",
|
||||
|
||||
// STREAM_*
|
||||
DiagCode::StreamDecodeError
|
||||
|
|
@ -774,6 +803,9 @@ impl DiagCode {
|
|||
DiagCode::XrefRepaired => "XREF_REPAIRED",
|
||||
DiagCode::XrefLinearizedNoForwardScan => "XREF_LINEARIZED_NO_FORWARD_SCAN",
|
||||
DiagCode::XrefRemoteNoForwardScan => "XREF_REMOTE_NO_FORWARD_SCAN",
|
||||
DiagCode::XrefInvalidStreamFormat => "XREF_INVALID_STREAM_FORMAT",
|
||||
DiagCode::XrefInvalidStreamEntry => "XREF_INVALID_STREAM_ENTRY",
|
||||
DiagCode::StructInvalidPrevOffset => "STRUCT_INVALID_PREV_OFFSET",
|
||||
DiagCode::StreamDecodeError => "STREAM_DECODE_ERROR",
|
||||
DiagCode::StreamBomb => "STREAM_BOMB",
|
||||
DiagCode::StreamUnknownFilter => "STREAM_UNKNOWN_FILTER",
|
||||
|
|
@ -836,6 +868,7 @@ impl DiagCode {
|
|||
| DiagCode::StructNonGotoOutline
|
||||
| DiagCode::StructInvalidPdfDocEncoding
|
||||
| DiagCode::StructHybridConflict
|
||||
| DiagCode::StructInvalidPrevOffset
|
||||
| DiagCode::XrefInvalidHeader
|
||||
| DiagCode::XrefInvalidEntry
|
||||
| DiagCode::XrefInvalidSubsectionHeader
|
||||
|
|
@ -844,6 +877,8 @@ impl DiagCode {
|
|||
| DiagCode::XrefTruncated
|
||||
| DiagCode::XrefLinearizedNoForwardScan
|
||||
| DiagCode::XrefRemoteNoForwardScan
|
||||
| DiagCode::XrefInvalidStreamFormat
|
||||
| DiagCode::XrefInvalidStreamEntry
|
||||
| DiagCode::StreamDecodeError
|
||||
| DiagCode::StreamUnknownFilter
|
||||
| DiagCode::StreamInvalidParams
|
||||
|
|
@ -1145,6 +1180,30 @@ pub const DIAGNOSTIC_CATALOG: &[DiagInfo] = &[
|
|||
phase: "1.3",
|
||||
suggested_action: "Forward scan is disabled for HTTP sources (would fetch entire file)",
|
||||
},
|
||||
DiagInfo {
|
||||
code: DiagCode::XrefInvalidStreamFormat,
|
||||
category: "XREF",
|
||||
severity: Severity::Warning,
|
||||
recoverable: true,
|
||||
phase: "1.3",
|
||||
suggested_action: "The xref stream has a malformed header or invalid /W array; the stream is skipped",
|
||||
},
|
||||
DiagInfo {
|
||||
code: DiagCode::XrefInvalidStreamEntry,
|
||||
category: "XREF",
|
||||
severity: Severity::Warning,
|
||||
recoverable: true,
|
||||
phase: "1.3",
|
||||
suggested_action: "An xref stream entry cannot be parsed due to invalid data",
|
||||
},
|
||||
DiagInfo {
|
||||
code: DiagCode::StructInvalidPrevOffset,
|
||||
category: "STRUCT",
|
||||
severity: Severity::Warning,
|
||||
recoverable: true,
|
||||
phase: "1.3",
|
||||
suggested_action: "A trailer's /Prev offset points to invalid data; the xref chain is truncated at this point",
|
||||
},
|
||||
// === STREAM_* codes ===
|
||||
DiagInfo {
|
||||
code: DiagCode::StreamDecodeError,
|
||||
|
|
|
|||
|
|
@ -783,7 +783,9 @@ mod tests {
|
|||
assert!(catalog.names_ref.is_none());
|
||||
assert!(catalog.metadata_ref.is_none());
|
||||
assert!(catalog.page_labels.is_none());
|
||||
assert!(catalog.oc_properties.is_none());
|
||||
// oc_properties is always Some; check present flag for absence
|
||||
assert!(catalog.oc_properties.is_some());
|
||||
assert!(!catalog.oc_properties.as_ref().unwrap().present);
|
||||
assert!(catalog.open_action.is_none());
|
||||
assert!(catalog.aa.is_none());
|
||||
assert!(catalog.version.is_none());
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
//! This module provides the lexer that converts raw PDF byte sequences into tokens.
|
||||
//! PDF is byte-oriented; position tracking is byte-level, not character-level.
|
||||
|
||||
use std::borrow::Cow;
|
||||
use crate::diagnostics::{Diagnostic as Diag, DiagCode};
|
||||
|
||||
/// Token produced by the PDF lexer.
|
||||
///
|
||||
|
|
@ -49,82 +49,6 @@ pub enum Token {
|
|||
Eof,
|
||||
}
|
||||
|
||||
/// Diagnostic code for lexer errors.
|
||||
///
|
||||
/// All lexer diagnostic codes use the `STRUCT_` prefix to indicate
|
||||
/// they relate to structural/lexical issues in the PDF document.
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub enum DiagCode {
|
||||
/// Invalid name character or malformed name
|
||||
StructInvalidName,
|
||||
/// Invalid hexadecimal character in hex string or name escape
|
||||
StructInvalidHex,
|
||||
/// Invalid octal escape sequence in literal string
|
||||
StructInvalidOctal,
|
||||
/// Invalid stream header (stream keyword not followed by proper newline)
|
||||
StructInvalidStreamHeader,
|
||||
/// Unexpected byte (e.g., stray `>` not part of `>>`)
|
||||
StructUnexpectedByte,
|
||||
/// Unexpected end of file while parsing a token
|
||||
StructUnexpectedEof,
|
||||
/// Unterminated literal string (missing closing paren)
|
||||
StructUnterminatedString,
|
||||
|
||||
// Object parser codes
|
||||
/// Dictionary nesting depth exceeds limit
|
||||
DepthExceeded,
|
||||
/// Missing required key in dictionary
|
||||
MissingKey,
|
||||
|
||||
// Object stream codes
|
||||
/// Invalid object stream format
|
||||
InvalidObjstm,
|
||||
/// Circular reference in /Extends chain
|
||||
CircularRef,
|
||||
/// Stream decompression failed
|
||||
DecompressionFailed,
|
||||
/// Decompression bomb limit exceeded
|
||||
StreamBomb,
|
||||
}
|
||||
|
||||
/// Diagnostic message emitted during lexing.
|
||||
///
|
||||
/// Diagnostics are accumulated during lexing and can be retrieved
|
||||
/// via `Lexer::take_diagnostics()`. They do not stop lexing; the
|
||||
/// lexer attempts recovery and continues.
|
||||
///
|
||||
/// Diagnostic messages use `Cow<'static, str>` so static error messages
|
||||
/// don't allocate. Dynamic messages (with formatting) allocate only when needed.
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub struct Diagnostic {
|
||||
/// The diagnostic code identifying the type of error
|
||||
pub code: DiagCode,
|
||||
/// Byte offset in the input where the error occurred
|
||||
pub byte_offset: u64,
|
||||
/// Human-readable error message
|
||||
pub msg: Cow<'static, str>,
|
||||
}
|
||||
|
||||
impl Diagnostic {
|
||||
/// Create a diagnostic with a static message (no allocation).
|
||||
fn with_static(code: DiagCode, byte_offset: u64, msg: &'static str) -> Self {
|
||||
Diagnostic {
|
||||
code,
|
||||
byte_offset,
|
||||
msg: Cow::Borrowed(msg),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a diagnostic with a dynamic message (allocates).
|
||||
fn with_dynamic(code: DiagCode, byte_offset: u64, msg: String) -> Self {
|
||||
Diagnostic {
|
||||
code,
|
||||
byte_offset,
|
||||
msg: Cow::Owned(msg),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// PDF lexical analyzer.
|
||||
///
|
||||
/// The lexer processes PDF byte sequences and produces tokens.
|
||||
|
|
@ -149,7 +73,7 @@ pub struct Lexer<'a> {
|
|||
/// Current byte position within the original input
|
||||
pos: usize,
|
||||
/// Accumulated diagnostics
|
||||
diagnostics: Vec<Diagnostic>,
|
||||
diagnostics: Vec<Diag>,
|
||||
/// Cached token for peek operations (token, position after token)
|
||||
peek_cache: Option<(Token, usize)>,
|
||||
/// Whether Eof has been returned
|
||||
|
|
@ -322,7 +246,7 @@ impl<'a> Lexer<'a> {
|
|||
/// let diags = lexer.take_diagnostics();
|
||||
/// assert!(diags.is_empty());
|
||||
/// ```
|
||||
pub fn take_diagnostics(&mut self) -> Vec<Diagnostic> {
|
||||
pub fn take_diagnostics(&mut self) -> Vec<Diag> {
|
||||
std::mem::take(&mut self.diagnostics)
|
||||
}
|
||||
|
||||
|
|
@ -387,6 +311,17 @@ impl<'a> Lexer<'a> {
|
|||
b'n' => self.lex_n_keyword(),
|
||||
b'x' => self.lex_x_keyword(),
|
||||
b'%' => self.lex_percent(),
|
||||
b'{' | b'}' => {
|
||||
// PDF 1.2 reserved these for future use; treat as unexpected bytes
|
||||
let pos = self.pos;
|
||||
self.diagnostics.push(Diag::with_dynamic(
|
||||
DiagCode::StructUnexpectedByte,
|
||||
pos as u64,
|
||||
format!("Unexpected byte: 0x{:02x}", next),
|
||||
));
|
||||
self.advance(1);
|
||||
Some(Token::Null)
|
||||
}
|
||||
_ => self.lex_keyword(),
|
||||
}
|
||||
}
|
||||
|
|
@ -601,7 +536,7 @@ impl<'a> Lexer<'a> {
|
|||
|
||||
if !has_digit {
|
||||
// Not a valid number, emit diagnostic and return null
|
||||
self.diagnostics.push(Diagnostic::with_static(
|
||||
self.diagnostics.push(Diag::with_static(
|
||||
DiagCode::StructUnexpectedEof,
|
||||
start as u64,
|
||||
"Invalid numeric literal",
|
||||
|
|
@ -710,7 +645,7 @@ impl<'a> Lexer<'a> {
|
|||
}
|
||||
|
||||
if value > 255 {
|
||||
self.diagnostics.push(Diagnostic::with_dynamic(
|
||||
self.diagnostics.push(Diag::with_dynamic(
|
||||
DiagCode::StructInvalidOctal,
|
||||
self.pos as u64,
|
||||
format!("Octal escape \\{:03o} exceeds 255, truncated", value),
|
||||
|
|
@ -738,7 +673,7 @@ impl<'a> Lexer<'a> {
|
|||
}
|
||||
|
||||
// Unterminated string
|
||||
self.diagnostics.push(Diagnostic::with_static(
|
||||
self.diagnostics.push(Diag::with_static(
|
||||
DiagCode::StructUnterminatedString,
|
||||
start as u64,
|
||||
"Unterminated literal string",
|
||||
|
|
@ -763,7 +698,7 @@ impl<'a> Lexer<'a> {
|
|||
|
||||
// Special check for NUL byte: it's whitespace per spec, but invalid in names
|
||||
if b == 0x00 {
|
||||
self.diagnostics.push(Diagnostic::with_static(
|
||||
self.diagnostics.push(Diag::with_static(
|
||||
DiagCode::StructInvalidName,
|
||||
self.pos as u64,
|
||||
"NUL byte in name is invalid per PDF spec",
|
||||
|
|
@ -796,7 +731,7 @@ impl<'a> Lexer<'a> {
|
|||
let decoded = (h << 4) | l;
|
||||
// Check if decoded byte is NUL
|
||||
if decoded == 0 {
|
||||
self.diagnostics.push(Diagnostic::with_static(
|
||||
self.diagnostics.push(Diag::with_static(
|
||||
DiagCode::StructInvalidName,
|
||||
self.pos as u64,
|
||||
"NUL byte in name is invalid per PDF spec",
|
||||
|
|
@ -810,7 +745,7 @@ impl<'a> Lexer<'a> {
|
|||
}
|
||||
_ => {
|
||||
// Invalid hex: emit diagnostic and treat # as literal
|
||||
self.diagnostics.push(Diagnostic::with_static(
|
||||
self.diagnostics.push(Diag::with_static(
|
||||
DiagCode::StructInvalidName,
|
||||
self.pos as u64,
|
||||
"Invalid hex escape sequence in name",
|
||||
|
|
@ -836,7 +771,7 @@ impl<'a> Lexer<'a> {
|
|||
|
||||
// Emit diagnostic if we hit the length limit
|
||||
if truncated_due_to_length || raw_consumed > MAX_RAW_BYTES {
|
||||
self.diagnostics.push(Diagnostic::with_static(
|
||||
self.diagnostics.push(Diag::with_static(
|
||||
DiagCode::StructInvalidName,
|
||||
start as u64,
|
||||
"Name exceeds 127-byte length limit",
|
||||
|
|
@ -845,7 +780,7 @@ impl<'a> Lexer<'a> {
|
|||
// Check if there's more input that we didn't consume
|
||||
if let Some(&b) = self.bytes.first() {
|
||||
if !Self::is_pdf_whitespace(b) && !Self::is_pdf_delimiter(b) {
|
||||
self.diagnostics.push(Diagnostic::with_static(
|
||||
self.diagnostics.push(Diag::with_static(
|
||||
DiagCode::StructInvalidName,
|
||||
start as u64,
|
||||
"Name exceeds 127-byte length limit",
|
||||
|
|
@ -910,7 +845,7 @@ impl<'a> Lexer<'a> {
|
|||
out.push(hi << 4);
|
||||
current_nibble = None;
|
||||
}
|
||||
self.diagnostics.push(Diagnostic::with_dynamic(
|
||||
self.diagnostics.push(Diag::with_dynamic(
|
||||
DiagCode::StructInvalidHex,
|
||||
self.pos as u64,
|
||||
format!("Invalid hex character '{}' (0x{:02x})", b as char, b),
|
||||
|
|
@ -920,7 +855,7 @@ impl<'a> Lexer<'a> {
|
|||
}
|
||||
|
||||
// EOF before >
|
||||
self.diagnostics.push(Diagnostic::with_static(
|
||||
self.diagnostics.push(Diag::with_static(
|
||||
DiagCode::StructUnterminatedString,
|
||||
start as u64,
|
||||
"Unterminated hex string",
|
||||
|
|
@ -950,7 +885,7 @@ impl<'a> Lexer<'a> {
|
|||
Some(Token::DictEnd)
|
||||
} else {
|
||||
// Stray > - emit diagnostic
|
||||
self.diagnostics.push(Diagnostic::with_static(
|
||||
self.diagnostics.push(Diag::with_static(
|
||||
DiagCode::StructUnexpectedByte,
|
||||
self.pos as u64,
|
||||
"Unexpected > character",
|
||||
|
|
@ -980,7 +915,7 @@ impl<'a> Lexer<'a> {
|
|||
self.advance(1); // consume the \n
|
||||
} else {
|
||||
// Lone \r - invalid
|
||||
self.diagnostics.push(Diagnostic::with_static(
|
||||
self.diagnostics.push(Diag::with_static(
|
||||
DiagCode::StructInvalidStreamHeader,
|
||||
start_pos as u64,
|
||||
"stream keyword must be followed by \\n or \\r\\n, not lone \\r",
|
||||
|
|
@ -988,7 +923,7 @@ impl<'a> Lexer<'a> {
|
|||
}
|
||||
} else {
|
||||
// No line ending at all - invalid
|
||||
self.diagnostics.push(Diagnostic::with_static(
|
||||
self.diagnostics.push(Diag::with_static(
|
||||
DiagCode::StructInvalidStreamHeader,
|
||||
start_pos as u64,
|
||||
"stream keyword must be followed by \\n or \\r\\n",
|
||||
|
|
@ -1071,7 +1006,7 @@ impl<'a> Lexer<'a> {
|
|||
fn lex_unknown(&mut self) -> Option<Token> {
|
||||
// Unknown character - skip it and emit diagnostic
|
||||
let pos = self.pos;
|
||||
self.diagnostics.push(Diagnostic::with_dynamic(
|
||||
self.diagnostics.push(Diag::with_dynamic(
|
||||
DiagCode::StructUnexpectedEof,
|
||||
pos as u64,
|
||||
format!("Unexpected byte: 0x{:02x}", self.bytes[0]),
|
||||
|
|
@ -1201,7 +1136,7 @@ mod tests {
|
|||
let diags = lexer.take_diagnostics();
|
||||
assert_eq!(diags.len(), 1);
|
||||
assert_eq!(diags[0].code, DiagCode::StructInvalidStreamHeader);
|
||||
assert!(diags[0].msg.contains("lone \\r"));
|
||||
assert!(diags[0].message.as_ref().contains("lone \\r"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -1358,7 +1293,7 @@ mod tests {
|
|||
let diags = lexer.take_diagnostics();
|
||||
assert_eq!(diags.len(), 1);
|
||||
assert_eq!(diags[0].code, DiagCode::StructInvalidOctal);
|
||||
assert!(diags[0].msg.contains("401"));
|
||||
assert!(diags[0].message.as_ref().contains("401"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -1477,8 +1412,8 @@ mod tests {
|
|||
assert_eq!(diags.len(), 1);
|
||||
assert_eq!(diags[0].code, DiagCode::StructInvalidHex);
|
||||
// Debug: print actual message
|
||||
eprintln!("Actual diagnostic message: {}", diags[0].msg);
|
||||
assert!(diags[0].msg.contains("Z"));
|
||||
eprintln!("Actual diagnostic message: {}", diags[0].message.as_ref());
|
||||
assert!(diags[0].message.as_ref().contains("Z"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -1489,7 +1424,7 @@ mod tests {
|
|||
let diags = lexer.take_diagnostics();
|
||||
assert_eq!(diags.len(), 1);
|
||||
assert_eq!(diags[0].code, DiagCode::StructUnterminatedString);
|
||||
assert!(diags[0].msg.contains("hex string"));
|
||||
assert!(diags[0].message.as_ref().contains("hex string"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -1772,7 +1707,7 @@ mod tests {
|
|||
let diags = lexer.take_diagnostics();
|
||||
assert_eq!(diags.len(), 1);
|
||||
assert_eq!(diags[0].code, DiagCode::StructInvalidName);
|
||||
assert!(diags[0].msg.contains("NUL"));
|
||||
assert!(diags[0].message.as_ref().contains("NUL"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -1801,7 +1736,7 @@ mod tests {
|
|||
let diags = lexer.take_diagnostics();
|
||||
assert_eq!(diags.len(), 1);
|
||||
assert_eq!(diags[0].code, DiagCode::StructInvalidName);
|
||||
assert!(diags[0].msg.contains("127"));
|
||||
assert!(diags[0].message.as_ref().contains("127"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -1873,7 +1808,7 @@ mod tests {
|
|||
let diags = lexer.take_diagnostics();
|
||||
assert_eq!(diags.len(), 1);
|
||||
assert_eq!(diags[0].code, DiagCode::StructInvalidName);
|
||||
assert!(diags[0].msg.contains("hex"));
|
||||
assert!(diags[0].message.as_ref().contains("hex"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
|
|
@ -20,9 +20,10 @@ pub use crate::diagnostics::{Diagnostic, Severity, DiagCode, ObjRef};
|
|||
pub use object::{PdfObject};
|
||||
pub use objstm::{ObjectStmParser, ObjStmCacheEntry, ObjStmResult, ObjStmError};
|
||||
pub use xref::{
|
||||
XrefResolver, XrefEntry, ResolveError, ResolveResult, XrefSection, XrefDiagnostic, XrefDiagCode,
|
||||
XrefResolver, XrefEntry, ResolveError, ResolveResult, XrefSection,
|
||||
parse_traditional_xref, parse_xref_stream, merge_hybrid, is_hybrid_trailer,
|
||||
LinearizationInfo, detect_linearization, load_xref_linearized, merge_linearized_xrefs,
|
||||
load_xref_with_prev_chain,
|
||||
};
|
||||
pub use catalog::{Catalog, MarkInfo, PageLabel, PageLabelsTree, PageLabelStyle, parse_catalog};
|
||||
pub use ocg::{OcProperties, OcGroup, Ocmd, OcmdPolicy, BaseState, parse_oc_properties};
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@
|
|||
|
||||
use super::types::{intern, ObjRef, PdfDict, PdfObject, PdfStream, PdfIndirect};
|
||||
use crate::parser::lexer::{Lexer, Token};
|
||||
use crate::parser::diagnostic::{Diagnostic, DiagCode};
|
||||
use crate::diagnostics::{Diagnostic as Diag, DiagCode};
|
||||
|
||||
/// Maximum nesting depth for dictionaries and arrays.
|
||||
///
|
||||
|
|
@ -21,7 +21,7 @@ pub struct ObjectParser<'a> {
|
|||
/// The lexer that provides tokens
|
||||
lexer: Lexer<'a>,
|
||||
/// Accumulated diagnostics
|
||||
diagnostics: Vec<Diagnostic>,
|
||||
diagnostics: Vec<Diag>,
|
||||
/// Current nesting depth (for depth limit enforcement)
|
||||
depth: u16,
|
||||
}
|
||||
|
|
@ -50,7 +50,7 @@ impl<'a> ObjectParser<'a> {
|
|||
}
|
||||
|
||||
/// Take all accumulated diagnostics.
|
||||
pub fn take_diagnostics(&mut self) -> Vec<Diagnostic> {
|
||||
pub fn take_diagnostics(&mut self) -> Vec<Diag> {
|
||||
std::mem::take(&mut self.diagnostics)
|
||||
}
|
||||
|
||||
|
|
@ -93,8 +93,8 @@ impl<'a> ObjectParser<'a> {
|
|||
Token::Eof => None,
|
||||
_ => {
|
||||
// Unexpected token - emit diagnostic and return null
|
||||
self.diagnostics.push(Diagnostic::warning(
|
||||
"1.2",
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructUnexpectedByte,
|
||||
format!("Unexpected token: {:?}", token),
|
||||
));
|
||||
Some(PdfObject::Null)
|
||||
|
|
@ -119,8 +119,8 @@ impl<'a> ObjectParser<'a> {
|
|||
|
||||
// Validate object and generation numbers are non-negative
|
||||
if first_int < 0 || gen < 0 {
|
||||
self.diagnostics.push(Diagnostic::warning(
|
||||
"1.2",
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructInvalidIndirectHeader,
|
||||
format!("Invalid indirect reference: {} {} R", first_int, gen),
|
||||
));
|
||||
return Some(PdfObject::Null);
|
||||
|
|
@ -141,9 +141,9 @@ impl<'a> ObjectParser<'a> {
|
|||
fn parse_array(&mut self) -> Option<PdfObject> {
|
||||
// Check depth limit
|
||||
if self.depth >= MAX_DEPTH {
|
||||
self.diagnostics.push(Diagnostic::error(
|
||||
"1.2",
|
||||
format!("STRUCT_DEPTH_EXCEEDED: Array nesting depth exceeds limit of {}", MAX_DEPTH),
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructDepthExceeded,
|
||||
format!("Array nesting depth exceeds limit of {}", MAX_DEPTH),
|
||||
));
|
||||
// Skip to matching closing bracket
|
||||
self.skip_to_array_end();
|
||||
|
|
@ -199,9 +199,8 @@ impl<'a> ObjectParser<'a> {
|
|||
fn parse_dict(&mut self) -> Option<PdfObject> {
|
||||
// Check depth limit
|
||||
if self.depth >= MAX_DEPTH {
|
||||
self.diagnostics.push(Diagnostic::error_with_code(
|
||||
DiagCode::DepthExceeded,
|
||||
"1.2",
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructDepthExceeded,
|
||||
format!("Dictionary nesting depth exceeds limit of {}", MAX_DEPTH),
|
||||
));
|
||||
self.skip_to_dict_end();
|
||||
|
|
@ -232,9 +231,9 @@ impl<'a> ObjectParser<'a> {
|
|||
match self.lexer.peek_token() {
|
||||
Some(Token::DictEnd) | Some(Token::Eof) => {
|
||||
// Missing value - insert PdfNull
|
||||
self.diagnostics.push(Diagnostic::warning(
|
||||
"1.2",
|
||||
format!("STRUCT_INVALID_DICT_VALUE: Dictionary key '{}' has no value, inserting null", key),
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructInvalidDictValue,
|
||||
format!("Dictionary key '{}' has no value, inserting null", key),
|
||||
));
|
||||
dict.insert(key, PdfObject::Null);
|
||||
break; // End of dict
|
||||
|
|
@ -253,9 +252,9 @@ impl<'a> ObjectParser<'a> {
|
|||
}
|
||||
_ => {
|
||||
// Invalid key - not a name
|
||||
self.diagnostics.push(Diagnostic::warning(
|
||||
"1.2",
|
||||
format!("STRUCT_INVALID_DICT_KEY: Dictionary key is not a name object, skipping"),
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructInvalidDictKey,
|
||||
"Dictionary key is not a name object, skipping".to_string(),
|
||||
));
|
||||
// Skip the invalid token and the next token (would-be value)
|
||||
let _ = self.lexer.next_token();
|
||||
|
|
@ -314,9 +313,9 @@ impl<'a> ObjectParser<'a> {
|
|||
let len_usize = len as usize;
|
||||
let actual_skipped = self.lexer.skip_bytes(len);
|
||||
if actual_skipped < len_usize {
|
||||
self.diagnostics.push(Diagnostic::error(
|
||||
"1.2",
|
||||
format!("STRUCT_TRUNCATED_STREAM: Stream truncated at EOF: expected {} bytes, got {}", len, actual_skipped),
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructUnexpectedEof,
|
||||
format!("Stream truncated at EOF: expected {} bytes, got {}", len, actual_skipped),
|
||||
));
|
||||
}
|
||||
} else {
|
||||
|
|
@ -330,24 +329,24 @@ impl<'a> ObjectParser<'a> {
|
|||
// Normal case - stream properly terminated
|
||||
}
|
||||
Some(Token::Eof) => {
|
||||
self.diagnostics.push(Diagnostic::error(
|
||||
"1.2",
|
||||
"STRUCT_TRUNCATED_STREAM: Stream truncated at EOF, missing endstream keyword",
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructUnexpectedEof,
|
||||
"Stream truncated at EOF, missing endstream keyword".to_string(),
|
||||
));
|
||||
}
|
||||
Some(other) => {
|
||||
self.diagnostics.push(Diagnostic::warning(
|
||||
"1.2",
|
||||
format!("STRUCT_MISSING_KEY: Expected endstream keyword after stream body, found {:?}", other),
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructUnexpectedByte,
|
||||
format!("Expected endstream keyword after stream body, found {:?}", other),
|
||||
));
|
||||
// Try to recover by scanning forward for EndStream
|
||||
self.scan_to_endstream();
|
||||
}
|
||||
None => {
|
||||
// Shouldn't happen, but handle gracefully
|
||||
self.diagnostics.push(Diagnostic::error(
|
||||
"1.2",
|
||||
"Unexpected None after skipping stream body",
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructUnexpectedEof,
|
||||
"Unexpected None after skipping stream body".to_string(),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
|
@ -420,15 +419,15 @@ impl<'a> ObjectParser<'a> {
|
|||
Token::Integer(n) => {
|
||||
// Check for overflow
|
||||
if n > u32::MAX as i64 {
|
||||
self.diagnostics.push(Diagnostic::warning(
|
||||
"1.2",
|
||||
format!("STRUCT_INTEGER_OVERFLOW: Object number {} exceeds u32::MAX, clamping", n),
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructIntegerOverflow,
|
||||
format!("Object number {} exceeds u32::MAX, clamping", n),
|
||||
));
|
||||
u32::MAX
|
||||
} else if n < 0 {
|
||||
self.diagnostics.push(Diagnostic::warning(
|
||||
"1.2",
|
||||
format!("STRUCT_INVALID_INDIRECT_HEADER: Negative object number {}", n),
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructInvalidIndirectHeader,
|
||||
format!("Negative object number {}", n),
|
||||
));
|
||||
// Recover by scanning forward to next obj keyword
|
||||
self.scan_to_next_obj();
|
||||
|
|
@ -439,9 +438,9 @@ impl<'a> ObjectParser<'a> {
|
|||
}
|
||||
_ => {
|
||||
// Not an integer - emit diagnostic and recover
|
||||
self.diagnostics.push(Diagnostic::warning(
|
||||
"1.2",
|
||||
format!("STRUCT_INVALID_INDIRECT_HEADER: Expected object number, found {:?}", token1),
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructInvalidIndirectHeader,
|
||||
format!("Expected object number, found {:?}", token1),
|
||||
));
|
||||
self.scan_to_next_obj();
|
||||
return None;
|
||||
|
|
@ -454,15 +453,15 @@ impl<'a> ObjectParser<'a> {
|
|||
Token::Integer(g) => {
|
||||
// Check for overflow
|
||||
if g > u16::MAX as i64 {
|
||||
self.diagnostics.push(Diagnostic::warning(
|
||||
"1.2",
|
||||
format!("STRUCT_INTEGER_OVERFLOW: Generation number {} exceeds u16::MAX, clamping", g),
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructIntegerOverflow,
|
||||
format!("Generation number {} exceeds u16::MAX, clamping", g),
|
||||
));
|
||||
u16::MAX
|
||||
} else if g < 0 {
|
||||
self.diagnostics.push(Diagnostic::warning(
|
||||
"1.2",
|
||||
format!("STRUCT_INVALID_INDIRECT_HEADER: Negative generation number {}", g),
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructInvalidIndirectHeader,
|
||||
format!("Negative generation number {}", g),
|
||||
));
|
||||
self.scan_to_next_obj();
|
||||
return None;
|
||||
|
|
@ -472,9 +471,9 @@ impl<'a> ObjectParser<'a> {
|
|||
}
|
||||
_ => {
|
||||
// Not an integer - emit diagnostic and recover
|
||||
self.diagnostics.push(Diagnostic::warning(
|
||||
"1.2",
|
||||
format!("STRUCT_INVALID_INDIRECT_HEADER: Expected generation number, found {:?}", token2),
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructInvalidIndirectHeader,
|
||||
format!("Expected generation number, found {:?}", token2),
|
||||
));
|
||||
self.scan_to_next_obj();
|
||||
return None;
|
||||
|
|
@ -484,9 +483,9 @@ impl<'a> ObjectParser<'a> {
|
|||
// Read the third token (must be Obj)
|
||||
let token3 = self.lexer.next_token()?;
|
||||
if !matches!(token3, Token::Obj) {
|
||||
self.diagnostics.push(Diagnostic::warning(
|
||||
"1.2",
|
||||
format!("STRUCT_INVALID_INDIRECT_HEADER: Expected 'obj' keyword, found {:?}", token3),
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructInvalidIndirectHeader,
|
||||
format!("Expected 'obj' keyword, found {:?}", token3),
|
||||
));
|
||||
self.scan_to_next_obj();
|
||||
return None;
|
||||
|
|
@ -507,9 +506,9 @@ impl<'a> ObjectParser<'a> {
|
|||
Some(Token::Obj) => {
|
||||
// Found the start of the next indirect object before endobj
|
||||
// This means the current object is malformed
|
||||
self.diagnostics.push(Diagnostic::warning(
|
||||
"1.2",
|
||||
"STRUCT_MISSING_KEY: Missing 'endobj' before next indirect object".to_string(),
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructMissingKey,
|
||||
"Missing 'endobj' before next indirect object".to_string(),
|
||||
));
|
||||
// We're positioned at 'obj' but need to be at the object number
|
||||
// Scan forward to find the next integer (object number)
|
||||
|
|
@ -518,22 +517,22 @@ impl<'a> ObjectParser<'a> {
|
|||
Some(Token::Eof) => {
|
||||
// Consume the Eof
|
||||
let _ = self.lexer.next_token();
|
||||
self.diagnostics.push(Diagnostic::warning(
|
||||
"1.2",
|
||||
"STRUCT_MISSING_KEY: Missing 'endobj' at EOF".to_string(),
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructMissingKey,
|
||||
"Missing 'endobj' at EOF".to_string(),
|
||||
));
|
||||
}
|
||||
None => {
|
||||
self.diagnostics.push(Diagnostic::warning(
|
||||
"1.2",
|
||||
"STRUCT_MISSING_KEY: Missing 'endobj' at EOF".to_string(),
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructMissingKey,
|
||||
"Missing 'endobj' at EOF".to_string(),
|
||||
));
|
||||
}
|
||||
Some(_) => {
|
||||
// Some other token - scan for endobj or next obj
|
||||
self.diagnostics.push(Diagnostic::warning(
|
||||
"1.2",
|
||||
"STRUCT_MISSING_KEY: Expected 'endobj', scanning forward".to_string(),
|
||||
self.diagnostics.push(Diag::with_dynamic_no_offset(
|
||||
DiagCode::StructMissingKey,
|
||||
"Expected 'endobj', scanning forward".to_string(),
|
||||
));
|
||||
self.scan_to_endobj_or_obj();
|
||||
}
|
||||
|
|
@ -826,7 +825,7 @@ mod tests {
|
|||
assert_eq!(dict.len(), 1);
|
||||
assert_eq!(dict.get("Type"), Some(&PdfObject::Null));
|
||||
let diags = parser.take_diagnostics();
|
||||
assert!(diags.iter().any(|d| d.message.contains("STRUCT_INVALID_DICT_VALUE")));
|
||||
assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidDictValue));
|
||||
} else {
|
||||
panic!("Expected dict, got {:?}", obj);
|
||||
}
|
||||
|
|
@ -839,7 +838,7 @@ mod tests {
|
|||
if let Some(PdfObject::Dict(dict)) = obj {
|
||||
assert_eq!(dict.len(), 0);
|
||||
let diags = parser.take_diagnostics();
|
||||
assert!(diags.iter().any(|d| d.message.contains("STRUCT_INVALID_DICT_KEY")));
|
||||
assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidDictKey));
|
||||
} else {
|
||||
panic!("Expected dict, got {:?}", obj);
|
||||
}
|
||||
|
|
@ -926,7 +925,7 @@ mod tests {
|
|||
|
||||
// Should have emitted STRUCT_DEPTH_EXCEEDED diagnostic
|
||||
let diags = parser.take_diagnostics();
|
||||
assert!(diags.iter().any(|d| d.code == DiagCode::DepthExceeded));
|
||||
assert!(diags.iter().any(|d| d.code == DiagCode::StructDepthExceeded));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -951,7 +950,7 @@ mod tests {
|
|||
|
||||
// Should have emitted STRUCT_INVALID_DICT_VALUE diagnostic for missing value
|
||||
let diags = parser.take_diagnostics();
|
||||
assert!(diags.iter().any(|d| d.code == DiagCode::InvalidDictValue));
|
||||
assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidDictValue));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -962,7 +961,7 @@ mod tests {
|
|||
// Should return PdfNull with diagnostic
|
||||
assert_eq!(obj, Some(PdfObject::Null));
|
||||
let diags = parser.take_diagnostics();
|
||||
assert!(diags.iter().any(|d| d.code == DiagCode::StructUnexpectedEof));
|
||||
assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidIndirectHeader));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -1085,7 +1084,7 @@ mod tests {
|
|||
|
||||
// Should have emitted STRUCT_MISSING_KEY diagnostic
|
||||
let diags = parser.take_diagnostics();
|
||||
assert!(diags.iter().any(|d| d.message.contains("STRUCT_MISSING_KEY")));
|
||||
assert!(diags.iter().any(|d| d.code == DiagCode::StructMissingKey));
|
||||
|
||||
// Next parse should handle the second object
|
||||
let indirect2 = parser.parse_indirect_object();
|
||||
|
|
@ -1109,7 +1108,7 @@ mod tests {
|
|||
|
||||
// Should have emitted STRUCT_INTEGER_OVERFLOW diagnostic
|
||||
let diags = parser.take_diagnostics();
|
||||
assert!(diags.iter().any(|d| d.message.contains("STRUCT_INTEGER_OVERFLOW")));
|
||||
assert!(diags.iter().any(|d| d.code == DiagCode::StructIntegerOverflow));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -1124,7 +1123,7 @@ mod tests {
|
|||
|
||||
// Should have emitted STRUCT_INTEGER_OVERFLOW diagnostic
|
||||
let diags = parser.take_diagnostics();
|
||||
assert!(diags.iter().any(|d| d.message.contains("STRUCT_INTEGER_OVERFLOW")));
|
||||
assert!(diags.iter().any(|d| d.code == DiagCode::StructIntegerOverflow));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -1138,7 +1137,7 @@ mod tests {
|
|||
|
||||
// Should have emitted STRUCT_INVALID_INDIRECT_HEADER diagnostic
|
||||
let diags = parser.take_diagnostics();
|
||||
assert!(diags.iter().any(|d| d.message.contains("STRUCT_INVALID_INDIRECT_HEADER")));
|
||||
assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidIndirectHeader));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -1151,7 +1150,7 @@ mod tests {
|
|||
|
||||
// Should have emitted STRUCT_INVALID_INDIRECT_HEADER diagnostic
|
||||
let diags = parser.take_diagnostics();
|
||||
assert!(diags.iter().any(|d| d.message.contains("STRUCT_INVALID_INDIRECT_HEADER")));
|
||||
assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidIndirectHeader));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
|
|
@ -134,7 +134,7 @@ impl PdfStream {
|
|||
/// Returns None if no filter is present (raw stream).
|
||||
/// Filter names are returned without the leading slash (e.g., "FlateDecode", not "/FlateDecode").
|
||||
pub fn filter(&self) -> Option<Vec<String>> {
|
||||
let filter = self.dict.get("Filter")?;
|
||||
let filter = self.dict.get("/Filter")?;
|
||||
|
||||
Some(match filter {
|
||||
PdfObject::Name(name) => {
|
||||
|
|
@ -168,7 +168,7 @@ impl PdfStream {
|
|||
///
|
||||
/// Returns None if no parameters are present.
|
||||
pub fn decode_params(&self) -> Option<Vec<PdfObject>> {
|
||||
let params = self.dict.get("DecodeParms")?;
|
||||
let params = self.dict.get("/DecodeParms")?;
|
||||
|
||||
Some(match params {
|
||||
PdfObject::Dict(_) => vec![params.clone()],
|
||||
|
|
@ -181,7 +181,7 @@ impl PdfStream {
|
|||
///
|
||||
/// Returns the direct integer value, or None if /Length is indirect/missing.
|
||||
pub fn length(&self) -> Option<u64> {
|
||||
self.dict.get("Length")?.as_int().map(|i| i as u64)
|
||||
self.dict.get("/Length")?.as_int().map(|i| i as u64)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -214,27 +214,27 @@ fn decode_utf16be_raw(bytes: &[u8]) -> std::result::Result<String, ()> {
|
|||
///
|
||||
/// Returns true if:
|
||||
/// - Length is even
|
||||
/// - For any byte > 0x7F, the adjacent bytes are 0x00
|
||||
/// - Most high bytes (first byte of each pair) are 0x00
|
||||
///
|
||||
/// This detects UTF-16BE encoded ASCII text, where each ASCII character
|
||||
/// is stored as [0x00, char_code].
|
||||
fn looks_like_utf16be(bytes: &[u8]) -> bool {
|
||||
if bytes.len() < 2 || bytes.len() % 2 != 0 {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if high bytes are mostly zero (indicative of UTF-16BE ASCII text)
|
||||
let mut high_bytes_count = 0;
|
||||
let mut high_bytes_zero = 0;
|
||||
// Count how many high bytes are zero
|
||||
let mut zero_high_bytes = 0;
|
||||
let total_pairs = bytes.len() / 2;
|
||||
|
||||
for chunk in bytes.chunks_exact(2) {
|
||||
if chunk[0] > 0x7F || chunk[1] > 0x7F {
|
||||
high_bytes_count += 1;
|
||||
if chunk[0] == 0x00 {
|
||||
high_bytes_zero += 1;
|
||||
}
|
||||
zero_high_bytes += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// If we have non-ASCII bytes and most high bytes are zero, likely UTF-16BE
|
||||
high_bytes_count > 0 && high_bytes_zero >= high_bytes_count / 2
|
||||
// If most high bytes are zero (>= 75%), likely UTF-16BE
|
||||
zero_high_bytes >= total_pairs * 3 / 4
|
||||
}
|
||||
|
||||
/// Decode PDFDocEncoded string to UTF-8.
|
||||
|
|
@ -567,6 +567,13 @@ fn resolve_destination(
|
|||
}
|
||||
}
|
||||
(None, None)
|
||||
} else if dest_obj.as_name().is_some() || dest_obj.as_string().is_some() {
|
||||
// Named destination (name or string) - emit diagnostic and return None
|
||||
diagnostics.push(Diagnostic::with_static_no_offset(
|
||||
DiagCode::StructUnresolvedDestination,
|
||||
"STRUCT_UNRESOLVED_DESTINATION: Named destination not supported",
|
||||
));
|
||||
(None, None)
|
||||
} else {
|
||||
(None, None)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ use flate2::read::ZlibDecoder;
|
|||
use lzw::{MsbReader, Decoder, DecoderEarlyChange};
|
||||
use secrecy::SecretString;
|
||||
|
||||
use crate::parser::diagnostic::{Diagnostic, DiagCode};
|
||||
use crate::diagnostics::{Diagnostic, DiagCode};
|
||||
use crate::parser::object::{PdfObject, PdfStream};
|
||||
|
||||
/// Maximum number of filters allowed in a single stream's pipeline.
|
||||
|
|
@ -1863,8 +1863,10 @@ fn decode_stream_impl(
|
|||
let truncated = raw_bytes[..remaining.min(raw_bytes.len())].to_vec();
|
||||
return DecodeResult::with_diagnostic(
|
||||
truncated,
|
||||
Diagnostic::error("1.5",
|
||||
format!("STREAM_BOMB: Decompression bomb limit exceeded: {} bytes", opts.max_decompress_bytes))
|
||||
Diagnostic::with_dynamic_no_offset(
|
||||
DiagCode::StreamBomb,
|
||||
format!("Decompression bomb limit exceeded: {} bytes", opts.max_decompress_bytes)
|
||||
)
|
||||
);
|
||||
}
|
||||
*doc_decompress_counter += len;
|
||||
|
|
@ -1881,13 +1883,17 @@ fn decode_stream_impl(
|
|||
// Step 3: Get decode params (aligned with filters, may be shorter)
|
||||
let decode_params = stream.decode_params().unwrap_or_default();
|
||||
|
||||
// Validate /Filter and /DecodeParms array lengths match
|
||||
if !decode_params.is_empty() && decode_params.len() != filters.len() {
|
||||
// Validate /Filter and /DecodeParms array lengths
|
||||
// Per PDF spec, /DecodeParms can be shorter than /Filter (missing params are treated as null).
|
||||
// But /DecodeParms cannot be longer than /Filter.
|
||||
if decode_params.len() > filters.len() {
|
||||
return DecodeResult::with_diagnostic(
|
||||
raw_bytes,
|
||||
Diagnostic::error("1.5",
|
||||
format!("STRUCT_INVALID_FILTER_PARAMS: /Filter array length ({}) != /DecodeParms array length ({})",
|
||||
filters.len(), decode_params.len()))
|
||||
Diagnostic::with_dynamic_no_offset(
|
||||
DiagCode::StreamInvalidParams,
|
||||
format!("/DecodeParms array length ({}) > /Filter array length ({})",
|
||||
decode_params.len(), filters.len())
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -1918,9 +1924,8 @@ fn decode_stream_impl(
|
|||
Err(FilterError::EncryptionUnsupported) => {
|
||||
// Crypt filter with custom /Name - emit ENCRYPTION_UNSUPPORTED
|
||||
// and return empty bytes (stream is undecryptable)
|
||||
diagnostics.push(Diagnostic::error_with_code(
|
||||
diagnostics.push(Diagnostic::with_static_no_offset(
|
||||
DiagCode::EncryptionUnsupported,
|
||||
"1.5",
|
||||
"Crypt filter with custom /Name parameter is not supported",
|
||||
));
|
||||
return DecodeResult {
|
||||
|
|
@ -1928,7 +1933,7 @@ fn decode_stream_impl(
|
|||
diagnostics,
|
||||
};
|
||||
}
|
||||
Err(_) => {
|
||||
Err(e) => {
|
||||
// Hard error - return raw bytes for this filter
|
||||
break;
|
||||
}
|
||||
|
|
@ -1936,16 +1941,20 @@ fn decode_stream_impl(
|
|||
}
|
||||
None => {
|
||||
// Unknown filter - emit diagnostic and return current bytes (partial decode) per INV-8
|
||||
diagnostics.push(Diagnostic::warning("1.5",
|
||||
format!("STRUCT_UNKNOWN_FILTER: Unknown filter: {}, returning partial decode", filter_name)));
|
||||
diagnostics.push(Diagnostic::with_dynamic_no_offset(
|
||||
DiagCode::StreamUnknownFilter,
|
||||
format!("Unknown filter: {}, returning partial decode", filter_name)
|
||||
));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if bomb_limit_hit {
|
||||
diagnostics.push(Diagnostic::error("1.5",
|
||||
format!("STREAM_BOMB: Decompression bomb limit exceeded: {} bytes", opts.max_decompress_bytes)));
|
||||
diagnostics.push(Diagnostic::with_dynamic_no_offset(
|
||||
DiagCode::StreamBomb,
|
||||
format!("Decompression bomb limit exceeded: {} bytes", opts.max_decompress_bytes)
|
||||
));
|
||||
}
|
||||
|
||||
DecodeResult {
|
||||
|
|
|
|||
|
|
@ -7,9 +7,9 @@
|
|||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::borrow::Cow;
|
||||
use crate::parser::object::{ObjRef, PdfObject, PdfDict, PdfStream};
|
||||
use crate::parser::object::{ObjRef, PdfObject, PdfDict, PdfStream, ObjectParser};
|
||||
use crate::parser::stream::{PdfSource, MemorySource};
|
||||
use crate::diagnostics::{Diagnostic as Diag, DiagCode};
|
||||
|
||||
// Use memchr for SIMD-accelerated byte searching in forward_scan_xref
|
||||
use memchr::{memchr, memchr_iter};
|
||||
|
|
@ -51,74 +51,6 @@ pub enum XrefEntry {
|
|||
Compressed { obj_stm_nr: u32, index: u32 },
|
||||
}
|
||||
|
||||
/// Diagnostic codes for xref parsing.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum XrefDiagCode {
|
||||
/// Invalid xref keyword or header
|
||||
InvalidXrefHeader,
|
||||
/// Malformed xref entry (not 20 bytes, bad format)
|
||||
InvalidXrefEntry,
|
||||
/// Invalid subsection header (not "start count")
|
||||
InvalidSubsectionHeader,
|
||||
/// Object 0 is not free (violates PDF spec)
|
||||
ObjectZeroNotFree,
|
||||
/// Trailer dictionary not found or malformed
|
||||
TrailerNotFound,
|
||||
/// Truncated xref table (unexpected EOF)
|
||||
XrefTruncated,
|
||||
/// Forward scan recovered xref entries (EC-07 recovery)
|
||||
XrefRepaired,
|
||||
/// Forward scan disabled for remote sources (would fetch entire file)
|
||||
RemoteNoForwardScan,
|
||||
/// Forward scan disabled for linearized files (has partial leading xref)
|
||||
LinearizedNoForwardScan,
|
||||
/// Invalid xref stream entry (unknown type, malformed data)
|
||||
InvalidXrefStreamEntry,
|
||||
/// Invalid xref stream format (missing required key, bad /W array)
|
||||
InvalidXrefStreamFormat,
|
||||
/// Xref stream decompression failed
|
||||
XrefStreamDecompressionFailed,
|
||||
/// Hybrid xref conflict: traditional table and stream disagree on object state
|
||||
StructHybridConflict,
|
||||
/// Circular /Prev reference detected (incremental update cycle)
|
||||
StructCircularRef,
|
||||
/// /Prev chain depth exceeded (adversarial input or corrupted file)
|
||||
StructDepthExceeded,
|
||||
/// /Prev offset points beyond file size
|
||||
StructInvalidPrevOffset,
|
||||
}
|
||||
|
||||
/// A diagnostic message emitted during xref parsing.
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub struct XrefDiagnostic {
|
||||
/// The diagnostic code
|
||||
pub code: XrefDiagCode,
|
||||
/// Byte offset in the input where the error occurred
|
||||
pub byte_offset: u64,
|
||||
/// Human-readable error message
|
||||
pub msg: Cow<'static, str>,
|
||||
}
|
||||
|
||||
impl XrefDiagnostic {
|
||||
/// Create a diagnostic with a static message.
|
||||
fn with_static(code: XrefDiagCode, byte_offset: u64, msg: &'static str) -> Self {
|
||||
XrefDiagnostic {
|
||||
code,
|
||||
byte_offset,
|
||||
msg: Cow::Borrowed(msg),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a diagnostic with a dynamic message.
|
||||
fn with_dynamic(code: XrefDiagCode, byte_offset: u64, msg: String) -> Self {
|
||||
XrefDiagnostic {
|
||||
code,
|
||||
byte_offset,
|
||||
msg: Cow::Owned(msg),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Result of parsing a traditional xref table.
|
||||
///
|
||||
/// Contains the parsed xref entries and the trailer dictionary.
|
||||
|
|
@ -129,7 +61,7 @@ pub struct XrefSection {
|
|||
/// The trailer dictionary
|
||||
pub trailer: Option<PdfDict>,
|
||||
/// Diagnostics emitted during parsing
|
||||
pub diagnostics: Vec<XrefDiagnostic>,
|
||||
pub diagnostics: Vec<Diag>,
|
||||
/// Whether this xref section is from a hybrid file (traditional + stream merged)
|
||||
pub is_hybrid: bool,
|
||||
}
|
||||
|
|
@ -222,8 +154,8 @@ pub fn merge_hybrid(traditional: XrefSection, stream: XrefSection) -> XrefSectio
|
|||
let stream_is_inuse = matches!(stream_entry, XrefEntry::InUse { .. } | XrefEntry::Compressed { .. });
|
||||
|
||||
if trad_is_free && stream_is_inuse {
|
||||
result.diagnostics.push(XrefDiagnostic::with_dynamic(
|
||||
XrefDiagCode::StructHybridConflict,
|
||||
result.diagnostics.push(Diag::with_dynamic(
|
||||
DiagCode::StructHybridConflict,
|
||||
0,
|
||||
format!(
|
||||
"Object {}: traditional table marks as Free, stream marks as InUse; traditional wins (object is Free)",
|
||||
|
|
@ -446,8 +378,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
|
|||
let header_bytes = match source.read_at(pos, 1024) {
|
||||
Ok(bytes) if !bytes.is_empty() => bytes,
|
||||
_ => {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::XrefTruncated,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefTruncated,
|
||||
pos,
|
||||
"Failed to read xref header",
|
||||
));
|
||||
|
|
@ -461,8 +393,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
|
|||
let header_str = match std::str::from_utf8(&header_bytes) {
|
||||
Ok(s) => s,
|
||||
Err(_) => {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::InvalidXrefHeader,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefInvalidHeader,
|
||||
pos,
|
||||
"Invalid UTF-8 in xref header",
|
||||
));
|
||||
|
|
@ -478,8 +410,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
|
|||
// Found it! ws_offset is the position of "xref" in header_bytes
|
||||
break ws_offset;
|
||||
} else {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::InvalidXrefHeader,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefInvalidHeader,
|
||||
pos,
|
||||
"xref keyword not found",
|
||||
));
|
||||
|
|
@ -522,8 +454,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
|
|||
let chunk_str = match std::str::from_utf8(&chunk_bytes) {
|
||||
Ok(s) => s,
|
||||
Err(_) => {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::XrefTruncated,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefTruncated,
|
||||
pos,
|
||||
"Invalid UTF-8 in xref data",
|
||||
));
|
||||
|
|
@ -547,8 +479,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
|
|||
let header_line = match read_line_at(source, subsection_start) {
|
||||
Some(line) => line,
|
||||
None => {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::InvalidSubsectionHeader,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefInvalidSubsectionHeader,
|
||||
subsection_start,
|
||||
"Failed to read subsection header",
|
||||
));
|
||||
|
|
@ -558,8 +490,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
|
|||
|
||||
let header_parts: Vec<&str> = header_line.split_whitespace().collect();
|
||||
if header_parts.len() != 2 {
|
||||
result.diagnostics.push(XrefDiagnostic::with_dynamic(
|
||||
XrefDiagCode::InvalidSubsectionHeader,
|
||||
result.diagnostics.push(Diag::with_dynamic(
|
||||
DiagCode::XrefInvalidSubsectionHeader,
|
||||
subsection_start,
|
||||
format!("Invalid subsection header: {}", header_line),
|
||||
));
|
||||
|
|
@ -584,8 +516,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
|
|||
let obj_start: u32 = match header_parts[0].parse() {
|
||||
Ok(n) => n,
|
||||
Err(_) => {
|
||||
result.diagnostics.push(XrefDiagnostic::with_dynamic(
|
||||
XrefDiagCode::InvalidSubsectionHeader,
|
||||
result.diagnostics.push(Diag::with_dynamic(
|
||||
DiagCode::XrefInvalidSubsectionHeader,
|
||||
subsection_start,
|
||||
format!("Invalid subsection start: {}", header_parts[0]),
|
||||
));
|
||||
|
|
@ -597,8 +529,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
|
|||
let obj_count: u32 = match header_parts[1].parse() {
|
||||
Ok(n) => n,
|
||||
Err(_) => {
|
||||
result.diagnostics.push(XrefDiagnostic::with_dynamic(
|
||||
XrefDiagCode::InvalidSubsectionHeader,
|
||||
result.diagnostics.push(Diag::with_dynamic(
|
||||
DiagCode::XrefInvalidSubsectionHeader,
|
||||
subsection_start,
|
||||
format!("Invalid subsection count: {}", header_parts[1]),
|
||||
));
|
||||
|
|
@ -635,8 +567,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
|
|||
let entry_bytes = match source.read_at(pos, 20) {
|
||||
Ok(bytes) => bytes,
|
||||
_ => {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::XrefTruncated,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefTruncated,
|
||||
pos,
|
||||
"Failed to read xref entry",
|
||||
));
|
||||
|
|
@ -646,8 +578,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
|
|||
|
||||
if entry_bytes.len() < 19 {
|
||||
// Definitely truncated
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::XrefTruncated,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefTruncated,
|
||||
pos,
|
||||
"Xref entry truncated (< 19 bytes)",
|
||||
));
|
||||
|
|
@ -668,18 +600,16 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
|
|||
// Object 0 must be free (PDF spec requirement)
|
||||
if obj_nr == 0 {
|
||||
if let XrefEntry::InUse { .. } = entry {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::ObjectZeroNotFree,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefObjectZeroNotFree,
|
||||
entry_start,
|
||||
"Object 0 is not free (violates PDF spec)",
|
||||
));
|
||||
}
|
||||
}
|
||||
// Only add in-use entries to the result
|
||||
// Free entries are ignored per pdftract spec (they don't resolve to objects)
|
||||
if matches!(entry, XrefEntry::InUse { .. }) {
|
||||
// Add all entries to the result (both InUse and Free)
|
||||
// Free entries are needed for /Prev chain merge semantics to track object lifecycle
|
||||
result.add_entry(obj_nr, entry);
|
||||
}
|
||||
pos += stride as u64;
|
||||
entries_parsed += 1;
|
||||
}
|
||||
|
|
@ -699,8 +629,8 @@ pub fn parse_traditional_xref(source: &dyn PdfSource, start_offset: u64) -> Xref
|
|||
|
||||
// If we exited the loop without finding a trailer, emit a diagnostic
|
||||
if !trailer_found {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::TrailerNotFound,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefTrailerNotFound,
|
||||
pos,
|
||||
"Trailer dictionary not found (xref table may be truncated)",
|
||||
));
|
||||
|
|
@ -717,7 +647,7 @@ fn parse_xref_entry(
|
|||
obj_nr: u32,
|
||||
offset: u64,
|
||||
stride: usize,
|
||||
diagnostics: &mut Vec<XrefDiagnostic>,
|
||||
diagnostics: &mut Vec<Diag>,
|
||||
) -> Option<(u32, XrefEntry)> {
|
||||
if bytes.len() != stride {
|
||||
return None;
|
||||
|
|
@ -727,8 +657,8 @@ fn parse_xref_entry(
|
|||
let entry_str = match std::str::from_utf8(bytes) {
|
||||
Ok(s) => s,
|
||||
Err(_) => {
|
||||
diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::InvalidXrefEntry,
|
||||
diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefInvalidEntry,
|
||||
offset,
|
||||
"Invalid UTF-8 in xref entry",
|
||||
));
|
||||
|
|
@ -739,8 +669,8 @@ fn parse_xref_entry(
|
|||
// Entry format: "offset/next_free generation f/n" with line ending
|
||||
let parts: Vec<&str> = entry_str.split_whitespace().collect();
|
||||
if parts.len() < 3 {
|
||||
diagnostics.push(XrefDiagnostic::with_dynamic(
|
||||
XrefDiagCode::InvalidXrefEntry,
|
||||
diagnostics.push(Diag::with_dynamic(
|
||||
DiagCode::XrefInvalidEntry,
|
||||
offset,
|
||||
format!("Malformed xref entry: {}", entry_str.trim()),
|
||||
));
|
||||
|
|
@ -750,8 +680,8 @@ fn parse_xref_entry(
|
|||
let first_field: u64 = match parts[0].parse() {
|
||||
Ok(n) => n,
|
||||
Err(_) => {
|
||||
diagnostics.push(XrefDiagnostic::with_dynamic(
|
||||
XrefDiagCode::InvalidXrefEntry,
|
||||
diagnostics.push(Diag::with_dynamic(
|
||||
DiagCode::XrefInvalidEntry,
|
||||
offset,
|
||||
format!("Invalid offset/next_free: {}", parts[0]),
|
||||
));
|
||||
|
|
@ -762,8 +692,8 @@ fn parse_xref_entry(
|
|||
let gen_nr: u16 = match parts[1].parse() {
|
||||
Ok(n) => n,
|
||||
Err(_) => {
|
||||
diagnostics.push(XrefDiagnostic::with_dynamic(
|
||||
XrefDiagCode::InvalidXrefEntry,
|
||||
diagnostics.push(Diag::with_dynamic(
|
||||
DiagCode::XrefInvalidEntry,
|
||||
offset,
|
||||
format!("Invalid generation: {}", parts[1]),
|
||||
));
|
||||
|
|
@ -776,8 +706,8 @@ fn parse_xref_entry(
|
|||
Some('n') | Some('N') => Some((obj_nr, XrefEntry::InUse { offset: first_field, gen_nr })),
|
||||
Some('f') | Some('F') => Some((obj_nr, XrefEntry::Free { next_free: first_field as u32, gen_nr })),
|
||||
_ => {
|
||||
diagnostics.push(XrefDiagnostic::with_dynamic(
|
||||
XrefDiagCode::InvalidXrefEntry,
|
||||
diagnostics.push(Diag::with_dynamic(
|
||||
DiagCode::XrefInvalidEntry,
|
||||
offset,
|
||||
format!("Invalid entry type: {}", parts[2]),
|
||||
));
|
||||
|
|
@ -842,7 +772,7 @@ fn read_line_at(source: &dyn PdfSource, mut pos: u64) -> Option<String> {
|
|||
fn read_line(
|
||||
source: &dyn PdfSource,
|
||||
pos: &mut u64,
|
||||
diagnostics: &mut Vec<XrefDiagnostic>,
|
||||
diagnostics: &mut Vec<Diag>,
|
||||
) -> Option<String> {
|
||||
let line = read_line_at(source, *pos)?;
|
||||
// Advance position past the line (including line ending)
|
||||
|
|
@ -865,26 +795,30 @@ fn read_line(
|
|||
|
||||
/// Parse the trailer dictionary.
|
||||
///
|
||||
/// This is a simplified implementation that reads until the end of the
|
||||
/// dictionary (>>) and returns a placeholder dict object.
|
||||
/// The full implementation will use the object parser from Phase 1.2.
|
||||
/// Parse the trailer dictionary from the xref trailer section.
|
||||
///
|
||||
/// This function extracts the trailer dictionary bytes and parses them
|
||||
/// using the object parser to get the actual key-value pairs.
|
||||
fn parse_trailer_dict(
|
||||
source: &dyn PdfSource,
|
||||
pos: &mut u64,
|
||||
diagnostics: &mut Vec<XrefDiagnostic>,
|
||||
diagnostics: &mut Vec<Diag>,
|
||||
) -> Option<PdfDict> {
|
||||
// Skip whitespace before <<
|
||||
let mut seen_bracket = false;
|
||||
let mut depth = 0;
|
||||
let mut chunk_pos = 0u64;
|
||||
let dict_start_offset = *pos;
|
||||
let mut dict_end_offset = None;
|
||||
|
||||
// First, find the extent of the trailer dict (from << to >>)
|
||||
loop {
|
||||
let chunk = match source.read_at(*pos + chunk_pos, 1024) {
|
||||
let chunk = match source.read_at(dict_start_offset + chunk_pos, 4096) {
|
||||
Ok(bytes) => bytes,
|
||||
Err(_) => {
|
||||
diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::TrailerNotFound,
|
||||
*pos,
|
||||
diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefTrailerNotFound,
|
||||
dict_start_offset,
|
||||
"I/O error reading trailer",
|
||||
));
|
||||
return None;
|
||||
|
|
@ -914,8 +848,10 @@ fn parse_trailer_dict(
|
|||
if j + 1 < remaining.len() && remaining[j + 1] == b'>' {
|
||||
depth -= 1;
|
||||
if depth == 0 {
|
||||
*pos += chunk_pos + j as u64 + 2;
|
||||
return Some(PdfDict::new());
|
||||
// Found the end of the dict
|
||||
let end_offset = dict_start_offset + chunk_pos + j as u64 + 2;
|
||||
dict_end_offset = Some(end_offset);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -927,25 +863,74 @@ fn parse_trailer_dict(
|
|||
}
|
||||
}
|
||||
|
||||
if dict_end_offset.is_some() {
|
||||
break;
|
||||
}
|
||||
|
||||
chunk_pos += chunk.len() as u64;
|
||||
|
||||
// Safety limit
|
||||
if chunk_pos > 100000 {
|
||||
diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::TrailerNotFound,
|
||||
*pos,
|
||||
diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefTrailerNotFound,
|
||||
dict_start_offset,
|
||||
"Trailer dictionary too large or unterminated",
|
||||
));
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::TrailerNotFound,
|
||||
*pos,
|
||||
"Trailer dictionary not found",
|
||||
// If we didn't find the end, return None
|
||||
let dict_end_offset = match dict_end_offset {
|
||||
Some(offset) => offset,
|
||||
None => {
|
||||
diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefTrailerNotFound,
|
||||
dict_start_offset,
|
||||
"Trailer dictionary not found (no << >> markers)",
|
||||
));
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
// Read the full dict bytes and parse them
|
||||
let dict_len = (dict_end_offset - dict_start_offset) as usize;
|
||||
let dict_bytes = match source.read_at(dict_start_offset, dict_len) {
|
||||
Ok(bytes) => bytes,
|
||||
Err(_) => {
|
||||
diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefTrailerNotFound,
|
||||
dict_start_offset,
|
||||
"Failed to read trailer dictionary bytes",
|
||||
));
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
// Parse the dict using ObjectParser
|
||||
let mut parser = ObjectParser::new(&dict_bytes);
|
||||
if let Some(PdfObject::Dict(dict)) = parser.parse_direct_object() {
|
||||
// Update pos to after the dict
|
||||
*pos = dict_end_offset;
|
||||
|
||||
// Transfer any diagnostics from the parser
|
||||
for diag in parser.take_diagnostics() {
|
||||
diagnostics.push(Diag::with_dynamic(
|
||||
DiagCode::XrefTrailerNotFound,
|
||||
dict_start_offset,
|
||||
diag.message.into_owned(),
|
||||
));
|
||||
}
|
||||
|
||||
Some(*dict)
|
||||
} else {
|
||||
diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefTrailerNotFound,
|
||||
dict_start_offset,
|
||||
"Failed to parse trailer dictionary as a dict object",
|
||||
));
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse a direct PDF object (for trailer dictionary parsing).
|
||||
|
|
@ -999,8 +984,8 @@ pub fn forward_scan_xref(source: &dyn PdfSource, is_linearized: bool) -> XrefSec
|
|||
|
||||
// Check for linearized file
|
||||
if is_linearized {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::LinearizedNoForwardScan,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefLinearizedNoForwardScan,
|
||||
0,
|
||||
"Forward scan disabled for linearized PDF (partial leading xref would cause false results)",
|
||||
));
|
||||
|
|
@ -1014,8 +999,8 @@ pub fn forward_scan_xref(source: &dyn PdfSource, is_linearized: bool) -> XrefSec
|
|||
let source_len = match source.len() {
|
||||
Ok(len) if len > 0 => len,
|
||||
_ => {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::XrefTruncated,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefTruncated,
|
||||
0,
|
||||
"Unable to determine source length for forward scan",
|
||||
));
|
||||
|
|
@ -1095,8 +1080,8 @@ pub fn forward_scan_xref(source: &dyn PdfSource, is_linearized: bool) -> XrefSec
|
|||
}
|
||||
|
||||
// Emit XREF_REPAIRED diagnostic with count
|
||||
result.diagnostics.push(XrefDiagnostic::with_dynamic(
|
||||
XrefDiagCode::XrefRepaired,
|
||||
result.diagnostics.push(Diag::with_dynamic(
|
||||
DiagCode::XrefRepaired,
|
||||
0,
|
||||
format!("Forward scan recovered {} object entries", entries_found),
|
||||
));
|
||||
|
|
@ -1162,8 +1147,8 @@ fn forward_scan_memory(data: &[u8], source_len: u64) -> XrefSection {
|
|||
}
|
||||
|
||||
// Emit XREF_REPAIRED diagnostic with count
|
||||
result.diagnostics.push(XrefDiagnostic::with_dynamic(
|
||||
XrefDiagCode::XrefRepaired,
|
||||
result.diagnostics.push(Diag::with_dynamic(
|
||||
DiagCode::XrefRepaired,
|
||||
0,
|
||||
format!("Forward scan recovered {} object entries", entries_found),
|
||||
));
|
||||
|
|
@ -1403,8 +1388,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
|
|||
let obj_bytes = match source.read_at(stream_obj_offset, 4096) {
|
||||
Ok(bytes) if !bytes.is_empty() => bytes,
|
||||
_ => {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::InvalidXrefStreamFormat,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefInvalidStreamFormat,
|
||||
stream_obj_offset,
|
||||
"Failed to read xref stream object",
|
||||
));
|
||||
|
|
@ -1416,8 +1401,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
|
|||
let indirect = match parser.parse_indirect_object() {
|
||||
Some(i) => i,
|
||||
None => {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::InvalidXrefStreamFormat,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefInvalidStreamFormat,
|
||||
stream_obj_offset,
|
||||
"Failed to parse xref stream as indirect object",
|
||||
));
|
||||
|
|
@ -1429,8 +1414,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
|
|||
let stream = match indirect.obj {
|
||||
PdfObject::Stream(s) => s,
|
||||
_ => {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::InvalidXrefStreamFormat,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefInvalidStreamFormat,
|
||||
stream_obj_offset,
|
||||
"Xref stream object is not a stream",
|
||||
));
|
||||
|
|
@ -1441,8 +1426,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
|
|||
// Check for /Type /XRef (optional per spec, but we validate it)
|
||||
if let Some(PdfObject::Name(type_name)) = stream.dict.get("Type") {
|
||||
if type_name.as_ref() != "/XRef" && type_name.as_ref() != "XRef" {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::InvalidXrefStreamFormat,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefInvalidStreamFormat,
|
||||
stream_obj_offset,
|
||||
"Stream /Type is not /XRef",
|
||||
));
|
||||
|
|
@ -1453,8 +1438,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
|
|||
let size = match stream.dict.get("Size") {
|
||||
Some(PdfObject::Integer(n)) if *n >= 0 => *n as u32,
|
||||
_ => {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::InvalidXrefStreamFormat,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefInvalidStreamFormat,
|
||||
stream_obj_offset,
|
||||
"Missing or invalid /Size in xref stream",
|
||||
));
|
||||
|
|
@ -1469,8 +1454,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
|
|||
.filter_map(|o| o.as_int())
|
||||
.collect();
|
||||
if widths.len() != 3 {
|
||||
result.diagnostics.push(XrefDiagnostic::with_dynamic(
|
||||
XrefDiagCode::InvalidXrefStreamFormat,
|
||||
result.diagnostics.push(Diag::with_dynamic(
|
||||
DiagCode::XrefInvalidStreamFormat,
|
||||
stream_obj_offset,
|
||||
format!("/W array must have 3 elements, got {}", widths.len()),
|
||||
));
|
||||
|
|
@ -1478,8 +1463,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
|
|||
}
|
||||
// Widths can be 0, but negative is invalid
|
||||
if widths.iter().any(|&w| w < 0) {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::InvalidXrefStreamFormat,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefInvalidStreamFormat,
|
||||
stream_obj_offset,
|
||||
"/W array contains negative values",
|
||||
));
|
||||
|
|
@ -1488,8 +1473,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
|
|||
widths
|
||||
}
|
||||
_ => {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::InvalidXrefStreamFormat,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefInvalidStreamFormat,
|
||||
stream_obj_offset,
|
||||
"Missing or invalid /W in xref stream",
|
||||
));
|
||||
|
|
@ -1512,8 +1497,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
|
|||
let first = match first_obj.as_int() {
|
||||
Some(n) if n >= 0 => n as u32,
|
||||
_ => {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::InvalidXrefStreamFormat,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefInvalidStreamFormat,
|
||||
stream_obj_offset,
|
||||
"Invalid /Index first value",
|
||||
));
|
||||
|
|
@ -1523,8 +1508,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
|
|||
let count = match iter.peek() {
|
||||
Some(PdfObject::Integer(n)) if *n >= 0 => *n as u32,
|
||||
_ => {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::InvalidXrefStreamFormat,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefInvalidStreamFormat,
|
||||
stream_obj_offset,
|
||||
"Invalid /Index count value",
|
||||
));
|
||||
|
|
@ -1535,8 +1520,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
|
|||
pairs.push((first, count));
|
||||
}
|
||||
if pairs.is_empty() {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::InvalidXrefStreamFormat,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefInvalidStreamFormat,
|
||||
stream_obj_offset,
|
||||
"/Index array is empty",
|
||||
));
|
||||
|
|
@ -1546,8 +1531,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
|
|||
}
|
||||
None => vec![(0, size)],
|
||||
_ => {
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::InvalidXrefStreamFormat,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::XrefInvalidStreamFormat,
|
||||
stream_obj_offset,
|
||||
"Invalid /Index in xref stream (not an array)",
|
||||
));
|
||||
|
|
@ -1582,8 +1567,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
|
|||
if decoded.is_empty() {
|
||||
// Check if this is a legitimate empty stream (no objects) or an error
|
||||
// A valid xref stream with no objects would have /Size 0, which is unusual
|
||||
result.diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::XrefStreamDecompressionFailed,
|
||||
result.diagnostics.push(Diag::with_static(
|
||||
DiagCode::StreamDecodeError,
|
||||
stream_obj_offset,
|
||||
"Xref stream decompression produced empty output",
|
||||
));
|
||||
|
|
@ -1600,8 +1585,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
|
|||
|
||||
// Check we have enough bytes for this entry
|
||||
if data_pos + entry_stride > decoded.len() {
|
||||
result.diagnostics.push(XrefDiagnostic::with_dynamic(
|
||||
XrefDiagCode::InvalidXrefStreamEntry,
|
||||
result.diagnostics.push(Diag::with_dynamic(
|
||||
DiagCode::XrefInvalidStreamEntry,
|
||||
stream_obj_offset,
|
||||
format!("Xref stream truncated at object {}", obj_nr),
|
||||
));
|
||||
|
|
@ -1657,8 +1642,8 @@ pub fn parse_xref_stream(source: &dyn PdfSource, stream_obj_offset: u64) -> Xref
|
|||
}
|
||||
_ => {
|
||||
// Unknown type - emit diagnostic and treat as free
|
||||
result.diagnostics.push(XrefDiagnostic::with_dynamic(
|
||||
XrefDiagCode::InvalidXrefStreamEntry,
|
||||
result.diagnostics.push(Diag::with_dynamic(
|
||||
DiagCode::XrefInvalidStreamEntry,
|
||||
stream_obj_offset,
|
||||
format!("Invalid xref entry type {} for object {}", entry_type, obj_nr),
|
||||
));
|
||||
|
|
@ -2105,12 +2090,12 @@ pub fn load_xref_with_prev_chain(source: &dyn PdfSource, start_offset: u64) -> X
|
|||
offset: u64,
|
||||
visited: &mut HashSet<u64>,
|
||||
depth: u32,
|
||||
diagnostics: &mut Vec<XrefDiagnostic>,
|
||||
diagnostics: &mut Vec<Diag>,
|
||||
) -> XrefSection {
|
||||
// Cycle detection
|
||||
if visited.contains(&offset) {
|
||||
diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::StructCircularRef,
|
||||
diagnostics.push(Diag::with_static(
|
||||
DiagCode::StructCircularRef,
|
||||
offset,
|
||||
"Circular /Prev reference detected; stopping chain traversal",
|
||||
));
|
||||
|
|
@ -2121,8 +2106,8 @@ pub fn load_xref_with_prev_chain(source: &dyn PdfSource, start_offset: u64) -> X
|
|||
|
||||
// Depth limit check
|
||||
if depth >= MAX_PREV_DEPTH {
|
||||
diagnostics.push(XrefDiagnostic::with_dynamic(
|
||||
XrefDiagCode::StructDepthExceeded,
|
||||
diagnostics.push(Diag::with_dynamic(
|
||||
DiagCode::StructDepthExceeded,
|
||||
offset,
|
||||
format!("/Prev chain depth exceeded maximum of {}", MAX_PREV_DEPTH).into(),
|
||||
));
|
||||
|
|
@ -2143,14 +2128,13 @@ pub fn load_xref_with_prev_chain(source: &dyn PdfSource, start_offset: u64) -> X
|
|||
})
|
||||
});
|
||||
|
||||
// Validate /Prev offset if present
|
||||
let mut should_follow_prev = false;
|
||||
// Validate /Prev offset and recursively load previous revision if present
|
||||
if let Some(prev) = prev_offset {
|
||||
match source.len() {
|
||||
Ok(file_size) if prev > file_size => {
|
||||
// /Prev points beyond file size - invalid
|
||||
diagnostics.push(XrefDiagnostic::with_dynamic(
|
||||
XrefDiagCode::StructInvalidPrevOffset,
|
||||
diagnostics.push(Diag::with_dynamic(
|
||||
DiagCode::StructInvalidPrevOffset,
|
||||
offset,
|
||||
format!("/Prev offset {} exceeds file size {}; ignoring /Prev key", prev, file_size).into(),
|
||||
));
|
||||
|
|
@ -2158,25 +2142,13 @@ pub fn load_xref_with_prev_chain(source: &dyn PdfSource, start_offset: u64) -> X
|
|||
if let Some(ref mut trailer) = current.trailer {
|
||||
trailer.shift_remove("Prev");
|
||||
}
|
||||
// Return current revision without following /Prev
|
||||
let mut result = current;
|
||||
result.diagnostics.extend(diagnostics.drain(..));
|
||||
return result;
|
||||
}
|
||||
Ok(_) => {
|
||||
// Valid /Prev offset
|
||||
should_follow_prev = true;
|
||||
}
|
||||
Err(_) => {
|
||||
// Can't determine file size - be conservative and don't follow
|
||||
diagnostics.push(XrefDiagnostic::with_static(
|
||||
XrefDiagCode::StructInvalidPrevOffset,
|
||||
offset,
|
||||
"Cannot determine file size; ignoring /Prev key",
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Recursively load previous revision if /Prev exists
|
||||
if should_follow_prev {
|
||||
let prev = prev_offset.unwrap(); // Safe because we checked should_follow_prev
|
||||
// Valid /Prev offset - recursively load
|
||||
let mut older = walk_chain(source, prev, visited, depth + 1, diagnostics);
|
||||
|
||||
// Merge: older entries first, then current (newer) entries override
|
||||
|
|
@ -2200,10 +2172,26 @@ pub fn load_xref_with_prev_chain(source: &dyn PdfSource, start_offset: u64) -> X
|
|||
older.diagnostics.extend(diagnostics.drain(..));
|
||||
|
||||
older
|
||||
}
|
||||
Err(_) => {
|
||||
// Can't determine file size - be conservative and don't follow
|
||||
diagnostics.push(Diag::with_static(
|
||||
DiagCode::StructInvalidPrevOffset,
|
||||
offset,
|
||||
"Cannot determine file size; ignoring /Prev key",
|
||||
));
|
||||
// Return current revision without following /Prev
|
||||
let mut result = current;
|
||||
result.diagnostics.extend(diagnostics.drain(..));
|
||||
result
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// No /Prev - this is the baseline (original) revision
|
||||
// Return current as-is
|
||||
current
|
||||
// Return current with any diagnostics from this level
|
||||
let mut result = current;
|
||||
result.diagnostics.extend(diagnostics.drain(..));
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -2341,26 +2329,26 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_xref_diagnostic_static() {
|
||||
let diag = XrefDiagnostic::with_static(
|
||||
XrefDiagCode::InvalidXrefHeader,
|
||||
let diag = Diag::with_static(
|
||||
DiagCode::XrefInvalidHeader,
|
||||
100,
|
||||
"test message",
|
||||
);
|
||||
assert_eq!(diag.byte_offset, 100);
|
||||
assert_eq!(diag.msg.as_ref(), "test message");
|
||||
assert!(matches!(diag.code, XrefDiagCode::InvalidXrefHeader));
|
||||
assert_eq!(diag.byte_offset, Some(100));
|
||||
assert_eq!(diag.message.as_ref(), "test message");
|
||||
assert!(matches!(diag.code, DiagCode::XrefInvalidHeader));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_xref_diagnostic_dynamic() {
|
||||
let diag = XrefDiagnostic::with_dynamic(
|
||||
XrefDiagCode::InvalidXrefEntry,
|
||||
let diag = Diag::with_dynamic(
|
||||
DiagCode::XrefInvalidEntry,
|
||||
200,
|
||||
"dynamic message".to_string(),
|
||||
);
|
||||
assert_eq!(diag.byte_offset, 200);
|
||||
assert_eq!(diag.msg.as_ref(), "dynamic message");
|
||||
assert!(matches!(diag.code, XrefDiagCode::InvalidXrefEntry));
|
||||
assert_eq!(diag.byte_offset, Some(200));
|
||||
assert_eq!(diag.message.as_ref(), "dynamic message");
|
||||
assert!(matches!(diag.code, DiagCode::XrefInvalidEntry));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -2378,12 +2366,15 @@ trailer\n<< /Size 6 >>\n";
|
|||
let source = MemorySource::new(xref_data.to_vec());
|
||||
let result = parse_traditional_xref(&source, 0);
|
||||
|
||||
// Should have parsed 4 in-use entries (objects 0 and 3 are free and ignored)
|
||||
assert_eq!(result.len(), 4);
|
||||
// Should have parsed 6 entries (all objects 0-5, including free entries)
|
||||
// Free entries are tracked for /Prev chain merge semantics
|
||||
assert_eq!(result.len(), 6);
|
||||
|
||||
// Check specific entries
|
||||
assert_eq!(result.entries.get(&0), Some(&XrefEntry::Free { next_free: 0, gen_nr: 65535 }));
|
||||
assert_eq!(result.entries.get(&1), Some(&XrefEntry::InUse { offset: 17, gen_nr: 0 }));
|
||||
assert_eq!(result.entries.get(&2), Some(&XrefEntry::InUse { offset: 81, gen_nr: 0 }));
|
||||
assert_eq!(result.entries.get(&3), Some(&XrefEntry::Free { next_free: 0, gen_nr: 7 }));
|
||||
assert_eq!(result.entries.get(&4), Some(&XrefEntry::InUse { offset: 331, gen_nr: 0 }));
|
||||
assert_eq!(result.entries.get(&5), Some(&XrefEntry::InUse { offset: 409, gen_nr: 0 }));
|
||||
|
||||
|
|
@ -2403,8 +2394,10 @@ trailer\r\n<< /Size 3 >>\r\n";
|
|||
let source = MemorySource::new(xref_data.to_vec());
|
||||
let result = parse_traditional_xref(&source, 0);
|
||||
|
||||
// Should have parsed 2 in-use entries
|
||||
assert_eq!(result.len(), 2);
|
||||
// Should have parsed 3 entries (all objects 0-2, including free entry)
|
||||
// Free entries are tracked for /Prev chain merge semantics
|
||||
assert_eq!(result.len(), 3);
|
||||
assert_eq!(result.entries.get(&0), Some(&XrefEntry::Free { next_free: 0, gen_nr: 65535 }));
|
||||
assert_eq!(result.entries.get(&1), Some(&XrefEntry::InUse { offset: 15, gen_nr: 0 }));
|
||||
assert_eq!(result.entries.get(&2), Some(&XrefEntry::InUse { offset: 78, gen_nr: 0 }));
|
||||
}
|
||||
|
|
@ -2421,7 +2414,10 @@ trailer\n<< /Size 3 >>\n";
|
|||
let source = MemorySource::new(xref_data.to_vec());
|
||||
let result = parse_traditional_xref(&source, 0);
|
||||
|
||||
// Should have parsed 2 in-use entries
|
||||
// Should have parsed 3 entries (all objects 0-2, including free entry)
|
||||
// Free entries are tracked for /Prev chain merge semantics
|
||||
assert_eq!(result.len(), 3);
|
||||
assert_eq!(result.entries.get(&0), Some(&XrefEntry::Free { next_free: 0, gen_nr: 65535 }));
|
||||
assert_eq!(result.len(), 2);
|
||||
assert_eq!(result.entries.get(&1), Some(&XrefEntry::InUse { offset: 15, gen_nr: 0 }));
|
||||
assert_eq!(result.entries.get(&2), Some(&XrefEntry::InUse { offset: 78, gen_nr: 0 }));
|
||||
|
|
@ -2473,7 +2469,7 @@ trailer\n<< /Size 4 >>\n";
|
|||
|
||||
// Should have emitted a diagnostic for the bad entry
|
||||
assert!(!result.diagnostics.is_empty());
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::InvalidXrefEntry));
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefInvalidEntry));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -2489,7 +2485,7 @@ trailer\n<< /Size 3 >>\n";
|
|||
let result = parse_traditional_xref(&source, 0);
|
||||
|
||||
// Should emit diagnostic for object 0 not being free
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::ObjectZeroNotFree));
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefObjectZeroNotFree));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -2502,12 +2498,13 @@ trailer\n<< /Size 3 >>\n";
|
|||
let source = MemorySource::new(xref_data.to_vec());
|
||||
let result = parse_traditional_xref(&source, 0);
|
||||
|
||||
// Should still parse the entry
|
||||
assert_eq!(result.len(), 1);
|
||||
// Should still parse both entries (including free entry)
|
||||
// Free entries are tracked for /Prev chain merge semantics
|
||||
assert_eq!(result.len(), 2);
|
||||
assert!(result.trailer.is_none());
|
||||
|
||||
// Should emit diagnostic about missing trailer
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::TrailerNotFound));
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefTrailerNotFound));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -2686,7 +2683,7 @@ trailer\n<< /Size 3 >>\n";
|
|||
assert!(result.entries.contains_key(&3));
|
||||
|
||||
// Check for XREF_REPAIRED diagnostic
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::XrefRepaired));
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefRepaired));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -2719,7 +2716,7 @@ trailer\n<< /Size 3 >>\n";
|
|||
assert_eq!(result.len(), 0);
|
||||
|
||||
// Should have LINEARIZED_NO_FORWARD_SCAN diagnostic
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::LinearizedNoForwardScan));
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefLinearizedNoForwardScan));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -3119,7 +3116,7 @@ trailer\n<< /Size 3 >>\n";
|
|||
assert_eq!(result.entries.get(&2), Some(&XrefEntry::InUse { offset: 2000, gen_nr: 0 }));
|
||||
|
||||
// Should have emitted a diagnostic for invalid type
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::InvalidXrefStreamEntry));
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefInvalidStreamEntry));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -3134,7 +3131,7 @@ trailer\n<< /Size 3 >>\n";
|
|||
let result = parse_xref_stream(&source, 0);
|
||||
|
||||
// Should have emitted diagnostic about missing /Size
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::InvalidXrefStreamFormat));
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefInvalidStreamFormat));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -3156,7 +3153,7 @@ trailer\n<< /Size 3 >>\n";
|
|||
let result = parse_xref_stream(&source, 0);
|
||||
|
||||
// Should have emitted diagnostic about invalid /W
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::InvalidXrefStreamFormat));
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::XrefInvalidStreamFormat));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -3443,7 +3440,7 @@ trailer\n<< /Size 3 >>\n";
|
|||
|
||||
assert!(merged.is_hybrid);
|
||||
// Should have emitted STRUCT_HYBRID_CONFLICT diagnostic
|
||||
assert!(merged.diagnostics.iter().any(|d| matches!(d.code, XrefDiagCode::StructHybridConflict)));
|
||||
assert!(merged.diagnostics.iter().any(|d| matches!(d.code, DiagCode::StructHybridConflict)));
|
||||
// Traditional Free wins
|
||||
assert_eq!(merged.entries.get(&1), Some(&XrefEntry::Free { next_free: 0, gen_nr: 65535 }));
|
||||
}
|
||||
|
|
@ -3829,8 +3826,8 @@ trailer\n<< /Size 3 >>\n";
|
|||
// Load from the latest revision
|
||||
let result = load_xref_with_prev_chain(&source, rev3_offset);
|
||||
|
||||
// Verify all 5 objects are present
|
||||
assert_eq!(result.len(), 5, "Should have entries for objects 1-5, got {}", result.len());
|
||||
// Verify all 6 entries are present (including object 0)
|
||||
assert_eq!(result.len(), 6, "Should have entries for objects 0-5, got {}", result.len());
|
||||
|
||||
// Verify LATEST values win:
|
||||
// Object 1: unchanged from rev1 (offset 100)
|
||||
|
|
@ -3980,11 +3977,12 @@ trailer\n<< /Size 3 >>\n";
|
|||
let root = trailer.get("Root");
|
||||
assert!(root.is_some());
|
||||
match root {
|
||||
Some(PdfObject::Array(ref arr)) if arr.len() == 3 => {
|
||||
// [2, 0, R] - object number 2
|
||||
assert_eq!(arr[0], PdfObject::Integer(2));
|
||||
Some(PdfObject::Ref(obj_ref)) => {
|
||||
// 2 0 R - indirect reference to object 2
|
||||
assert_eq!(obj_ref.object, 2);
|
||||
assert_eq!(obj_ref.generation, 0);
|
||||
}
|
||||
_ => panic!("Expected /Root to be an array [2 0 R]"),
|
||||
_ => panic!("Expected /Root to be an indirect reference 2 0 R"),
|
||||
}
|
||||
|
||||
// Should have /Info from rev2
|
||||
|
|
@ -4043,7 +4041,7 @@ trailer\n<< /Size 3 >>\n";
|
|||
let result = load_xref_with_prev_chain(&source, rev3_offset);
|
||||
|
||||
// Should emit STRUCT_CIRCULAR_REF diagnostic
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::StructCircularRef));
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::StructCircularRef));
|
||||
}
|
||||
|
||||
/// Test depth limit enforcement.
|
||||
|
|
@ -4081,7 +4079,7 @@ trailer\n<< /Size 3 >>\n";
|
|||
let result = load_xref_with_prev_chain(&source, start_offset);
|
||||
|
||||
// Should emit STRUCT_DEPTH_EXCEEDED diagnostic
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::StructDepthExceeded));
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::StructDepthExceeded));
|
||||
}
|
||||
|
||||
/// Test /Prev offset pointing beyond file size.
|
||||
|
|
@ -4109,7 +4107,7 @@ trailer\n<< /Size 3 >>\n";
|
|||
let result = load_xref_with_prev_chain(&source, rev2_offset);
|
||||
|
||||
// Should emit STRUCT_INVALID_PREV_OFFSET diagnostic
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == XrefDiagCode::StructInvalidPrevOffset));
|
||||
assert!(result.diagnostics.iter().any(|d| d.code == DiagCode::StructInvalidPrevOffset));
|
||||
|
||||
// /Prev should be removed from trailer
|
||||
let trailer = result.trailer.as_ref().unwrap();
|
||||
|
|
@ -4134,7 +4132,7 @@ trailer\n<< /Size 3 >>\n";
|
|||
let result = load_xref_with_prev_chain(&source, offset);
|
||||
|
||||
// Should not follow /Prev 0, should just return this single revision
|
||||
assert!(!result.diagnostics.iter().any(|d| d.code == XrefDiagCode::StructInvalidPrevOffset));
|
||||
assert!(!result.diagnostics.iter().any(|d| d.code == DiagCode::StructInvalidPrevOffset));
|
||||
}
|
||||
|
||||
/// Test negative /Prev treated as "no previous revision".
|
||||
|
|
@ -4155,7 +4153,7 @@ trailer\n<< /Size 3 >>\n";
|
|||
let result = load_xref_with_prev_chain(&source, offset);
|
||||
|
||||
// Should not follow negative /Prev
|
||||
assert!(!result.diagnostics.iter().any(|d| d.code == XrefDiagCode::StructInvalidPrevOffset));
|
||||
assert!(!result.diagnostics.iter().any(|d| d.code == DiagCode::StructInvalidPrevOffset));
|
||||
}
|
||||
|
||||
/// Test hybrid file in /Prev chain.
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue