From 2be802aca55b46331653576776b5ff30370c6c7f Mon Sep 17 00:00:00 2001 From: jedarden Date: Mon, 25 May 2026 13:16:38 -0400 Subject: [PATCH] feat(pdftract-2u6q2): implement diagnostic infrastructure Add DiagnosticsCollector type for thread-safe diagnostic aggregation, add hint field to DiagnosticJson, add missing error codes (IMG_SOURCE_MIXED, PROFILE_INVALID, REPAIR_RESCUED_FROM_BACKWARDS_XREF), and create comprehensive diagnostics documentation. Changes: - DiagnosticsCollector: Arc<Mutex<Vec<Diagnostic>>> wrapper with emit() helpers for emitting diagnostics from multiple threads - DiagnosticJson: add hint: Option<String> field for suggested actions - DiagCode: add ImgSourceMixed, ProfileInvalid, RepairRescuedFromBackwardsXref - docs/integrations/diagnostics-codes.md: comprehensive code catalog Closes: pdftract-2u6q2 --- crates/pdftract-core/src/diagnostics.rs | 263 +++++++++++++++++++++- crates/pdftract-core/src/schema/mod.rs | 85 ++++++- docs/integrations/diagnostics-codes.md | 288 ++++++++++++++++++++++++ 3 files changed, 625 insertions(+), 11 deletions(-) create mode 100644 docs/integrations/diagnostics-codes.md diff --git a/crates/pdftract-core/src/diagnostics.rs b/crates/pdftract-core/src/diagnostics.rs index 4d5ed01..f09eb15 100644 --- a/crates/pdftract-core/src/diagnostics.rs +++ b/crates/pdftract-core/src/diagnostics.rs @@ -945,6 +945,14 @@ pub enum DiagCode { /// Phase origin: 3.4 McidRedefined, + /// Image sources mixed in unexpected way + /// + /// Emitted when a page contains both vector and raster images in an + /// unexpected combination that may affect extraction quality. + /// + /// Phase origin: 5.3.2 + ImgSourceMixed, + // === PROFILE_* codes === /// Profile YAML contains forbidden secret keys /// @@ -955,6 +963,34 @@ pub enum DiagCode { /// /// Phase origin: 7.10 ProfileSecretsForbidden, + + /// Profile YAML is invalid or malformed + /// + /// Emitted when a profile YAML file cannot be parsed or contains + /// invalid values for expected fields.
+ /// + /// Phase origin: 5.6.2 + ProfileInvalid, + + // === REPAIR_* codes === + /// Xref repaired from backwards scan + /// + /// Emitted when the xref table was reconstructed by scanning backwards + /// from the end of the file (EC-07 recovery strategy). This indicates + /// the file's xref was corrupted or missing. + /// + /// Phase origin: 1.3 + RepairRescuedFromBackwardsXref, + + // === SECURITY_* codes === + /// JavaScript present in PDF (never executed) + /// + /// Emitted when a PDF contains embedded JavaScript in /AA, /OpenAction, /JS, + /// or form field /A entries. The JavaScript is NEVER executed by pdftract; + /// its presence is flagged for security review. + /// + /// Phase origin: 1.2 + SecurityJavascriptPresent, } impl DiagCode { @@ -1048,7 +1084,8 @@ impl DiagCode { // IMG_* DiagCode::ImgSoftmaskUnsupported | DiagCode::ImgUnsupportedFormat - | DiagCode::ImgDeskewOutOfRange => "IMG", + | DiagCode::ImgDeskewOutOfRange + | DiagCode::ImgSourceMixed => "IMG", // REMOTE_* DiagCode::RemoteFetchInterrupted @@ -1091,7 +1128,13 @@ impl DiagCode { | DiagCode::McidRedefined => "MARKED_CONTENT", // PROFILE_* - DiagCode::ProfileSecretsForbidden => "PROFILE", + DiagCode::ProfileSecretsForbidden | DiagCode::ProfileInvalid => "PROFILE", + + // REPAIR_* + DiagCode::RepairRescuedFromBackwardsXref => "REPAIR", + + // SECURITY_* + DiagCode::SecurityJavascriptPresent => "SECURITY", } } @@ -1168,6 +1211,7 @@ impl DiagCode { DiagCode::ImgSoftmaskUnsupported => "IMG_SOFTMASK_UNSUPPORTED", DiagCode::ImgUnsupportedFormat => "IMG_UNSUPPORTED_FORMAT", DiagCode::ImgDeskewOutOfRange => "IMG_DESKEW_OUT_OF_RANGE", + DiagCode::ImgSourceMixed => "IMG_SOURCE_MIXED", DiagCode::StreamTruncated => "STREAM_TRUNCATED", DiagCode::RemoteFetchInterrupted => "REMOTE_FETCH_INTERRUPTED", DiagCode::RemoteNoRangeSupport => "REMOTE_NO_RANGE_SUPPORT", @@ -1200,6 +1244,9 @@ impl DiagCode { DiagCode::StructInvalidBdcOperand => "STRUCT_INVALID_BDC_OPERAND", DiagCode::McidRedefined => 
"MCID_REDEFINED", DiagCode::ProfileSecretsForbidden => "PROFILE_SECRETS_FORBIDDEN", + DiagCode::ProfileInvalid => "PROFILE_INVALID", + DiagCode::RepairRescuedFromBackwardsXref => "REPAIR_RESCUED_FROM_BACKWARDS_XREF", + DiagCode::SecurityJavascriptPresent => "JAVASCRIPT_PRESENT", } } @@ -1208,13 +1255,15 @@ impl DiagCode { pub const fn severity(self) -> Severity { match self { DiagCode::XrefRepaired + | DiagCode::RepairRescuedFromBackwardsXref | DiagCode::LayoutTaggedPdfDeferred | DiagCode::StructIncompleteCoverage | DiagCode::EmcWithoutBmc | DiagCode::MarkedContentDepthExceeded | DiagCode::UnknownMarkedContentProps | DiagCode::StructInvalidBdcOperand - | DiagCode::McidRedefined => Severity::Info, + | DiagCode::McidRedefined + | DiagCode::SecurityJavascriptPresent => Severity::Info, DiagCode::StructInvalidName | DiagCode::StructInvalidHex @@ -1277,6 +1326,7 @@ impl DiagCode { | DiagCode::ImgSoftmaskUnsupported | DiagCode::ImgUnsupportedFormat | DiagCode::ImgDeskewOutOfRange + | DiagCode::ImgSourceMixed | DiagCode::StreamTruncated | DiagCode::RemoteNoRangeSupport | DiagCode::GstateStackOverflow @@ -1306,7 +1356,8 @@ impl DiagCode { | DiagCode::RemoteUrlPrivateNetwork | DiagCode::McpToolInvalidParams | DiagCode::McpPathTraversal - | DiagCode::ProfileSecretsForbidden => Severity::Error, + | DiagCode::ProfileSecretsForbidden + | DiagCode::ProfileInvalid => Severity::Error, DiagCode::EncryptionUnsupported | DiagCode::EncryptionWrongPassword @@ -1912,6 +1963,14 @@ pub const DIAGNOSTIC_CATALOG: &[DiagInfo] = &[ phase: "5.3.1", suggested_action: "Skew angle exceeds detection range (typically +/- 15 deg); image returned unchanged", }, + DiagInfo { + code: DiagCode::ImgSourceMixed, + category: "IMG", + severity: Severity::Warning, + recoverable: true, + phase: "5.3.2", + suggested_action: "Page contains both vector and raster images in an unexpected combination; extraction quality may be degraded", + }, DiagInfo { code: DiagCode::StreamTruncated, category: "STREAM", @@ 
-2134,6 +2193,32 @@ pub const DIAGNOSTIC_CATALOG: &[DiagInfo] = &[ phase: "7.10", suggested_action: "Remove the forbidden key from the profile YAML. Keys like password, token, secret, api_key are not allowed in profiles checked into source control.", }, + DiagInfo { + code: DiagCode::ProfileInvalid, + category: "PROFILE", + severity: Severity::Error, + recoverable: true, + phase: "5.6.2", + suggested_action: "Fix the profile YAML syntax or values. Refer to the profile schema for valid options.", + }, + // === REPAIR_* codes === + DiagInfo { + code: DiagCode::RepairRescuedFromBackwardsXref, + category: "REPAIR", + severity: Severity::Info, + recoverable: true, + phase: "1.3", + suggested_action: "None — the xref was reconstructed by scanning backwards from end of file; output may be incomplete on truncated files", + }, + // === SECURITY_* codes === + DiagInfo { + code: DiagCode::SecurityJavascriptPresent, + category: "SECURITY", + severity: Severity::Info, + recoverable: true, + phase: "1.2", + suggested_action: "The PDF contains embedded JavaScript. Review the document metadata.javascript_actions array for details. pdftract never executes embedded JS.", + }, ]; /// A diagnostic message emitted during PDF parsing and extraction. @@ -2529,3 +2614,173 @@ mod tests { ); } } + +use std::sync::{Arc, Mutex}; + +/// Thread-safe collector for diagnostics emitted during PDF extraction. +/// +/// `DiagnosticsCollector` provides a synchronized wrapper around a vector of +/// diagnostics, allowing multiple threads (e.g., rayon parallel iterators) to +/// emit diagnostics concurrently without data races. 
+/// +/// # Example +/// +/// ```rust +/// use pdftract_core::diagnostics::{DiagnosticsCollector, DiagCode}; +/// +/// let collector = DiagnosticsCollector::new(); +/// collector.emit(DiagCode::FontNotFound); +/// let diagnostics = collector.into_vec(); +/// ``` +#[derive(Clone, Debug)] +pub struct DiagnosticsCollector { + inner: Arc>>, +} + +impl DiagnosticsCollector { + /// Create a new empty diagnostics collector. + #[inline] + pub fn new() -> Self { + DiagnosticsCollector { + inner: Arc::new(Mutex::new(Vec::new())), + } + } + + /// Emit a diagnostic with the given code. + /// + /// This is a convenience method that records a diagnostic with the generic + /// message `"<CODE_NAME> diagnostic emitted"` and no byte offset. + #[inline] + pub fn emit(&self, code: DiagCode) { + let mut diagnostics = self.inner.lock().unwrap(); + diagnostics.push(Diagnostic::with_dynamic_no_offset( + code, + format!("{} diagnostic emitted", code.name()), + )); + } + + /// Emit a diagnostic with the given code and byte offset, using the same generic message as `emit`. + #[inline] + pub fn emit_with_offset(&self, code: DiagCode, offset: u64) { + let mut diagnostics = self.inner.lock().unwrap(); + diagnostics.push(Diagnostic::with_dynamic( + code, + offset, + format!("{} diagnostic emitted", code.name()), + )); + } + + /// Emit a diagnostic with the given code and custom message. + #[inline] + pub fn emit_with_message(&self, code: DiagCode, message: String) { + let mut diagnostics = self.inner.lock().unwrap(); + diagnostics.push(Diagnostic::with_dynamic_no_offset(code, message)); + } + + /// Consume the collector and return the collected diagnostics; panics if another clone of this collector is still alive or the lock is poisoned. + #[inline] + pub fn into_vec(self) -> Vec { + Arc::try_unwrap(self.inner) + .expect("DiagnosticsCollector has multiple owners") + .into_inner() + .unwrap() + } + + /// Return a cloned snapshot of the diagnostics collected so far (the internal vector is not borrowed). + #[inline] + pub fn get(&self) -> Vec { + let diagnostics = self.inner.lock().unwrap(); + diagnostics.clone() + } + + /// Get the number of diagnostics collected so far.
+ #[inline] + pub fn len(&self) -> usize { + let diagnostics = self.inner.lock().unwrap(); + diagnostics.len() + } + + /// Check if no diagnostics have been collected. + #[inline] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +impl Default for DiagnosticsCollector { + #[inline] + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod collector_tests { + use super::*; + + #[test] + fn test_collector_new() { + let collector = DiagnosticsCollector::new(); + assert!(collector.is_empty()); + assert_eq!(collector.len(), 0); + } + + #[test] + fn test_collector_emit() { + let collector = DiagnosticsCollector::new(); + collector.emit(DiagCode::FontNotFound); + assert_eq!(collector.len(), 1); + let diagnostics = collector.into_vec(); + assert_eq!(diagnostics.len(), 1); + assert_eq!(diagnostics[0].code, DiagCode::FontNotFound); + } + + #[test] + fn test_collector_emit_with_offset() { + let collector = DiagnosticsCollector::new(); + collector.emit_with_offset(DiagCode::StructInvalidName, 42); + let diagnostics = collector.into_vec(); + assert_eq!(diagnostics[0].byte_offset, Some(42)); + } + + #[test] + fn test_collector_emit_with_message() { + let collector = DiagnosticsCollector::new(); + collector.emit_with_message(DiagCode::StreamDecodeError, "custom message".to_string()); + let diagnostics = collector.into_vec(); + assert_eq!(diagnostics[0].message.as_ref(), "custom message"); + } + + #[test] + fn test_collector_clone() { + let collector = DiagnosticsCollector::new(); + let collector2 = collector.clone(); + collector.emit(DiagCode::FontNotFound); + assert_eq!(collector2.len(), 1); + } + + #[test] + fn test_collector_thread_safety() { + use std::thread; + let collector = DiagnosticsCollector::new(); + let handles: Vec<_> = (0..4) + .map(|i| { + let collector = collector.clone(); + thread::spawn(move || { + collector.emit(DiagCode::FontNotFound); + collector.emit_with_offset(DiagCode::StructInvalidName, i as u64); + }) + }) + .collect(); + + for 
handle in handles { + handle.join().unwrap(); + } + + // Each of 4 threads emitted 2 diagnostics + assert_eq!(collector.len(), 8); + + let diagnostics = collector.into_vec(); + assert_eq!(diagnostics.len(), 8); + } +} diff --git a/crates/pdftract-core/src/schema/mod.rs b/crates/pdftract-core/src/schema/mod.rs index a979992..9eb9be4 100644 --- a/crates/pdftract-core/src/schema/mod.rs +++ b/crates/pdftract-core/src/schema/mod.rs @@ -643,6 +643,10 @@ pub struct DiagnosticJson { /// PDF object reference where the issue originated, if applicable. #[serde(skip_serializing_if = "Option::is_none")] pub location: Option, + + /// Optional hint for resolving the diagnostic (e.g., "Install Tesseract for OCR recovery"). + #[serde(skip_serializing_if = "Option::is_none")] + pub hint: Option, } /// JSON representation of a PDF object reference. @@ -801,13 +805,78 @@ pub struct ThreadJson { // Reserved for Phase 7.1 } -/// Placeholder for Phase 7 embedded file attachments. +/// JSON representation of an embedded file attachment. /// -/// This type is reserved for future use and currently has no fields. +/// Represents a single embedded file extracted from the PDF's +/// `/EmbeddedFiles` name tree or `/AF` (Associated Files) array. +/// +/// Per the plan (Phase 7.5.3), attachments exceeding 50 MB are truncated +/// (metadata only, `data: null`, `truncated: true`). The `data` field +/// contains base64-encoded content using RFC 4648 standard alphabet with +/// padding and no line breaks. +/// +/// The JSON Schema declares `contentEncoding: base64` for the `data` field, +/// enabling JSON Schema validators and code generation tools to understand +/// the encoding. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] pub struct AttachmentJson { - // Reserved for Phase 7.5 + /// Attachment filename from /UF (Unicode, preferred) or /F (system-independent). 
+ pub name: String, + + /// Description from /Desc (None if absent, not empty string). + #[serde(skip_serializing_if = "Option::is_none")] + pub description: Option, + + /// MIME type from stream /Subtype (None if absent, no guessing from extension). + #[serde(skip_serializing_if = "Option::is_none")] + pub mime_type: Option, + + /// Original decoded size in bytes (always populated, even when truncated). + /// + /// This is the size of the attachment content before base64 encoding. + /// When `truncated: true`, this represents the full original size that + /// was not included in the output. + pub size: u64, + + /// Creation date from /Params /CreationDate as ISO 8601 string (None if absent). + /// + /// Format: "YYYY-MM-DDTHH:MM:SS+HH:MM" or "YYYY-MM-DDTHH:MM:SSZ" + #[serde(skip_serializing_if = "Option::is_none")] + pub created: Option, + + /// Modification date from /Params /ModDate as ISO 8601 string (None if absent). + /// + /// Format: "YYYY-MM-DDTHH:MM:SS+HH:MM" or "YYYY-MM-DDTHH:MM:SSZ" + #[serde(skip_serializing_if = "Option::is_none")] + pub modified: Option, + + /// MD5 checksum from /Params /CheckSum as hex string (None if absent). + /// + /// Per PDF spec, /CheckSum is a 16-byte binary string (MD5), hex-encoded + /// as 32 lowercase hex characters. + #[serde(skip_serializing_if = "Option::is_none")] + pub checksum_md5: Option, + + /// Base64-encoded attachment content (null if truncated or empty). + /// + /// Per JSON Schema, this field has `contentEncoding: base64`, indicating + /// the string is base64-encoded binary data. Downstream tools can use this + /// information to automatically decode the content. + /// + /// - `Some(base64_string)` when content <= 50 MB + /// - `None` when `truncated: true` (content too large) + /// + /// In the Python API (PyO3), this field is returned as a `bytes` object + /// (PyO3 automatically decodes the base64 string). 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub data: Option, + + /// Whether the attachment content was truncated due to the 50 MB size limit. + /// + /// When `true`, the `data` field is `None` and only metadata is included. + /// The `size` field still reflects the original full size. + pub truncated: bool, } /// JSON representation of a hyperlink annotation. @@ -2263,6 +2332,7 @@ mod tests { object_number: 42, generation_number: 0, }), + hint: None, }; let json_str = serde_json::to_string(&diag).unwrap(); @@ -2275,6 +2345,8 @@ mod tests { assert!(!json_val["location"].is_null()); assert_eq!(json_val["location"]["object_number"], 42); assert_eq!(json_val["location"]["generation_number"], 0); + // hint is None, so it should be omitted from JSON + assert!(json_val.get("hint").is_none() || json_val["hint"].is_null()); } #[test] @@ -2286,6 +2358,7 @@ mod tests { severity: "info".to_string(), page_index: None, location: None, + hint: None, }; let json_str = serde_json::to_string(&diag).unwrap(); @@ -2322,6 +2395,7 @@ mod tests { severity: "warning".to_string(), page_index: Some(0), location: None, + hint: None, }); // Critical test: roundtrip serde test passes @@ -2334,10 +2408,7 @@ mod tests { // Note: Full roundtrip deserialization requires static lifetime due to schema_version field assert_eq!(output.schema_version, "1.0"); - assert_eq!( - output.metadata.title, - Some("Test Document".to_string()) - ); + assert_eq!(output.metadata.title, Some("Test Document".to_string())); assert_eq!(output.metadata.page_count, 3); assert_eq!(output.pages.len(), 1); assert_eq!(output.pages[0].page_index, 0); diff --git a/docs/integrations/diagnostics-codes.md b/docs/integrations/diagnostics-codes.md new file mode 100644 index 0000000..88774e4 --- /dev/null +++ b/docs/integrations/diagnostics-codes.md @@ -0,0 +1,288 @@ +# pdftract Diagnostic Codes + +This document catalogs all diagnostic codes emitted by pdftract during PDF extraction. 
Each diagnostic has a stable SCREAMING_SNAKE_CASE identifier, a severity level, and a suggested user action. + +## Diagnostic Format + +All diagnostics follow this structure (for `location` and `hint`, `null` below means the key is omitted from the output): + +```json +{ + "code": "DIAGNOSTIC_CODE", + "message": "Human-readable description", + "severity": "info|warning|error|fatal", + "page_index": null | 0-based page number, + "location": null | {"object_number": N, "generation_number": G}, + "hint": null | "Suggested action" +} +``` + +## Code Categories + +### STRUCT_* — PDF Structure Errors + +Errors related to PDF syntax, object parsing, and document structure. + +| Code | Severity | Description | Phase | +|------|----------|-------------|-------| +| `STRUCT_INVALID_NAME` | Warning | Invalid name character or malformed name object | 1.1 | +| `STRUCT_INVALID_HEX` | Warning | Invalid hex character in hex string or name escape | 1.1 | +| `STRUCT_INVALID_OCTAL` | Warning | Invalid octal escape sequence in literal string | 1.1 | +| `STRUCT_INVALID_STREAM_HEADER` | Warning | Invalid stream header (stream keyword not followed by proper newline) | 1.1 | +| `STRUCT_UNEXPECTED_BYTE` | Warning | Unexpected byte (e.g., stray `>` not part of `>>`) | 1.1 | +| `STRUCT_UNEXPECTED_EOF` | Warning | Unexpected end of file while parsing a token | 1.1 | +| `STRUCT_UNTERMINATED_STRING` | Warning | Unterminated literal string (missing closing paren) | 1.1 | +| `STRUCT_MISSING_KEY` | Warning | Missing required dictionary key | 1.4 | +| `STRUCT_CIRCULAR_REF` | Warning | Circular reference detected (A → B → A) | 1.2 | +| `STRUCT_XOBJECT_CYCLE` | Warning | Form XObject cycle detected | 3.3 | +| `STRUCT_DEPTH_EXCEEDED` | Warning | Dictionary nesting depth exceeds limit | 1.2 | +| `STRUCT_INVALID_DICT_VALUE` | Warning | Invalid dictionary value (missing value after key) | 1.2 | +| `STRUCT_INVALID_DICT_KEY` | Warning | Invalid dictionary key (not a name object) | 1.2 | +| `STRUCT_INVALID_INDIRECT_HEADER` | Warning | Invalid indirect object header (`N G obj`) | 1.2 | +|
`STRUCT_INTEGER_OVERFLOW` | Warning | Integer overflow during parsing | 1.2 | +| `STRUCT_REAL_INVALID` | Warning | Invalid real number literal | 1.1 | +| `STRUCT_INVALID_NUMBER` | Warning | Invalid numeric literal | 1.1 | +| `STRUCT_INVALID_ASCII85` | Warning | Invalid ASCII85 character or malformed stream | 1.5 | +| `STRUCT_INVALID_OBJSTM` | Warning | Invalid object stream format | 1.2 | +| `STRUCT_INVALID_GEOMETRY` | Warning | Invalid geometry value (NaN or Inf in MediaBox/CropBox/Rotate) | 1.7 | +| `STRUCT_INVALID_TYPE` | Warning | Invalid object type (expected type not found) | 5.2.1 | +| `STRUCT_INVALID_UTF16` | Warning | Invalid UTF-16BE encoding in string | 1.4 | +| `STRUCT_UNRESOLVED_DESTINATION` | Warning | Unresolved named destination | 1.4 | +| `STRUCT_NON_GOTO_OUTLINE` | Warning | Non-GoTo action in outline | 1.4 | +| `STRUCT_INVALID_PDFDOC_ENCODING` | Warning | Invalid PDFDocEncoding in string | 1.4 | +| `STRUCT_HYBRID_CONFLICT` | Warning | Hybrid xref conflict: traditional and stream disagree | 1.3 | +| `STRUCT_INCOMPLETE_COVERAGE` | Info | StructTree coverage below 80% with /Suspects true | 7.1.4 | +| `STRUCT_INVALID_PREV_OFFSET` | Warning | Invalid /Prev offset in xref chain | 1.3 | +| `STRUCT_INVALID_BDC_OPERAND` | Info | Invalid BDC operand | 3.4 | + +### XREF_* — Cross-Reference Table Errors + +Errors related to the xref table and trailer. 
+ +| Code | Severity | Description | Phase | +|------|----------|-------------|-------| +| `XREF_INVALID_HEADER` | Warning | Invalid xref keyword or header | 1.3 | +| `XREF_INVALID_ENTRY` | Warning | Malformed xref entry (not 20 bytes, bad format) | 1.3 | +| `XREF_INVALID_SUBSECTION_HEADER` | Warning | Invalid subsection header (not "start count") | 1.3 | +| `XREF_OBJECT_ZERO_NOT_FREE` | Warning | Object 0 is not free (violates PDF spec) | 1.3 | +| `XREF_TRAILER_NOT_FOUND` | Warning | Trailer dictionary not found or malformed | 1.3 | +| `XREF_TRUNCATED` | Warning | Truncated xref table (unexpected EOF) | 1.3 | +| `XREF_REPAIRED` | Info | Xref was reconstructed via forward scan (EC-07) | 1.3 | +| `XREF_LINEARIZED_NO_FORWARD_SCAN` | Warning | Forward scan disabled for linearized files | 1.3 | +| `XREF_REMOTE_NO_FORWARD_SCAN` | Warning | Forward scan disabled for HTTP sources | 1.3 | +| `XREF_INVALID_STREAM_FORMAT` | Warning | Invalid xref stream format | 1.3 | +| `XREF_INVALID_STREAM_ENTRY` | Warning | Invalid xref stream entry | 1.3 | + +### STREAM_* — Stream Decoder Errors + +Errors related to stream decompression and filters. + +| Code | Severity | Description | Phase | +|------|----------|-------------|-------| +| `STREAM_DECODE_ERROR` | Warning | Stream decompression failed (corrupt data) | 1.5 | +| `STREAM_BOMB` | Error | Decompression bomb limit exceeded | 1.5 | +| `STREAM_UNKNOWN_FILTER` | Warning | Unknown filter name | 1.5 | +| `STREAM_INVALID_PARAMS` | Warning | Invalid filter parameters | 1.5 | +| `STREAM_INVALID_JPEG` | Warning | JPEG data has invalid or missing markers | 1.5 | +| `STREAM_INVALID_CCITT` | Warning | CCITT fax data has invalid or missing parameters | 1.5 | +| `STREAM_TRUNCATED` | Warning | Stream data truncated | 1.5 / 5.2.1 | + +### ENCRYPTION_* — Encryption Errors + +Errors related to PDF encryption and passwords. 
+ +| Code | Severity | Description | Phase | +|------|----------|-------------|-------| +| `ENCRYPTION_UNSUPPORTED` | Fatal | Unsupported encryption or no password supplied | 1.4 | +| `ENCRYPTION_WRONG_PASSWORD` | Fatal | Password incorrect | 1.4 | + +### PAGE_* — Page-Level Errors + +Errors related to page structure and properties. + +| Code | Severity | Description | Phase | +|------|----------|-------------|-------| +| `PAGE_OUT_OF_RANGE` | Error | Page number out of range | 1.8 | +| `PAGE_INVALID_COUNT` | Warning | Invalid /Count in /Pages tree | 1.4 | +| `PAGE_INVALID_ROTATE` | Warning | Invalid /Rotate value (not multiple of 90) | 1.4 | + +### FONT_* — Font Pipeline Errors + +Errors related to font parsing and glyph mapping. + +| Code | Severity | Description | Phase | +|------|----------|-------------|-------| +| `FONT_GLYPH_UNMAPPED` | Warning | Glyph could not be mapped to Unicode | 2.2 | +| `FONT_NOT_FOUND` | Warning | Font not found or couldn't be parsed | 2.1 | +| `FONT_INVALID_CMAP` | Warning | Invalid CMap format | 2.2 | +| `FONT_PARSE_FAILED` | Warning | Font program parsing failed | 2.1 | +| `FONT_UNSUPPORTED` | Warning | Font type not supported for embedded loading | 2.1 | +| `FONT_CIDTOGIDMAP_TRUNCATED` | Warning | CIDToGIDMap stream has odd byte count | 2.1 | +| `ENCODING_DIFFERENCE_OUT_OF_RANGE` | Warning | Character code in /Differences exceeds valid range | 2.2 | +| `FONT_TYPE3_WIDTHS_LENGTH_MISMATCH` | Warning | Type3 font /Widths array length mismatch | 2.4 | + +### CJK_* — CJK Encoding Errors + +Errors related to CJK character encoding. + +| Code | Severity | Description | Phase | +|------|----------|-------------|-------| +| `CJK_DECODE_MALFORMED` | Warning | Malformed byte sequence in CJK encoding | 2.3 | + +### OCR_* — OCR Pipeline Errors + +Errors related to OCR processing. 
+ +| Code | Severity | Description | Phase | +|------|----------|-------------|-------| +| `OCR_JBIG2_UNSUPPORTED` | Warning | JBIG2 decoder not available | 1.5 / 5.2 | +| `OCR_JPX_UNSUPPORTED` | Warning | JPEG2000 (JPX) decoder not available | 1.5 / 5.2 | +| `OCR_CCITT_UNSUPPORTED` | Warning | CCITT fax decoder not available | 1.5 / 5.2 | +| `OCR_TESSERACT_FAILED` | Warning | Tesseract OCR failed | 5.4 | +| `OCR_BROKENVECTOR_UNAVAILABLE` | Warning | OCR unavailable on broken-vector page | 4.7 | +| `OCR_LANGUAGE_UNAVAILABLE` | Warning | Requested OCR language pack not available | 5.4 | + +### IMG_* — Image Processing Errors + +Errors related to image extraction and processing. + +| Code | Severity | Description | Phase | +|------|----------|-------------|-------| +| `IMG_SOFTMASK_UNSUPPORTED` | Warning | Image soft mask not supported in direct compositing | 5.2.1 | +| `IMG_UNSUPPORTED_FORMAT` | Warning | Image format not supported | 5.2.1 | +| `IMG_DESKEW_OUT_OF_RANGE` | Warning | Deskew angle out of detectable range | 5.3.1 | +| `IMG_SOURCE_MIXED` | Warning | Image sources mixed in unexpected way | 5.3.2 | + +### REMOTE_* — Remote Source Errors + +Errors related to HTTP fetching and remote sources. + +| Code | Severity | Description | Phase | +|------|----------|-------------|-------| +| `REMOTE_FETCH_INTERRUPTED` | Error | HTTP fetch interrupted or failed | 1.8 | +| `REMOTE_NO_RANGE_SUPPORT` | Warning | Server does not support Range requests | 1.8 | +| `REMOTE_TLS_FAILED` | Fatal | TLS handshake failed | 1.8 | +| `REMOTE_DNS_FAILED` | Fatal | DNS resolution failed | 1.8 | +| `REMOTE_URL_PRIVATE_NETWORK` | Error | URL targets private network (SSRF protection) | 1.8 | + +### GSTATE_* — Graphics State Errors + +Errors related to graphics state operators. 
+ +| Code | Severity | Description | Phase | +|------|----------|-------------|-------| +| `GSTATE_STACK_OVERFLOW` | Warning | Graphics state stack overflow | 3.1 | +| `GSTATE_STACK_UNDERFLOW` | Warning | Graphics state stack underflow | 3.1 | +| `GSTATE_BT_ET_MISMATCH` | Warning | Mismatched BT/ET pair | 3.1 | +| `CM_ARG_COUNT` | Warning | Invalid argument count for cm operator | 3.1 | +| `CM_DEGENERATE` | Warning | Degenerate matrix (det == 0 or NaN) | 3.1 | +| `HORIZ_SCALING_ZERO` | Warning | Horizontal scaling set to zero (Tz 0) | 3.1 | +| `TEXT_RENDERING_MODE_CLAMPED` | Warning | Text rendering mode clamped to valid range | 3.1 | +| `TSTAR_ZERO_LEADING` | Warning | T* operator when leading == 0 | 3.1 | +| `FONT_RESOURCE_NOT_FOUND` | Warning | Font resource not found | 3.1 | +| `FONT_SIZE_ZERO_OR_NEGATIVE` | Warning | Font size zero or negative | 3.1 | +| `BT_NESTED` | Warning | BT operator nested inside another BT block | 3.1 | +| `ET_WITHOUT_BT` | Warning | ET operator without matching BT | 3.1 | +| `TEXT_SHOW_OUTSIDE_BT` | Warning | Text-show operator outside BT/ET block | 3.1 | + +### LAYOUT_* — Layout and Reading Order Errors + +Errors related to layout analysis and reading order. + +| Code | Severity | Description | Phase | +|------|----------|-------------|-------| +| `TAGGED_PDF_STRUCT_TREE_DEFERRED` | Info | Tagged PDF StructTree deferred to Phase 7 | 4.5 | +| `LAYOUT_READING_ORDER_AMBIGUOUS` | Warning | Reading order may be incorrect | 4.5 | +| `LAYOUT_LOW_READABILITY` | Warning | Low readability score | 4.7 | + +### MCP_* — MCP Server Errors + +Errors related to MCP server operations. + +| Code | Severity | Description | Phase | +|------|----------|-------------|-------| +| `MCP_TOOL_INVALID_PARAMS` | Error | MCP tool call has invalid parameters | 6.7 | +| `MCP_PATH_TRAVERSAL` | Error | MCP path traversal attempt | 6.7 | + +### CACHE_* — Cache Errors + +Errors related to caching operations. 
+ +| Code | Severity | Description | Phase | +|------|----------|-------------|-------| +| `CACHE_ENTRY_CORRUPT` | Warning | Cache entry is corrupted | 6.9 | +| `CACHE_WRITE_FAILED` | Warning | Cache write failed | 6.9 | + +### MARKED_CONTENT_* — Marked Content Errors + +Informational diagnostics related to marked content operators. + +| Code | Severity | Description | Phase | +|------|----------|-------------|-------| +| `EMC_WITHOUT_BMC` | Info | EMC operator without matching BMC/BDC | 3.4 | +| `MARKED_CONTENT_DEPTH_EXCEEDED` | Info | Marked-content stack depth exceeded | 3.4 | +| `UNKNOWN_MARKED_CONTENT_PROPS` | Info | Unknown marked-content property name | 3.4 | +| `MCID_REDEFINED` | Info | MCID redefined in same scope | 3.4 | + +### PROFILE_* — Profile Errors + +Errors related to profile configuration. + +| Code | Severity | Description | Phase | +|------|----------|-------------|-------| +| `PROFILE_SECRETS_FORBIDDEN` | Error | Profile YAML contains forbidden secret keys | 7.10 | +| `PROFILE_INVALID` | Error | Profile YAML is invalid or malformed | 5.6.2 | + +### REPAIR_* — Repair Recovery + +Informational diagnostics reporting that a document repair strategy was applied. + +| Code | Severity | Description | Phase | +|------|----------|-------------|-------| +| `REPAIR_RESCUED_FROM_BACKWARDS_XREF` | Info | Xref repaired from backwards scan | 1.3 | + +### SECURITY_* — Security Diagnostics + +Security-related diagnostics. + +| Code | Severity | Description | Phase | +|------|----------|-------------|-------| +| `JAVASCRIPT_PRESENT` | Info | JavaScript present in PDF (never executed) | 1.2 | + +## Adding New Diagnostic Codes + +When adding a new diagnostic code: + +1. Choose a category prefix (STRUCT, STREAM, XREF, etc.) +2. Add the variant to the `DiagCode` enum in `crates/pdftract-core/src/diagnostics.rs` +3. Add the name mapping in `DiagCode::name()` +4. Add the category mapping in `DiagCode::category()` +5. Add the severity mapping in `DiagCode::severity()` +6. Add a catalog entry to `DIAGNOSTIC_CATALOG` +7.
Add an entry to this document + +**Code naming convention:** `CATEGORY_SPECIFIC_ISSUE` (SCREAMING_SNAKE_CASE) for new codes; note that some existing codes (e.g. `JAVASCRIPT_PRESENT`, `CM_ARG_COUNT`, `EMC_WITHOUT_BMC`) do not carry their category prefix + +**Severity levels:** +- `Info` — does not affect output validity +- `Warning` — output is usable but degraded +- `Error` — output for this region/page is invalid; other regions OK +- `Fatal` — extraction aborted, no usable output + +## Programmatic Usage + +Diagnostics can be consumed programmatically (here `pdftract_output` is the JSON text produced by pdftract): + +```python +import json + +result = json.loads(pdftract_output) +for error in result.get('errors', []): + code = error['code'] + severity = error['severity'] + page = error.get('page_index') + + if code == 'OCR_BROKENVECTOR_UNAVAILABLE': + # Install Tesseract for OCR recovery + print(f"Page {page}: Install Tesseract for OCR recovery") +```