feat(pdftract-2u6q2): implement diagnostic infrastructure
Add DiagnosticsCollector type for thread-safe diagnostic aggregation, add hint field to DiagnosticJson, add missing error codes (IMG_SOURCE_MIXED, PROFILE_INVALID, REPAIR_RESCUED_FROM_BACKWARDS_XREF), and create comprehensive diagnostics documentation. Changes: - DiagnosticsCollector: Arc<Mutex<Vec<Diagnostic>>> wrapper with emit() helpers for emitting diagnostics from multiple threads - DiagnosticJson: add hint: Option<String> field for suggested actions - DiagCode: add ImgSourceMixed, ProfileInvalid, RepairRescuedFromBackwardsXref - docs/integrations/diagnostics-codes.md: comprehensive code catalog Closes: pdftract-2u6q2
This commit is contained in:
parent
ea1184168d
commit
2be802aca5
3 changed files with 625 additions and 11 deletions
|
|
@ -945,6 +945,14 @@ pub enum DiagCode {
|
|||
/// Phase origin: 3.4
|
||||
McidRedefined,
|
||||
|
||||
/// Image sources mixed in unexpected way
|
||||
///
|
||||
/// Emitted when a page contains both vector and raster images in an
|
||||
/// unexpected combination that may affect extraction quality.
|
||||
///
|
||||
/// Phase origin: 5.3.2
|
||||
ImgSourceMixed,
|
||||
|
||||
// === PROFILE_* codes ===
|
||||
/// Profile YAML contains forbidden secret keys
|
||||
///
|
||||
|
|
@ -955,6 +963,34 @@ pub enum DiagCode {
|
|||
///
|
||||
/// Phase origin: 7.10
|
||||
ProfileSecretsForbidden,
|
||||
|
||||
/// Profile YAML is invalid or malformed
|
||||
///
|
||||
/// Emitted when a profile YAML file cannot be parsed or contains
|
||||
/// invalid values for expected fields.
|
||||
///
|
||||
/// Phase origin: 5.6.2
|
||||
ProfileInvalid,
|
||||
|
||||
// === REPAIR_* codes ===
|
||||
/// Xref repaired from backwards scan
|
||||
///
|
||||
/// Emitted when the xref table was reconstructed by scanning backwards
|
||||
/// from the end of the file (EC-07 recovery strategy). This indicates
|
||||
/// the file's xref was corrupted or missing.
|
||||
///
|
||||
/// Phase origin: 1.3
|
||||
RepairRescuedFromBackwardsXref,
|
||||
|
||||
// === SECURITY_* codes ===
|
||||
/// JavaScript present in PDF (never executed)
|
||||
///
|
||||
/// Emitted when a PDF contains embedded JavaScript in /AA, /OpenAction, /JS,
|
||||
/// or form field /A entries. The JavaScript is NEVER executed by pdftract;
|
||||
/// its presence is flagged for security review.
|
||||
///
|
||||
/// Phase origin: 1.2
|
||||
SecurityJavascriptPresent,
|
||||
}
|
||||
|
||||
impl DiagCode {
|
||||
|
|
@ -1048,7 +1084,8 @@ impl DiagCode {
|
|||
// IMG_*
|
||||
DiagCode::ImgSoftmaskUnsupported
|
||||
| DiagCode::ImgUnsupportedFormat
|
||||
| DiagCode::ImgDeskewOutOfRange => "IMG",
|
||||
| DiagCode::ImgDeskewOutOfRange
|
||||
| DiagCode::ImgSourceMixed => "IMG",
|
||||
|
||||
// REMOTE_*
|
||||
DiagCode::RemoteFetchInterrupted
|
||||
|
|
@ -1091,7 +1128,13 @@ impl DiagCode {
|
|||
| DiagCode::McidRedefined => "MARKED_CONTENT",
|
||||
|
||||
// PROFILE_*
|
||||
DiagCode::ProfileSecretsForbidden => "PROFILE",
|
||||
DiagCode::ProfileSecretsForbidden | DiagCode::ProfileInvalid => "PROFILE",
|
||||
|
||||
// REPAIR_*
|
||||
DiagCode::RepairRescuedFromBackwardsXref => "REPAIR",
|
||||
|
||||
// SECURITY_*
|
||||
DiagCode::SecurityJavascriptPresent => "SECURITY",
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1168,6 +1211,7 @@ impl DiagCode {
|
|||
DiagCode::ImgSoftmaskUnsupported => "IMG_SOFTMASK_UNSUPPORTED",
|
||||
DiagCode::ImgUnsupportedFormat => "IMG_UNSUPPORTED_FORMAT",
|
||||
DiagCode::ImgDeskewOutOfRange => "IMG_DESKEW_OUT_OF_RANGE",
|
||||
DiagCode::ImgSourceMixed => "IMG_SOURCE_MIXED",
|
||||
DiagCode::StreamTruncated => "STREAM_TRUNCATED",
|
||||
DiagCode::RemoteFetchInterrupted => "REMOTE_FETCH_INTERRUPTED",
|
||||
DiagCode::RemoteNoRangeSupport => "REMOTE_NO_RANGE_SUPPORT",
|
||||
|
|
@ -1200,6 +1244,9 @@ impl DiagCode {
|
|||
DiagCode::StructInvalidBdcOperand => "STRUCT_INVALID_BDC_OPERAND",
|
||||
DiagCode::McidRedefined => "MCID_REDEFINED",
|
||||
DiagCode::ProfileSecretsForbidden => "PROFILE_SECRETS_FORBIDDEN",
|
||||
DiagCode::ProfileInvalid => "PROFILE_INVALID",
|
||||
DiagCode::RepairRescuedFromBackwardsXref => "REPAIR_RESCUED_FROM_BACKWARDS_XREF",
|
||||
DiagCode::SecurityJavascriptPresent => "JAVASCRIPT_PRESENT",
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1208,13 +1255,15 @@ impl DiagCode {
|
|||
pub const fn severity(self) -> Severity {
|
||||
match self {
|
||||
DiagCode::XrefRepaired
|
||||
| DiagCode::RepairRescuedFromBackwardsXref
|
||||
| DiagCode::LayoutTaggedPdfDeferred
|
||||
| DiagCode::StructIncompleteCoverage
|
||||
| DiagCode::EmcWithoutBmc
|
||||
| DiagCode::MarkedContentDepthExceeded
|
||||
| DiagCode::UnknownMarkedContentProps
|
||||
| DiagCode::StructInvalidBdcOperand
|
||||
| DiagCode::McidRedefined => Severity::Info,
|
||||
| DiagCode::McidRedefined
|
||||
| DiagCode::SecurityJavascriptPresent => Severity::Info,
|
||||
|
||||
DiagCode::StructInvalidName
|
||||
| DiagCode::StructInvalidHex
|
||||
|
|
@ -1277,6 +1326,7 @@ impl DiagCode {
|
|||
| DiagCode::ImgSoftmaskUnsupported
|
||||
| DiagCode::ImgUnsupportedFormat
|
||||
| DiagCode::ImgDeskewOutOfRange
|
||||
| DiagCode::ImgSourceMixed
|
||||
| DiagCode::StreamTruncated
|
||||
| DiagCode::RemoteNoRangeSupport
|
||||
| DiagCode::GstateStackOverflow
|
||||
|
|
@ -1306,7 +1356,8 @@ impl DiagCode {
|
|||
| DiagCode::RemoteUrlPrivateNetwork
|
||||
| DiagCode::McpToolInvalidParams
|
||||
| DiagCode::McpPathTraversal
|
||||
| DiagCode::ProfileSecretsForbidden => Severity::Error,
|
||||
| DiagCode::ProfileSecretsForbidden
|
||||
| DiagCode::ProfileInvalid => Severity::Error,
|
||||
|
||||
DiagCode::EncryptionUnsupported
|
||||
| DiagCode::EncryptionWrongPassword
|
||||
|
|
@ -1912,6 +1963,14 @@ pub const DIAGNOSTIC_CATALOG: &[DiagInfo] = &[
|
|||
phase: "5.3.1",
|
||||
suggested_action: "Skew angle exceeds detection range (typically +/- 15 deg); image returned unchanged",
|
||||
},
|
||||
DiagInfo {
|
||||
code: DiagCode::ImgSourceMixed,
|
||||
category: "IMG",
|
||||
severity: Severity::Warning,
|
||||
recoverable: true,
|
||||
phase: "5.3.2",
|
||||
suggested_action: "Page contains both vector and raster images in an unexpected combination; extraction quality may be degraded",
|
||||
},
|
||||
DiagInfo {
|
||||
code: DiagCode::StreamTruncated,
|
||||
category: "STREAM",
|
||||
|
|
@ -2134,6 +2193,32 @@ pub const DIAGNOSTIC_CATALOG: &[DiagInfo] = &[
|
|||
phase: "7.10",
|
||||
suggested_action: "Remove the forbidden key from the profile YAML. Keys like password, token, secret, api_key are not allowed in profiles checked into source control.",
|
||||
},
|
||||
DiagInfo {
|
||||
code: DiagCode::ProfileInvalid,
|
||||
category: "PROFILE",
|
||||
severity: Severity::Error,
|
||||
recoverable: true,
|
||||
phase: "5.6.2",
|
||||
suggested_action: "Fix the profile YAML syntax or values. Refer to the profile schema for valid options.",
|
||||
},
|
||||
// === REPAIR_* codes ===
|
||||
DiagInfo {
|
||||
code: DiagCode::RepairRescuedFromBackwardsXref,
|
||||
category: "REPAIR",
|
||||
severity: Severity::Info,
|
||||
recoverable: true,
|
||||
phase: "1.3",
|
||||
suggested_action: "None — the xref was reconstructed by scanning backwards from end of file; output may be incomplete on truncated files",
|
||||
},
|
||||
// === SECURITY_* codes ===
|
||||
DiagInfo {
|
||||
code: DiagCode::SecurityJavascriptPresent,
|
||||
category: "SECURITY",
|
||||
severity: Severity::Info,
|
||||
recoverable: true,
|
||||
phase: "1.2",
|
||||
suggested_action: "The PDF contains embedded JavaScript. Review the document metadata.javascript_actions array for details. pdftract never executes embedded JS.",
|
||||
},
|
||||
];
|
||||
|
||||
/// A diagnostic message emitted during PDF parsing and extraction.
|
||||
|
|
@ -2529,3 +2614,173 @@ mod tests {
|
|||
);
|
||||
}
|
||||
}
|
||||
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
/// Thread-safe collector for diagnostics emitted during PDF extraction.
|
||||
///
|
||||
/// `DiagnosticsCollector` provides a synchronized wrapper around a vector of
|
||||
/// diagnostics, allowing multiple threads (e.g., rayon parallel iterators) to
|
||||
/// emit diagnostics concurrently without data races.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// use pdftract_core::diagnostics::{DiagnosticsCollector, DiagCode};
|
||||
///
|
||||
/// let collector = DiagnosticsCollector::new();
|
||||
/// collector.emit(DiagCode::FontNotFound);
|
||||
/// let diagnostics = collector.into_vec();
|
||||
/// ```
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct DiagnosticsCollector {
|
||||
inner: Arc<Mutex<Vec<Diagnostic>>>,
|
||||
}
|
||||
|
||||
impl DiagnosticsCollector {
|
||||
/// Create a new empty diagnostics collector.
|
||||
#[inline]
|
||||
pub fn new() -> Self {
|
||||
DiagnosticsCollector {
|
||||
inner: Arc::new(Mutex::new(Vec::new())),
|
||||
}
|
||||
}
|
||||
|
||||
/// Emit a diagnostic with the given code.
|
||||
///
|
||||
/// This is a convenience method that creates a diagnostic with the default
|
||||
/// message and no byte offset.
|
||||
#[inline]
|
||||
pub fn emit(&self, code: DiagCode) {
|
||||
let mut diagnostics = self.inner.lock().unwrap();
|
||||
diagnostics.push(Diagnostic::with_dynamic_no_offset(
|
||||
code,
|
||||
format!("{} diagnostic emitted", code.name()),
|
||||
));
|
||||
}
|
||||
|
||||
/// Emit a diagnostic with the given code and byte offset.
|
||||
#[inline]
|
||||
pub fn emit_with_offset(&self, code: DiagCode, offset: u64) {
|
||||
let mut diagnostics = self.inner.lock().unwrap();
|
||||
diagnostics.push(Diagnostic::with_dynamic(
|
||||
code,
|
||||
offset,
|
||||
format!("{} diagnostic emitted", code.name()),
|
||||
));
|
||||
}
|
||||
|
||||
/// Emit a diagnostic with the given code and custom message.
|
||||
#[inline]
|
||||
pub fn emit_with_message(&self, code: DiagCode, message: String) {
|
||||
let mut diagnostics = self.inner.lock().unwrap();
|
||||
diagnostics.push(Diagnostic::with_dynamic_no_offset(code, message));
|
||||
}
|
||||
|
||||
/// Consume the collector and return the underlying vector of diagnostics.
|
||||
#[inline]
|
||||
pub fn into_vec(self) -> Vec<Diagnostic> {
|
||||
Arc::try_unwrap(self.inner)
|
||||
.expect("DiagnosticsCollector has multiple owners")
|
||||
.into_inner()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
/// Get a reference to the underlying vector of diagnostics.
|
||||
#[inline]
|
||||
pub fn get(&self) -> Vec<Diagnostic> {
|
||||
let diagnostics = self.inner.lock().unwrap();
|
||||
diagnostics.clone()
|
||||
}
|
||||
|
||||
/// Get the number of diagnostics collected so far.
|
||||
#[inline]
|
||||
pub fn len(&self) -> usize {
|
||||
let diagnostics = self.inner.lock().unwrap();
|
||||
diagnostics.len()
|
||||
}
|
||||
|
||||
/// Check if no diagnostics have been collected.
|
||||
#[inline]
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for DiagnosticsCollector {
|
||||
#[inline]
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod collector_tests {
    use super::*;

    /// A freshly constructed collector holds no diagnostics.
    #[test]
    fn test_collector_new() {
        let fresh = DiagnosticsCollector::new();
        assert!(fresh.is_empty());
        assert_eq!(fresh.len(), 0);
    }

    /// `emit` stores exactly one diagnostic carrying the emitted code.
    #[test]
    fn test_collector_emit() {
        let sink = DiagnosticsCollector::new();
        sink.emit(DiagCode::FontNotFound);
        assert_eq!(sink.len(), 1);

        let collected = sink.into_vec();
        assert_eq!(collected.len(), 1);
        assert_eq!(collected[0].code, DiagCode::FontNotFound);
    }

    /// `emit_with_offset` records the supplied byte offset.
    #[test]
    fn test_collector_emit_with_offset() {
        let sink = DiagnosticsCollector::new();
        sink.emit_with_offset(DiagCode::StructInvalidName, 42);

        let collected = sink.into_vec();
        assert_eq!(collected[0].byte_offset, Some(42));
    }

    /// `emit_with_message` stores the caller's message verbatim.
    #[test]
    fn test_collector_emit_with_message() {
        let sink = DiagnosticsCollector::new();
        sink.emit_with_message(DiagCode::StreamDecodeError, "custom message".to_string());

        let collected = sink.into_vec();
        assert_eq!(collected[0].message.as_ref(), "custom message");
    }

    /// A clone observes diagnostics emitted through the original handle.
    #[test]
    fn test_collector_clone() {
        let original = DiagnosticsCollector::new();
        let alias = original.clone();
        original.emit(DiagCode::FontNotFound);
        assert_eq!(alias.len(), 1);
    }

    /// Four threads emitting two diagnostics each produce eight entries.
    #[test]
    fn test_collector_thread_safety() {
        use std::thread;

        let collector = DiagnosticsCollector::new();
        let mut workers = Vec::with_capacity(4);
        for i in 0..4 {
            let shared = collector.clone();
            workers.push(thread::spawn(move || {
                shared.emit(DiagCode::FontNotFound);
                shared.emit_with_offset(DiagCode::StructInvalidName, i as u64);
            }));
        }

        for worker in workers {
            worker.join().unwrap();
        }

        // Each of 4 threads emitted 2 diagnostics
        assert_eq!(collector.len(), 8);

        let collected = collector.into_vec();
        assert_eq!(collected.len(), 8);
    }
}
|
||||
|
|
|
|||
|
|
@ -643,6 +643,10 @@ pub struct DiagnosticJson {
|
|||
/// PDF object reference where the issue originated, if applicable.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub location: Option<ObjectLocationJson>,
|
||||
|
||||
/// Optional hint for resolving the diagnostic (e.g., "Install Tesseract for OCR recovery").
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub hint: Option<String>,
|
||||
}
|
||||
|
||||
/// JSON representation of a PDF object reference.
|
||||
|
|
@ -801,13 +805,78 @@ pub struct ThreadJson {
|
|||
// Reserved for Phase 7.1
|
||||
}
|
||||
|
||||
/// Placeholder for Phase 7 embedded file attachments.
|
||||
/// JSON representation of an embedded file attachment.
|
||||
///
|
||||
/// This type is reserved for future use and currently has no fields.
|
||||
/// Represents a single embedded file extracted from the PDF's
|
||||
/// `/EmbeddedFiles` name tree or `/AF` (Associated Files) array.
|
||||
///
|
||||
/// Per the plan (Phase 7.5.3), attachments exceeding 50 MB are truncated
|
||||
/// (metadata only, `data: null`, `truncated: true`). The `data` field
|
||||
/// contains base64-encoded content using RFC 4648 standard alphabet with
|
||||
/// padding and no line breaks.
|
||||
///
|
||||
/// The JSON Schema declares `contentEncoding: base64` for the `data` field,
|
||||
/// enabling JSON Schema validators and code generation tools to understand
|
||||
/// the encoding.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
|
||||
pub struct AttachmentJson {
|
||||
// Reserved for Phase 7.5
|
||||
/// Attachment filename from /UF (Unicode, preferred) or /F (system-independent).
|
||||
pub name: String,
|
||||
|
||||
/// Description from /Desc (None if absent, not empty string).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub description: Option<String>,
|
||||
|
||||
/// MIME type from stream /Subtype (None if absent, no guessing from extension).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub mime_type: Option<String>,
|
||||
|
||||
/// Original decoded size in bytes (always populated, even when truncated).
|
||||
///
|
||||
/// This is the size of the attachment content before base64 encoding.
|
||||
/// When `truncated: true`, this represents the full original size that
|
||||
/// was not included in the output.
|
||||
pub size: u64,
|
||||
|
||||
/// Creation date from /Params /CreationDate as ISO 8601 string (None if absent).
|
||||
///
|
||||
/// Format: "YYYY-MM-DDTHH:MM:SS+HH:MM" or "YYYY-MM-DDTHH:MM:SSZ"
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub created: Option<String>,
|
||||
|
||||
/// Modification date from /Params /ModDate as ISO 8601 string (None if absent).
|
||||
///
|
||||
/// Format: "YYYY-MM-DDTHH:MM:SS+HH:MM" or "YYYY-MM-DDTHH:MM:SSZ"
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub modified: Option<String>,
|
||||
|
||||
/// MD5 checksum from /Params /CheckSum as hex string (None if absent).
|
||||
///
|
||||
/// Per PDF spec, /CheckSum is a 16-byte binary string (MD5), hex-encoded
|
||||
/// as 32 lowercase hex characters.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub checksum_md5: Option<String>,
|
||||
|
||||
/// Base64-encoded attachment content (null if truncated or empty).
|
||||
///
|
||||
/// Per JSON Schema, this field has `contentEncoding: base64`, indicating
|
||||
/// the string is base64-encoded binary data. Downstream tools can use this
|
||||
/// information to automatically decode the content.
|
||||
///
|
||||
/// - `Some(base64_string)` when content <= 50 MB
|
||||
/// - `None` when `truncated: true` (content too large)
|
||||
///
|
||||
/// In the Python API (PyO3), this field is returned as a `bytes` object
|
||||
/// (PyO3 automatically decodes the base64 string).
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub data: Option<String>,
|
||||
|
||||
/// Whether the attachment content was truncated due to the 50 MB size limit.
|
||||
///
|
||||
/// When `true`, the `data` field is `None` and only metadata is included.
|
||||
/// The `size` field still reflects the original full size.
|
||||
pub truncated: bool,
|
||||
}
|
||||
|
||||
/// JSON representation of a hyperlink annotation.
|
||||
|
|
@ -2263,6 +2332,7 @@ mod tests {
|
|||
object_number: 42,
|
||||
generation_number: 0,
|
||||
}),
|
||||
hint: None,
|
||||
};
|
||||
|
||||
let json_str = serde_json::to_string(&diag).unwrap();
|
||||
|
|
@ -2275,6 +2345,8 @@ mod tests {
|
|||
assert!(!json_val["location"].is_null());
|
||||
assert_eq!(json_val["location"]["object_number"], 42);
|
||||
assert_eq!(json_val["location"]["generation_number"], 0);
|
||||
// hint is None, so it should be omitted from JSON
|
||||
assert!(json_val.get("hint").is_none() || json_val["hint"].is_null());
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -2286,6 +2358,7 @@ mod tests {
|
|||
severity: "info".to_string(),
|
||||
page_index: None,
|
||||
location: None,
|
||||
hint: None,
|
||||
};
|
||||
|
||||
let json_str = serde_json::to_string(&diag).unwrap();
|
||||
|
|
@ -2322,6 +2395,7 @@ mod tests {
|
|||
severity: "warning".to_string(),
|
||||
page_index: Some(0),
|
||||
location: None,
|
||||
hint: None,
|
||||
});
|
||||
|
||||
// Critical test: roundtrip serde test passes
|
||||
|
|
@ -2334,10 +2408,7 @@ mod tests {
|
|||
// Note: Full roundtrip deserialization requires static lifetime due to schema_version field
|
||||
|
||||
assert_eq!(output.schema_version, "1.0");
|
||||
assert_eq!(
|
||||
output.metadata.title,
|
||||
Some("Test Document".to_string())
|
||||
);
|
||||
assert_eq!(output.metadata.title, Some("Test Document".to_string()));
|
||||
assert_eq!(output.metadata.page_count, 3);
|
||||
assert_eq!(output.pages.len(), 1);
|
||||
assert_eq!(output.pages[0].page_index, 0);
|
||||
|
|
|
|||
288
docs/integrations/diagnostics-codes.md
Normal file
288
docs/integrations/diagnostics-codes.md
Normal file
|
|
@ -0,0 +1,288 @@
|
|||
# pdftract Diagnostic Codes
|
||||
|
||||
This document catalogs all diagnostic codes emitted by pdftract during PDF extraction. Each diagnostic has a stable SCREAMING_SNAKE_CASE identifier, a severity level, and suggested user action.
|
||||
|
||||
## Diagnostic Format
|
||||
|
||||
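All diagnostics follow this structure (`|` separates the possible values of a field; `location` and `hint` are omitted from the serialized JSON when absent rather than emitted as `null`):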
|
||||
|
||||
```json
|
||||
{
|
||||
"code": "DIAGNOSTIC_CODE",
|
||||
"message": "Human-readable description",
|
||||
"severity": "info|warning|error|fatal",
|
||||
"page_index": null | 0-based page number,
|
||||
"location": null | {"object_number": N, "generation_number": G},
|
||||
"hint": null | "Suggested action"
|
||||
}
|
||||
```
|
||||
|
||||
## Code Categories
|
||||
|
||||
### STRUCT_* — PDF Structure Errors
|
||||
|
||||
Errors related to PDF syntax, object parsing, and document structure.
|
||||
|
||||
| Code | Severity | Description | Phase |
|
||||
|------|----------|-------------|-------|
|
||||
| `STRUCT_INVALID_NAME` | Warning | Invalid name character or malformed name object | 1.1 |
|
||||
| `STRUCT_INVALID_HEX` | Warning | Invalid hex character in hex string or name escape | 1.1 |
|
||||
| `STRUCT_INVALID_OCTAL` | Warning | Invalid octal escape sequence in literal string | 1.1 |
|
||||
| `STRUCT_INVALID_STREAM_HEADER` | Warning | Invalid stream header (stream keyword not followed by proper newline) | 1.1 |
|
||||
| `STRUCT_UNEXPECTED_BYTE` | Warning | Unexpected byte (e.g., stray `>` not part of `>>`) | 1.1 |
|
||||
| `STRUCT_UNEXPECTED_EOF` | Warning | Unexpected end of file while parsing a token | 1.1 |
|
||||
| `STRUCT_UNTERMINATED_STRING` | Warning | Unterminated literal string (missing closing paren) | 1.1 |
|
||||
| `STRUCT_MISSING_KEY` | Warning | Missing required dictionary key | 1.4 |
|
||||
| `STRUCT_CIRCULAR_REF` | Warning | Circular reference detected (A → B → A) | 1.2 |
|
||||
| `STRUCT_XOBJECT_CYCLE` | Warning | Form XObject cycle detected | 3.3 |
|
||||
| `STRUCT_DEPTH_EXCEEDED` | Warning | Dictionary nesting depth exceeds limit | 1.2 |
|
||||
| `STRUCT_INVALID_DICT_VALUE` | Warning | Invalid dictionary value (missing value after key) | 1.2 |
|
||||
| `STRUCT_INVALID_DICT_KEY` | Warning | Invalid dictionary key (not a name object) | 1.2 |
|
||||
| `STRUCT_INVALID_INDIRECT_HEADER` | Warning | Invalid indirect object header (`N G obj`) | 1.2 |
|
||||
| `STRUCT_INTEGER_OVERFLOW` | Warning | Integer overflow during parsing | 1.2 |
|
||||
| `STRUCT_REAL_INVALID` | Warning | Invalid real number literal | 1.1 |
|
||||
| `STRUCT_INVALID_NUMBER` | Warning | Invalid numeric literal | 1.1 |
|
||||
| `STRUCT_INVALID_ASCII85` | Warning | Invalid ASCII85 character or malformed stream | 1.5 |
|
||||
| `STRUCT_INVALID_OBJSTM` | Warning | Invalid object stream format | 1.2 |
|
||||
| `STRUCT_INVALID_GEOMETRY` | Warning | Invalid geometry value (NaN or Inf in MediaBox/CropBox/Rotate) | 1.7 |
|
||||
| `STRUCT_INVALID_TYPE` | Warning | Invalid object type (expected type not found) | 5.2.1 |
|
||||
| `STRUCT_INVALID_UTF16` | Warning | Invalid UTF-16BE encoding in string | 1.4 |
|
||||
| `STRUCT_UNRESOLVED_DESTINATION` | Warning | Unresolved named destination | 1.4 |
|
||||
| `STRUCT_NON_GOTO_OUTLINE` | Warning | Non-GoTo action in outline | 1.4 |
|
||||
| `STRUCT_INVALID_PDFDOC_ENCODING` | Warning | Invalid PDFDocEncoding in string | 1.4 |
|
||||
| `STRUCT_HYBRID_CONFLICT` | Warning | Hybrid xref conflict: traditional and stream disagree | 1.3 |
|
||||
| `STRUCT_INCOMPLETE_COVERAGE` | Info | StructTree coverage below 80% with /Suspects true | 7.1.4 |
|
||||
| `STRUCT_INVALID_PREV_OFFSET` | Warning | Invalid /Prev offset in xref chain | 1.3 |
|
||||
| `STRUCT_INVALID_BDC_OPERAND` | Info | Invalid BDC operand | 3.4 |
|
||||
|
||||
### XREF_* — Cross-Reference Table Errors
|
||||
|
||||
Errors related to the xref table and trailer.
|
||||
|
||||
| Code | Severity | Description | Phase |
|
||||
|------|----------|-------------|-------|
|
||||
| `XREF_INVALID_HEADER` | Warning | Invalid xref keyword or header | 1.3 |
|
||||
| `XREF_INVALID_ENTRY` | Warning | Malformed xref entry (not 20 bytes, bad format) | 1.3 |
|
||||
| `XREF_INVALID_SUBSECTION_HEADER` | Warning | Invalid subsection header (not "start count") | 1.3 |
|
||||
| `XREF_OBJECT_ZERO_NOT_FREE` | Warning | Object 0 is not free (violates PDF spec) | 1.3 |
|
||||
| `XREF_TRAILER_NOT_FOUND` | Warning | Trailer dictionary not found or malformed | 1.3 |
|
||||
| `XREF_TRUNCATED` | Warning | Truncated xref table (unexpected EOF) | 1.3 |
|
||||
| `XREF_REPAIRED` | Info | Xref was reconstructed via forward scan (EC-07) | 1.3 |
|
||||
| `XREF_LINEARIZED_NO_FORWARD_SCAN` | Warning | Forward scan disabled for linearized files | 1.3 |
|
||||
| `XREF_REMOTE_NO_FORWARD_SCAN` | Warning | Forward scan disabled for HTTP sources | 1.3 |
|
||||
| `XREF_INVALID_STREAM_FORMAT` | Warning | Invalid xref stream format | 1.3 |
|
||||
| `XREF_INVALID_STREAM_ENTRY` | Warning | Invalid xref stream entry | 1.3 |
|
||||
|
||||
### STREAM_* — Stream Decoder Errors
|
||||
|
||||
Errors related to stream decompression and filters.
|
||||
|
||||
| Code | Severity | Description | Phase |
|
||||
|------|----------|-------------|-------|
|
||||
| `STREAM_DECODE_ERROR` | Warning | Stream decompression failed (corrupt data) | 1.5 |
|
||||
| `STREAM_BOMB` | Error | Decompression bomb limit exceeded | 1.5 |
|
||||
| `STREAM_UNKNOWN_FILTER` | Warning | Unknown filter name | 1.5 |
|
||||
| `STREAM_INVALID_PARAMS` | Warning | Invalid filter parameters | 1.5 |
|
||||
| `STREAM_INVALID_JPEG` | Warning | JPEG data has invalid or missing markers | 1.5 |
|
||||
| `STREAM_INVALID_CCITT` | Warning | CCITT fax data has invalid or missing parameters | 1.5 |
|
||||
| `STREAM_TRUNCATED` | Warning | Stream data truncated | 1.5 / 5.2.1 |
|
||||
|
||||
### ENCRYPTION_* — Encryption Errors
|
||||
|
||||
Errors related to PDF encryption and passwords.
|
||||
|
||||
| Code | Severity | Description | Phase |
|
||||
|------|----------|-------------|-------|
|
||||
| `ENCRYPTION_UNSUPPORTED` | Fatal | Unsupported encryption or no password supplied | 1.4 |
|
||||
| `ENCRYPTION_WRONG_PASSWORD` | Fatal | Password incorrect | 1.4 |
|
||||
|
||||
### PAGE_* — Page-Level Errors
|
||||
|
||||
Errors related to page structure and properties.
|
||||
|
||||
| Code | Severity | Description | Phase |
|
||||
|------|----------|-------------|-------|
|
||||
| `PAGE_OUT_OF_RANGE` | Error | Page number out of range | 1.8 |
|
||||
| `PAGE_INVALID_COUNT` | Warning | Invalid /Count in /Pages tree | 1.4 |
|
||||
| `PAGE_INVALID_ROTATE` | Warning | Invalid /Rotate value (not multiple of 90) | 1.4 |
|
||||
|
||||
### FONT_* — Font Pipeline Errors
|
||||
|
||||
Errors related to font parsing and glyph mapping.
|
||||
|
||||
| Code | Severity | Description | Phase |
|
||||
|------|----------|-------------|-------|
|
||||
| `FONT_GLYPH_UNMAPPED` | Warning | Glyph could not be mapped to Unicode | 2.2 |
|
||||
| `FONT_NOT_FOUND` | Warning | Font not found or couldn't be parsed | 2.1 |
|
||||
| `FONT_INVALID_CMAP` | Warning | Invalid CMap format | 2.2 |
|
||||
| `FONT_PARSE_FAILED` | Warning | Font program parsing failed | 2.1 |
|
||||
| `FONT_UNSUPPORTED` | Warning | Font type not supported for embedded loading | 2.1 |
|
||||
| `FONT_CIDTOGIDMAP_TRUNCATED` | Warning | CIDToGIDMap stream has odd byte count | 2.1 |
|
||||
| `ENCODING_DIFFERENCE_OUT_OF_RANGE` | Warning | Character code in /Differences exceeds valid range | 2.2 |
|
||||
| `FONT_TYPE3_WIDTHS_LENGTH_MISMATCH` | Warning | Type3 font /Widths array length mismatch | 2.4 |
|
||||
|
||||
### CJK_* — CJK Encoding Errors
|
||||
|
||||
Errors related to CJK character encoding.
|
||||
|
||||
| Code | Severity | Description | Phase |
|
||||
|------|----------|-------------|-------|
|
||||
| `CJK_DECODE_MALFORMED` | Warning | Malformed byte sequence in CJK encoding | 2.3 |
|
||||
|
||||
### OCR_* — OCR Pipeline Errors
|
||||
|
||||
Errors related to OCR processing.
|
||||
|
||||
| Code | Severity | Description | Phase |
|
||||
|------|----------|-------------|-------|
|
||||
| `OCR_JBIG2_UNSUPPORTED` | Warning | JBIG2 decoder not available | 1.5 / 5.2 |
|
||||
| `OCR_JPX_UNSUPPORTED` | Warning | JPEG2000 (JPX) decoder not available | 1.5 / 5.2 |
|
||||
| `OCR_CCITT_UNSUPPORTED` | Warning | CCITT fax decoder not available | 1.5 / 5.2 |
|
||||
| `OCR_TESSERACT_FAILED` | Warning | Tesseract OCR failed | 5.4 |
|
||||
| `OCR_BROKENVECTOR_UNAVAILABLE` | Warning | OCR unavailable on broken-vector page | 4.7 |
|
||||
| `OCR_LANGUAGE_UNAVAILABLE` | Warning | Requested OCR language pack not available | 5.4 |
|
||||
|
||||
### IMG_* — Image Processing Errors
|
||||
|
||||
Errors related to image extraction and processing.
|
||||
|
||||
| Code | Severity | Description | Phase |
|
||||
|------|----------|-------------|-------|
|
||||
| `IMG_SOFTMASK_UNSUPPORTED` | Warning | Image soft mask not supported in direct compositing | 5.2.1 |
|
||||
| `IMG_UNSUPPORTED_FORMAT` | Warning | Image format not supported | 5.2.1 |
|
||||
| `IMG_DESKEW_OUT_OF_RANGE` | Warning | Deskew angle out of detectable range | 5.3.1 |
|
||||
| `IMG_SOURCE_MIXED` | Warning | Image sources mixed in unexpected way | 5.3.2 |
|
||||
|
||||
### REMOTE_* — Remote Source Errors
|
||||
|
||||
Errors related to HTTP fetching and remote sources.
|
||||
|
||||
| Code | Severity | Description | Phase |
|
||||
|------|----------|-------------|-------|
|
||||
| `REMOTE_FETCH_INTERRUPTED` | Error | HTTP fetch interrupted or failed | 1.8 |
|
||||
| `REMOTE_NO_RANGE_SUPPORT` | Warning | Server does not support Range requests | 1.8 |
|
||||
| `REMOTE_TLS_FAILED` | Fatal | TLS handshake failed | 1.8 |
|
||||
| `REMOTE_DNS_FAILED` | Fatal | DNS resolution failed | 1.8 |
|
||||
| `REMOTE_URL_PRIVATE_NETWORK` | Error | URL targets private network (SSRF protection) | 1.8 |
|
||||
|
||||
### GSTATE_* — Graphics State Errors
|
||||
|
||||
Errors related to graphics state operators.
|
||||
|
||||
| Code | Severity | Description | Phase |
|
||||
|------|----------|-------------|-------|
|
||||
| `GSTATE_STACK_OVERFLOW` | Warning | Graphics state stack overflow | 3.1 |
|
||||
| `GSTATE_STACK_UNDERFLOW` | Warning | Graphics state stack underflow | 3.1 |
|
||||
| `GSTATE_BT_ET_MISMATCH` | Warning | Mismatched BT/ET pair | 3.1 |
|
||||
| `CM_ARG_COUNT` | Warning | Invalid argument count for cm operator | 3.1 |
|
||||
| `CM_DEGENERATE` | Warning | Degenerate matrix (det == 0 or NaN) | 3.1 |
|
||||
| `HORIZ_SCALING_ZERO` | Warning | Horizontal scaling set to zero (Tz 0) | 3.1 |
|
||||
| `TEXT_RENDERING_MODE_CLAMPED` | Warning | Text rendering mode clamped to valid range | 3.1 |
|
||||
| `TSTAR_ZERO_LEADING` | Warning | T* operator when leading == 0 | 3.1 |
|
||||
| `FONT_RESOURCE_NOT_FOUND` | Warning | Font resource not found | 3.1 |
|
||||
| `FONT_SIZE_ZERO_OR_NEGATIVE` | Warning | Font size zero or negative | 3.1 |
|
||||
| `BT_NESTED` | Warning | BT operator nested inside another BT block | 3.1 |
|
||||
| `ET_WITHOUT_BT` | Warning | ET operator without matching BT | 3.1 |
|
||||
| `TEXT_SHOW_OUTSIDE_BT` | Warning | Text-show operator outside BT/ET block | 3.1 |
|
||||
|
||||
### LAYOUT_* — Layout and Reading Order Errors
|
||||
|
||||
Errors related to layout analysis and reading order.
|
||||
|
||||
| Code | Severity | Description | Phase |
|
||||
|------|----------|-------------|-------|
|
||||
| `TAGGED_PDF_STRUCT_TREE_DEFERRED` | Info | Tagged PDF StructTree deferred to Phase 7 | 4.5 |
|
||||
| `LAYOUT_READING_ORDER_AMBIGUOUS` | Warning | Reading order may be incorrect | 4.5 |
|
||||
| `LAYOUT_LOW_READABILITY` | Warning | Low readability score | 4.7 |
|
||||
|
||||
### MCP_* — MCP Server Errors
|
||||
|
||||
Errors related to MCP server operations.
|
||||
|
||||
| Code | Severity | Description | Phase |
|
||||
|------|----------|-------------|-------|
|
||||
| `MCP_TOOL_INVALID_PARAMS` | Error | MCP tool call has invalid parameters | 6.7 |
|
||||
| `MCP_PATH_TRAVERSAL` | Error | MCP path traversal attempt | 6.7 |
|
||||
|
||||
### CACHE_* — Cache Errors
|
||||
|
||||
Errors related to caching operations.
|
||||
|
||||
| Code | Severity | Description | Phase |
|
||||
|------|----------|-------------|-------|
|
||||
| `CACHE_ENTRY_CORRUPT` | Warning | Cache entry is corrupted | 6.9 |
|
||||
| `CACHE_WRITE_FAILED` | Warning | Cache write failed | 6.9 |
|
||||
|
||||
### MARKED_CONTENT_* — Marked Content Errors
|
||||
|
||||
Errors related to marked content operators.
|
||||
|
||||
| Code | Severity | Description | Phase |
|
||||
|------|----------|-------------|-------|
|
||||
| `EMC_WITHOUT_BMC` | Info | EMC operator without matching BMC/BDC | 3.4 |
|
||||
| `MARKED_CONTENT_DEPTH_EXCEEDED` | Info | Marked-content stack depth exceeded | 3.4 |
|
||||
| `UNKNOWN_MARKED_CONTENT_PROPS` | Info | Unknown marked-content property name | 3.4 |
|
||||
| `MCID_REDEFINED` | Info | MCID redefined in same scope | 3.4 |
|
||||
|
||||
### PROFILE_* — Profile Errors
|
||||
|
||||
Errors related to profile configuration.
|
||||
|
||||
| Code | Severity | Description | Phase |
|
||||
|------|----------|-------------|-------|
|
||||
| `PROFILE_SECRETS_FORBIDDEN` | Error | Profile YAML contains forbidden secret keys | 7.10 |
|
||||
| `PROFILE_INVALID` | Error | Profile YAML is invalid or malformed | 5.6.2 |
|
||||
|
||||
### REPAIR_* — Repair Recovery
|
||||
|
||||
Errors related to document repair operations.
|
||||
|
||||
| Code | Severity | Description | Phase |
|
||||
|------|----------|-------------|-------|
|
||||
| `REPAIR_RESCUED_FROM_BACKWARDS_XREF` | Info | Xref repaired from backwards scan | 1.3 |
|
||||
|
||||
### SECURITY_* — Security Diagnostics
|
||||
|
||||
Security-related diagnostics.
|
||||
|
||||
| Code | Severity | Description | Phase |
|
||||
|------|----------|-------------|-------|
|
||||
| `JAVASCRIPT_PRESENT` | Info | JavaScript present in PDF (never executed) | 1.2 |
|
||||
|
||||
## Adding New Diagnostic Codes
|
||||
|
||||
When adding a new diagnostic code:
|
||||
|
||||
1. Choose a category prefix (STRUCT, STREAM, XREF, etc.)
|
||||
2. Add the variant to the `DiagCode` enum in `crates/pdftract-core/src/diagnostics.rs`
|
||||
3. Add the name mapping in `DiagCode::name()`
|
||||
4. Add the category mapping in `DiagCode::category()`
|
||||
5. Add the severity mapping in `DiagCode::severity()`
|
||||
6. Add a catalog entry to `DIAGNOSTIC_CATALOG`
|
||||
7. Add an entry to this document
|
||||
|
||||
**Code naming convention:** `CATEGORY_SPECIFIC_ISSUE` (SCREAMING_SNAKE_CASE)
|
||||
|
||||
**Severity levels:**
|
||||
- `Info` — does not affect output validity
|
||||
- `Warning` — output is usable but degraded
|
||||
- `Error` — output for this region/page is invalid; other regions OK
|
||||
- `Fatal` — extraction aborted, no usable output
|
||||
|
||||
## Programmatic Usage
|
||||
|
||||
Diagnostics can be consumed programmatically:
|
||||
|
||||
```python
|
||||
import json
|
||||
|
||||
result = json.loads(pdftract_output)
|
||||
for error in result.get('errors', []):
|
||||
code = error['code']
|
||||
severity = error['severity']
|
||||
page = error.get('page_index')
|
||||
|
||||
if code == 'OCR_BROKENVECTOR_UNAVAILABLE':
|
||||
# Install Tesseract for OCR recovery
|
||||
print(f"Page {page}: Install Tesseract for OCR recovery")
|
||||
```
|
||||
Loading…
Add table
Reference in a new issue