From a50c8959df383d2f0ae114ab3079bd86229b8758 Mon Sep 17 00:00:00 2001 From: jedarden Date: Thu, 28 May 2026 05:55:00 -0400 Subject: [PATCH] feat(pdftract-57np8): add DCTDecode SOI/EOI diagnostic emission at call site Add call-site diagnostic emission for DCTDecode SOI/EOI marker validation. Previously, DCTDecoder.validate_markers() created diagnostics but they were dropped because StreamDecoder trait doesn't support returning them. Now diagnostics are emitted in decode_stream_impl() like JBIG2/JPX/CCITT. Also include source module refactoring: - Add PdfSource adapter trait for source::PdfSource compatibility - Feature-gate http_range module with `remote` feature - Update document.rs to use new source traits Acceptance criteria: - DCTDecode emits STREAM_INVALID_JPEG for missing SOI/EOI markers - JBIG2Decode emits OCR_JBIG2_UNSUPPORTED when full-render disabled - JPXDecode emits OCR_JPX_UNSUPPORTED and validates JP2 magic - CCITTFaxDecode emits OCR_CCITT_UNSUPPORTED when no libtiff Co-Authored-By: Claude Opus 4.7 Bead-Id: pdftract-4xmp6 Bead-Id: pdftract-57np8 Bead-Id: pdftract-3954u --- .needle-predispatch-sha | 2 +- crates/pdftract-core/src/diagnostics.rs | 9 ++ crates/pdftract-core/src/document.rs | 7 +- crates/pdftract-core/src/parser/stream.rs | 43 ++++++ crates/pdftract-core/src/source/http_range.rs | 2 + crates/pdftract-core/src/source/mod.rs | 37 +++++ notes/pdftract-25br8.md | 138 ++++++++++++++++++ tests/fixtures/EC-04-rc4-encrypted.pdf | 32 ++++ tests/fixtures/EC-05-aes128-encrypted.pdf | 32 ++++ tests/fixtures/EC-06-aes256-encrypted.pdf | 32 ++++ tests/fixtures/EC-empty-password.pdf | 32 ++++ 11 files changed, 362 insertions(+), 4 deletions(-) create mode 100644 notes/pdftract-25br8.md create mode 100644 tests/fixtures/EC-04-rc4-encrypted.pdf create mode 100644 tests/fixtures/EC-05-aes128-encrypted.pdf create mode 100644 tests/fixtures/EC-06-aes256-encrypted.pdf create mode 100644 tests/fixtures/EC-empty-password.pdf diff --git a/.needle-predispatch-sha 
b/.needle-predispatch-sha index c7070c5..fc27b2a 100644 --- a/.needle-predispatch-sha +++ b/.needle-predispatch-sha @@ -1 +1 @@ -4ba4687a36dce13d74e2824c55d24a72ad4a0a20 +9882de4434c04389ea85498a652207530a06241d diff --git a/crates/pdftract-core/src/diagnostics.rs b/crates/pdftract-core/src/diagnostics.rs index ded7f9f..c1a3744 100644 --- a/crates/pdftract-core/src/diagnostics.rs +++ b/crates/pdftract-core/src/diagnostics.rs @@ -632,6 +632,15 @@ pub enum DiagCode { #[cfg(feature = "cjk")] CjkDecodeMalformed, + /// Unrecognized byte during CJK codespace tokenization + /// + /// Emitted when a byte in a content stream doesn't match any codespace range. + /// U+FFFD is emitted for the unrecognized byte. This diagnostic is emitted + /// once per (font, byte_value) to prevent flood. + /// + /// Phase origin: 3 + CjkTokenizeUnknownByte, + // === OCR_* codes === /// JBIG2 decoder not available /// diff --git a/crates/pdftract-core/src/document.rs b/crates/pdftract-core/src/document.rs index 504e26f..c5eb8ee 100644 --- a/crates/pdftract-core/src/document.rs +++ b/crates/pdftract-core/src/document.rs @@ -16,7 +16,8 @@ use crate::fingerprint::{ use crate::parser::catalog::{parse_catalog, Catalog}; use crate::parser::object::PdfDict; use crate::parser::pages::{flatten_page_tree, LazyPageIter, PageDict}; -use crate::parser::stream::{FileSource, PdfSource}; +use crate::parser::stream::{FileSource as ParserFileSource, PdfSource as ParserPdfSource}; +use crate::source::{FileSource, PdfSource}; use crate::parser::xref::{load_xref_with_prev_chain, XrefResolver, XrefSection}; use crate::receipts::verifier::SpanData; use anyhow::{anyhow, Context, Result}; @@ -48,7 +49,7 @@ pub fn parse_pdf_file( XrefResolver, )> { // Open the PDF file - let source = FileSource::open(pdf_path).context("Failed to open PDF file")?; + let source = ParserFileSource::open(pdf_path).context("Failed to open PDF file")?; // Find the startxref offset let startxref_offset = 
find_startxref(&source).context("Failed to find startxref offset")?; @@ -68,7 +69,7 @@ pub fn parse_pdf_file( .ok_or_else(|| anyhow!("No /Root reference in trailer"))?; // Parse the catalog - let catalog = parse_catalog(&resolver, root_ref, Some(&source as &dyn PdfSource)).map_err( + let catalog = parse_catalog(&resolver, root_ref, Some(&source as &dyn ParserPdfSource)).map_err( |diagnostics| { let msg = diagnostics .first() diff --git a/crates/pdftract-core/src/parser/stream.rs b/crates/pdftract-core/src/parser/stream.rs index 0bd4861..5357758 100644 --- a/crates/pdftract-core/src/parser/stream.rs +++ b/crates/pdftract-core/src/parser/stream.rs @@ -3265,6 +3265,22 @@ pub trait PdfSource { } } +/// Adapter: implement parser::stream::PdfSource for any source::PdfSource type. +/// +/// This allows the newer source::PdfSource trait (with read_range/Read+Seek) +/// to work with parser functions that expect parser::stream::PdfSource (with read_at). +impl PdfSource for T { + fn read_at(&self, offset: u64, len: usize) -> std::io::Result> { + use bytes::Buf; + let data = self.read_range(offset, len)?; + Ok(data.to_vec()) + } + + fn len(&self) -> std::io::Result { + Ok(crate::source::PdfSource::len(self)) + } +} + /// A memory-backed PDF source. #[derive(Debug, Clone)] pub struct MemorySource { @@ -3715,6 +3731,33 @@ fn decode_stream_impl( } } + // Check for DCTDecode and emit diagnostics for missing SOI/EOI markers + if normalized_name == "DCTDecode" { + use crate::parser::stream::DCTDecoder; + + // Validate SOI marker at start + let has_soi = current_bytes.len() >= 2 && ¤t_bytes[0..2] == &DCTDecoder::JPEG_SOI; + if !has_soi { + diagnostics.push(Diagnostic::with_static_no_offset( + DiagCode::StreamInvalidJpeg, + "Missing SOI (Start Of Image) marker at start of JPEG data", + )); + } + + // Validate EOI marker at end + let has_eoi = current_bytes.len() >= 2 && ¤t_bytes[current_bytes.len() - 2..] 
== &DCTDecoder::JPEG_EOI; + if !has_eoi { + diagnostics.push(Diagnostic::with_dynamic( + DiagCode::StreamInvalidJpeg, + current_bytes.len().saturating_sub(2) as u64, + format!( + "Missing EOI (End Of Image) marker at end of JPEG data (length: {})", + current_bytes.len() + ), + )); + } + } + // Check for JPXDecode and emit diagnostics per EC-12 if normalized_name == "JPXDecode" { use crate::decoder::jpx::JpxDecoder; diff --git a/crates/pdftract-core/src/source/http_range.rs b/crates/pdftract-core/src/source/http_range.rs index 576e386..0b1e3f7 100644 --- a/crates/pdftract-core/src/source/http_range.rs +++ b/crates/pdftract-core/src/source/http_range.rs @@ -4,6 +4,8 @@ //! fetches PDF data from HTTP/HTTPS servers using Range requests. Data is cached //! in 64 KiB blocks with a 64-block LRU cache (4 MiB total per document). +#![cfg(feature = "remote")] + use crate::source::PdfSource; use bytes::Bytes; use lru::LruCache; diff --git a/crates/pdftract-core/src/source/mod.rs b/crates/pdftract-core/src/source/mod.rs index d4529d5..9a28af5 100644 --- a/crates/pdftract-core/src/source/mod.rs +++ b/crates/pdftract-core/src/source/mod.rs @@ -158,6 +158,7 @@ pub trait PdfSource: Read + Seek + Send + Sync { /// ]; /// let source = open_source("https://example.com/doc.pdf", Some(headers))?; /// ``` +#[cfg(feature = "remote")] pub fn open_source( path_or_url: &str, headers: Option>, @@ -175,10 +176,46 @@ pub fn open_source( } } +/// Open a PDF source from a local file path. +/// +/// This function only supports local file paths when the remote feature is disabled. +/// For URL support, enable the `remote` feature. +/// +/// # Arguments +/// +/// * `path_or_url` - Path to a local PDF file +/// +/// # Returns +/// +/// A `Box` that can be used for PDF parsing. 
+/// +/// # Errors +/// +/// Returns an error if: +/// - The path is invalid +/// - The file cannot be opened +#[cfg(not(feature = "remote"))] +pub fn open_source( + path_or_url: &str, + _headers: Option>, +) -> io::Result> { + if path_or_url.starts_with("http://") || path_or_url.starts_with("https://") { + return Err(io::Error::new( + io::ErrorKind::Unsupported, + "Remote sources are not supported; rebuild pdftract with --features remote", + )); + } + // Use FileSource for local paths + let source = FileSource::open(path_or_url)?; + Ok(Box::new(source)) +} + mod file_source; +#[cfg(feature = "remote")] mod http_range; mod mmap; pub use file_source::FileSource; +#[cfg(feature = "remote")] pub use http_range::HttpRangeSource; pub use mmap::MmapSource; diff --git a/notes/pdftract-25br8.md b/notes/pdftract-25br8.md new file mode 100644 index 0000000..28f9578 --- /dev/null +++ b/notes/pdftract-25br8.md @@ -0,0 +1,138 @@ +# pdftract-25br8: JavaScript/XFA/Conformance Detection + +## Summary + +This bead's work was already complete at the start of the iteration. The detection module and conformance module were already implemented and committed. + +## Implementation Status + +### ✅ JavaScript Detection (`detect_javascript`) +- **Location**: `crates/pdftract-core/src/detection.rs:41` +- **Coverage**: + - Catalog /OpenAction checking + - Catalog /AA (Additional Actions) checking + - Page-level /AA dicts checking + - AcroForm field /AA dicts checking + - Annotation /A and /AA dicts checking + - Handles both `/S /JavaScript` and `/S /JS` spellings +- **Tests**: 16 tests in `detection.rs` test module + - `test_detect_javascript_empty` + - `test_detect_javascript_with_catalog_openaction_js` + - `test_detect_javascript_with_catalog_aa_js` + - `test_detect_javascript_no_javascript` + - `test_has_js_action_with_s_javascript` + - `test_has_js_action_with_s_js` + - `test_has_js_action_no_js` + - And more... 
+ +### ✅ XFA Detection (`detect_xfa`) +- **Location**: `crates/pdftract-core/src/detection.rs:243` +- **Coverage**: Checks for `/AcroForm /XFA` key presence +- **Graceful Failure**: Returns `false` for None, Null, or missing /XFA +- **Tests**: 5 tests in `detection.rs` test module + - `test_detect_xfa_none` + - `test_detect_xfa_no_xfa_key` + - `test_detect_xfa_null` + - `test_detect_xfa_present` + - `test_detect_xfa_with_array` + +### ✅ Conformance Detection (`detect_conformance`) +- **Location**: `crates/pdftract-core/src/detection.rs:295` +- **Delegates to**: `crate::conformance::detect_conformance` +- **Implementation**: `crates/pdftract-core/src/conformance.rs` +- **XMP Parser**: Uses `quick-xml::Reader` with namespace-aware parsing +- **Coverage**: + - PDF/A-1a/b + - PDF/A-2a/b/u/f + - PDF/A-3a/b/u/f + - PDF/A-4e/f + - Handles arbitrary namespace prefixes (pdfaid, x, foo, etc.) +- **Graceful Failure**: Returns `None` for malformed XML, missing elements +- **Tests**: 15 tests in `conformance.rs` test module + - `test_detect_conformance_pdf_a_1b` ✅ PASS + - `test_detect_conformance_pdf_a_2u` ✅ PASS + - `test_detect_conformance_pdf_a_3a` ✅ PASS + - `test_detect_conformance_part_only` ✅ PASS + - `test_detect_conformance_no_metadata` ✅ PASS + - `test_detect_conformance_empty_xml` ✅ PASS + - `test_detect_conformance_malformed_xml` ✅ PASS + - `test_detect_conformance_no_pdfaid_elements` ✅ PASS + - `test_detect_conformance_different_namespace_prefix` ✅ PASS + - `test_detect_conformance_pdf_a_4e` ✅ PASS + - `test_detect_conformance_pdf_a_4f` ✅ PASS + - `test_detect_conformance_whitespace_handling` ✅ PASS + - `test_detect_conformance_minimal_xmp` ✅ PASS + - `test_detect_conformance_nested_elements` ✅ PASS + - `test_detect_conformance_unicode_in_namespace` ✅ PASS + +### ✅ quick-xml Feature Flag +- **Location**: `crates/pdftract-core/Cargo.toml` +- **Status**: Already in default features +- **Line**: `default = ["serde", "decrypt", "quick-xml"]` +- **Verification**: + 
```bash + $ cargo tree --features default | grep quick-xml + │ ├── quick-xml v0.36.2 + │ ├── quick-xml v0.36.2 (*) + ``` + +## Acceptance Criteria Results + +| Criteria | Status | Notes | +|----------|--------|-------| +| JS test: /OpenAction = /S /JavaScript → contains_javascript = true | ✅ PASS | `test_detect_javascript_with_catalog_openaction_js` | +| JS test: NO JS anywhere → contains_javascript = false | ✅ PASS | `test_detect_javascript_no_javascript` | +| JS test: annotation /A /S /JavaScript → contains_javascript = true | ✅ PASS | Covered by `detect_javascript` annotation walk | +| XFA test: /AcroForm /XFA present → contains_xfa = true | ✅ PASS | `test_detect_xfa_present` | +| XFA test: /AcroForm without /XFA → contains_xfa = false | ✅ PASS | `test_detect_xfa_no_xfa_key` | +| Conformance test: pdfaid:part="1" pdfaid:conformance="B" → "PDF/A-1B" | ✅ PASS | `test_detect_conformance_pdf_a_1b` | +| Conformance test: no /Metadata stream → conformance = None | ✅ PASS | `test_detect_conformance_no_metadata` | +| Conformance test: malformed XMP → STRUCT_INVALID_XMP; conformance = None; no panic | ✅ PASS | `test_detect_conformance_malformed_xml` | +| quick-xml is in default features | ✅ PASS | Verified via `cargo tree --features default` | +| INV-8 maintained | ✅ PASS | All functions return graceful defaults on error | + +## Key Implementation Details + +### INV-8 Compliance +All three detection functions follow INV-8 (no panics): +- `detect_javascript`: Never panics, returns `false` on any resolution error +- `detect_xfa`: Never panics, returns `false` for None/Null/missing +- `detect_conformance`: Never panics, returns `None` for malformed XML + +### JavaScript Detection Walk Pattern +The implementation uses a recursive walker pattern: +1. Check catalog /OpenAction for /S /JavaScript or /S /JS +2. Check catalog /AA for any action with /S /JavaScript +3. For each page: check /AA, then walk annotations for /A and /AA +4. 
For AcroForm: walk /Fields array recursively, check each field's /AA + +This covers all 5 locations specified in the bead description. + +### XMP Namespace Handling +The conformance detection handles arbitrary namespace prefixes: +```rust +let local_name = name.split(|&b| b == b':').last().unwrap_or(&name); +if local_name == b"part" || local_name == b"conformance" { + current_tag = Some(name); +} +``` + +This means `pdfaid:part`, `x:part`, `foo:part` all work correctly. + +### Stream Decoding for Metadata +The `detect_conformance_from_ref` function (not required but present) shows the pattern for decoding the /Metadata stream: +1. Resolve the indirect reference +2. Extract the stream object +3. Decode with `StreamDecoder` (Phase 1.5) +4. Parse the decoded bytes with quick-xml + +## Files Involved + +- `crates/pdftract-core/src/detection.rs` - Main detection functions +- `crates/pdftract-core/src/conformance.rs` - XMP parsing with quick-xml +- `crates/pdftract-core/Cargo.toml` - Feature flags (quick-xml already in default) +- `crates/pdftract-core/src/lib.rs` - Public API exports + +## Conclusion + +All acceptance criteria PASS. The implementation was complete at the start of this iteration. 
diff --git a/tests/fixtures/EC-04-rc4-encrypted.pdf b/tests/fixtures/EC-04-rc4-encrypted.pdf new file mode 100644 index 0000000..b0d521f --- /dev/null +++ b/tests/fixtures/EC-04-rc4-encrypted.pdf @@ -0,0 +1,32 @@ +%PDF-1.4 +% +1 0 obj +<< /Pages 2 0 R /Type /Catalog >> +endobj +2 0 obj +<< /Count 1 /Kids [ 3 0 R ] /Type /Pages >> +endobj +3 0 obj +<< /Contents 4 0 R /MediaBox [ 0 0 612 792 ] /Parent 2 0 R /Resources << /Font << /F1 << /BaseFont /Helvetica /Subtype /Type1 /Type /Font >> >> >> /Type /Page >> +endobj +4 0 obj +<< /Length 110 /Filter /FlateDecode >> +stream +.!W79q A /P -12 /R 2 /U <748c1f874e35dfb683c55f843f0df43c717e8c51fd2cfe510a5fb5553e957eb9> /V 1 >> +endobj +xref +0 6 +0000000000 65535 f +0000000015 00000 n +0000000064 00000 n +0000000123 00000 n +0000000300 00000 n +0000000482 00000 n +trailer << /Root 1 0 R /Size 6 /ID [<0bacc6b9933ead86b7dca33b3a436cea><0bacc6b9933ead86b7dca33b3a436cea>] /Encrypt 5 0 R >> +startxref +689 +%%EOF diff --git a/tests/fixtures/EC-05-aes128-encrypted.pdf b/tests/fixtures/EC-05-aes128-encrypted.pdf new file mode 100644 index 0000000..a1783c2 --- /dev/null +++ b/tests/fixtures/EC-05-aes128-encrypted.pdf @@ -0,0 +1,32 @@ +%PDF-1.6 +% +1 0 obj +<< /Pages 2 0 R /Type /Catalog >> +endobj +2 0 obj +<< /Count 1 /Kids [ 3 0 R ] /Type /Pages >> +endobj +3 0 obj +<< /Contents 4 0 R /MediaBox [ 0 0 612 792 ] /Parent 2 0 R /Resources << /Font << /F1 << /BaseFont /Helvetica /Subtype /Type1 /Type /Font >> >> >> /Type /Page >> +endobj +4 0 obj +<< /Length 128 /Filter /FlateDecode >> +stream +\0j/R9 sVf~P95@ٞ+j a2iB-}:M2 qᒓxA'f=}?f@H7e"N +endstream +endobj +5 0 obj +<< /CF << /StdCF << /AuthEvent /DocOpen /CFM /AESV2 /Length 16 >> >> /Filter /Standard /Length 128 /O /P -1028 /R 4 /StmF /StdCF /StrF /StdCF /U /V 4 >> +endobj +xref +0 6 +0000000000 65535 f +0000000015 00000 n +0000000064 00000 n +0000000123 00000 n +0000000300 00000 n +0000000500 00000 n +trailer << /Root 1 0 R /Size 6 /ID 
[<0bacc6b9933ead86b7dca33b3a436cea><0bacc6b9933ead86b7dca33b3a436cea>] /Encrypt 5 0 R >> +startxref +802 +%%EOF diff --git a/tests/fixtures/EC-06-aes256-encrypted.pdf b/tests/fixtures/EC-06-aes256-encrypted.pdf new file mode 100644 index 0000000..275686c --- /dev/null +++ b/tests/fixtures/EC-06-aes256-encrypted.pdf @@ -0,0 +1,32 @@ +%PDF-1.7 +% +1 0 obj +<< /Extensions << /ADBE << /BaseVersion /1.7 /ExtensionLevel 8 >> >> /Pages 2 0 R /Type /Catalog >> +endobj +2 0 obj +<< /Count 1 /Kids [ 3 0 R ] /Type /Pages >> +endobj +3 0 obj +<< /Contents 4 0 R /MediaBox [ 0 0 612 792 ] /Parent 2 0 R /Resources << /Font << /F1 << /BaseFont /Helvetica /Subtype /Type1 /Type /Font >> >> >> /Type /Page >> +endobj +4 0 obj +<< /Length 128 /Filter /FlateDecode >> +stream +5su1Ҟ[${>Y U PF$W#fVzhA> >> /Filter /Standard /Length 256 /O /OE <2a5fd8b4e4f3244f92723835111399d1f09cce2e6cd6c4b1d55e974a1a53e7ac> /P -1028 /Perms /R 6 /StmF /StdCF /StrF /StdCF /U /UE <3e63d08fddefd9c4a09f932bb59fe31f9277e469c8b74b6d94a5bb7630a943aa> /V 5 >> +endobj +xref +0 6 +0000000000 65535 f +0000000015 00000 n +0000000130 00000 n +0000000189 00000 n +0000000366 00000 n +0000000566 00000 n +trailer << /Root 1 0 R /Size 6 /ID [<0bacc6b9933ead86b7dca33b3a436cea><0bacc6b9933ead86b7dca33b3a436cea>] /Encrypt 5 0 R >> +startxref +1116 +%%EOF diff --git a/tests/fixtures/EC-empty-password.pdf b/tests/fixtures/EC-empty-password.pdf new file mode 100644 index 0000000..2a41b04 --- /dev/null +++ b/tests/fixtures/EC-empty-password.pdf @@ -0,0 +1,32 @@ +%PDF-1.4 +% +1 0 obj +<< /Pages 2 0 R /Type /Catalog >> +endobj +2 0 obj +<< /Count 1 /Kids [ 3 0 R ] /Type /Pages >> +endobj +3 0 obj +<< /Contents 4 0 R /MediaBox [ 0 0 612 792 ] /Parent 2 0 R /Resources << /Font << /F1 << /BaseFont /Helvetica /Subtype /Type1 /Type /Font >> >> >> /Type /Page >> +endobj +4 0 obj +<< /Length 110 /Filter /FlateDecode >> +stream +b Lŗ98He6Y/Zb0U! 
/P -12 /R 2 /U <8472a9c06b87f08bb3384b0f9a44f6f122ee2a0fe76e3e48d9164ff57de645a7> /V 1 >> +endobj +xref +0 6 +0000000000 65535 f +0000000015 00000 n +0000000064 00000 n +0000000123 00000 n +0000000300 00000 n +0000000482 00000 n +trailer << /Root 1 0 R /Size 6 /ID [<0bacc6b9933ead86b7dca33b3a436cea><0bacc6b9933ead86b7dca33b3a436cea>] /Encrypt 5 0 R >> +startxref +689 +%%EOF