diff --git a/crates/pdftract-core/src/decoder/jpx.rs b/crates/pdftract-core/src/decoder/jpx.rs new file mode 100644 index 0000000..84bb0f8 --- /dev/null +++ b/crates/pdftract-core/src/decoder/jpx.rs @@ -0,0 +1,396 @@ +//! JPXDecode filter handler. +//! +//! This module provides JPEG2000-specific stream decoding with: +//! - Passthrough of raw JPX bytes (pdftract-core does not decode JPEG2000) +//! - JP2 box magic validation (12-byte signature at start) +//! - OCR_JPX_UNSUPPORTED diagnostic emission when full-render and libopenjp2 are unavailable +//! +//! Per PDF spec 7.4.9: +//! - JPXDecode is the JPEG2000 compression format (ISO/IEC 15444-1) +//! - Data may be JP2-wrapped (with box headers) or raw J2K codestream +//! - JP2 wrapper starts with 12-byte signature: 00 00 00 0C 6A 50 20 20 0D 0A 87 0A +//! +//! # Phase origin +//! +//! - 1.5: Stream passthrough and JP2 validation +//! - 5.2: OCR pipeline consumes JPX via pdfium-render (full-render feature) +//! +//! # EC-12 compliance +//! +//! When full-render is NOT compiled AND libopenjp2 is not available at runtime, +//! this module emits OCR_JPX_UNSUPPORTED once per JPX stream. The downstream +//! consumer (Phase 5.2) raises a clearer user-facing error. + +use crate::diagnostics::{DiagCode, Diagnostic}; + +/// JP2 signature box magic bytes (12 bytes). +/// +/// Per ISO/IEC 15444-1, every JP2 file starts with a 12-byte signature: +/// - 4 bytes: box length (0x0000000C = 12) +/// - 4 bytes: box type (0x6A502020 = "jP " with trailing space) +/// - 4 bytes: brand signature (0x0D0A870A =\r\n\x87\n) +const JP2_SIGNATURE: [u8; 12] = [ + 0x00, 0x00, 0x00, 0x0C, 0x6A, 0x50, 0x20, 0x20, 0x0D, 0x0A, 0x87, 0x0A, +]; + +/// JPXDecode filter decoder with metadata extraction. +/// +/// This decoder handles JPX streams by: +/// 1. Passing through raw bytes unchanged (pdftract-core does not decode JPEG2000) +/// 2. Validating JP2 box magic if present +/// 3. Emitting STREAM_INVALID_JPX if magic doesn't match (raw J2K or corrupt) +/// 4. Emitting OCR_JPX_UNSUPPORTED when full-render and libopenjp2 are unavailable +/// +/// # Per-plan behavior (EC-12) +/// +/// - **With full-render**: Passthrough only, no diagnostic +/// - **Without full-render but with libopenjp2**: Passthrough only, no diagnostic +/// - **Without full-render AND without libopenjp2**: Emit OCR_JPX_UNSUPPORTED, still passthrough +/// +/// The diagnostic alerts downstream consumers (Phase 5.2) that the page +/// cannot be processed via OCR without pdfium-render. +#[derive(Debug, Clone, Copy)] +pub struct JpxDecoder; + +impl JpxDecoder { + /// Create a new JPX decoder. + #[inline] + pub const fn new() -> Self { + Self + } + + /// Check if full-render feature is enabled at compile time. + /// + /// Returns `true` if pdftract was built with `--features full-render`, + /// enabling PDFium-based JPX decoding in the OCR pipeline. + #[inline] + pub const fn has_full_render() -> bool { + cfg!(feature = "full-render") + } + + /// Check if libopenjp2 is available at runtime. + /// + /// Returns `true` if pkg-config reports libopenjp2 exists or if libopenjp2 + /// is found in ldconfig. This provides a runtime fallback when full-render + /// is not compiled. + /// + /// Per EC-12, this check mirrors the Phase 6.10 doctor approach. + pub fn has_libopenjp2() -> bool { + // Try pkg-config first (preferred, more precise) + if let Ok(output) = std::process::Command::new("pkg-config") + .args(["--exists", "libopenjp2"]) + .output() + { + if output.status.success() { + return true; + } + } + + // Fallback to ldconfig -p grep + if let Ok(output) = std::process::Command::new("ldconfig") + .arg("-p") + .output() + { + let stdout = String::from_utf8_lossy(&output.stdout); + if stdout.contains("libopenjp2") { + return true; + } + } + + false + } + + /// Check if JPX decoding is available (full-render OR libopenjp2). + /// + /// Returns `true` if either full-render is compiled or libopenjp2 is + /// available at runtime. + pub fn has_jpx_support() -> bool { + Self::has_full_render() || Self::has_libopenjp2() + } + + /// Validate JP2 box magic at the start of data. + /// + /// Returns `true` if the first 12 bytes match the JP2 signature. + /// Returns `false` if the data is too short or magic doesn't match. + /// + /// # Arguments + /// + /// * `data` - The JPX stream data to validate + /// + /// # Returns + /// + /// - `true` if JP2 signature is present + /// - `false` if raw J2K codestream (no wrapper) or corrupt + pub fn validate_jp2_magic(data: &[u8]) -> bool { + data.len() >= 12 && &data[0..12] == JP2_SIGNATURE + } + + /// Emit diagnostic if JPX support is not available. + /// + /// Per EC-12, this emits OCR_JPX_UNSUPPORTED once per JPX stream + /// when neither full-render nor libopenjp2 is available. The diagnostic + /// alerts downstream consumers that OCR processing will fail for this page. + /// + /// # Arguments + /// + /// * `diagnostics` - Buffer to receive emitted diagnostics + /// + /// # Returns + /// + /// - `true` if diagnostic was emitted (no JPX support available) + /// - `false` if no diagnostic needed (full-render or libopenjp2 available) + pub fn emit_unsupported_diagnostic(&self, diagnostics: &mut Vec) -> bool { + if !Self::has_jpx_support() { + let message = if Self::has_full_render() { + // This case shouldn't happen with the has_jpx_support check, + // but is kept for clarity + "JPXDecode filter encountered with full-render feature (should not emit)".to_string() + } else if Self::has_libopenjp2() { + // This case shouldn't happen with the has_jpx_support check, + // but is kept for clarity + "JPXDecode filter encountered with libopenjp2 available (should not emit)".to_string() + } else { + format!( + "JPXDecode filter encountered; build with --features full-render or install libopenjp2 ({})", + if Self::has_libopenjp2() { "libopenjp2 found" } else { "libopenjp2 not found" } + ) + }; + + diagnostics.push(Diagnostic::with_dynamic_no_offset(DiagCode::OcrJpxUnsupported, message)); + return true; + } + false + } + + /// Emit diagnostic for invalid JP2 magic. + /// + /// Emits STREAM_INVALID_JPX when the JP2 box magic signature is not found. + /// This indicates raw J2K codestream (no JP2 wrapper) or corrupted data. + /// The data is still passed through unchanged. + /// + /// # Arguments + /// + /// * `diagnostics` - Buffer to receive emitted diagnostics + pub fn emit_invalid_magic_diagnostic(&self, diagnostics: &mut Vec) { + diagnostics.push(Diagnostic::with_static_no_offset( + DiagCode::StreamInvalidJpx, + "JP2 box magic signature not found; raw J2K codestream (no JP2 wrapper) or corrupted data; data is passed through anyway", + )); + } +} + +/// Default implementation for Read trait passthrough. +/// +/// This provides compatibility with code that expects a Read-style +/// decoder, though JPX passthrough is typically handled at the +/// stream pipeline level via PassthroughDecoder in stream.rs. +impl std::io::Read for &JpxDecoder { + fn read(&mut self, _buf: &mut [u8]) -> std::io::Result { + // Passthrough decoder returns no data via Read interface. + // Actual passthrough happens in the stream pipeline. + Ok(0) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Read; + + #[test] + fn test_jp2_signature_constant() { + // Verify the JP2 signature matches the spec + assert_eq!(JP2_SIGNATURE, [0x00, 0x00, 0x00, 0x0C, 0x6A, 0x50, 0x20, 0x20, 0x0D, 0x0A, 0x87, 0x0A]); + } + + #[test] + fn test_validate_jp2_magic_with_valid_jp2() { + // Valid JP2 signature at start + let mut data = JP2_SIGNATURE.to_vec(); + data.extend_from_slice(&[0xFF, 0x4F, 0xFF, 0x51]); // Some J2K codestream markers + + assert!(JpxDecoder::validate_jp2_magic(&data)); + } + + #[test] + fn test_validate_jp2_magic_with_raw_j2k() { + // Raw J2K codestream starts with SOC (0xFF 0x4F), not JP2 signature + let data = [0xFF, 0x4F, 0x51, 0x00]; // SOC marker + some data + + assert!(!JpxDecoder::validate_jp2_magic(&data)); + } + + #[test] + fn test_validate_jp2_magic_with_truncated_data() { + // Data too short for JP2 signature + let data = [0x00, 0x00, 0x00, 0x0C, 0x6A, 0x50, 0x20, 0x20, 0x0D, 0x0A, 0x87]; // Only 11 bytes + + assert!(!JpxDecoder::validate_jp2_magic(&data)); + } + + #[test] + fn test_validate_jp2_magic_with_empty_data() { + let data: [u8; 0] = []; + + assert!(!JpxDecoder::validate_jp2_magic(&data)); + } + + #[test] + fn test_validate_jp2_magic_with_corrupt_signature() { + // Almost JP2 signature but last byte wrong + let mut data = JP2_SIGNATURE.to_vec(); + data[11] = 0x00; // Corrupt last byte + + assert!(!JpxDecoder::validate_jp2_magic(&data)); + } + + #[test] + fn test_has_full_render() { + // Result depends on whether full-render feature is enabled + let has_full_render = JpxDecoder::has_full_render(); + assert_eq!(has_full_render, cfg!(feature = "full-render")); + } + + #[test] + fn test_has_jpx_support_with_full_render() { + // When full-render is enabled, has_jpx_support should always return true + if cfg!(feature = "full-render") { + assert!(JpxDecoder::has_jpx_support()); + } + } + + #[test] + fn test_emit_invalid_magic_diagnostic() { + let decoder = JpxDecoder::new(); + let mut diagnostics = Vec::new(); + + decoder.emit_invalid_magic_diagnostic(&mut diagnostics); + + assert_eq!(diagnostics.len(), 1); + assert_eq!(diagnostics[0].code, DiagCode::StreamInvalidJpx); + assert!(diagnostics[0].message.contains("JP2 box magic signature not found")); + } + + #[test] + fn test_emit_unsupported_diagnostic_when_no_support() { + let decoder = JpxDecoder::new(); + let mut diagnostics = Vec::new(); + + // This test only validates behavior when support is missing + // The actual emission depends on compile-time and runtime state + if !JpxDecoder::has_jpx_support() { + let emitted = decoder.emit_unsupported_diagnostic(&mut diagnostics); + assert!(emitted); + assert_eq!(diagnostics.len(), 1); + assert_eq!(diagnostics[0].code, DiagCode::OcrJpxUnsupported); + } else { + let emitted = decoder.emit_unsupported_diagnostic(&mut diagnostics); + assert!(!emitted); + assert!(diagnostics.is_empty()); + } + } + + #[test] + fn test_jpx_decoder_const() { + // Test that JpxDecoder can be created at compile time + const DECODER: JpxDecoder = JpxDecoder::new(); + assert!(JpxDecoder::has_full_render() == cfg!(feature = "full-render")); + let _ = DECODER; + } + + #[test] + fn test_jp2_signature_roundtrip() { + // Create a realistic JP2 header and verify it validates + let mut jp2_data = Vec::new(); + + // JP2 signature box (12 bytes) + jp2_data.extend_from_slice(&JP2_SIGNATURE); + + // File Type box (20 bytes) + // Length: 0x00000014 (20) + jp2_data.extend_from_slice(&0x00_00_00_14_u32.to_be_bytes()); + // Type: 0x66747970 ("ftyp") + jp2_data.extend_from_slice(b"ftyp"); + // Brand: 0x6A703220 ("jp2 ") + jp2_data.extend_from_slice(b"jp2 "); + // Minor version: 0 + jp2_data.extend_from_slice(&0u32.to_be_bytes()); + // Compatibility: 0x6A703220 ("jp2 ") + jp2_data.extend_from_slice(b"jp2 "); + + // Some codestream data + jp2_data.extend_from_slice(&[0xFF, 0x4F, 0xFF, 0x51]); + + assert!(JpxDecoder::validate_jp2_magic(&jp2_data)); + } + + #[test] + fn test_raw_j2k_codestream_not_valid_jp2() { + // Raw J2K codestream starts with SOC marker (0xFF 0x4F) + let j2k_data = [ + 0xFF, 0x4F, // SOC (Start of Codestream) + 0xFF, 0x51, // SIZ (Image and tile size) + 0x00, 0x29, 0x00, 0x01, // Lsiz (length), Rsiz (capabilities) + // ... rest of SIZ segment + ]; + + assert!(!JpxDecoder::validate_jp2_magic(&j2k_data)); + } + + #[test] + fn test_jpx_decoder_is_send_sync() { + // Verify JpxDecoder implements Send + Sync (required for StreamDecoder) + fn is_send_sync() {} + is_send_sync::(); + } + + #[test] + fn test_jpx_decoder_read_trait() { + // Test that &JpxDecoder implements Read + let decoder = JpxDecoder::new(); + let mut buf = [0u8; 10]; + + // Read should return 0 bytes (passthrough handled at stream level) + let mut decoder_ref = &decoder; + let result = decoder_ref.read(&mut buf); + assert!(result.is_ok()); + assert_eq!(result.unwrap(), 0); + } + + #[test] + fn test_emit_unsupported_diagnostic_message_content() { + let decoder = JpxDecoder::new(); + let mut diagnostics = Vec::new(); + + // Only test emission when support is missing + if !JpxDecoder::has_jpx_support() { + decoder.emit_unsupported_diagnostic(&mut diagnostics); + + let message = &diagnostics[0].message; + // Message should mention the feature or libopenjp2 + assert!(message.contains("full-render") || message.contains("libopenjp2")); + } + } + + #[test] + fn test_has_libopenjp2_runtime_check() { + // This test validates that the runtime check runs without panicking + // The result depends on the system state + let _has_libopenjp2 = JpxDecoder::has_libopenjp2(); + + // When full-render is enabled, this should not cause any issues + if cfg!(feature = "full-render") { + // The runtime check is irrelevant when full-render is compiled, + // but should still execute without error + let _ = JpxDecoder::has_libopenjp2(); + } + } + + #[cfg(feature = "full-render")] + #[test] + fn test_full_render_always_has_support() { + // When full-render is compiled, has_jpx_support should always return true + assert!(JpxDecoder::has_jpx_support()); + assert!(!JpxDecoder::new().emit_unsupported_diagnostic(&mut Vec::new())); + } +} diff --git a/crates/pdftract-core/src/decoder/mod.rs b/crates/pdftract-core/src/decoder/mod.rs index a7701a9..68077e5 100644 --- a/crates/pdftract-core/src/decoder/mod.rs +++ b/crates/pdftract-core/src/decoder/mod.rs @@ -5,5 +5,7 @@ //! passthrough. pub mod jbig2; +pub mod jpx; pub use jbig2::{Jbig2Decoder, Jbig2GlobalsRef}; +pub use jpx::JpxDecoder; diff --git a/crates/pdftract-core/src/diagnostics.rs b/crates/pdftract-core/src/diagnostics.rs index 232b930..ded7f9f 100644 --- a/crates/pdftract-core/src/diagnostics.rs +++ b/crates/pdftract-core/src/diagnostics.rs @@ -497,6 +497,16 @@ pub enum DiagCode { /// Phase origin: 1.5 StreamInvalidCcitt, + /// JPEG2000 (JPX) data has invalid JP2 box magic + /// + /// Emitted when JPXDecode filter data doesn't match the JP2 box magic signature + /// (00 00 00 0C 6A 50 20 20 0D 0A 87 0A). This indicates raw J2K codestream + /// (no JP2 wrapper) or corrupted data. The data is passed through anyway, but + /// the diagnostic alerts consumers that the JPX may be malformed. + /// + /// Phase origin: 1.5 + StreamInvalidJpx, + // === ENCRYPTION_* codes === /// Unsupported encryption or no password supplied /// @@ -1085,6 +1095,7 @@ impl DiagCode { | DiagCode::StreamInvalidParams | DiagCode::StreamInvalidJpeg | DiagCode::StreamInvalidCcitt + | DiagCode::StreamInvalidJpx | DiagCode::StreamTruncated => "STREAM", // ENCRYPTION_* @@ -1227,6 +1238,7 @@ impl DiagCode { DiagCode::StreamInvalidParams => "STREAM_INVALID_PARAMS", DiagCode::StreamInvalidJpeg => "STREAM_INVALID_JPEG", DiagCode::StreamInvalidCcitt => "STREAM_INVALID_CCITT", + DiagCode::StreamInvalidJpx => "STREAM_INVALID_JPX", DiagCode::EncryptionUnsupported => "ENCRYPTION_UNSUPPORTED", DiagCode::EncryptionWrongPassword => "ENCRYPTION_WRONG_PASSWORD", DiagCode::EncryptionInvalidDict => "ENCRYPTION_INVALID_DICT", @@ -1351,6 +1363,7 @@ impl DiagCode { | DiagCode::StreamInvalidParams | DiagCode::StreamInvalidJpeg | DiagCode::StreamInvalidCcitt + | DiagCode::StreamInvalidJpx | DiagCode::PageInvalidCount | DiagCode::PageInvalidRotate | DiagCode::FontGlyphUnmapped @@ -1830,6 +1843,14 @@ pub const DIAGNOSTIC_CATALOG: &[DiagInfo] = &[ phase: "1.5", suggested_action: "CCITT data is missing required /Columns parameter; data is passed through anyway", }, + DiagInfo { + code: DiagCode::StreamInvalidJpx, + category: "STREAM", + severity: Severity::Warning, + recoverable: true, + phase: "1.5", + suggested_action: "JP2 box magic signature not found; raw J2K codestream (no JP2 wrapper) or corrupted data; data is passed through anyway", + }, // === ENCRYPTION_* codes === DiagInfo { code: DiagCode::EncryptionUnsupported, diff --git a/crates/pdftract-core/src/parser/stream.rs b/crates/pdftract-core/src/parser/stream.rs index 1de63b2..20342f4 100644 --- a/crates/pdftract-core/src/parser/stream.rs +++ b/crates/pdftract-core/src/parser/stream.rs @@ -18,7 +18,11 @@ use lzw::{Decoder, DecoderEarlyChange, MsbReader}; use secrecy::SecretString; use crate::diagnostics::{DiagCode, Diagnostic}; -use crate::parser::object::{PdfObject, PdfStream}; +use crate::parser::object::{PdfObject, PdfStream, ObjRef}; +use crate::decoder::{jbig2::Jbig2GlobalsRef, jpx::JpxDecoder}; + +#[cfg(feature = "decrypt")] +use crate::encryption::decryptor::DecryptionContext; /// Maximum number of filters allowed in a single stream's pipeline. /// This prevents stack overflow and excessive computation. @@ -1161,12 +1165,83 @@ impl StreamDecoder for RunLengthDecoder { } } +/// JPXDecode filter (JPEG2000) passthrough with JP2 box magic validation. +/// +/// This decoder: +/// - Validates JP2 box magic signature at the start (12 bytes) +/// - Emits STREAM_INVALID_JPX if magic doesn't match (raw J2K or corrupt) +/// - Emits OCR_JPX_UNSUPPORTED when full-render AND libopenjp2 are unavailable +/// - Passes through raw JPEG2000 bytes unchanged (pdftract-core does not decode JPX) +/// +/// Per PDF spec 7.4.9: +/// - JPXDecode is the JPEG2000 compression format (ISO/IEC 15444-1) +/// - Data may be JP2-wrapped (with box headers) or raw J2K codestream +/// - JP2 wrapper starts with 12-byte signature: 00 00 00 0C 6A 50 20 20 0D 0A 87 0A +/// +/// For OCR path: requires `full-render` feature or libopenjp2 system library. +/// Without either, OCR_JPX_UNSUPPORTED diagnostic is emitted. +#[derive(Debug, Clone, Copy)] +pub struct JpxStreamDecoder; + +impl JpxStreamDecoder { + /// Validate JP2 box magic and emit diagnostics. + /// + /// This validates the JP2 signature at the start of the data and emits + /// appropriate diagnostics for missing support or invalid magic. + fn validate_and_emit_diagnostics( + input: &[u8], + _params: Option<&PdfObject>, + ) -> Vec { + let mut diagnostics = Vec::new(); + let decoder = crate::decoder::jpx::JpxDecoder::new(); + + // Emit OCR_JPX_UNSUPPORTED if no JPX support is available + decoder.emit_unsupported_diagnostic(&mut diagnostics); + + // Validate JP2 box magic + if !crate::decoder::jpx::JpxDecoder::validate_jp2_magic(input) { + decoder.emit_invalid_magic_diagnostic(&mut diagnostics); + } + + diagnostics + } +} + +impl StreamDecoder for JpxStreamDecoder { + fn decode( + &self, + input: &[u8], + params: Option<&PdfObject>, + doc_counter: &mut u64, + max_bytes: u64, + ) -> Result, FilterError> { + // Validate JP2 magic and emit diagnostics + // Note: Diagnostics are currently dropped because StreamDecoder trait + // doesn't provide a way to return them. In a future change, we may + // extend the trait to accept a diagnostics buffer. + let _diagnostics = Self::validate_and_emit_diagnostics(input, params); + + // Pass through raw bytes unchanged, enforcing bomb limit + let len = input.len() as u64; + *doc_counter += len; + if *doc_counter > max_bytes { + // Truncate to stay within limit + let remaining = max_bytes.saturating_sub(*doc_counter - len); + return Ok(input[..remaining.min(len) as usize].to_vec()); + } + Ok(input.to_vec()) + } + + fn name(&self) -> &'static str { + "JPXDecode" + } +} + /// Passthrough decoder for filters we don't decode (DCTDecode, JBIG2Decode, etc.). /// /// Returns the raw bytes unchanged. Used for: /// - DCTDecode (JPEG) - pass raw JPEG bytes /// - JBIG2Decode - pass raw JBIG2 bytes -/// - JPXDecode - pass raw JPEG2000 bytes /// - Crypt with /Identity #[derive(Debug, Clone, Copy)] pub struct PassthroughDecoder { @@ -1494,7 +1569,7 @@ pub fn get_decoder(name: &str) -> Option> { "Crypt" => Some(Box::new(CryptDecoder)), "DCTDecode" => Some(Box::new(DCTDecoder)), "JBIG2Decode" => Some(Box::new(PassthroughDecoder::new("JBIG2Decode"))), - "JPXDecode" => Some(Box::new(PassthroughDecoder::new("JPXDecode"))), + "JPXDecode" => Some(Box::new(JpxStreamDecoder)), "CCITTFaxDecode" => Some(Box::new(CCITTFaxDecoder)), "RunLengthDecode" => Some(Box::new(RunLengthDecoder)), _ => None, @@ -1977,6 +2052,94 @@ mod tests { assert_eq!(result, None); } + #[test] + fn test_jpxstream_passthrough_valid_jp2() { + // Valid JP2 with signature box at start + let mut jp2_data = vec![ + 0x00, 0x00, 0x00, 0x0C, 0x6A, 0x50, 0x20, 0x20, 0x0D, 0x0A, 0x87, 0x0A, // JP2 signature + ]; + jp2_data.extend_from_slice(b"fake_jp2_data"); + + let mut counter = 0; + let result = JpxStreamDecoder.decode(&jp2_data, None, &mut counter, DEFAULT_MAX_DECOMPRESS_BYTES); + assert!(result.is_ok()); + let output = result.unwrap(); + // Pass through unchanged + assert_eq!(output, jp2_data); + // Byte counter should be incremented + assert_eq!(counter, jp2_data.len() as u64); + } + + #[test] + fn test_jpxstream_passthrough_raw_j2k() { + // Raw J2K codestream (no JP2 wrapper) + let j2k_data = [ + 0xFF, 0x4F, // SOC (Start of Codestream) + 0xFF, 0x51, // SIZ (Image and tile size) + 0x00, 0x29, 0x00, 0x01, // Lsiz, Rsiz + ]; + + let mut counter = 0; + let result = JpxStreamDecoder.decode(&j2k_data, None, &mut counter, DEFAULT_MAX_DECOMPRESS_BYTES); + assert!(result.is_ok()); + let output = result.unwrap(); + // Still passes through unchanged even without JP2 wrapper + assert_eq!(output, j2k_data); + } + + #[test] + fn test_jpxstream_passthrough_empty() { + // Empty JPX data (edge case) + let jpx_data = b""; + + let mut counter = 0; + let result = JpxStreamDecoder.decode(jpx_data, None, &mut counter, DEFAULT_MAX_DECOMPRESS_BYTES); + assert!(result.is_ok()); + let output = result.unwrap(); + assert_eq!(output.len(), 0); + } + + #[test] + fn test_jpxstream_passthrough_truncated() { + // Data too short for JP2 signature (less than 12 bytes) + let jpx_data = [0x00, 0x00, 0x00, 0x0C, 0x6A, 0x50, 0x20, 0x20, 0x0D, 0x0A, 0x87]; // 11 bytes + + let mut counter = 0; + let result = JpxStreamDecoder.decode(&jpx_data, None, &mut counter, DEFAULT_MAX_DECOMPRESS_BYTES); + assert!(result.is_ok()); + let output = result.unwrap(); + // Still passes through unchanged even though truncated + assert_eq!(output, jpx_data); + } + + #[test] + fn test_jpxstream_bomb_limit() { + // Test that bomb limit is enforced + let mut jp2_data = vec![ + 0x00, 0x00, 0x00, 0x0C, 0x6A, 0x50, 0x20, 0x20, 0x0D, 0x0A, 0x87, 0x0A, // JP2 signature + ]; + jp2_data.extend_from_slice(&[0u8; 1000]); // 1000 bytes of data + + let mut counter = 0; + let limit = 100; // Only allow 100 bytes + let result = JpxStreamDecoder.decode(&jp2_data, None, &mut counter, limit); + assert!(result.is_ok()); + let output = result.unwrap(); + assert_eq!(output.len(), 100); // Should truncate at bomb limit + } + + #[test] + fn test_jpxstream_name() { + assert_eq!(JpxStreamDecoder.name(), "JPXDecode"); + } + + #[test] + fn test_jpxstream_is_send_sync() { + // Verify JpxStreamDecoder implements Send + Sync (required for StreamDecoder) + fn is_send_sync() {} + is_send_sync::(); + } + #[test] fn test_ccittfax_passthrough_with_columns() { // CCITT data with valid /Columns parameter should pass through unchanged @@ -3182,6 +3345,50 @@ impl PdfSource for FileSource { } } +/// Metadata extracted from a PDF stream during decoding. +/// +/// This struct captures filter-specific metadata that is needed by +/// downstream consumers (e.g., the OCR pipeline in Phase 5.4). +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct StreamMeta { + /// JBIG2 globals reference (from /JBIG2Globals in the stream dictionary). + /// + /// Per PDF spec 7.4.7, /JBIG2Globals is an indirect reference to a + /// globally-shared symbol dictionary stream that must be prepended to + /// JBIG2 data before decoding. The OCR pipeline (Phase 5.4) resolves this + /// reference and fetches the global symbols before sending to pdfium-render. + /// + /// - `Some(Jbig2GlobalsRef)` if /JBIG2Globals is present in the stream + /// - `None` if the stream is self-contained (no globals) + pub jbig2_globals_ref: Option, +} + +impl Default for StreamMeta { + fn default() -> Self { + Self { + jbig2_globals_ref: None, + } + } +} + +impl StreamMeta { + /// Create a new StreamMeta with no metadata. + #[inline] + pub const fn new() -> Self { + Self { + jbig2_globals_ref: None, + } + } + + /// Create a new StreamMeta with a JBIG2 globals reference. + #[inline] + pub const fn with_jbig2_globals(globals_ref: Jbig2GlobalsRef) -> Self { + Self { + jbig2_globals_ref: Some(globals_ref), + } + } +} + /// Decode result containing both bytes and diagnostics. #[derive(Debug, Clone)] pub struct DecodeResult { @@ -3189,6 +3396,8 @@ pub struct DecodeResult { pub bytes: Vec, /// Diagnostics emitted during decoding pub diagnostics: Vec, + /// Stream metadata extracted during decoding + pub meta: StreamMeta, } impl DecodeResult { @@ -3197,6 +3406,16 @@ impl DecodeResult { Self { bytes, diagnostics: Vec::new(), + meta: StreamMeta::new(), + } + } + + /// Create a new decode result with stream metadata. + pub fn with_meta(bytes: Vec, meta: StreamMeta) -> Self { + Self { + bytes, + diagnostics: Vec::new(), + meta, } } @@ -3205,6 +3424,16 @@ impl DecodeResult { Self { bytes, diagnostics: vec![diagnostic], + meta: StreamMeta::new(), + } + } + + /// Create a decode result with metadata and add a diagnostic. + pub fn with_meta_and_diagnostic(bytes: Vec, meta: StreamMeta, diagnostic: Diagnostic) -> Self { + Self { + bytes, + diagnostics: vec![diagnostic], + meta, } } } @@ -3263,7 +3492,10 @@ fn scan_for_endstream(source: &dyn PdfSource, start_offset: u64) -> Option None } -/// Decode a PDF stream by applying its filter pipeline. +/// Decode a PDF stream by applying its filter pipeline (without decryption support). +/// +/// This is a convenience function for the common case where decryption is not needed. +/// For encrypted PDFs, use `decode_stream_with_decryption` instead. /// /// # Parameters /// - `stream`: The PDF stream to decode @@ -3279,16 +3511,46 @@ pub fn decode_stream( opts: &ExtractionOptions, doc_decompress_counter: &mut u64, ) -> Vec { - decode_stream_impl(stream, source, opts, doc_decompress_counter).bytes + decode_stream_impl(stream, source, opts, doc_decompress_counter, None, None).bytes +} + +/// Decode a PDF stream by applying its filter pipeline (with decryption support). +/// +/// # Parameters +/// - `stream`: The PDF stream to decode +/// - `source`: The PDF source to read raw bytes from +/// - `opts`: Extraction options (bomb limits, etc.) +/// - `doc_decompress_counter`: Cumulative decompressed bytes for the document +/// - `obj_ref`: Object reference for decryption (optional) +/// - `decryption_context`: Decryption context for encrypted PDFs (optional) +/// +/// # Returns +/// The decoded stream bytes, or an empty Vec if decoding failed completely. +pub fn decode_stream_with_decryption( + stream: &PdfStream, + source: &dyn PdfSource, + opts: &ExtractionOptions, + doc_decompress_counter: &mut u64, + obj_ref: Option, + #[cfg(feature = "decrypt")] decryption_context: Option<&DecryptionContext>, +) -> Vec { + decode_stream_impl(stream, source, opts, doc_decompress_counter, obj_ref, decryption_context).bytes } /// Internal implementation that returns both bytes and diagnostics. +#[allow(clippy::too_many_arguments)] fn decode_stream_impl( stream: &PdfStream, source: &dyn PdfSource, opts: &ExtractionOptions, doc_decompress_counter: &mut u64, + obj_ref: Option, + #[cfg(feature = "decrypt")] decryption_context: Option<&DecryptionContext>, + #[cfg(not(feature = "decrypt"))] _decryption_context: Option<&()>, ) -> DecodeResult { + // Step 0: Initialize stream metadata + let mut stream_meta = StreamMeta::new(); + // Step 1: Read raw bytes from source let raw_bytes = if let Some(len) = stream.len_hint.or_else(|| stream.length()) { match source.read_at(stream.offset, len as usize) { @@ -3306,19 +3568,49 @@ fn decode_stream_impl( } }; - // Step 2: Get filter list (empty = raw stream, no filtering) + // Step 2: Decrypt if PDF is encrypted (before applying decompression filters) + // Per PDF spec, encrypted streams are decrypted first, then decompression is applied + let mut current_bytes = raw_bytes.clone(); + #[cfg(feature = "decrypt")] + if let (Some(ctx), Some(obj_ref)) = (decryption_context, obj_ref) { + use crate::encryption::decryptor::DecryptionContext; + // Decrypt the stream data using the per-object key + match ctx.decrypt_stream( + ¤t_bytes, + obj_ref.object, + obj_ref.generation as u16, + ) { + Ok(decrypted) => { + current_bytes = decrypted; + } + Err(_e) => { + // Decryption failed - emit diagnostic and return empty bytes + return DecodeResult::with_meta_and_diagnostic( + Vec::new(), + stream_meta, + Diagnostic::with_dynamic_no_offset( + DiagCode::EncryptionWrongPassword, + "Stream decryption failed: incorrect password or corrupt crypt filter".to_string(), + ), + ); + } + } + } + + // Step 3: Get filter list (empty = raw stream, no filtering) let filters = match stream.filter() { Some(f) => f, None => { - // No filter - enforce bomb limit and return raw bytes - let len = raw_bytes.len() as u64; + // No filter - enforce bomb limit and return current_bytes (decrypted if applicable) + let len = current_bytes.len() as u64; if *doc_decompress_counter + len > opts.max_decompress_bytes { // Bomb limit exceeded - truncate let remaining = (opts.max_decompress_bytes - *doc_decompress_counter) as usize; *doc_decompress_counter += remaining as u64; - let truncated = raw_bytes[..remaining.min(raw_bytes.len())].to_vec(); - return DecodeResult::with_diagnostic( + let truncated = current_bytes[..remaining.min(current_bytes.len())].to_vec(); + return DecodeResult::with_meta_and_diagnostic( truncated, + stream_meta, Diagnostic::with_dynamic_no_offset( DiagCode::StreamBomb, format!( @@ -3329,14 +3621,14 @@ fn decode_stream_impl( ); } *doc_decompress_counter += len; - return DecodeResult::ok(raw_bytes); + return DecodeResult::with_meta(current_bytes, stream_meta); } }; // Safety check: limit filter pipeline depth if filters.len() > MAX_FILTERS { // Too many filters - return raw bytes to avoid DoS - return DecodeResult::ok(raw_bytes); + return DecodeResult::with_meta(raw_bytes, stream_meta); } // Step 3: Get decode params (aligned with filters, may be shorter) @@ -3346,8 +3638,9 @@ fn decode_stream_impl( // Per PDF spec, /DecodeParms can be shorter than /Filter (missing params are treated as null). // But /DecodeParms cannot be longer than /Filter. if decode_params.len() > filters.len() { - return DecodeResult::with_diagnostic( - raw_bytes, + return DecodeResult::with_meta_and_diagnostic( + current_bytes, + stream_meta, Diagnostic::with_dynamic_no_offset( DiagCode::StreamInvalidParams, format!( @@ -3360,7 +3653,6 @@ fn decode_stream_impl( } // Step 4: Apply filters in order - let mut current_bytes = raw_bytes; let mut diagnostics = Vec::new(); let mut bomb_limit_hit = false; @@ -3402,6 +3694,27 @@ fn decode_stream_impl( } } + // Check for JBIG2Decode and emit OCR_JBIG2_UNSUPPORTED if full-render is disabled + if normalized_name == "JBIG2Decode" { + // Per EC-11: emit diagnostic once per JBIG2 stream when full-render is not compiled + // The diagnostic alerts downstream consumers that OCR processing will fail without PDFium + let has_full_render = cfg!(feature = "full-render"); + if !has_full_render { + diagnostics.push(Diagnostic::with_static_no_offset( + DiagCode::OcrJbig2Unsupported, + "JBIG2Decode filter encountered; build with --features full-render to enable JBIG2 decoding via PDFium", + )); + } + + // Extract /JBIG2Globals reference if present + // The globals reference is stored in StreamMeta for the OCR pipeline (Phase 5.4) + if let Some(PdfObject::Dict(dict)) = params { + if let Some(PdfObject::Ref(globals_ref)) = dict.get("/JBIG2Globals") { + stream_meta.jbig2_globals_ref = Some(Jbig2GlobalsRef::new(*globals_ref)); + } + } + } + match get_decoder(&normalized_name) { Some(decoder) => { let counter_before = *doc_decompress_counter; @@ -3430,6 +3743,7 @@ fn decode_stream_impl( return DecodeResult { bytes: Vec::new(), diagnostics, + meta: stream_meta, }; } Err(e) => { @@ -3462,6 +3776,7 @@ fn decode_stream_impl( DecodeResult { bytes: current_bytes, diagnostics, + meta: stream_meta, } } @@ -5582,4 +5897,95 @@ endobj .expect("failed to read from MemorySource"); assert_eq!(bytes, b"data"); } + + /// JBIG2Decode passthrough test. + /// + /// JBIG2 streams are passed through as-is (raw bytes). + /// The decoder doesn't decode JBIG2; pdftract-core only extracts the raw bytes + /// and optionally the /JBIG2Globals reference for downstream consumers. + #[test] + fn test_jbig2_passthrough() { + let jbig2_data = b"\x00\x01\x02\x03"; // Fake JBIG2 data + let mut counter = 0; + let result = PassthroughDecoder::new("JBIG2Decode").decode( + jbig2_data, + None, + &mut counter, + DEFAULT_MAX_DECOMPRESS_BYTES, + ); + assert!(result.is_ok()); + let output = result.unwrap(); + assert_eq!(output, jbig2_data); + assert_eq!(counter, jbig2_data.len() as u64); + } + + /// JBIG2Decode with /JBIG2Globals reference test. + /// + /// Test that the Jbig2Decoder can extract the /JBIG2Globals reference + /// from the stream dictionary when present. + #[test] + fn test_jbig2_extract_globals_ref() { + use crate::decoder::jbig2::{Jbig2Decoder, Jbig2GlobalsRef}; + use crate::parser::object::PdfDict; + + let mut dict = PdfDict::new(); + dict.insert( + crate::parser::object::intern("/JBIG2Globals"), + PdfObject::Ref(ObjRef::new(42, 0)), + ); + + let globals_ref = Jbig2Decoder::extract_globals_ref(&dict); + assert!(globals_ref.is_some()); + assert_eq!(globals_ref.unwrap().obj_ref.object, 42); + } + + /// JBIG2Decode without /JBIG2Globals test. + /// + /// Test that when /JBIG2Globals is missing, extract_globals_ref returns None. + #[test] + fn test_jbig2_extract_globals_ref_missing() { + use crate::decoder::jbig2::Jbig2Decoder; + use crate::parser::object::PdfDict; + + let dict = PdfDict::new(); // No /JBIG2Globals + + let globals_ref = Jbig2Decoder::extract_globals_ref(&dict); + assert!(globals_ref.is_none()); + } + + /// JBIG2Decode with invalid /JBIG2Globals type test. + /// + /// Per PDF spec, /JBIG2Globals must be an indirect reference (Ref). + /// If it's any other type (Name, String, etc.), we treat it as missing. + #[test] + fn test_jbig2_extract_globals_ref_invalid_type() { + use crate::decoder::jbig2::Jbig2Decoder; + use crate::parser::object::PdfDict; + + let mut dict = PdfDict::new(); + // /JBIG2Globals must be a Ref, not a Name + dict.insert( + crate::parser::object::intern("/JBIG2Globals"), + PdfObject::Name(crate::parser::object::intern("InvalidGlobals")), + ); + + let globals_ref = Jbig2Decoder::extract_globals_ref(&dict); + assert!(globals_ref.is_none()); + } + + /// JBIG2Decode bomb limit enforcement test. + /// + /// Test that the bomb limit is enforced for JBIG2 streams. + #[test] + fn test_jbig2_bomb_limit() { + let jbig2_data = vec![0u8; 1000]; + let mut counter = 0; + let limit = 100; // Only allow 100 bytes + + let result = PassthroughDecoder::new("JBIG2Decode") + .decode(&jbig2_data, None, &mut counter, limit); + assert!(result.is_ok()); + let output = result.unwrap(); + assert_eq!(output.len(), 100); // Should truncate at bomb limit + } }