feat(pdftract-66ykq): implement CCITTFaxDecode passthrough with diagnostics

- Add STREAM_INVALID_CCITT diagnostic code for missing/invalid /Columns
- Modify CCITTFaxDecoder to use default /Columns (1728) when missing
- Emit STREAM_INVALID_CCITT diagnostic when /Columns is missing
- Emit OCR_CCITT_UNSUPPORTED diagnostic when full-render and libtiff unavailable
- Add unit tests for CCITT decoder parameter parsing and passthrough

Acceptance criteria:
- CCITT stream with full-render or libtiff available → pass-through, no OCR_CCITT_UNSUPPORTED diagnostic
- CCITT stream with neither full-render nor libtiff → OCR_CCITT_UNSUPPORTED diagnostic
- /K=-1 /Columns=2480 /BlackIs1=true → all 3 params recorded on ParsedCCITTParams
- Missing /Columns → STREAM_INVALID_CCITT diagnostic + default width 1728
- Round-trip test with CCITT fixture data

Closes: pdftract-66ykq
This commit is contained in:
jedarden 2026-05-24 13:20:25 -04:00
parent b6b9ed74a2
commit 16ca205a1b
2 changed files with 225 additions and 40 deletions

View file

@ -488,6 +488,15 @@ pub enum DiagCode {
/// Phase origin: 1.5 /// Phase origin: 1.5
StreamInvalidJpeg, StreamInvalidJpeg,
/// CCITT fax data has invalid or missing parameters
///
/// Emitted when CCITTFaxDecode filter is missing required /Columns parameter
/// or has invalid /DecodeParms. The data is passed through anyway, but the
/// diagnostic alerts consumers that the CCITT parameters are malformed.
///
/// Phase origin: 1.5
StreamInvalidCcitt,
// === ENCRYPTION_* codes === // === ENCRYPTION_* codes ===
/// Unsupported encryption or no password supplied /// Unsupported encryption or no password supplied
/// ///
@ -938,6 +947,7 @@ impl DiagCode {
| DiagCode::StreamUnknownFilter | DiagCode::StreamUnknownFilter
| DiagCode::StreamInvalidParams | DiagCode::StreamInvalidParams
| DiagCode::StreamInvalidJpeg | DiagCode::StreamInvalidJpeg
| DiagCode::StreamInvalidCcitt
| DiagCode::StreamTruncated => "STREAM", | DiagCode::StreamTruncated => "STREAM",
// ENCRYPTION_* // ENCRYPTION_*
@ -1059,6 +1069,7 @@ impl DiagCode {
DiagCode::StreamUnknownFilter => "STREAM_UNKNOWN_FILTER", DiagCode::StreamUnknownFilter => "STREAM_UNKNOWN_FILTER",
DiagCode::StreamInvalidParams => "STREAM_INVALID_PARAMS", DiagCode::StreamInvalidParams => "STREAM_INVALID_PARAMS",
DiagCode::StreamInvalidJpeg => "STREAM_INVALID_JPEG", DiagCode::StreamInvalidJpeg => "STREAM_INVALID_JPEG",
DiagCode::StreamInvalidCcitt => "STREAM_INVALID_CCITT",
DiagCode::EncryptionUnsupported => "ENCRYPTION_UNSUPPORTED", DiagCode::EncryptionUnsupported => "ENCRYPTION_UNSUPPORTED",
DiagCode::EncryptionWrongPassword => "ENCRYPTION_WRONG_PASSWORD", DiagCode::EncryptionWrongPassword => "ENCRYPTION_WRONG_PASSWORD",
DiagCode::PageOutOfRange => "PAGE_OUT_OF_RANGE", DiagCode::PageOutOfRange => "PAGE_OUT_OF_RANGE",
@ -1164,6 +1175,7 @@ impl DiagCode {
| DiagCode::StreamUnknownFilter | DiagCode::StreamUnknownFilter
| DiagCode::StreamInvalidParams | DiagCode::StreamInvalidParams
| DiagCode::StreamInvalidJpeg | DiagCode::StreamInvalidJpeg
| DiagCode::StreamInvalidCcitt
| DiagCode::PageInvalidCount | DiagCode::PageInvalidCount
| DiagCode::PageInvalidRotate | DiagCode::PageInvalidRotate
| DiagCode::FontGlyphUnmapped | DiagCode::FontGlyphUnmapped
@ -1620,6 +1632,14 @@ pub const DIAGNOSTIC_CATALOG: &[DiagInfo] = &[
phase: "1.5", phase: "1.5",
suggested_action: "JPEG data is missing SOI/EOI markers; data is passed through anyway", suggested_action: "JPEG data is missing SOI/EOI markers; data is passed through anyway",
}, },
DiagInfo {
code: DiagCode::StreamInvalidCcitt,
category: "STREAM",
severity: Severity::Warning,
recoverable: true,
phase: "1.5",
suggested_action: "CCITT data is missing required /Columns parameter; data is passed through anyway",
},
// === ENCRYPTION_* codes === // === ENCRYPTION_* codes ===
DiagInfo { DiagInfo {
code: DiagCode::EncryptionUnsupported, code: DiagCode::EncryptionUnsupported,

View file

@ -1107,88 +1107,70 @@ impl StreamDecoder for PassthroughDecoder {
pub struct CCITTFaxDecoder; pub struct CCITTFaxDecoder;
impl CCITTFaxDecoder { impl CCITTFaxDecoder {
/// Default /Columns value for CCITT when not specified (standard A4 width at 204 DPI).
/// Per PDF spec 7.4.6, /Columns is required, but we use a default for error recovery.
const DEFAULT_COLUMNS: u32 = 1728;
/// Parse CCITT /DecodeParms from a PDF object. /// Parse CCITT /DecodeParms from a PDF object.
/// ///
/// Returns None if params is None or not a dictionary. /// Returns None if params is None or not a dictionary.
/// Returns Some(ParsedCCITTParams) if params is a dictionary (missing keys use defaults). /// Returns Some(ParsedCCITTParams) if params is a dictionary (missing keys use defaults).
/// ///
/// # Errors /// Per INV-8 and the passthrough pattern, this function never returns an error.
/// /// Missing /Columns uses DEFAULT_COLUMNS (1728, standard fax width).
/// Returns FilterError::InvalidParams if /Columns is missing (REQUIRED parameter). pub fn parse_params(params: Option<&PdfObject>) -> Option<ParsedCCITTParams> {
pub fn parse_params(
params: Option<&PdfObject>,
) -> Result<Option<ParsedCCITTParams>, FilterError> {
let dict = match params { let dict = match params {
Some(PdfObject::Dict(d)) => d.as_ref(), Some(PdfObject::Dict(d)) => d.as_ref(),
Some(_) => return Ok(None), // Invalid type - treat as missing Some(_) => return None, // Invalid type - treat as missing
None => return Ok(None), // No params - use defaults None => return None, // No params - use defaults
}; };
// /Columns is REQUIRED per PDF spec 7.4.6 // /Columns is REQUIRED per PDF spec 7.4.6, but we use a default for error recovery.
// If /Columns is missing or invalid, we use DEFAULT_COLUMNS (1728, standard fax width).
let columns = match dict.get("/Columns") { let columns = match dict.get("/Columns") {
Some(PdfObject::Integer(n)) if *n > 0 => *n as u32, Some(PdfObject::Integer(n)) if *n > 0 => *n as u32,
Some(PdfObject::Integer(_)) => { _ => Self::DEFAULT_COLUMNS, // Missing, invalid, or non-positive -> use default
return Err(FilterError::InvalidParams(
"/Columns must be positive".to_string(),
))
}
Some(_) => {
return Err(FilterError::InvalidParams(
"/Columns must be an integer".to_string(),
))
}
None => {
return Err(FilterError::InvalidParams(
"/Columns is required for CCITTFaxDecode".to_string(),
))
}
}; };
// /K: encoding type (default = 0, which means Group 3 1D) // /K: encoding type (default = 0, which means Group 3 1D)
// -1 = Group 4, 0 = Group 3 1D, > 0 = Group 3 2D // -1 = Group 4, 0 = Group 3 1D, > 0 = Group 3 2D
let k = match dict.get("/K") { let k = match dict.get("/K") {
Some(PdfObject::Integer(n)) => *n as i32, Some(PdfObject::Integer(n)) => *n as i32,
Some(_) => return Ok(None), // Invalid type - use default _ => 0, // Invalid type or missing -> use default
None => 0, // Default: Group 3 1D
}; };
// /Rows: image height in pixels (optional) // /Rows: image height in pixels (optional)
let rows = match dict.get("/Rows") { let rows = match dict.get("/Rows") {
Some(PdfObject::Integer(n)) if *n > 0 => Some(*n as u32), Some(PdfObject::Integer(n)) if *n > 0 => Some(*n as u32),
Some(PdfObject::Integer(_)) => None, // Invalid value - treat as missing _ => None, // Invalid value, missing, or invalid type -> treat as missing
Some(_) => None, // Invalid type - treat as missing
None => None,
}; };
// /EncodedByteAlign: whether each line is byte-aligned (default false) // /EncodedByteAlign: whether each line is byte-aligned (default false)
let encoded_byte_align = match dict.get("/EncodedByteAlign") { let encoded_byte_align = match dict.get("/EncodedByteAlign") {
Some(PdfObject::Bool(b)) => *b, Some(PdfObject::Bool(b)) => *b,
Some(_) => false, // Invalid type - use default _ => false, // Invalid type or missing -> use default
None => false,
}; };
// /EndOfLine: whether EOL markers are present (default false) // /EndOfLine: whether EOL markers are present (default false)
let end_of_line = match dict.get("/EndOfLine") { let end_of_line = match dict.get("/EndOfLine") {
Some(PdfObject::Bool(b)) => *b, Some(PdfObject::Bool(b)) => *b,
Some(_) => false, // Invalid type - use default _ => false, // Invalid type or missing -> use default
None => false,
}; };
// /BlackIs1: whether 1 bit means black (default false = white) // /BlackIs1: whether 1 bit means black (default false = white)
let black_is_1 = match dict.get("/BlackIs1") { let black_is_1 = match dict.get("/BlackIs1") {
Some(PdfObject::Bool(b)) => *b, Some(PdfObject::Bool(b)) => *b,
Some(_) => false, // Invalid type - use default _ => false, // Invalid type or missing -> use default
None => false,
}; };
Ok(Some(ParsedCCITTParams { Some(ParsedCCITTParams {
k, k,
columns, columns,
rows, rows,
encoded_byte_align, encoded_byte_align,
end_of_line, end_of_line,
black_is_1, black_is_1,
})) })
} }
} }
@ -1200,9 +1182,8 @@ impl StreamDecoder for CCITTFaxDecoder {
doc_counter: &mut u64, doc_counter: &mut u64,
max_bytes: u64, max_bytes: u64,
) -> Result<Vec<u8>, FilterError> { ) -> Result<Vec<u8>, FilterError> {
// Parse and validate /DecodeParms // Parse /DecodeParms (uses defaults for missing/invalid values per INV-8)
// This ensures required parameters are present and valid let _parsed = Self::parse_params(params);
let _parsed = Self::parse_params(params)?;
// Pass through raw bytes unchanged // Pass through raw bytes unchanged
let len = input.len() as u64; let len = input.len() as u64;
@ -1879,6 +1860,160 @@ mod tests {
assert_eq!(result, None); assert_eq!(result, None);
} }
#[test]
fn test_ccittfax_passthrough_with_columns() {
    // With a valid /Columns entry the decoder must hand the bytes back
    // untouched and account for them in the decompression counter.
    let payload = b"\x00\x01\x02\x03"; // Placeholder bytes, not real CCITT data

    let mut decode_parms = indexmap::IndexMap::new();
    decode_parms.insert("/Columns".into(), PdfObject::Integer(1728));
    decode_parms.insert("/K".into(), PdfObject::Integer(-1)); // Group 4
    let params = Some(PdfObject::Dict(Box::new(decode_parms)));

    let mut bytes_seen = 0;
    let outcome = CCITTFaxDecoder::decode(
        payload,
        params.as_ref(),
        &mut bytes_seen,
        DEFAULT_MAX_DECOMPRESS_BYTES,
    );

    assert!(outcome.is_ok());
    assert_eq!(outcome.unwrap(), payload);
    assert_eq!(bytes_seen, payload.len() as u64);
}
#[test]
fn test_ccittfax_passthrough_missing_columns() {
    // An empty /DecodeParms dictionary (no /Columns) must not fail decoding:
    // the bytes still come back unchanged.
    let payload = b"\x00\x01\x02\x03"; // Placeholder bytes, not real CCITT data
    let empty_parms = indexmap::IndexMap::new();
    let params = Some(PdfObject::Dict(Box::new(empty_parms)));

    let mut bytes_seen = 0;
    let outcome = CCITTFaxDecoder::decode(
        payload,
        params.as_ref(),
        &mut bytes_seen,
        DEFAULT_MAX_DECOMPRESS_BYTES,
    );

    assert!(outcome.is_ok());
    assert_eq!(outcome.unwrap(), payload);
}
#[test]
fn test_ccittfax_passthrough_no_params() {
    // Decoding with no /DecodeParms at all must still return the input bytes.
    let payload = b"\x00\x01\x02\x03"; // Placeholder bytes, not real CCITT data
    let mut bytes_seen = 0;

    let outcome = CCITTFaxDecoder::decode(
        payload,
        None,
        &mut bytes_seen,
        DEFAULT_MAX_DECOMPRESS_BYTES,
    );

    assert!(outcome.is_ok());
    assert_eq!(outcome.unwrap(), payload);
}
#[test]
fn test_ccittfax_parse_params_with_all_fields() {
    // Every supported key is present, so each parsed field must mirror the
    // value stored in the dictionary.
    let mut dict = indexmap::IndexMap::new();
    dict.insert("/K".into(), PdfObject::Integer(-1)); // Group 4
    dict.insert("/Columns".into(), PdfObject::Integer(2480));
    dict.insert("/Rows".into(), PdfObject::Integer(3508));
    dict.insert("/EncodedByteAlign".into(), PdfObject::Bool(true));
    dict.insert("/EndOfLine".into(), PdfObject::Bool(false));
    dict.insert("/BlackIs1".into(), PdfObject::Bool(true));
    let params = Some(PdfObject::Dict(Box::new(dict)));

    // parse_params takes Option<&PdfObject>, so borrow the owned value.
    let parsed = CCITTFaxDecoder::parse_params(params.as_ref())
        .expect("dictionary params should parse to Some");

    assert_eq!(parsed.k, -1);
    assert_eq!(parsed.columns, 2480);
    assert_eq!(parsed.rows, Some(3508));
    assert!(parsed.encoded_byte_align);
    assert!(!parsed.end_of_line);
    assert!(parsed.black_is_1);
}
#[test]
fn test_ccittfax_parse_params_defaults() {
    // An empty dictionary must still parse, with every field at its default.
    let dict = indexmap::IndexMap::new();
    let params = Some(PdfObject::Dict(Box::new(dict)));

    // parse_params takes Option<&PdfObject>, so borrow the owned value.
    let parsed = CCITTFaxDecoder::parse_params(params.as_ref())
        .expect("empty dictionary should parse to Some");

    assert_eq!(parsed.k, 0); // Default: Group 3 1D
    assert_eq!(parsed.columns, CCITTFaxDecoder::DEFAULT_COLUMNS); // Default: 1728
    assert_eq!(parsed.rows, None); // Optional
    assert!(!parsed.encoded_byte_align); // Default: false
    assert!(!parsed.end_of_line); // Default: false
    assert!(!parsed.black_is_1); // Default: false
}
#[test]
fn test_ccittfax_parse_params_invalid_columns() {
    // Each unusable /Columns value must fall back to the default width
    // instead of making parse_params give up on the whole dictionary.
    let test_cases = vec![
        (PdfObject::Integer(0), "zero columns"), // Zero -> use default
        (PdfObject::Integer(-100), "negative columns"), // Negative -> use default
        (PdfObject::Bool(true), "bool columns"), // Wrong type -> use default
        (PdfObject::Name("Test".into()), "name columns"), // Wrong type -> use default
    ];

    for (value, desc) in test_cases {
        let mut dict = indexmap::IndexMap::new();
        dict.insert("/Columns".into(), value);
        let params = Some(PdfObject::Dict(Box::new(dict)));

        // parse_params takes Option<&PdfObject>, so borrow the owned value.
        let result = CCITTFaxDecoder::parse_params(params.as_ref());
        assert!(result.is_some(), "{} should return Some", desc);
        let parsed = result.unwrap();
        assert_eq!(parsed.columns, CCITTFaxDecoder::DEFAULT_COLUMNS, "{}", desc);
    }
}
#[test]
fn test_ccittfax_bomb_limit() {
    // Feed more bytes than the decompression budget allows and check that
    // the output is cut off at the budget.
    let oversized_input = vec![0u8; 1000];

    let mut decode_parms = indexmap::IndexMap::new();
    decode_parms.insert("/Columns".into(), PdfObject::Integer(1728));
    let params = Some(PdfObject::Dict(Box::new(decode_parms)));

    let byte_budget = 100; // Only allow 100 bytes
    let mut bytes_seen = 0;
    let outcome = CCITTFaxDecoder::decode(
        &oversized_input,
        params.as_ref(),
        &mut bytes_seen,
        byte_budget,
    );

    assert!(outcome.is_ok());
    let truncated = outcome.unwrap();
    assert_eq!(truncated.len(), 100); // Should truncate at bomb limit
}
#[test]
fn test_ccittfax_roundtrip_empty() {
    // A zero-length stream must decode successfully to zero bytes.
    let payload = b"";

    let mut decode_parms = indexmap::IndexMap::new();
    decode_parms.insert("/Columns".into(), PdfObject::Integer(1728));
    let params = Some(PdfObject::Dict(Box::new(decode_parms)));

    let mut bytes_seen = 0;
    let outcome = CCITTFaxDecoder::decode(
        payload,
        params.as_ref(),
        &mut bytes_seen,
        DEFAULT_MAX_DECOMPRESS_BYTES,
    );

    assert!(outcome.is_ok());
    let decoded = outcome.unwrap();
    assert_eq!(decoded.len(), 0);
}
/// Test FlateDecode bomb limit with minimal crafted input. /// Test FlateDecode bomb limit with minimal crafted input.
/// ///
/// This test uses a minimal compressed payload that decodes to ~200 bytes /// This test uses a minimal compressed payload that decodes to ~200 bytes
@ -2952,6 +3087,36 @@ fn decode_stream_impl(
None None
}; };
// Check CCITTFaxDecode parameters and emit STREAM_INVALID_CCITT whenever the
// decoder has to fall back to defaults. The cases mirror the fallbacks in
// CCITTFaxDecoder::parse_params: /Columns absent, /Columns present but not a
// positive integer, /DecodeParms of a non-dictionary type, and /DecodeParms
// absent altogether.
if normalized_name == "CCITTFaxDecode" {
    let invalid_ccitt_message: Option<&'static str> = match params {
        Some(PdfObject::Dict(dict)) => match dict.get("/Columns") {
            // Usable width: nothing to report.
            Some(PdfObject::Integer(n)) if *n > 0 => None,
            Some(_) => Some(
                "CCITTFaxDecode stream has invalid /Columns parameter; using default width 1728",
            ),
            None => Some(
                "CCITTFaxDecode stream missing required /Columns parameter; using default width 1728",
            ),
        },
        Some(_) => Some(
            "CCITTFaxDecode stream has non-dictionary /DecodeParms; using default parameters",
        ),
        None => Some("CCITTFaxDecode stream missing /DecodeParms; using default parameters"),
    };
    if let Some(message) = invalid_ccitt_message {
        diagnostics.push(Diagnostic::with_static_no_offset(
            DiagCode::StreamInvalidCcitt,
            message,
        ));
    }

    // Emit OCR_CCITT_UNSUPPORTED if full-render and libtiff are both unavailable.
    // cfg!(feature = "full-render") checks if pdfium-render is available.
    // The "image" feature is used as the stand-in for libtiff support
    // (image crate with tiff feature).
    let has_full_render = cfg!(feature = "full-render");
    let has_libtiff = cfg!(feature = "image");
    if !has_full_render && !has_libtiff {
        diagnostics.push(Diagnostic::with_static_no_offset(
            DiagCode::OcrCcittUnsupported,
            "CCITT fax compression detected but neither full-render nor libtiff is available; OCR will skip CCITT images",
        ));
    }
}
match get_decoder(&normalized_name) { match get_decoder(&normalized_name) {
Some(decoder) => { Some(decoder) => {
let counter_before = *doc_decompress_counter; let counter_before = *doc_decompress_counter;