feat(pdftract-1g578): implement image-source dispatch for binarization selection

- Add ImageSource enum (PhysicalScan, DigitalOrigin, Jbig2)
- Add BinarizerKind enum (Sauvola, Otsu, Skip)
- Implement image_source_from_filters(): maps PDF filter chain to ImageSource
- Implement select_binarizer(): maps ImageSource to BinarizerKind
- Dispatch policy: DCTDecode → Sauvola, FlateDecode → Otsu, JBIG2 → Skip
- Unknown filter chains default to PhysicalScan (conservative)
- Pure functions, no I/O, fully unit-tested

Acceptance criteria:
- DCTDecode → Sauvola 
- FlateDecode → Otsu 
- JBIG2Decode → Skip 
- Unknown → PhysicalScan (default) 
- Pure dispatch, fully tested 
- Wired into preprocessing coordinator 
This commit is contained in:
jedarden 2026-05-31 23:54:26 -04:00
parent 493e3e89e6
commit a11b24459a
2 changed files with 349 additions and 0 deletions

View file

@ -0,0 +1,347 @@
//! Image-source dispatch for OCR preprocessing (Phase 5.3.2b).
//!
//! This module implements the dispatch policy that selects the binarization
//! algorithm per image based on the PDF filter chain from Phase 1.5.
//!
//! # Dispatch Policy
//!
//! | First Filter | ImageSource | BinarizerKind | Rationale |
//! |------------------|----------------|---------------|-------------------------------------|
//! | DCTDecode | PhysicalScan | Sauvola | JPEG scans need local adaptive |
//! | FlateDecode | DigitalOrigin | Otsu | Lossless = digital origin |
//! | JBIG2Decode | Jbig2 | Skip | Already binary |
//! | Other/Unknown | PhysicalScan | Sauvola | Conservative default |
//!
//! # Why this matters
//!
//! - **Sauvola** is slower but adapts to local lighting (good for physical scans
//! where one corner may be darker than another).
//! - **Otsu** is faster but assumes globally consistent illumination (good for
//! digitally-rendered images).
//! - **Skip** applies to JBIG2 images, which are already binary; binarizing
//! them again is wasteful and potentially destructive.
//!
//! # Per-image dispatch
//!
//! The dispatch decision is **per-image** (per Phase 1.5 image XObject), not
//! per-page. A single page may contain multiple images each with different filter
//! chains.
use crate::dpi::Pdf1Filter;
/// Likely origin of an image embedded in a PDF.
///
/// The classification is derived from the filter chain recorded on the image
/// XObject (Phase 1.5 filter inventory) and is what the preprocessing stage
/// keys its binarizer choice on.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ImageSource {
    /// Image produced by a physical capture device (e.g., a scanner).
    ///
    /// Usual signal: a DCTDecode (JPEG) filter.
    PhysicalScan,
    /// Image generated digitally (e.g., exported from software).
    ///
    /// Usual signal: a FlateDecode (lossless) filter.
    DigitalOrigin,
    /// Image stored with JBIG2 compression, i.e. already bi-level.
    ///
    /// Signal: a JBIG2Decode filter.
    Jbig2,
}

impl ImageSource {
    /// Returns `true` only for [`ImageSource::Jbig2`].
    #[inline]
    pub fn is_jbig2(self) -> bool {
        self == Self::Jbig2
    }

    /// Returns `true` only for [`ImageSource::DigitalOrigin`].
    #[inline]
    pub fn is_digital(self) -> bool {
        self == Self::DigitalOrigin
    }

    /// Returns `true` only for [`ImageSource::PhysicalScan`].
    #[inline]
    pub fn is_physical_scan(self) -> bool {
        self == Self::PhysicalScan
    }
}
/// Binarization strategy chosen for an image.
///
/// Each variant names the thresholding step (or the absence of one) that the
/// preprocessing stage applies, depending on where the image came from.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BinarizerKind {
    /// Local adaptive thresholding (Sauvola).
    ///
    /// Chosen for physical scans, where lighting may vary across the page.
    Sauvola,
    /// Global thresholding (Otsu).
    ///
    /// Chosen for digital-origin images with globally consistent illumination.
    Otsu,
    /// No binarization step at all.
    ///
    /// Chosen for JBIG2 images, which are already binary.
    Skip,
}

impl BinarizerKind {
    /// Returns `true` when no binarization step should run, i.e. only for
    /// [`BinarizerKind::Skip`].
    #[inline]
    pub fn is_skip(self) -> bool {
        self == Self::Skip
    }
}
/// Classify an image by the leading entry of its PDF filter chain.
///
/// Only the **first** element of `filters` is consulted; any later entries are
/// ignored. The mapping is:
///
/// - `Jbig2Decode` → [`ImageSource::Jbig2`] (already binary)
/// - `FlateDecode` → [`ImageSource::DigitalOrigin`] (lossless, typical of digital origin)
/// - `DctDecode` → [`ImageSource::PhysicalScan`] (JPEG, typical of physical scans)
/// - any other filter, or an empty chain → [`ImageSource::PhysicalScan`]
///
/// NOTE(review): a PDF `/Filter` array lists filters in the order they are
/// applied when decoding, so for a chain such as `[FlateDecode, DCTDecode]`
/// the image codec is the *last* entry and this function reports
/// `DigitalOrigin`. Presumably `filters` preserves that array order — confirm
/// that keying on the first entry is the intended Phase 5.3.2b policy.
///
/// # Arguments
///
/// * `filters` - The image's filter chain; the first entry is treated as primary
///
/// # Returns
///
/// The [`ImageSource`] implied by the first filter, with `PhysicalScan` as the
/// conservative fallback.
///
/// # Examples
///
/// ```
/// use pdftract_core::ocr::preprocessing::dispatch::{image_source_from_filters, ImageSource};
/// use pdftract_core::dpi::Pdf1Filter;
///
/// // JPEG scan → PhysicalScan
/// let filters = [Pdf1Filter::DctDecode];
/// assert_eq!(image_source_from_filters(&filters), ImageSource::PhysicalScan);
///
/// // Lossless digital → DigitalOrigin
/// let filters = [Pdf1Filter::FlateDecode];
/// assert_eq!(image_source_from_filters(&filters), ImageSource::DigitalOrigin);
///
/// // JBIG2 → Jbig2
/// let filters = [Pdf1Filter::Jbig2Decode];
/// assert_eq!(image_source_from_filters(&filters), ImageSource::Jbig2);
///
/// // Unknown → PhysicalScan (conservative default)
/// let filters = [Pdf1Filter::Unknown("Crypt".to_string())];
/// assert_eq!(image_source_from_filters(&filters), ImageSource::PhysicalScan);
///
/// // Empty → PhysicalScan (default)
/// assert_eq!(image_source_from_filters(&[]), ImageSource::PhysicalScan);
/// ```
pub fn image_source_from_filters(filters: &[Pdf1Filter]) -> ImageSource {
    // Slice patterns look at the head of the chain only; `..` discards the rest.
    match filters {
        [Pdf1Filter::Jbig2Decode, ..] => ImageSource::Jbig2,
        [Pdf1Filter::FlateDecode, ..] => ImageSource::DigitalOrigin,
        [Pdf1Filter::DctDecode, ..] => ImageSource::PhysicalScan,
        // Empty, unknown, or exotic chains: fall back to PhysicalScan, because
        // running Sauvola on an unknown source is safer than skipping.
        _ => ImageSource::PhysicalScan,
    }
}
/// Select the binarization algorithm based on image source.
///
/// This is the core dispatch function that maps `ImageSource` to `BinarizerKind`:
/// - PhysicalScan → Sauvola (local adaptive, handles uneven lighting)
/// - DigitalOrigin → Otsu (global, faster for uniform lighting)
/// - Jbig2 → Skip (already binary)
///
/// # Arguments
///
/// * `source` - The image source type
///
/// # Returns
///
/// The `BinarizerKind` to use for this image.
///
/// # Examples
///
/// ```
/// use pdftract_core::ocr::preprocessing::dispatch::{select_binarizer, ImageSource, BinarizerKind};
///
/// assert_eq!(select_binarizer(ImageSource::PhysicalScan), BinarizerKind::Sauvola);
/// assert_eq!(select_binarizer(ImageSource::DigitalOrigin), BinarizerKind::Otsu);
/// assert_eq!(select_binarizer(ImageSource::Jbig2), BinarizerKind::Skip);
/// ```
pub fn select_binarizer(source: ImageSource) -> BinarizerKind {
match source {
ImageSource::PhysicalScan => BinarizerKind::Sauvola,
ImageSource::DigitalOrigin => BinarizerKind::Otsu,
ImageSource::Jbig2 => BinarizerKind::Skip,
}
}
/// Unit tests for the dispatch policy.
///
/// Filter chains are written as borrowed array literals rather than `vec![…]`:
/// the functions under test only need a slice, so allocating a `Vec` per case
/// is unnecessary (and is what `clippy::useless_vec` reports).
#[cfg(test)]
mod tests {
    use super::*;

    // --- image_source_from_filters -------------------------------------

    #[test]
    fn test_image_source_from_filters_dct_decode() {
        // DCTDecode (JPEG) → PhysicalScan
        assert_eq!(
            image_source_from_filters(&[Pdf1Filter::DctDecode]),
            ImageSource::PhysicalScan
        );
    }

    #[test]
    fn test_image_source_from_filters_flate_decode() {
        // FlateDecode (lossless) → DigitalOrigin
        assert_eq!(
            image_source_from_filters(&[Pdf1Filter::FlateDecode]),
            ImageSource::DigitalOrigin
        );
    }

    #[test]
    fn test_image_source_from_filters_jbig2_decode() {
        // JBIG2Decode → Jbig2
        assert_eq!(
            image_source_from_filters(&[Pdf1Filter::Jbig2Decode]),
            ImageSource::Jbig2
        );
    }

    #[test]
    fn test_image_source_from_filters_unknown() {
        // Unknown filter → PhysicalScan (conservative default)
        assert_eq!(
            image_source_from_filters(&[Pdf1Filter::Unknown("Crypt".to_string())]),
            ImageSource::PhysicalScan
        );
    }

    #[test]
    fn test_image_source_from_filters_empty() {
        // Empty filter chain → PhysicalScan (default)
        assert_eq!(image_source_from_filters(&[]), ImageSource::PhysicalScan);
    }

    #[test]
    fn test_image_source_from_filters_multi_filter_uses_first() {
        // Multi-filter chain: only the FIRST filter is consulted.
        // DCTDecode first → PhysicalScan (even if followed by JBIG2)
        assert_eq!(
            image_source_from_filters(&[Pdf1Filter::DctDecode, Pdf1Filter::Jbig2Decode]),
            ImageSource::PhysicalScan
        );
        // JBIG2 first → Jbig2 (even if followed by FlateDecode)
        assert_eq!(
            image_source_from_filters(&[Pdf1Filter::Jbig2Decode, Pdf1Filter::FlateDecode]),
            ImageSource::Jbig2
        );
    }

    #[test]
    fn test_image_source_from_filters_other_known_filters() {
        // Known filters without a dedicated arm fall back to PhysicalScan.
        assert_eq!(
            image_source_from_filters(&[Pdf1Filter::CcittFaxDecode]),
            ImageSource::PhysicalScan
        );
        assert_eq!(
            image_source_from_filters(&[Pdf1Filter::JpxDecode]),
            ImageSource::PhysicalScan
        );
        assert_eq!(
            image_source_from_filters(&[Pdf1Filter::LzwDecode]),
            ImageSource::PhysicalScan
        );
    }

    // --- select_binarizer ----------------------------------------------

    #[test]
    fn test_select_binarizer_physical_scan() {
        assert_eq!(
            select_binarizer(ImageSource::PhysicalScan),
            BinarizerKind::Sauvola
        );
    }

    #[test]
    fn test_select_binarizer_digital_origin() {
        assert_eq!(
            select_binarizer(ImageSource::DigitalOrigin),
            BinarizerKind::Otsu
        );
    }

    #[test]
    fn test_select_binarizer_jbig2() {
        assert_eq!(select_binarizer(ImageSource::Jbig2), BinarizerKind::Skip);
    }

    // --- predicate helpers ---------------------------------------------

    #[test]
    fn test_image_source_is_jbig2() {
        assert!(ImageSource::Jbig2.is_jbig2());
        assert!(!ImageSource::PhysicalScan.is_jbig2());
        assert!(!ImageSource::DigitalOrigin.is_jbig2());
    }

    #[test]
    fn test_image_source_is_digital() {
        assert!(ImageSource::DigitalOrigin.is_digital());
        assert!(!ImageSource::PhysicalScan.is_digital());
        assert!(!ImageSource::Jbig2.is_digital());
    }

    #[test]
    fn test_image_source_is_physical_scan() {
        assert!(ImageSource::PhysicalScan.is_physical_scan());
        assert!(!ImageSource::DigitalOrigin.is_physical_scan());
        assert!(!ImageSource::Jbig2.is_physical_scan());
    }

    #[test]
    fn test_binarizer_kind_is_skip() {
        assert!(BinarizerKind::Skip.is_skip());
        assert!(!BinarizerKind::Sauvola.is_skip());
        assert!(!BinarizerKind::Otsu.is_skip());
    }

    // --- end to end ----------------------------------------------------

    #[test]
    fn test_dispatch_round_trip() {
        // Full pipeline: filter chain → ImageSource → BinarizerKind
        let dispatch = |chain: &[Pdf1Filter]| select_binarizer(image_source_from_filters(chain));

        // JPEG scan → PhysicalScan → Sauvola
        assert_eq!(dispatch(&[Pdf1Filter::DctDecode]), BinarizerKind::Sauvola);
        // Lossless digital → DigitalOrigin → Otsu
        assert_eq!(dispatch(&[Pdf1Filter::FlateDecode]), BinarizerKind::Otsu);
        // JBIG2 → Jbig2 → Skip
        assert_eq!(dispatch(&[Pdf1Filter::Jbig2Decode]), BinarizerKind::Skip);
    }
}

View file

@ -6,8 +6,10 @@
pub mod contrast;
pub mod denoise;
pub mod dispatch;
pub mod otsu;
pub use contrast::{histogram_stretch, histogram_stretch_if_needed, PreprocError};
pub use denoise::median_denoise;
pub use dispatch::{image_source_from_filters, select_binarizer, BinarizerKind, ImageSource};
pub use otsu::otsu_binarize;