diff --git a/crates/pdftract-core/src/hybrid.rs b/crates/pdftract-core/src/hybrid.rs
index 83893cf..dab690e 100644
--- a/crates/pdftract-core/src/hybrid.rs
+++ b/crates/pdftract-core/src/hybrid.rs
@@ -42,7 +42,7 @@ pub struct Span {
     pub text: String,
 }
 
-/// Source of a span - either vector extraction, OCR, or assisted OCR.
+/// Source of a span - either vector extraction, OCR, assisted OCR, or OCR fallback.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum SpanSource {
     /// Text extracted from content stream (Phase 3).
@@ -51,6 +51,8 @@ pub enum SpanSource {
     Ocr,
     /// Text extracted via assisted OCR with position validation (Phase 5.5).
     OcrAssisted,
+    /// Text extracted via pure OCR fallback after region-level validation failed (Phase 5.5.3).
+    OcrFallback,
 }
 
 impl Span {
@@ -79,6 +81,11 @@ impl Span {
         Self::new(bbox, confidence, SpanSource::OcrAssisted, text)
     }
 
+    /// Create a span with OCR fallback source (region-level validation failed).
+    pub fn ocr_fallback(bbox: [f64; 4], confidence: f32, text: String) -> Self {
+        Self::new(bbox, confidence, SpanSource::OcrFallback, text)
+    }
+
     /// Get the width of the span's bbox.
     #[inline]
     pub fn width(&self) -> f64 {
diff --git a/crates/pdftract-core/src/ocr.rs b/crates/pdftract-core/src/ocr.rs
index 053e64c..8dd2576 100644
--- a/crates/pdftract-core/src/ocr.rs
+++ b/crates/pdftract-core/src/ocr.rs
@@ -17,7 +17,7 @@ use std::ffi::CString;
 use std::fs;
 use std::path::{Path, PathBuf};
 use std::sync::atomic::{AtomicUsize, Ordering};
-use tesseract::TessBaseAPI;
+use tesseract::{PageSegMode, TessBaseAPI};
 
 /// Global counter for tracking Tesseract initializations across all threads.
 ///
@@ -286,6 +286,11 @@ pub struct TessOpts {
     ///
     /// Default: None
     pub tessdata_path: Option<PathBuf>,
+    /// Page segmentation mode.
+    ///
+    /// Controls how Tesseract interprets the page layout.
+    /// Default: None (Tesseract's default, usually PSM_AUTO).
+    pub page_seg_mode: Option<PageSegMode>,
 }
 
 impl Default for TessOpts {
@@ -293,6 +298,7 @@ impl Default for TessOpts {
         Self {
             language: "eng".to_string(),
             tessdata_path: None,
+            page_seg_mode: None,
         }
     }
 }
@@ -317,6 +323,7 @@ impl TessOpts {
         Self {
             language: language.to_string(),
             tessdata_path: None,
+            page_seg_mode: None,
         }
     }
 
@@ -340,6 +347,31 @@ impl TessOpts {
         Self {
             language: "eng".to_string(),
             tessdata_path: Some(tessdata_path),
+            page_seg_mode: None,
+        }
+    }
+
+    /// Create TessOpts with a specific page segmentation mode.
+    ///
+    /// # Arguments
+    ///
+    /// * `page_seg_mode` - Page segmentation mode for Tesseract
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use pdftract_core::ocr::TessOpts;
+    /// use tesseract::PageSegMode;
+    ///
+    /// let opts = TessOpts::with_page_seg_mode(PageSegMode::PsmSparseText);
+    /// assert!(opts.page_seg_mode.is_some());
+    /// ```
+    #[must_use]
+    pub fn with_page_seg_mode(page_seg_mode: PageSegMode) -> Self {
+        Self {
+            language: "eng".to_string(),
+            tessdata_path: None,
+            page_seg_mode: Some(page_seg_mode),
         }
     }
 
@@ -436,6 +468,11 @@ impl TessState {
             )
         })?;
 
+        // Set page segmentation mode if specified
+        if let Some(mode) = opts.page_seg_mode {
+            api.set_page_seg_mode(mode);
+        }
+
         // Track initialization for testing
         INIT_COUNT.fetch_add(1, Ordering::SeqCst);
 
@@ -549,6 +586,7 @@ mod tests {
         let opts = TessOpts::default();
         assert_eq!(opts.language, "eng");
         assert!(opts.tessdata_path.is_none());
+        assert!(opts.page_seg_mode.is_none());
     }
 
     #[test]
@@ -556,6 +594,7 @@ mod tests {
         let opts = TessOpts::with_language("fra");
         assert_eq!(opts.language, "fra");
         assert!(opts.tessdata_path.is_none());
+        assert!(opts.page_seg_mode.is_none());
     }
 
     #[test]
@@ -564,6 +603,15 @@ mod tests {
         let opts = TessOpts::with_tessdata_path(path.clone());
         assert_eq!(opts.language, "eng");
         assert_eq!(opts.tessdata_path, Some(path));
+        assert!(opts.page_seg_mode.is_none());
+    }
+
+    #[test]
+    fn test_tess_opts_with_page_seg_mode() {
+        let opts = TessOpts::with_page_seg_mode(PageSegMode::PsmSparseText);
+        assert_eq!(opts.language, "eng");
+        assert!(opts.tessdata_path.is_none());
+        assert_eq!(opts.page_seg_mode, Some(PageSegMode::PsmSparseText));
     }
 
     #[test]
@@ -578,6 +626,9 @@ mod tests {
         let path = PathBuf::from("/custom/path");
         let opts4 = TessOpts::with_tessdata_path(path);
         assert_ne!(opts1, opts4);
+
+        let opts5 = TessOpts::with_page_seg_mode(PageSegMode::PsmSparseText);
+        assert_ne!(opts1, opts5);
     }
 
     #[test]
@@ -586,6 +637,7 @@ mod tests {
         let opts = TessOpts {
             language: "eng".to_string(),
             tessdata_path: Some(path.clone()),
+            page_seg_mode: None,
         };
 
         let resolved = opts.resolve_tessdata_path();
@@ -613,6 +665,7 @@ mod tests {
         let opts = TessOpts {
             language: "eng".to_string(),
             tessdata_path: Some(path.clone()),
+            page_seg_mode: None,
         };
 
         let resolved = opts.resolve_tessdata_path();
@@ -2347,6 +2400,19 @@ const ASSISTED_OCR_CONFIDENCE_CAP: f32 = 0.4;
 /// For small N (< 100), linear scan is faster due to lower overhead.
 const ASSISTED_OCR_KDTREE_THRESHOLD: usize = 100;
 
+/// Region-level confidence threshold for keeping assisted-OCR output.
+///
+/// If the mean confidence of all assisted-OCR words in a region is greater
+/// than this value, the region is kept as-is with confidence_source = "ocr-assisted".
+const ASSISTED_OCR_KEEP_THRESH: f32 = 0.7;
+
+/// Region-level confidence threshold for falling back to pure OCR.
+///
+/// If the mean confidence of all assisted-OCR words in a region is less
+/// than this value, the region is reprocessed with pure OCR (no validation filter)
+/// and emitted with confidence_source = "ocr-fallback".
+const ASSISTED_OCR_FALLBACK_THRESH: f32 = 0.3;
+
 /// Validate OCR words against vector glyph position hints.
 ///
 /// This function implements the per-word validation filter for the
@@ -2448,6 +2514,172 @@ pub fn validate_ocr_with_position_hints(
         .collect()
 }
 
+/// Region (line) for grouping OCR words by baseline proximity.
+#[derive(Debug, Clone)]
+struct OcrRegion {
+    /// Words in this region.
+    words: Vec<(HocrWord, [f64; 4])>, // (HocrWord, PDF bbox)
+    /// Mean confidence of all words in this region.
+    mean_confidence: f32,
+}
+
+/// Apply region-level confidence policy to assisted-OCR spans.
+///
+/// This function implements Phase 5.5.3 step 5: for each region (line),
+/// compute the mean confidence across all assisted-OCR words and decide
+/// whether to keep as-is, keep with high confidence flag, or trigger fallback.
+///
+/// # Arguments
+///
+/// * `hocr_words` - OCR words from Tesseract (in pixel coordinates)
+/// * `vector_glyphs` - Position hints from Phase 3
+/// * `dpi` - DPI used for rendering
+/// * `page_height_pt` - Page height in PDF points
+///
+/// # Returns
+///
+/// A tuple of:
+/// - Vec of spans with adjusted confidence sources
+/// - Vec of HocrWords that need fallback (grouped by regions with mean < 0.3)
+///
+/// # Region Grouping
+///
+/// Words are grouped into regions by baseline proximity (Y-coordinate).
+/// A word joins a region when its baseline is less than 12pt from the baseline
+/// of that region's first word (a fixed tolerance, not scaled to font size).
+///
+/// # Policy
+///
+/// For each region:
+/// - mean > 0.7: keep with `OcrAssisted` source
+/// - mean < 0.3: flag for fallback (caller should rerun Tesseract)
+/// - 0.3 <= mean <= 0.7: keep with `OcrAssisted` source
+///
+/// # See also
+///
+/// - Phase 5.5 pipeline step 5 (plan line 1937)
+/// - `validate_ocr_with_position_hints` for per-word validation
+pub fn apply_region_level_confidence_policy(
+    hocr_words: &[HocrWord],
+    vector_glyphs: &[Glyph],
+    dpi: u32,
+    page_height_pt: f64,
+) -> (Vec<crate::hybrid::Span>, Vec<(HocrWord, [f64; 4])>) {
+    // First, apply per-word validation to get initial confidence-adjusted spans
+    let validated_spans =
+        validate_ocr_with_position_hints(hocr_words, vector_glyphs, dpi, page_height_pt);
+
+    // Group words into regions by baseline proximity
+    let regions = group_words_by_region(hocr_words, dpi, page_height_pt);
+
+    // Compute mean confidence for each region and classify
+    let mut final_spans = Vec::new();
+    let mut fallback_words = Vec::new();
+
+    for region in regions {
+        if region.mean_confidence < ASSISTED_OCR_FALLBACK_THRESH {
+            // Region needs fallback - collect original words for rerun
+            for (word, pdf_bbox) in region.words {
+                fallback_words.push((word, pdf_bbox));
+            }
+        } else {
+            // Keep region - convert validated spans to final output
+            // Words in this region are already in validated_spans
+            // We need to match them up by position
+            for (word, pdf_bbox) in region.words {
+                // Find the corresponding validated span
+                if let Some(span) = validated_spans
+                    .iter()
+                    .find(|s| s.bbox == pdf_bbox && s.text == word.text)
+                {
+                    let span = if region.mean_confidence > ASSISTED_OCR_KEEP_THRESH {
+                        // High confidence region - keep as OcrAssisted
+                        crate::hybrid::Span::ocr_assisted(
+                            span.bbox,
+                            span.confidence,
+                            span.text.clone(),
+                        )
+                    } else {
+                        // Medium confidence region - keep as-is (OcrAssisted)
+                        span.clone()
+                    };
+                    final_spans.push(span);
+                }
+            }
+        }
+    }
+
+    (final_spans, fallback_words)
+}
+
+/// Group OCR words into regions by baseline proximity.
+///
+/// A word joins the first region whose first-word baseline is less than 12pt away.
+/// The baseline is computed as `y0 + (bbox_height * 0.2)`.
+///
+/// # Arguments
+///
+/// * `hocr_words` - OCR words from Tesseract
+/// * `dpi` - DPI used for rendering
+/// * `page_height_pt` - Page height in PDF points
+///
+/// # Returns
+///
+/// A vector of regions, each containing words and their mean confidence.
+fn group_words_by_region(hocr_words: &[HocrWord], dpi: u32, page_height_pt: f64) -> Vec<OcrRegion> {
+    if hocr_words.is_empty() {
+        return Vec::new();
+    }
+
+    // Convert all words to PDF coordinates and compute baselines
+    let mut word_info: Vec<(HocrWord, [f64; 4], f64)> = hocr_words
+        .iter()
+        .map(|word| {
+            let pdf_bbox = word.to_pdf_bbox(dpi, page_height_pt, None, None);
+            let baseline = pdf_bbox[1] + (pdf_bbox[3] - pdf_bbox[1]) * 0.2;
+            (word.clone(), pdf_bbox, baseline)
+        })
+        .collect();
+
+    // Sort by baseline for deterministic grouping
+    word_info.sort_by(|a, b| a.2.partial_cmp(&b.2).unwrap_or(std::cmp::Ordering::Equal));
+
+    // Group by baseline proximity (within 12pt)
+    let mut regions: Vec<OcrRegion> = Vec::new();
+    const BASELINE_TOLERANCE_PT: f64 = 12.0;
+
+    for (word, pdf_bbox, baseline) in word_info {
+        let confidence = word.confidence();
+
+        // Find existing region with compatible baseline
+        let region = regions.iter_mut().find(|r| {
+            if r.words.is_empty() {
+                return false;
+            }
+            // Compute region's baseline from first word
+            let (_, first_bbox) = &r.words[0];
+            let region_baseline = first_bbox[1] + (first_bbox[3] - first_bbox[1]) * 0.2;
+            (region_baseline - baseline).abs() < BASELINE_TOLERANCE_PT
+        });
+
+        if let Some(region) = region {
+            // Add to existing region
+            region.words.push((word, pdf_bbox));
+            // Recompute mean confidence
+            let sum: f32 = region.words.iter().map(|(w, _)| w.confidence()).sum();
+            region.mean_confidence = sum / region.words.len() as f32;
+        } else {
+            // Create new region
+            regions.push(OcrRegion {
+                words: vec![(word, pdf_bbox)],
+                mean_confidence: confidence,
+            });
+        }
+    }
+
+    regions
+}
+
 #[cfg(test)]
 mod assisted_ocr_tests {
     use super::*;
@@ -2586,6 +2818,135 @@ mod assisted_ocr_tests {
         assert_eq!(ASSISTED_OCR_DISTANCE_PT, 5.0);
         assert_eq!(ASSISTED_OCR_CONFIDENCE_CAP, 0.4);
         assert_eq!(ASSISTED_OCR_KDTREE_THRESHOLD, 100);
+        assert_eq!(ASSISTED_OCR_KEEP_THRESH, 0.7);
+        assert_eq!(ASSISTED_OCR_FALLBACK_THRESH, 0.3);
+    }
+
+    #[test]
+    fn test_region_level_policy_high_confidence_region() {
+        // Test region with mean confidence > 0.7 - should keep as OcrAssisted
+        let glyphs = vec![
+            Glyph::position_hint([100.0, 200.0, 110.0, 210.0]),
+            Glyph::position_hint([120.0, 200.0, 130.0, 210.0]),
+        ];
+        let words = vec![
+            HocrWord {
+                text: "hello".to_string(),
+                bbox_px: [102, 202, 108, 208],
+                confidence_0_100: 95,
+            },
+            HocrWord {
+                text: "world".to_string(),
+                bbox_px: [122, 202, 128, 208],
+                confidence_0_100: 90,
+            },
+        ];
+
+        let (spans, fallback) = apply_region_level_confidence_policy(&words, &glyphs, 300, 792.0);
+
+        // Both words are near glyphs, so they keep high confidence
+        assert_eq!(spans.len(), 2);
+        assert_eq!(fallback.len(), 0); // No fallback needed
+        assert!(spans
+            .iter()
+            .all(|s| s.source == crate::hybrid::SpanSource::OcrAssisted));
+    }
+
+    #[test]
+    fn test_region_level_policy_low_confidence_region() {
+        // Test region with mean confidence < 0.3 - should trigger fallback
+        let glyphs = vec![]; // No glyphs -> all words capped at 0.4
+        let words = vec![
+            HocrWord {
+                text: "low1".to_string(),
+                bbox_px: [100, 100, 120, 120],
+                confidence_0_100: 20,
+            },
+            HocrWord {
+                text: "low2".to_string(),
+                bbox_px: [130, 100, 150, 120],
+                confidence_0_100: 25,
+            },
+        ];
+
+        let (spans, fallback) = apply_region_level_confidence_policy(&words, &glyphs, 300, 792.0);
+
+        // Low confidence region -> fallback triggered
+        assert_eq!(spans.len(), 0); // No spans kept
+        assert_eq!(fallback.len(), 2); // Both words need fallback
+    }
+
+    #[test]
+    fn test_region_level_policy_medium_confidence_region() {
+        // Test region with 0.3 <= mean confidence <= 0.7 - should keep as-is
+        let glyphs = vec![];
+        let words = vec![
+            HocrWord {
+                text: "med1".to_string(),
+                bbox_px: [100, 100, 120, 120],
+                confidence_0_100: 40,
+            },
+            HocrWord {
+                text: "med2".to_string(),
+                bbox_px: [130, 100, 150, 120],
+                confidence_0_100: 50,
+            },
+        ];
+
+        let (spans, fallback) = apply_region_level_confidence_policy(&words, &glyphs, 300, 792.0);
+
+        // Medium confidence region -> kept as-is (capped at 0.4 by validation)
+        assert_eq!(spans.len(), 2);
+        assert_eq!(fallback.len(), 0); // No fallback needed
+    }
+
+    #[test]
+    fn test_region_level_policy_multiple_regions() {
+        // Test multiple regions with different confidence levels
+        let glyphs = vec![
+            Glyph::position_hint([100.0, 200.0, 110.0, 210.0]), // For high confidence region
+        ];
+        let words = vec![
+            // Region 1: high confidence (near glyph)
+            HocrWord {
+                text: "hello".to_string(),
+                bbox_px: [102, 202, 108, 208],
+                confidence_0_100: 95,
+            },
+            // Region 2: low confidence (far from glyph, different Y)
+            HocrWord {
+                text: "low".to_string(),
+                bbox_px: [500, 500, 520, 520],
+                confidence_0_100: 20,
+            },
+        ];
+
+        let (spans, fallback) = apply_region_level_confidence_policy(&words, &glyphs, 300, 792.0);
+
+        // One span kept, one word needs fallback
+        assert_eq!(spans.len(), 1);
+        assert_eq!(fallback.len(), 1);
+        assert_eq!(spans[0].text, "hello");
+    }
+
+    #[test]
+    fn test_group_words_by_region_empty() {
+        let words: Vec<HocrWord> = vec![];
+        let regions = group_words_by_region(&words, 300, 792.0);
+        assert_eq!(regions.len(), 0);
+    }
+
+    #[test]
+    fn test_group_words_by_region_single_word() {
+        let words = vec![HocrWord {
+            text: "test".to_string(),
+            bbox_px: [100, 100, 120, 120],
+            confidence_0_100: 80,
+        }];
+        let regions = group_words_by_region(&words, 300, 792.0);
+        assert_eq!(regions.len(), 1);
+        assert_eq!(regions[0].words.len(), 1);
+        assert_eq!(regions[0].mean_confidence, 0.8);
     }
 }
 
diff --git a/crates/pdftract-core/src/schema/mod.rs b/crates/pdftract-core/src/schema/mod.rs
index 9dccbd2..013e20a 100644
--- a/crates/pdftract-core/src/schema/mod.rs
+++ b/crates/pdftract-core/src/schema/mod.rs
@@ -28,6 +28,13 @@ use crate::signature::Signature;
 ///
 /// A span is the smallest unit of extracted text, representing a
 /// contiguous run of text with consistent font and styling.
+///
+/// # TODO: Phase 6.1 - Add confidence_source field
+///
+/// When the `confidence_source` field is added to the schema (per plan line 363, 1662),
+/// it should include "ocr-fallback" as a valid value for spans emitted via
+/// Phase 5.5.3 region-level fallback. The internal `SpanSource::OcrFallback` variant
+/// in `hybrid.rs` maps to this value.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 #[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct SpanJson {