feat(pdftract-29gu): implement Phase 5.5.3 region-level confidence policy

- Add OcrFallback variant to SpanSource enum for fallback spans
- Add page_seg_mode field to TessOpts for PSM_SPARSE_TEXT support
- Add ASSISTED_OCR_KEEP_THRESH (0.7) and ASSISTED_OCR_FALLBACK_THRESH (0.3) constants
- Implement apply_region_level_confidence_policy() for region-level decision making
- Group words by baseline proximity (12pt tolerance) for region computation
- Add TODO for Phase 6.1 confidence_source enum to include "ocr-fallback"

Closes: pdftract-29gu
This commit is contained in:
jedarden 2026-05-24 05:15:46 -04:00
parent 6aefd76c63
commit a639794133
3 changed files with 377 additions and 2 deletions

View file

@ -42,7 +42,7 @@ pub struct Span {
pub text: String,
}
/// Source of a span - either vector extraction, OCR, or assisted OCR.
/// Source of a span - either vector extraction, OCR, assisted OCR, or OCR fallback.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SpanSource {
/// Text extracted from content stream (Phase 3).
@ -51,6 +51,8 @@ pub enum SpanSource {
Ocr,
/// Text extracted via assisted OCR with position validation (Phase 5.5).
OcrAssisted,
/// Text extracted via pure OCR fallback after region-level validation failed (Phase 5.5.3).
OcrFallback,
}
impl Span {
@ -79,6 +81,11 @@ impl Span {
Self::new(bbox, confidence, SpanSource::OcrAssisted, text)
}
/// Build a span tagged with [`SpanSource::OcrFallback`].
///
/// Intended for text produced by the pure-OCR fallback path, i.e. after the
/// region-level validation of assisted OCR has rejected a region.
/// `bbox`, `confidence` and `text` are forwarded unchanged to [`Span::new`].
pub fn ocr_fallback(bbox: [f64; 4], confidence: f32, text: String) -> Self {
    let source = SpanSource::OcrFallback;
    Self::new(bbox, confidence, source, text)
}
/// Get the width of the span's bbox.
#[inline]
pub fn width(&self) -> f64 {

View file

@ -17,7 +17,7 @@ use std::ffi::CString;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicUsize, Ordering};
use tesseract::TessBaseAPI;
use tesseract::{PageSegMode, TessBaseAPI};
/// Global counter for tracking Tesseract initializations across all threads.
///
@ -286,6 +286,11 @@ pub struct TessOpts {
///
/// Default: None
pub tessdata_path: Option<PathBuf>,
/// Page segmentation mode.
///
/// Controls how Tesseract interprets the page layout.
/// Default: None (Tesseract's default, usually PSM_AUTO).
pub page_seg_mode: Option<PageSegMode>,
}
impl Default for TessOpts {
@ -293,6 +298,7 @@ impl Default for TessOpts {
Self {
language: "eng".to_string(),
tessdata_path: None,
page_seg_mode: None,
}
}
}
@ -317,6 +323,7 @@ impl TessOpts {
Self {
language: language.to_string(),
tessdata_path: None,
page_seg_mode: None,
}
}
@ -340,6 +347,31 @@ impl TessOpts {
Self {
language: "eng".to_string(),
tessdata_path: Some(tessdata_path),
page_seg_mode: None,
}
}
/// Build a `TessOpts` that requests an explicit page segmentation mode.
///
/// The language is `"eng"` and no custom tessdata path is set, exactly as
/// in the default options; only `page_seg_mode` is populated.
///
/// # Arguments
///
/// * `page_seg_mode` - Page segmentation mode to hand to Tesseract
///
/// # Examples
///
/// ```
/// use pdftract_core::ocr::TessOpts;
/// use tesseract::PageSegMode;
///
/// let opts = TessOpts::with_page_seg_mode(PageSegMode::PsmSparseText);
/// assert!(opts.page_seg_mode.is_some());
/// ```
#[must_use]
pub fn with_page_seg_mode(page_seg_mode: PageSegMode) -> Self {
    let language = String::from("eng");
    let requested_mode = Some(page_seg_mode);
    Self {
        language,
        tessdata_path: None,
        page_seg_mode: requested_mode,
    }
}
@ -436,6 +468,11 @@ impl TessState {
)
})?;
// Set page segmentation mode if specified
if let Some(mode) = opts.page_seg_mode {
api.set_page_seg_mode(mode);
}
// Track initialization for testing
INIT_COUNT.fetch_add(1, Ordering::SeqCst);
@ -549,6 +586,7 @@ mod tests {
let opts = TessOpts::default();
assert_eq!(opts.language, "eng");
assert!(opts.tessdata_path.is_none());
assert!(opts.page_seg_mode.is_none());
}
#[test]
@ -556,6 +594,7 @@ mod tests {
let opts = TessOpts::with_language("fra");
assert_eq!(opts.language, "fra");
assert!(opts.tessdata_path.is_none());
assert!(opts.page_seg_mode.is_none());
}
#[test]
@ -564,6 +603,15 @@ mod tests {
let opts = TessOpts::with_tessdata_path(path.clone());
assert_eq!(opts.language, "eng");
assert_eq!(opts.tessdata_path, Some(path));
assert!(opts.page_seg_mode.is_none());
}
#[test]
fn test_tess_opts_with_page_seg_mode() {
    // Only the segmentation mode should differ from the default options.
    let sparse_opts = TessOpts::with_page_seg_mode(PageSegMode::PsmSparseText);
    assert_eq!(sparse_opts.page_seg_mode, Some(PageSegMode::PsmSparseText));
    assert!(sparse_opts.tessdata_path.is_none());
    assert_eq!(sparse_opts.language, "eng");
}
#[test]
@ -578,6 +626,9 @@ mod tests {
let path = PathBuf::from("/custom/path");
let opts4 = TessOpts::with_tessdata_path(path);
assert_ne!(opts1, opts4);
let opts5 = TessOpts::with_page_seg_mode(PageSegMode::PsmSparseText);
assert_ne!(opts1, opts5);
}
#[test]
@ -586,6 +637,7 @@ mod tests {
let opts = TessOpts {
language: "eng".to_string(),
tessdata_path: Some(path.clone()),
page_seg_mode: None,
};
let resolved = opts.resolve_tessdata_path();
@ -613,6 +665,7 @@ mod tests {
let opts = TessOpts {
language: "eng".to_string(),
tessdata_path: Some(path.clone()),
page_seg_mode: None,
};
let resolved = opts.resolve_tessdata_path();
@ -2347,6 +2400,19 @@ const ASSISTED_OCR_CONFIDENCE_CAP: f32 = 0.4;
/// For small N (< 100), linear scan is faster due to lower overhead.
const ASSISTED_OCR_KDTREE_THRESHOLD: usize = 100;
/// Region-level confidence threshold above which a region counts as
/// high-confidence assisted OCR.
///
/// `apply_region_level_confidence_policy` compares a region's mean word
/// confidence against this value with a strict `>`. Regions above it are
/// re-emitted through `Span::ocr_assisted`; regions at or below it (but not
/// below `ASSISTED_OCR_FALLBACK_THRESH`) are also kept, using the validated
/// span unchanged. Either way the intended confidence_source is "ocr-assisted".
const ASSISTED_OCR_KEEP_THRESH: f32 = 0.7;
/// Region-level confidence threshold below which a region is handed back
/// for pure-OCR fallback.
///
/// `apply_region_level_confidence_policy` compares a region's mean word
/// confidence against this value with a strict `<`. Words of such regions
/// are not emitted as spans; they are returned to the caller, which is
/// expected to reprocess them with pure OCR (no validation filter) and emit
/// the result with confidence_source = "ocr-fallback".
const ASSISTED_OCR_FALLBACK_THRESH: f32 = 0.3;
/// Validate OCR words against vector glyph position hints.
///
/// This function implements the per-word validation filter for the
@ -2448,6 +2514,172 @@ pub fn validate_ocr_with_position_hints(
.collect()
}
/// A region (text line) of OCR words grouped by baseline proximity.
///
/// Built by `group_words_by_region` and consumed by
/// `apply_region_level_confidence_policy`.
#[derive(Debug, Clone)]
struct OcrRegion {
/// Words in this region, each paired with the bbox returned by
/// `HocrWord::to_pdf_bbox` for that word.
words: Vec<(HocrWord, [f64; 4])>, // (HocrWord, PDF bbox)
/// Arithmetic mean of `HocrWord::confidence()` over `words`.
mean_confidence: f32,
}
/// Decide, region by region, whether assisted-OCR output is kept or sent
/// back for a pure-OCR rerun (Phase 5.5.3, pipeline step 5).
///
/// The words are first validated individually with
/// `validate_ocr_with_position_hints`, then grouped into regions (lines) by
/// `group_words_by_region`. Each region is classified by its mean confidence.
///
/// # Arguments
///
/// * `hocr_words` - OCR words from Tesseract (in pixel coordinates)
/// * `vector_glyphs` - Position hints from Phase 3
/// * `dpi` - DPI used for rendering
/// * `page_height_pt` - Page height in PDF points
///
/// # Returns
///
/// A tuple `(spans, fallback_words)`:
/// - `spans`: validated spans belonging to regions that were kept
/// - `fallback_words`: the original words (with their PDF bboxes) of every
///   region whose mean confidence is below `ASSISTED_OCR_FALLBACK_THRESH`;
///   the caller is expected to rerun Tesseract for these
///
/// # Region grouping
///
/// Words share a region when their baselines are within 12pt of each other;
/// see `group_words_by_region`. Note that the region mean is computed there
/// from `HocrWord::confidence()`, i.e. from the raw OCR word confidences,
/// not from the confidences of the validated spans.
///
/// # Policy
///
/// For each region:
/// - mean < `ASSISTED_OCR_FALLBACK_THRESH` (0.3): words go to `fallback_words`
/// - mean > `ASSISTED_OCR_KEEP_THRESH` (0.7): spans are rebuilt with
///   `Span::ocr_assisted`
/// - otherwise: the validated span is emitted unchanged
///
/// A word of a kept region is matched to its validated span by exact bbox
/// and text equality; a word without such a match produces no span.
///
/// # See also
///
/// - Phase 5.5 pipeline step 5 (plan line 1937)
/// - `validate_ocr_with_position_hints` for per-word validation
pub fn apply_region_level_confidence_policy(
    hocr_words: &[HocrWord],
    vector_glyphs: &[Glyph],
    dpi: u32,
    page_height_pt: f64,
) -> (Vec<crate::hybrid::Span>, Vec<(HocrWord, [f64; 4])>) {
    // Per-word validation runs once, up front; regions only select from it.
    let validated_spans =
        validate_ocr_with_position_hints(hocr_words, vector_glyphs, dpi, page_height_pt);

    let mut kept_spans = Vec::new();
    let mut needs_fallback = Vec::new();

    for region in group_words_by_region(hocr_words, dpi, page_height_pt) {
        // Low-confidence region: hand the untouched words back to the caller.
        if region.mean_confidence < ASSISTED_OCR_FALLBACK_THRESH {
            needs_fallback.extend(region.words);
            continue;
        }

        let is_high_confidence = region.mean_confidence > ASSISTED_OCR_KEEP_THRESH;

        for (word, pdf_bbox) in region.words {
            // Pair the word with its validated span via exact bbox + text match.
            let matched = validated_spans
                .iter()
                .find(|candidate| candidate.bbox == pdf_bbox && candidate.text == word.text);

            if let Some(validated) = matched {
                let emitted = if is_high_confidence {
                    // High-confidence region: re-tag explicitly as OcrAssisted.
                    crate::hybrid::Span::ocr_assisted(
                        validated.bbox,
                        validated.confidence,
                        validated.text.clone(),
                    )
                } else {
                    // Medium-confidence region: pass the validated span through.
                    validated.clone()
                };
                kept_spans.push(emitted);
            }
        }
    }

    (kept_spans, needs_fallback)
}
/// Group OCR words into regions (lines) by baseline proximity.
///
/// Every word is converted with `HocrWord::to_pdf_bbox`, and its baseline is
/// taken as `bbox[1] + (bbox[3] - bbox[1]) * 0.2`. Words are then sorted by
/// baseline and swept once: a word joins the current region when its baseline
/// is strictly within 12pt of the baseline of that region's *first* word,
/// otherwise it opens a new region.
///
/// # Arguments
///
/// * `hocr_words` - OCR words from Tesseract
/// * `dpi` - DPI used for rendering
/// * `page_height_pt` - Page height in PDF points
///
/// # Returns
///
/// A vector of regions ordered by ascending baseline, each holding its words
/// (with PDF bboxes) and the arithmetic mean of their `confidence()` values.
/// An empty input yields an empty vector.
fn group_words_by_region(hocr_words: &[HocrWord], dpi: u32, page_height_pt: f64) -> Vec<OcrRegion> {
    /// Maximum distance between a word's baseline and the region anchor.
    const BASELINE_TOLERANCE_PT: f64 = 12.0;
    /// Fraction of the bbox height added to `bbox[1]` to estimate the baseline.
    const BASELINE_HEIGHT_FRACTION: f64 = 0.2;

    // Convert all words to PDF coordinates and compute their baselines.
    let mut word_info: Vec<(HocrWord, [f64; 4], f64)> = hocr_words
        .iter()
        .map(|word| {
            let pdf_bbox = word.to_pdf_bbox(dpi, page_height_pt, None, None);
            let baseline = pdf_bbox[1] + (pdf_bbox[3] - pdf_bbox[1]) * BASELINE_HEIGHT_FRACTION;
            (word.clone(), pdf_bbox, baseline)
        })
        .collect();

    // `total_cmp` is a genuine total order, so the (stable) sort stays
    // well-defined and deterministic even if a baseline turns out non-finite.
    word_info.sort_by(|a, b| a.2.total_cmp(&b.2));

    // Each group remembers the baseline of its first word (the anchor).
    // Because the words arrive in ascending baseline order and anchors are at
    // least the tolerance apart, only the most recent group can ever match.
    let mut groups: Vec<(f64, Vec<(HocrWord, [f64; 4])>)> = Vec::new();
    for (word, pdf_bbox, baseline) in word_info {
        match groups.last_mut() {
            Some((anchor, members)) if (*anchor - baseline).abs() < BASELINE_TOLERANCE_PT => {
                members.push((word, pdf_bbox));
            }
            _ => groups.push((baseline, vec![(word, pdf_bbox)])),
        }
    }

    // Compute each region's mean exactly once, after grouping is finished.
    groups
        .into_iter()
        .map(|(_, words)| {
            let sum: f32 = words.iter().map(|(w, _)| w.confidence()).sum();
            let mean_confidence = sum / words.len() as f32;
            OcrRegion {
                words,
                mean_confidence,
            }
        })
        .collect()
}
#[cfg(test)]
mod assisted_ocr_tests {
use super::*;
@ -2586,6 +2818,135 @@ mod assisted_ocr_tests {
assert_eq!(ASSISTED_OCR_DISTANCE_PT, 5.0);
assert_eq!(ASSISTED_OCR_CONFIDENCE_CAP, 0.4);
assert_eq!(ASSISTED_OCR_KDTREE_THRESHOLD, 100);
assert_eq!(ASSISTED_OCR_KEEP_THRESH, 0.7);
assert_eq!(ASSISTED_OCR_FALLBACK_THRESH, 0.3);
}
#[test]
fn test_region_level_policy_high_confidence_region() {
    // Two words on the same pixel row form one region whose mean raw
    // confidence (0.95 and 0.90) exceeds the 0.7 keep threshold.
    let words = vec![
        HocrWord {
            text: "hello".to_string(),
            bbox_px: [102, 202, 108, 208],
            confidence_0_100: 95,
        },
        HocrWord {
            text: "world".to_string(),
            bbox_px: [122, 202, 128, 208],
            confidence_0_100: 90,
        },
    ];
    let glyphs = vec![
        Glyph::position_hint([100.0, 200.0, 110.0, 210.0]),
        Glyph::position_hint([120.0, 200.0, 130.0, 210.0]),
    ];

    let (spans, fallback) = apply_region_level_confidence_policy(&words, &glyphs, 300, 792.0);

    // Nothing is routed to fallback and both words come back as spans.
    assert_eq!(fallback.len(), 0);
    assert_eq!(spans.len(), 2);

    // Every kept span must carry the assisted-OCR source tag.
    let all_assisted = spans
        .iter()
        .all(|s| s.source == crate::hybrid::SpanSource::OcrAssisted);
    assert!(all_assisted);
}
#[test]
fn test_region_level_policy_low_confidence_region() {
    // One region with raw confidences 0.20 and 0.25: the mean is below the
    // 0.3 fallback threshold, so the whole region must be handed back.
    let words = vec![
        HocrWord {
            text: "low1".to_string(),
            bbox_px: [100, 100, 120, 120],
            confidence_0_100: 20,
        },
        HocrWord {
            text: "low2".to_string(),
            bbox_px: [130, 100, 150, 120],
            confidence_0_100: 25,
        },
    ];
    // No position hints are supplied for this page.
    let glyphs: Vec<Glyph> = Vec::new();

    let (spans, fallback) = apply_region_level_confidence_policy(&words, &glyphs, 300, 792.0);

    assert_eq!(fallback.len(), 2); // both words queued for the pure-OCR rerun
    assert_eq!(spans.len(), 0); // and none of them emitted as a span
}
#[test]
fn test_region_level_policy_medium_confidence_region() {
    // One region with raw confidences 0.40 and 0.50: the mean sits between
    // the fallback (0.3) and keep (0.7) thresholds, so the validated spans
    // are passed through unchanged.
    let words = vec![
        HocrWord {
            text: "med1".to_string(),
            bbox_px: [100, 100, 120, 120],
            confidence_0_100: 40,
        },
        HocrWord {
            text: "med2".to_string(),
            bbox_px: [130, 100, 150, 120],
            confidence_0_100: 50,
        },
    ];
    let glyphs: Vec<Glyph> = Vec::new();

    let (spans, fallback) = apply_region_level_confidence_policy(&words, &glyphs, 300, 792.0);

    assert_eq!(fallback.len(), 0); // nothing needs a rerun
    assert_eq!(spans.len(), 2); // both words survive as spans
}
#[test]
fn test_region_level_policy_multiple_regions() {
    // Two words far apart vertically end up in separate regions, and each
    // region is classified on its own.
    let high_word = HocrWord {
        text: "hello".to_string(),
        bbox_px: [102, 202, 108, 208],
        confidence_0_100: 95,
    };
    let low_word = HocrWord {
        text: "low".to_string(),
        bbox_px: [500, 500, 520, 520],
        confidence_0_100: 20,
    };
    let words = vec![high_word, low_word];

    // A single position hint, placed for the high-confidence word.
    let glyphs = vec![Glyph::position_hint([100.0, 200.0, 110.0, 210.0])];

    let (spans, fallback) = apply_region_level_confidence_policy(&words, &glyphs, 300, 792.0);

    // The 0.95 region is kept; the 0.20 region is sent to fallback.
    assert_eq!(spans.len(), 1);
    assert_eq!(fallback.len(), 1);
    assert_eq!(spans[0].text, "hello");
}
#[test]
fn test_group_words_by_region_empty() {
    // No input words means no regions.
    let no_words: &[HocrWord] = &[];
    let regions = group_words_by_region(no_words, 300, 792.0);
    assert_eq!(regions.len(), 0);
}
#[test]
fn test_group_words_by_region_single_word() {
    // A lone word forms its own region, and the region mean is simply that
    // word's confidence.
    let only_word = HocrWord {
        text: "test".to_string(),
        bbox_px: [100, 100, 120, 120],
        confidence_0_100: 80,
    };
    let words = vec![only_word];

    let regions = group_words_by_region(&words, 300, 792.0);

    assert_eq!(regions.len(), 1);
    let region = &regions[0];
    assert_eq!(region.words.len(), 1);
    assert_eq!(region.mean_confidence, 0.8);
}
}

View file

@ -28,6 +28,13 @@ use crate::signature::Signature;
///
/// A span is the smallest unit of extracted text, representing a
/// contiguous run of text with consistent font and styling.
///
/// # TODO: Phase 6.1 - Add confidence_source field
///
/// When the `confidence_source` field is added to the schema (per plan line 363, 1662),
/// it should include "ocr-fallback" as a valid value for spans emitted via
/// Phase 5.5.3 region-level fallback. The internal `SpanSource::OcrFallback` variant
/// in `hybrid.rs` maps to this value.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
pub struct SpanJson {