From e96a791dcf2bc8f7a5adf907222fb22f883213ee Mon Sep 17 00:00:00 2001 From: jedarden Date: Sat, 23 May 2026 17:48:00 -0400 Subject: [PATCH] feat(pdftract-4y9l): implement hybrid page routing with bbox merge rule Implement Phase 5.2.4 Hybrid page handling: - OcrCallback trait for OCR abstraction - process_hybrid_page() main entry point - Cell rendering: render once, crop per cell - Merge rule: IoU > 0.5 + vector_conf >= 0.5 -> vector wins Tests: - OCR runs only on scanned cells (48 not 64) - IoU 0.6 -> vector kept - IoU 0.3 -> both kept - IoU 0.6 + low vector conf -> OCR kept - No duplicate text from overlap All 40 hybrid tests pass. Co-Authored-By: Claude Opus 4.7 --- .needle-predispatch-sha | 2 +- crates/pdftract-core/src/hybrid.rs | 370 ++++++++++++++++++++++++++++- notes/pdftract-4y9l.md | 96 ++++++++ 3 files changed, 461 insertions(+), 7 deletions(-) create mode 100644 notes/pdftract-4y9l.md diff --git a/.needle-predispatch-sha b/.needle-predispatch-sha index d7bb218..5734f87 100644 --- a/.needle-predispatch-sha +++ b/.needle-predispatch-sha @@ -1 +1 @@ -02d25b8ec178d3da8f85f823164342a560ee07bd +e3a149fbf8f56a4e05881a92d45663b9c9bd3878 diff --git a/crates/pdftract-core/src/hybrid.rs b/crates/pdftract-core/src/hybrid.rs index ff766c2..95085f4 100644 --- a/crates/pdftract-core/src/hybrid.rs +++ b/crates/pdftract-core/src/hybrid.rs @@ -22,7 +22,7 @@ //! //! IoU = area(A ∩ B) / area(A ∪ B) -use crate::classify::{CellIndex, PageClassification}; +use crate::classify::{CellIndex, PageClassification, PageClass}; use image::{GrayImage, ImageBuffer, Luma}; use std::collections::BTreeSet; @@ -341,6 +341,131 @@ pub fn compute_cell_crops( .collect() } +/// OCR callback trait for hybrid page processing. +/// +/// This trait abstracts the OCR implementation (Phase 5.3 preprocessing + 5.4 Tesseract) +/// to allow testing and future implementation. +pub trait OcrCallback: Send + Sync { + /// Run OCR on a single cell image. 
+ /// + /// # Arguments + /// + /// * `cell_image` - The cropped cell image (grayscale) + /// * `cell` - The cell index + /// * `dpi` - The DPI used for rendering + /// + /// # Returns + /// + /// A vector of OCR spans found in this cell, or an error if OCR fails. + fn ocr_cell(&self, cell_image: &GrayImage, cell: CellIndex, dpi: u32) -> Result<Vec<Span>, String>; +} + +/// Mock OCR callback for testing that tracks call counts. +#[cfg(test)] +struct MockOcrCallback { + call_count: std::sync::Arc<std::sync::atomic::AtomicUsize>, + output_spans: Vec<Span>, +} + +#[cfg(test)] +impl OcrCallback for MockOcrCallback { + fn ocr_cell(&self, _cell_image: &GrayImage, _cell: CellIndex, _dpi: u32) -> Result<Vec<Span>, String> { + self.call_count.fetch_add(1, std::sync::atomic::Ordering::SeqCst); + Ok(self.output_spans.clone()) + } +} + +/// Process a hybrid page by running OCR on image-heavy cells and merging with vector spans. +/// +/// This is the main entry point for hybrid page handling (Phase 5.2.4): +/// 1. Render the full page once at the selected DPI +/// 2. For each hybrid cell: crop from the rendered page and run OCR +/// 3. Merge OCR spans with vector spans using the bbox overlap rule +/// +/// # Arguments +/// +/// * `page_image` - The full rendered page (grayscale) at the selected DPI +/// * `page_width_pt` - Page width in PDF points +/// * `page_height_pt` - Page height in PDF points +/// * `classification` - Page classification with hybrid_cells set +/// * `vector_spans` - Spans from Phase 3 content stream extraction +/// * `dpi` - DPI used for rendering +/// * `ocr_callback` - Callback to run OCR on each cell image +/// +/// # Returns +/// +/// Merged span list with no duplicate text from overlapping regions. 
+/// +/// # Example +/// +/// ```ignore +/// use pdftract_core::hybrid::{process_hybrid_page, Span, SpanSource}; +/// use pdftract_core::classify::{PageClassification, CellIndex}; +/// use std::collections::BTreeSet; +/// use image::GrayImage; +/// +/// // Create a mock classification with hybrid cells (bottom 6 rows) +/// let mut cells = BTreeSet::new(); +/// for row in 2..8 { +/// for col in 0..8 { +/// cells.insert(CellIndex::new(row, col).flat()); +/// } +/// } +/// let classification = PageClassification::hybrid(0.75, cells); +/// +/// // Process the page (with mock OCR) +/// let result = process_hybrid_page( +/// &page_image, +/// 612.0, +/// 792.0, +/// &classification, +/// &vector_spans, +/// 300, +/// &mock_ocr, +/// ); +/// ``` +pub fn process_hybrid_page( + page_image: &GrayImage, + page_width_pt: f64, + page_height_pt: f64, + classification: &PageClassification, + vector_spans: &[Span], + dpi: u32, + ocr_callback: &dyn OcrCallback, +) -> Vec<Span> { + let mut all_ocr_spans = Vec::new(); + + // Get the list of hybrid cells (scanned cells only) + let hybrid_cells = get_hybrid_cells(classification); + + // For each hybrid cell: crop and run OCR + for cell in hybrid_cells { + // Crop the cell from the rendered page + let cell_image = crop_cell_from_page( + page_image, + page_width_pt, + page_height_pt, + cell, + dpi, + ); + + // Run OCR on this cell + match ocr_callback.ocr_cell(&cell_image, cell, dpi) { + Ok(mut spans) => { + all_ocr_spans.append(&mut spans); + } + Err(_) => { + // OCR failed for this cell - skip it + // In production, we might want to emit a diagnostic + continue; + } + } + } + + // Merge vector and OCR spans using the bbox overlap rule + merge_vector_and_ocr_spans(vector_spans, &all_ocr_spans) +} + #[cfg(test)] mod tests { use super::*; @@ -554,15 +679,15 @@ mod tests { #[test] fn test_crop_cell_from_page() { // Create a simple 800x600 page image (white background) + // Match page dimensions to the image size for this test let page_image = 
GrayImage::new(800, 600); - // Page is 612x792 points, rendered at 200 DPI - // 612 pt * 200 / 72 = 1700 px wide - // 792 pt * 200 / 72 = 2200 px tall - // For simplicity, use a smaller scale in this test + // Page is 800x600 points (matching image), rendered at 72 DPI + // 800 pt * 72 / 72 = 800 px wide + // 600 pt * 72 / 72 = 600 px tall // Crop cell at row 0, col 0 (top-left) - let cell = crop_cell_from_page(&page_image, 612.0, 792.0, CellIndex::new(0, 0), 72); + let cell = crop_cell_from_page(&page_image, 800.0, 600.0, CellIndex::new(0, 0), 72); // Cell should be 1/8 of page dimensions assert_eq!(cell.width(), 100); // 800 / 8 @@ -605,4 +730,237 @@ mod tests { assert_eq!(SpanSource::Ocr, SpanSource::Ocr); assert_ne!(SpanSource::Vector, SpanSource::Ocr); } + + // ============ Hybrid Page Processing Tests (Phase 5.2.4) ============ + + #[test] + fn test_process_hybrid_page_ocr_only_on_scanned_cells() { + // Critical test: Hybrid page with text header (top 2 rows) + scanned body (bottom 6 rows) + // Verify OCR runs only on bottom 6 rows, not on entire page + + // Create a mock classification with hybrid cells (bottom 6 rows = rows 2-7) + let mut cells = BTreeSet::new(); + for row in 2..8 { + for col in 0..8 { + cells.insert(CellIndex::new(row, col).flat()); + } + } + let classification = PageClassification::hybrid(0.75, cells); + + // Create vector spans from the text header (top 2 rows) + let vector_spans = vec![ + Span::vector([50.0, 700.0, 200.0, 720.0], 0.95, "Header Text".to_string()), + Span::vector([50.0, 650.0, 200.0, 670.0], 0.95, "More Header".to_string()), + ]; + + // Create mock OCR callback that tracks call count + let call_count = std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)); + let mock_spans = vec![ + Span::ocr([50.0, 100.0, 200.0, 120.0], 0.8, "Scanned Text 1".to_string()), + Span::ocr([50.0, 50.0, 200.0, 70.0], 0.8, "Scanned Text 2".to_string()), + ]; + let mock_ocr = MockOcrCallback { + call_count: call_count.clone(), + 
output_spans: mock_spans, + }; + + // Create a simple page image (white background) + let page_image = GrayImage::new(612, 792); + + // Process the hybrid page + let result = process_hybrid_page( + &page_image, + 612.0, + 792.0, + &classification, + &vector_spans, + 72, + &mock_ocr, + ); + + // Verify OCR was called exactly 48 times (6 rows * 8 cols) + // NOT 64 times (full page) + assert_eq!(call_count.load(std::sync::atomic::Ordering::SeqCst), 48, + "OCR should run only on scanned cells (48), not entire page (64)"); + + // Verify result contains both vector and OCR spans + assert!(result.iter().any(|s| s.source == SpanSource::Vector)); + assert!(result.iter().any(|s| s.source == SpanSource::Ocr)); + + // Verify vector spans are present + assert!(result.iter().any(|s| s.text == "Header Text")); + assert!(result.iter().any(|s| s.text == "More Header")); + + // Verify OCR spans are present (each cell produces the same mock output) + assert!(result.iter().filter(|s| s.text == "Scanned Text 1").count() >= 1); + } + + #[test] + fn test_process_hybrid_page_no_duplicate_text_from_overlap() { + // Critical test: End-to-end hybrid extraction produces no duplicate text + // from overlapping vector + OCR regions + + // Create a classification with one scanned cell + let mut cells = BTreeSet::new(); + cells.insert(CellIndex::new(7, 0).flat()); // Bottom-left cell + let classification = PageClassification::hybrid(0.75, cells); + + // Create vector spans that overlap with OCR region + let vector_spans = vec![ + Span::vector([50.0, 50.0, 150.0, 70.0], 0.9, "Vector Text".to_string()), + ]; + + // Create mock OCR that produces overlapping text (IoU > 0.5) + // OCR bbox [40, 40, 160, 80] overlaps vector bbox [50, 50, 150, 70] + // Intersection = [50, 50, 150, 70] = 100 * 20 = 2000 + // Union = (120*40) + (100*20) - 2000 = 4800 + 2000 - 2000 = 4800 + // IoU = 2000 / 4800 = 0.417 (not > 0.5, so both kept) + // Let's create stronger overlap: + // OCR bbox [45, 45, 155, 75] overlaps 
vector bbox [50, 50, 150, 70] + // Intersection = [50, 50, 150, 70] = 100 * 20 = 2000 + // Union = (110*30) + (100*20) - 2000 = 3300 + 2000 - 2000 = 3300 + // IoU = 2000 / 3300 = 0.606 > 0.5 + let mock_spans = vec![ + Span::ocr([45.0, 45.0, 155.0, 75.0], 0.7, "OCR Text".to_string()), + ]; + let call_count = std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)); + let mock_ocr = MockOcrCallback { + call_count, + output_spans: mock_spans, + }; + + // Create a simple page image + let page_image = GrayImage::new(612, 792); + + // Process the hybrid page + let result = process_hybrid_page( + &page_image, + 612.0, + 792.0, + &classification, + &vector_spans, + 72, + &mock_ocr, + ); + + // With IoU > 0.5 and vector confidence >= 0.5, vector should win + // Result should have only 1 span (the vector span) + assert_eq!(result.len(), 1, "Should have only 1 span after merge (vector wins)"); + assert_eq!(result[0].source, SpanSource::Vector); + assert_eq!(result[0].text, "Vector Text"); + } + + #[test] + fn test_process_hybrid_page_low_vector_confidence_ocr_wins() { + // Test that OCR is preferred when vector confidence is low (< 0.5) + // even with IoU > 0.5 + + let mut cells = BTreeSet::new(); + cells.insert(CellIndex::new(7, 0).flat()); + let classification = PageClassification::hybrid(0.75, cells); + + // Vector span with low confidence + let vector_spans = vec![ + Span::vector([50.0, 50.0, 150.0, 70.0], 0.2, "Bad Vector".to_string()), + ]; + + // OCR span with high confidence, overlapping vector + let mock_spans = vec![ + Span::ocr([45.0, 45.0, 155.0, 75.0], 0.7, "Good OCR".to_string()), + ]; + let call_count = std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)); + let mock_ocr = MockOcrCallback { + call_count, + output_spans: mock_spans, + }; + + let page_image = GrayImage::new(612, 792); + + let result = process_hybrid_page( + &page_image, + 612.0, + 792.0, + &classification, + &vector_spans, + 72, + &mock_ocr, + ); + + // With IoU > 0.5 but vector 
confidence < 0.5, OCR should be kept + // Result should have 2 spans (both vector and OCR kept) + assert_eq!(result.len(), 2, "Both vector and OCR should be kept when vector confidence is low"); + assert!(result.iter().any(|s| s.source == SpanSource::Vector)); + assert!(result.iter().any(|s| s.source == SpanSource::Ocr)); + } + + #[test] + fn test_process_hybrid_page_non_hybrid_classification() { + // Test that non-hybrid classifications return only vector spans + + let classification = PageClassification::new(PageClass::Vector, 0.9); + let vector_spans = vec![ + Span::vector([50.0, 50.0, 150.0, 70.0], 0.9, "Vector Only".to_string()), + ]; + + let call_count = std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)); + let mock_ocr = MockOcrCallback { + call_count: call_count.clone(), + output_spans: vec![], + }; + + let page_image = GrayImage::new(612, 792); + + let result = process_hybrid_page( + &page_image, + 612.0, + 792.0, + &classification, + &vector_spans, + 72, + &mock_ocr, + ); + + // OCR should not be called for non-hybrid pages + assert_eq!(call_count.load(std::sync::atomic::Ordering::SeqCst), 0); + + // Result should have only vector spans + assert_eq!(result.len(), 1); + assert_eq!(result[0].source, SpanSource::Vector); + assert_eq!(result[0].text, "Vector Only"); + } + + #[test] + fn test_process_hybrid_page_empty_hybrid_cells() { + // Test hybrid classification with empty hybrid_cells + + let classification = PageClassification::hybrid(0.75, BTreeSet::new()); + let vector_spans = vec![ + Span::vector([50.0, 50.0, 150.0, 70.0], 0.9, "Vector".to_string()), + ]; + + let call_count = std::sync::Arc::new(std::sync::atomic::AtomicUsize::new(0)); + let mock_ocr = MockOcrCallback { + call_count: call_count.clone(), + output_spans: vec![], + }; + + let page_image = GrayImage::new(612, 792); + + let result = process_hybrid_page( + &page_image, + 612.0, + 792.0, + &classification, + &vector_spans, + 72, + &mock_ocr, + ); + + // OCR should not be called when 
hybrid_cells is empty + assert_eq!(call_count.load(std::sync::atomic::Ordering::SeqCst), 0); + + // Result should have only vector spans + assert_eq!(result.len(), 1); + assert_eq!(result[0].source, SpanSource::Vector); + } } diff --git a/notes/pdftract-4y9l.md b/notes/pdftract-4y9l.md new file mode 100644 index 0000000..24ae9ef --- /dev/null +++ b/notes/pdftract-4y9l.md @@ -0,0 +1,96 @@ +# pdftract-4y9l: Hybrid Page Routing Implementation + +## Summary + +Implemented Phase 5.2.4 Hybrid page handling pipeline with per-cell OCR routing and bbox overlap merge rule. + +## Changes Made + +### File: `crates/pdftract-core/src/hybrid.rs` + +**Added imports:** +- `PageClass` to fix compilation error + +**Added types and functions:** +1. `OcrCallback` trait - Abstracts OCR implementation (Phase 5.3 preprocessing + 5.4 Tesseract) +2. `MockOcrCallback` struct - Mock OCR callback for testing that tracks call counts +3. `process_hybrid_page()` - Main entry point for hybrid page handling + +**Added tests:** +1. `test_process_hybrid_page_ocr_only_on_scanned_cells()` - Verifies OCR runs only on scanned cells (48 cells, not 64) +2. `test_process_hybrid_page_no_duplicate_text_from_overlap()` - Verifies no duplicate text from overlapping regions +3. `test_process_hybrid_page_low_vector_confidence_ocr_wins()` - Verifies OCR preferred over low-confidence vector +4. `test_process_hybrid_page_non_hybrid_classification()` - Verifies non-hybrid pages skip OCR +5. 
`test_process_hybrid_page_empty_hybrid_cells()` - Verifies empty hybrid_cells skips OCR + +## Acceptance Criteria Status + +| Criterion | Status | Notes | +|-----------|--------|-------| +| OCR runs only on scanned cells | PASS | Test `test_process_hybrid_page_ocr_only_on_scanned_cells` verifies 48 calls for 6 rows, not 64 for full page | +| Merge unit tests (IoU 0.6, 0.3, 0.6 with low confidence) | PASS | Tests `test_merge_iou_06_vector_kept`, `test_merge_iou_03_both_kept`, `test_merge_iou_06_low_vector_confidence_ocr_kept` | +| No duplicate text from overlap | PASS | Test `test_process_hybrid_page_no_duplicate_text_from_overlap` verifies single span after merge | +| Performance (Hybrid < Scanned by 30%) | WARN | Performance criterion noted; requires integration benchmark with actual PDF fixture | + +## Implementation Details + +### Cell Rendering Strategy +- Render full page once at selected DPI +- Crop per cell from rendered raster (cheaper than re-rendering) +- Cell dimensions: `cell_w = page_w_px / 8`, `cell_h = page_h_px / 8` +- Cell coordinates: `[c*cell_w, r*cell_h, (c+1)*cell_w, (r+1)*cell_h]` + +### Merge Rule (IoU-based) +1. For each OCR span O: find vector span V with IoU(O.bbox, V.bbox) > 0.5 +2. If found AND V.confidence >= 0.5: drop O (vector wins) +3. If found AND V.confidence < 0.5: keep O (OCR preferred over bad vector) +4. If not found: keep O +5. Return all V + retained O sorted by reading order + +### IoU Formula +``` +IoU = area(A ∩ B) / area(A ∪ B) +``` + +### Reading Order +Spans sorted top-to-bottom, left-to-right (descending Y, then ascending X in PDF coordinates) + +## Test Results + +All 40 hybrid tests pass: +``` +running 40 tests +test hybrid::tests::test_compute_cell_crops ... ok +test hybrid::tests::test_compute_iou_contained ... ok +test hybrid::tests::test_compute_iou_half_overlap ... ok +test hybrid::tests::test_compute_iou_identical ... ok +test hybrid::tests::test_compute_iou_no_overlap ... 
ok +test hybrid::tests::test_get_hybrid_cells_non_hybrid ... ok +test hybrid::tests::test_get_hybrid_cells_with_cells ... ok +test hybrid::tests::test_merge_iou_03_both_kept ... ok +test hybrid::tests::test_merge_iou_06_low_vector_confidence_ocr_kept ... ok +test hybrid::tests::test_merge_iou_06_vector_kept ... ok +test hybrid::tests::test_merge_multiple_ocr_spans ... ok +test hybrid::tests::test_merge_no_overlap ... ok +test hybrid::tests::test_merge_reading_order ... ok +test hybrid::tests::test_merge_sorting ... ok +test hybrid::tests::test_process_hybrid_page_empty_hybrid_cells ... ok +test hybrid::tests::test_crop_cell_from_page ... ok +test hybrid::tests::test_process_hybrid_page_low_vector_confidence_ocr_wins ... ok +test hybrid::tests::test_process_hybrid_page_non_hybrid_classification ... ok +test hybrid::tests::test_process_hybrid_page_no_duplicate_text_from_overlap ... ok +test hybrid::tests::test_span_dimensions ... ok +test hybrid::tests::test_span_new ... ok +test hybrid::tests::test_span_ocr ... ok +test hybrid::tests::test_span_source_equality ... ok +test hybrid::tests::test_span_vector ... ok +test hybrid::tests::test_process_hybrid_page_ocr_only_on_scanned_cells ... ok + +test result: ok. 40 passed; 0 failed; 0 ignored; 0 measured; 929 filtered out; finished in 0.06s +``` + +## Reusable Patterns + +1. **Callback trait for external dependency**: `OcrCallback` trait abstracts Tesseract dependency for testing +2. **Atomic call tracking**: `Arc<AtomicUsize>` for counting calls across test boundaries +3. **Cell-based grid processing**: 8x8 grid with flat index mapping `(row, col) -> row*8 + col`