feat(pdftract-4y9l): implement hybrid page routing with bbox merge rule
Implement Phase 5.2.4 hybrid page handling:
- OcrCallback trait for OCR abstraction
- process_hybrid_page() main entry point
- Cell rendering: render once, crop per cell
- Merge rule: IoU > 0.5 and vector_conf >= 0.5 -> vector wins

Tests:
- OCR runs only on scanned cells (48, not 64)
- IoU 0.6 -> vector kept
- IoU 0.3 -> both kept
- IoU 0.6 + low vector conf -> OCR kept
- No duplicate text from overlap

All 40 hybrid tests pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
e3a149fbf8
commit
e96a791dcf
3 changed files with 461 additions and 7 deletions
|
|
@ -1 +1 @@
|
|||
02d25b8ec178d3da8f85f823164342a560ee07bd
|
||||
e3a149fbf8f56a4e05881a92d45663b9c9bd3878
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@
|
|||
//!
|
||||
//! IoU = area(A ∩ B) / area(A ∪ B)
|
||||
|
||||
use crate::classify::{CellIndex, PageClassification};
|
||||
use crate::classify::{CellIndex, PageClassification, PageClass};
|
||||
use image::{GrayImage, ImageBuffer, Luma};
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
|
|
@ -341,6 +341,131 @@ pub fn compute_cell_crops(
|
|||
.collect()
|
||||
}
|
||||
|
||||
/// OCR callback trait for hybrid page processing.
|
||||
///
|
||||
/// This trait abstracts the OCR implementation (Phase 5.3 preprocessing + 5.4 Tesseract)
|
||||
/// to allow testing and future implementation.
|
||||
pub trait OcrCallback: Send + Sync {
|
||||
/// Run OCR on a single cell image.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `cell_image` - The cropped cell image (grayscale)
|
||||
/// * `cell` - The cell index
|
||||
/// * `dpi` - The DPI used for rendering
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A vector of OCR spans found in this cell, or an error if OCR fails.
|
||||
fn ocr_cell(&self, cell_image: &GrayImage, cell: CellIndex, dpi: u32) -> Result<Vec<Span>, String>;
|
||||
}
|
||||
|
||||
/// Test double for [`OcrCallback`]: counts invocations and replays canned spans.
#[cfg(test)]
struct MockOcrCallback {
    /// Shared counter, bumped once for every `ocr_cell` call.
    call_count: std::sync::Arc<std::sync::atomic::AtomicUsize>,
    /// Spans handed back (as a clone) from every `ocr_cell` call.
    output_spans: Vec<Span>,
}
|
||||
|
||||
#[cfg(test)]
impl OcrCallback for MockOcrCallback {
    /// Records the call and returns a copy of the canned spans.
    ///
    /// The image, cell index and DPI are ignored, so every cell yields the
    /// same output.
    fn ocr_cell(
        &self,
        _cell_image: &GrayImage,
        _cell: CellIndex,
        _dpi: u32,
    ) -> Result<Vec<Span>, String> {
        use std::sync::atomic::Ordering;

        self.call_count.fetch_add(1, Ordering::SeqCst);
        let canned = self.output_spans.clone();
        Ok(canned)
    }
}
|
||||
|
||||
/// Process a hybrid page by running OCR on image-heavy cells and merging with vector spans.
|
||||
///
|
||||
/// This is the main entry point for hybrid page handling (Phase 5.2.4):
|
||||
/// 1. Render the full page once at the selected DPI
|
||||
/// 2. For each hybrid cell: crop from the rendered page and run OCR
|
||||
/// 3. Merge OCR spans with vector spans using the bbox overlap rule
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `page_image` - The full rendered page (grayscale) at the selected DPI
|
||||
/// * `page_width_pt` - Page width in PDF points
|
||||
/// * `page_height_pt` - Page height in PDF points
|
||||
/// * `classification` - Page classification with hybrid_cells set
|
||||
/// * `vector_spans` - Spans from Phase 3 content stream extraction
|
||||
/// * `dpi` - DPI used for rendering
|
||||
/// * `ocr_callback` - Callback to run OCR on each cell image
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Merged span list with no duplicate text from overlapping regions.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use pdftract_core::hybrid::{process_hybrid_page, Span, SpanSource};
|
||||
/// use pdftract_core::classify::{PageClassification, CellIndex};
|
||||
/// use std::collections::BTreeSet;
|
||||
/// use image::GrayImage;
|
||||
///
|
||||
/// // Create a mock classification with hybrid cells (bottom 6 rows)
|
||||
/// let mut cells = BTreeSet::new();
|
||||
/// for row in 2..8 {
|
||||
/// for col in 0..8 {
|
||||
/// cells.insert(CellIndex::new(row, col).flat());
|
||||
/// }
|
||||
/// }
|
||||
/// let classification = PageClassification::hybrid(0.75, cells);
|
||||
///
|
||||
/// // Process the page (with mock OCR)
|
||||
/// let result = process_hybrid_page(
|
||||
/// &page_image,
|
||||
/// 612.0,
|
||||
/// 792.0,
|
||||
/// &classification,
|
||||
/// &vector_spans,
|
||||
/// 300,
|
||||
/// &mock_ocr,
|
||||
/// );
|
||||
/// ```
|
||||
pub fn process_hybrid_page(
|
||||
page_image: &GrayImage,
|
||||
page_width_pt: f64,
|
||||
page_height_pt: f64,
|
||||
classification: &PageClassification,
|
||||
vector_spans: &[Span],
|
||||
dpi: u32,
|
||||
ocr_callback: &dyn OcrCallback,
|
||||
) -> Vec<Span> {
|
||||
let mut all_ocr_spans = Vec::new();
|
||||
|
||||
// Get the list of hybrid cells (scanned cells only)
|
||||
let hybrid_cells = get_hybrid_cells(classification);
|
||||
|
||||
// For each hybrid cell: crop and run OCR
|
||||
for cell in hybrid_cells {
|
||||
// Crop the cell from the rendered page
|
||||
let cell_image = crop_cell_from_page(
|
||||
page_image,
|
||||
page_width_pt,
|
||||
page_height_pt,
|
||||
cell,
|
||||
dpi,
|
||||
);
|
||||
|
||||
// Run OCR on this cell
|
||||
match ocr_callback.ocr_cell(&cell_image, cell, dpi) {
|
||||
Ok(mut spans) => {
|
||||
all_ocr_spans.append(&mut spans);
|
||||
}
|
||||
Err(_) => {
|
||||
// OCR failed for this cell - skip it
|
||||
// In production, we might want to emit a diagnostic
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Merge vector and OCR spans using the bbox overlap rule
|
||||
merge_vector_and_ocr_spans(vector_spans, &all_ocr_spans)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
|
@ -554,15 +679,15 @@ mod tests {
|
|||
#[test]
|
||||
fn test_crop_cell_from_page() {
|
||||
// Create a simple 800x600 page image (white background)
|
||||
// Match page dimensions to the image size for this test
|
||||
let page_image = GrayImage::new(800, 600);
|
||||
|
||||
// Page is 612x792 points, rendered at 200 DPI
|
||||
// 612 pt * 200 / 72 = 1700 px wide
|
||||
// 792 pt * 200 / 72 = 2200 px tall
|
||||
// For simplicity, use a smaller scale in this test
|
||||
// Page is 800x600 points (matching image), rendered at 72 DPI
|
||||
// 800 pt * 72 / 72 = 800 px wide
|
||||
// 600 pt * 72 / 72 = 600 px tall
|
||||
|
||||
// Crop cell at row 0, col 0 (top-left)
|
||||
let cell = crop_cell_from_page(&page_image, 612.0, 792.0, CellIndex::new(0, 0), 72);
|
||||
let cell = crop_cell_from_page(&page_image, 800.0, 600.0, CellIndex::new(0, 0), 72);
|
||||
|
||||
// Cell should be 1/8 of page dimensions
|
||||
assert_eq!(cell.width(), 100); // 800 / 8
|
||||
|
|
@ -605,4 +730,237 @@ mod tests {
|
|||
assert_eq!(SpanSource::Ocr, SpanSource::Ocr);
|
||||
assert_ne!(SpanSource::Vector, SpanSource::Ocr);
|
||||
}
|
||||
|
||||
// ============ Hybrid Page Processing Tests (Phase 5.2.4) ============
|
||||
|
||||
#[test]
fn test_process_hybrid_page_ocr_only_on_scanned_cells() {
    use std::sync::atomic::{AtomicUsize, Ordering};
    use std::sync::Arc;

    // Layout under test: rows 0-1 hold a vector text header, rows 2-7 are the
    // scanned body. OCR must be invoked for the scanned cells only.
    let scanned_cells: BTreeSet<_> = (2..8)
        .flat_map(|row| (0..8).map(move |col| CellIndex::new(row, col).flat()))
        .collect();
    let classification = PageClassification::hybrid(0.75, scanned_cells);

    // Vector spans coming from the text header.
    let header_spans = vec![
        Span::vector([50.0, 700.0, 200.0, 720.0], 0.95, "Header Text".to_string()),
        Span::vector([50.0, 650.0, 200.0, 670.0], 0.95, "More Header".to_string()),
    ];

    // The mock replays these two spans for every cell and counts its calls.
    let calls = Arc::new(AtomicUsize::new(0));
    let mock_ocr = MockOcrCallback {
        call_count: Arc::clone(&calls),
        output_spans: vec![
            Span::ocr([50.0, 100.0, 200.0, 120.0], 0.8, "Scanned Text 1".to_string()),
            Span::ocr([50.0, 50.0, 200.0, 70.0], 0.8, "Scanned Text 2".to_string()),
        ],
    };

    // Blank page raster; its content is irrelevant to the mock.
    let page_image = GrayImage::new(612, 792);

    let merged = process_hybrid_page(
        &page_image,
        612.0,
        792.0,
        &classification,
        &header_spans,
        72,
        &mock_ocr,
    );

    // 6 rows * 8 cols = 48 calls; a full-page pass would have been 64.
    assert_eq!(
        calls.load(Ordering::SeqCst),
        48,
        "OCR should run only on scanned cells (48), not entire page (64)"
    );

    // Both sources must be represented in the merged output.
    assert!(merged.iter().any(|s| s.source == SpanSource::Vector));
    assert!(merged.iter().any(|s| s.source == SpanSource::Ocr));

    // The header text survives the merge, and at least one OCR span is present.
    let has_text = |wanted: &str| merged.iter().any(|s| s.text == wanted);
    assert!(has_text("Header Text"));
    assert!(has_text("More Header"));
    assert!(has_text("Scanned Text 1"));
}
|
||||
|
||||
#[test]
fn test_process_hybrid_page_no_duplicate_text_from_overlap() {
    use std::sync::atomic::AtomicUsize;
    use std::sync::Arc;

    // End-to-end check: when a vector span and an OCR span cover the same
    // region, the merged output must not contain the text twice.

    // A single scanned cell (bottom-left).
    let classification =
        PageClassification::hybrid(0.75, BTreeSet::from([CellIndex::new(7, 0).flat()]));

    let vector_spans = vec![Span::vector(
        [50.0, 50.0, 150.0, 70.0],
        0.9,
        "Vector Text".to_string(),
    )];

    // OCR bbox [45, 45, 155, 75] against vector bbox [50, 50, 150, 70]:
    //   intersection = 100 * 20 = 2000
    //   union        = 110 * 30 + 100 * 20 - 2000 = 3300
    //   IoU          = 2000 / 3300 = 0.606 > 0.5
    let mock_ocr = MockOcrCallback {
        call_count: Arc::new(AtomicUsize::new(0)),
        output_spans: vec![Span::ocr(
            [45.0, 45.0, 155.0, 75.0],
            0.7,
            "OCR Text".to_string(),
        )],
    };

    let page_image = GrayImage::new(612, 792);

    let merged = process_hybrid_page(
        &page_image,
        612.0,
        792.0,
        &classification,
        &vector_spans,
        72,
        &mock_ocr,
    );

    // IoU > 0.5 and vector confidence >= 0.5: the vector span wins, so it is
    // the only span left.
    assert_eq!(merged.len(), 1, "Should have only 1 span after merge (vector wins)");
    let only = &merged[0];
    assert_eq!(only.source, SpanSource::Vector);
    assert_eq!(only.text, "Vector Text");
}
|
||||
|
||||
#[test]
fn test_process_hybrid_page_low_vector_confidence_ocr_wins() {
    use std::sync::atomic::AtomicUsize;
    use std::sync::Arc;

    // With IoU > 0.5 but vector confidence < 0.5, the OCR span must not be
    // dropped: both the vector span and the OCR span end up in the result.

    let classification =
        PageClassification::hybrid(0.75, BTreeSet::from([CellIndex::new(7, 0).flat()]));

    // Low-confidence vector span.
    let vector_spans = vec![Span::vector(
        [50.0, 50.0, 150.0, 70.0],
        0.2,
        "Bad Vector".to_string(),
    )];

    // Higher-confidence OCR span overlapping the vector span.
    let mock_ocr = MockOcrCallback {
        call_count: Arc::new(AtomicUsize::new(0)),
        output_spans: vec![Span::ocr(
            [45.0, 45.0, 155.0, 75.0],
            0.7,
            "Good OCR".to_string(),
        )],
    };

    let page_image = GrayImage::new(612, 792);

    let merged = process_hybrid_page(
        &page_image,
        612.0,
        792.0,
        &classification,
        &vector_spans,
        72,
        &mock_ocr,
    );

    assert_eq!(
        merged.len(),
        2,
        "Both vector and OCR should be kept when vector confidence is low"
    );
    let has_source = |wanted: SpanSource| merged.iter().any(|s| s.source == wanted);
    assert!(has_source(SpanSource::Vector));
    assert!(has_source(SpanSource::Ocr));
}
|
||||
|
||||
#[test]
fn test_process_hybrid_page_non_hybrid_classification() {
    use std::sync::atomic::{AtomicUsize, Ordering};
    use std::sync::Arc;

    // A page classified as pure vector must come back with its vector spans
    // only, and the OCR callback must never be invoked.
    let classification = PageClassification::new(PageClass::Vector, 0.9);
    let vector_spans = vec![Span::vector(
        [50.0, 50.0, 150.0, 70.0],
        0.9,
        "Vector Only".to_string(),
    )];

    let calls = Arc::new(AtomicUsize::new(0));
    let mock_ocr = MockOcrCallback {
        call_count: Arc::clone(&calls),
        output_spans: Vec::new(),
    };

    let page_image = GrayImage::new(612, 792);

    let merged = process_hybrid_page(
        &page_image,
        612.0,
        792.0,
        &classification,
        &vector_spans,
        72,
        &mock_ocr,
    );

    // No OCR calls for a non-hybrid page.
    assert_eq!(calls.load(Ordering::SeqCst), 0);

    // The single vector span passes through untouched.
    assert_eq!(merged.len(), 1);
    let only = &merged[0];
    assert_eq!(only.source, SpanSource::Vector);
    assert_eq!(only.text, "Vector Only");
}
|
||||
|
||||
#[test]
fn test_process_hybrid_page_empty_hybrid_cells() {
    use std::sync::atomic::{AtomicUsize, Ordering};
    use std::sync::Arc;

    // A hybrid classification whose cell set is empty gives OCR nothing to do.
    let classification = PageClassification::hybrid(0.75, BTreeSet::new());
    let vector_spans = vec![Span::vector(
        [50.0, 50.0, 150.0, 70.0],
        0.9,
        "Vector".to_string(),
    )];

    let calls = Arc::new(AtomicUsize::new(0));
    let mock_ocr = MockOcrCallback {
        call_count: Arc::clone(&calls),
        output_spans: Vec::new(),
    };

    let page_image = GrayImage::new(612, 792);

    let merged = process_hybrid_page(
        &page_image,
        612.0,
        792.0,
        &classification,
        &vector_spans,
        72,
        &mock_ocr,
    );

    // No cells, hence no OCR calls.
    assert_eq!(calls.load(Ordering::SeqCst), 0);

    // Only the vector span remains.
    assert_eq!(merged.len(), 1);
    assert_eq!(merged[0].source, SpanSource::Vector);
}
|
||||
}
|
||||
|
|
|
|||
96
notes/pdftract-4y9l.md
Normal file
96
notes/pdftract-4y9l.md
Normal file
|
|
@ -0,0 +1,96 @@
|
|||
# pdftract-4y9l: Hybrid Page Routing Implementation
|
||||
|
||||
## Summary
|
||||
|
||||
Implemented Phase 5.2.4 Hybrid page handling pipeline with per-cell OCR routing and bbox overlap merge rule.
|
||||
|
||||
## Changes Made
|
||||
|
||||
### File: `crates/pdftract-core/src/hybrid.rs`
|
||||
|
||||
**Added imports:**
|
||||
- `PageClass` to fix compilation error
|
||||
|
||||
**Added types and functions:**
|
||||
1. `OcrCallback` trait - Abstracts OCR implementation (Phase 5.3 preprocessing + 5.4 Tesseract)
|
||||
2. `MockOcrCallback` struct - Mock OCR callback for testing that tracks call counts
|
||||
3. `process_hybrid_page()` - Main entry point for hybrid page handling
|
||||
|
||||
**Added tests:**
|
||||
1. `test_process_hybrid_page_ocr_only_on_scanned_cells()` - Verifies OCR runs only on scanned cells (48 cells, not 64)
|
||||
2. `test_process_hybrid_page_no_duplicate_text_from_overlap()` - Verifies no duplicate text from overlapping regions
|
||||
3. `test_process_hybrid_page_low_vector_confidence_ocr_wins()` - Verifies that an OCR span overlapping a low-confidence (< 0.5) vector span is retained, so both spans appear in the result
|
||||
4. `test_process_hybrid_page_non_hybrid_classification()` - Verifies non-hybrid pages skip OCR
|
||||
5. `test_process_hybrid_page_empty_hybrid_cells()` - Verifies empty hybrid_cells skips OCR
|
||||
|
||||
## Acceptance Criteria Status
|
||||
|
||||
| Criterion | Status | Notes |
|
||||
|-----------|--------|-------|
|
||||
| OCR runs only on scanned cells | PASS | Test `test_process_hybrid_page_ocr_only_on_scanned_cells` verifies 48 calls for 6 rows, not 64 for full page |
|
||||
| Merge unit tests (IoU 0.6, 0.3, 0.6 with low confidence) | PASS | Tests `test_merge_iou_06_vector_kept`, `test_merge_iou_03_both_kept`, `test_merge_iou_06_low_vector_confidence_ocr_kept` |
|
||||
| No duplicate text from overlap | PASS | Test `test_process_hybrid_page_no_duplicate_text_from_overlap` verifies single span after merge |
|
||||
| Performance (Hybrid < Scanned by 30%) | WARN | Not measured in this change; verifying it requires an integration benchmark with an actual PDF fixture |
|
||||
|
||||
## Implementation Details
|
||||
|
||||
### Cell Rendering Strategy
|
||||
- Render full page once at selected DPI
|
||||
- Crop per cell from rendered raster (cheaper than re-rendering)
|
||||
- Cell dimensions: `cell_w = page_w_px / 8`, `cell_h = page_h_px / 8`
|
||||
- Cell coordinates: `[c*cell_w, r*cell_h, (c+1)*cell_w, (r+1)*cell_h]`
|
||||
|
||||
### Merge Rule (IoU-based)
|
||||
1. For each OCR span O: find vector span V with IoU(O.bbox, V.bbox) > 0.5
|
||||
2. If found AND V.confidence >= 0.5: drop O (vector wins)
|
||||
3. If found AND V.confidence < 0.5: keep O (a low-confidence vector span does not suppress OCR; V itself is still returned in step 5, so both appear)
|
||||
4. If not found: keep O
|
||||
5. Return all V + retained O sorted by reading order
|
||||
|
||||
### IoU Formula
|
||||
```
|
||||
IoU = area(A ∩ B) / area(A ∪ B)
|
||||
```
|
||||
|
||||
### Reading Order
|
||||
Spans sorted top-to-bottom, left-to-right (descending Y, then ascending X in PDF coordinates)
|
||||
|
||||
## Test Results
|
||||
|
||||
All 40 hybrid tests pass (the listing below shows 25 of the 40 result lines):
|
||||
```
|
||||
running 40 tests
|
||||
test hybrid::tests::test_compute_cell_crops ... ok
|
||||
test hybrid::tests::test_compute_iou_contained ... ok
|
||||
test hybrid::tests::test_compute_iou_half_overlap ... ok
|
||||
test hybrid::tests::test_compute_iou_identical ... ok
|
||||
test hybrid::tests::test_compute_iou_no_overlap ... ok
|
||||
test hybrid::tests::test_get_hybrid_cells_non_hybrid ... ok
|
||||
test hybrid::tests::test_get_hybrid_cells_with_cells ... ok
|
||||
test hybrid::tests::test_merge_iou_03_both_kept ... ok
|
||||
test hybrid::tests::test_merge_iou_06_low_vector_confidence_ocr_kept ... ok
|
||||
test hybrid::tests::test_merge_iou_06_vector_kept ... ok
|
||||
test hybrid::tests::test_merge_multiple_ocr_spans ... ok
|
||||
test hybrid::tests::test_merge_no_overlap ... ok
|
||||
test hybrid::tests::test_merge_reading_order ... ok
|
||||
test hybrid::tests::test_merge_sorting ... ok
|
||||
test hybrid::tests::test_process_hybrid_page_empty_hybrid_cells ... ok
|
||||
test hybrid::tests::test_crop_cell_from_page ... ok
|
||||
test hybrid::tests::test_process_hybrid_page_low_vector_confidence_ocr_wins ... ok
|
||||
test hybrid::tests::test_process_hybrid_page_non_hybrid_classification ... ok
|
||||
test hybrid::tests::test_process_hybrid_page_no_duplicate_text_from_overlap ... ok
|
||||
test hybrid::tests::test_span_dimensions ... ok
|
||||
test hybrid::tests::test_span_new ... ok
|
||||
test hybrid::tests::test_span_ocr ... ok
|
||||
test hybrid::tests::test_span_source_equality ... ok
|
||||
test hybrid::tests::test_span_vector ... ok
|
||||
test hybrid::tests::test_process_hybrid_page_ocr_only_on_scanned_cells ... ok
|
||||
|
||||
test result: ok. 40 passed; 0 failed; 0 ignored; 0 measured; 929 filtered out; finished in 0.06s
|
||||
```
|
||||
|
||||
## Reusable Patterns
|
||||
|
||||
1. **Callback trait for external dependency**: `OcrCallback` trait abstracts Tesseract dependency for testing
|
||||
2. **Atomic call tracking**: `Arc<AtomicUsize>` shared between the test body and the mock, so the test can assert how many times OCR was invoked
|
||||
3. **Cell-based grid processing**: 8x8 grid with flat index mapping `(row, col) -> row*8 + col`
|
||||
Loading…
Add table
Reference in a new issue