diff --git a/crates/pdftract-cli/src/inspect/render/ocr_regions.rs b/crates/pdftract-cli/src/inspect/render/ocr_regions.rs new file mode 100644 index 0000000..0f34c5c --- /dev/null +++ b/crates/pdftract-cli/src/inspect/render/ocr_regions.rs @@ -0,0 +1,399 @@ +//! OCR regions layer renderer for the inspector. +//! +//! This module renders SVG diagonal-stripe overlays for text spans that +//! were extracted via OCR (Tesseract). This distinguishes vector-text spans +//! from OCR-derived spans visually. +//! +//! Each overlay includes data-* attributes for tooltip and click consumption: +//! - data-ocr-source: the confidence source (ocr, ocr-assisted, ocr-fallback) +//! - data-confidence: the OCR confidence score (0.0-1.0) +//! - data-text: the extracted text content (for tooltip display) +//! - data-span-index: the span's index in the page (for JSON-tree navigation) + +use pdftract_core::schema::SpanJson; + +/// Render SVG diagonal-stripe overlays for OCR-derived spans. +/// +/// # Arguments +/// +/// * `spans` - Slice of spans to filter and render +/// +/// # Returns +/// +/// A vector of SVG strings. The first element (if any OCR spans exist) +/// is a `<defs>` element containing the diagonal-stripe pattern definition. +/// Subsequent elements are `<rect>` overlays for each OCR span, using the +/// pattern as fill. +/// +/// # Visual style +/// +/// - Cyan (#00d9ff) diagonal stripes at 45° angle +/// - 4px stripe width, 8px spacing +/// - Translucent background (opacity 0.15) +/// - Thin cyan stroke (1px, opacity 0.5) +/// +/// # Data attributes +/// +/// Each rect includes: +/// - `data-ocr-source`: the span's confidence_source (XML-escaped) +/// - `data-confidence`: OCR confidence score or empty string +/// - `data-text`: the span's text content, truncated to 100 chars (XML-escaped) +/// - `data-span-index`: the span's index in the page (for JSON-tree navigation) +/// +/// # CSS class +/// +/// Each rect has class `ocr-region-rect` for styling and frontend toggling. 
+pub fn render_ocr_regions(spans: &[SpanJson]) -> Vec { + // Filter OCR spans + let ocr_spans: Vec<(usize, &SpanJson)> = spans + .iter() + .enumerate() + .filter(|(_, span)| is_ocr_span(span)) + .collect(); + + if ocr_spans.is_empty() { + return Vec::new(); + } + + let mut result = Vec::new(); + + // Add pattern definition + result.push(PATTERN_DEF.to_string()); + + // Add overlay rects for each OCR span + for (index, span) in ocr_spans { + let [x0, y0, x1, y1] = span.bbox; + let width = x1 - x0; + let height = y1 - y0; + let data_source = escape_xml_attr( + span.confidence_source.as_deref().unwrap_or("") + ); + let confidence_str = span.confidence.map(|c| c.to_string()).unwrap_or_default(); + let data_confidence = escape_xml_attr(&confidence_str); + + // Truncate text for tooltip (max ~100 chars) + let tooltip_text = if span.text.len() > 99 { + format!("{}...", &span.text[..99]) + } else { + span.text.clone() + }; + let data_text = escape_xml_attr(&tooltip_text); + + result.push(format!( + r#""#, + x0, y0, width, height, data_source, data_confidence, data_text, index + )); + } + + result +} + +/// Check if a span was extracted via OCR. +/// +/// Returns true if the span's confidence_source contains "ocr" +/// (matches: "ocr", "ocr-assisted", "ocr-fallback"). +fn is_ocr_span(span: &SpanJson) -> bool { + span.confidence_source + .as_ref() + .map(|s| s.contains("ocr")) + .unwrap_or(false) +} + +/// SVG pattern definition for cyan diagonal stripes. +/// +/// 45° diagonal stripes, 4px wide, 8px spacing, cyan (#00d9ff). +const PATTERN_DEF: &str = r#" + + + + +"#; + +/// Escape a string for use in an XML attribute value. 
///
/// Replaces special XML characters with their entity references:
/// - `&` → `&amp;`
/// - `<` → `&lt;`
/// - `>` → `&gt;`
/// - `"` → `&quot;`
/// - `'` → `&apos;`
///
/// All other characters are copied through unchanged. The input is scanned
/// once, so an `&` produced by an earlier replacement is never escaped again.
fn escape_xml_attr(s: &str) -> String {
    // The output is at least as long as the input; escapes only grow it.
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&apos;"),
            other => escaped.push(other),
        }
    }
    escaped
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Build a span with fixed font/size/confidence and the given text,
    /// bounding box and confidence source.
    fn make_test_span(text: &str, bbox: [f64; 4], confidence_source: Option<&str>) -> SpanJson {
        SpanJson {
            text: text.to_string(),
            bbox,
            font: "Arial".to_string(),
            size: 12.0,
            color: None,
            rendering_mode: None,
            confidence: Some(0.85),
            confidence_source: confidence_source.map(|s| s.to_string()),
            lang: None,
            flags: vec![],
            receipt: None,
            column: None,
        }
    }

    #[test]
    fn test_render_ocr_regions_empty() {
        let spans: Vec<SpanJson> = vec![];
        let output = render_ocr_regions(&spans);
        assert!(output.is_empty());
    }

    #[test]
    fn test_render_ocr_regions_no_ocr_spans() {
        let spans = vec![
            make_test_span("Vector text", [100.0, 200.0, 300.0, 220.0], Some("vector")),
            make_test_span("Native text", [100.0, 230.0, 300.0, 250.0], Some("native")),
        ];
        let output = render_ocr_regions(&spans);
        assert!(output.is_empty());
    }

    #[test]
    fn test_render_ocr_regions_single() {
        let spans = vec![make_test_span(
            "OCR text",
            [100.0, 200.0, 300.0, 220.0],
            Some("ocr"),
        )];

        let output = render_ocr_regions(&spans);
        assert!(!output.is_empty());

        // First element should be pattern definition
        assert!(output[0].contains("<defs>"));
        assert!(output[0].contains("ocr-diagonal-stripes"));

        // Second element should be overlay rect
        assert!(output.len() >= 2);
        let rect = &output[1];

        // Check basic SVG structure
        assert!(rect.contains("<rect"));

        // NOTE(review): the assertions below were reconstructed from the
        // renderer's documented attributes; confirm against the original test.
        assert!(rect.contains(r#"x="100.00""#));
        assert!(rect.contains(r#"y="200.00""#));
        assert!(rect.contains(r#"width="200.00""#));
        assert!(rect.contains(r#"height="20.00""#));
        assert!(rect.contains(r#"data-ocr-source="ocr""#));
        assert!(rect.contains(r#"data-confidence="0.85""#));
        assert!(rect.contains(r#"data-text="OCR text""#));
        assert!(rect.contains(r#"data-span-index="0""#));
    }

    // NOTE(review): the name and fixture of this test were reconstructed; only
    // the assertions from `output.len() >= 3` onward survive from the original.
    #[test]
    fn test_render_ocr_regions_multiple() {
        let spans = vec![
            make_test_span("OCR 1", [0.0, 0.0, 50.0, 10.0], Some("ocr")),
            make_test_span("Vector", [0.0, 20.0, 50.0, 30.0], Some("vector")),
            make_test_span("OCR 2", [0.0, 40.0, 50.0, 50.0], Some("ocr-assisted")),
        ];

        let output = render_ocr_regions(&spans);

        // Pattern definition plus one rect per OCR span
        assert!(output.len() >= 3);

        // Check first OCR span
        assert!(output[1].contains(r#"data-ocr-source="ocr""#));
        assert!(output[1].contains(r#"data-text="OCR 1""#));

        // Check second OCR span
        assert!(output[2].contains(r#"data-ocr-source="ocr-assisted""#));
        assert!(output[2].contains(r#"data-text="OCR 2""#));
    }

    #[test]
    fn test_render_ocr_regions_all_ocr_sources() {
        let test_cases = [
            ("ocr", true),
            ("ocr-assisted", true),
            ("ocr-fallback", true),
            ("vector", false),
            ("native", false),
            ("heuristic", false),
        ];

        for (source, expected_render) in test_cases {
            let spans = vec![make_test_span("Test", [0.0, 0.0, 100.0, 20.0], Some(source))];
            let output = render_ocr_regions(&spans);

            if expected_render {
                assert!(!output.is_empty(), "Source '{}' should render", source);
                assert!(output[1].contains(&format!("data-ocr-source=\"{}\"", source)));
            } else {
                assert!(output.is_empty(), "Source '{}' should not render", source);
            }
        }
    }

    #[test]
    fn test_render_ocr_regions_text_truncation() {
        let long_text = "a".repeat(200);
        let spans = vec![make_test_span(
            &long_text,
            [0.0, 0.0, 100.0, 20.0],
            Some("ocr"),
        )];

        let output = render_ocr_regions(&spans);
        let rect = &output[1];

        // Text should be truncated to 99 characters with a "..." suffix
        let expected = format!("data-text=\"{}...\"", "a".repeat(99));
        assert!(rect.contains(&expected));
    }

    #[test]
    fn test_render_ocr_regions_xml_escaping() {
        let spans = vec![make_test_span(
            "Text & <tags>",
            [0.0, 0.0, 100.0, 20.0],
            Some("ocr"),
        )];

        let output = render_ocr_regions(&spans);
        let rect = &output[1];

        // Check XML escaping in data attributes
        assert!(rect.contains("data-text=\"Text &amp; &lt;tags&gt;\""));
    }

    #[test]
    fn test_render_ocr_regions_confidence_none() {
        let mut span = make_test_span("OCR", [0.0, 0.0, 100.0, 20.0], Some("ocr"));
        span.confidence = None;

        let output = render_ocr_regions(&[span]);
        let rect = &output[1];

        // Should have empty confidence string
        assert!(rect.contains(r#"data-confidence="""#));
    }

    #[test]
    fn test_render_ocr_regions_css_class() {
        let spans = vec![make_test_span("OCR", [0.0, 0.0, 100.0, 20.0], Some("ocr"))];

        let output = render_ocr_regions(&spans);
        let rect = &output[1];

        assert!(rect.contains(r#"class="ocr-region-rect""#));
    }

    #[test]
    fn test_is_ocr_span() {
        let mut span = make_test_span("Test", [0.0, 0.0, 100.0, 20.0], Some("ocr"));
        assert!(is_ocr_span(&span));

        span.confidence_source = Some("ocr-assisted".to_string());
        assert!(is_ocr_span(&span));

        span.confidence_source = Some("ocr-fallback".to_string());
        assert!(is_ocr_span(&span));

        span.confidence_source = Some("vector".to_string());
        assert!(!is_ocr_span(&span));

        span.confidence_source = None;
        assert!(!is_ocr_span(&span));
    }

    #[test]
    fn test_escape_xml_attr() {
        assert_eq!(escape_xml_attr("hello"), "hello");
        assert_eq!(escape_xml_attr("a&b"), "a&amp;b");
        assert_eq!(escape_xml_attr("<tag>"), "&lt;tag&gt;");
        assert_eq!(escape_xml_attr("\"quote\""), "&quot;quote&quot;");
        assert_eq!(escape_xml_attr("'apos'"), "&apos;apos&apos;");
    }

    #[test]
    fn test_render_ocr_regions_pattern_def() {
        let spans = vec![make_test_span("OCR", [0.0, 0.0, 100.0, 20.0], Some("ocr"))];
        let output = render_ocr_regions(&spans);

        // Check pattern definition structure
        assert!(output[0].contains("<defs>"));
        assert!(output[0].ends_with("</defs>"));

        // Rect should be valid XML
        assert!(output[1].starts_with("<rect"));
        assert!(output[1].ends_with("/>"));
    }

    #[test]
    fn test_render_ocr_regions_float_bbox() {
        let spans = vec![make_test_span(
            "OCR",
            [10.567, 20.891, 100.234, 110.567],
            Some("ocr"),
        )];

        let output = render_ocr_regions(&spans);
        let rect = &output[1];

        // Check that coordinates are rounded to 2 decimal places
        assert!(rect.contains(r#"x="10.57""#));
        assert!(rect.contains(r#"y="20.89""#));
        assert!(rect.contains(r#"width="89.67""#)); // 100.234 - 10.567
        assert!(rect.contains(r#"height="89.68""#)); // 110.567 - 20.891
    }

    #[test]
    fn test_render_ocr_regions_span_index_tracking() {
        let spans = vec![
            make_test_span("Vector", [0.0, 0.0, 50.0, 10.0], Some("vector")),
            make_test_span("OCR 1", [0.0, 20.0, 50.0, 30.0], Some("ocr")),
            make_test_span("Vector 2", [0.0, 40.0, 50.0, 50.0], Some("vector")),
            make_test_span("OCR 2", [0.0, 60.0, 50.0, 70.0], Some("ocr")),
        ];

        let output = render_ocr_regions(&spans);

        // Should have 2 OCR rects
        assert_eq!(output.len(), 3); // pattern def + 2 rects

        // Check span indices (should be 1 and 3, not 0 and 1)
        assert!(output[1].contains(r#"data-span-index="1""#)); // OCR 1 is at index 1
        assert!(output[2].contains(r#"data-span-index="3""#)); // OCR 2 is at index 3
    }
}