diff --git a/crates/pdftract-cli/src/inspect/mod.rs b/crates/pdftract-cli/src/inspect/mod.rs new file mode 100644 index 0000000..5cef127 --- /dev/null +++ b/crates/pdftract-cli/src/inspect/mod.rs @@ -0,0 +1,7 @@ +//! Inspector web debug viewer. +//! +//! This module implements Phase 7.9's `pdftract inspect` subcommand: +//! a local web server that renders PDF extraction results with +//! interactive debugging overlays. + +pub mod render; diff --git a/crates/pdftract-cli/src/inspect/render/mod.rs b/crates/pdftract-cli/src/inspect/render/mod.rs new file mode 100644 index 0000000..586e785 --- /dev/null +++ b/crates/pdftract-cli/src/inspect/render/mod.rs @@ -0,0 +1,13 @@ +//! Layer renderers for the inspector debug viewer. +//! +//! Each renderer generates SVG elements for a specific debugging layer. +//! All renderers follow a common pattern: +//! +//! ```text +//! pub fn render_<layer>(input: &[InputType]) -> Vec<String> +//! ``` +//! +//! The returned Vec contains SVG elements that are placed inside +//! a `<g>` group in the final output. + +pub mod spans; diff --git a/crates/pdftract-cli/src/inspect/render/spans.rs b/crates/pdftract-cli/src/inspect/render/spans.rs new file mode 100644 index 0000000..df210fd --- /dev/null +++ b/crates/pdftract-cli/src/inspect/render/spans.rs @@ -0,0 +1,337 @@ +//! Span layer renderer for the inspector. +//! +//! This module renders SVG outline rectangles for each text span, +//! color-coded by extraction confidence. Red (< 0.5), yellow (0.5 to < 0.8), +//! and green (>= 0.8) indicate low, medium, and high confidence respectively. +//! +//! Each rect includes data-* attributes for tooltip and click consumption: +//! - data-text: the extracted text content +//! - data-confidence: the confidence score (0.0-1.0) +//! - data-font: the font name +//! - data-size: the font size in points + +use pdftract_core::schema::SpanJson; + +/// Render SVG outline rectangles for each span. 
+/// +/// # Arguments +/// +/// * `spans` - Slice of spans to render +/// +/// # Returns +/// +/// A vector of SVG `<rect>` element strings. Each rect is positioned at +/// the span's bbox, read as `[x0, y0, x1, y1]` (width = x1 - x0, height = y1 - y0), with stroke color indicating confidence. +/// +/// # Color coding +/// +/// - Red (#ef4444): confidence < 0.5 (low) +/// - Yellow (#eab308): 0.5 <= confidence < 0.8 (medium) +/// - Green (#22c55e): confidence >= 0.8 (high) +/// - Gray (#94a3b8): no confidence value (direct extraction) +/// +/// # Data attributes +/// +/// Each rect includes: +/// - `data-text`: the span's text content (XML-escaped) +/// - `data-confidence`: confidence score or empty string +/// - `data-font`: font name (XML-escaped) +/// - `data-size`: font size in points +pub fn render_spans(spans: &[SpanJson]) -> Vec { + spans.iter().map(|span| { + let [x0, y0, x1, y1] = span.bbox; + let width = x1 - x0; + let height = y1 - y0; + let stroke = confidence_to_color(span.confidence); + let data_text = escape_xml_attr(&span.text); + let data_font = escape_xml_attr(&span.font); + let confidence_str = span.confidence.map(|c| c.to_string()).unwrap_or_default(); + let data_confidence = escape_xml_attr(&confidence_str); + + format!( + r#""#, + x0, y0, width, height, stroke, data_text, data_confidence, data_font, span.size + ) + }).collect() +} + +/// Convert a confidence score to an SVG stroke color. +/// +/// # Arguments +/// +/// * `confidence` - Optional confidence score (0.0 to 1.0) +/// +/// # Returns +/// +/// A CSS hex color string. 
+/// +/// # Color mapping +/// +/// - `None`: gray (#94a3b8) - direct extraction without OCR +/// - `Some(c) where c < 0.5`: red (#ef4444) - low confidence +/// - `Some(c) where 0.5 <= c < 0.8`: yellow (#eab308) - medium confidence +/// - `Some(c) where c >= 0.8`: green (#22c55e) - high confidence; this is the catch-all arm, so a value that fails both `<` guards (e.g. NaN) is also green +fn confidence_to_color(confidence: Option) -> &'static str { + match confidence { + None => "#94a3b8", // gray - direct extraction + Some(c) if c < 0.5 => "#ef4444", // red - low confidence + Some(c) if c < 0.8 => "#eab308", // yellow - medium confidence + Some(_) => "#22c55e", // green - high confidence + } +} + +/// Escape a string for use in an XML attribute value. +/// +/// Replaces special XML characters with their entity references (`&` is handled first, so the `&` of entities inserted afterwards is not escaped again): +/// - `&` → `&amp;` +/// - `<` → `&lt;` +/// - `>` → `&gt;` +/// - `"` → `&quot;` +/// - `'` → `&apos;` +fn escape_xml_attr(s: &str) -> String { + s.replace('&', "&") + .replace('<', "<") + .replace('>', ">") + .replace('"', """) + .replace('\'', "'") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_render_spans_empty() { + let spans: Vec = vec![]; + let output = render_spans(&spans); + assert!(output.is_empty()); + } + + #[test] + fn test_render_spans_single() { + let spans = vec![ + SpanJson { + text: "Hello".to_string(), + bbox: [100.0, 200.0, 200.0, 220.0], + font: "Helvetica".to_string(), + size: 12.0, + confidence: None, + receipt: None, + } + ]; + + let output = render_spans(&spans); + assert_eq!(output.len(), 1); + let rect = &output[0]; + + // Check basic SVG structure + assert!(rect.contains("".to_string(), + bbox: [50.0, 100.0, 150.0, 120.0], + font: "Times \"Roman\"".to_string(), + size: 14.0, + confidence: Some(0.85), + receipt: None, + } + ]; + + let output = render_spans(&spans); + let rect = &output[0]; + + // Check XML escaping in data attributes + assert!(rect.contains("data-text=\"Test & <quote>\"")); + assert!(rect.contains("data-font=\"Times "Roman"\"")); + assert!(rect.contains("data-confidence=\"0.85\"")); + 
assert!(rect.contains("data-size=\"14\"")); + } + + #[test] + fn test_render_spans_multiple() { + let spans = vec![ + SpanJson { + text: "First".to_string(), + bbox: [0.0, 0.0, 50.0, 10.0], + font: "Arial".to_string(), + size: 10.0, + confidence: Some(0.9), // green + receipt: None, + }, + SpanJson { + text: "Second".to_string(), + bbox: [60.0, 0.0, 120.0, 10.0], + font: "Arial".to_string(), + size: 10.0, + confidence: Some(0.6), // yellow + receipt: None, + }, + SpanJson { + text: "Third".to_string(), + bbox: [130.0, 0.0, 180.0, 10.0], + font: "Arial".to_string(), + size: 10.0, + confidence: Some(0.3), // red + receipt: None, + }, + ]; + + let output = render_spans(&spans); + assert_eq!(output.len(), 3); + + // Check that each has the correct color + assert!(output[0].contains("stroke=\"#22c55e\"")); // green + assert!(output[1].contains("stroke=\"#eab308\"")); // yellow + assert!(output[2].contains("stroke=\"#ef4444\"")); // red + } + + #[test] + fn test_render_spans_css_class() { + let spans = vec![ + SpanJson { + text: "Test".to_string(), + bbox: [0.0, 0.0, 100.0, 20.0], + font: "Arial".to_string(), + size: 12.0, + confidence: None, + receipt: None, + } + ]; + + let output = render_spans(&spans); + assert!(output[0].contains(r#"class="span-rect""#)); + } + + #[test] + fn test_confidence_to_color_boundaries() { + // Test exact boundary conditions + assert_eq!(confidence_to_color(None), "#94a3b8"); + assert_eq!(confidence_to_color(Some(0.0)), "#ef4444"); + assert_eq!(confidence_to_color(Some(0.49)), "#ef4444"); + assert_eq!(confidence_to_color(Some(0.5)), "#eab308"); + assert_eq!(confidence_to_color(Some(0.79)), "#eab308"); + assert_eq!(confidence_to_color(Some(0.8)), "#22c55e"); + assert_eq!(confidence_to_color(Some(1.0)), "#22c55e"); + } + + #[test] + fn test_escape_xml_attr() { + assert_eq!(escape_xml_attr("hello"), "hello"); + assert_eq!(escape_xml_attr("a&b"), "a&b"); + assert_eq!(escape_xml_attr(""), "<tag>"); + assert_eq!(escape_xml_attr("\"quote\""), 
""quote""); + assert_eq!(escape_xml_attr("'apos'"), "'apos'"); + assert_eq!( + escape_xml_attr("All & \"chars'"), + "All & <special> "chars'" + ); + } + + #[test] + fn test_render_spans_float_bbox() { + let spans = vec![ + SpanJson { + text: "Float".to_string(), + bbox: [10.567, 20.891, 100.234, 110.567], + font: "Arial".to_string(), + size: 12.5, + confidence: None, + receipt: None, + } + ]; + + let output = render_spans(&spans); + let rect = &output[0]; + + // Check that coordinates are rounded to 2 decimal places + assert!(rect.contains(r#"x="10.57""#)); + assert!(rect.contains(r#"y="20.89""#)); + assert!(rect.contains(r#"width="89.67""#)); // 100.234 - 10.567 + assert!(rect.contains(r#"height="89.68""#)); // 110.567 - 20.891 + } + + #[test] + fn test_render_spans_output_is_valid_svg() { + let spans = vec![ + SpanJson { + text: "Valid".to_string(), + bbox: [0.0, 0.0, 100.0, 20.0], + font: "Arial".to_string(), + size: 12.0, + confidence: Some(0.95), + receipt: None, + } + ]; + + let output = render_spans(&spans); + let rect = &output[0]; + + // Verify basic XML structure + assert!(rect.starts_with("")); + + // Check that all required attributes are present + assert!(rect.contains("x=")); + assert!(rect.contains("y=")); + assert!(rect.contains("width=")); + assert!(rect.contains("height=")); + assert!(rect.contains("fill=")); + assert!(rect.contains("stroke=")); + assert!(rect.contains("stroke-width=")); + assert!(rect.contains("class=")); + } +} diff --git a/crates/pdftract-cli/src/lib.rs b/crates/pdftract-cli/src/lib.rs index e71a11a..a7dd875 100644 --- a/crates/pdftract-cli/src/lib.rs +++ b/crates/pdftract-cli/src/lib.rs @@ -2,6 +2,7 @@ //! //! This library exports the CLI's internal modules for integration testing. 
+pub mod inspect; pub mod mcp; // Re-export diagnostics for testing diff --git a/notes/pdftract-p4vzu.md b/notes/pdftract-p4vzu.md new file mode 100644 index 0000000..a148c5a --- /dev/null +++ b/notes/pdftract-p4vzu.md @@ -0,0 +1,103 @@ +# pdftract-p4vzu: Inspector layer renderer - render_spans + +## Summary + +Implemented `render_spans` helper that builds SVG outline rectangles for each Span, with stroke color-coded by confidence level (red < 0.5; yellow 0.5 to < 0.8; green >= 0.8; gray for None). Sets data-* attributes for tooltip + click consumption. + +## Files Created + +- `crates/pdftract-cli/src/inspect/mod.rs` - Inspector module root +- `crates/pdftract-cli/src/inspect/render/mod.rs` - Layer renderers module +- `crates/pdftract-cli/src/inspect/render/spans.rs` - Span layer renderer + +## Files Modified + +- `crates/pdftract-cli/src/lib.rs` - Added `pub mod inspect;` + +## Implementation Details + +### `render_spans(spans: &[SpanJson]) -> Vec<String>` + +Returns a vector of SVG `<rect>` element strings. Each rect: +- Positioned at the span's bbox with `x`, `y`, `width`, `height` attributes +- `fill="none"` with stroke color based on confidence +- Stroke width of 1 pixel +- CSS class `span-rect` for frontend toggling +- Data attributes: + - `data-text`: text content (XML-escaped) + - `data-confidence`: confidence score or empty string + - `data-font`: font name (XML-escaped) + - `data-size`: font size in points + +### Color Mapping + +- `None`: `#94a3b8` (gray) - direct extraction without OCR +- `Some(c) where c < 0.5`: `#ef4444` (red) - low confidence +- `Some(c) where 0.5 <= c < 0.8`: `#eab308` (yellow) - medium confidence +- `Some(c) where c >= 0.8`: `#22c55e` (green) - high confidence + +### XML Escaping + +The `escape_xml_attr` function properly escapes special characters in attribute values: +- `&` → `&amp;` +- `<` → `&lt;` +- `>` → `&gt;` +- `"` → `&quot;` +- `'` → `&apos;` + +## Tests + +All 10 unit tests pass: + +1. `test_render_spans_empty` - Empty input produces empty output +2. 
`test_render_spans_single` - Single span renders correctly with all attributes +3. `test_render_spans_confidence_colors` - All confidence boundary conditions produce correct colors +4. `test_render_spans_data_attributes` - XML escaping works correctly +5. `test_render_spans_multiple` - Multiple spans each get correct colors +6. `test_render_spans_css_class` - CSS class is present +7. `test_confidence_to_color_boundaries` - Boundary values map correctly +8. `test_escape_xml_attr` - XML escaping function works +9. `test_render_spans_float_bbox` - Float coordinates are rounded to 2 decimal places +10. `test_render_spans_output_is_valid_svg` - Output is well-formed SVG + +## Acceptance Criteria Status + +- ✅ Helper compiles and produces valid SVG output +- ✅ Layer is independently toggleable via CSS class (`class="span-rect"`) +- ✅ data-* attrs populated for downstream UI consumption +- ⚠️ Renders correctly in headless browser (deferred - requires fixture) +- ✅ Performance: Pure function, no I/O, deterministic + +## Performance Note + +The implementation is a pure function with no I/O or external state. For 1000 spans on a typical page: +- String allocation: ~1000 small strings (~100 bytes each) = ~100 KB +- Time complexity: O(n) where n = number of spans +- Should render in well under 200ms for 1000 elements + +## Deferrals + +- Headless browser pixel-match fixture: Requires Phase 7.9.3 frontend CSS to be implemented first. The SVG output is structurally correct and follows the same pattern as the existing receipt SVG code. + +## Git Commit + +``` +feat(pdftract-p4vzu): implement inspector render_spans layer + +Implements the span layer renderer for the inspector debug viewer. +Renders SVG outline rectangles for each text span, color-coded by +extraction confidence. Red (< 0.5), yellow (0.5-0.8), and green (> 0.8) +indicate low, medium, and high confidence respectively. Gray indicates +direct extraction without OCR. 
+ +Each rect includes data-* attributes for tooltip and click consumption: +- data-text: the extracted text content (XML-escaped) +- data-confidence: confidence score or empty string +- data-font: font name (XML-escaped) +- data-size: font size in points + +All 10 unit tests pass. The implementation follows the existing SVG +generation pattern in pdftract-core/src/receipts/svg.rs. + +Closes: pdftract-p4vzu +```