From cdf112a3002b242cd9f593e8f17978804aaedc3d Mon Sep 17 00:00:00 2001 From: jedarden Date: Mon, 25 May 2026 03:16:07 -0400 Subject: [PATCH] feat(pdftract-5edjj): implement render_anchors inspector layer renderer Implements the render_anchors helper that draws block-id text labels at the top-left corner of each block. Shows the Markdown anchor IDs that downstream output (Phase 6.5 --md-anchors) will produce. Key details: - Function: render_anchors(page_index, page_number, blocks) -> Vec - Anchor ID format: p{page_number}-b{block_index} (e.g., "p1-b0") - Text positioned at top-left corner (x0+2, y1-4) with small offset - Data attributes: data-page-index, data-page-number, data-block-index, data-bbox, data-kind - CSS class: "anchor-label" for frontend toggleability - Font: monospace, 10pt, black (#000000) All 12 unit tests pass, covering empty input, single/multiple blocks, positioning, bbox format, XML escaping, page variations, and SVG validity. Closes: pdftract-5edjj --- .../src/inspect/render/anchors.rs | 323 ++++++++++++++++++ crates/pdftract-cli/src/inspect/render/mod.rs | 1 + 2 files changed, 324 insertions(+) create mode 100644 crates/pdftract-cli/src/inspect/render/anchors.rs diff --git a/crates/pdftract-cli/src/inspect/render/anchors.rs b/crates/pdftract-cli/src/inspect/render/anchors.rs new file mode 100644 index 0000000..7ebbdc3 --- /dev/null +++ b/crates/pdftract-cli/src/inspect/render/anchors.rs @@ -0,0 +1,323 @@ +//! Anchor layer renderer for the inspector. +//! +//! This module renders SVG text labels at the top-left corner of each block, +//! showing the Markdown anchor IDs that downstream output (Phase 6.5 --md-anchors) +//! will produce. +//! +//! Each text label includes data-* attributes for tooltip and click consumption: +//! - data-page-index: the page index (0-based) +//! - data-page-number: the page number (1-based, for display) +//! - data-block-index: the block's index in the page +//! - data-bbox: the block's bounding box +//! 
- data-kind: the block kind + +use pdftract_core::schema::BlockJson; + +/// Render SVG text labels at the top-left corner of each block. +/// +/// # Arguments +/// +/// * `page_index` - Zero-based page index +/// * `page_number` - One-based page number (for display) +/// * `blocks` - Slice of blocks to render +/// +/// # Returns +/// +/// A vector of SVG `` element strings. Each text is positioned at +/// the top-left corner of the block's bbox with the anchor ID as content. +/// +/// # Anchor format +/// +/// The anchor ID format is: `p{page_number}-b{block_index}` +/// - page_number: 1-based page number for human readability +/// - block_index: 0-based block index within the page +/// +/// This matches the Phase 6.5 Markdown anchor comment format: +/// `` +/// +/// # Data attributes +/// +/// Each text element includes: +/// - `data-page-index`: the page's 0-based index +/// - `data-page-number`: the page's 1-based number (for display) +/// - `data-block-index`: the block's index in the page +/// - `data-bbox`: the block's bounding box as "[x0,y0,x1,y1]" +/// - `data-kind`: the block's kind string (XML-escaped) +pub fn render_anchors(page_index: usize, page_number: u32, blocks: &[BlockJson]) -> Vec { + blocks.iter().enumerate().map(|(block_index, block)| { + let [x0, _y0, x1, y1] = block.bbox; + let data_kind = escape_xml_attr(&block.kind); + let data_bbox = format!("[{:.2},{:.2},{:.2},{:.2}]", x0, block.bbox[1], x1, y1); + + // Position text at top-left corner with a small offset + // In PDF coordinates, y1 is the top (higher y value) + let x = x0 + 2.0; // Small offset from left edge + let y = y1 - 4.0; // Small offset from top edge (text baseline) + + let anchor_id = format!("p{}-b{}", page_number, block_index); + + format!( + r##"{}"##, + x, y, "#000000", page_index, page_number, block_index, data_bbox, data_kind, anchor_id + ) + }).collect() +} + +/// Escape a string for use in an XML attribute value. 
/// Escape a string for use in an XML attribute value.
///
/// Replaces the five special XML characters with their predefined entity
/// references:
/// - `&` → `&amp;`
/// - `<` → `&lt;`
/// - `>` → `&gt;`
/// - `"` → `&quot;`
/// - `'` → `&apos;`
///
/// All other characters are copied through unchanged. The input is scanned
/// once, so an ampersand that belongs to the input is escaped exactly once
/// and entities written by this function are never re-escaped.
fn escape_xml_attr(s: &str) -> String {
    // Escaping can only grow the string, so the input length is a lower bound.
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&apos;"),
            other => escaped.push(other),
        }
    }
    escaped
}
} + + #[test] + fn test_render_anchors_bbox_format() { + let blocks = vec![make_test_block( + "paragraph", + "Test", + [10.567, 20.891, 100.234, 110.567], + )]; + + let output = render_anchors(0, 1, &blocks); + let text = &output[0]; + + // Check bbox format in data attribute + assert!(text.contains(r#"data-bbox="[10.57,20.89,100.23,110.57]""#)); + } + + #[test] + fn test_render_anchors_xml_escaping() { + let blocks = vec![make_test_block( + "code &