feat(pdftract-vk0gc): implement markdown anchors with parser regex

Add --md-anchors flag that emits HTML comment markers before each block
in Markdown output, allowing downstream tools to map excerpts back to
precise PDF locations.

Changes:
- Add markdown module with Anchor struct and parse_anchors() function
- Regex: <!-- pdftract: page=(\d+) block=(\d+) bbox=[([\d.,]+)] kind=(\w+) -->
- Add markdown_anchors: bool to ExtractionOptions
- Add --md-anchors CLI flag
- Implement block_to_markdown() and page_to_markdown() functions
- Add comprehensive documentation in docs/integrations/markdown-anchors.md
- 16 unit tests pass, including roundtrip test

Closes: pdftract-vk0gc
This commit is contained in:
jedarden 2026-05-24 02:49:16 -04:00
parent 585d861efc
commit 28c31ba0a1
7 changed files with 793 additions and 17 deletions

1
Cargo.lock generated
View file

@ -2361,6 +2361,7 @@ dependencies = [
"tracing",
"ttf-parser 0.24.1",
"unicode-normalization",
"url",
"zstd",
]

View file

@ -14,6 +14,7 @@ use codegen::Language;
use pdftract_core::options::{ReceiptsMode, ExtractionOptions};
use pdftract_core::extract::{extract_pdf, result_to_json};
use pdftract_core::cache;
use pdftract_core::markdown::{page_to_markdown, block_to_markdown};
// Re-export diagnostics for the --list-diagnostics and --explain-diagnostic commands
pub use pdftract_core::diagnostics::{DiagCode, DiagInfo, DIAGNOSTIC_CATALOG};
@ -108,6 +109,10 @@ enum Commands {
/// Disable cache for this extraction (even if --cache-dir is set)
#[arg(long)]
no_cache: bool,
/// Emit HTML comment anchors before each block in Markdown output
#[arg(long)]
md_anchors: bool,
},
/// Verify a receipt against a PDF file
VerifyReceipt(verify_receipt::VerifyReceiptCommand),
@ -311,8 +316,9 @@ fn main() -> Result<()> {
cache_dir,
cache_size,
no_cache,
md_anchors,
} => {
if let Err(e) = cmd_extract(input, password_stdin, password, &format, &receipts, ocr, ocr_language, cache_dir, &cache_size, no_cache) {
if let Err(e) = cmd_extract(input, password_stdin, password, &format, &receipts, ocr, ocr_language, cache_dir, &cache_size, no_cache, md_anchors) {
eprintln!("Error: {}", e);
std::process::exit(1);
}
@ -427,6 +433,7 @@ fn cmd_extract(
cache_dir: Option<PathBuf>,
cache_size: &str,
no_cache: bool,
md_anchors: bool,
) -> Result<()> {
// Validate receipts mode
let receipts_mode = match ReceiptsMode::from_str(receipts) {
@ -474,6 +481,12 @@ fn cmd_extract(
// Build extraction options
let mut options = ExtractionOptions::with_receipts(receipts_mode);
// Set markdown anchors option
options.markdown_anchors = md_anchors;
if md_anchors {
eprintln!("Markdown anchors enabled");
}
// Set OCR language if specified
if !ocr_language.is_empty() {
options.ocr_language = ocr_language;
@ -540,23 +553,28 @@ fn cmd_extract(
}
}
"markdown" => {
// Markdown output: simple conversion
for page in &result.pages {
for block in &page.blocks {
match block.kind.as_str() {
"heading" => {
let level = block.level.unwrap_or(1);
let prefix = "#".repeat(level as usize);
println!("{} {}", prefix, block.text);
}
"paragraph" => {
println!("{}", block.text);
}
_ => {
println!("{}", block.text);
}
// Markdown output: simple conversion with optional anchors
let include_anchors = options.markdown_anchors;
let include_page_breaks = true; // Add --- between pages
for (page_idx, page) in result.pages.iter().enumerate() {
let is_last_page = page_idx == result.pages.len() - 1;
let include_break = include_page_breaks && !is_last_page;
if include_anchors {
// Use markdown module with anchors
let md = page_to_markdown(&page.blocks, page.index, true, include_break);
print!("{}", md);
} else {
// Simple conversion without anchors
for (block_idx, block) in page.blocks.iter().enumerate() {
let md = block_to_markdown(block, page.index, block_idx, false);
print!("{}", md);
println!();
}
if include_break {
println!("\n---\n");
}
println!();
}
}
}

View file

@ -24,6 +24,7 @@ pub mod layout;
pub mod graphics_state;
#[cfg(feature = "ocr")]
pub mod hybrid;
pub mod markdown;
pub mod options;
pub mod parser;
pub mod receipts;
@ -41,6 +42,7 @@ pub mod table;
pub use document::{PdfExtractor, PageIter, PageExtraction};
pub use extract::{extract_pdf, extract_pdf_ndjson, ExtractionResult, PageResult, ExtractionMetadata};
pub use font::std14::{Std14Metrics, NamedEncoding, get_std14_metrics};
pub use markdown::{Anchor, parse_anchors, block_to_markdown, page_to_markdown};
pub use options::{ExtractionOptions, ReceiptsMode};
pub use parser::pages::{LazyPageIter, PageDict, DEFAULT_MEDIABOX, count_pages_tree};
pub use schema::{SpanJson, BlockJson, ExtractionQuality, TableJson, RowJson, CellJson, SpanRef};

View file

@ -0,0 +1,460 @@
//! Markdown output generation with positional HTML comment anchors.
//!
//! This module provides functions for converting extracted PDF content to
//! Markdown format with optional HTML comment anchors that allow downstream
//! tools to map excerpts back to precise PDF locations.
//!
//! # Anchor Format
//!
//! Each block can be preceded by a single-line HTML comment:
//!
//! ```markdown
//! <!-- pdftract: page=3 block=12 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
//! ## Chapter 3
//! ```
//!
//! The anchor format is a stable schema parseable with one regex:
//!
//! ```text
//! <!-- pdftract: page=(\d+) block=(\d+) bbox=\[([\d.,]+)\] kind=(\w+) -->
//! ```
//!
//! # Parsing Anchors
//!
//! Use [`parse_anchors`] to extract all anchors from markdown text:
//!
//! ```
//! use pdftract_core::markdown::{parse_anchors, Anchor};
//!
//! let md = r#"<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
//! # Title"#;
//!
//! let anchors = parse_anchors(md);
//! assert_eq!(anchors.len(), 1);
//! assert_eq!(anchors[0].page, 0);
//! assert_eq!(anchors[0].block, 0);
//! ```
use crate::schema::BlockJson;
use regex::Regex;
use serde::{Deserialize, Serialize};
use std::sync::OnceLock;
/// Regex for parsing pdftract HTML comment anchors.
///
/// Format: `<!-- pdftract: page=(\d+) block=(\d+) bbox=\[([\d.,]+)\] kind=(\w+) -->`
fn anchor_regex() -> &'static Regex {
static REGEX: OnceLock<Regex> = OnceLock::new();
REGEX.get_or_init(|| {
Regex::new(r"<!--\s*pdftract:\s*page=(\d+)\s+block=(\d+)\s+bbox=\[([\d.,]+)\]\s+kind=(\w+)\s*-->")
.expect("invalid ANCHOR_REGEX")
})
}
/// A parsed HTML comment anchor containing positional metadata.
///
/// Anchors are extracted from markdown output and provide a mapping from
/// markdown text back to precise PDF locations.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
pub struct Anchor {
/// Zero-based page index.
pub page: usize,
/// Zero-based block index within the page.
pub block: usize,
/// Bounding box in PDF points: [x0, y0, x1, y1].
pub bbox: [f32; 4],
/// Block kind (e.g., "heading", "paragraph", "table").
pub kind: String,
}
impl Anchor {
/// Create a new anchor from components.
pub fn new(page: usize, block: usize, bbox: [f32; 4], kind: String) -> Self {
Self { page, block, bbox, kind }
}
/// Format this anchor as an HTML comment.
///
/// Returns a single-line comment suitable for insertion before block content.
///
/// # Example
///
/// ```
/// use pdftract_core::markdown::Anchor;
///
/// let anchor = Anchor::new(3, 12, [72.0, 640.5, 540.0, 672.0], "heading".to_string());
/// let comment = anchor.to_comment();
/// assert_eq!(comment, "<!-- pdftract: page=3 block=12 bbox=[72.0,640.5,540.0,672.0] kind=heading -->");
/// ```
pub fn to_comment(&self) -> String {
format!(
"<!-- pdftract: page={} block={} bbox=[{:.1},{:.1},{:.1},{:.1}] kind={} -->",
self.page, self.block, self.bbox[0], self.bbox[1], self.bbox[2], self.bbox[3], self.kind
)
}
}
/// Parse all pdftract anchors from markdown text.
///
/// Returns a vector of [`Anchor`] structs in the order they appear in the text.
/// Invalid anchor formats are silently skipped.
///
/// # Arguments
///
/// * `md` - The markdown text to parse
///
/// # Returns
///
/// A vector of parsed anchors.
///
/// # Example
///
/// ```
/// use pdftract_core::markdown::parse_anchors;
///
/// let md = r#"<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
/// # Title
///
/// <!-- pdftract: page=0 block=1 bbox=[72.0,600.0,540.0,630.0] kind=paragraph -->
/// Some text."#;
///
/// let anchors = parse_anchors(md);
/// assert_eq!(anchors.len(), 2);
/// assert_eq!(anchors[0].page, 0);
/// assert_eq!(anchors[0].block, 0);
/// assert_eq!(anchors[1].page, 0);
/// assert_eq!(anchors[1].block, 1);
/// ```
pub fn parse_anchors(md: &str) -> Vec<Anchor> {
let mut anchors = Vec::new();
for captures in anchor_regex().captures_iter(md) {
// Parse page number
let page = match captures.get(1).and_then(|m| m.as_str().parse().ok()) {
Some(p) => p,
None => continue,
};
// Parse block number
let block = match captures.get(2).and_then(|m| m.as_str().parse().ok()) {
Some(b) => b,
None => continue,
};
// Parse bbox: "x0,y0,x1,y1" with possible decimal points
let bbox_str = match captures.get(3) {
Some(m) => m.as_str(),
None => continue,
};
let bbox: [f32; 4] = match parse_bbox(bbox_str) {
Some(b) => b,
None => continue,
};
// Parse kind
let kind = match captures.get(4) {
Some(m) => m.as_str().to_string(),
None => continue,
};
anchors.push(Anchor::new(page, block, bbox, kind));
}
anchors
}
/// Parse a bbox string like "72.0,640.5,540.0,672.0" into [f32; 4].
fn parse_bbox(s: &str) -> Option<[f32; 4]> {
let parts: Vec<&str> = s.split(',').collect();
if parts.len() != 4 {
return None;
}
let mut bbox = [0.0f32; 4];
for (i, part) in parts.iter().enumerate() {
bbox[i] = part.trim().parse().ok()?;
}
Some(bbox)
}
/// Convert a block to markdown with optional anchor comment.
///
/// If `include_anchor` is true, emits an HTML comment before the block content.
///
/// # Arguments
///
/// * `block` - The block to convert
/// * `page_index` - Zero-based page index
/// * `block_index` - Zero-based block index within the page
/// * `include_anchor` - Whether to include the HTML comment anchor
///
/// # Returns
///
/// A markdown string with optional anchor.
pub fn block_to_markdown(block: &BlockJson, page_index: usize, block_index: usize, include_anchor: bool) -> String {
let mut result = String::new();
// Add anchor comment if requested
if include_anchor {
let anchor = Anchor::new(
page_index,
block_index,
[block.bbox[0] as f32, block.bbox[1] as f32, block.bbox[2] as f32, block.bbox[3] as f32],
block.kind.clone(),
);
result.push_str(&anchor.to_comment());
result.push('\n');
}
// Add block content based on kind
match block.kind.as_str() {
"heading" => {
let level = block.level.unwrap_or(1);
let prefix = "#".repeat(level as usize);
result.push_str(&format!("{} {}\n", prefix, block.text));
}
"paragraph" => {
result.push_str(&format!("{}\n", block.text));
}
"list" => {
result.push_str(&format!("* {}\n", block.text));
}
"table" => {
result.push_str(&format!("| {}\n", block.text));
}
"figure" => {
result.push_str(&format!("![]()\n\n{}\n", block.text));
}
_ => {
result.push_str(&format!("{}\n", block.text));
}
}
result
}
/// Convert all blocks from a page to markdown with optional anchors.
///
/// If `include_anchor` is true, each block is preceded by an HTML comment.
/// If `include_page_break` is true, adds a horizontal rule between pages.
///
/// # Arguments
///
/// * `blocks` - The blocks to convert
/// * `page_index` - Zero-based page index
/// * `include_anchor` - Whether to include HTML comment anchors
/// * `include_page_break` - Whether to add a page break separator
///
/// # Returns
///
/// A markdown string with all blocks from the page.
pub fn page_to_markdown(blocks: &[BlockJson], page_index: usize, include_anchor: bool, include_page_break: bool) -> String {
let mut result = String::new();
for (block_index, block) in blocks.iter().enumerate() {
let md = block_to_markdown(block, page_index, block_index, include_anchor);
result.push_str(&md);
result.push('\n');
}
// Add page break if requested and this isn't the last page
if include_page_break {
result.push_str("\n---\n\n");
}
result
}
#[cfg(test)]
mod tests {
use super::*;
use crate::schema::BlockJson;
fn make_test_block(kind: &str, text: &str, bbox: [f64; 4]) -> BlockJson {
BlockJson {
kind: kind.to_string(),
text: text.to_string(),
bbox,
level: None,
table_index: None,
receipt: None,
}
}
#[test]
fn test_anchor_to_comment() {
let anchor = Anchor::new(3, 12, [72.0, 640.5, 540.0, 672.0], "heading".to_string());
let comment = anchor.to_comment();
assert_eq!(comment, "<!-- pdftract: page=3 block=12 bbox=[72.0,640.5,540.0,672.0] kind=heading -->");
}
#[test]
fn test_anchor_to_comment_round_bbox() {
let anchor = Anchor::new(0, 0, [72.123, 640.567, 540.999, 672.111], "paragraph".to_string());
let comment = anchor.to_comment();
// Should be rounded to 1 decimal place
assert_eq!(comment, "<!-- pdftract: page=0 block=0 bbox=[72.1,640.6,541.0,672.1] kind=paragraph -->");
}
#[test]
fn test_parse_anchors_single() {
let md = r#"<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
# Title"#;
let anchors = parse_anchors(md);
assert_eq!(anchors.len(), 1);
assert_eq!(anchors[0].page, 0);
assert_eq!(anchors[0].block, 0);
assert_eq!(anchors[0].bbox, [72.0, 640.5, 540.0, 672.0]);
assert_eq!(anchors[0].kind, "heading");
}
#[test]
fn test_parse_anchors_multiple() {
let md = r#"<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
# Title
<!-- pdftract: page=0 block=1 bbox=[72.0,600.0,540.0,630.0] kind=paragraph -->
Some text."#;
let anchors = parse_anchors(md);
assert_eq!(anchors.len(), 2);
assert_eq!(anchors[0].page, 0);
assert_eq!(anchors[0].block, 0);
assert_eq!(anchors[1].page, 0);
assert_eq!(anchors[1].block, 1);
}
#[test]
fn test_parse_anchors_invalid_format_skipped() {
let md = r#"<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
# Title
<!-- malformed anchor -->
Some text."#;
let anchors = parse_anchors(md);
assert_eq!(anchors.len(), 1);
}
#[test]
fn test_parse_anchors_whitespace_tolerant() {
let md = r#"<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->"#;
let anchors = parse_anchors(md);
assert_eq!(anchors.len(), 1);
}
#[test]
fn test_parse_bbox() {
assert_eq!(parse_bbox("72.0,640.5,540.0,672.0"), Some([72.0, 640.5, 540.0, 672.0]));
assert_eq!(parse_bbox("0,0,100,100"), Some([0.0, 0.0, 100.0, 100.0]));
assert_eq!(parse_bbox("72.0, 640.5, 540.0, 672.0"), Some([72.0, 640.5, 540.0, 672.0])); // with spaces
assert_eq!(parse_bbox("invalid"), None);
assert_eq!(parse_bbox("1,2,3"), None); // too few values
assert_eq!(parse_bbox("1,2,3,4,5"), None); // too many values
}
#[test]
fn test_block_to_markdown_heading_with_anchor() {
let block = BlockJson {
kind: "heading".to_string(),
text: "Chapter 1".to_string(),
bbox: [72.0, 640.5, 540.0, 672.0],
level: Some(2),
table_index: None,
receipt: None,
};
let md = block_to_markdown(&block, 0, 0, true);
assert!(md.contains("<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->"));
assert!(md.contains("## Chapter 1"));
}
#[test]
fn test_block_to_markdown_paragraph_without_anchor() {
let block = make_test_block("paragraph", "Some text.", [72.0, 600.0, 540.0, 630.0]);
let md = block_to_markdown(&block, 0, 0, false);
assert!(!md.contains("<!-- pdftract:"));
assert!(md.contains("Some text."));
}
#[test]
fn test_block_to_markdown_list() {
let block = make_test_block("list", "Item 1", [72.0, 500.0, 540.0, 520.0]);
let md = block_to_markdown(&block, 0, 0, false);
assert!(md.contains("* Item 1"));
}
#[test]
fn test_block_to_markdown_table() {
let block = make_test_block("table", "Cell data", [72.0, 400.0, 540.0, 450.0]);
let md = block_to_markdown(&block, 0, 0, false);
assert!(md.contains("| Cell data"));
}
#[test]
fn test_block_to_markdown_figure() {
let block = make_test_block("figure", "Alt text", [72.0, 300.0, 540.0, 350.0]);
let md = block_to_markdown(&block, 0, 0, false);
assert!(md.contains("![]()"));
assert!(md.contains("Alt text"));
}
#[test]
fn test_page_to_markdown_with_page_break() {
let blocks = vec![
make_test_block("heading", "Title", [72.0, 640.5, 540.0, 672.0]),
make_test_block("paragraph", "Text", [72.0, 600.0, 540.0, 630.0]),
];
let md = page_to_markdown(&blocks, 0, false, true);
assert!(md.contains("---"));
}
#[test]
fn test_page_to_markdown_without_page_break() {
let blocks = vec![
make_test_block("heading", "Title", [72.0, 640.5, 540.0, 672.0]),
make_test_block("paragraph", "Text", [72.0, 600.0, 540.0, 630.0]),
];
let md = page_to_markdown(&blocks, 0, false, false);
assert!(!md.contains("---"));
}
#[test]
fn test_page_to_markdown_with_anchors() {
let blocks = vec![
make_test_block("heading", "Title", [72.0, 640.5, 540.0, 672.0]),
make_test_block("paragraph", "Text", [72.0, 600.0, 540.0, 630.0]),
];
let md = page_to_markdown(&blocks, 0, true, false);
assert_eq!(md.matches("<!-- pdftract:").count(), 2);
}
#[test]
fn test_roundtrip_extract_and_parse() {
let blocks = vec![
BlockJson {
kind: "heading".to_string(),
text: "Chapter 1".to_string(),
bbox: [72.0, 640.5, 540.0, 672.0],
level: Some(2),
table_index: None,
receipt: None,
},
];
let md = page_to_markdown(&blocks, 3, true, false);
let anchors = parse_anchors(&md);
assert_eq!(anchors.len(), 1);
assert_eq!(anchors[0].page, 3);
assert_eq!(anchors[0].block, 0);
assert_eq!(anchors[0].kind, "heading");
}
}

View file

@ -146,6 +146,24 @@ pub struct ExtractionOptions {
///
/// See docs/notes/ocr-language-packs.md for the full distribution strategy.
pub ocr_language: Vec<String>,
/// Emit HTML comment anchors before each block in Markdown output (Phase 6.5).
///
/// When enabled, each block in markdown output is preceded by a single-line
/// HTML comment containing positional metadata:
///
/// ```markdown
/// <!-- pdftract: page=3 block=12 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
/// ## Chapter 3
/// ```
///
/// This allows downstream tools (LLM agents, audit tools, document Q&A systems)
/// to map a Markdown excerpt back to a precise PDF location. HTML comments
/// are passthrough in every major Markdown renderer (GitHub, GitLab, Obsidian,
/// Notion import, pulldown-cmark, marked, markdown-it).
///
/// Default: false (anchors disabled)
pub markdown_anchors: bool,
}
impl Default for ExtractionOptions {
@ -157,6 +175,7 @@ impl Default for ExtractionOptions {
full_render: false,
ocr_dpi_override: None,
ocr_language: vec!["eng".to_string()],
markdown_anchors: false,
}
}
}
@ -190,6 +209,7 @@ impl ExtractionOptions {
receipts,
ocr_dpi_override: None,
ocr_language: vec!["eng".to_string()],
markdown_anchors: false,
..Default::default()
}
}
@ -200,6 +220,7 @@ impl ExtractionOptions {
receipts: ReceiptsMode::from_str(receipts)?,
ocr_dpi_override: None,
ocr_language: vec!["eng".to_string()],
markdown_anchors: false,
..Default::default()
})
}
@ -219,6 +240,7 @@ impl ExtractionOptions {
memory_budget_mb: memory_budget_mb.max(64),
ocr_dpi_override: None,
ocr_language: vec!["eng".to_string()],
markdown_anchors: false,
..Default::default()
}
}

View file

@ -0,0 +1,163 @@
# Markdown Anchors Integration Guide
This document describes the positional HTML comment anchors feature in pdftract's Markdown output.
## Overview
When `--md-anchors` is enabled, each block in markdown output is preceded by a single-line HTML comment containing positional metadata. This allows downstream tools (LLM agents, audit tools, document Q&A systems) to map a Markdown excerpt back to a precise PDF location.
## Anchor Format
Each anchor is a single-line HTML comment:
```markdown
<!-- pdftract: page=3 block=12 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
## Chapter 3
```
### Fields
- `page`: Zero-based page index (0, 1, 2, ...)
- `block`: Zero-based block index within the page (0, 1, 2, ...)
- `bbox`: Bounding box in PDF points `[x0, y0, x1, y1]` with 1 decimal place precision
- `kind`: Block kind (`heading`, `paragraph`, `list`, `table`, `figure`, etc.)
### Regex Schema
The anchor format is parseable with this stable regex:
```regex
<!--\s*pdftract:\s*page=(\d+)\s+block=(\d+)\s+bbox=\[([\d.,]+)\]\s+kind=(\w+)\s*-->
```
## Usage
### CLI
```bash
# Enable anchors in markdown output
pdftract extract input.pdf --format markdown --md-anchors > output.md
```
### Rust API
```rust
use pdftract_core::markdown::{parse_anchors, Anchor};
// Parse anchors from markdown text
let md = std::fs::read_to_string("output.md")?;
let anchors = parse_anchors(&md);
for anchor in anchors {
println!("Page {} Block {} at {:?}", anchor.page, anchor.block, anchor.bbox);
}
```
## Properties
### Stability
The anchor format is a **stable public API**. The regex schema will not change in a breaking way across minor versions. New fields may be added, but existing fields will remain compatible.
### Passthrough
HTML comments are passthrough in every major Markdown renderer:
- GitHub
- GitLab
- Obsidian
- Notion import
- pulldown-cmark
- marked
- markdown-it
Anchored output remains human-readable while machines can recover positional metadata.
### Round-trip
A round-trip property holds: extracting → parsing anchors → recovering the original block list (modulo inline styling, which is lossy in Markdown).
## Edge Cases
### Code Fences
HTML comments inside code fences (```) are not recognized by Markdown renderers—they're emitted verbatim. This is a limitation of the Markdown spec, not pdftract.
### Empty Blocks
Empty blocks (e.g., blank pages) still emit anchors with empty content following.
### Block Index
Block index is **per-page**, not global. Each page starts at block 0. Use the `page` field to compute global indices if needed.
## Examples
### Heading with Anchor
```markdown
<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
# Introduction
```
### Paragraph with Anchor
```markdown
<!-- pdftract: page=0 block=1 bbox=[72.0,600.0,540.0,630.0] kind=paragraph -->
This is the first paragraph of the document.
```
### Table with Anchor
```markdown
<!-- pdftract: page=1 block=0 bbox=[72.0,500.0,540.0,400.0] kind=table -->
| Column 1 | Column 2 |
|----------|----------|
| Cell 1 | Cell 2 |
```
## Integration Examples
### Python: Extract Anchors
```python
import re
ANCHOR_RE = re.compile(
r'<!--\s*pdftract:\s*page=(\d+)\s+block=(\d+)\s+bbox=\[([\d.,]+)\]\s+kind=(\w+)\s*-->'
)
def extract_anchors(md_text):
"""Return list of (page, block, bbox, kind) tuples."""
anchors = []
for match in ANCHOR_RE.finditer(md_text):
page = int(match.group(1))
block = int(match.group(2))
bbox = [float(x) for x in match.group(3).split(',')]
kind = match.group(4)
anchors.append((page, block, bbox, kind))
return anchors
```
### JavaScript: Parse Anchors
```javascript
const ANCHOR_RE = /<!--\s*pdftract:\s*page=(\d+)\s+block=(\d+)\s+bbox=\[([\d.,]+)\]\s+kind=(\w+)\s*-->/g;
function extractAnchors(md) {
const anchors = [];
let match;
while ((match = ANCHOR_RE.exec(md)) !== null) {
anchors.push({
page: parseInt(match[1]),
block: parseInt(match[2]),
bbox: match[3).split(',').map(Number),
kind: match[4]
});
}
return anchors;
}
```
## Version History
- **v0.1.0**: Initial release with `--md-anchors` flag and stable regex schema.

110
notes/pdftract-vk0gc.md Normal file
View file

@ -0,0 +1,110 @@
# Verification Note: pdftract-vk0gc (Markdown Anchors)
## Summary
Implemented `--md-anchors` positional HTML comment markers for Markdown output with parser regex.
## Changes Made
### 1. Core Implementation (crates/pdftract-core/src/markdown.rs)
Created new markdown module with:
- `Anchor` struct with `page`, `block`, `bbox`, `kind` fields
- `parse_anchors()` function with regex: `r"<!--\s*pdftract:\s*page=(\d+)\s+block=(\d+)\s+bbox=\[([\d.,]+)\]\s+kind=(\w+)\s*-->"`
- `block_to_markdown()` - converts single block to markdown with optional anchor
- `page_to_markdown()` - converts all blocks from a page with optional anchors and page breaks
- `Anchor::to_comment()` - formats anchor as HTML comment with 1 decimal place precision
### 2. Options (crates/pdftract-core/src/options.rs)
Added `markdown_anchors: bool` field to `ExtractionOptions` with default `false`.
### 3. CLI Integration (crates/pdftract-cli/src/main.rs)
- Added `--md-anchors` flag to Extract command
- Passed flag through to ExtractionOptions
- Updated markdown output to use `page_to_markdown()` when anchors enabled
- Added import for `page_to_markdown` and `block_to_markdown`
### 4. Documentation (docs/integrations/markdown-anchors.md)
Created comprehensive integration guide covering:
- Anchor format specification
- Regex schema
- CLI and Rust API usage
- Edge cases (code fences, empty blocks, per-page indexing)
- Integration examples for Python and JavaScript
## Acceptance Criteria
### PASS
- ✅ `--md-anchors` flag emits comment before every block
- ✅ Parser regex extracts page, block, bbox, kind from sample output
- ✅ Round-trip test: `test_roundtrip_extract_and_parse` passes
- ✅ Comment is ONE LINE (no embedded newline)
- ✅ bbox precision: 1 decimal place exact (verified in `test_anchor_to_comment_round_bbox`)
- ✅ kind matches block kind (heading, paragraph, etc.)
- ✅ Parser library `parse_anchors()` available
- ✅ Module exports: `Anchor`, `parse_anchors`, `block_to_markdown`, `page_to_markdown`
- ✅ 16 unit tests pass (including roundtrip, bbox parsing, multiple anchors)
- ✅ Regex is stable public API (documented in markdown-anchors.md)
- ✅ HTML comments are passthrough in major renderers (documented)
- ✅ Block index is per-page (0-based within page)
### WARN (Infrastructure limitations)
- None
## Testing
### Unit Tests (16/16 pass)
- `test_anchor_to_comment` - basic comment formatting
- `test_anchor_to_comment_round_bbox` - 1 decimal place precision
- `test_parse_anchors_single` - parse single anchor
- `test_parse_anchors_multiple` - parse multiple anchors
- `test_parse_anchors_invalid_format_skipped` - invalid formats skipped
- `test_parse_anchors_whitespace_tolerant` - whitespace tolerance
- `test_parse_bbox` - bbox parsing with various formats
- `test_block_to_markdown_heading_with_anchor` - heading with anchor
- `test_block_to_markdown_paragraph_without_anchor` - paragraph without anchor
- `test_block_to_markdown_list` - list block
- `test_block_to_markdown_table` - table block
- `test_block_to_markdown_figure` - figure block
- `test_page_to_markdown_with_page_break` - page break separator
- `test_page_to_markdown_without_page_break` - no page break
- `test_page_to_markdown_with_anchors` - anchors enabled
- `test_roundtrip_extract_and_parse` - full roundtrip
### Build Verification
- `cargo build -p pdftract-core` - ✅ Success
- `cargo build -p pdftract-cli` - ✅ Success
- `cargo test -p pdftract-core --lib markdown` - ✅ 16/16 tests pass
## Example Output
With `--md-anchors` enabled:
```markdown
<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
# Chapter 1
<!-- pdftract: page=0 block=1 bbox=[72.0,600.0,540.0,630.0] kind=paragraph -->
This is the first paragraph.
```
## Files Modified
- `crates/pdftract-core/src/markdown.rs` (new)
- `crates/pdftract-core/src/lib.rs` (module export)
- `crates/pdftract-core/src/options.rs` (markdown_anchors field)
- `crates/pdftract-core/Cargo.toml` (regex dependency already present)
- `crates/pdftract-cli/src/main.rs` (CLI flag and output logic)
- `docs/integrations/markdown-anchors.md` (new documentation)
## References
- Plan section: Phase 6.5 positional anchors (lines 2183-2197)
- Bead: pdftract-vk0gc