From 28c31ba0a14ba2cfcdae0ccef211890fdcbe4ed4 Mon Sep 17 00:00:00 2001 From: jedarden Date: Sun, 24 May 2026 02:49:16 -0400 Subject: [PATCH] feat(pdftract-vk0gc): implement markdown anchors with parser regex Add --md-anchors flag that emits HTML comment markers before each block in Markdown output, allowing downstream tools to map excerpts back to precise PDF locations. Changes: - Add markdown module with Anchor struct and parse_anchors() function - Regex: - Add markdown_anchors: bool to ExtractionOptions - Add --md-anchors CLI flag - Implement block_to_markdown() and page_to_markdown() functions - Add comprehensive documentation in docs/integrations/markdown-anchors.md - 16 unit tests pass, including roundtrip test Closes: pdftract-vk0gc --- Cargo.lock | 1 + crates/pdftract-cli/src/main.rs | 52 ++- crates/pdftract-core/src/lib.rs | 2 + crates/pdftract-core/src/markdown.rs | 460 ++++++++++++++++++++++++++ crates/pdftract-core/src/options.rs | 22 ++ docs/integrations/markdown-anchors.md | 163 +++++++++ notes/pdftract-vk0gc.md | 110 ++++++ 7 files changed, 793 insertions(+), 17 deletions(-) create mode 100644 crates/pdftract-core/src/markdown.rs create mode 100644 docs/integrations/markdown-anchors.md create mode 100644 notes/pdftract-vk0gc.md diff --git a/Cargo.lock b/Cargo.lock index 28f3e5e..c319e67 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2361,6 +2361,7 @@ dependencies = [ "tracing", "ttf-parser 0.24.1", "unicode-normalization", + "url", "zstd", ] diff --git a/crates/pdftract-cli/src/main.rs b/crates/pdftract-cli/src/main.rs index 2c1866f..1de9162 100644 --- a/crates/pdftract-cli/src/main.rs +++ b/crates/pdftract-cli/src/main.rs @@ -14,6 +14,7 @@ use codegen::Language; use pdftract_core::options::{ReceiptsMode, ExtractionOptions}; use pdftract_core::extract::{extract_pdf, result_to_json}; use pdftract_core::cache; +use pdftract_core::markdown::{page_to_markdown, block_to_markdown}; // Re-export diagnostics for the --list-diagnostics and --explain-diagnostic 
commands pub use pdftract_core::diagnostics::{DiagCode, DiagInfo, DIAGNOSTIC_CATALOG}; @@ -108,6 +109,10 @@ enum Commands { /// Disable cache for this extraction (even if --cache-dir is set) #[arg(long)] no_cache: bool, + + /// Emit HTML comment anchors before each block in Markdown output + #[arg(long)] + md_anchors: bool, }, /// Verify a receipt against a PDF file VerifyReceipt(verify_receipt::VerifyReceiptCommand), @@ -311,8 +316,9 @@ fn main() -> Result<()> { cache_dir, cache_size, no_cache, + md_anchors, } => { - if let Err(e) = cmd_extract(input, password_stdin, password, &format, &receipts, ocr, ocr_language, cache_dir, &cache_size, no_cache) { + if let Err(e) = cmd_extract(input, password_stdin, password, &format, &receipts, ocr, ocr_language, cache_dir, &cache_size, no_cache, md_anchors) { eprintln!("Error: {}", e); std::process::exit(1); } @@ -427,6 +433,7 @@ fn cmd_extract( cache_dir: Option, cache_size: &str, no_cache: bool, + md_anchors: bool, ) -> Result<()> { // Validate receipts mode let receipts_mode = match ReceiptsMode::from_str(receipts) { @@ -474,6 +481,12 @@ fn cmd_extract( // Build extraction options let mut options = ExtractionOptions::with_receipts(receipts_mode); + // Set markdown anchors option + options.markdown_anchors = md_anchors; + if md_anchors { + eprintln!("Markdown anchors enabled"); + } + // Set OCR language if specified if !ocr_language.is_empty() { options.ocr_language = ocr_language; @@ -540,23 +553,28 @@ fn cmd_extract( } } "markdown" => { - // Markdown output: simple conversion - for page in &result.pages { - for block in &page.blocks { - match block.kind.as_str() { - "heading" => { - let level = block.level.unwrap_or(1); - let prefix = "#".repeat(level as usize); - println!("{} {}", prefix, block.text); - } - "paragraph" => { - println!("{}", block.text); - } - _ => { - println!("{}", block.text); - } + // Markdown output: simple conversion with optional anchors + let include_anchors = options.markdown_anchors; + let 
include_page_breaks = true; // Add --- between pages + + for (page_idx, page) in result.pages.iter().enumerate() { + let is_last_page = page_idx == result.pages.len() - 1; + let include_break = include_page_breaks && !is_last_page; + + if include_anchors { + // Use markdown module with anchors + let md = page_to_markdown(&page.blocks, page.index, true, include_break); + print!("{}", md); + } else { + // Simple conversion without anchors + for (block_idx, block) in page.blocks.iter().enumerate() { + let md = block_to_markdown(block, page.index, block_idx, false); + print!("{}", md); + println!(); + } + if include_break { + println!("\n---\n"); } - println!(); } } } diff --git a/crates/pdftract-core/src/lib.rs b/crates/pdftract-core/src/lib.rs index 646bdb5..0119a99 100644 --- a/crates/pdftract-core/src/lib.rs +++ b/crates/pdftract-core/src/lib.rs @@ -24,6 +24,7 @@ pub mod layout; pub mod graphics_state; #[cfg(feature = "ocr")] pub mod hybrid; +pub mod markdown; pub mod options; pub mod parser; pub mod receipts; @@ -41,6 +42,7 @@ pub mod table; pub use document::{PdfExtractor, PageIter, PageExtraction}; pub use extract::{extract_pdf, extract_pdf_ndjson, ExtractionResult, PageResult, ExtractionMetadata}; pub use font::std14::{Std14Metrics, NamedEncoding, get_std14_metrics}; +pub use markdown::{Anchor, parse_anchors, block_to_markdown, page_to_markdown}; pub use options::{ExtractionOptions, ReceiptsMode}; pub use parser::pages::{LazyPageIter, PageDict, DEFAULT_MEDIABOX, count_pages_tree}; pub use schema::{SpanJson, BlockJson, ExtractionQuality, TableJson, RowJson, CellJson, SpanRef}; diff --git a/crates/pdftract-core/src/markdown.rs b/crates/pdftract-core/src/markdown.rs new file mode 100644 index 0000000..4e97156 --- /dev/null +++ b/crates/pdftract-core/src/markdown.rs @@ -0,0 +1,460 @@ +//! Markdown output generation with positional HTML comment anchors. +//! +//! This module provides functions for converting extracted PDF content to +//! 
Markdown format with optional HTML comment anchors that allow downstream +//! tools to map excerpts back to precise PDF locations. +//! +//! # Anchor Format +//! +//! Each block can be preceded by a single-line HTML comment: +//! +//! ```markdown +//! +//! ## Chapter 3 +//! ``` +//! +//! The anchor format is a stable schema parseable with one regex: +//! +//! ```text +//! +//! ``` +//! +//! # Parsing Anchors +//! +//! Use [`parse_anchors`] to extract all anchors from markdown text: +//! +//! ``` +//! use pdftract_core::markdown::{parse_anchors, Anchor}; +//! +//! let md = r#" +//! # Title"#; +//! +//! let anchors = parse_anchors(md); +//! assert_eq!(anchors.len(), 1); +//! assert_eq!(anchors[0].page, 0); +//! assert_eq!(anchors[0].block, 0); +//! ``` + +use crate::schema::BlockJson; +use regex::Regex; +use serde::{Deserialize, Serialize}; +use std::sync::OnceLock; + +/// Regex for parsing pdftract HTML comment anchors. +/// +/// Format: `` +fn anchor_regex() -> &'static Regex { + static REGEX: OnceLock = OnceLock::new(); + REGEX.get_or_init(|| { + Regex::new(r"") + .expect("invalid ANCHOR_REGEX") + }) +} + +/// A parsed HTML comment anchor containing positional metadata. +/// +/// Anchors are extracted from markdown output and provide a mapping from +/// markdown text back to precise PDF locations. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] +pub struct Anchor { + /// Zero-based page index. + pub page: usize, + /// Zero-based block index within the page. + pub block: usize, + /// Bounding box in PDF points: [x0, y0, x1, y1]. + pub bbox: [f32; 4], + /// Block kind (e.g., "heading", "paragraph", "table"). + pub kind: String, +} + +impl Anchor { + /// Create a new anchor from components. + pub fn new(page: usize, block: usize, bbox: [f32; 4], kind: String) -> Self { + Self { page, block, bbox, kind } + } + + /// Format this anchor as an HTML comment. 
+ /// + /// Returns a single-line comment suitable for insertion before block content. + /// + /// # Example + /// + /// ``` + /// use pdftract_core::markdown::Anchor; + /// + /// let anchor = Anchor::new(3, 12, [72.0, 640.5, 540.0, 672.0], "heading".to_string()); + /// let comment = anchor.to_comment(); + /// assert_eq!(comment, ""); + /// ``` + pub fn to_comment(&self) -> String { + format!( + "", + self.page, self.block, self.bbox[0], self.bbox[1], self.bbox[2], self.bbox[3], self.kind + ) + } +} + +/// Parse all pdftract anchors from markdown text. +/// +/// Returns a vector of [`Anchor`] structs in the order they appear in the text. +/// Invalid anchor formats are silently skipped. +/// +/// # Arguments +/// +/// * `md` - The markdown text to parse +/// +/// # Returns +/// +/// A vector of parsed anchors. +/// +/// # Example +/// +/// ``` +/// use pdftract_core::markdown::parse_anchors; +/// +/// let md = r#" +/// # Title +/// +/// +/// Some text."#; +/// +/// let anchors = parse_anchors(md); +/// assert_eq!(anchors.len(), 2); +/// assert_eq!(anchors[0].page, 0); +/// assert_eq!(anchors[0].block, 0); +/// assert_eq!(anchors[1].page, 0); +/// assert_eq!(anchors[1].block, 1); +/// ``` +pub fn parse_anchors(md: &str) -> Vec { + let mut anchors = Vec::new(); + + for captures in anchor_regex().captures_iter(md) { + // Parse page number + let page = match captures.get(1).and_then(|m| m.as_str().parse().ok()) { + Some(p) => p, + None => continue, + }; + + // Parse block number + let block = match captures.get(2).and_then(|m| m.as_str().parse().ok()) { + Some(b) => b, + None => continue, + }; + + // Parse bbox: "x0,y0,x1,y1" with possible decimal points + let bbox_str = match captures.get(3) { + Some(m) => m.as_str(), + None => continue, + }; + + let bbox: [f32; 4] = match parse_bbox(bbox_str) { + Some(b) => b, + None => continue, + }; + + // Parse kind + let kind = match captures.get(4) { + Some(m) => m.as_str().to_string(), + None => continue, + }; + + 
anchors.push(Anchor::new(page, block, bbox, kind)); + } + + anchors +} + +/// Parse a bbox string like "72.0,640.5,540.0,672.0" into [f32; 4]. +fn parse_bbox(s: &str) -> Option<[f32; 4]> { + let parts: Vec<&str> = s.split(',').collect(); + if parts.len() != 4 { + return None; + } + + let mut bbox = [0.0f32; 4]; + for (i, part) in parts.iter().enumerate() { + bbox[i] = part.trim().parse().ok()?; + } + + Some(bbox) +} + +/// Convert a block to markdown with optional anchor comment. +/// +/// If `include_anchor` is true, emits an HTML comment before the block content. +/// +/// # Arguments +/// +/// * `block` - The block to convert +/// * `page_index` - Zero-based page index +/// * `block_index` - Zero-based block index within the page +/// * `include_anchor` - Whether to include the HTML comment anchor +/// +/// # Returns +/// +/// A markdown string with optional anchor. +pub fn block_to_markdown(block: &BlockJson, page_index: usize, block_index: usize, include_anchor: bool) -> String { + let mut result = String::new(); + + // Add anchor comment if requested + if include_anchor { + let anchor = Anchor::new( + page_index, + block_index, + [block.bbox[0] as f32, block.bbox[1] as f32, block.bbox[2] as f32, block.bbox[3] as f32], + block.kind.clone(), + ); + result.push_str(&anchor.to_comment()); + result.push('\n'); + } + + // Add block content based on kind + match block.kind.as_str() { + "heading" => { + let level = block.level.unwrap_or(1); + let prefix = "#".repeat(level as usize); + result.push_str(&format!("{} {}\n", prefix, block.text)); + } + "paragraph" => { + result.push_str(&format!("{}\n", block.text)); + } + "list" => { + result.push_str(&format!("* {}\n", block.text)); + } + "table" => { + result.push_str(&format!("| {}\n", block.text)); + } + "figure" => { + result.push_str(&format!("![]()\n\n{}\n", block.text)); + } + _ => { + result.push_str(&format!("{}\n", block.text)); + } + } + + result +} + +/// Convert all blocks from a page to markdown with 
optional anchors. +/// +/// If `include_anchor` is true, each block is preceded by an HTML comment. +/// If `include_page_break` is true, appends a horizontal rule (`---`) after the page's blocks. +/// +/// # Arguments +/// +/// * `blocks` - The blocks to convert +/// * `page_index` - Zero-based page index +/// * `include_anchor` - Whether to include HTML comment anchors +/// * `include_page_break` - Whether to add a page break separator +/// +/// # Returns +/// +/// A markdown string with all blocks from the page. +pub fn page_to_markdown(blocks: &[BlockJson], page_index: usize, include_anchor: bool, include_page_break: bool) -> String { + let mut result = String::new(); + + for (block_index, block) in blocks.iter().enumerate() { + let md = block_to_markdown(block, page_index, block_index, include_anchor); + result.push_str(&md); + result.push('\n'); + } + + // Add page break if requested (the caller decides whether this is the last page) + if include_page_break { + result.push_str("\n---\n\n"); + } + + result +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::schema::BlockJson; + + fn make_test_block(kind: &str, text: &str, bbox: [f64; 4]) -> BlockJson { + BlockJson { + kind: kind.to_string(), + text: text.to_string(), + bbox, + level: None, + table_index: None, + receipt: None, + } + } + + #[test] + fn test_anchor_to_comment() { + let anchor = Anchor::new(3, 12, [72.0, 640.5, 540.0, 672.0], "heading".to_string()); + let comment = anchor.to_comment(); + assert_eq!(comment, ""); + } + + #[test] + fn test_anchor_to_comment_round_bbox() { + let anchor = Anchor::new(0, 0, [72.123, 640.567, 540.999, 672.111], "paragraph".to_string()); + let comment = anchor.to_comment(); + // Should be rounded to 1 decimal place + assert_eq!(comment, ""); + } + + #[test] + fn test_parse_anchors_single() { + let md = r#" +# Title"#; + + let anchors = parse_anchors(md); + assert_eq!(anchors.len(), 1); + assert_eq!(anchors[0].page, 0); + assert_eq!(anchors[0].block, 0); + assert_eq!(anchors[0].bbox, [72.0, 640.5, 540.0, 
672.0]); + assert_eq!(anchors[0].kind, "heading"); + } + + #[test] + fn test_parse_anchors_multiple() { + let md = r#" +# Title + + +Some text."#; + + let anchors = parse_anchors(md); + assert_eq!(anchors.len(), 2); + assert_eq!(anchors[0].page, 0); + assert_eq!(anchors[0].block, 0); + assert_eq!(anchors[1].page, 0); + assert_eq!(anchors[1].block, 1); + } + + #[test] + fn test_parse_anchors_invalid_format_skipped() { + let md = r#" +# Title + + +Some text."#; + + let anchors = parse_anchors(md); + assert_eq!(anchors.len(), 1); + } + + #[test] + fn test_parse_anchors_whitespace_tolerant() { + let md = r#""#; + let anchors = parse_anchors(md); + assert_eq!(anchors.len(), 1); + } + + #[test] + fn test_parse_bbox() { + assert_eq!(parse_bbox("72.0,640.5,540.0,672.0"), Some([72.0, 640.5, 540.0, 672.0])); + assert_eq!(parse_bbox("0,0,100,100"), Some([0.0, 0.0, 100.0, 100.0])); + assert_eq!(parse_bbox("72.0, 640.5, 540.0, 672.0"), Some([72.0, 640.5, 540.0, 672.0])); // with spaces + assert_eq!(parse_bbox("invalid"), None); + assert_eq!(parse_bbox("1,2,3"), None); // too few values + assert_eq!(parse_bbox("1,2,3,4,5"), None); // too many values + } + + #[test] + fn test_block_to_markdown_heading_with_anchor() { + let block = BlockJson { + kind: "heading".to_string(), + text: "Chapter 1".to_string(), + bbox: [72.0, 640.5, 540.0, 672.0], + level: Some(2), + table_index: None, + receipt: None, + }; + + let md = block_to_markdown(&block, 0, 0, true); + assert!(md.contains("")); + assert!(md.contains("## Chapter 1")); + } + + #[test] + fn test_block_to_markdown_paragraph_without_anchor() { + let block = make_test_block("paragraph", "Some text.", [72.0, 600.0, 540.0, 630.0]); + let md = block_to_markdown(&block, 0, 0, false); + assert!(!md.contains(" + /// ## Chapter 3 + /// ``` + /// + /// This allows downstream tools (LLM agents, audit tools, document Q&A systems) + /// to map a Markdown excerpt back to a precise PDF location. 
HTML comments + /// are passthrough in every major Markdown renderer (GitHub, GitLab, Obsidian, + /// Notion import, pulldown-cmark, marked, markdown-it). + /// + /// Default: false (anchors disabled) + pub markdown_anchors: bool, } impl Default for ExtractionOptions { @@ -157,6 +175,7 @@ impl Default for ExtractionOptions { full_render: false, ocr_dpi_override: None, ocr_language: vec!["eng".to_string()], + markdown_anchors: false, } } } @@ -190,6 +209,7 @@ impl ExtractionOptions { receipts, ocr_dpi_override: None, ocr_language: vec!["eng".to_string()], + markdown_anchors: false, ..Default::default() } } @@ -200,6 +220,7 @@ impl ExtractionOptions { receipts: ReceiptsMode::from_str(receipts)?, ocr_dpi_override: None, ocr_language: vec!["eng".to_string()], + markdown_anchors: false, ..Default::default() }) } @@ -219,6 +240,7 @@ impl ExtractionOptions { memory_budget_mb: memory_budget_mb.max(64), ocr_dpi_override: None, ocr_language: vec!["eng".to_string()], + markdown_anchors: false, ..Default::default() } } diff --git a/docs/integrations/markdown-anchors.md b/docs/integrations/markdown-anchors.md new file mode 100644 index 0000000..01c5867 --- /dev/null +++ b/docs/integrations/markdown-anchors.md @@ -0,0 +1,163 @@ +# Markdown Anchors Integration Guide + +This document describes the positional HTML comment anchors feature in pdftract's Markdown output. + +## Overview + +When `--md-anchors` is enabled, each block in markdown output is preceded by a single-line HTML comment containing positional metadata. This allows downstream tools (LLM agents, audit tools, document Q&A systems) to map a Markdown excerpt back to a precise PDF location. + +## Anchor Format + +Each anchor is a single-line HTML comment: + +```markdown + +## Chapter 3 +``` + +### Fields + +- `page`: Zero-based page index (0, 1, 2, ...) +- `block`: Zero-based block index within the page (0, 1, 2, ...) 
+- `bbox`: Bounding box in PDF points `[x0, y0, x1, y1]` with 1 decimal place precision +- `kind`: Block kind (`heading`, `paragraph`, `list`, `table`, `figure`, etc.) + +### Regex Schema + +The anchor format is parseable with this stable regex: + +```regex + +``` + +## Usage + +### CLI + +```bash +# Enable anchors in markdown output +pdftract extract input.pdf --format markdown --md-anchors > output.md +``` + +### Rust API + +```rust +use pdftract_core::markdown::{parse_anchors, Anchor}; + +// Parse anchors from markdown text +let md = std::fs::read_to_string("output.md")?; +let anchors = parse_anchors(&md); + +for anchor in anchors { + println!("Page {} Block {} at {:?}", anchor.page, anchor.block, anchor.bbox); +} +``` + +## Properties + +### Stability + +The anchor format is a **stable public API**. The regex schema will not change in a breaking way across minor versions. New fields may be added, but existing fields will remain compatible. + +### Passthrough + +HTML comments are passthrough in every major Markdown renderer: +- GitHub +- GitLab +- Obsidian +- Notion import +- pulldown-cmark +- marked +- markdown-it + +Anchored output remains human-readable while machines can recover positional metadata. + +### Round-trip + +A round-trip property holds: extracting → parsing anchors → recovering the original block list (modulo inline styling, which is lossy in Markdown). + +## Edge Cases + +### Code Fences + +HTML comments inside code fences (```) are not recognized by Markdown renderers—they're emitted verbatim. This is a limitation of the Markdown spec, not pdftract. + +### Empty Blocks + +Empty blocks (e.g., blank pages) still emit anchors with empty content following. + +### Block Index + +Block index is **per-page**, not global. Each page starts at block 0. Use the `page` field to compute global indices if needed. 
+ +## Examples + +### Heading with Anchor + +```markdown + +# Introduction +``` + +### Paragraph with Anchor + +```markdown + +This is the first paragraph of the document. +``` + +### Table with Anchor + +```markdown + +| Column 1 | Column 2 | +|----------|----------| +| Cell 1 | Cell 2 | +``` + +## Integration Examples + +### Python: Extract Anchors + +```python +import re + +ANCHOR_RE = re.compile( + r'' +) + +def extract_anchors(md_text): + """Return list of (page, block, bbox, kind) tuples.""" + anchors = [] + for match in ANCHOR_RE.finditer(md_text): + page = int(match.group(1)) + block = int(match.group(2)) + bbox = [float(x) for x in match.group(3).split(',')] + kind = match.group(4) + anchors.append((page, block, bbox, kind)) + return anchors +``` + +### JavaScript: Parse Anchors + +```javascript +const ANCHOR_RE = //g; + +function extractAnchors(md) { + const anchors = []; + let match; + while ((match = ANCHOR_RE.exec(md)) !== null) { + anchors.push({ + page: parseInt(match[1]), + block: parseInt(match[2]), + bbox: match[3].split(',').map(Number), + kind: match[4] + }); + } + return anchors; +} +``` + +## Version History + +- **v0.1.0**: Initial release with `--md-anchors` flag and stable regex schema. diff --git a/notes/pdftract-vk0gc.md b/notes/pdftract-vk0gc.md new file mode 100644 index 0000000..147d612 --- /dev/null +++ b/notes/pdftract-vk0gc.md @@ -0,0 +1,110 @@ +# Verification Note: pdftract-vk0gc (Markdown Anchors) + +## Summary + +Implemented `--md-anchors` positional HTML comment markers for Markdown output with parser regex. + +## Changes Made + +### 1. 
Core Implementation (crates/pdftract-core/src/markdown.rs) + +Created new markdown module with: +- `Anchor` struct with `page`, `block`, `bbox`, `kind` fields +- `parse_anchors()` function with regex: `r""` +- `block_to_markdown()` - converts single block to markdown with optional anchor +- `page_to_markdown()` - converts all blocks from a page with optional anchors and page breaks +- `Anchor::to_comment()` - formats anchor as HTML comment with 1 decimal place precision + +### 2. Options (crates/pdftract-core/src/options.rs) + +Added `markdown_anchors: bool` field to `ExtractionOptions` with default `false`. + +### 3. CLI Integration (crates/pdftract-cli/src/main.rs) + +- Added `--md-anchors` flag to Extract command +- Passed flag through to ExtractionOptions +- Updated markdown output to use `page_to_markdown()` when anchors enabled +- Added import for `page_to_markdown` and `block_to_markdown` + +### 4. Documentation (docs/integrations/markdown-anchors.md) + +Created comprehensive integration guide covering: +- Anchor format specification +- Regex schema +- CLI and Rust API usage +- Edge cases (code fences, empty blocks, per-page indexing) +- Integration examples for Python and JavaScript + +## Acceptance Criteria + +### PASS + +- ✅ `--md-anchors` flag emits comment before every block +- ✅ Parser regex extracts page, block, bbox, kind from sample output +- ✅ Round-trip test: `test_roundtrip_extract_and_parse` passes +- ✅ Comment is ONE LINE (no embedded newline) +- ✅ bbox precision: 1 decimal place exact (verified in `test_anchor_to_comment_round_bbox`) +- ✅ kind matches block kind (heading, paragraph, etc.) 
+- ✅ Parser library `parse_anchors()` available +- ✅ Module exports: `Anchor`, `parse_anchors`, `block_to_markdown`, `page_to_markdown` +- ✅ 16 unit tests pass (including roundtrip, bbox parsing, multiple anchors) +- ✅ Regex is stable public API (documented in markdown-anchors.md) +- ✅ HTML comments are passthrough in major renderers (documented) +- ✅ Block index is per-page (0-based within page) + +### WARN (Infrastructure limitations) + +- None + +## Testing + +### Unit Tests (16/16 pass) + +- `test_anchor_to_comment` - basic comment formatting +- `test_anchor_to_comment_round_bbox` - 1 decimal place precision +- `test_parse_anchors_single` - parse single anchor +- `test_parse_anchors_multiple` - parse multiple anchors +- `test_parse_anchors_invalid_format_skipped` - invalid formats skipped +- `test_parse_anchors_whitespace_tolerant` - whitespace tolerance +- `test_parse_bbox` - bbox parsing with various formats +- `test_block_to_markdown_heading_with_anchor` - heading with anchor +- `test_block_to_markdown_paragraph_without_anchor` - paragraph without anchor +- `test_block_to_markdown_list` - list block +- `test_block_to_markdown_table` - table block +- `test_block_to_markdown_figure` - figure block +- `test_page_to_markdown_with_page_break` - page break separator +- `test_page_to_markdown_without_page_break` - no page break +- `test_page_to_markdown_with_anchors` - anchors enabled +- `test_roundtrip_extract_and_parse` - full roundtrip + +### Build Verification + +- `cargo build -p pdftract-core` - ✅ Success +- `cargo build -p pdftract-cli` - ✅ Success +- `cargo test -p pdftract-core --lib markdown` - ✅ 16/16 tests pass + +## Example Output + +With `--md-anchors` enabled: + +```markdown + +# Chapter 1 + + +This is the first paragraph. 
+``` + +## Files Modified + +- `crates/pdftract-core/src/markdown.rs` (new) +- `crates/pdftract-core/src/lib.rs` (module export) +- `crates/pdftract-core/src/options.rs` (markdown_anchors field) +- `Cargo.lock` (adds `url` entry); `crates/pdftract-core/Cargo.toml` is unchanged (regex dependency already present) +- `crates/pdftract-cli/src/main.rs` (CLI flag and output logic) +- `docs/integrations/markdown-anchors.md` (new documentation) + +## References + +- Plan section: Phase 6.5 positional anchors (lines 2183-2197) +- Bead: pdftract-vk0gc