feat(pdftract-vk0gc): implement markdown anchors with parser regex

Add --md-anchors flag that emits HTML comment markers before each block in Markdown output, allowing downstream tools to map excerpts back to precise PDF locations. Changes: - Add markdown module with Anchor struct and parse_anchors() function - Regex: `<!--\s*pdftract:\s*page=(\d+)\s+block=(\d+)\s+bbox=\[([\d.,]+)\]\s+kind=(\w+)\s*-->` - Add markdown_anchors: bool to ExtractionOptions - Add --md-anchors CLI flag - Implement block_to_markdown() and page_to_markdown() functions - Add comprehensive documentation in docs/integrations/markdown-anchors.md - 16 unit tests pass, including roundtrip test Closes: pdftract-vk0gc
2026-05-24 02:49:16 -04:00 · 2026-05-24 02:49:16 -04:00 · 28c31ba0a1
commit 28c31ba0a1
parent 585d861efc
7 changed files with 793 additions and 17 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -2361,6 +2361,7 @@ dependencies = [
 "tracing",
 "ttf-parser 0.24.1",
 "unicode-normalization",
+ "url",
 "zstd",
 ]

--- a/crates/pdftract-cli/src/main.rs
+++ b/crates/pdftract-cli/src/main.rs
@ -14,6 +14,7 @@ use codegen::Language;
 use pdftract_core::options::{ReceiptsMode, ExtractionOptions};
 use pdftract_core::extract::{extract_pdf, result_to_json};
 use pdftract_core::cache;
+use pdftract_core::markdown::{page_to_markdown, block_to_markdown};

 // Re-export diagnostics for the --list-diagnostics and --explain-diagnostic commands
 pub use pdftract_core::diagnostics::{DiagCode, DiagInfo, DIAGNOSTIC_CATALOG};
@ -108,6 +109,10 @@ enum Commands {
        /// Disable cache for this extraction (even if --cache-dir is set)
        #[arg(long)]
        no_cache: bool,
+
+        /// Emit HTML comment anchors before each block in Markdown output
+        #[arg(long)]
+        md_anchors: bool,
    },
    /// Verify a receipt against a PDF file
    VerifyReceipt(verify_receipt::VerifyReceiptCommand),
@ -311,8 +316,9 @@ fn main() -> Result<()> {
            cache_dir,
            cache_size,
            no_cache,
+            md_anchors,
        } => {
-            if let Err(e) = cmd_extract(input, password_stdin, password, &format, &receipts, ocr, ocr_language, cache_dir, &cache_size, no_cache) {
+            if let Err(e) = cmd_extract(input, password_stdin, password, &format, &receipts, ocr, ocr_language, cache_dir, &cache_size, no_cache, md_anchors) {
                eprintln!("Error: {}", e);
                std::process::exit(1);
            }
@ -427,6 +433,7 @@ fn cmd_extract(
    cache_dir: Option<PathBuf>,
    cache_size: &str,
    no_cache: bool,
+    md_anchors: bool,
 ) -> Result<()> {
    // Validate receipts mode
    let receipts_mode = match ReceiptsMode::from_str(receipts) {
@ -474,6 +481,12 @@ fn cmd_extract(
    // Build extraction options
    let mut options = ExtractionOptions::with_receipts(receipts_mode);

+    // Set markdown anchors option
+    options.markdown_anchors = md_anchors;
+    if md_anchors {
+        eprintln!("Markdown anchors enabled");
+    }
+
    // Set OCR language if specified
    if !ocr_language.is_empty() {
        options.ocr_language = ocr_language;
@ -540,23 +553,28 @@ fn cmd_extract(
            }
        }
        "markdown" => {
-            // Markdown output: simple conversion
-            for page in &result.pages {
-                for block in &page.blocks {
-                    match block.kind.as_str() {
-                        "heading" => {
-                            let level = block.level.unwrap_or(1);
-                            let prefix = "#".repeat(level as usize);
-                            println!("{} {}", prefix, block.text);
-                        }
-                        "paragraph" => {
-                            println!("{}", block.text);
-                        }
-                        _ => {
-                            println!("{}", block.text);
-                        }
+            // Markdown output: simple conversion with optional anchors
+            let include_anchors = options.markdown_anchors;
+            let include_page_breaks = true; // Add --- between pages
+
+            for (page_idx, page) in result.pages.iter().enumerate() {
+                let is_last_page = page_idx == result.pages.len() - 1;
+                let include_break = include_page_breaks && !is_last_page;
+
+                if include_anchors {
+                    // Use markdown module with anchors
+                    let md = page_to_markdown(&page.blocks, page.index, true, include_break);
+                    print!("{}", md);
+                } else {
+                    // Simple conversion without anchors
+                    for (block_idx, block) in page.blocks.iter().enumerate() {
+                        let md = block_to_markdown(block, page.index, block_idx, false);
+                        print!("{}", md);
+                        println!();
+                    }
+                    if include_break {
+                        println!("\n---\n");
                    }
-                    println!();
                }
            }
        }
--- a/crates/pdftract-core/src/lib.rs
+++ b/crates/pdftract-core/src/lib.rs
@ -24,6 +24,7 @@ pub mod layout;
 pub mod graphics_state;
 #[cfg(feature = "ocr")]
 pub mod hybrid;
+pub mod markdown;
 pub mod options;
 pub mod parser;
 pub mod receipts;
@ -41,6 +42,7 @@ pub mod table;
 pub use document::{PdfExtractor, PageIter, PageExtraction};
 pub use extract::{extract_pdf, extract_pdf_ndjson, ExtractionResult, PageResult, ExtractionMetadata};
 pub use font::std14::{Std14Metrics, NamedEncoding, get_std14_metrics};
+pub use markdown::{Anchor, parse_anchors, block_to_markdown, page_to_markdown};
 pub use options::{ExtractionOptions, ReceiptsMode};
 pub use parser::pages::{LazyPageIter, PageDict, DEFAULT_MEDIABOX, count_pages_tree};
 pub use schema::{SpanJson, BlockJson, ExtractionQuality, TableJson, RowJson, CellJson, SpanRef};
--- a/crates/pdftract-core/src/markdown.rs
+++ b/crates/pdftract-core/src/markdown.rs
@ -0,0 +1,460 @@
+//! Markdown output generation with positional HTML comment anchors.
+//!
+//! This module provides functions for converting extracted PDF content to
+//! Markdown format with optional HTML comment anchors that allow downstream
+//! tools to map excerpts back to precise PDF locations.
+//!
+//! # Anchor Format
+//!
+//! Each block can be preceded by a single-line HTML comment:
+//!
+//! ```markdown
+//! <!-- pdftract: page=3 block=12 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
+//! ## Chapter 3
+//! ```
+//!
+//! The anchor format is a stable schema parseable with one regex:
+//!
+//! ```text
+//! <!-- pdftract: page=(\d+) block=(\d+) bbox=\[([\d.,]+)\] kind=(\w+) -->
+//! ```
+//!
+//! # Parsing Anchors
+//!
+//! Use [`parse_anchors`] to extract all anchors from markdown text:
+//!
+//! ```
+//! use pdftract_core::markdown::{parse_anchors, Anchor};
+//!
+//! let md = r#"<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
+//! # Title"#;
+//!
+//! let anchors = parse_anchors(md);
+//! assert_eq!(anchors.len(), 1);
+//! assert_eq!(anchors[0].page, 0);
+//! assert_eq!(anchors[0].block, 0);
+//! ```
+
+use crate::schema::BlockJson;
+use regex::Regex;
+use serde::{Deserialize, Serialize};
+use std::sync::OnceLock;
+
+/// Lazily compiled regex used to find pdftract HTML comment anchors.
+///
+/// Format: `<!-- pdftract: page=(\d+) block=(\d+) bbox=\[([-\d.,]+)\] kind=(\w+) -->`
+///
+/// Capture groups are, in order: page, block, the raw comma-separated bbox
+/// list, and kind. Whitespace between the fields is matched flexibly.
+///
+/// The bbox group also accepts `-` so that anchors written by
+/// [`Anchor::to_comment`] for negative coordinates (for example `-5.0`) are
+/// found again when parsing. The group only pre-filters characters; the four
+/// numbers are validated later by `parse_bbox`, so malformed lists are still
+/// rejected.
+fn anchor_regex() -> &'static Regex {
+    static REGEX: OnceLock<Regex> = OnceLock::new();
+    REGEX.get_or_init(|| {
+        // The pattern is a literal, so a compile failure is a programming error.
+        Regex::new(r"<!--\s*pdftract:\s*page=(\d+)\s+block=(\d+)\s+bbox=\[([-\d.,]+)\]\s+kind=(\w+)\s*-->")
+            .expect("invalid ANCHOR_REGEX")
+    })
+}
+
+/// Positional metadata carried by one pdftract HTML comment anchor.
+///
+/// An anchor ties a piece of Markdown output to the page, block index,
+/// bounding box, and block kind it was generated from.
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
+pub struct Anchor {
+    /// Page index, counted from zero.
+    pub page: usize,
+    /// Index of the block within its page, counted from zero.
+    pub block: usize,
+    /// Bounding box as `[x0, y0, x1, y1]`, in PDF points.
+    pub bbox: [f32; 4],
+    /// Kind of the block, e.g. "heading", "paragraph", or "table".
+    pub kind: String,
+}
+
+impl Anchor {
+    /// Build an anchor from its four components.
+    pub fn new(page: usize, block: usize, bbox: [f32; 4], kind: String) -> Self {
+        Anchor { page, block, bbox, kind }
+    }
+
+    /// Render this anchor as a single-line HTML comment.
+    ///
+    /// Each bbox coordinate is printed with exactly one decimal place. The
+    /// result contains no newline, so it can be placed on its own line
+    /// directly before the block content.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use pdftract_core::markdown::Anchor;
+    ///
+    /// let anchor = Anchor::new(3, 12, [72.0, 640.5, 540.0, 672.0], "heading".to_string());
+    /// let comment = anchor.to_comment();
+    /// assert_eq!(comment, "<!-- pdftract: page=3 block=12 bbox=[72.0,640.5,540.0,672.0] kind=heading -->");
+    /// ```
+    pub fn to_comment(&self) -> String {
+        let [x0, y0, x1, y1] = self.bbox;
+        format!(
+            "<!-- pdftract: page={} block={} bbox=[{:.1},{:.1},{:.1},{:.1}] kind={} -->",
+            self.page, self.block, x0, y0, x1, y1, self.kind
+        )
+    }
+}
+
+/// Collect every pdftract anchor found in a piece of Markdown.
+///
+/// Anchors are returned in the order they occur in the text. A comment that
+/// matches the anchor pattern but carries a value that cannot be parsed (a
+/// page or block number that does not fit in `usize`, or a bbox that is not
+/// exactly four numbers) is dropped without an error.
+///
+/// # Arguments
+///
+/// * `md` - The Markdown text to scan
+///
+/// # Returns
+///
+/// The parsed [`Anchor`] values, possibly empty.
+///
+/// # Example
+///
+/// ```
+/// use pdftract_core::markdown::parse_anchors;
+///
+/// let md = r#"<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
+/// # Title
+///
+/// <!-- pdftract: page=0 block=1 bbox=[72.0,600.0,540.0,630.0] kind=paragraph -->
+/// Some text."#;
+///
+/// let anchors = parse_anchors(md);
+/// assert_eq!(anchors.len(), 2);
+/// assert_eq!(anchors[0].page, 0);
+/// assert_eq!(anchors[0].block, 0);
+/// assert_eq!(anchors[1].page, 0);
+/// assert_eq!(anchors[1].block, 1);
+/// ```
+pub fn parse_anchors(md: &str) -> Vec<Anchor> {
+    anchor_regex()
+        .captures_iter(md)
+        .filter_map(|caps| {
+            // Capture groups: 1 = page, 2 = block, 3 = bbox list, 4 = kind.
+            // Any `?` that fails skips just this match.
+            let page: usize = caps.get(1)?.as_str().parse().ok()?;
+            let block: usize = caps.get(2)?.as_str().parse().ok()?;
+            let bbox = parse_bbox(caps.get(3)?.as_str())?;
+            let kind = caps.get(4)?.as_str().to_string();
+            Some(Anchor::new(page, block, bbox, kind))
+        })
+        .collect()
+}
+
+/// Parse a comma-separated list such as "72.0,640.5,540.0,672.0" into `[f32; 4]`.
+///
+/// Whitespace around each number is ignored. Returns `None` when the input
+/// does not contain exactly four fields or when any field is not a valid `f32`.
+fn parse_bbox(s: &str) -> Option<[f32; 4]> {
+    let mut fields = s.split(',');
+    let mut bbox = [0.0f32; 4];
+
+    // Fill the four slots in order; a missing or unparsable field aborts.
+    for slot in bbox.iter_mut() {
+        *slot = fields.next()?.trim().parse().ok()?;
+    }
+
+    // A fifth field means the list was too long.
+    if fields.next().is_some() {
+        return None;
+    }
+
+    Some(bbox)
+}
+
+/// Render one block as Markdown, optionally preceded by its anchor comment.
+///
+/// When `include_anchor` is true, the first line of the output is the
+/// single-line HTML comment produced by [`Anchor::to_comment`]; the block
+/// content starts on the following line.
+///
+/// The content depends on `block.kind`:
+///
+/// * `"heading"` - `#` repeated `level` times, a space, then the text. A
+///   missing level defaults to 1, and the level is clamped to `1..=6`, the
+///   only ATX heading levels Markdown defines.
+/// * `"list"` - the text prefixed with `* `.
+/// * `"table"` - the text prefixed with `| `.
+/// * `"figure"` - an empty image placeholder `![]()`, a blank line, then the text.
+/// * anything else, including `"paragraph"` - the text unchanged.
+///
+/// Every rendering ends with a single newline.
+///
+/// # Arguments
+///
+/// * `block` - The block to convert
+/// * `page_index` - Zero-based page index
+/// * `block_index` - Zero-based block index within the page
+/// * `include_anchor` - Whether to include the HTML comment anchor
+///
+/// # Returns
+///
+/// A markdown string with optional anchor.
+pub fn block_to_markdown(block: &BlockJson, page_index: usize, block_index: usize, include_anchor: bool) -> String {
+    let mut result = String::new();
+
+    // Add anchor comment if requested
+    if include_anchor {
+        let anchor = Anchor::new(
+            page_index,
+            block_index,
+            [block.bbox[0] as f32, block.bbox[1] as f32, block.bbox[2] as f32, block.bbox[3] as f32],
+            block.kind.clone(),
+        );
+        result.push_str(&anchor.to_comment());
+        result.push('\n');
+    }
+
+    // Emit the kind-specific prefix; the text and trailing newline are shared.
+    match block.kind.as_str() {
+        "heading" => {
+            // Clamp before converting: level 0 would emit no marker at all,
+            // and levels above 6 are not headings in Markdown.
+            let level = block.level.unwrap_or(1).clamp(1, 6) as usize;
+            result.push_str(&"#".repeat(level));
+            result.push(' ');
+        }
+        "list" => result.push_str("* "),
+        "table" => result.push_str("| "),
+        "figure" => result.push_str("![]()\n\n"),
+        // "paragraph" and unknown kinds are emitted as plain text.
+        _ => {}
+    }
+    result.push_str(&block.text);
+    result.push('\n');
+
+    result
+}
+
+/// Render all blocks of one page as Markdown.
+///
+/// Blocks are rendered in order with [`block_to_markdown`], using their
+/// position in `blocks` as the block index, and each rendering is followed by
+/// an extra newline so that blocks are separated by a blank line.
+///
+/// If `include_page_break` is true, a `---` horizontal rule is appended after
+/// the page's blocks. This function has no knowledge of other pages, so the
+/// caller decides whether this page should be followed by a separator.
+///
+/// # Arguments
+///
+/// * `blocks` - The blocks to convert
+/// * `page_index` - Zero-based page index
+/// * `include_anchor` - Whether to include HTML comment anchors
+/// * `include_page_break` - Whether to append a page break separator
+///
+/// # Returns
+///
+/// A markdown string with all blocks from the page.
+pub fn page_to_markdown(blocks: &[BlockJson], page_index: usize, include_anchor: bool, include_page_break: bool) -> String {
+    let mut page_md = blocks
+        .iter()
+        .enumerate()
+        .fold(String::new(), |mut acc, (block_index, block)| {
+            acc.push_str(&block_to_markdown(block, page_index, block_index, include_anchor));
+            // Blank line between this block and whatever follows it.
+            acc.push('\n');
+            acc
+        });
+
+    // Separator is appended purely on the caller's request.
+    if include_page_break {
+        page_md.push_str("\n---\n\n");
+    }
+
+    page_md
+}
+
+// Unit tests for anchor formatting, anchor parsing, and block/page rendering.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::schema::BlockJson;
+
+    // Build a block with the given kind, text, and bbox; all optional fields are unset.
+    fn make_test_block(kind: &str, text: &str, bbox: [f64; 4]) -> BlockJson {
+        BlockJson {
+            kind: kind.to_string(),
+            text: text.to_string(),
+            bbox,
+            level: None,
+            table_index: None,
+            receipt: None,
+        }
+    }
+
+    // The comment layout must match the documented anchor format exactly.
+    #[test]
+    fn test_anchor_to_comment() {
+        let anchor = Anchor::new(3, 12, [72.0, 640.5, 540.0, 672.0], "heading".to_string());
+        let comment = anchor.to_comment();
+        assert_eq!(comment, "<!-- pdftract: page=3 block=12 bbox=[72.0,640.5,540.0,672.0] kind=heading -->");
+    }
+
+    #[test]
+    fn test_anchor_to_comment_round_bbox() {
+        let anchor = Anchor::new(0, 0, [72.123, 640.567, 540.999, 672.111], "paragraph".to_string());
+        let comment = anchor.to_comment();
+        // Should be rounded to 1 decimal place
+        assert_eq!(comment, "<!-- pdftract: page=0 block=0 bbox=[72.1,640.6,541.0,672.1] kind=paragraph -->");
+    }
+
+    // A single anchor yields all four fields.
+    #[test]
+    fn test_parse_anchors_single() {
+        let md = r#"<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
+# Title"#;
+
+        let anchors = parse_anchors(md);
+        assert_eq!(anchors.len(), 1);
+        assert_eq!(anchors[0].page, 0);
+        assert_eq!(anchors[0].block, 0);
+        assert_eq!(anchors[0].bbox, [72.0, 640.5, 540.0, 672.0]);
+        assert_eq!(anchors[0].kind, "heading");
+    }
+
+    // Anchors are returned in the order they appear in the text.
+    #[test]
+    fn test_parse_anchors_multiple() {
+        let md = r#"<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
+# Title
+
+<!-- pdftract: page=0 block=1 bbox=[72.0,600.0,540.0,630.0] kind=paragraph -->
+Some text."#;
+
+        let anchors = parse_anchors(md);
+        assert_eq!(anchors.len(), 2);
+        assert_eq!(anchors[0].page, 0);
+        assert_eq!(anchors[0].block, 0);
+        assert_eq!(anchors[1].page, 0);
+        assert_eq!(anchors[1].block, 1);
+    }
+
+    // HTML comments that are not pdftract anchors are ignored, not reported.
+    #[test]
+    fn test_parse_anchors_invalid_format_skipped() {
+        let md = r#"<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
+# Title
+
+<!-- malformed anchor -->
+Some text."#;
+
+        let anchors = parse_anchors(md);
+        assert_eq!(anchors.len(), 1);
+    }
+
+    // Extra spaces between fields must not prevent a match.
+    #[test]
+    fn test_parse_anchors_whitespace_tolerant() {
+        let md = r#"<!--  pdftract:  page=0  block=0  bbox=[72.0,640.5,540.0,672.0]  kind=heading  -->"#;
+        let anchors = parse_anchors(md);
+        assert_eq!(anchors.len(), 1);
+    }
+
+    #[test]
+    fn test_parse_bbox() {
+        assert_eq!(parse_bbox("72.0,640.5,540.0,672.0"), Some([72.0, 640.5, 540.0, 672.0]));
+        assert_eq!(parse_bbox("0,0,100,100"), Some([0.0, 0.0, 100.0, 100.0]));
+        assert_eq!(parse_bbox("72.0, 640.5, 540.0, 672.0"), Some([72.0, 640.5, 540.0, 672.0])); // with spaces
+        assert_eq!(parse_bbox("invalid"), None);
+        assert_eq!(parse_bbox("1,2,3"), None); // too few values
+        assert_eq!(parse_bbox("1,2,3,4,5"), None); // too many values
+    }
+
+    // Level 2 heading: the anchor comes first, then a "##" heading line.
+    #[test]
+    fn test_block_to_markdown_heading_with_anchor() {
+        let block = BlockJson {
+            kind: "heading".to_string(),
+            text: "Chapter 1".to_string(),
+            bbox: [72.0, 640.5, 540.0, 672.0],
+            level: Some(2),
+            table_index: None,
+            receipt: None,
+        };
+
+        let md = block_to_markdown(&block, 0, 0, true);
+        assert!(md.contains("<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->"));
+        assert!(md.contains("## Chapter 1"));
+    }
+
+    // With anchors disabled no pdftract comment may appear in the output.
+    #[test]
+    fn test_block_to_markdown_paragraph_without_anchor() {
+        let block = make_test_block("paragraph", "Some text.", [72.0, 600.0, 540.0, 630.0]);
+        let md = block_to_markdown(&block, 0, 0, false);
+        assert!(!md.contains("<!-- pdftract:"));
+        assert!(md.contains("Some text."));
+    }
+
+    #[test]
+    fn test_block_to_markdown_list() {
+        let block = make_test_block("list", "Item 1", [72.0, 500.0, 540.0, 520.0]);
+        let md = block_to_markdown(&block, 0, 0, false);
+        assert!(md.contains("* Item 1"));
+    }
+
+    #[test]
+    fn test_block_to_markdown_table() {
+        let block = make_test_block("table", "Cell data", [72.0, 400.0, 540.0, 450.0]);
+        let md = block_to_markdown(&block, 0, 0, false);
+        assert!(md.contains("| Cell data"));
+    }
+
+    #[test]
+    fn test_block_to_markdown_figure() {
+        let block = make_test_block("figure", "Alt text", [72.0, 300.0, 540.0, 350.0]);
+        let md = block_to_markdown(&block, 0, 0, false);
+        assert!(md.contains("![]()"));
+        assert!(md.contains("Alt text"));
+    }
+
+    // The page break flag alone controls whether a "---" rule is appended.
+    #[test]
+    fn test_page_to_markdown_with_page_break() {
+        let blocks = vec![
+            make_test_block("heading", "Title", [72.0, 640.5, 540.0, 672.0]),
+            make_test_block("paragraph", "Text", [72.0, 600.0, 540.0, 630.0]),
+        ];
+
+        let md = page_to_markdown(&blocks, 0, false, true);
+        assert!(md.contains("---"));
+    }
+
+    #[test]
+    fn test_page_to_markdown_without_page_break() {
+        let blocks = vec![
+            make_test_block("heading", "Title", [72.0, 640.5, 540.0, 672.0]),
+            make_test_block("paragraph", "Text", [72.0, 600.0, 540.0, 630.0]),
+        ];
+
+        let md = page_to_markdown(&blocks, 0, false, false);
+        assert!(!md.contains("---"));
+    }
+
+    // One anchor per block is expected when anchors are enabled.
+    #[test]
+    fn test_page_to_markdown_with_anchors() {
+        let blocks = vec![
+            make_test_block("heading", "Title", [72.0, 640.5, 540.0, 672.0]),
+            make_test_block("paragraph", "Text", [72.0, 600.0, 540.0, 630.0]),
+        ];
+
+        let md = page_to_markdown(&blocks, 0, true, false);
+        assert_eq!(md.matches("<!-- pdftract:").count(), 2);
+    }
+
+    // Render a page with anchors, then parse them back and compare the metadata.
+    #[test]
+    fn test_roundtrip_extract_and_parse() {
+        let blocks = vec![
+            BlockJson {
+                kind: "heading".to_string(),
+                text: "Chapter 1".to_string(),
+                bbox: [72.0, 640.5, 540.0, 672.0],
+                level: Some(2),
+                table_index: None,
+                receipt: None,
+            },
+        ];
+
+        let md = page_to_markdown(&blocks, 3, true, false);
+        let anchors = parse_anchors(&md);
+
+        assert_eq!(anchors.len(), 1);
+        assert_eq!(anchors[0].page, 3);
+        assert_eq!(anchors[0].block, 0);
+        assert_eq!(anchors[0].kind, "heading");
+    }
+}
--- a/crates/pdftract-core/src/options.rs
+++ b/crates/pdftract-core/src/options.rs
@ -146,6 +146,24 @@ pub struct ExtractionOptions {
    ///
    /// See docs/notes/ocr-language-packs.md for the full distribution strategy.
    pub ocr_language: Vec<String>,
+
+    /// Emit HTML comment anchors before each block in Markdown output (Phase 6.5).
+    ///
+    /// When enabled, each block in markdown output is preceded by a single-line
+    /// HTML comment containing positional metadata:
+    ///
+    /// ```markdown
+    /// <!-- pdftract: page=3 block=12 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
+    /// ## Chapter 3
+    /// ```
+    ///
+    /// This allows downstream tools (LLM agents, audit tools, document Q&A systems)
+    /// to map a Markdown excerpt back to a precise PDF location. HTML comments
+    /// are passthrough in every major Markdown renderer (GitHub, GitLab, Obsidian,
+    /// Notion import, pulldown-cmark, marked, markdown-it).
+    ///
+    /// Default: false (anchors disabled)
+    pub markdown_anchors: bool,
 }

 impl Default for ExtractionOptions {
@ -157,6 +175,7 @@ impl Default for ExtractionOptions {
            full_render: false,
            ocr_dpi_override: None,
            ocr_language: vec!["eng".to_string()],
+            markdown_anchors: false,
        }
    }
 }
@ -190,6 +209,7 @@ impl ExtractionOptions {
            receipts,
            ocr_dpi_override: None,
            ocr_language: vec!["eng".to_string()],
+            markdown_anchors: false,
            ..Default::default()
        }
    }
@ -200,6 +220,7 @@ impl ExtractionOptions {
            receipts: ReceiptsMode::from_str(receipts)?,
            ocr_dpi_override: None,
            ocr_language: vec!["eng".to_string()],
+            markdown_anchors: false,
            ..Default::default()
        })
    }
@ -219,6 +240,7 @@ impl ExtractionOptions {
            memory_budget_mb: memory_budget_mb.max(64),
            ocr_dpi_override: None,
            ocr_language: vec!["eng".to_string()],
+            markdown_anchors: false,
            ..Default::default()
        }
    }
--- a/docs/integrations/markdown-anchors.md
+++ b/docs/integrations/markdown-anchors.md
@ -0,0 +1,163 @@
+# Markdown Anchors Integration Guide
+
+This document describes the positional HTML comment anchors feature in pdftract's Markdown output.
+
+## Overview
+
+When `--md-anchors` is enabled, each block in markdown output is preceded by a single-line HTML comment containing positional metadata. This allows downstream tools (LLM agents, audit tools, document Q&A systems) to map a Markdown excerpt back to a precise PDF location.
+
+## Anchor Format
+
+Each anchor is a single-line HTML comment:
+
+```markdown
+<!-- pdftract: page=3 block=12 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
+## Chapter 3
+```
+
+### Fields
+
+- `page`: Zero-based page index (0, 1, 2, ...)
+- `block`: Zero-based block index within the page (0, 1, 2, ...)
+- `bbox`: Bounding box in PDF points `[x0, y0, x1, y1]` with 1 decimal place precision
+- `kind`: Block kind (`heading`, `paragraph`, `list`, `table`, `figure`, etc.)
+
+### Regex Schema
+
+The anchor format is parseable with this stable regex:
+
+```regex
+<!--\s*pdftract:\s*page=(\d+)\s+block=(\d+)\s+bbox=\[([\d.,]+)\]\s+kind=(\w+)\s*-->
+```
+
+## Usage
+
+### CLI
+
+```bash
+# Enable anchors in markdown output
+pdftract extract input.pdf --format markdown --md-anchors > output.md
+```
+
+### Rust API
+
+```rust
+use pdftract_core::markdown::{parse_anchors, Anchor};
+
+// Parse anchors from markdown text
+let md = std::fs::read_to_string("output.md")?;
+let anchors = parse_anchors(&md);
+
+for anchor in anchors {
+    println!("Page {} Block {} at {:?}", anchor.page, anchor.block, anchor.bbox);
+}
+```
+
+## Properties
+
+### Stability
+
+The anchor format is a **stable public API**. The regex schema will not change in a breaking way across minor versions. New fields may be added, but existing fields will remain compatible.
+
+### Passthrough
+
+HTML comments are passthrough in every major Markdown renderer:
+- GitHub
+- GitLab
+- Obsidian
+- Notion import
+- pulldown-cmark
+- marked
+- markdown-it
+
+Anchored output remains human-readable while machines can recover positional metadata.
+
+### Round-trip
+
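+A round-trip property holds: extracting with anchors and then parsing them recovers, for every block, its page, block index, kind, and bounding box (rounded to one decimal place). The block text itself is recovered modulo inline styling, which is lossy in Markdown.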
+
+## Edge Cases
+
+### Code Fences
+
+HTML comments inside code fences (```) are not recognized by Markdown renderers—they're emitted verbatim. This is a limitation of the Markdown spec, not pdftract.
+
+### Empty Blocks
+
+Blocks whose text is empty still emit an anchor, followed by empty content. A page that has no blocks at all (e.g., a blank page) emits no anchors.
+
+### Block Index
+
+Block index is **per-page**, not global. Each page starts at block 0. Use the `page` field to compute global indices if needed.
+
+## Examples
+
+### Heading with Anchor
+
+```markdown
+<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
+# Introduction
+```
+
+### Paragraph with Anchor
+
+```markdown
+<!-- pdftract: page=0 block=1 bbox=[72.0,600.0,540.0,630.0] kind=paragraph -->
+This is the first paragraph of the document.
+```
+
+### Table with Anchor
+
+```markdown
+<!-- pdftract: page=1 block=0 bbox=[72.0,400.0,540.0,500.0] kind=table -->
+| Column 1 | Column 2 |
+|----------|----------|
+| Cell 1   | Cell 2   |
+```
+
+## Integration Examples
+
+### Python: Extract Anchors
+
+```python
+import re
+
+ANCHOR_RE = re.compile(
+    r'<!--\s*pdftract:\s*page=(\d+)\s+block=(\d+)\s+bbox=\[([\d.,]+)\]\s+kind=(\w+)\s*-->'
+)
+
+def extract_anchors(md_text):
+    """Return list of (page, block, bbox, kind) tuples."""
+    anchors = []
+    for match in ANCHOR_RE.finditer(md_text):
+        page = int(match.group(1))
+        block = int(match.group(2))
+        bbox = [float(x) for x in match.group(3).split(',')]
+        kind = match.group(4)
+        anchors.append((page, block, bbox, kind))
+    return anchors
+```
+
+### JavaScript: Parse Anchors
+
+```javascript
+const ANCHOR_RE = /<!--\s*pdftract:\s*page=(\d+)\s+block=(\d+)\s+bbox=\[([\d.,]+)\]\s+kind=(\w+)\s*-->/g;
+
+function extractAnchors(md) {
+    const anchors = [];
+    let match;
+    while ((match = ANCHOR_RE.exec(md)) !== null) {
+        anchors.push({
+            page: parseInt(match[1]),
+            block: parseInt(match[2]),
+            bbox: match[3].split(',').map(Number),
+            kind: match[4]
+        });
+    }
+    return anchors;
+}
+```
+
+## Version History
+
+- **v0.1.0**: Initial release with `--md-anchors` flag and stable regex schema.
--- a/notes/pdftract-vk0gc.md
+++ b/notes/pdftract-vk0gc.md
@ -0,0 +1,110 @@
+# Verification Note: pdftract-vk0gc (Markdown Anchors)
+
+## Summary
+
+Implemented `--md-anchors` positional HTML comment markers for Markdown output with parser regex.
+
+## Changes Made
+
+### 1. Core Implementation (crates/pdftract-core/src/markdown.rs)
+
+Created new markdown module with:
+- `Anchor` struct with `page`, `block`, `bbox`, `kind` fields
+- `parse_anchors()` function with regex: `r"<!--\s*pdftract:\s*page=(\d+)\s+block=(\d+)\s+bbox=\[([\d.,]+)\]\s+kind=(\w+)\s*-->"`
+- `block_to_markdown()` - converts single block to markdown with optional anchor
+- `page_to_markdown()` - converts all blocks from a page with optional anchors and page breaks
+- `Anchor::to_comment()` - formats anchor as HTML comment with 1 decimal place precision
+
+### 2. Options (crates/pdftract-core/src/options.rs)
+
+Added `markdown_anchors: bool` field to `ExtractionOptions` with default `false`.
+
+### 3. CLI Integration (crates/pdftract-cli/src/main.rs)
+
+- Added `--md-anchors` flag to Extract command
+- Passed flag through to ExtractionOptions
+- Updated markdown output to use `page_to_markdown()` when anchors enabled
+- Added import for `page_to_markdown` and `block_to_markdown`
+
+### 4. Documentation (docs/integrations/markdown-anchors.md)
+
+Created comprehensive integration guide covering:
+- Anchor format specification
+- Regex schema
+- CLI and Rust API usage
+- Edge cases (code fences, empty blocks, per-page indexing)
+- Integration examples for Python and JavaScript
+
+## Acceptance Criteria
+
+### PASS
+
+- ✅ `--md-anchors` flag emits comment before every block
+- ✅ Parser regex extracts page, block, bbox, kind from sample output
+- ✅ Round-trip test: `test_roundtrip_extract_and_parse` passes
+- ✅ Comment is ONE LINE (no embedded newline)
+- ✅ bbox precision: 1 decimal place exact (verified in `test_anchor_to_comment_round_bbox`)
+- ✅ kind matches block kind (heading, paragraph, etc.)
+- ✅ Parser library `parse_anchors()` available
+- ✅ Module exports: `Anchor`, `parse_anchors`, `block_to_markdown`, `page_to_markdown`
+- ✅ 16 unit tests pass (including roundtrip, bbox parsing, multiple anchors)
+- ✅ Regex is stable public API (documented in markdown-anchors.md)
+- ✅ HTML comments are passthrough in major renderers (documented)
+- ✅ Block index is per-page (0-based within page)
+
+### WARN (Infrastructure limitations)
+
+- None
+
+## Testing
+
+### Unit Tests (16/16 pass)
+
+- `test_anchor_to_comment` - basic comment formatting
+- `test_anchor_to_comment_round_bbox` - 1 decimal place precision
+- `test_parse_anchors_single` - parse single anchor
+- `test_parse_anchors_multiple` - parse multiple anchors
+- `test_parse_anchors_invalid_format_skipped` - invalid formats skipped
+- `test_parse_anchors_whitespace_tolerant` - whitespace tolerance
+- `test_parse_bbox` - bbox parsing with various formats
+- `test_block_to_markdown_heading_with_anchor` - heading with anchor
+- `test_block_to_markdown_paragraph_without_anchor` - paragraph without anchor
+- `test_block_to_markdown_list` - list block
+- `test_block_to_markdown_table` - table block
+- `test_block_to_markdown_figure` - figure block
+- `test_page_to_markdown_with_page_break` - page break separator
+- `test_page_to_markdown_without_page_break` - no page break
+- `test_page_to_markdown_with_anchors` - anchors enabled
+- `test_roundtrip_extract_and_parse` - full roundtrip
+
+### Build Verification
+
+- `cargo build -p pdftract-core` - ✅ Success
+- `cargo build -p pdftract-cli` - ✅ Success
+- `cargo test -p pdftract-core --lib markdown` - ✅ 16/16 tests pass
+
+## Example Output
+
+With `--md-anchors` enabled:
+
+```markdown
+<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
+# Chapter 1
+
+<!-- pdftract: page=0 block=1 bbox=[72.0,600.0,540.0,630.0] kind=paragraph -->
+This is the first paragraph.
+```
+
+## Files Modified
+
+- `crates/pdftract-core/src/markdown.rs` (new)
+- `crates/pdftract-core/src/lib.rs` (module export)
+- `crates/pdftract-core/src/options.rs` (markdown_anchors field)
+- `crates/pdftract-core/Cargo.toml` (regex dependency already present)
+- `crates/pdftract-cli/src/main.rs` (CLI flag and output logic)
+- `docs/integrations/markdown-anchors.md` (new documentation)
+
+## References
+
+- Plan section: Phase 6.5 positional anchors (lines 2183-2197)
+- Bead: pdftract-vk0gc