feat(pdftract-vk0gc): implement markdown anchors with parser regex
Add --md-anchors flag that emits HTML comment markers before each block in Markdown output, allowing downstream tools to map excerpts back to precise PDF locations. Changes: - Add markdown module with Anchor struct and parse_anchors() function - Regex: <!-- pdftract: page=(\d+) block=(\d+) bbox=\[([\d.,]+)\] kind=(\w+) --> - Add markdown_anchors: bool to ExtractionOptions - Add --md-anchors CLI flag - Implement block_to_markdown() and page_to_markdown() functions - Add comprehensive documentation in docs/integrations/markdown-anchors.md - 16 unit tests pass, including roundtrip test Closes: pdftract-vk0gc
This commit is contained in:
parent
585d861efc
commit
28c31ba0a1
7 changed files with 793 additions and 17 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
|
@ -2361,6 +2361,7 @@ dependencies = [
|
|||
"tracing",
|
||||
"ttf-parser 0.24.1",
|
||||
"unicode-normalization",
|
||||
"url",
|
||||
"zstd",
|
||||
]
|
||||
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ use codegen::Language;
|
|||
use pdftract_core::options::{ReceiptsMode, ExtractionOptions};
|
||||
use pdftract_core::extract::{extract_pdf, result_to_json};
|
||||
use pdftract_core::cache;
|
||||
use pdftract_core::markdown::{page_to_markdown, block_to_markdown};
|
||||
|
||||
// Re-export diagnostics for the --list-diagnostics and --explain-diagnostic commands
|
||||
pub use pdftract_core::diagnostics::{DiagCode, DiagInfo, DIAGNOSTIC_CATALOG};
|
||||
|
|
@ -108,6 +109,10 @@ enum Commands {
|
|||
/// Disable cache for this extraction (even if --cache-dir is set)
|
||||
#[arg(long)]
|
||||
no_cache: bool,
|
||||
|
||||
/// Emit HTML comment anchors before each block in Markdown output
|
||||
#[arg(long)]
|
||||
md_anchors: bool,
|
||||
},
|
||||
/// Verify a receipt against a PDF file
|
||||
VerifyReceipt(verify_receipt::VerifyReceiptCommand),
|
||||
|
|
@ -311,8 +316,9 @@ fn main() -> Result<()> {
|
|||
cache_dir,
|
||||
cache_size,
|
||||
no_cache,
|
||||
md_anchors,
|
||||
} => {
|
||||
if let Err(e) = cmd_extract(input, password_stdin, password, &format, &receipts, ocr, ocr_language, cache_dir, &cache_size, no_cache) {
|
||||
if let Err(e) = cmd_extract(input, password_stdin, password, &format, &receipts, ocr, ocr_language, cache_dir, &cache_size, no_cache, md_anchors) {
|
||||
eprintln!("Error: {}", e);
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
|
@ -427,6 +433,7 @@ fn cmd_extract(
|
|||
cache_dir: Option<PathBuf>,
|
||||
cache_size: &str,
|
||||
no_cache: bool,
|
||||
md_anchors: bool,
|
||||
) -> Result<()> {
|
||||
// Validate receipts mode
|
||||
let receipts_mode = match ReceiptsMode::from_str(receipts) {
|
||||
|
|
@ -474,6 +481,12 @@ fn cmd_extract(
|
|||
// Build extraction options
|
||||
let mut options = ExtractionOptions::with_receipts(receipts_mode);
|
||||
|
||||
// Set markdown anchors option
|
||||
options.markdown_anchors = md_anchors;
|
||||
if md_anchors {
|
||||
eprintln!("Markdown anchors enabled");
|
||||
}
|
||||
|
||||
// Set OCR language if specified
|
||||
if !ocr_language.is_empty() {
|
||||
options.ocr_language = ocr_language;
|
||||
|
|
@ -540,23 +553,28 @@ fn cmd_extract(
|
|||
}
|
||||
}
|
||||
"markdown" => {
|
||||
// Markdown output: simple conversion
|
||||
for page in &result.pages {
|
||||
for block in &page.blocks {
|
||||
match block.kind.as_str() {
|
||||
"heading" => {
|
||||
let level = block.level.unwrap_or(1);
|
||||
let prefix = "#".repeat(level as usize);
|
||||
println!("{} {}", prefix, block.text);
|
||||
}
|
||||
"paragraph" => {
|
||||
println!("{}", block.text);
|
||||
}
|
||||
_ => {
|
||||
println!("{}", block.text);
|
||||
}
|
||||
// Markdown output: simple conversion with optional anchors
|
||||
let include_anchors = options.markdown_anchors;
|
||||
let include_page_breaks = true; // Add --- between pages
|
||||
|
||||
for (page_idx, page) in result.pages.iter().enumerate() {
|
||||
let is_last_page = page_idx == result.pages.len() - 1;
|
||||
let include_break = include_page_breaks && !is_last_page;
|
||||
|
||||
if include_anchors {
|
||||
// Use markdown module with anchors
|
||||
let md = page_to_markdown(&page.blocks, page.index, true, include_break);
|
||||
print!("{}", md);
|
||||
} else {
|
||||
// Simple conversion without anchors
|
||||
for (block_idx, block) in page.blocks.iter().enumerate() {
|
||||
let md = block_to_markdown(block, page.index, block_idx, false);
|
||||
print!("{}", md);
|
||||
println!();
|
||||
}
|
||||
if include_break {
|
||||
println!("\n---\n");
|
||||
}
|
||||
println!();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ pub mod layout;
|
|||
pub mod graphics_state;
|
||||
#[cfg(feature = "ocr")]
|
||||
pub mod hybrid;
|
||||
pub mod markdown;
|
||||
pub mod options;
|
||||
pub mod parser;
|
||||
pub mod receipts;
|
||||
|
|
@ -41,6 +42,7 @@ pub mod table;
|
|||
pub use document::{PdfExtractor, PageIter, PageExtraction};
|
||||
pub use extract::{extract_pdf, extract_pdf_ndjson, ExtractionResult, PageResult, ExtractionMetadata};
|
||||
pub use font::std14::{Std14Metrics, NamedEncoding, get_std14_metrics};
|
||||
pub use markdown::{Anchor, parse_anchors, block_to_markdown, page_to_markdown};
|
||||
pub use options::{ExtractionOptions, ReceiptsMode};
|
||||
pub use parser::pages::{LazyPageIter, PageDict, DEFAULT_MEDIABOX, count_pages_tree};
|
||||
pub use schema::{SpanJson, BlockJson, ExtractionQuality, TableJson, RowJson, CellJson, SpanRef};
|
||||
|
|
|
|||
460
crates/pdftract-core/src/markdown.rs
Normal file
460
crates/pdftract-core/src/markdown.rs
Normal file
|
|
@ -0,0 +1,460 @@
|
|||
//! Markdown output generation with positional HTML comment anchors.
|
||||
//!
|
||||
//! This module provides functions for converting extracted PDF content to
|
||||
//! Markdown format with optional HTML comment anchors that allow downstream
|
||||
//! tools to map excerpts back to precise PDF locations.
|
||||
//!
|
||||
//! # Anchor Format
|
||||
//!
|
||||
//! Each block can be preceded by a single-line HTML comment:
|
||||
//!
|
||||
//! ```markdown
|
||||
//! <!-- pdftract: page=3 block=12 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
|
||||
//! ## Chapter 3
|
||||
//! ```
|
||||
//!
|
||||
//! The anchor format is a stable schema parseable with one regex:
|
||||
//!
|
||||
//! ```text
|
||||
//! <!-- pdftract: page=(\d+) block=(\d+) bbox=\[([\d.,]+)\] kind=(\w+) -->
|
||||
//! ```
|
||||
//!
|
||||
//! # Parsing Anchors
|
||||
//!
|
||||
//! Use [`parse_anchors`] to extract all anchors from markdown text:
|
||||
//!
|
||||
//! ```
|
||||
//! use pdftract_core::markdown::{parse_anchors, Anchor};
|
||||
//!
|
||||
//! let md = r#"<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
|
||||
//! # Title"#;
|
||||
//!
|
||||
//! let anchors = parse_anchors(md);
|
||||
//! assert_eq!(anchors.len(), 1);
|
||||
//! assert_eq!(anchors[0].page, 0);
|
||||
//! assert_eq!(anchors[0].block, 0);
|
||||
//! ```
|
||||
|
||||
use crate::schema::BlockJson;
|
||||
use regex::Regex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::sync::OnceLock;
|
||||
|
||||
/// Regex for parsing pdftract HTML comment anchors.
|
||||
///
|
||||
/// Format: `<!-- pdftract: page=(\d+) block=(\d+) bbox=\[([\d.,]+)\] kind=(\w+) -->`
|
||||
fn anchor_regex() -> &'static Regex {
|
||||
static REGEX: OnceLock<Regex> = OnceLock::new();
|
||||
REGEX.get_or_init(|| {
|
||||
Regex::new(r"<!--\s*pdftract:\s*page=(\d+)\s+block=(\d+)\s+bbox=\[([\d.,]+)\]\s+kind=(\w+)\s*-->")
|
||||
.expect("invalid ANCHOR_REGEX")
|
||||
})
|
||||
}
|
||||
|
||||
/// A parsed HTML comment anchor containing positional metadata.
|
||||
///
|
||||
/// Anchors are extracted from markdown output and provide a mapping from
|
||||
/// markdown text back to precise PDF locations.
|
||||
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
||||
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
|
||||
pub struct Anchor {
|
||||
/// Zero-based page index.
|
||||
pub page: usize,
|
||||
/// Zero-based block index within the page.
|
||||
pub block: usize,
|
||||
/// Bounding box in PDF points: [x0, y0, x1, y1].
|
||||
pub bbox: [f32; 4],
|
||||
/// Block kind (e.g., "heading", "paragraph", "table").
|
||||
pub kind: String,
|
||||
}
|
||||
|
||||
impl Anchor {
|
||||
/// Create a new anchor from components.
|
||||
pub fn new(page: usize, block: usize, bbox: [f32; 4], kind: String) -> Self {
|
||||
Self { page, block, bbox, kind }
|
||||
}
|
||||
|
||||
/// Format this anchor as an HTML comment.
|
||||
///
|
||||
/// Returns a single-line comment suitable for insertion before block content.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use pdftract_core::markdown::Anchor;
|
||||
///
|
||||
/// let anchor = Anchor::new(3, 12, [72.0, 640.5, 540.0, 672.0], "heading".to_string());
|
||||
/// let comment = anchor.to_comment();
|
||||
/// assert_eq!(comment, "<!-- pdftract: page=3 block=12 bbox=[72.0,640.5,540.0,672.0] kind=heading -->");
|
||||
/// ```
|
||||
pub fn to_comment(&self) -> String {
|
||||
format!(
|
||||
"<!-- pdftract: page={} block={} bbox=[{:.1},{:.1},{:.1},{:.1}] kind={} -->",
|
||||
self.page, self.block, self.bbox[0], self.bbox[1], self.bbox[2], self.bbox[3], self.kind
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse all pdftract anchors from markdown text.
|
||||
///
|
||||
/// Returns a vector of [`Anchor`] structs in the order they appear in the text.
|
||||
/// Invalid anchor formats are silently skipped.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `md` - The markdown text to parse
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A vector of parsed anchors.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use pdftract_core::markdown::parse_anchors;
|
||||
///
|
||||
/// let md = r#"<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
|
||||
/// # Title
|
||||
///
|
||||
/// <!-- pdftract: page=0 block=1 bbox=[72.0,600.0,540.0,630.0] kind=paragraph -->
|
||||
/// Some text."#;
|
||||
///
|
||||
/// let anchors = parse_anchors(md);
|
||||
/// assert_eq!(anchors.len(), 2);
|
||||
/// assert_eq!(anchors[0].page, 0);
|
||||
/// assert_eq!(anchors[0].block, 0);
|
||||
/// assert_eq!(anchors[1].page, 0);
|
||||
/// assert_eq!(anchors[1].block, 1);
|
||||
/// ```
|
||||
pub fn parse_anchors(md: &str) -> Vec<Anchor> {
|
||||
let mut anchors = Vec::new();
|
||||
|
||||
for captures in anchor_regex().captures_iter(md) {
|
||||
// Parse page number
|
||||
let page = match captures.get(1).and_then(|m| m.as_str().parse().ok()) {
|
||||
Some(p) => p,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
// Parse block number
|
||||
let block = match captures.get(2).and_then(|m| m.as_str().parse().ok()) {
|
||||
Some(b) => b,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
// Parse bbox: "x0,y0,x1,y1" with possible decimal points
|
||||
let bbox_str = match captures.get(3) {
|
||||
Some(m) => m.as_str(),
|
||||
None => continue,
|
||||
};
|
||||
|
||||
let bbox: [f32; 4] = match parse_bbox(bbox_str) {
|
||||
Some(b) => b,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
// Parse kind
|
||||
let kind = match captures.get(4) {
|
||||
Some(m) => m.as_str().to_string(),
|
||||
None => continue,
|
||||
};
|
||||
|
||||
anchors.push(Anchor::new(page, block, bbox, kind));
|
||||
}
|
||||
|
||||
anchors
|
||||
}
|
||||
|
||||
/// Turn a comma-separated list such as "72.0,640.5,540.0,672.0" into `[f32; 4]`.
///
/// Whitespace around each field is ignored. Returns `None` unless the input
/// has exactly four fields and each one parses as an `f32`.
fn parse_bbox(s: &str) -> Option<[f32; 4]> {
    let mut fields = s.split(',');
    let mut coords = [0.0f32; 4];

    // Pull exactly four fields; a missing or unparsable one aborts.
    for slot in coords.iter_mut() {
        *slot = fields.next()?.trim().parse().ok()?;
    }

    // Anything left over means the list had more than four entries.
    match fields.next() {
        Some(_) => None,
        None => Some(coords),
    }
}
|
||||
|
||||
/// Convert a block to markdown with optional anchor comment.
|
||||
///
|
||||
/// If `include_anchor` is true, emits an HTML comment before the block content.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `block` - The block to convert
|
||||
/// * `page_index` - Zero-based page index
|
||||
/// * `block_index` - Zero-based block index within the page
|
||||
/// * `include_anchor` - Whether to include the HTML comment anchor
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A markdown string with optional anchor.
|
||||
pub fn block_to_markdown(block: &BlockJson, page_index: usize, block_index: usize, include_anchor: bool) -> String {
|
||||
let mut result = String::new();
|
||||
|
||||
// Add anchor comment if requested
|
||||
if include_anchor {
|
||||
let anchor = Anchor::new(
|
||||
page_index,
|
||||
block_index,
|
||||
[block.bbox[0] as f32, block.bbox[1] as f32, block.bbox[2] as f32, block.bbox[3] as f32],
|
||||
block.kind.clone(),
|
||||
);
|
||||
result.push_str(&anchor.to_comment());
|
||||
result.push('\n');
|
||||
}
|
||||
|
||||
// Add block content based on kind
|
||||
match block.kind.as_str() {
|
||||
"heading" => {
|
||||
let level = block.level.unwrap_or(1);
|
||||
let prefix = "#".repeat(level as usize);
|
||||
result.push_str(&format!("{} {}\n", prefix, block.text));
|
||||
}
|
||||
"paragraph" => {
|
||||
result.push_str(&format!("{}\n", block.text));
|
||||
}
|
||||
"list" => {
|
||||
result.push_str(&format!("* {}\n", block.text));
|
||||
}
|
||||
"table" => {
|
||||
result.push_str(&format!("| {}\n", block.text));
|
||||
}
|
||||
"figure" => {
|
||||
result.push_str(&format!("![]()\n\n{}\n", block.text));
|
||||
}
|
||||
_ => {
|
||||
result.push_str(&format!("{}\n", block.text));
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Convert all blocks from a page to markdown with optional anchors.
|
||||
///
|
||||
/// If `include_anchor` is true, each block is preceded by an HTML comment.
|
||||
/// If `include_page_break` is true, adds a horizontal rule between pages.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `blocks` - The blocks to convert
|
||||
/// * `page_index` - Zero-based page index
|
||||
/// * `include_anchor` - Whether to include HTML comment anchors
|
||||
/// * `include_page_break` - Whether to add a page break separator
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A markdown string with all blocks from the page.
|
||||
pub fn page_to_markdown(blocks: &[BlockJson], page_index: usize, include_anchor: bool, include_page_break: bool) -> String {
|
||||
let mut result = String::new();
|
||||
|
||||
for (block_index, block) in blocks.iter().enumerate() {
|
||||
let md = block_to_markdown(block, page_index, block_index, include_anchor);
|
||||
result.push_str(&md);
|
||||
result.push('\n');
|
||||
}
|
||||
|
||||
// Add page break if requested and this isn't the last page
|
||||
if include_page_break {
|
||||
result.push_str("\n---\n\n");
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    //! Unit tests for anchor formatting, anchor parsing and Markdown rendering.

    use super::*;
    use crate::schema::BlockJson;

    /// Canonical anchor used by several parsing tests.
    const HEADING_ANCHOR: &str =
        "<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->";

    /// Build a block with the given kind, text and bbox and no optional fields.
    fn make_test_block(kind: &str, text: &str, bbox: [f64; 4]) -> BlockJson {
        BlockJson {
            kind: kind.to_string(),
            text: text.to_string(),
            bbox,
            level: None,
            table_index: None,
            receipt: None,
        }
    }

    /// Build a level-2 heading block, used where the heading level matters.
    fn make_level2_heading(text: &str) -> BlockJson {
        BlockJson {
            level: Some(2),
            ..make_test_block("heading", text, [72.0, 640.5, 540.0, 672.0])
        }
    }

    #[test]
    fn test_anchor_to_comment() {
        let anchor = Anchor::new(3, 12, [72.0, 640.5, 540.0, 672.0], "heading".to_string());
        let comment = anchor.to_comment();
        assert_eq!(comment, "<!-- pdftract: page=3 block=12 bbox=[72.0,640.5,540.0,672.0] kind=heading -->");
    }

    #[test]
    fn test_anchor_to_comment_round_bbox() {
        let anchor = Anchor::new(0, 0, [72.123, 640.567, 540.999, 672.111], "paragraph".to_string());
        let comment = anchor.to_comment();
        // Should be rounded to 1 decimal place
        assert_eq!(comment, "<!-- pdftract: page=0 block=0 bbox=[72.1,640.6,541.0,672.1] kind=paragraph -->");
    }

    #[test]
    fn test_parse_anchors_single() {
        let md = format!("{}\n# Title", HEADING_ANCHOR);

        let anchors = parse_anchors(&md);
        assert_eq!(anchors.len(), 1);
        assert_eq!(anchors[0].page, 0);
        assert_eq!(anchors[0].block, 0);
        assert_eq!(anchors[0].bbox, [72.0, 640.5, 540.0, 672.0]);
        assert_eq!(anchors[0].kind, "heading");
    }

    #[test]
    fn test_parse_anchors_multiple() {
        let md = format!(
            "{}\n# Title\n\n{}\nSome text.",
            HEADING_ANCHOR,
            "<!-- pdftract: page=0 block=1 bbox=[72.0,600.0,540.0,630.0] kind=paragraph -->"
        );

        let anchors = parse_anchors(&md);
        assert_eq!(anchors.len(), 2);
        assert_eq!(anchors[0].page, 0);
        assert_eq!(anchors[0].block, 0);
        assert_eq!(anchors[1].page, 0);
        assert_eq!(anchors[1].block, 1);
    }

    #[test]
    fn test_parse_anchors_invalid_format_skipped() {
        let md = format!("{}\n# Title\n\n<!-- malformed anchor -->\nSome text.", HEADING_ANCHOR);

        let anchors = parse_anchors(&md);
        assert_eq!(anchors.len(), 1);
    }

    #[test]
    fn test_parse_anchors_whitespace_tolerant() {
        // Deliberately irregular spacing: no space after `<!--` or before
        // `-->`, and runs of spaces/tabs between the fields.
        let md = "<!--pdftract:page=7 \t block=2   bbox=[72.0,640.5,540.0,672.0]\tkind=heading-->";

        let anchors = parse_anchors(md);
        assert_eq!(anchors.len(), 1);
        assert_eq!(anchors[0].page, 7);
        assert_eq!(anchors[0].block, 2);
        assert_eq!(anchors[0].bbox, [72.0, 640.5, 540.0, 672.0]);
        assert_eq!(anchors[0].kind, "heading");
    }

    #[test]
    fn test_parse_bbox() {
        assert_eq!(parse_bbox("72.0,640.5,540.0,672.0"), Some([72.0, 640.5, 540.0, 672.0]));
        assert_eq!(parse_bbox("0,0,100,100"), Some([0.0, 0.0, 100.0, 100.0]));
        assert_eq!(parse_bbox("72.0, 640.5, 540.0, 672.0"), Some([72.0, 640.5, 540.0, 672.0])); // with spaces
        assert_eq!(parse_bbox("invalid"), None);
        assert_eq!(parse_bbox("1,2,3"), None); // too few values
        assert_eq!(parse_bbox("1,2,3,4,5"), None); // too many values
    }

    #[test]
    fn test_block_to_markdown_heading_with_anchor() {
        let block = make_level2_heading("Chapter 1");

        let md = block_to_markdown(&block, 0, 0, true);
        assert!(md.contains("<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->"));
        assert!(md.contains("## Chapter 1"));
    }

    #[test]
    fn test_block_to_markdown_paragraph_without_anchor() {
        let block = make_test_block("paragraph", "Some text.", [72.0, 600.0, 540.0, 630.0]);
        let md = block_to_markdown(&block, 0, 0, false);
        assert!(!md.contains("<!-- pdftract:"));
        assert!(md.contains("Some text."));
    }

    #[test]
    fn test_block_to_markdown_list() {
        let block = make_test_block("list", "Item 1", [72.0, 500.0, 540.0, 520.0]);
        let md = block_to_markdown(&block, 0, 0, false);
        assert!(md.contains("* Item 1"));
    }

    #[test]
    fn test_block_to_markdown_table() {
        let block = make_test_block("table", "Cell data", [72.0, 400.0, 540.0, 450.0]);
        let md = block_to_markdown(&block, 0, 0, false);
        assert!(md.contains("| Cell data"));
    }

    #[test]
    fn test_block_to_markdown_figure() {
        let block = make_test_block("figure", "Alt text", [72.0, 300.0, 540.0, 350.0]);
        let md = block_to_markdown(&block, 0, 0, false);
        assert!(md.contains("![]()"));
        assert!(md.contains("Alt text"));
    }

    /// Two-block page shared by the `page_to_markdown` tests.
    fn title_and_text() -> Vec<BlockJson> {
        vec![
            make_test_block("heading", "Title", [72.0, 640.5, 540.0, 672.0]),
            make_test_block("paragraph", "Text", [72.0, 600.0, 540.0, 630.0]),
        ]
    }

    #[test]
    fn test_page_to_markdown_with_page_break() {
        let md = page_to_markdown(&title_and_text(), 0, false, true);
        assert!(md.contains("---"));
    }

    #[test]
    fn test_page_to_markdown_without_page_break() {
        let md = page_to_markdown(&title_and_text(), 0, false, false);
        assert!(!md.contains("---"));
    }

    #[test]
    fn test_page_to_markdown_with_anchors() {
        let md = page_to_markdown(&title_and_text(), 0, true, false);
        assert_eq!(md.matches("<!-- pdftract:").count(), 2);
    }

    #[test]
    fn test_roundtrip_extract_and_parse() {
        let blocks = vec![make_level2_heading("Chapter 1")];

        let md = page_to_markdown(&blocks, 3, true, false);
        let anchors = parse_anchors(&md);

        assert_eq!(anchors.len(), 1);
        assert_eq!(anchors[0].page, 3);
        assert_eq!(anchors[0].block, 0);
        // These coordinates are exact at one decimal place, so the bbox must
        // survive the format/parse cycle unchanged.
        assert_eq!(anchors[0].bbox, [72.0, 640.5, 540.0, 672.0]);
        assert_eq!(anchors[0].kind, "heading");
    }
}
|
||||
|
|
@ -146,6 +146,24 @@ pub struct ExtractionOptions {
|
|||
///
|
||||
/// See docs/notes/ocr-language-packs.md for the full distribution strategy.
|
||||
pub ocr_language: Vec<String>,
|
||||
|
||||
/// Emit HTML comment anchors before each block in Markdown output (Phase 6.5).
|
||||
///
|
||||
/// When enabled, each block in markdown output is preceded by a single-line
|
||||
/// HTML comment containing positional metadata:
|
||||
///
|
||||
/// ```markdown
|
||||
/// <!-- pdftract: page=3 block=12 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
|
||||
/// ## Chapter 3
|
||||
/// ```
|
||||
///
|
||||
/// This allows downstream tools (LLM agents, audit tools, document Q&A systems)
|
||||
/// to map a Markdown excerpt back to a precise PDF location. HTML comments
|
||||
/// are passthrough in every major Markdown renderer (GitHub, GitLab, Obsidian,
|
||||
/// Notion import, pulldown-cmark, marked, markdown-it).
|
||||
///
|
||||
/// Default: false (anchors disabled)
|
||||
pub markdown_anchors: bool,
|
||||
}
|
||||
|
||||
impl Default for ExtractionOptions {
|
||||
|
|
@ -157,6 +175,7 @@ impl Default for ExtractionOptions {
|
|||
full_render: false,
|
||||
ocr_dpi_override: None,
|
||||
ocr_language: vec!["eng".to_string()],
|
||||
markdown_anchors: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -190,6 +209,7 @@ impl ExtractionOptions {
|
|||
receipts,
|
||||
ocr_dpi_override: None,
|
||||
ocr_language: vec!["eng".to_string()],
|
||||
markdown_anchors: false,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
|
@ -200,6 +220,7 @@ impl ExtractionOptions {
|
|||
receipts: ReceiptsMode::from_str(receipts)?,
|
||||
ocr_dpi_override: None,
|
||||
ocr_language: vec!["eng".to_string()],
|
||||
markdown_anchors: false,
|
||||
..Default::default()
|
||||
})
|
||||
}
|
||||
|
|
@ -219,6 +240,7 @@ impl ExtractionOptions {
|
|||
memory_budget_mb: memory_budget_mb.max(64),
|
||||
ocr_dpi_override: None,
|
||||
ocr_language: vec!["eng".to_string()],
|
||||
markdown_anchors: false,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
|
|
|||
163
docs/integrations/markdown-anchors.md
Normal file
163
docs/integrations/markdown-anchors.md
Normal file
|
|
@ -0,0 +1,163 @@
|
|||
# Markdown Anchors Integration Guide
|
||||
|
||||
This document describes the positional HTML comment anchors feature in pdftract's Markdown output.
|
||||
|
||||
## Overview
|
||||
|
||||
When `--md-anchors` is enabled, each block in markdown output is preceded by a single-line HTML comment containing positional metadata. This allows downstream tools (LLM agents, audit tools, document Q&A systems) to map a Markdown excerpt back to a precise PDF location.
|
||||
|
||||
## Anchor Format
|
||||
|
||||
Each anchor is a single-line HTML comment:
|
||||
|
||||
```markdown
|
||||
<!-- pdftract: page=3 block=12 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
|
||||
## Chapter 3
|
||||
```
|
||||
|
||||
### Fields
|
||||
|
||||
- `page`: Zero-based page index (0, 1, 2, ...)
|
||||
- `block`: Zero-based block index within the page (0, 1, 2, ...)
|
||||
- `bbox`: Bounding box in PDF points `[x0, y0, x1, y1]` with 1 decimal place precision
|
||||
- `kind`: Block kind (`heading`, `paragraph`, `list`, `table`, `figure`, etc.)
|
||||
|
||||
### Regex Schema
|
||||
|
||||
The anchor format is parseable with this stable regex:
|
||||
|
||||
```regex
|
||||
<!--\s*pdftract:\s*page=(\d+)\s+block=(\d+)\s+bbox=\[([\d.,]+)\]\s+kind=(\w+)\s*-->
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### CLI
|
||||
|
||||
```bash
|
||||
# Enable anchors in markdown output
|
||||
pdftract extract input.pdf --format markdown --md-anchors > output.md
|
||||
```
|
||||
|
||||
### Rust API
|
||||
|
||||
```rust
|
||||
use pdftract_core::markdown::{parse_anchors, Anchor};
|
||||
|
||||
// Parse anchors from markdown text
|
||||
let md = std::fs::read_to_string("output.md")?;
|
||||
let anchors = parse_anchors(&md);
|
||||
|
||||
for anchor in anchors {
|
||||
println!("Page {} Block {} at {:?}", anchor.page, anchor.block, anchor.bbox);
|
||||
}
|
||||
```
|
||||
|
||||
## Properties
|
||||
|
||||
### Stability
|
||||
|
||||
The anchor format is a **stable public API**. The regex schema will not change in a breaking way across minor versions. New fields may be added, but existing fields will remain compatible.
|
||||
|
||||
### Passthrough
|
||||
|
||||
HTML comments are passthrough in every major Markdown renderer:
|
||||
- GitHub
|
||||
- GitLab
|
||||
- Obsidian
|
||||
- Notion import
|
||||
- pulldown-cmark
|
||||
- marked
|
||||
- markdown-it
|
||||
|
||||
Anchored output remains human-readable while machines can recover positional metadata.
|
||||
|
||||
### Round-trip
|
||||
|
||||
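A round-trip property holds: extracting with `--md-anchors` and then parsing the anchors recovers, for every block, its page index, block index, kind, and bounding box (rounded to 1 decimal place). The block text can be recovered from the Markdown that follows each anchor, modulo inline styling, which is lossy in Markdown.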
|
||||
|
||||
## Edge Cases
|
||||
|
||||
### Code Fences
|
||||
|
||||
HTML comments inside code fences (```) are not recognized by Markdown renderers—they're emitted verbatim. This is a limitation of the Markdown spec, not pdftract.
|
||||
|
||||
### Empty Blocks
|
||||
|
||||
Empty blocks (e.g., blank pages) still emit anchors with empty content following.
|
||||
|
||||
### Block Index
|
||||
|
||||
Block index is **per-page**, not global. Each page starts at block 0. Use the `page` field to compute global indices if needed.
|
||||
|
||||
## Examples
|
||||
|
||||
### Heading with Anchor
|
||||
|
||||
```markdown
|
||||
<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
|
||||
# Introduction
|
||||
```
|
||||
|
||||
### Paragraph with Anchor
|
||||
|
||||
```markdown
|
||||
<!-- pdftract: page=0 block=1 bbox=[72.0,600.0,540.0,630.0] kind=paragraph -->
|
||||
This is the first paragraph of the document.
|
||||
```
|
||||
|
||||
### Table with Anchor
|
||||
|
||||
```markdown
|
||||
<!-- pdftract: page=1 block=0 bbox=[72.0,400.0,540.0,500.0] kind=table -->
|
||||
| Column 1 | Column 2 |
|
||||
|----------|----------|
|
||||
| Cell 1 | Cell 2 |
|
||||
```
|
||||
|
||||
## Integration Examples
|
||||
|
||||
### Python: Extract Anchors
|
||||
|
||||
```python
|
||||
import re
|
||||
|
||||
ANCHOR_RE = re.compile(
|
||||
r'<!--\s*pdftract:\s*page=(\d+)\s+block=(\d+)\s+bbox=\[([\d.,]+)\]\s+kind=(\w+)\s*-->'
|
||||
)
|
||||
|
||||
def extract_anchors(md_text):
|
||||
"""Return list of (page, block, bbox, kind) tuples."""
|
||||
anchors = []
|
||||
for match in ANCHOR_RE.finditer(md_text):
|
||||
page = int(match.group(1))
|
||||
block = int(match.group(2))
|
||||
bbox = [float(x) for x in match.group(3).split(',')]
|
||||
kind = match.group(4)
|
||||
anchors.append((page, block, bbox, kind))
|
||||
return anchors
|
||||
```
|
||||
|
||||
### JavaScript: Parse Anchors
|
||||
|
||||
```javascript
|
||||
const ANCHOR_RE = /<!--\s*pdftract:\s*page=(\d+)\s+block=(\d+)\s+bbox=\[([\d.,]+)\]\s+kind=(\w+)\s*-->/g;
|
||||
|
||||
function extractAnchors(md) {
|
||||
const anchors = [];
|
||||
let match;
|
||||
while ((match = ANCHOR_RE.exec(md)) !== null) {
|
||||
anchors.push({
|
||||
page: parseInt(match[1]),
|
||||
block: parseInt(match[2]),
|
||||
bbox: match[3].split(',').map(Number),
|
||||
kind: match[4]
|
||||
});
|
||||
}
|
||||
return anchors;
|
||||
}
|
||||
```
|
||||
|
||||
## Version History
|
||||
|
||||
- **v0.1.0**: Initial release with `--md-anchors` flag and stable regex schema.
|
||||
110
notes/pdftract-vk0gc.md
Normal file
110
notes/pdftract-vk0gc.md
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
# Verification Note: pdftract-vk0gc (Markdown Anchors)
|
||||
|
||||
## Summary
|
||||
|
||||
Implemented `--md-anchors` positional HTML comment markers for Markdown output with parser regex.
|
||||
|
||||
## Changes Made
|
||||
|
||||
### 1. Core Implementation (crates/pdftract-core/src/markdown.rs)
|
||||
|
||||
Created new markdown module with:
|
||||
- `Anchor` struct with `page`, `block`, `bbox`, `kind` fields
|
||||
- `parse_anchors()` function with regex: `r"<!--\s*pdftract:\s*page=(\d+)\s+block=(\d+)\s+bbox=\[([\d.,]+)\]\s+kind=(\w+)\s*-->"`
|
||||
- `block_to_markdown()` - converts single block to markdown with optional anchor
|
||||
- `page_to_markdown()` - converts all blocks from a page with optional anchors and page breaks
|
||||
- `Anchor::to_comment()` - formats anchor as HTML comment with 1 decimal place precision
|
||||
|
||||
### 2. Options (crates/pdftract-core/src/options.rs)
|
||||
|
||||
Added `markdown_anchors: bool` field to `ExtractionOptions` with default `false`.
|
||||
|
||||
### 3. CLI Integration (crates/pdftract-cli/src/main.rs)
|
||||
|
||||
- Added `--md-anchors` flag to Extract command
|
||||
- Passed flag through to ExtractionOptions
|
||||
- Updated markdown output to use `page_to_markdown()` when anchors enabled
|
||||
- Added import for `page_to_markdown` and `block_to_markdown`
|
||||
|
||||
### 4. Documentation (docs/integrations/markdown-anchors.md)
|
||||
|
||||
Created comprehensive integration guide covering:
|
||||
- Anchor format specification
|
||||
- Regex schema
|
||||
- CLI and Rust API usage
|
||||
- Edge cases (code fences, empty blocks, per-page indexing)
|
||||
- Integration examples for Python and JavaScript
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
### PASS
|
||||
|
||||
- ✅ `--md-anchors` flag emits comment before every block
|
||||
- ✅ Parser regex extracts page, block, bbox, kind from sample output
|
||||
- ✅ Round-trip test: `test_roundtrip_extract_and_parse` passes
|
||||
- ✅ Comment is ONE LINE (no embedded newline)
|
||||
- ✅ bbox precision: 1 decimal place exact (verified in `test_anchor_to_comment_round_bbox`)
|
||||
- ✅ kind matches block kind (heading, paragraph, etc.)
|
||||
- ✅ Parser library `parse_anchors()` available
|
||||
- ✅ Module exports: `Anchor`, `parse_anchors`, `block_to_markdown`, `page_to_markdown`
|
||||
- ✅ 16 unit tests pass (including roundtrip, bbox parsing, multiple anchors)
|
||||
- ✅ Regex is stable public API (documented in markdown-anchors.md)
|
||||
- ✅ HTML comments are passthrough in major renderers (documented)
|
||||
- ✅ Block index is per-page (0-based within page)
|
||||
|
||||
### WARN (Infrastructure limitations)
|
||||
|
||||
- None
|
||||
|
||||
## Testing
|
||||
|
||||
### Unit Tests (16/16 pass)
|
||||
|
||||
- `test_anchor_to_comment` - basic comment formatting
|
||||
- `test_anchor_to_comment_round_bbox` - 1 decimal place precision
|
||||
- `test_parse_anchors_single` - parse single anchor
|
||||
- `test_parse_anchors_multiple` - parse multiple anchors
|
||||
- `test_parse_anchors_invalid_format_skipped` - invalid formats skipped
|
||||
- `test_parse_anchors_whitespace_tolerant` - whitespace tolerance
|
||||
- `test_parse_bbox` - bbox parsing with various formats
|
||||
- `test_block_to_markdown_heading_with_anchor` - heading with anchor
|
||||
- `test_block_to_markdown_paragraph_without_anchor` - paragraph without anchor
|
||||
- `test_block_to_markdown_list` - list block
|
||||
- `test_block_to_markdown_table` - table block
|
||||
- `test_block_to_markdown_figure` - figure block
|
||||
- `test_page_to_markdown_with_page_break` - page break separator
|
||||
- `test_page_to_markdown_without_page_break` - no page break
|
||||
- `test_page_to_markdown_with_anchors` - anchors enabled
|
||||
- `test_roundtrip_extract_and_parse` - full roundtrip
|
||||
|
||||
### Build Verification
|
||||
|
||||
- `cargo build -p pdftract-core` - ✅ Success
|
||||
- `cargo build -p pdftract-cli` - ✅ Success
|
||||
- `cargo test -p pdftract-core --lib markdown` - ✅ 16/16 tests pass
|
||||
|
||||
## Example Output
|
||||
|
||||
With `--md-anchors` enabled:
|
||||
|
||||
```markdown
|
||||
<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
|
||||
# Chapter 1
|
||||
|
||||
<!-- pdftract: page=0 block=1 bbox=[72.0,600.0,540.0,630.0] kind=paragraph -->
|
||||
This is the first paragraph.
|
||||
```
|
||||
|
||||
## Files Modified
|
||||
|
||||
- `crates/pdftract-core/src/markdown.rs` (new)
|
||||
- `crates/pdftract-core/src/lib.rs` (module export)
|
||||
- `crates/pdftract-core/src/options.rs` (markdown_anchors field)
|
||||
- `crates/pdftract-core/Cargo.toml` (regex dependency already present)
|
||||
- `crates/pdftract-cli/src/main.rs` (CLI flag and output logic)
|
||||
- `docs/integrations/markdown-anchors.md` (new documentation)
|
||||
|
||||
## References
|
||||
|
||||
- Plan section: Phase 6.5 positional anchors (lines 2183-2197)
|
||||
- Bead: pdftract-vk0gc
|
||||
Loading…
Add table
Reference in a new issue