Fix two compilation errors at lines 584 and 658 where code was calling .code on &String diagnostics. Replaced d.code.to_string() with a direct Vec<String> clone since diagnostics is already Vec<String>. Acceptance criteria: - cargo check -p pdftract-cli emits no 'no field code' errors - serve.rs compiles cleanly
2940 lines
91 KiB
Rust
2940 lines
91 KiB
Rust
//! Markdown output generation with positional HTML comment anchors.
|
||
//!
|
||
//! This module provides functions for converting extracted PDF content to
|
||
//! Markdown format with optional HTML comment anchors that allow downstream
|
||
//! tools to map excerpts back to precise PDF locations.
|
||
//!
|
||
//! # Anchor Format
|
||
//!
|
||
//! Each block can be preceded by a single-line HTML comment:
|
||
//!
|
||
//! ```markdown
|
||
//! <!-- pdftract: page=3 block=12 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
|
||
//! ## Chapter 3
|
||
//! ```
|
||
//!
|
||
//! The anchor format is a stable schema parseable with one regex:
|
||
//!
|
||
//! ```text
|
||
//! <!-- pdftract: page=(\d+) block=(\d+) bbox=\[([\d.,]+)\] kind=(\w+) -->
|
||
//! ```
|
||
//!
|
||
//! # Parsing Anchors
|
||
//!
|
||
//! Use [`parse_anchors`] to extract all anchors from markdown text:
|
||
//!
|
||
//! ```
|
||
//! use pdftract_core::markdown::{parse_anchors, Anchor};
|
||
//!
|
||
//! let md = r#"<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
|
||
//! # Title"#;
|
||
//!
|
||
//! let anchors = parse_anchors(md);
|
||
//! assert_eq!(anchors.len(), 1);
|
||
//! assert_eq!(anchors[0].page, 0);
|
||
//! assert_eq!(anchors[0].block, 0);
|
||
//! ```
|
||
|
||
use crate::schema::{
|
||
BeadJson, BlockJson, ChoiceValueJson, FormFieldJson, FormFieldTypeJson, FormFieldValueJson,
|
||
SpanJson, TableJson, ThreadJson,
|
||
};
|
||
use regex::Regex;
|
||
use serde::{Deserialize, Serialize};
|
||
use std::sync::OnceLock;
|
||
|
||
/// Switches that decide which optional block kinds and separators are
/// written when blocks are rendered to Markdown.
///
/// All flags are `false` in the `Default` value.
#[derive(Debug, Clone, Copy, Default)]
pub struct MarkdownOptions {
    /// When `true`, `header` / `footer` blocks are emitted instead of dropped.
    pub include_headers_footers: bool,
    /// When `true`, `watermark` blocks are emitted instead of dropped.
    pub include_watermarks: bool,
    /// When `true`, a `---` separator is appended after a page's blocks.
    pub include_page_breaks: bool,
}

impl MarkdownOptions {
    /// Build options with every flag at its default (`false`).
    pub fn new() -> Self {
        Self::default()
    }

    /// Return a copy with `include_headers_footers` set to `include`.
    pub fn with_headers_footers(self, include: bool) -> Self {
        Self {
            include_headers_footers: include,
            ..self
        }
    }

    /// Return a copy with `include_watermarks` set to `include`.
    pub fn with_watermarks(self, include: bool) -> Self {
        Self {
            include_watermarks: include,
            ..self
        }
    }

    /// Return a copy with `include_page_breaks` set to `include`.
    pub fn with_page_breaks(self, include: bool) -> Self {
        Self {
            include_page_breaks: include,
            ..self
        }
    }
}
|
||
|
||
/// Regex for parsing pdftract HTML comment anchors.
|
||
///
|
||
/// Format: `<!-- pdftract: page=(\d+) block=(\d+) bbox=\[([\d.,]+)\] kind=(\w+) -->`
|
||
fn anchor_regex() -> &'static Regex {
|
||
static REGEX: OnceLock<Regex> = OnceLock::new();
|
||
REGEX.get_or_init(|| {
|
||
Regex::new(
|
||
r"<!--\s*pdftract:\s*page=(\d+)\s+block=(\d+)\s+bbox=\[([\d.,]+)\]\s+kind=(\w+)\s*-->",
|
||
)
|
||
.expect("invalid ANCHOR_REGEX")
|
||
})
|
||
}
|
||
|
||
/// A parsed HTML comment anchor containing positional metadata.
|
||
///
|
||
/// Anchors are extracted from markdown output and provide a mapping from
|
||
/// markdown text back to precise PDF locations.
|
||
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
||
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
|
||
pub struct Anchor {
|
||
/// Zero-based page index.
|
||
pub page: usize,
|
||
/// Zero-based block index within the page.
|
||
pub block: usize,
|
||
/// Bounding box in PDF points: [x0, y0, x1, y1].
|
||
pub bbox: [f32; 4],
|
||
/// Block kind (e.g., "heading", "paragraph", "table").
|
||
pub kind: String,
|
||
}
|
||
|
||
impl Anchor {
|
||
/// Create a new anchor from components.
|
||
pub fn new(page: usize, block: usize, bbox: [f32; 4], kind: String) -> Self {
|
||
Self {
|
||
page,
|
||
block,
|
||
bbox,
|
||
kind,
|
||
}
|
||
}
|
||
|
||
/// Format this anchor as an HTML comment.
|
||
///
|
||
/// Returns a single-line comment suitable for insertion before block content.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use pdftract_core::markdown::Anchor;
|
||
///
|
||
/// let anchor = Anchor::new(3, 12, [72.0, 640.5, 540.0, 672.0], "heading".to_string());
|
||
/// let comment = anchor.to_comment();
|
||
/// assert_eq!(comment, "<!-- pdftract: page=3 block=12 bbox=[72.0,640.5,540.0,672.0] kind=heading -->");
|
||
/// ```
|
||
pub fn to_comment(&self) -> String {
|
||
format!(
|
||
"<!-- pdftract: page={} block={} bbox=[{:.1},{:.1},{:.1},{:.1}] kind={} -->",
|
||
self.page,
|
||
self.block,
|
||
self.bbox[0],
|
||
self.bbox[1],
|
||
self.bbox[2],
|
||
self.bbox[3],
|
||
self.kind
|
||
)
|
||
}
|
||
}
|
||
|
||
/// Parse all pdftract anchors from markdown text.
|
||
///
|
||
/// Returns a vector of [`Anchor`] structs in the order they appear in the text.
|
||
/// Invalid anchor formats are silently skipped.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `md` - The markdown text to parse
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A vector of parsed anchors.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use pdftract_core::markdown::parse_anchors;
|
||
///
|
||
/// let md = r#"<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->
|
||
/// # Title
|
||
///
|
||
/// <!-- pdftract: page=0 block=1 bbox=[72.0,600.0,540.0,630.0] kind=paragraph -->
|
||
/// Some text."#;
|
||
///
|
||
/// let anchors = parse_anchors(md);
|
||
/// assert_eq!(anchors.len(), 2);
|
||
/// assert_eq!(anchors[0].page, 0);
|
||
/// assert_eq!(anchors[0].block, 0);
|
||
/// assert_eq!(anchors[1].page, 0);
|
||
/// assert_eq!(anchors[1].block, 1);
|
||
/// ```
|
||
pub fn parse_anchors(md: &str) -> Vec<Anchor> {
|
||
let mut anchors = Vec::new();
|
||
|
||
for captures in anchor_regex().captures_iter(md) {
|
||
// Parse page number
|
||
let page = match captures.get(1).and_then(|m| m.as_str().parse().ok()) {
|
||
Some(p) => p,
|
||
None => continue,
|
||
};
|
||
|
||
// Parse block number
|
||
let block = match captures.get(2).and_then(|m| m.as_str().parse().ok()) {
|
||
Some(b) => b,
|
||
None => continue,
|
||
};
|
||
|
||
// Parse bbox: "x0,y0,x1,y1" with possible decimal points
|
||
let bbox_str = match captures.get(3) {
|
||
Some(m) => m.as_str(),
|
||
None => continue,
|
||
};
|
||
|
||
let bbox: [f32; 4] = match parse_bbox(bbox_str) {
|
||
Some(b) => b,
|
||
None => continue,
|
||
};
|
||
|
||
// Parse kind
|
||
let kind = match captures.get(4) {
|
||
Some(m) => m.as_str().to_string(),
|
||
None => continue,
|
||
};
|
||
|
||
anchors.push(Anchor::new(page, block, bbox, kind));
|
||
}
|
||
|
||
anchors
|
||
}
|
||
|
||
/// Turn a comma-separated string such as "72.0,640.5,540.0,672.0" into four
/// `f32` coordinates.
///
/// Returns `None` unless there are exactly four fields and each one parses
/// as `f32` after surrounding whitespace is trimmed.
fn parse_bbox(s: &str) -> Option<[f32; 4]> {
    let mut fields = s.split(',');
    let mut coords = [0.0f32; 4];

    for slot in coords.iter_mut() {
        // Too few fields or an unparsable field both reject the whole bbox.
        *slot = fields.next()?.trim().parse().ok()?;
    }

    // A fifth field means the input was not a 4-tuple.
    if fields.next().is_some() {
        return None;
    }

    Some(coords)
}
|
||
|
||
/// Emit a block as Markdown based on its kind.
|
||
///
|
||
/// This function implements the Phase 6.5 block-kind dispatch table, mapping
|
||
/// each block type to its appropriate Markdown representation.
|
||
///
|
||
/// # Block Kind Dispatch Table
|
||
///
|
||
/// | Block kind | Markdown emission |
|
||
/// |---|---|
|
||
/// | `heading` (level N) | `#` × N + space + text + `\n\n` |
|
||
/// | `paragraph` | text + `\n\n`; soft line breaks as ` \n` |
|
||
/// | `list` (bulleted) | `- item\n` per item |
|
||
/// | `list` (numbered) | `1. item\n` (preserves source numbering) |
|
||
/// | `code` | Fenced block with language detection |
|
||
/// | `formula` (inline) | `$expr$` |
|
||
/// | `formula` (display) | `$$\nexpr\n$$\n\n` |
|
||
/// | `table` | GFM pipe table or HTML fallback |
|
||
/// | `caption` | `*text*\n\n` |
|
||
/// | `figure` | `\n\n` |
|
||
/// | `header` / `footer` | Skipped unless `include_headers_footers` |
|
||
/// | `watermark` | Skipped unless `include_watermarks` |
|
||
/// | `block_quote` | `> line\n` per line |
|
||
/// | `toc` | Emitted as plain text |
|
||
/// | `note` / `footnote` | Emitted as inline text |
|
||
/// | `reference` | Emitted as plain text |
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `block` - The block to convert
|
||
/// * `tables` - The tables array for looking up table structures
|
||
/// * `options` - Markdown emission options
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A markdown string representing the block.
|
||
fn emit_block_kind(block: &BlockJson, tables: &[TableJson], options: &MarkdownOptions) -> String {
|
||
match block.kind.as_str() {
|
||
"heading" => emit_heading(block),
|
||
|
||
"paragraph" => emit_paragraph(block),
|
||
|
||
"list" | "list_item" => emit_list_item(block),
|
||
|
||
"code" => emit_code_block(block),
|
||
|
||
"formula" => emit_formula(block),
|
||
|
||
"table" => emit_table_block(block, tables),
|
||
|
||
"caption" => emit_caption(block),
|
||
|
||
"figure" => emit_figure(block),
|
||
|
||
"header" | "footer" => {
|
||
if options.include_headers_footers {
|
||
emit_header_footer(block)
|
||
} else {
|
||
String::new()
|
||
}
|
||
}
|
||
|
||
"watermark" => {
|
||
if options.include_watermarks {
|
||
emit_watermark(block)
|
||
} else {
|
||
String::new()
|
||
}
|
||
}
|
||
|
||
"block_quote" => emit_block_quote(block),
|
||
|
||
"toc" => emit_toc(block),
|
||
|
||
"note" | "footnote" => emit_note(block),
|
||
|
||
"reference" => emit_reference(block),
|
||
|
||
"list_label" | "list_body" => {
|
||
// These are internal structural elements, emit as plain text
|
||
format!("{}\n", block.text)
|
||
}
|
||
|
||
_ => {
|
||
// Unknown block kinds fall back to plain text
|
||
format!("{}\n", block.text)
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Emit a heading block with level from block.level or default to 1.
|
||
fn emit_heading(block: &BlockJson) -> String {
|
||
let level = block.level.unwrap_or(1).clamp(1, 6);
|
||
let prefix = "#".repeat(level as usize);
|
||
format!("{} {}\n\n", prefix, block.text)
|
||
}
|
||
|
||
/// Emit a paragraph block with soft line breaks preserved.
|
||
fn emit_paragraph(block: &BlockJson) -> String {
|
||
// Soft line breaks within a paragraph are encoded as trailing " \n"
|
||
// (CommonMark hard break syntax). Internal newlines in block.text
|
||
// become soft breaks, while the paragraph ends with "\n\n".
|
||
let text = block.text.replace('\n', " \n");
|
||
format!("{}\n\n", text)
|
||
}
|
||
|
||
/// Emit a list item (bulleted or numbered).
|
||
/// This is used for isolated list items without nesting context.
|
||
fn emit_list_item(block: &BlockJson) -> String {
|
||
// Try to detect if this is a numbered list by checking if text starts with a number
|
||
let is_numbered = block
|
||
.text
|
||
.chars()
|
||
.next()
|
||
.map(|c| c.is_ascii_digit())
|
||
.unwrap_or(false);
|
||
|
||
if is_numbered {
|
||
// Numbered list item - preserve source numbering
|
||
format!("{}\n", block.text)
|
||
} else {
|
||
// Bulleted list item
|
||
format!("* {}\n", block.text)
|
||
}
|
||
}
|
||
|
||
/// Emit a sequence of list blocks with proper nesting support.
|
||
///
|
||
/// This function groups consecutive list items and emits them with proper
|
||
/// indentation based on their bbox x0 (left margin) values. Nested sublists
|
||
/// are indented by 2 spaces per level per CommonMark convention.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `list_blocks` - A slice of consecutive list blocks
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A markdown string with properly indented list items.
|
||
///
|
||
/// # Nesting Detection
|
||
///
|
||
/// Nesting level is inferred from the bbox x0 (left margin) value:
|
||
/// - All items at the same x0 are at the same nesting level
|
||
/// - Items with greater x0 are nested under the previous item
|
||
/// - Each nesting level adds 2 spaces of indentation
|
||
fn emit_list_blocks(list_blocks: &[BlockJson]) -> String {
|
||
if list_blocks.is_empty() {
|
||
return String::new();
|
||
}
|
||
|
||
// Group by x0 value to detect nesting levels
|
||
let mut result = String::new();
|
||
let mut indent_levels: Vec<f64> = Vec::new(); // Track x0 values for each nesting level
|
||
|
||
for block in list_blocks {
|
||
let x0 = block.bbox[0];
|
||
|
||
// Determine nesting level by comparing x0 to known levels
|
||
let mut level = 0;
|
||
for (i, &indent) in indent_levels.iter().enumerate() {
|
||
if (x0 - indent).abs() < 5.0 {
|
||
// x0 matches this level (within 5 point tolerance)
|
||
level = i;
|
||
break;
|
||
}
|
||
}
|
||
|
||
// If x0 doesn't match any known level, it's a new level
|
||
if level == 0 && indent_levels.iter().all(|&v| (x0 - v).abs() >= 5.0) {
|
||
level = indent_levels.len();
|
||
indent_levels.push(x0);
|
||
} else if level < indent_levels.len() && indent_levels.iter().enumerate().all(|(i, &v)| i != level || (x0 - v).abs() >= 5.0) {
|
||
// x0 is a new level beyond current ones
|
||
level = indent_levels.len();
|
||
indent_levels.push(x0);
|
||
}
|
||
|
||
// Detect if this is a numbered list item
|
||
let is_numbered = block
|
||
.text
|
||
.chars()
|
||
.next()
|
||
.map(|c| c.is_ascii_digit())
|
||
.unwrap_or(false);
|
||
|
||
// Emit with proper indentation
|
||
let indent = " ".repeat(level);
|
||
if is_numbered {
|
||
// Numbered list item - preserve source numbering
|
||
result.push_str(&format!("{}{}\n", indent, block.text));
|
||
} else {
|
||
// Bulleted list item
|
||
result.push_str(&format!("{}* {}\n", indent, block.text));
|
||
}
|
||
}
|
||
|
||
result
|
||
}
|
||
|
||
/// Emit a code block with language detection.
|
||
fn emit_code_block(block: &BlockJson) -> String {
|
||
// Detect language from monospace font hint + optional shebang/keyword sniff
|
||
let lang = detect_code_language(&block.text);
|
||
format!("```{}\n{}\n```\n\n", lang, block.text)
|
||
}
|
||
|
||
/// Guess the programming language of a code snippet.
///
/// This is a best-effort heuristic:
/// 1. If the first line is a shebang, the interpreter name decides. The bare
///    `sh` substring is tested last so that paths such as `/usr/share/...`
///    do not mask a more specific interpreter.
/// 2. Otherwise (or if the shebang is not recognised) the lowercased text is
///    searched for characteristic keywords. Languages with distinctive
///    markers (Rust, C, Java, Go) are tested before the broad Python and
///    JavaScript keywords. `import ` would otherwise classify almost every
///    Java or Go file as Python.
///
/// Returns the language tag for the fence info string, or `""` when nothing
/// matches.
fn detect_code_language(code: &str) -> &'static str {
    let first_line = code.lines().next().unwrap_or("");

    if first_line.starts_with("#!") {
        if first_line.contains("python") {
            return "python";
        }
        if first_line.contains("node") || first_line.contains("javascript") {
            return "javascript";
        }
        if first_line.contains("perl") {
            return "perl";
        }
        if first_line.contains("ruby") {
            return "ruby";
        }
        // Covers sh, bash, zsh, ...; deliberately the last shebang test.
        if first_line.contains("sh") {
            return "bash";
        }
    }

    let lower = code.to_lowercase();
    let has = |needle: &str| lower.contains(needle);

    // Rust
    if has("fn main()") || has("use std::") || has("let mut ") {
        return "rust";
    }

    // C/C++
    if has("#include <") || has("#include \"") {
        return "c";
    }

    // Java
    if has("public class") || has("public static void main") {
        return "java";
    }

    // Go
    if has("func ") && has("package ") {
        return "go";
    }

    // Python (broad keywords, so tested after the specific languages)
    if has("def ") || has("import ") || has("from ") {
        return "python";
    }

    // JavaScript
    if has("function ") || has("const ") || has("let ") {
        return "javascript";
    }

    // Default: no language specified
    ""
}
|
||
|
||
/// Emit a formula (inline or display).
|
||
fn emit_formula(block: &BlockJson) -> String {
|
||
// Distinguish inline vs display mode by checking if the formula
|
||
// contains newlines. Single-line formulas are inline ($...$),
|
||
// multi-line formulas are display ($$\n...\n$$).
|
||
if block.text.contains('\n') {
|
||
// Display mode: multi-line formula
|
||
format!("$$\n{}\n$$\n\n", block.text)
|
||
} else {
|
||
// Inline mode: single-line formula
|
||
format!("${}$", block.text)
|
||
}
|
||
}
|
||
|
||
/// Emit a table block with lookup from tables array.
|
||
fn emit_table_block(block: &BlockJson, tables: &[TableJson]) -> String {
|
||
// Look up the table structure from the tables array
|
||
if let Some(table_idx) = block.table_index {
|
||
if let Some(table) = tables.get(table_idx) {
|
||
emit_table(table)
|
||
} else {
|
||
// Fallback to text if table index is invalid
|
||
format!("| {}\n", block.text)
|
||
}
|
||
} else {
|
||
// Fallback to text if no table index
|
||
format!("| {}\n", block.text)
|
||
}
|
||
}
|
||
|
||
/// Emit a caption block (italic text).
|
||
fn emit_caption(block: &BlockJson) -> String {
|
||
format!("*{}*\n\n", block.text)
|
||
}
|
||
|
||
/// Emit a figure block with alt text placeholder.
|
||
fn emit_figure(block: &BlockJson) -> String {
|
||
// Use block.text as alt text, with placeholder path
|
||
format!("![{}]()\n\n", block.text)
|
||
}
|
||
|
||
/// Emit a header or footer block.
|
||
fn emit_header_footer(block: &BlockJson) -> String {
|
||
format!("{}\n", block.text)
|
||
}
|
||
|
||
/// Emit a watermark block.
|
||
fn emit_watermark(block: &BlockJson) -> String {
|
||
format!("{}\n", block.text)
|
||
}
|
||
|
||
/// Emit a block quote (prefixed lines).
|
||
fn emit_block_quote(block: &BlockJson) -> String {
|
||
// Prefix each line with "> "
|
||
block
|
||
.text
|
||
.lines()
|
||
.map(|line| format!("> {}\n", line))
|
||
.collect()
|
||
}
|
||
|
||
/// Emit a table of contents block.
|
||
fn emit_toc(block: &BlockJson) -> String {
|
||
format!("{}\n", block.text)
|
||
}
|
||
|
||
/// Emit a note or footnote block.
|
||
fn emit_note(block: &BlockJson) -> String {
|
||
format!("{}\n", block.text)
|
||
}
|
||
|
||
/// Emit a reference block.
|
||
fn emit_reference(block: &BlockJson) -> String {
|
||
format!("{}\n", block.text)
|
||
}
|
||
|
||
/// Convert a block to markdown with optional anchor comment.
|
||
///
|
||
/// If `include_anchor` is true, emits an HTML comment before the block content.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `block` - The block to convert
|
||
/// * `tables` - The tables array for looking up table structures by table_index
|
||
/// * `page_index` - Zero-based page index
|
||
/// * `block_index` - Zero-based block index within the page
|
||
/// * `include_anchor` - Whether to include the HTML comment anchor
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A markdown string with optional anchor.
|
||
pub fn block_to_markdown(
|
||
block: &BlockJson,
|
||
tables: &[TableJson],
|
||
page_index: usize,
|
||
block_index: usize,
|
||
include_anchor: bool,
|
||
) -> String {
|
||
block_to_markdown_with_options(
|
||
block,
|
||
tables,
|
||
page_index,
|
||
block_index,
|
||
include_anchor,
|
||
&MarkdownOptions::default(),
|
||
)
|
||
}
|
||
|
||
/// Convert a block to markdown with optional anchor comment and custom options.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `block` - The block to convert
|
||
/// * `tables` - The tables array for looking up table structures by table_index
|
||
/// * `page_index` - Zero-based page index
|
||
/// * `block_index` - Zero-based block index within the page
|
||
/// * `include_anchor` - Whether to include the HTML comment anchor
|
||
/// * `options` - Markdown emission options
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A markdown string with optional anchor.
|
||
pub fn block_to_markdown_with_options(
|
||
block: &BlockJson,
|
||
tables: &[TableJson],
|
||
page_index: usize,
|
||
block_index: usize,
|
||
include_anchor: bool,
|
||
options: &MarkdownOptions,
|
||
) -> String {
|
||
let mut result = String::new();
|
||
|
||
// Add anchor comment if requested
|
||
if include_anchor {
|
||
let anchor = Anchor::new(
|
||
page_index,
|
||
block_index,
|
||
[
|
||
block.bbox[0] as f32,
|
||
block.bbox[1] as f32,
|
||
block.bbox[2] as f32,
|
||
block.bbox[3] as f32,
|
||
],
|
||
block.kind.clone(),
|
||
);
|
||
result.push_str(&anchor.to_comment());
|
||
result.push('\n');
|
||
}
|
||
|
||
// Add block content based on kind using the dispatch table
|
||
result.push_str(&emit_block_kind(block, tables, options));
|
||
|
||
result
|
||
}
|
||
|
||
/// Convert all blocks from a page to markdown with optional anchors.
|
||
///
|
||
/// If `include_anchor` is true, each block is preceded by an HTML comment.
|
||
/// If `include_page_break` is true, adds a horizontal rule between pages.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `blocks` - The blocks to convert
|
||
/// * `tables` - The tables array for looking up table structures
|
||
/// * `page_index` - Zero-based page index
|
||
/// * `include_anchor` - Whether to include HTML comment anchors
|
||
/// * `include_page_break` - Whether to add a page break separator
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A markdown string with all blocks from the page.
|
||
/// Convert all blocks from a page to markdown with optional anchors.
|
||
///
|
||
/// If `include_anchor` is true, each block is preceded by an HTML comment.
|
||
/// If `include_page_break` is true, adds a horizontal rule between pages.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `blocks` - The blocks to convert
|
||
/// * `tables` - The tables array for looking up table structures
|
||
/// * `page_index` - Zero-based page index
|
||
/// * `include_anchor` - Whether to include HTML comment anchors
|
||
/// * `include_page_break` - Whether to add a page break separator
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A markdown string with all blocks from the page.
|
||
pub fn page_to_markdown(
|
||
blocks: &[BlockJson],
|
||
tables: &[TableJson],
|
||
page_index: usize,
|
||
include_anchor: bool,
|
||
include_page_break: bool,
|
||
) -> String {
|
||
let options = MarkdownOptions {
|
||
include_page_breaks: include_page_break,
|
||
..Default::default()
|
||
};
|
||
page_to_markdown_with_options(blocks, tables, page_index, include_anchor, &options)
|
||
}
|
||
|
||
/// Convert all blocks from a page to markdown with full options control.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `blocks` - The blocks to convert
|
||
/// * `tables` - The tables array for looking up table structures
|
||
/// * `page_index` - Zero-based page index
|
||
/// * `include_anchor` - Whether to include HTML comment anchors
|
||
/// * `options` - Markdown emission options
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A markdown string with all blocks from the page.
|
||
pub fn page_to_markdown_with_options(
|
||
blocks: &[BlockJson],
|
||
tables: &[TableJson],
|
||
page_index: usize,
|
||
include_anchor: bool,
|
||
options: &MarkdownOptions,
|
||
) -> String {
|
||
let mut result = String::new();
|
||
let mut i = 0;
|
||
|
||
while i < blocks.len() {
|
||
let block = &blocks[i];
|
||
|
||
// Check if this is a list item and if there are consecutive list items
|
||
if block.kind == "list" || block.kind == "list_item" {
|
||
// Find the end of the consecutive list sequence
|
||
let mut list_end = i + 1;
|
||
while list_end < blocks.len()
|
||
&& (blocks[list_end].kind == "list" || blocks[list_end].kind == "list_item")
|
||
{
|
||
list_end += 1;
|
||
}
|
||
|
||
// Emit the entire list sequence as a group
|
||
let list_blocks = &blocks[i..list_end];
|
||
let list_md = emit_list_blocks(list_blocks);
|
||
result.push_str(&list_md);
|
||
result.push('\n');
|
||
|
||
i = list_end;
|
||
} else {
|
||
// Non-list block - emit individually
|
||
let md = block_to_markdown_with_options(
|
||
block,
|
||
tables,
|
||
page_index,
|
||
i,
|
||
include_anchor,
|
||
options,
|
||
);
|
||
result.push_str(&md);
|
||
result.push('\n');
|
||
i += 1;
|
||
}
|
||
}
|
||
|
||
// Add page break if requested and this isn't the last page
|
||
if options.include_page_breaks {
|
||
result.push_str("\n---\n\n");
|
||
}
|
||
|
||
result
|
||
}
|
||
|
||
/// Emit spans with inline link support.
|
||
///
|
||
/// This function processes spans and emits them as markdown, with spans that
|
||
/// are part of link annotations emitted as inline links `[anchor text](URL)`
|
||
/// instead of plain styled text.
|
||
///
|
||
/// This implements Phase 6.5.5b: inline-link emission from Phase 7.6 link annotations.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `spans` - The spans to emit
|
||
/// * `page_links` - Link annotations for this page (from Phase 7.6)
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A markdown string with spans emitted, including inline links where applicable.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use pdftract_core::markdown::spans_to_markdown_with_links;
|
||
/// use pdftract_core::schema::SpanJson;
|
||
///
|
||
/// let spans = vec![
|
||
/// SpanJson { text: "Click ".to_string(), ..Default::default() },
|
||
/// SpanJson { text: "here".to_string(), ..Default::default() },
|
||
/// SpanJson { text: " for more".to_string(), ..Default::default() },
|
||
/// ];
|
||
///
|
||
/// // If "here" is part of a link, it will be emitted as [here](https://example.com)
|
||
/// let md = spans_to_markdown_with_links(&spans, &[]);
|
||
/// ```
|
||
pub fn spans_to_markdown_with_links(spans: &[SpanJson], page_links: &[crate::schema::LinkJson]) -> String {
|
||
use crate::output::markdown::links;
|
||
|
||
if page_links.is_empty() {
|
||
// No links - emit spans normally with inline styling
|
||
return spans.iter().map(span_to_markdown).collect::<String>();
|
||
}
|
||
|
||
// Process links to find which spans are covered
|
||
let link_data = links::emit_page_links_from_json(spans, page_links);
|
||
|
||
// Build a map of span index -> link markdown (if part of a link)
|
||
let mut span_to_link: std::collections::HashMap<usize, String> = std::collections::HashMap::new();
|
||
for (span_indices, link_markdown) in &link_data {
|
||
for &idx in span_indices {
|
||
span_to_link.insert(idx, link_markdown.clone());
|
||
}
|
||
}
|
||
|
||
// Emit spans: if a span is part of a link, use the link markdown; otherwise use normal styling
|
||
let mut result = String::new();
|
||
for (idx, span) in spans.iter().enumerate() {
|
||
if let Some(link_md) = span_to_link.get(&idx) {
|
||
// This span is part of a link - emit the link markdown
|
||
// The link markdown from emit_page_links_from_json already includes the anchor text
|
||
// and URL, but we need to preserve any inline styling that might be on the spans
|
||
result.push_str(link_md);
|
||
} else {
|
||
// Not part of a link - emit normal styled span
|
||
result.push_str(&span_to_markdown(span));
|
||
}
|
||
}
|
||
|
||
result
|
||
}
|
||
|
||
/// Emit a block's text with inline link support.
|
||
///
|
||
/// This function emits a block's text content, replacing portions that correspond
|
||
/// to link annotations with inline markdown links. This is useful for paragraphs
|
||
/// and other text blocks that may contain hyperlinks.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `block` - The block to emit
|
||
/// * `spans` - All spans on the page (for link detection)
|
||
/// * `page_links` - Link annotations for this page (from Phase 7.6)
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A markdown string with the block's text, including inline links where applicable.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use pdftract_core::markdown::block_to_markdown_with_links;
|
||
/// use pdftract_core::schema::{BlockJson, SpanJson};
|
||
///
|
||
/// let block = BlockJson {
|
||
/// kind: "paragraph".to_string(),
|
||
/// text: "See our website for details.".to_string(),
|
||
/// // ... other fields
|
||
/// };
|
||
///
|
||
/// let md = block_to_markdown_with_links(&block, &spans, &links);
|
||
/// // Result might be: "See our [website](https://example.com) for details."
|
||
/// ```
|
||
pub fn block_to_markdown_with_links(
|
||
block: &BlockJson,
|
||
spans: &[SpanJson],
|
||
page_links: &[crate::schema::LinkJson],
|
||
) -> String {
|
||
if page_links.is_empty() {
|
||
// No links - return the block text as-is (paragraph emission will wrap it)
|
||
return block.text.clone();
|
||
}
|
||
|
||
use crate::output::markdown::links;
|
||
|
||
// Find which spans belong to this block
|
||
let block_span_indices: Vec<usize> = block.spans.iter().filter_map(|&idx| {
|
||
if idx < spans.len() { Some(idx) } else { None }
|
||
}).collect();
|
||
|
||
if block_span_indices.is_empty() {
|
||
// No spans for this block - return text as-is
|
||
return block.text.clone();
|
||
}
|
||
|
||
// Filter links to only those that intersect this block's spans
|
||
let block_links: Vec<&crate::schema::LinkJson> = page_links
|
||
.iter()
|
||
.filter(|link| {
|
||
// Check if any of this link's spans are in this block
|
||
let matched_spans = links::find_spans_in_link_json(spans, link);
|
||
matched_spans.iter().any(|idx| block.spans.contains(idx))
|
||
})
|
||
.collect();
|
||
|
||
if block_links.is_empty() {
|
||
// No links for this block - return text as-is
|
||
return block.text.clone();
|
||
}
|
||
|
||
// Emit the spans for this block with link support
|
||
let block_spans: Vec<SpanJson> = block_span_indices
|
||
.iter()
|
||
.filter_map(|&idx| spans.get(idx).cloned())
|
||
.collect();
|
||
|
||
let block_links_refs: Vec<crate::schema::LinkJson> = block_links
|
||
.iter()
|
||
.map(|&link| link.clone())
|
||
.collect();
|
||
|
||
spans_to_markdown_with_links(&block_spans, &block_links_refs)
|
||
}
|
||
|
||
/// Emit all blocks from a page with inline link support.
|
||
///
|
||
/// This is a variant of `page_to_markdown_with_options` that also processes
|
||
/// link annotations and emits inline markdown links where applicable.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `blocks` - The blocks to convert
|
||
/// * `spans` - All spans on the page (for link detection)
|
||
/// * `tables` - The tables array for looking up table structures
|
||
/// * `page_links` - Link annotations for this page (from Phase 7.6)
|
||
/// * `page_index` - Zero-based page index
|
||
/// * `include_anchor` - Whether to include HTML comment anchors
|
||
/// * `options` - Markdown emission options
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A markdown string with all blocks from the page, including inline links.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use pdftract_core::markdown::page_to_markdown_with_links;
|
||
///
|
||
/// let md = page_to_markdown_with_links(
|
||
/// &blocks,
|
||
/// &spans,
|
||
/// &tables,
|
||
/// &links,
|
||
/// 0,
|
||
/// true,
|
||
/// &MarkdownOptions::default(),
|
||
/// );
|
||
/// ```
|
||
pub fn page_to_markdown_with_links(
|
||
blocks: &[BlockJson],
|
||
spans: &[SpanJson],
|
||
tables: &[TableJson],
|
||
page_links: &[crate::schema::LinkJson],
|
||
page_index: usize,
|
||
include_anchor: bool,
|
||
options: &MarkdownOptions,
|
||
) -> String {
|
||
let mut result = String::new();
|
||
let mut i = 0;
|
||
|
||
while i < blocks.len() {
|
||
let block = &blocks[i];
|
||
|
||
// Add anchor comment if requested
|
||
if include_anchor {
|
||
let anchor = Anchor::new(
|
||
page_index,
|
||
i,
|
||
[
|
||
block.bbox[0] as f32,
|
||
block.bbox[1] as f32,
|
||
block.bbox[2] as f32,
|
||
block.bbox[3] as f32,
|
||
],
|
||
block.kind.clone(),
|
||
);
|
||
result.push_str(&anchor.to_comment());
|
||
result.push('\n');
|
||
}
|
||
|
||
// Check if this is a list item and if there are consecutive list items
|
||
if block.kind == "list" || block.kind == "list_item" {
|
||
// Find the end of the consecutive list sequence
|
||
let mut list_end = i + 1;
|
||
while list_end < blocks.len()
|
||
&& (blocks[list_end].kind == "list" || blocks[list_end].kind == "list_item")
|
||
{
|
||
list_end += 1;
|
||
}
|
||
|
||
// Emit the entire list sequence as a group
|
||
let list_blocks = &blocks[i..list_end];
|
||
|
||
// For list items with links, emit each item with link support
|
||
for list_block in list_blocks {
|
||
let block_with_links = block_to_markdown_with_links(list_block, spans, page_links);
|
||
if !block_with_links.is_empty() {
|
||
// Detect if numbered or bulleted
|
||
let is_numbered = block_with_links
|
||
.chars()
|
||
.next()
|
||
.map(|c| c.is_ascii_digit())
|
||
.unwrap_or(false);
|
||
|
||
if is_numbered {
|
||
result.push_str(&block_with_links);
|
||
result.push('\n');
|
||
} else {
|
||
result.push_str("* ");
|
||
result.push_str(&block_with_links);
|
||
result.push('\n');
|
||
}
|
||
}
|
||
}
|
||
|
||
result.push('\n');
|
||
i = list_end;
|
||
} else {
|
||
// Non-list block - emit individually
|
||
let block_with_links = block_to_markdown_with_links(block, spans, page_links);
|
||
|
||
// For non-list blocks, use the existing block emission logic
|
||
// but replace the text content with link-aware content
|
||
let kind_result = if block_with_links != block.text {
|
||
// Links were detected - emit the link-aware version
|
||
emit_block_kind_with_text(block, tables, options, &block_with_links)
|
||
} else {
|
||
// No links - use standard emission
|
||
emit_block_kind(block, tables, options)
|
||
};
|
||
|
||
result.push_str(&kind_result);
|
||
i += 1;
|
||
}
|
||
}
|
||
|
||
// Add page break if requested and this isn't the last page
|
||
if options.include_page_breaks {
|
||
result.push_str("\n---\n\n");
|
||
}
|
||
|
||
result
|
||
}
|
||
|
||
/// Emit a block kind with custom text content.
|
||
///
|
||
/// This is a helper for `page_to_markdown_with_links` that allows overriding
|
||
/// the block's text with link-aware content while preserving the block's
|
||
/// formatting and structure.
|
||
fn emit_block_kind_with_text(
|
||
block: &BlockJson,
|
||
tables: &[TableJson],
|
||
options: &MarkdownOptions,
|
||
custom_text: &str,
|
||
) -> String {
|
||
match block.kind.as_str() {
|
||
"heading" => {
|
||
let level = block.level.unwrap_or(1).clamp(1, 6);
|
||
let prefix = "#".repeat(level as usize);
|
||
format!("{} {}\n\n", prefix, custom_text)
|
||
}
|
||
|
||
"paragraph" => {
|
||
let text = custom_text.replace('\n', " \n");
|
||
format!("{}\n\n", text)
|
||
}
|
||
|
||
"list" | "list_item" => {
|
||
// Try to detect if this is a numbered list
|
||
let is_numbered = custom_text
|
||
.chars()
|
||
.next()
|
||
.map(|c| c.is_ascii_digit())
|
||
.unwrap_or(false);
|
||
|
||
if is_numbered {
|
||
format!("{}\n", custom_text)
|
||
} else {
|
||
format!("* {}\n", custom_text)
|
||
}
|
||
}
|
||
|
||
"caption" => format!("*{}\n\n", custom_text),
|
||
|
||
_ => {
|
||
// For other block kinds, fall back to standard emission
|
||
emit_block_kind(block, tables, options)
|
||
}
|
||
}
|
||
}
|
||
|
||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::schema::BlockJson;

    /// Anchor comment shared by several parsing tests.
    const HEADING_ANCHOR: &str =
        "<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->";

    /// Build a block with the given kind, text and bbox; every other field is
    /// left empty / `None`.
    fn make_test_block(kind: &str, text: &str, bbox: [f64; 4]) -> BlockJson {
        BlockJson {
            kind: kind.to_string(),
            text: text.to_string(),
            bbox,
            level: None,
            table_index: None,
            spans: vec![],
            receipt: None,
        }
    }

    /// Level-2 "Chapter 1" heading used by the anchor and round-trip tests.
    fn chapter_heading() -> BlockJson {
        BlockJson {
            level: Some(2),
            ..make_test_block("heading", "Chapter 1", [72.0, 640.5, 540.0, 672.0])
        }
    }

    /// A heading followed by a paragraph, as used by the page-level tests.
    fn title_and_text() -> Vec<BlockJson> {
        vec![
            make_test_block("heading", "Title", [72.0, 640.5, 540.0, 672.0]),
            make_test_block("paragraph", "Text", [72.0, 600.0, 540.0, 630.0]),
        ]
    }

    /// Render a single block as page 0 / block 0 without an anchor.
    fn render(block: &BlockJson) -> String {
        block_to_markdown(block, &[], 0, 0, false)
    }

    #[test]
    fn test_anchor_to_comment() {
        let rendered =
            Anchor::new(3, 12, [72.0, 640.5, 540.0, 672.0], "heading".to_string()).to_comment();
        assert_eq!(
            rendered,
            "<!-- pdftract: page=3 block=12 bbox=[72.0,640.5,540.0,672.0] kind=heading -->"
        );
    }

    #[test]
    fn test_anchor_to_comment_round_bbox() {
        let rendered = Anchor::new(
            0,
            0,
            [72.123, 640.567, 540.999, 672.111],
            "paragraph".to_string(),
        )
        .to_comment();

        // Coordinates are expected with one decimal place.
        assert_eq!(
            rendered,
            "<!-- pdftract: page=0 block=0 bbox=[72.1,640.6,541.0,672.1] kind=paragraph -->"
        );
    }

    #[test]
    fn test_parse_anchors_single() {
        let md = format!("{}\n# Title", HEADING_ANCHOR);

        let found = parse_anchors(&md);
        assert_eq!(found.len(), 1);

        let anchor = &found[0];
        assert_eq!(anchor.page, 0);
        assert_eq!(anchor.block, 0);
        assert_eq!(anchor.bbox, [72.0, 640.5, 540.0, 672.0]);
        assert_eq!(anchor.kind, "heading");
    }

    #[test]
    fn test_parse_anchors_multiple() {
        let md = [
            HEADING_ANCHOR,
            "# Title",
            "",
            "<!-- pdftract: page=0 block=1 bbox=[72.0,600.0,540.0,630.0] kind=paragraph -->",
            "Some text.",
        ]
        .join("\n");

        let found = parse_anchors(&md);
        assert_eq!(found.len(), 2);
        assert_eq!(found[0].page, 0);
        assert_eq!(found[0].block, 0);
        assert_eq!(found[1].page, 0);
        assert_eq!(found[1].block, 1);
    }

    #[test]
    fn test_parse_anchors_invalid_format_skipped() {
        let md = [
            HEADING_ANCHOR,
            "# Title",
            "",
            "<!-- malformed anchor -->",
            "Some text.",
        ]
        .join("\n");

        // Only the well-formed anchor is returned.
        assert_eq!(parse_anchors(&md).len(), 1);
    }

    #[test]
    fn test_parse_anchors_whitespace_tolerant() {
        let found = parse_anchors(HEADING_ANCHOR);
        assert_eq!(found.len(), 1);
    }

    #[test]
    fn test_parse_bbox() {
        let expected = Some([72.0, 640.5, 540.0, 672.0]);

        assert_eq!(parse_bbox("72.0,640.5,540.0,672.0"), expected);
        assert_eq!(parse_bbox("0,0,100,100"), Some([0.0, 0.0, 100.0, 100.0]));
        // Spaces after the commas are accepted.
        assert_eq!(parse_bbox("72.0, 640.5, 540.0, 672.0"), expected);

        // Rejected inputs: non-numeric, too few values, too many values.
        assert_eq!(parse_bbox("invalid"), None);
        assert_eq!(parse_bbox("1,2,3"), None);
        assert_eq!(parse_bbox("1,2,3,4,5"), None);
    }

    #[test]
    fn test_block_to_markdown_heading_with_anchor() {
        let md = block_to_markdown(&chapter_heading(), &[], 0, 0, true);

        assert!(md.contains(
            "<!-- pdftract: page=0 block=0 bbox=[72.0,640.5,540.0,672.0] kind=heading -->"
        ));
        assert!(md.contains("## Chapter 1"));
    }

    #[test]
    fn test_block_to_markdown_paragraph_without_anchor() {
        let md = render(&make_test_block(
            "paragraph",
            "Some text.",
            [72.0, 600.0, 540.0, 630.0],
        ));

        assert!(!md.contains("<!-- pdftract:"));
        assert!(md.contains("Some text."));
    }

    #[test]
    fn test_block_to_markdown_list() {
        let md = render(&make_test_block("list", "Item 1", [72.0, 500.0, 540.0, 520.0]));
        assert!(md.contains("* Item 1"));
    }

    #[test]
    fn test_block_to_markdown_table() {
        let md = render(&make_test_block(
            "table",
            "Cell data",
            [72.0, 400.0, 540.0, 450.0],
        ));
        assert!(md.contains("| Cell data"));
    }

    #[test]
    fn test_block_to_markdown_figure() {
        let md = render(&make_test_block(
            "figure",
            "Alt text",
            [72.0, 300.0, 540.0, 350.0],
        ));

        assert!(md.contains("![]()"));
        assert!(md.contains("Alt text"));
    }

    #[test]
    fn test_page_to_markdown_with_page_break() {
        let md = page_to_markdown(&title_and_text(), &[], 0, false, true);
        assert!(md.contains("---"));
    }

    #[test]
    fn test_page_to_markdown_without_page_break() {
        let md = page_to_markdown(&title_and_text(), &[], 0, false, false);
        assert!(!md.contains("---"));
    }

    #[test]
    fn test_page_to_markdown_with_anchors() {
        let md = page_to_markdown(&title_and_text(), &[], 0, true, false);

        // One anchor per block.
        assert_eq!(md.matches("<!-- pdftract:").count(), 2);
    }

    #[test]
    fn test_roundtrip_extract_and_parse() {
        let blocks = vec![chapter_heading()];

        let md = page_to_markdown(&blocks, &[], 3, true, false);
        let found = parse_anchors(&md);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].page, 3);
        assert_eq!(found[0].block, 0);
        assert_eq!(found[0].kind, "heading");
    }

    #[test]
    fn test_block_to_markdown_paragraph_soft_line_break() {
        // Internal newlines of a paragraph are emitted as soft breaks.
        let md = render(&make_test_block(
            "paragraph",
            "Line 1\nLine 2\nLine 3",
            [72.0, 600.0, 540.0, 630.0],
        ));

        assert!(md.contains("Line 1 \n"));
        assert!(md.contains("Line 2 \n"));
        // The paragraph itself ends with a blank line.
        assert!(md.contains("Line 3\n\n"));
    }

    #[test]
    fn test_block_to_markdown_paragraph_no_soft_break() {
        // No internal newlines: the text is followed only by the blank line.
        let md = render(&make_test_block(
            "paragraph",
            "Single line text",
            [72.0, 600.0, 540.0, 630.0],
        ));
        assert_eq!(md, "Single line text\n\n");
    }

    #[test]
    fn test_block_to_markdown_formula_inline() {
        // Short formula: expected inline as $E=mc^2$.
        let md = render(&make_test_block(
            "formula",
            "E=mc^2",
            [72.0, 600.0, 540.0, 630.0],
        ));
        assert_eq!(md, "$E=mc^2$");
    }

    #[test]
    fn test_block_to_markdown_formula_display() {
        // This formula is expected in display form: $$\n...\n$$
        let md = render(&make_test_block(
            "formula",
            "\\int_{-\\infty}^{\\infty} e^{-x^2} dx = \\sqrt{\\pi}",
            [72.0, 600.0, 540.0, 630.0],
        ));

        assert!(md.contains("$$\n"));
        assert!(md.contains("\n$$\n"));
    }

    #[test]
    fn test_block_to_markdown_list_numbered_preserves_numbering() {
        // The source numbering ("7.") must survive emission.
        let md = render(&make_test_block(
            "list",
            "7. Seventh item",
            [72.0, 500.0, 540.0, 520.0],
        ));
        assert!(md.contains("7. Seventh item"));
    }

    #[test]
    fn test_block_to_markdown_list_bulleted() {
        // Un-numbered items receive the "* " marker.
        let md = render(&make_test_block(
            "list",
            "Item text",
            [72.0, 500.0, 540.0, 520.0],
        ));
        assert!(md.contains("* Item text"));
    }

    #[test]
    fn test_emit_list_blocks_nested_sublist() {
        // Nesting is driven by the left edge (x0) of each item:
        //   level 0: x0 = 72.0
        //   level 1: x0 = 90.0  (indented by 18 points)
        //   level 2: x0 = 108.0 (indented by 36 points)
        let items = vec![
            make_test_block("list", "Item 1", [72.0, 500.0, 540.0, 520.0]),
            make_test_block("list", "Item 2", [72.0, 480.0, 540.0, 500.0]),
            make_test_block("list", "Nested 1", [90.0, 460.0, 540.0, 480.0]),
            make_test_block("list", "Nested 2", [90.0, 440.0, 540.0, 460.0]),
            make_test_block("list", "Deep nested", [108.0, 420.0, 540.0, 440.0]),
            make_test_block("list", "Item 3", [72.0, 400.0, 540.0, 420.0]),
        ];

        let md = emit_list_blocks(&items);

        // Top-level items
        for expected in ["* Item 1", "* Item 2", "* Item 3"] {
            assert!(md.contains(expected));
        }

        // First nesting level
        assert!(md.contains(" * Nested 1"));
        assert!(md.contains(" * Nested 2"));

        // Second nesting level
        assert!(md.contains(" * Deep nested"));
    }

    #[test]
    fn test_emit_list_blocks_single_item() {
        // A one-element list is still emitted as a bullet.
        let items = vec![make_test_block(
            "list",
            "Single item",
            [72.0, 500.0, 540.0, 520.0],
        )];
        assert!(emit_list_blocks(&items).contains("* Single item"));
    }

    #[test]
    fn test_emit_list_blocks_empty() {
        // No items, no output.
        let items: Vec<BlockJson> = Vec::new();
        assert_eq!(emit_list_blocks(&items), "");
    }

    #[test]
    fn test_page_to_markdown_with_nested_list() {
        // A nested list surrounded by other block kinds on the same page.
        let blocks = vec![
            make_test_block("heading", "Title", [72.0, 700.0, 540.0, 720.0]),
            make_test_block("list", "Item 1", [72.0, 650.0, 540.0, 670.0]),
            make_test_block("list", "Nested 1", [90.0, 630.0, 540.0, 650.0]),
            make_test_block("list", "Item 2", [72.0, 610.0, 540.0, 630.0]),
            make_test_block("paragraph", "Text after", [72.0, 580.0, 540.0, 600.0]),
        ];

        let md = page_to_markdown(&blocks, &[], 0, false, false);

        // Heading
        assert!(md.contains("# Title"));

        // List structure
        assert!(md.contains("* Item 1"));
        assert!(md.contains(" * Nested 1"));
        assert!(md.contains("* Item 2"));

        // Paragraph following the list
        assert!(md.contains("Text after"));
    }
}
|
||
|
||
/// Generate a markdown footer section for form fields.
|
||
///
|
||
/// This function creates a formatted markdown table listing all form fields
|
||
/// with their names, types, and current values. Only emits the section when
|
||
/// form_fields count > 0.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `form_fields` - The form fields to include in the footer
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A markdown string with a form fields table, or an empty string if no fields.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```ignore
|
||
/// use pdftract_core::markdown::form_fields_to_markdown;
|
||
/// use pdftract_core::schema::{FormFieldJson, FormFieldTypeJson, FormFieldValueJson};
|
||
///
|
||
/// let fields = vec![
|
||
/// FormFieldJson {
|
||
/// name: "employee_name".to_string(),
|
||
/// field_type: FormFieldTypeJson::Text,
|
||
/// value: FormFieldValueJson::Text(Some("John Doe".to_string())),
|
||
/// // ... other fields
|
||
/// },
|
||
/// ];
|
||
///
|
||
/// let md = form_fields_to_markdown(&fields);
|
||
/// assert!(md.contains("## Form Fields"));
|
||
/// assert!(md.contains("employee_name"));
|
||
/// ```
|
||
pub fn form_fields_to_markdown(form_fields: &[FormFieldJson]) -> String {
|
||
if form_fields.is_empty() {
|
||
return String::new();
|
||
}
|
||
|
||
let mut result = String::from("\n\n## Form Fields\n\n");
|
||
result.push_str("| Name | Type | Value |\n");
|
||
result.push_str("|------|------|-------|\n");
|
||
|
||
for field in form_fields {
|
||
let type_str = match field.field_type {
|
||
FormFieldTypeJson::Text => "text",
|
||
FormFieldTypeJson::Button => "button",
|
||
FormFieldTypeJson::Choice => "choice",
|
||
FormFieldTypeJson::Signature => "signature",
|
||
};
|
||
|
||
let value_str = format_value_json(&field.value);
|
||
|
||
result.push_str(&format!(
|
||
"| {} | {} | {} |\n",
|
||
field.name, type_str, value_str
|
||
));
|
||
}
|
||
|
||
result
|
||
}
|
||
|
||
/// Format a FormFieldValueJson as a string for markdown display.
|
||
fn format_value_json(value: &FormFieldValueJson) -> String {
|
||
match value {
|
||
FormFieldValueJson::Text(None) => "*empty*".to_string(),
|
||
FormFieldValueJson::Text(Some(s)) => escape_pipe(s),
|
||
FormFieldValueJson::Button(b) => b.to_string(),
|
||
FormFieldValueJson::Choice(ChoiceValueJson::Single(s)) => escape_pipe(s),
|
||
FormFieldValueJson::Choice(ChoiceValueJson::Multiple(vec)) => {
|
||
let values: Vec<String> = vec.iter().map(|s| escape_pipe(s.as_str())).collect();
|
||
values.join(", ")
|
||
}
|
||
FormFieldValueJson::Signature(None) => "*unsigned*".to_string(),
|
||
FormFieldValueJson::Signature(Some(n)) => format!("ref #{}", n),
|
||
}
|
||
}
|
||
|
||
/// Generate a markdown footer section for article threads.
|
||
///
|
||
/// This function creates a formatted markdown section listing all article
|
||
/// threads with their metadata and page ranges. Only emits the section
|
||
/// when threads count > 0.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `threads` - The threads to include in the footer
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A markdown string with an article threads section, or an empty string if no threads.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```ignore
|
||
/// use pdftract_core::markdown::threads_to_markdown;
|
||
/// use pdftract_core::schema::{ThreadJson, BeadJson};
|
||
///
|
||
/// let threads = vec![
|
||
/// ThreadJson {
|
||
/// title: Some("Main Article".to_string()),
|
||
/// author: Some("John Doe".to_string()),
|
||
/// subject: None,
|
||
/// keywords: None,
|
||
/// beads: vec![
|
||
/// BeadJson { page_index: 0, rect: [100.0, 200.0, 300.0, 220.0] },
|
||
/// BeadJson { page_index: 1, rect: [100.0, 500.0, 300.0, 520.0] },
|
||
/// ],
|
||
/// },
|
||
/// ];
|
||
///
|
||
/// let md = threads_to_markdown(&threads);
|
||
/// assert!(md.contains("## Article Threads"));
|
||
/// assert!(md.contains("1. *Main Article* (John Doe) - pages 0-1 (2 beads)"));
|
||
/// ```
|
||
pub fn threads_to_markdown(threads: &[ThreadJson]) -> String {
|
||
if threads.is_empty() {
|
||
return String::new();
|
||
}
|
||
|
||
let mut result = String::from("\n\n## Article Threads\n\n");
|
||
|
||
for (i, thread) in threads.iter().enumerate() {
|
||
// Build the thread title line
|
||
let title = thread.title.as_deref().unwrap_or("(Untitled)");
|
||
let author = thread.author.as_deref().unwrap_or("");
|
||
|
||
// Collapse contiguous page ranges
|
||
let page_ranges = collapse_page_ranges(&thread.beads);
|
||
|
||
// Format: "1. *Title* (Author) - pages 0-1, 3-5 (3 beads)"
|
||
result.push_str(&format!(
|
||
"{}. *{}* ({}) - {} ({} beads)\n",
|
||
i + 1,
|
||
title,
|
||
author,
|
||
page_ranges,
|
||
thread.beads.len()
|
||
));
|
||
}
|
||
|
||
result
|
||
}
|
||
|
||
/// Collapse contiguous page indices into ranges.
|
||
///
|
||
/// Given a list of beads with page indices, this function collapses
|
||
/// contiguous sequences into ranges for more compact display.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `beads` - The beads to collapse into page ranges
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A string like "pages 0-1, 3-5" representing the page ranges.
|
||
fn collapse_page_ranges(beads: &[BeadJson]) -> String {
|
||
if beads.is_empty() {
|
||
return "no pages".to_string();
|
||
}
|
||
|
||
let mut ranges = Vec::new();
|
||
let mut start = beads[0].page_index;
|
||
let mut end = beads[0].page_index;
|
||
|
||
for bead in beads.iter().skip(1) {
|
||
// Skip duplicate page indices
|
||
if bead.page_index == end {
|
||
continue;
|
||
}
|
||
|
||
if bead.page_index == end + 1 {
|
||
// Contiguous, extend the range
|
||
end = bead.page_index;
|
||
} else {
|
||
// Gap, emit the current range
|
||
ranges.push((start, end));
|
||
start = bead.page_index;
|
||
end = bead.page_index;
|
||
}
|
||
}
|
||
|
||
// Emit the last range
|
||
ranges.push((start, end));
|
||
|
||
// Format ranges
|
||
let parts: Vec<String> = ranges
|
||
.iter()
|
||
.map(|&(s, e)| {
|
||
if s == e {
|
||
format!("{}", s)
|
||
} else {
|
||
format!("{}-{}", s, e)
|
||
}
|
||
})
|
||
.collect();
|
||
|
||
format!("pages {}", parts.join(", "))
|
||
}
|
||
|
||
/// Convert a span to markdown with inline styling based on flags.
|
||
///
|
||
/// This function implements Phase 6.5 inline span styling, translating
|
||
/// span flag bitmask values to Markdown inline syntax.
|
||
///
|
||
/// # Styling Rules
|
||
///
|
||
/// - Bold (bit 0) → `**text**`
|
||
/// - Italic (bit 1) → `*text*`
|
||
/// - Bold + Italic → `***text***`
|
||
/// - Subscript (bit 3) → `<sub>text</sub>`
|
||
/// - Superscript (bit 4) → `<sup>text</sup>`
|
||
/// - Smallcaps (bit 2) → `<span style="font-variant: small-caps">text</span>`
|
||
/// - Color-only differences: no styling emitted
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `span` - The span to convert
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A markdown string with appropriate inline styling applied.
|
||
///
|
||
/// # Examples
|
||
///
|
||
/// ```
|
||
/// use pdftract_core::schema::SpanJson;
|
||
/// use pdftract_core::markdown::span_to_markdown;
|
||
///
|
||
/// let mut span = SpanJson {
|
||
/// text: "important text".to_string(),
|
||
/// flags: vec!["bold".to_string()],
|
||
/// ..Default::default()
|
||
/// };
|
||
///
|
||
/// let md = span_to_markdown(&span);
|
||
/// assert_eq!(md, "**important text**");
|
||
/// ```
|
||
///
|
||
/// ```
|
||
/// // H₂O example: subscript
|
||
/// let mut span = SpanJson {
|
||
/// text: "2".to_string(),
|
||
/// flags: vec!["subscript".to_string()],
|
||
/// ..Default::default()
|
||
/// };
|
||
///
|
||
/// let md = span_to_markdown(&span);
|
||
/// assert_eq!(md, "<sub>2</sub>");
|
||
/// ```
|
||
///
|
||
/// ```
|
||
/// // 4th example: superscript
|
||
/// let mut span = SpanJson {
|
||
/// text: "th".to_string(),
|
||
/// flags: vec!["superscript".to_string()],
|
||
/// ..Default::default()
|
||
/// };
|
||
///
|
||
/// let md = span_to_markdown(&span);
|
||
/// assert_eq!(md, "<sup>th</sup>");
|
||
/// ```
|
||
///
|
||
/// ```
|
||
/// // Bold + italic combination
|
||
/// let mut span = SpanJson {
|
||
/// text: "emphasized".to_string(),
|
||
/// flags: vec!["bold".to_string(), "italic".to_string()],
|
||
/// ..Default::default()
|
||
/// };
|
||
///
|
||
/// let md = span_to_markdown(&span);
|
||
/// assert_eq!(md, "***emphasized***");
|
||
/// ```
|
||
///
|
||
/// ```
|
||
/// // Special character escaping
|
||
/// let mut span = SpanJson {
|
||
/// text: "1*2".to_string(),
|
||
/// flags: vec![],
|
||
/// ..Default::default()
|
||
/// };
|
||
///
|
||
/// let md = span_to_markdown(&span);
|
||
/// assert_eq!(md, "1\\*2");
|
||
/// ```
|
||
pub fn span_to_markdown(span: &SpanJson) -> String {
|
||
// Get the text content
|
||
let text = &span.text;
|
||
|
||
// Skip whitespace-only spans (no point styling whitespace)
|
||
if text.trim().is_empty() {
|
||
return text.clone();
|
||
}
|
||
|
||
// Check for each flag in the flags Vec<String>
|
||
let has_bold = span.flags.contains(&"bold".to_string());
|
||
let has_italic = span.flags.contains(&"italic".to_string());
|
||
let has_subscript = span.flags.contains(&"subscript".to_string());
|
||
let has_superscript = span.flags.contains(&"superscript".to_string());
|
||
let has_smallcaps = span.flags.contains(&"smallcaps".to_string());
|
||
|
||
// Color-only differences: emit no styling (just return escaped text)
|
||
// This is checked by seeing if none of the style flags are present
|
||
let has_any_style = has_bold || has_italic || has_subscript || has_superscript || has_smallcaps;
|
||
|
||
if !has_any_style {
|
||
// No styling flags, just escape and return
|
||
return escape_markdown_inline(text);
|
||
}
|
||
|
||
// Escape the text first (before wrapping in styling)
|
||
let escaped = escape_markdown_inline(text);
|
||
|
||
// Build the styled output
|
||
let mut result = String::new();
|
||
|
||
// Combination order:
|
||
// - Bold + italic wrapper (***text***) goes outermost
|
||
// - Smallcaps span wraps script tags (<span><sup>text</sup></span>)
|
||
// - Script tags go inside smallcaps (if both present)
|
||
// This order: **<span><sup>text</sup></span>** or **<sub>text</sub>** (if no smallcaps)
|
||
|
||
// Bold + italic wrapper (***text***)
|
||
if has_bold && has_italic {
|
||
result.push_str("***");
|
||
} else if has_bold {
|
||
result.push_str("**");
|
||
} else if has_italic {
|
||
result.push_str("*");
|
||
}
|
||
|
||
// Smallcaps wrapper (outer relative to scripts)
|
||
if has_smallcaps {
|
||
result.push_str("<span style=\"font-variant: small-caps\">");
|
||
}
|
||
|
||
// Script tags (sub/sup) go inside smallcaps
|
||
if has_subscript {
|
||
result.push_str("<sub>");
|
||
} else if has_superscript {
|
||
result.push_str("<sup>");
|
||
}
|
||
|
||
// Add the escaped text
|
||
result.push_str(&escaped);
|
||
|
||
// Close wrappers in reverse order
|
||
if has_subscript {
|
||
result.push_str("</sub>");
|
||
} else if has_superscript {
|
||
result.push_str("</sup>");
|
||
}
|
||
|
||
if has_smallcaps {
|
||
result.push_str("</span>");
|
||
}
|
||
|
||
if has_bold && has_italic {
|
||
result.push_str("***");
|
||
} else if has_bold {
|
||
result.push_str("**");
|
||
} else if has_italic {
|
||
result.push_str("*");
|
||
}
|
||
|
||
result
|
||
}
|
||
|
||
/// Backslash-escape characters that Markdown would otherwise treat as inline
/// syntax.
///
/// # Characters Escaped
///
/// Each of the following is emitted with a leading backslash:
/// - `\` (backslash)
/// - `` ` `` (code span)
/// - `*` and `_` (emphasis)
/// - `[` and `]` (link text)
/// - `(` and `)` (link destination)
/// - `#` (ATX heading)
/// - `!` (image)
/// - `+` (list marker)
/// - `<` and `>` (HTML tag / auto-link, blockquote)
///
/// # Characters NOT Escaped
///
/// Everything else is copied through unchanged, including `-`, `.` and `=`.
///
/// # Arguments
///
/// * `s` - The string to escape
///
/// # Returns
///
/// A new string with the characters above escaped.
fn escape_markdown_inline(s: &str) -> String {
    const SPECIAL: [char; 13] = [
        '\\', '`', '*', '_', '[', ']', '(', ')', '#', '!', '+', '<', '>',
    ];

    s.chars()
        .fold(String::with_capacity(s.len() * 2), |mut out, ch| {
            if SPECIAL.contains(&ch) {
                out.push('\\');
            }
            out.push(ch);
            out
        })
}
|
||
|
||
/// Emit a table as Markdown (GFM pipe table) or HTML fallback.
|
||
///
|
||
/// This function implements Phase 6.5 table emission:
|
||
/// - Simple tables (all 1x1 cells, no nested content) → GFM pipe table
|
||
/// - Complex tables (merged cells/colspan/rowspan/nested blocks) → HTML `<table>`
|
||
/// - Caption → italic line below the table
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `table` - The table to emit
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A Markdown string with the table in the appropriate format.
|
||
///
|
||
/// # Examples
|
||
///
|
||
/// ```
|
||
/// use pdftract_core::markdown::emit_table;
|
||
/// use pdftract_core::schema::{TableJson, RowJson, CellJson};
|
||
///
|
||
/// let table = TableJson {
|
||
/// id: "table_0".to_string(),
|
||
/// bbox: [50.0, 100.0, 400.0, 300.0],
|
||
/// rows: vec![
|
||
/// RowJson {
|
||
/// bbox: [50.0, 250.0, 400.0, 300.0],
|
||
/// cells: vec![
|
||
/// CellJson {
|
||
/// bbox: [50.0, 250.0, 200.0, 300.0],
|
||
/// text: "Header 1".to_string(),
|
||
/// spans: vec![],
|
||
/// row: 0,
|
||
/// col: 0,
|
||
/// rowspan: 1,
|
||
/// colspan: 1,
|
||
/// is_header_row: true,
|
||
/// },
|
||
/// CellJson {
|
||
/// bbox: [200.0, 250.0, 400.0, 300.0],
|
||
/// text: "Header 2".to_string(),
|
||
/// spans: vec![],
|
||
/// row: 0,
|
||
/// col: 1,
|
||
/// rowspan: 1,
|
||
/// colspan: 1,
|
||
/// is_header_row: true,
|
||
/// },
|
||
/// ],
|
||
/// is_header: true,
|
||
/// },
|
||
/// RowJson {
|
||
/// bbox: [50.0, 100.0, 400.0, 250.0],
|
||
/// cells: vec![
|
||
/// CellJson {
|
||
/// bbox: [50.0, 100.0, 200.0, 250.0],
|
||
/// text: "Data 1".to_string(),
|
||
/// spans: vec![],
|
||
/// row: 1,
|
||
/// col: 0,
|
||
/// rowspan: 1,
|
||
/// colspan: 1,
|
||
/// is_header_row: false,
|
||
/// },
|
||
/// CellJson {
|
||
/// bbox: [200.0, 100.0, 400.0, 250.0],
|
||
/// text: "Data 2".to_string(),
|
||
/// spans: vec![],
|
||
/// row: 1,
|
||
/// col: 1,
|
||
/// rowspan: 1,
|
||
/// colspan: 1,
|
||
/// is_header_row: false,
|
||
/// },
|
||
/// ],
|
||
/// is_header: false,
|
||
/// },
|
||
/// ],
|
||
/// header_rows: 1,
|
||
/// detection_method: "line_based".to_string(),
|
||
/// continued: false,
|
||
/// continued_from_prev: false,
|
||
/// page_index: 0,
|
||
/// };
|
||
///
|
||
/// let md = emit_table(&table);
|
||
/// assert!(md.contains("| Header 1 | Header 2 |"));
|
||
/// assert!(md.contains("| Data 1 | Data 2 |"));
|
||
/// ```
|
||
pub fn emit_table(table: &TableJson) -> String {
|
||
// Check if table is simple (all cells 1x1) or complex (merged cells)
|
||
let is_simple = table.rows.iter().all(|row| {
|
||
row.cells
|
||
.iter()
|
||
.all(|cell| cell.rowspan == 1 && cell.colspan == 1)
|
||
});
|
||
|
||
if is_simple {
|
||
emit_gfm_table(table)
|
||
} else {
|
||
emit_html_table(table)
|
||
}
|
||
}
|
||
|
||
/// Emit a table as GitHub-Flavored Markdown pipe table.
|
||
///
|
||
/// GFM pipe tables require:
|
||
/// - All cells have rowspan=1 and colspan=1 (no merged cells)
|
||
/// - Header row (first row if is_header=true, otherwise synthesized)
|
||
/// - Separator row with `| --- | --- |` syntax
|
||
/// - Body rows with `| val | val |` syntax
|
||
fn emit_gfm_table(table: &TableJson) -> String {
|
||
let mut result = String::new();
|
||
|
||
// Find the maximum number of columns across all rows
|
||
let max_cols = table
|
||
.rows
|
||
.iter()
|
||
.map(|row| row.cells.len())
|
||
.max()
|
||
.unwrap_or(0);
|
||
|
||
if max_cols == 0 {
|
||
return String::new();
|
||
}
|
||
|
||
// Emit header row (use first row if it exists)
|
||
if let Some(first_row) = table.rows.first() {
|
||
result.push_str("| ");
|
||
for (i, cell) in first_row.cells.iter().enumerate() {
|
||
if i > 0 {
|
||
result.push_str(" | ");
|
||
}
|
||
result.push_str(&escape_pipe(&cell.text));
|
||
}
|
||
// Pad missing columns
|
||
for i in first_row.cells.len()..max_cols {
|
||
if i > 0 || !first_row.cells.is_empty() {
|
||
result.push_str(" | ");
|
||
}
|
||
result.push_str(" ");
|
||
}
|
||
result.push_str(" |\n");
|
||
} else {
|
||
// Empty header row for table with no rows
|
||
for i in 0..max_cols {
|
||
if i > 0 {
|
||
result.push_str(" | ");
|
||
}
|
||
result.push_str(" ");
|
||
}
|
||
result.push_str(" |\n");
|
||
}
|
||
|
||
// Emit separator row
|
||
result.push_str("|");
|
||
for _ in 0..max_cols {
|
||
result.push_str(" --- |");
|
||
}
|
||
result.push('\n');
|
||
|
||
// Emit body rows (skip first row if it was header)
|
||
let body_start = if table.rows.first().map_or(false, |r| r.is_header) {
|
||
1
|
||
} else {
|
||
0
|
||
};
|
||
|
||
for row in table.rows.iter().skip(body_start) {
|
||
result.push_str("| ");
|
||
for (i, cell) in row.cells.iter().enumerate() {
|
||
if i > 0 {
|
||
result.push_str(" | ");
|
||
}
|
||
result.push_str(&escape_pipe(&cell.text));
|
||
}
|
||
// Pad missing columns
|
||
for i in row.cells.len()..max_cols {
|
||
if i > 0 || !row.cells.is_empty() {
|
||
result.push_str(" | ");
|
||
}
|
||
result.push_str(" ");
|
||
}
|
||
result.push_str(" |\n");
|
||
}
|
||
|
||
result
|
||
}
|
||
|
||
/// Emit a table as inline HTML `<table>`.
|
||
///
|
||
/// HTML fallback is used when:
|
||
/// - Any cell has colspan > 1 or rowspan > 1 (merged cells)
|
||
/// - Nested blocks are present (future enhancement)
|
||
pub fn emit_html_table(table: &TableJson) -> String {
|
||
let mut result = String::from("<table>\n");
|
||
|
||
for row in &table.rows {
|
||
result.push_str(" <tr>\n");
|
||
|
||
for cell in &row.cells {
|
||
let tag = if cell.is_header_row || row.is_header {
|
||
"th"
|
||
} else {
|
||
"td"
|
||
};
|
||
|
||
result.push_str(" <");
|
||
result.push_str(tag);
|
||
|
||
// Add colspan if > 1
|
||
if cell.colspan > 1 {
|
||
result.push_str(&format!(" colspan=\"{}\"", cell.colspan));
|
||
}
|
||
|
||
// Add rowspan if > 1
|
||
if cell.rowspan > 1 {
|
||
result.push_str(&format!(" rowspan=\"{}\"", cell.rowspan));
|
||
}
|
||
|
||
result.push_str(">");
|
||
result.push_str(&escape_pipe(&cell.text));
|
||
result.push_str("</");
|
||
result.push_str(tag);
|
||
result.push_str(">\n");
|
||
}
|
||
|
||
result.push_str(" </tr>\n");
|
||
}
|
||
|
||
result.push_str("</table>\n");
|
||
result
|
||
}
|
||
|
||
/// Escape cell text for use inside a markdown table cell.
///
/// - `|` becomes `\|` so it is not read as a column separator in GFM pipe
///   tables.
/// - Line breaks (`\n`, `\r\n`, or a bare `\r`) become `<br>`; a raw line
///   ending would terminate the table row. HTML inside Markdown table cells
///   is allowed and widely supported.
/// - `<` and `>` become `&lt;` and `&gt;` so cell text cannot open or close
///   an HTML tag.
fn escape_pipe(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len() * 2);
    let mut chars = s.chars().peekable();

    while let Some(ch) = chars.next() {
        match ch {
            '|' => escaped.push_str("\\|"),
            '\r' => {
                // Treat CRLF as a single line break.
                if chars.peek() == Some(&'\n') {
                    chars.next();
                }
                escaped.push_str("<br>");
            }
            '\n' => escaped.push_str("<br>"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }

    escaped
}

#[cfg(test)]
mod span_tests {
    use super::*;

    // Builds a `BeadJson` on the given page. The one-argument form uses a
    // placeholder rectangle for tests that only look at page indices.
    macro_rules! bead {
        ($page:expr) => {
            bead!($page, [0.0, 0.0, 100.0, 20.0])
        };
        ($page:expr, $rect:expr) => {
            BeadJson {
                page_index: $page,
                rect: $rect,
            }
        };
    }

    /// Build a span carrying `text` and the given style `flags`.
    /// Every other field gets a neutral default suitable for testing.
    fn make_test_span(text: &str, flags: &[&str]) -> SpanJson {
        SpanJson {
            text: text.to_owned(),
            bbox: [0.0, 0.0, 100.0, 20.0],
            font: "Helvetica".to_owned(),
            size: 12.0,
            color: None,
            rendering_mode: None,
            confidence: None,
            confidence_source: None,
            lang: None,
            flags: flags.iter().map(|flag| (*flag).to_owned()).collect(),
            receipt: None,
            column: None,
        }
    }

    /// Assert that a span with `text` and `flags` renders as `expected`.
    #[track_caller]
    fn assert_span_markdown(text: &str, flags: &[&str], expected: &str) {
        assert_eq!(span_to_markdown(&make_test_span(text, flags)), expected);
    }

    #[test]
    fn test_span_to_markdown_bold() {
        assert_span_markdown("important", &["bold"], "**important**");
    }

    #[test]
    fn test_span_to_markdown_italic() {
        assert_span_markdown("emphasized", &["italic"], "*emphasized*");
    }

    #[test]
    fn test_span_to_markdown_bold_italic() {
        // Critical case: bold + italic is emitted as ***text***
        assert_span_markdown(
            "very important",
            &["bold", "italic"],
            "***very important***",
        );
    }

    #[test]
    fn test_span_to_markdown_subscript() {
        assert_span_markdown("2", &["subscript"], "<sub>2</sub>");
    }

    #[test]
    fn test_span_to_markdown_superscript() {
        assert_span_markdown("th", &["superscript"], "<sup>th</sup>");
    }

    #[test]
    fn test_span_to_markdown_smallcaps() {
        assert_span_markdown(
            "CAPS",
            &["smallcaps"],
            "<span style=\"font-variant: small-caps\">CAPS</span>",
        );
    }

    #[test]
    fn test_span_to_markdown_no_flags() {
        // No style flags (e.g. a color-only difference): text passes through
        assert_span_markdown("plain text", &[], "plain text");
    }

    #[test]
    fn test_span_to_markdown_special_chars_escaped() {
        // Markdown-significant characters in the text are escaped: "1*2" -> "1\*2"
        assert_span_markdown("1*2", &[], "1\\*2");
    }

    #[test]
    fn test_span_to_markdown_bold_subscript_combination() {
        // Bold + subscript: **<sub>text</sub>**
        assert_span_markdown("ion", &["bold", "subscript"], "**<sub>ion</sub>**");
    }

    #[test]
    fn test_span_to_markdown_bold_superscript_combination() {
        // Bold + superscript: **<sup>text</sup>**
        assert_span_markdown("st", &["bold", "superscript"], "**<sup>st</sup>**");
    }

    #[test]
    fn test_span_to_markdown_italic_subscript_combination() {
        // Italic + subscript: *<sub>text</sub>*
        assert_span_markdown("ion", &["italic", "subscript"], "*<sub>ion</sub>*");
    }

    #[test]
    fn test_span_to_markdown_all_flags() {
        // All flags together: bold + italic + smallcaps + superscript
        assert_span_markdown(
            "X",
            &["bold", "italic", "smallcaps", "superscript"],
            "***<span style=\"font-variant: small-caps\"><sup>X</sup></span>***",
        );
    }

    #[test]
    fn test_span_to_markdown_whitespace_only() {
        // Whitespace-only spans are emitted without any wrapping
        assert_span_markdown("   ", &["bold"], "   ");
    }

    #[test]
    fn test_span_to_markdown_empty_string() {
        assert_span_markdown("", &["bold"], "");
    }

    #[test]
    fn test_escape_markdown_inline_asterisk() {
        assert_eq!(escape_markdown_inline("1*2"), "1\\*2");
    }

    #[test]
    fn test_escape_markdown_inline_underscore() {
        assert_eq!(escape_markdown_inline("hello_world"), "hello\\_world");
    }

    #[test]
    fn test_escape_markdown_inline_backtick() {
        assert_eq!(escape_markdown_inline("code`here"), "code\\`here");
    }

    #[test]
    fn test_escape_markdown_inline_brackets() {
        assert_eq!(escape_markdown_inline("[link]"), "\\[link\\]");
    }

    #[test]
    fn test_escape_markdown_inline_multiple_special() {
        assert_eq!(escape_markdown_inline("*_[link]*"), "\\*\\_\\[link\\]\\*");
    }

    #[test]
    fn test_escape_markdown_inline_backslash() {
        assert_eq!(escape_markdown_inline("C:\\path"), "C:\\\\path");
    }

    #[test]
    fn test_escape_markdown_inline_hash() {
        assert_eq!(escape_markdown_inline("#heading"), "\\#heading");
    }

    #[test]
    fn test_escape_markdown_inline_plus_minus() {
        assert_eq!(escape_markdown_inline("+/-"), "\\+/-");
    }

    #[test]
    fn test_escape_markdown_inline_less_greater() {
        // < and > are backslash-escaped (HTML tags / auto-links)
        assert_eq!(escape_markdown_inline("<tag>"), "\\<tag\\>");
    }

    #[test]
    fn test_span_to_markdown_bold_with_asterisk_in_text() {
        // Asterisks inside bold text are escaped
        assert_span_markdown("2*2=4", &["bold"], "**2\\*2=4**");
    }

    #[test]
    fn test_span_to_markdown_subscript_with_special_chars() {
        assert_span_markdown("2+", &["subscript"], "<sub>2\\+</sub>");
    }

    #[test]
    fn test_span_to_markdown_superscript_with_special_chars() {
        assert_span_markdown("n-1", &["superscript"], "<sup>n-1</sup>");
    }

    #[test]
    fn test_span_to_markdown_smallcaps_with_special_chars() {
        // Underscore inside small caps is escaped
        assert_span_markdown(
            "HELLO_WORLD",
            &["smallcaps"],
            "<span style=\"font-variant: small-caps\">HELLO\\_WORLD</span>",
        );
    }

    #[test]
    fn test_threads_to_markdown_empty() {
        // No threads: nothing is emitted
        let threads: Vec<ThreadJson> = Vec::new();
        assert_eq!(threads_to_markdown(&threads), "");
    }

    #[test]
    fn test_threads_to_markdown_single_thread() {
        // One thread whose beads span two pages
        let threads = vec![ThreadJson {
            title: Some(String::from("Main Article")),
            author: Some(String::from("John Doe")),
            subject: None,
            keywords: None,
            beads: vec![
                bead!(0, [100.0, 200.0, 300.0, 220.0]),
                bead!(1, [100.0, 500.0, 300.0, 520.0]),
            ],
        }];

        let md = threads_to_markdown(&threads);
        assert!(md.contains("## Article Threads"));
        assert!(md.contains("1. *Main Article* (John Doe) - pages 0-1 (2 beads)"));
    }

    #[test]
    fn test_threads_to_markdown_multiple_threads() {
        // Two threads with different metadata populated
        let threads = vec![
            ThreadJson {
                title: Some(String::from("Introduction")),
                author: Some(String::from("Jane Smith")),
                subject: None,
                keywords: None,
                beads: vec![bead!(0, [50.0, 100.0, 250.0, 120.0])],
            },
            ThreadJson {
                title: Some(String::from("Main Content")),
                author: None,
                subject: Some(String::from("Chapter 1")),
                keywords: Some(String::from("test, example")),
                beads: vec![
                    bead!(1, [50.0, 400.0, 250.0, 420.0]),
                    bead!(2, [50.0, 100.0, 250.0, 120.0]),
                ],
            },
        ];

        let md = threads_to_markdown(&threads);
        assert!(md.contains("1. *Introduction* (Jane Smith) - pages 0 (1 beads)"));
        assert!(md.contains("2. *Main Content* () - pages 1-2 (2 beads)"));
    }

    #[test]
    fn test_threads_to_markdown_untitled_thread() {
        // A thread without a title gets the "(Untitled)" placeholder
        let threads = vec![ThreadJson {
            title: None,
            author: None,
            subject: None,
            keywords: None,
            beads: vec![bead!(5, [100.0, 200.0, 300.0, 220.0])],
        }];

        let md = threads_to_markdown(&threads);
        assert!(md.contains("1. *(Untitled)* () - pages 5 (1 beads)"));
    }

    #[test]
    fn test_collapse_page_ranges_single_page() {
        let beads = vec![bead!(3)];
        assert_eq!(collapse_page_ranges(&beads), "pages 3");
    }

    #[test]
    fn test_collapse_page_ranges_contiguous() {
        let beads = vec![bead!(0), bead!(1), bead!(2)];
        assert_eq!(collapse_page_ranges(&beads), "pages 0-2");
    }

    #[test]
    fn test_collapse_page_ranges_gaps() {
        let beads = vec![bead!(0), bead!(2), bead!(5)];
        assert_eq!(collapse_page_ranges(&beads), "pages 0, 2, 5");
    }

    #[test]
    fn test_collapse_page_ranges_mixed() {
        // Two contiguous runs separated by a gap, with page 4 repeated
        let beads = vec![bead!(0), bead!(1), bead!(3), bead!(4), bead!(4)];
        assert_eq!(collapse_page_ranges(&beads), "pages 0-1, 3-4");
    }

    // Table emission tests (Phase 6.5)

    /// Build a table cell with a placeholder bounding box and no spans.
    fn make_test_cell(
        text: &str,
        row: usize,
        col: usize,
        rowspan: u32,
        colspan: u32,
        is_header_row: bool,
    ) -> crate::schema::CellJson {
        crate::schema::CellJson {
            bbox: [0.0, 0.0, 100.0, 20.0],
            text: text.to_owned(),
            spans: Vec::new(),
            row,
            col,
            rowspan,
            colspan,
            is_header_row,
        }
    }

    /// Build a table row with a placeholder bounding box.
    fn make_test_row(
        cells: Vec<crate::schema::CellJson>,
        is_header: bool,
    ) -> crate::schema::RowJson {
        crate::schema::RowJson {
            bbox: [0.0, 0.0, 100.0, 20.0],
            cells,
            is_header,
        }
    }

    /// Build a row of unmerged (1x1) cells from `texts`, numbering columns
    /// from 0 and giving each cell the row's header flag.
    fn make_plain_row(row: usize, texts: &[&str], is_header: bool) -> crate::schema::RowJson {
        let cells = texts
            .iter()
            .enumerate()
            .map(|(col, text)| make_test_cell(text, row, col, 1, 1, is_header))
            .collect();
        make_test_row(cells, is_header)
    }

    /// Table with the metadata shared by every table test; each test supplies
    /// its own `bbox`, `rows`, and `header_rows` via struct-update syntax.
    fn blank_test_table() -> TableJson {
        TableJson {
            id: "table_0".to_owned(),
            bbox: [0.0, 0.0, 0.0, 0.0],
            rows: Vec::new(),
            header_rows: 0,
            detection_method: "line_based".to_owned(),
            continued: false,
            continued_from_prev: false,
            page_index: 0,
        }
    }

    #[test]
    fn test_emit_table_simple_3x3() {
        // Unmerged 3x3 table: GFM pipe format
        let table = TableJson {
            bbox: [0.0, 0.0, 300.0, 200.0],
            rows: vec![
                make_plain_row(0, &["H1", "H2", "H3"], true),
                make_plain_row(1, &["D1", "D2", "D3"], false),
                make_plain_row(2, &["D4", "D5", "D6"], false),
            ],
            header_rows: 1,
            ..blank_test_table()
        };

        let md = emit_table(&table);
        assert!(md.contains("| H1 | H2 | H3 |"));
        assert!(md.contains("| --- | --- | --- |"));
        assert!(md.contains("| D1 | D2 | D3 |"));
        assert!(md.contains("| D4 | D5 | D6 |"));
        // No HTML table markup in the GFM path
        assert!(!md.contains("<table>"));
        assert!(!md.contains("<tr>"));
        assert!(!md.contains("<td>"));
    }

    #[test]
    fn test_emit_table_merged_cells_html_fallback() {
        // Critical case: a merged cell forces the inline <table> fallback
        let table = TableJson {
            bbox: [0.0, 0.0, 300.0, 200.0],
            rows: vec![
                make_test_row(
                    vec![
                        make_test_cell("Merged Header", 0, 0, 1, 2, true), // colspan=2
                        make_test_cell("H2", 0, 1, 1, 1, true),
                    ],
                    true,
                ),
                make_plain_row(1, &["D1", "D2"], false),
            ],
            header_rows: 1,
            ..blank_test_table()
        };

        let md = emit_table(&table);
        // HTML table markup is present
        assert!(md.contains("<table>"));
        assert!(md.contains("</table>"));
        assert!(md.contains("<tr>"));
        assert!(md.contains("</tr>"));
        // The span is carried as an attribute
        assert!(md.contains("colspan=\"2\""));
        // No GFM pipe syntax
        assert!(!md.contains("| --- |"));
    }

    #[test]
    fn test_emit_table_rowspan_html_fallback() {
        // A rowspan also forces the HTML fallback
        let table = TableJson {
            bbox: [0.0, 0.0, 300.0, 200.0],
            rows: vec![
                make_test_row(
                    vec![
                        make_test_cell("Rowspan", 0, 0, 2, 1, true), // rowspan=2
                        make_test_cell("H2", 0, 1, 1, 1, true),
                    ],
                    true,
                ),
                // The first cell of this row sits below the rowspan cell
                make_plain_row(1, &["D1", "D2"], false),
            ],
            header_rows: 1,
            ..blank_test_table()
        };

        let md = emit_table(&table);
        assert!(md.contains("rowspan=\"2\""));
        // No GFM pipe syntax
        assert!(!md.contains("| --- |"));
    }

    #[test]
    fn test_escape_pipe() {
        // Pipes are escaped as \|
        assert_eq!(escape_pipe("A|B"), "A\\|B");
        assert_eq!(escape_pipe("|||"), "\\|\\|\\|");
        assert_eq!(escape_pipe("test"), "test");
    }

    #[test]
    fn test_escape_pipe_newline_to_br() {
        // Newlines are rendered as <br>
        assert_eq!(escape_pipe("line1\nline2"), "line1<br>line2");
        assert_eq!(escape_pipe("a\nb\nc"), "a<br>b<br>c");
    }

    #[test]
    fn test_escape_pipe_carriage_return_to_br() {
        // CRLF and bare CR each count as one line break
        assert_eq!(escape_pipe("a\r\nb"), "a<br>b");
        assert_eq!(escape_pipe("a\rb"), "a<br>b");
    }

    #[test]
    fn test_escape_pipe_html_entities() {
        // < and > are escaped as HTML entities
        assert_eq!(escape_pipe("<tag>"), "&lt;tag&gt;");
        assert_eq!(escape_pipe("a<b"), "a&lt;b");
    }

    #[test]
    fn test_emit_table_with_pipe_in_cell() {
        // A pipe inside a cell is escaped as \|
        let table = TableJson {
            bbox: [0.0, 0.0, 200.0, 100.0],
            rows: vec![
                make_plain_row(0, &["A|B", "Normal"], true),
                make_plain_row(1, &["Data", "Value"], false),
            ],
            header_rows: 1,
            ..blank_test_table()
        };

        let md = emit_table(&table);
        assert!(md.contains("A\\|B"));
        // The table structure is still intact
        assert!(md.contains("| --- | --- |"));
    }

    #[test]
    fn test_emit_table_with_newline_in_cell() {
        // A newline inside a cell is rendered as <br>
        let table = TableJson {
            bbox: [0.0, 0.0, 200.0, 100.0],
            rows: vec![
                make_plain_row(0, &["Line1\nLine2", "Normal"], true),
                make_plain_row(1, &["Data", "Value"], false),
            ],
            header_rows: 1,
            ..blank_test_table()
        };

        let md = emit_table(&table);
        assert!(md.contains("Line1<br>Line2"));
    }

    #[test]
    fn test_emit_table_empty() {
        // A table without rows produces no output
        let table = TableJson {
            bbox: [0.0, 0.0, 100.0, 50.0],
            rows: vec![],
            header_rows: 0,
            ..blank_test_table()
        };

        assert_eq!(emit_table(&table), "");
    }

    #[test]
    fn test_emit_table_single_row() {
        // A lone header row: header line and separator, no body lines
        let table = TableJson {
            bbox: [0.0, 0.0, 200.0, 50.0],
            rows: vec![make_plain_row(0, &["H1", "H2"], true)],
            header_rows: 1,
            ..blank_test_table()
        };

        let md = emit_table(&table);
        assert!(md.contains("| H1 | H2 |"));
        assert!(md.contains("| --- | --- |"));
        let parts: Vec<&str> = md.lines().collect();
        assert_eq!(parts.len(), 2); // Header row + separator
    }

    #[test]
    fn test_emit_table_no_header() {
        // No row is flagged as a header: the output must still be a valid
        // GFM table (separator present) and contain both data rows.
        let table = TableJson {
            bbox: [0.0, 0.0, 200.0, 100.0],
            rows: vec![
                make_plain_row(0, &["D1", "D2"], false),
                make_plain_row(1, &["D3", "D4"], false),
            ],
            header_rows: 0,
            ..blank_test_table()
        };

        let md = emit_table(&table);
        assert!(md.contains("| D1 | D2 |"));
        assert!(md.contains("| --- | --- |"));
        assert!(md.contains("| D3 | D4 |"));
    }

    #[test]
    fn test_emit_html_table_header_cells() {
        // Header cells use <th>, data cells use <td>
        let table = TableJson {
            bbox: [0.0, 0.0, 200.0, 100.0],
            rows: vec![
                make_plain_row(0, &["Header1", "Header2"], true), // is_header_row=true
                make_plain_row(1, &["Data1", "Data2"], false),    // is_header_row=false
            ],
            header_rows: 1,
            ..blank_test_table()
        };

        let md = emit_html_table(&table);
        assert!(md.contains("<th>Header1</th>"));
        assert!(md.contains("<th>Header2</th>"));
        assert!(md.contains("<td>Data1</td>"));
        assert!(md.contains("<td>Data2</td>"));
    }

    #[test]
    fn test_emit_html_table_row_and_colspan() {
        // One cell carrying both a rowspan and a colspan
        let table = TableJson {
            bbox: [0.0, 0.0, 300.0, 200.0],
            rows: vec![
                make_test_row(
                    vec![
                        make_test_cell("Both", 0, 0, 2, 2, true), // rowspan=2, colspan=2
                        make_test_cell("H2", 0, 1, 1, 1, true),
                    ],
                    true,
                ),
                make_plain_row(1, &["D1", "D2"], false),
            ],
            header_rows: 1,
            ..blank_test_table()
        };

        let md = emit_html_table(&table);
        assert!(md.contains("colspan=\"2\""));
        assert!(md.contains("rowspan=\"2\""));
    }

    #[test]
    fn test_emit_gfm_table_variable_width() {
        // Rows with differing cell counts are padded to the widest row
        let table = TableJson {
            bbox: [0.0, 0.0, 300.0, 200.0],
            rows: vec![
                make_plain_row(0, &["H1", "H2", "H3"], true),
                // Third cell is missing and must be padded
                make_plain_row(1, &["D1", "D2"], false),
            ],
            header_rows: 1,
            ..blank_test_table()
        };

        let md = emit_table(&table);
        assert!(md.contains("| H1 | H2 | H3 |"));
        assert!(md.contains("| --- | --- | --- |"));
        let lines: Vec<&str> = md.lines().collect();
        let body_line = lines.get(2).unwrap();
        assert_eq!(body_line.matches('|').count(), 4); // 4 pipes = 3 cells
    }
}
|