diff --git a/crates/pdftract-core/src/markdown.rs b/crates/pdftract-core/src/markdown.rs index 161caae..973ab9c 100644 --- a/crates/pdftract-core/src/markdown.rs +++ b/crates/pdftract-core/src/markdown.rs @@ -37,7 +37,7 @@ use crate::schema::{ BeadJson, BlockJson, ChoiceValueJson, FormFieldJson, FormFieldTypeJson, FormFieldValueJson, - SpanJson, ThreadJson, + SpanJson, TableJson, ThreadJson, }; use regex::Regex; use serde::{Deserialize, Serialize}; @@ -203,6 +203,7 @@ fn parse_bbox(s: &str) -> Option<[f32; 4]> { /// # Arguments /// /// * `block` - The block to convert +/// * `tables` - The tables array for looking up table structures by table_index /// * `page_index` - Zero-based page index /// * `block_index` - Zero-based block index within the page /// * `include_anchor` - Whether to include the HTML comment anchor @@ -212,6 +213,7 @@ fn parse_bbox(s: &str) -> Option<[f32; 4]> { /// A markdown string with optional anchor. pub fn block_to_markdown( block: &BlockJson, + tables: &[TableJson], page_index: usize, block_index: usize, include_anchor: bool, @@ -249,11 +251,26 @@ pub fn block_to_markdown( result.push_str(&format!("* {}\n", block.text)); } "table" => { - result.push_str(&format!("| {}\n", block.text)); + // Look up the table structure from the tables array + if let Some(table_idx) = block.table_index { + if let Some(table) = tables.get(table_idx) { + result.push_str(&emit_table(table)); + } else { + // Fallback to text if table index is invalid + result.push_str(&format!("| {}\n", block.text)); + } + } else { + // Fallback to text if no table index + result.push_str(&format!("| {}\n", block.text)); + } } "figure" => { result.push_str(&format!("![]()\n\n{}\n", block.text)); } + "caption" => { + // Captions are emitted as italic text + result.push_str(&format!("*{}*\n", block.text)); + } _ => { result.push_str(&format!("{}\n", block.text)); } @@ -270,6 +287,7 @@ pub fn block_to_markdown( /// # Arguments /// /// * `blocks` - The blocks to convert +/// * `tables` - The tables array for looking up table structures /// * `page_index` - Zero-based page index /// * `include_anchor` - Whether to include HTML comment anchors /// * `include_page_break` - Whether to add a page break separator @@ -279,6 +297,7 @@ pub fn block_to_markdown( /// A markdown string with all blocks from the page. pub fn page_to_markdown( blocks: &[BlockJson], + tables: &[TableJson], page_index: usize, include_anchor: bool, include_page_break: bool, @@ -286,7 +305,7 @@ pub fn page_to_markdown( let mut result = String::new(); for (block_index, block) in blocks.iter().enumerate() { - let md = block_to_markdown(block, page_index, block_index, include_anchor); + let md = block_to_markdown(block, tables, page_index, block_index, include_anchor); result.push_str(&md); result.push('\n'); } @@ -419,7 +438,7 @@ Some text."#; receipt: None, }; - let md = block_to_markdown(&block, 0, 0, true); + let md = block_to_markdown(&block, &[], 0, 0, true); assert!(md.contains( "" )); @@ -429,7 +448,7 @@ Some text."#; #[test] fn test_block_to_markdown_paragraph_without_anchor() { let block = make_test_block("paragraph", "Some text.", [72.0, 600.0, 540.0, 630.0]); - let md = block_to_markdown(&block, 0, 0, false); + let md = block_to_markdown(&block, &[], 0, 0, false); assert!(!md.contains("