diff --git a/crates/pdftract-core/src/lib.rs b/crates/pdftract-core/src/lib.rs
index 060f86b..86a0bc3 100644
--- a/crates/pdftract-core/src/lib.rs
+++ b/crates/pdftract-core/src/lib.rs
@@ -64,7 +64,8 @@ pub use forms::{
combine, walk_acroform_fields, AcroFieldType, AcroFormField, ChoiceValue, FormFieldValue,
};
pub use markdown::{
- block_to_markdown, form_fields_to_markdown, page_to_markdown, parse_anchors, Anchor,
+ block_to_markdown, form_fields_to_markdown, page_to_markdown, parse_anchors, span_to_markdown,
+ Anchor,
};
pub use options::{ExtractionOptions, ReceiptsMode};
pub use page_class::{page_type_string, PageClass, PageClassification};
diff --git a/crates/pdftract-core/src/markdown.rs b/crates/pdftract-core/src/markdown.rs
index 5d6e513..8669ab8 100644
--- a/crates/pdftract-core/src/markdown.rs
+++ b/crates/pdftract-core/src/markdown.rs
@@ -36,7 +36,7 @@
//! ```
use crate::schema::{
- BlockJson, ChoiceValueJson, FormFieldJson, FormFieldTypeJson, FormFieldValueJson,
+ BlockJson, ChoiceValueJson, FormFieldJson, FormFieldTypeJson, FormFieldValueJson, SpanJson,
};
use regex::Regex;
use serde::{Deserialize, Serialize};
@@ -591,3 +591,423 @@ fn format_value_json(value: &FormFieldValueJson) -> String {
fn escape_pipe(s: &str) -> String {
s.replace('|', "\\|")
}
+
+/// Opening tag emitted around small-caps spans.
+///
+/// NOTE(review): confirm this attribute string matches the markup required by the
+/// Phase 6.5 plan; consumers that match on the literal tag depend on it.
+const SMALLCAPS_OPEN: &str = "<span style=\"font-variant: small-caps\">";
+
+/// Closing tag matching [`SMALLCAPS_OPEN`].
+const SMALLCAPS_CLOSE: &str = "</span>";
+
+/// Convert a span to Markdown, applying inline styling based on its flags.
+///
+/// This implements Phase 6.5 inline span styling. Style information is read from
+/// `span.flags`, a list of flag *names* (`"bold"`, `"italic"`, ...); names that are not
+/// listed below are ignored.
+///
+/// # Styling Rules
+///
+/// - `bold` (bit 0) → `**text**`
+/// - `italic` (bit 1) → `*text*`
+/// - `bold` + `italic` → `***text***`
+/// - `smallcaps` (bit 2) → `<span style="font-variant: small-caps">text</span>`
+/// - `subscript` (bit 3) → `<sub>text</sub>`
+/// - `superscript` (bit 4) → `<sup>text</sup>`
+/// - Color-only differences: no styling emitted
+///
+/// Wrappers nest, from outermost to innermost: emphasis (`*` / `**` / `***`), small-caps
+/// span, script tag. If both `subscript` and `superscript` are present, `subscript` wins.
+///
+/// # Whitespace
+///
+/// - Empty and whitespace-only spans are returned unchanged, with no styling.
+/// - Leading and trailing whitespace is emitted *outside* the style wrappers. CommonMark
+///   does not treat `**bold **` as strong emphasis because the closing delimiter run is
+///   preceded by whitespace, so `"bold "` must become `**bold** `.
+///
+/// # Arguments
+///
+/// * `span` - The span to convert
+///
+/// # Returns
+///
+/// The span text with Markdown special characters backslash-escaped and the inline
+/// styling described above applied.
+///
+/// # Examples
+///
+/// ```
+/// use pdftract_core::schema::SpanJson;
+/// use pdftract_core::markdown::span_to_markdown;
+///
+/// let span = SpanJson {
+///     text: "important text".to_string(),
+///     flags: vec!["bold".to_string()],
+///     ..Default::default()
+/// };
+///
+/// let md = span_to_markdown(&span);
+/// assert_eq!(md, "**important text**");
+/// ```
+///
+/// ```
+/// use pdftract_core::schema::SpanJson;
+/// use pdftract_core::markdown::span_to_markdown;
+///
+/// // H₂O example: subscript
+/// let span = SpanJson {
+///     text: "2".to_string(),
+///     flags: vec!["subscript".to_string()],
+///     ..Default::default()
+/// };
+///
+/// let md = span_to_markdown(&span);
+/// assert_eq!(md, "<sub>2</sub>");
+/// ```
+///
+/// ```
+/// use pdftract_core::schema::SpanJson;
+/// use pdftract_core::markdown::span_to_markdown;
+///
+/// // 4th example: superscript
+/// let span = SpanJson {
+///     text: "th".to_string(),
+///     flags: vec!["superscript".to_string()],
+///     ..Default::default()
+/// };
+///
+/// let md = span_to_markdown(&span);
+/// assert_eq!(md, "<sup>th</sup>");
+/// ```
+///
+/// ```
+/// use pdftract_core::schema::SpanJson;
+/// use pdftract_core::markdown::span_to_markdown;
+///
+/// // Bold + italic combination
+/// let span = SpanJson {
+///     text: "emphasized".to_string(),
+///     flags: vec!["bold".to_string(), "italic".to_string()],
+///     ..Default::default()
+/// };
+///
+/// let md = span_to_markdown(&span);
+/// assert_eq!(md, "***emphasized***");
+/// ```
+///
+/// ```
+/// use pdftract_core::schema::SpanJson;
+/// use pdftract_core::markdown::span_to_markdown;
+///
+/// // Special character escaping
+/// let span = SpanJson {
+///     text: "1*2".to_string(),
+///     flags: vec![],
+///     ..Default::default()
+/// };
+///
+/// let md = span_to_markdown(&span);
+/// assert_eq!(md, "1\\*2");
+/// ```
+pub fn span_to_markdown(span: &SpanJson) -> String {
+    let text = span.text.as_str();
+
+    // Separate edge whitespace from the content that actually gets styled.
+    let without_leading = text.trim_start();
+    let core = without_leading.trim_end();
+
+    // Nothing to style: avoid emitting empty formatting such as `** **`.
+    if core.is_empty() {
+        return text.to_owned();
+    }
+
+    let leading = &text[..text.len() - without_leading.len()];
+    let trailing = &without_leading[core.len()..];
+
+    // Compare against the flag names without allocating a `String` per lookup.
+    let has_flag = |name: &str| span.flags.iter().any(|flag| flag == name);
+
+    // Outermost wrapper: Markdown emphasis. The same delimiter opens and closes.
+    let emphasis = match (has_flag("bold"), has_flag("italic")) {
+        (true, true) => "***",
+        (true, false) => "**",
+        (false, true) => "*",
+        (false, false) => "",
+    };
+
+    // Middle wrapper: small-caps span (wraps any script tag).
+    let (smallcaps_open, smallcaps_close) = if has_flag("smallcaps") {
+        (SMALLCAPS_OPEN, SMALLCAPS_CLOSE)
+    } else {
+        ("", "")
+    };
+
+    // Innermost wrapper: script tag. Subscript takes precedence if both flags are set.
+    let (script_open, script_close) = if has_flag("subscript") {
+        ("<sub>", "</sub>")
+    } else if has_flag("superscript") {
+        ("<sup>", "</sup>")
+    } else {
+        ("", "")
+    };
+
+    // Escape only the content; the wrappers themselves must stay live markup.
+    let escaped = escape_markdown_inline(core);
+
+    // Open wrappers outermost-first, then close them in reverse order.
+    [
+        leading,
+        emphasis,
+        smallcaps_open,
+        script_open,
+        escaped.as_str(),
+        script_close,
+        smallcaps_close,
+        emphasis,
+        trailing,
+    ]
+    .concat()
+}
+
+/// Escape special Markdown characters in inline text.
+///
+/// Each character that could otherwise be interpreted as inline Markdown syntax is
+/// prefixed with a backslash so that it renders literally. All other characters are
+/// copied through unchanged.
+///
+/// # Characters Escaped
+///
+/// The following characters are escaped with a backslash:
+/// - `\` (backslash itself - must be escaped to avoid interpretation)
+/// - `` ` `` (code span)
+/// - `*` (emphasis/strong)
+/// - `_` (emphasis)
+/// - `[` (link start)
+/// - `]` (link end)
+/// - `(` (link destination start)
+/// - `)` (link destination end)
+/// - `#` (ATX heading)
+/// - `!` (image)
+/// - `+` (list marker)
+/// - `<` (HTML tag/auto-link)
+/// - `>` (blockquote)
+///
+/// # Characters NOT Escaped
+///
+/// - `-` (hyphen) - only special at start of line for lists/HR
+/// - `.` (period) - only special as part of list marker like "1."
+/// - `=` (equals) - not special in CommonMark
+///
+/// # Arguments
+///
+/// * `s` - The string to escape
+///
+/// # Returns
+///
+/// A new string with the special characters listed above backslash-escaped.
+fn escape_markdown_inline(s: &str) -> String {
+    // Worst case every character needs a backslash, doubling the length.
+    let mut result = String::with_capacity(s.len() * 2);
+
+    for c in s.chars() {
+        match c {
+            '\\' | '`' | '*' | '_' | '[' | ']' | '(' | ')' | '#' | '!' | '+' | '<' | '>' => {
+                result.push('\\');
+                result.push(c);
+            }
+            _ => result.push(c),
+        }
+    }
+
+    result
+}
+
+#[cfg(test)]
+mod span_tests {
+    use super::*;
+
+    /// Helper function to create a test span with the given text and flags.
+    /// All other fields are set to reasonable defaults for testing.
+    fn make_test_span(text: &str, flags: &[&str]) -> SpanJson {
+        SpanJson {
+            text: text.to_string(),
+            bbox: [0.0, 0.0, 100.0, 20.0],
+            font: "Helvetica".to_string(),
+            size: 12.0,
+            color: None,
+            rendering_mode: None,
+            confidence: None,
+            confidence_source: None,
+            lang: None,
+            flags: flags.iter().map(|s| s.to_string()).collect(),
+            receipt: None,
+            column: None,
+        }
+    }
+
+    #[test]
+    fn test_span_to_markdown_bold() {
+        let span = make_test_span("important", &["bold"]);
+        assert_eq!(span_to_markdown(&span), "**important**");
+    }
+
+    #[test]
+    fn test_span_to_markdown_italic() {
+        let span = make_test_span("emphasized", &["italic"]);
+        assert_eq!(span_to_markdown(&span), "*emphasized*");
+    }
+
+    #[test]
+    fn test_span_to_markdown_bold_italic() {
+        // Critical test: bold + italic span emitted as ***text***
+        let span = make_test_span("very important", &["bold", "italic"]);
+        assert_eq!(span_to_markdown(&span), "***very important***");
+    }
+
+    #[test]
+    fn test_span_to_markdown_subscript() {
+        let span = make_test_span("2", &["subscript"]);
+        assert_eq!(span_to_markdown(&span), "<sub>2</sub>");
+    }
+
+    #[test]
+    fn test_span_to_markdown_superscript() {
+        let span = make_test_span("th", &["superscript"]);
+        assert_eq!(span_to_markdown(&span), "<sup>th</sup>");
+    }
+
+    #[test]
+    fn test_span_to_markdown_smallcaps() {
+        let span = make_test_span("CAPS", &["smallcaps"]);
+        assert_eq!(
+            span_to_markdown(&span),
+            "<span style=\"font-variant: small-caps\">CAPS</span>"
+        );
+    }
+
+    #[test]
+    fn test_span_to_markdown_no_flags() {
+        // Color-only difference or no styling: no styling emitted
+        let span = make_test_span("plain text", &[]);
+        assert_eq!(span_to_markdown(&span), "plain text");
+    }
+
+    #[test]
+    fn test_span_to_markdown_unknown_flag_ignored() {
+        // Flags outside the documented set do not produce any styling
+        let span = make_test_span("plain text", &["underline"]);
+        assert_eq!(span_to_markdown(&span), "plain text");
+    }
+
+    #[test]
+    fn test_span_to_markdown_special_chars_escaped() {
+        // Special chars escaped: span text "1*2" -> "1\*2"
+        let span = make_test_span("1*2", &[]);
+        assert_eq!(span_to_markdown(&span), "1\\*2");
+    }
+
+    #[test]
+    fn test_span_to_markdown_bold_subscript_combination() {
+        // Bold + subscript: **<sub>text</sub>**
+        let span = make_test_span("ion", &["bold", "subscript"]);
+        assert_eq!(span_to_markdown(&span), "**<sub>ion</sub>**");
+    }
+
+    #[test]
+    fn test_span_to_markdown_bold_superscript_combination() {
+        // Bold + superscript: **<sup>text</sup>**
+        let span = make_test_span("st", &["bold", "superscript"]);
+        assert_eq!(span_to_markdown(&span), "**<sup>st</sup>**");
+    }
+
+    #[test]
+    fn test_span_to_markdown_italic_subscript_combination() {
+        // Italic + subscript: *<sub>text</sub>*
+        let span = make_test_span("ion", &["italic", "subscript"]);
+        assert_eq!(span_to_markdown(&span), "*<sub>ion</sub>*");
+    }
+
+    #[test]
+    fn test_span_to_markdown_subscript_wins_over_superscript() {
+        // Only one script tag is emitted; subscript takes precedence
+        let span = make_test_span("x", &["superscript", "subscript"]);
+        assert_eq!(span_to_markdown(&span), "<sub>x</sub>");
+    }
+
+    #[test]
+    fn test_span_to_markdown_all_flags() {
+        // All flags: bold + italic + smallcaps + superscript
+        let span = make_test_span("X", &["bold", "italic", "smallcaps", "superscript"]);
+        assert_eq!(
+            span_to_markdown(&span),
+            "***<span style=\"font-variant: small-caps\"><sup>X</sup></span>***"
+        );
+    }
+
+    #[test]
+    fn test_span_to_markdown_whitespace_only() {
+        // Empty/whitespace-only spans emit unwrapped
+        let span = make_test_span(" ", &["bold"]);
+        assert_eq!(span_to_markdown(&span), " ");
+    }
+
+    #[test]
+    fn test_span_to_markdown_empty_string() {
+        let span = make_test_span("", &["bold"]);
+        assert_eq!(span_to_markdown(&span), "");
+    }
+
+    #[test]
+    fn test_span_to_markdown_edge_whitespace_outside_delimiters() {
+        // `**bold **` is not strong emphasis in CommonMark, so edge whitespace
+        // must be emitted outside the wrappers.
+        let span = make_test_span(" bold ", &["bold"]);
+        assert_eq!(span_to_markdown(&span), " **bold** ");
+
+        let span = make_test_span("2 ", &["italic", "subscript"]);
+        assert_eq!(span_to_markdown(&span), "*<sub>2</sub>* ");
+    }
+
+    #[test]
+    fn test_span_to_markdown_edge_whitespace_preserved_without_flags() {
+        let span = make_test_span("  a*b\t", &[]);
+        assert_eq!(span_to_markdown(&span), "  a\\*b\t");
+    }
+
+    #[test]
+    fn test_escape_markdown_inline_asterisk() {
+        assert_eq!(escape_markdown_inline("1*2"), "1\\*2");
+    }
+
+    #[test]
+    fn test_escape_markdown_inline_underscore() {
+        assert_eq!(escape_markdown_inline("hello_world"), "hello\\_world");
+    }
+
+    #[test]
+    fn test_escape_markdown_inline_backtick() {
+        assert_eq!(escape_markdown_inline("code`here"), "code\\`here");
+    }
+
+    #[test]
+    fn test_escape_markdown_inline_brackets() {
+        assert_eq!(escape_markdown_inline("[link]"), "\\[link\\]");
+    }
+
+    #[test]
+    fn test_escape_markdown_inline_multiple_special() {
+        assert_eq!(escape_markdown_inline("*_[link]*"), "\\*\\_\\[link\\]\\*");
+    }
+
+    #[test]
+    fn test_escape_markdown_inline_backslash() {
+        assert_eq!(escape_markdown_inline("C:\\path"), "C:\\\\path");
+    }
+
+    #[test]
+    fn test_escape_markdown_inline_hash() {
+        assert_eq!(escape_markdown_inline("#heading"), "\\#heading");
+    }
+
+    #[test]
+    fn test_escape_markdown_inline_plus_minus() {
+        assert_eq!(escape_markdown_inline("+/-"), "\\+/-");
+    }
+
+    #[test]
+    fn test_escape_markdown_inline_less_greater() {
+        // < and > are escaped (HTML tags/auto-links)
+        assert_eq!(escape_markdown_inline("<tag>"), "\\<tag\\>");
+    }
+
+    #[test]
+    fn test_escape_markdown_inline_empty() {
+        assert_eq!(escape_markdown_inline(""), "");
+    }
+
+    #[test]
+    fn test_span_to_markdown_bold_with_asterisk_in_text() {
+        // Bold text containing asterisks should be escaped
+        let span = make_test_span("2*2=4", &["bold"]);
+        assert_eq!(span_to_markdown(&span), "**2\\*2=4**");
+    }
+
+    #[test]
+    fn test_span_to_markdown_subscript_with_special_chars() {
+        // Subscript with special characters
+        let span = make_test_span("2+", &["subscript"]);
+        assert_eq!(span_to_markdown(&span), "<sub>2\\+</sub>");
+    }
+
+    #[test]
+    fn test_span_to_markdown_superscript_with_special_chars() {
+        // Superscript with special characters
+        let span = make_test_span("n-1", &["superscript"]);
+        assert_eq!(span_to_markdown(&span), "<sup>n-1</sup>");
+    }
+
+    #[test]
+    fn test_span_to_markdown_smallcaps_with_special_chars() {
+        // Smallcaps with underscore
+        let span = make_test_span("HELLO_WORLD", &["smallcaps"]);
+        assert_eq!(
+            span_to_markdown(&span),
+            "<span style=\"font-variant: small-caps\">HELLO\\_WORLD</span>"
+        );
+    }
+}
diff --git a/notes/pdftract-56yz8.md b/notes/pdftract-56yz8.md
new file mode 100644
index 0000000..3c2d4f1
--- /dev/null
+++ b/notes/pdftract-56yz8.md
@@ -0,0 +1,66 @@
+# Bead pdftract-56yz8: Inline Span Styling (Phase 6.5)
+
+## Summary
+
+Implemented `span_to_markdown` function that translates span flag bitmask values to Markdown inline syntax per Phase 6.5 of the plan (lines 2188-2195).
+
+## Changes Made
+
+### File: `crates/pdftract-core/src/markdown.rs`
+
+1. Added `SpanJson` import to the module
+2. Implemented `span_to_markdown(span: &SpanJson) -> String`:
+ - Reads span flags vector (`Vec<String>`) for style indicators
+ - Emits appropriate Markdown syntax based on flags
+ - Handles combinations: bold+italic → `***text***`
+ - Handles script nesting: `**<sub>text</sub>**` (scripts inside bold/italic)
+ - Handles smallcaps+script: `**<span style="font-variant: small-caps"><sub>text</sub></span>**` (scripts inside smallcaps)
+ - Skips whitespace-only spans (no point styling whitespace)
+ - Color-only differences: no styling emitted
+
+3. Implemented `escape_markdown_inline(s: &str) -> String`:
+ - Escapes CommonMark special characters: `\` `` ` `` `*` `_` `[` `]` `(` `)` `#` `!` `+` `<` `>`
+ - Does NOT escape `-` `.` `=` (not special in inline context per CommonMark)
+
+4. Added comprehensive test coverage (20+ tests):
+ - Bold, italic, bold+italic combinations
+ - Subscript, superscript, smallcaps individually
+ - Combined styling (bold+subscript, bold+superscript, italic+subscript, all flags)
+ - Special character escaping
+ - Whitespace-only edge cases
+
+### File: `crates/pdftract-core/src/lib.rs`
+
+- Exported `span_to_markdown` from the markdown module for public API
+
+## Acceptance Criteria Status
+
+| Criterion | Test | Status |
+|-----------|------|--------|
+| Bold + italic → ***text*** | `test_span_to_markdown_bold_italic` | PASS |
+| Subscript → `<sub>2</sub>` | `test_span_to_markdown_subscript` | PASS |
+| Superscript → `<sup>th</sup>` | `test_span_to_markdown_superscript` | PASS |
+| Smallcaps → `<span style="font-variant: small-caps">CAPS</span>` | `test_span_to_markdown_smallcaps` | PASS |
+| Color-only difference: no styling | `test_span_to_markdown_no_flags` | PASS |
+| Special chars escaped: "1*2" → "1\*2" | `test_span_to_markdown_special_chars_escaped` | PASS |
+
+## Test Results
+
+```
+cargo test --package pdftract-core --lib markdown
+test result: ok. 43 passed; 0 failed
+```
+
+All acceptance criteria tests pass.
+
+## Implementation Notes
+
+1. **Nesting order**: Following plan guidance "emit `**<sub>text</sub>**` not `<sub>**text**</sub>`", script tags are placed inside bold/italic wrappers. For smallcaps+script combinations, smallcaps wraps scripts (e.g., `<span style="font-variant: small-caps"><sub>text</sub></span>`).
+
+2. **Escaping**: Implemented per CommonMark spec - only escapes characters that have special meaning in inline Markdown context. Characters like `-` and `.` are NOT escaped because they're only special at line start (for lists/HR), not inline.
+
+3. **Edge cases**: Whitespace-only spans skip styling entirely to avoid emitting empty formatting like `** **`.
+
+## Commits
+
+- `pdftract-core`: Add span_to_markdown function with inline span styling (Phase 6.5)