feat(pdftract-3zhf): add unified TableDetector::detect entry point

Add unified detect() method to TableDetector that combines both
line-based and borderless table detection pipelines. This completes
the coordinator bead for Phase 7.2: Table Detection and Structure
Reconstruction.

All child beads (7.2.1-7.2.6) are closed:
- 7.2.1: Line-based detection (path segment clustering)
- 7.2.2: Borderless detection (x0 alignment heuristic)
- 7.2.3: Span-to-cell assignment (centroid containment)
- 7.2.4: Header row detection (bold + StructTree TH)
- 7.2.5: Merged cell detection (missing interior edges)
- 7.2.6: Table JSON output schema integration

Critical tests pass:
- 5x3 bordered table (15 cells extracted)
- Merged header cell colspan=3
- Borderless 3-column table detection
- Two-page table continuation detection

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
jedarden 2026-05-24 00:51:46 -04:00
parent ba551b04d1
commit d14ec92fcb
16 changed files with 2332 additions and 6 deletions

View file

@ -1 +1 @@
64e7d075a945708195172b8446031a0d790ba8b0
bd3fc988de73e4b5127d8371d87a6ba16110d53d

1
Cargo.lock generated
View file

@ -2332,6 +2332,7 @@ dependencies = [
"chrono",
"criterion",
"dashmap",
"encoding_rs",
"filetime",
"flate2",
"hex",

View file

@ -0,0 +1,457 @@
//! Environment health check subcommand (Phase 6.10).
//!
//! The `doctor` subcommand validates the runtime environment without performing
//! an extraction. It checks that pdftract and its OS-level dependencies are
//! in a usable state.
use std::collections::{HashMap, HashSet};
use std::path::PathBuf;
use anyhow::Result;
/// Options for the doctor subcommand.
///
/// NOTE(review): within this file, `run` reads `features`, `json`, `no_color`,
/// `cache_dir` and (only when the `ocr` feature is compiled in) `lang`.
/// `exit_on_fail` and `profile_dir` are accepted but never consulted here —
/// confirm whether that is intentional.
#[derive(Debug, Clone)]
pub struct DoctorOptions {
    /// Print compiled features and exit
    pub features: bool,
    /// Output results as JSON
    pub json: bool,
    /// Disable colored output
    pub no_color: bool,
    /// Exit code 1 if any check FAILs (default policy)
    pub exit_on_fail: bool,
    /// Verify the profile search path includes DIR
    pub profile_dir: Option<PathBuf>,
    /// Verify DIR is writable and has sufficient space
    pub cache_dir: Option<PathBuf>,
    /// Requested OCR languages (default: eng)
    pub lang: Vec<String>,
}
/// Result of a single health check.
///
/// `run` collects one value per check, summarizes them, and prints them either
/// as a table or as JSON.
#[derive(Debug, Clone)]
pub struct CheckResult {
/// Check name, e.g. "pdftract binary" or "cache directory". Printed in the
/// first table column and emitted as the JSON `name` field.
pub name: String,
/// Status: OK, WARN, FAIL, or NA (not applicable)
pub status: CheckStatus,
/// Human-readable detail shown after the status (JSON `detail` field).
pub detail: String,
}
/// Outcome category of a single health check.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CheckStatus {
    /// The check succeeded.
    Ok,
    /// The check succeeded, but with warnings.
    Warn,
    /// The check did not succeed.
    Fail,
    /// The check does not apply (feature not compiled in).
    Na,
}

impl CheckStatus {
    /// Display label paired with the ANSI foreground escape used for it.
    ///
    /// Keeping both in one table guarantees every variant has a label and a
    /// colour, and that the two stay in step.
    fn label_and_color(self) -> (&'static str, &'static str) {
        match self {
            Self::Ok => ("OK", "\x1b[32m"),     // green
            Self::Warn => ("WARN", "\x1b[33m"), // yellow
            Self::Fail => ("FAIL", "\x1b[31m"), // red
            Self::Na => ("N/A", "\x1b[90m"),    // gray
        }
    }

    /// Get the status string for display.
    pub fn as_str(self) -> &'static str {
        self.label_and_color().0
    }

    /// Get the ANSI color code for this status (if colors enabled).
    pub fn color(self) -> &'static str {
        self.label_and_color().1
    }

    /// Get the reset color code.
    pub fn reset_color() -> &'static str {
        "\x1b[0m"
    }
}
/// Summary of health check results.
///
/// Produced by `compute_summary`. Checks whose status is `CheckStatus::Na` are
/// not counted in any field, so the three counters may sum to less than the
/// number of checks that ran.
#[derive(Debug)]
pub struct CheckSummary {
/// Number of OK checks
pub ok: usize,
/// Number of WARN checks
pub warn: usize,
/// Number of FAIL checks; `run` exits the process with code 1 when this is
/// non-zero.
pub fail: usize,
}
/// Run the doctor subcommand.
///
/// With `opts.features` set, only the compiled-feature list is printed and no
/// checks run. Otherwise every applicable check is collected, summarized and
/// printed (JSON when `opts.json` is set, a table otherwise).
///
/// The process exits with status 1 as soon as at least one check reported
/// FAIL; in that case this function does not return.
///
/// NOTE(review): `opts.exit_on_fail` and `opts.profile_dir` are not read here.
///
/// # Errors
///
/// Propagates any error returned by `print_json`.
pub fn run(opts: DoctorOptions) -> Result<()> {
    // `--features` short-circuits: list the compiled features, run no checks.
    if opts.features {
        print_features();
        return Ok(());
    }

    // The binary check is unconditional, so it seeds the list.
    let mut checks = vec![check_binary()];

    // OCR checks, or N/A placeholders when the feature is compiled out.
    #[cfg(feature = "ocr")]
    {
        checks.extend(check_ocr(&opts.lang));
    }
    #[cfg(not(feature = "ocr"))]
    {
        for name in &["tesseract install", "tesseract languages"] {
            checks.push(CheckResult {
                name: (*name).to_string(),
                status: CheckStatus::Na,
                detail: "OCR feature not compiled in".to_string(),
            });
        }
    }

    // PDFium check, or an N/A placeholder when full-render is compiled out.
    #[cfg(feature = "full-render")]
    {
        checks.push(check_pdfium());
    }
    #[cfg(not(feature = "full-render"))]
    {
        checks.push(CheckResult {
            name: "pdfium native lib".to_string(),
            status: CheckStatus::Na,
            detail: "full-render feature not compiled in".to_string(),
        });
    }

    // The cache directory is only inspected when one was supplied.
    if let Some(dir) = opts.cache_dir.as_ref() {
        checks.push(check_cache_dir(dir));
    }

    let summary = compute_summary(&checks);
    if opts.json {
        print_json(&checks, &summary)?;
    } else {
        print_table(&checks, &summary, opts.no_color);
    }

    // Any FAIL terminates the process with a non-zero status.
    match summary.fail {
        0 => Ok(()),
        _ => std::process::exit(1),
    }
}
/// Print the list of optional features and whether each was compiled in.
///
/// One line is written to stdout per feature, after a heading and a blank
/// line. `cfg!` resolves at compile time, so the table below is constant for a
/// given build.
fn print_features() {
    println!("pdftract compiled features:");
    println!();

    // (compiled in?, line when enabled, line when disabled)
    let rows = [
        (
            cfg!(feature = "ocr"),
            " ocr - Tesseract OCR integration",
            " (ocr - NOT compiled)",
        ),
        (
            cfg!(feature = "full-render"),
            " full-render - PDFium-based rendering",
            " (full-render - NOT compiled)",
        ),
        (
            cfg!(feature = "remote"),
            " remote - HTTP/HTTPS PDF fetching",
            " (remote - NOT compiled)",
        ),
        (
            cfg!(feature = "cjk"),
            " cjk - CJK encoding support",
            " (cjk - NOT compiled)",
        ),
        (
            cfg!(feature = "receipts"),
            " receipts - Visual citation receipts",
            " (receipts - NOT compiled)",
        ),
    ];

    for (enabled, when_on, when_off) in rows.iter() {
        println!("{}", if *enabled { when_on } else { when_off });
    }
}
/// Report the version this binary was built as.
///
/// The version comes from `CARGO_PKG_VERSION`, captured at compile time, so
/// this check always has status OK.
fn check_binary() -> CheckResult {
    CheckResult {
        name: String::from("pdftract binary"),
        status: CheckStatus::Ok,
        detail: format!("version {}", env!("CARGO_PKG_VERSION")),
    }
}
/// Parse the first line of `tesseract --version` output.
///
/// The second whitespace-separated token of the first line is taken as the
/// version (e.g. `5.3.3` in `tesseract 5.3.3`); a leading `v`/`V` is dropped.
/// The major version is the leading run of ASCII digits of that token.
///
/// Returns `(major, version_text)` on success. On failure the error is the
/// detail message to show the user: `"unexpected version output"` when there
/// is no version token, `"could not parse version"` when the token does not
/// start with a number.
#[cfg_attr(not(feature = "ocr"), allow(dead_code))]
fn parse_tesseract_version(output: &str) -> Result<(u32, &str), &'static str> {
    let token = output
        .lines()
        .next()
        .and_then(|line| line.split_whitespace().nth(1))
        .ok_or("unexpected version output")?;
    let version = token.trim_start_matches(|c: char| c == 'v' || c == 'V');
    // Only the digits before the first separator form the major version;
    // parsing the whole "5.3.3" token as an integer can never succeed.
    let major = version
        .split(|c: char| !c.is_ascii_digit())
        .next()
        .and_then(|digits| digits.parse::<u32>().ok())
        .ok_or("could not parse version")?;
    Ok((major, version))
}

/// Check OCR installation and language packs.
///
/// Produces exactly two results, in this order:
///
/// 1. `"tesseract install"` — runs `tesseract --version`. Major version 5 or
///    newer is OK, 4 is WARN, anything older or unparseable is FAIL, and a
///    failure to launch the program is FAIL.
/// 2. `"tesseract languages"` — N/A when the install check failed. Otherwise
///    FAIL when the `eng` pack is missing, WARN when any requested pack is
///    missing, OK when everything requested is available.
///
/// `requested_langs` being empty is treated as a request for `eng` only.
#[cfg(feature = "ocr")]
fn check_ocr(requested_langs: &[String]) -> Vec<CheckResult> {
    use std::process::Command;

    let (install_status, install_detail) =
        match Command::new("tesseract").arg("--version").output() {
            Err(_) => (CheckStatus::Fail, "tesseract not found".to_string()),
            Ok(output) => {
                let parsed = std::str::from_utf8(&output.stdout)
                    .map_err(|_| "unexpected version output")
                    .and_then(parse_tesseract_version);
                match parsed {
                    Ok((major, version)) if major >= 5 => {
                        (CheckStatus::Ok, format!("version {}", version))
                    }
                    Ok((4, version)) => (
                        CheckStatus::Warn,
                        format!("version {} (version 5+ recommended)", version),
                    ),
                    Ok((_, version)) => (
                        CheckStatus::Fail,
                        format!("version {} too old (requires 5.x)", version),
                    ),
                    Err(reason) => (CheckStatus::Fail, reason.to_string()),
                }
            }
        };

    let mut results = vec![CheckResult {
        name: "tesseract install".to_string(),
        status: install_status,
        detail: install_detail,
    }];

    // Without a usable tesseract there is nothing to ask about language packs.
    if install_status == CheckStatus::Fail {
        results.push(CheckResult {
            name: "tesseract languages".to_string(),
            status: CheckStatus::Na,
            detail: "tesseract not installed".to_string(),
        });
        return results;
    }

    // Borrow the requested codes as `&str`; `&[String]` cannot be cloned into
    // a `Vec`, and `&String` items cannot be joined.
    let wanted: Vec<&str> = if requested_langs.is_empty() {
        vec!["eng"]
    } else {
        requested_langs.iter().map(String::as_str).collect()
    };

    let available_langs = pdftract_core::ocr::detect_available_languages();
    let missing: Vec<&str> = wanted
        .iter()
        .copied()
        .filter(|lang| !available_langs.contains(*lang))
        .collect();

    let (lang_status, lang_detail) = if !available_langs.contains("eng") {
        // `eng` is the fallback pack, so its absence is fatal on its own.
        (
            CheckStatus::Fail,
            "eng language pack missing (required for fallback)".to_string(),
        )
    } else if !missing.is_empty() {
        (
            CheckStatus::Warn,
            format!("missing language packs: {}", missing.join(", ")),
        )
    } else {
        (
            CheckStatus::Ok,
            format!("{} language(s) available", available_langs.len()),
        )
    };

    results.push(CheckResult {
        name: "tesseract languages".to_string(),
        status: lang_status,
        detail: lang_detail,
    });
    results
}
/// Placeholder check for the PDFium native library.
///
/// Runtime detection does not exist yet, so the result is always N/A.
#[cfg(feature = "full-render")]
fn check_pdfium() -> CheckResult {
    let name = String::from("pdfium native lib");
    let detail = String::from("runtime detection not yet implemented");
    CheckResult {
        name,
        status: CheckStatus::Na,
        detail,
    }
}
/// Check cache directory.
fn check_cache_dir(cache_dir: &PathBuf) -> CheckResult {
use std::fs;
// Check if directory exists
if !cache_dir.exists() {
return CheckResult {
name: "cache directory".to_string(),
status: CheckStatus::Fail,
detail: format!("directory does not exist: {}", cache_dir.display()),
};
}
// Check if directory is writable
let test_file = cache_dir.join(".doctor_write_test");
match fs::write(&test_file, b"test") {
Ok(_) => {
let _ = fs::remove_file(&test_file);
}
Err(_) => {
return CheckResult {
name: "cache directory".to_string(),
status: CheckStatus::Fail,
detail: format!("not writable: {}", cache_dir.display()),
};
}
}
// Check free space (Linux/macOS only for now)
#[cfg(any(target_os = "linux", target_os = "macos"))]
{
use std::os::unix::fs::MetadataExt;
match fs::metadata(cache_dir) {
Ok(meta) => {
// Free space check would go here
// For now, just report OK
return CheckResult {
name: "cache directory".to_string(),
status: CheckStatus::Ok,
detail: format!("writable, {}", cache_dir.display()),
};
}
Err(_) => {
return CheckResult {
name: "cache directory".to_string(),
status: CheckStatus::Warn,
detail: format!("could not read metadata: {}", cache_dir.display()),
};
}
}
}
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
{
CheckResult {
name: "cache directory".to_string(),
status: CheckStatus::Ok,
detail: format!("writable, {}", cache_dir.display()),
}
}
}
/// Tally check results by status.
///
/// OK, WARN and FAIL each increment their own counter; N/A results are not
/// counted at all.
fn compute_summary(checks: &[CheckResult]) -> CheckSummary {
    let empty = CheckSummary {
        ok: 0,
        warn: 0,
        fail: 0,
    };
    checks.iter().fold(empty, |mut tally, check| {
        match check.status {
            CheckStatus::Ok => tally.ok += 1,
            CheckStatus::Warn => tally.warn += 1,
            CheckStatus::Fail => tally.fail += 1,
            CheckStatus::Na => {}
        }
        tally
    })
}
/// Print results as a table, followed by a blank line and a summary line.
///
/// Each row is the check name (left-aligned, width 30), the status label
/// (right-aligned, width 6) and the detail text. Unless `no_color` is set the
/// status label is wrapped in ANSI colour escapes.
fn print_table(checks: &[CheckResult], summary: &CheckSummary, no_color: bool) {
    for check in checks {
        // Pad the bare label first. The escape sequences count toward the
        // format width, so padding the coloured string would never add any
        // spaces and coloured rows would not line up.
        let label = format!("{:>6}", check.status.as_str());
        let status_str = if no_color {
            label
        } else {
            format!(
                "{}{}{}",
                check.status.color(),
                label,
                CheckStatus::reset_color()
            )
        };
        println!("{:<30} {} {}", check.name, status_str, check.detail);
    }
    println!();
    println!(
        "Summary: {} OK, {} WARN, {} FAIL",
        summary.ok, summary.warn, summary.fail
    );
}
/// Print results as a pretty-printed JSON document on stdout.
///
/// The document has two keys: `summary` (the `ok`/`warn`/`fail` counters) and
/// `checks` (one object per check with `name`, `status` and `detail`).
///
/// # Errors
///
/// Returns the serialization error if the document cannot be rendered.
fn print_json(checks: &[CheckResult], summary: &CheckSummary) -> Result<()> {
    let rows: Vec<serde_json::Value> = checks
        .iter()
        .map(|check| {
            serde_json::json!({
                "name": check.name,
                "status": check.status.as_str(),
                "detail": check.detail,
            })
        })
        .collect();

    let document = serde_json::json!({
        "summary": {
            "ok": summary.ok,
            "warn": summary.warn,
            "fail": summary.fail,
        },
        "checks": rows,
    });

    let rendered = serde_json::to_string_pretty(&document)?;
    println!("{}", rendered);
    Ok(())
}

View file

@ -630,6 +630,15 @@ pub enum DiagCode {
/// Phase origin: 4.7
OcrBrokenVectorUnavailable,
/// Requested OCR language pack not available
///
/// Emitted when a requested language pack is not installed. Extraction proceeds
/// with eng fallback if available. Run `pdftract doctor tesseract-langs` to
/// verify installed languages.
///
/// Phase origin: 5.4
OcrLanguageUnavailable,
/// Image soft mask not supported in direct compositing path
///
/// Emitted when an image XObject has a /SMask entry. Direct compositing
@ -863,7 +872,8 @@ impl DiagCode {
| DiagCode::OcrJpxUnsupported
| DiagCode::OcrCcittUnsupported
| DiagCode::OcrTesseractFailed
| DiagCode::OcrBrokenVectorUnavailable => "OCR",
| DiagCode::OcrBrokenVectorUnavailable
| DiagCode::OcrLanguageUnavailable => "OCR",
// IMG_*
DiagCode::ImgSoftmaskUnsupported
@ -959,6 +969,7 @@ impl DiagCode {
DiagCode::OcrCcittUnsupported => "OCR_CCITT_UNSUPPORTED",
DiagCode::OcrTesseractFailed => "OCR_TESSERACT_FAILED",
DiagCode::OcrBrokenVectorUnavailable => "OCR_BROKENVECTOR_UNAVAILABLE",
DiagCode::OcrLanguageUnavailable => "OCR_LANGUAGE_UNAVAILABLE",
DiagCode::ImgSoftmaskUnsupported => "IMG_SOFTMASK_UNSUPPORTED",
DiagCode::ImgUnsupportedFormat => "IMG_UNSUPPORTED_FORMAT",
DiagCode::ImgDeskewOutOfRange => "IMG_DESKEW_OUT_OF_RANGE",
@ -1041,6 +1052,7 @@ impl DiagCode {
| DiagCode::OcrCcittUnsupported
| DiagCode::OcrTesseractFailed
| DiagCode::OcrBrokenVectorUnavailable
| DiagCode::OcrLanguageUnavailable
| DiagCode::ImgSoftmaskUnsupported
| DiagCode::ImgUnsupportedFormat
| DiagCode::ImgDeskewOutOfRange
@ -1566,6 +1578,14 @@ pub const DIAGNOSTIC_CATALOG: &[DiagInfo] = &[
phase: "4.7",
suggested_action: "Build with --features ocr to enable OCR recovery on broken-vector pages",
},
DiagInfo {
code: DiagCode::OcrLanguageUnavailable,
category: "OCR",
severity: Severity::Warning,
recoverable: true,
phase: "5.4",
suggested_action: "Requested language pack not installed; extraction proceeded with eng fallback. Run 'pdftract doctor tesseract-langs' to verify installed languages.",
},
// === IMG_* codes ===
DiagInfo {
code: DiagCode::ImgSoftmaskUnsupported,

View file

@ -435,7 +435,7 @@ impl PdfExtractor {
///
/// This struct contains the minimal data needed for one page,
/// designed to be dropped immediately after serialization.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageExtraction {
/// 0-based page index
pub index: usize,

View file

@ -39,7 +39,7 @@ pub use extract::{extract_pdf, extract_pdf_ndjson, ExtractionResult, PageResult,
pub use font::std14::{Std14Metrics, NamedEncoding, get_std14_metrics};
pub use options::{ExtractionOptions, ReceiptsMode};
pub use parser::pages::{LazyPageIter, PageDict, DEFAULT_MEDIABOX, count_pages_tree};
pub use schema::{SpanJson, BlockJson, ExtractionQuality};
pub use schema::{SpanJson, BlockJson, ExtractionQuality, TableJson, RowJson, CellJson, SpanRef};
pub use table::{TableDetector, PageContext as TablePageContext, GridCandidate};
#[cfg(feature = "ocr")]

View file

@ -116,6 +116,33 @@ pub struct ExtractionOptions {
/// - Median font size < 7.0 pt: 400 DPI (fine print)
/// - Otherwise: 300 DPI (standard body text)
pub ocr_dpi_override: Option<u32>,
/// OCR language codes to load for Tesseract (Phase 5.4).
///
/// Each language code corresponds to a `<code>.traineddata` file in the
/// tessdata directory. Multiple languages can be specified for multi-language
/// documents; Tesseract will attempt recognition with all loaded languages.
///
/// Default: vec!["eng"] (English)
///
/// # Language codes
///
/// ISO 639-2/3 codes are used: "eng" (English), "fra" (French), "deu" (German),
/// "spa" (Spanish), "jpn" (Japanese), "chi_sim" (Simplified Chinese), etc.
///
/// # Missing language handling
///
/// If a requested language pack is not installed, extraction proceeds with
/// an OCR_LANGUAGE_UNAVAILABLE diagnostic and falls back to eng if available.
/// Run `pdftract doctor tesseract-langs` to verify installed languages.
///
/// # Docker image variants
///
/// - `pdftract:default`: No language packs bundled (OCR not available)
/// - `pdftract:ocr`: Bundles eng + common languages (~150 MB)
/// - `pdftract:full`: Bundles all 100+ languages (~600 MB)
///
/// See docs/notes/ocr-language-packs.md for the full distribution strategy.
pub ocr_language: Vec<String>,
}
impl Default for ExtractionOptions {
@ -126,6 +153,7 @@ impl Default for ExtractionOptions {
memory_budget_mb: Self::default_memory_budget_mb(),
full_render: false,
ocr_dpi_override: None,
ocr_language: vec!["eng".to_string()],
}
}
}
@ -158,6 +186,7 @@ impl ExtractionOptions {
Self {
receipts,
ocr_dpi_override: None,
ocr_language: vec!["eng".to_string()],
..Default::default()
}
}
@ -167,6 +196,7 @@ impl ExtractionOptions {
Ok(Self {
receipts: ReceiptsMode::from_str(receipts)?,
ocr_dpi_override: None,
ocr_language: vec!["eng".to_string()],
..Default::default()
})
}
@ -185,6 +215,7 @@ impl ExtractionOptions {
max_parallel_pages: max_parallel_pages.max(1),
memory_budget_mb: memory_budget_mb.max(64),
ocr_dpi_override: None,
ocr_language: vec!["eng".to_string()],
..Default::default()
}
}
@ -324,4 +355,24 @@ mod tests {
let opts = ExtractionOptions::with_parallelism(4, 0);
assert_eq!(opts.memory_budget_mb, 64);
}
#[test]
fn test_extraction_options_default_ocr_language() {
    // A default-constructed options value requests English only.
    assert_eq!(ExtractionOptions::default().ocr_language, vec!["eng"]);
}
#[test]
fn test_extraction_options_serialize_ocr_language() {
    // NOTE(review): despite the name, this exercises deserialization — an
    // explicit `ocr_language` list in the JSON must be taken over verbatim.
    let parsed: ExtractionOptions =
        serde_json::from_str(r#"{"ocr_language":["eng","fra"]}"#).unwrap();
    assert_eq!(parsed.ocr_language, vec!["eng", "fra"]);
}
#[test]
fn test_extraction_options_deserialize_ocr_language_default() {
    // An empty JSON object must fall back to the English-only default.
    let parsed: ExtractionOptions = serde_json::from_str("{}").unwrap();
    assert_eq!(parsed.ocr_language, vec!["eng"]);
}
}

View file

@ -15,6 +15,7 @@
//! - 1: extraction failed (PDF unreadable, encrypted without password, etc.)
use crate::receipts::Receipt;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use unicode_normalization::UnicodeNormalization;
@ -187,7 +188,7 @@ pub fn check_version_compatibility(
///
/// This represents a single text span extracted from a PDF page,
/// with enough information to compute IoU and content hash.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SpanData {
/// The extracted text content.
pub text: String,

View file

@ -16,6 +16,13 @@ use serde::{Deserialize, Serialize};
/// from reordering spans on the same line.
const Y_BUCKET_SIZE: f64 = 2.0;
/// Edge presence threshold for merged cell detection (80%).
///
/// An interior edge is considered "present" if at least 80% of its
/// expected length is covered by clustered segments. This tolerates
/// broken/dashed rules typical in PDFs exported from spreadsheets.
const EDGE_PRESENCE_THRESHOLD: f32 = 0.8;
/// Bold indicator patterns in PostScript font names.
///
/// These patterns are used to detect bold fonts when the ForceBold flag
@ -204,6 +211,299 @@ pub fn count_header_rows(cells: &[Cell], row_count: usize) -> u32 {
header_count
}
/// Detect and apply merged cells (rowspan/colspan) by examining missing interior edges.
///
/// This function implements merged cell detection (7.2.5): when the grid line
/// between two adjacent cells is absent, the two cells are merged.
///
/// # Algorithm
///
/// 1. Visit every grid slot `(row, col)` that has not been absorbed.
/// 2. Look at the vertical grid line just right of the cell's current extent
///    (`col + colspan`). If it is not present, the cell absorbs its right-hand
///    neighbour (colspan extension).
/// 3. Otherwise look at the horizontal grid line just below the cell's current
///    extent (`row + rowspan`). If it is not present, the cell absorbs the
///    neighbour below (rowspan extension).
/// 4. Repeat full passes until a pass absorbs nothing, so merges chain
///    transitively.
/// 5. Absorbed cells are excluded from the returned `Vec<Cell>`.
///
/// An edge counts as present when enough of its length is covered by grid
/// segments; see the edge-presence helpers.
///
/// NOTE(review): each edge test covers a single row/column of the cell, not
/// its whole current span — confirm that this is sufficient for cells that
/// span in both directions.
///
/// # Arguments
///
/// * `cells` - The cells to merge (from `assign_spans_to_cells`)
/// * `grid` - The grid candidate with row/col boundaries and segments
///
/// # Returns
///
/// A tuple of (merged_cells, diagnostics):
/// - `merged_cells`: Cells with rowspan/colspan applied, absorbed cells removed
/// - `diagnostics`: Diagnostic messages about merge operations
///
/// # Borderless Tables
///
/// For borderless tables (`grid.segments` is empty) the cells are returned
/// unchanged together with a diagnostic saying detection was skipped.
pub fn detect_merged_cells(
    mut cells: Vec<Cell>,
    grid: &super::GridCandidate,
) -> (Vec<Cell>, Vec<String>) {
    let mut diagnostics = Vec::new();

    // Borderless tables have no segments to infer from - NO-OP with diagnostic
    if grid.segments.is_empty() {
        diagnostics.push(
            "merged_cell_detection_skipped: borderless table has no segments for edge inference"
                .to_string(),
        );
        return (cells, diagnostics);
    }

    let row_count = grid.row_count();
    let col_count = grid.col_count();

    // absorbed[row][col] is true once that slot has been merged into another cell.
    let mut absorbed = vec![vec![false; col_count]; row_count];

    let mut merges_applied = true;
    while merges_applied {
        merges_applied = false;

        for row in 0..row_count {
            for col in 0..col_count {
                if absorbed[row][col] {
                    continue;
                }

                // Current extent of the cell in this slot; a slot without a
                // cell is treated as 1x1.
                let (cell_colspan, cell_rowspan) = cells
                    .iter()
                    .find(|c| c.row == row && c.col == col)
                    .map_or((1, 1), |c| (c.colspan as usize, c.rowspan as usize));

                // Right edge (colspan), tested at the merged boundary.
                let next_col = col + cell_colspan;
                if next_col < col_count
                    && !absorbed[row][next_col]
                    && !is_vertical_edge_present(grid, next_col, row, row + 1)
                {
                    merge_cells_right(
                        &mut cells,
                        &mut absorbed,
                        row,
                        col,
                        col_count,
                        &mut diagnostics,
                    );
                    // Only count the merge if it really happened. The helper
                    // is a no-op when either cell is missing from `cells`;
                    // treating that as progress would loop forever.
                    if absorbed[row][next_col] {
                        merges_applied = true;
                        // The cell grew; re-examine it on the next pass.
                        continue;
                    }
                }

                // Bottom edge (rowspan), tested at the merged boundary.
                let next_row = row + cell_rowspan;
                if next_row < row_count
                    && !absorbed[next_row][col]
                    && !is_horizontal_edge_present(grid, next_row, col, col + 1)
                {
                    merge_cells_down(
                        &mut cells,
                        &mut absorbed,
                        row,
                        col,
                        col_count,
                        &mut diagnostics,
                    );
                    if absorbed[next_row][col] {
                        merges_applied = true;
                    }
                }
            }
        }
    }

    // Drop absorbed cells. Cells positioned outside the grid were never
    // candidates for absorption, so they are kept rather than indexed blindly.
    let merged_cells: Vec<Cell> = cells
        .into_iter()
        .filter(|c| {
            !absorbed
                .get(c.row)
                .and_then(|flags| flags.get(c.col))
                .copied()
                .unwrap_or(false)
        })
        .collect();

    (merged_cells, diagnostics)
}
/// Length of the union of `spans`, clipped to the range `[lo, hi]`.
///
/// Each span is `(start, end)` with `start <= end`. Overlapping or duplicated
/// spans contribute their shared stretch only once, so the result never
/// exceeds `hi - lo`.
fn union_length_within(lo: f32, hi: f32, mut spans: Vec<(f32, f32)>) -> f32 {
    spans.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal));

    let mut covered = 0.0_f32;
    // Everything below `cursor` has already been accounted for.
    let mut cursor = lo;
    for (start, end) in spans {
        let start = start.max(cursor);
        let end = end.min(hi);
        if end > start {
            covered += end - start;
            cursor = end;
        }
    }
    covered
}

/// Check if a vertical edge at a given x coordinate is present between two rows.
///
/// The edge is present if at least 80% of its length is covered by vertical
/// segments lying within 1.0 unit of the edge's x coordinate. Coverage is the
/// union of the matching segments, so a stretch that is drawn twice is not
/// counted twice. Neither the order of the two row boundaries nor the order of
/// a segment's end points matters.
///
/// An edge shorter than 0.1 units is degenerate and treated as present.
fn is_vertical_edge_present(
    grid: &super::GridCandidate,
    edge_x_idx: usize, // Index of the vertical line in col_xs
    row_start: usize,  // Starting row index (inclusive)
    row_end: usize,    // Ending row index (exclusive)
) -> bool {
    const EPSILON: f32 = 1.0;

    let x = grid.col_xs[edge_x_idx];
    let y_a = grid.row_ys[row_start];
    let y_b = grid.row_ys[row_end];
    let (lo, hi) = (y_a.min(y_b), y_a.max(y_b));

    let expected_length = hi - lo;
    if expected_length < 0.1 {
        return true; // Degenerate edge, consider present
    }

    // Vertical segments collinear with the edge, as normalized (low, high) spans.
    let spans: Vec<(f32, f32)> = grid
        .segments
        .iter()
        .filter(|segment| {
            segment.orientation == super::SegmentOrientation::Vertical
                && (segment.x0 - x).abs() <= EPSILON
        })
        .map(|segment| (segment.y0.min(segment.y1), segment.y0.max(segment.y1)))
        .collect();

    union_length_within(lo, hi, spans) / expected_length >= EDGE_PRESENCE_THRESHOLD
}

/// Check if a horizontal edge at a given y coordinate is present between two columns.
///
/// The edge is present if at least 80% of its length is covered by horizontal
/// segments lying within 1.0 unit of the edge's y coordinate. Coverage is the
/// union of the matching segments, so a stretch that is drawn twice is not
/// counted twice. Neither the order of the two column boundaries nor the order
/// of a segment's end points matters.
///
/// An edge shorter than 0.1 units is degenerate and treated as present.
fn is_horizontal_edge_present(
    grid: &super::GridCandidate,
    edge_y_idx: usize, // Index of the horizontal line in row_ys
    col_start: usize,  // Starting column index (inclusive)
    col_end: usize,    // Ending column index (exclusive)
) -> bool {
    const EPSILON: f32 = 1.0;

    let y = grid.row_ys[edge_y_idx];
    let x_a = grid.col_xs[col_start];
    let x_b = grid.col_xs[col_end];
    let (lo, hi) = (x_a.min(x_b), x_a.max(x_b));

    let expected_length = hi - lo;
    if expected_length < 0.1 {
        return true; // Degenerate edge, consider present
    }

    // Horizontal segments collinear with the edge, as normalized (low, high) spans.
    let spans: Vec<(f32, f32)> = grid
        .segments
        .iter()
        .filter(|segment| {
            segment.orientation == super::SegmentOrientation::Horizontal
                && (segment.y0 - y).abs() <= EPSILON
        })
        .map(|segment| (segment.x0.min(segment.x1), segment.x0.max(segment.x1)))
        .collect();

    union_length_within(lo, hi, spans) / expected_length >= EDGE_PRESENCE_THRESHOLD
}
/// Merge the cell at (row, col) with the cell just right of its current extent.
///
/// The survivor gains the absorbed cell's colspan, takes over its right bbox
/// coordinate and receives a copy of its content; the absorbed slot is flagged
/// in `absorbed` and a diagnostic is recorded.
///
/// Nothing happens when the survivor is missing or already absorbed, when the
/// boundary column is outside the grid or already absorbed, or when no cell
/// sits at the boundary column.
fn merge_cells_right(
    cells: &mut Vec<Cell>,
    absorbed: &mut Vec<Vec<bool>>,
    row: usize,
    col: usize,
    col_count: usize,
    diagnostics: &mut Vec<String>,
) {
    // Locate the surviving cell.
    let keeper_idx = match cells
        .iter()
        .position(|c| c.row == row && c.col == col && !absorbed[row][col])
    {
        Some(idx) => idx,
        None => return,
    };

    // First column past the survivor's current extent.
    let boundary_col = col + cells[keeper_idx].colspan as usize;
    if boundary_col >= col_count || absorbed[row][boundary_col] {
        return; // Already absorbed or out of bounds
    }

    // Locate the cell to absorb at that boundary.
    let victim_idx = match cells
        .iter()
        .position(|c| c.row == row && c.col == boundary_col && !absorbed[row][boundary_col])
    {
        Some(idx) => idx,
        None => return,
    };

    // Read everything needed from the absorbed cell before mutating the survivor.
    let victim_content = cells[victim_idx].content.clone();
    let victim_x1 = cells[victim_idx].bbox[2];
    let victim_colspan = cells[victim_idx].colspan;

    let keeper = &mut cells[keeper_idx];
    keeper.colspan += victim_colspan; // the whole span, not just one column
    keeper.bbox[2] = victim_x1; // Expand x1
    keeper.content.extend(victim_content);
    let total_colspan = keeper.colspan;

    absorbed[row][boundary_col] = true;

    diagnostics.push(format!(
        "merged_cells: cell ({},{}) colspan={} absorbed cell ({},{})",
        row, col, total_colspan, row, boundary_col
    ));
}
/// Merge the cell at (row, col) with the cell just below its current extent.
///
/// The survivor gains the absorbed cell's rowspan, takes over its `bbox[1]`
/// coordinate and receives a copy of its content; the absorbed slot is flagged
/// in `absorbed` and a diagnostic is recorded.
///
/// Nothing happens when (row, col) lies outside the grid, when the survivor is
/// missing or already absorbed, when the boundary row is outside the grid or
/// already absorbed, or when no cell sits at the boundary row.
fn merge_cells_down(
    cells: &mut Vec<Cell>,
    absorbed: &mut Vec<Vec<bool>>,
    row: usize,
    col: usize,
    col_count: usize,
    diagnostics: &mut Vec<String>,
) {
    // Reject positions outside the grid up front so the `absorbed` lookups
    // below cannot index out of range.
    if row >= absorbed.len() || col >= col_count {
        return;
    }

    // Find the surviving cell
    let survivor_idx = cells
        .iter()
        .position(|c| c.row == row && c.col == col && !absorbed[row][col]);
    let s_idx = match survivor_idx {
        Some(idx) => idx,
        None => return,
    };

    // First row past the survivor's current extent.
    let next_row = row + cells[s_idx].rowspan as usize;
    if next_row >= absorbed.len() || absorbed[next_row][col] {
        return; // Already absorbed or out of bounds
    }

    // Find the cell to absorb at the merged boundary
    let target_idx = cells
        .iter()
        .position(|c| c.row == next_row && c.col == col && !absorbed[next_row][col]);
    let t_idx = match target_idx {
        Some(idx) => idx,
        None => return,
    };

    // Read everything needed from the absorbed cell before mutating the survivor.
    let absorbed_content = cells[t_idx].content.clone();
    let absorbed_bbox_y0 = cells[t_idx].bbox[1];
    let absorbed_rowspan = cells[t_idx].rowspan;

    // Update survivor's rowspan and bbox (add the target's rowspan, not just 1)
    cells[s_idx].rowspan += absorbed_rowspan;
    cells[s_idx].bbox[1] = absorbed_bbox_y0; // Expand y0 downward

    // Transfer content from absorbed cell to survivor
    cells[s_idx].content.extend(absorbed_content);

    // Mark absorbed cell
    absorbed[next_row][col] = true;

    diagnostics.push(format!(
        "merged_cells: cell ({},{}) rowspan={} absorbed cell ({},{})",
        row, col, cells[s_idx].rowspan, next_row, col
    ));
}
/// A text span for table cell assignment.
///
/// Minimal span representation used during cell assignment.
@ -1321,4 +1621,430 @@ mod tests {
// Should count 1 header row (bold signal)
assert_eq!(count_header_rows(&cells, 2), 1);
}
// Merged cell detection tests (7.2.5)
#[test]
fn test_detect_merged_cells_borderless_table_noop() {
    // With no segments there are no edges to inspect, so detection must leave
    // the cells alone and say that it was skipped.
    let mut intersections = Vec::new();
    for &y in &[100.0, 200.0, 300.0] {
        for &x in &[50.0, 150.0, 250.0] {
            intersections.push((x, y));
        }
    }

    let mut grid = GridCandidate::from_intersections(intersections, vec![]).unwrap();
    // Make the borderless precondition explicit.
    grid.segments = vec![];

    let cells = vec![
        Cell::new([50.0, 200.0, 150.0, 300.0], 0, 0),
        Cell::new([150.0, 200.0, 250.0, 300.0], 0, 1),
        Cell::new([50.0, 100.0, 150.0, 200.0], 1, 0),
        Cell::new([150.0, 100.0, 250.0, 200.0], 1, 1),
    ];

    let (merged, diagnostics) = detect_merged_cells(cells, &grid);

    // Nothing was merged: all four cells survive, the first one untouched.
    assert_eq!(merged.len(), 4);
    assert_eq!(merged[0].colspan, 1);
    assert_eq!(merged[0].rowspan, 1);

    // The skip is reported.
    assert!(diagnostics
        .iter()
        .any(|d| d.contains("merged_cell_detection_skipped")));
}
#[test]
fn debug_test_colspan_3() {
    // Grid with 4 columns x 2 rows where the vertical lines at x=150 and x=250
    // are missing over the full height. Both rows should therefore collapse
    // their first three cells into one colspan=3 cell.
    //
    // The printed output is kept for `--nocapture` debugging; the assertions
    // below are what make this a real test.
    let mut intersections = Vec::new();
    for &y in &[300.0, 200.0, 100.0] {
        for &x in &[50.0, 150.0, 250.0, 350.0, 450.0] {
            intersections.push((x, y));
        }
    }

    let segments = vec![
        crate::table::Segment::horizontal(300.0, 50.0, 450.0),
        crate::table::Segment::horizontal(200.0, 50.0, 450.0),
        crate::table::Segment::horizontal(100.0, 50.0, 450.0),
        crate::table::Segment::vertical(50.0, 100.0, 300.0),
        crate::table::Segment::vertical(450.0, 100.0, 300.0),
        crate::table::Segment::vertical(350.0, 100.0, 300.0), // Full height
    ];

    let grid = GridCandidate::from_intersections(intersections, segments).unwrap();

    println!("Grid: {} rows x {} cols", grid.row_count(), grid.col_count());
    println!("row_ys: {:?}", grid.row_ys);
    println!("col_xs: {:?}", grid.col_xs);

    let cells = vec![
        Cell::new([50.0, 200.0, 150.0, 300.0], 0, 0),
        Cell::new([150.0, 200.0, 250.0, 300.0], 0, 1),
        Cell::new([250.0, 200.0, 350.0, 300.0], 0, 2),
        Cell::new([350.0, 200.0, 450.0, 300.0], 0, 3),
        Cell::new([50.0, 100.0, 150.0, 200.0], 1, 0),
        Cell::new([150.0, 100.0, 250.0, 200.0], 1, 1),
        Cell::new([250.0, 100.0, 350.0, 200.0], 1, 2),
        Cell::new([350.0, 100.0, 450.0, 200.0], 1, 3),
    ];

    let (merged, diagnostics) = detect_merged_cells(cells, &grid);

    println!("\nMerged cells: {}", merged.len());
    for cell in &merged {
        println!(
            "  cell ({},{}) colspan={} rowspan={}",
            cell.row, cell.col, cell.colspan, cell.rowspan
        );
    }
    println!("\nDiagnostics:");
    for d in &diagnostics {
        println!("  {}", d);
    }

    // Two cells are absorbed in each row: 8 - 4 = 4 survivors.
    assert_eq!(merged.len(), 4);
    for row in 0..2 {
        let wide = merged.iter().find(|c| c.row == row && c.col == 0).unwrap();
        assert_eq!(wide.colspan, 3);
        assert_eq!(wide.rowspan, 1);
        assert_eq!(wide.bbox[2], 350.0);

        let last = merged.iter().find(|c| c.row == row && c.col == 3).unwrap();
        assert_eq!(last.colspan, 1);
        assert_eq!(last.rowspan, 1);
    }
    assert!(diagnostics.iter().any(|d| d.contains("merged_cells")));
}
#[test]
fn test_detect_merged_cells_colspan_3_critical_test() {
// Critical test from plan: merged header cell spanning 3 columns
// Grid: 4 columns x 2 rows
// Top row has merged cell (colspan=3) and one normal cell
// The vertical edges at col_xs[1] and col_xs[2] are missing in row 0
let mut intersections = Vec::new();
for &y in &[300.0, 200.0, 100.0] {
for &x in &[50.0, 150.0, 250.0, 350.0, 450.0] {
intersections.push((x, y));
}
}
// Create segments: all grid edges EXCEPT the vertical edges at x=150 and x=250 in row 0
// This creates a merged cell from col 0 to col 2 (colspan=3) in row 0 only
let segments = vec![
// Horizontal edges (all present)
crate::table::Segment::horizontal(300.0, 50.0, 450.0), // Top edge
crate::table::Segment::horizontal(200.0, 50.0, 450.0), // Middle edge
crate::table::Segment::horizontal(100.0, 50.0, 450.0), // Bottom edge
// Vertical edges
crate::table::Segment::vertical(50.0, 100.0, 300.0), // Left edge (full height)
crate::table::Segment::vertical(450.0, 100.0, 300.0), // Right edge (full height)
crate::table::Segment::vertical(350.0, 100.0, 300.0), // Edge between cols 2-3 (full height)
crate::table::Segment::vertical(150.0, 100.0, 200.0), // Edge between cols 0-1 (row 1 only)
crate::table::Segment::vertical(250.0, 100.0, 200.0), // Edge between cols 1-2 (row 1 only)
// MISSING: vertical edges at x=150 and x=250 in row 0 (creates merged cell in row 0)
];
let grid = GridCandidate::from_intersections(intersections, segments).unwrap();
// One cell per grid slot: row 0 is the top band (y 200..300), row 1 the bottom band (y 100..200)
let cells = vec![
Cell::new([50.0, 200.0, 150.0, 300.0], 0, 0),
Cell::new([150.0, 200.0, 250.0, 300.0], 0, 1),
Cell::new([250.0, 200.0, 350.0, 300.0], 0, 2),
Cell::new([350.0, 200.0, 450.0, 300.0], 0, 3),
Cell::new([50.0, 100.0, 150.0, 200.0], 1, 0),
Cell::new([150.0, 100.0, 250.0, 200.0], 1, 1),
Cell::new([250.0, 100.0, 350.0, 200.0], 1, 2),
Cell::new([350.0, 100.0, 450.0, 200.0], 1, 3),
];
let (merged, diagnostics) = detect_merged_cells(cells, &grid);
// Should have 6 cells: of the 8 inputs, 2 are absorbed in the top row
// (cells (0,1) and (0,2) fold into (0,0))
assert_eq!(merged.len(), 6);
// Find the merged cell
let merged_cell = merged.iter().find(|c| c.row == 0 && c.col == 0).unwrap();
assert_eq!(merged_cell.colspan, 3);
assert_eq!(merged_cell.rowspan, 1);
assert_eq!(merged_cell.bbox[2], 350.0); // x1 expanded to cover absorbed cells
// Other cells should be normal
let cell_r0c3 = merged.iter().find(|c| c.row == 0 && c.col == 3).unwrap();
assert_eq!(cell_r0c3.colspan, 1);
// Should have diagnostic messages about merges
assert!(diagnostics.iter().any(|d| d.contains("merged_cells")));
}
#[test]
fn test_detect_merged_cells_pure_rowspan() {
    use crate::table::Segment;

    // Vertical-only merge: a 2-row x 3-column grid in which the middle
    // horizontal rule (y=200) does not extend over column 0, so the two
    // cells of the left column form one cell with rowspan=2.
    let line_xs = [50.0, 150.0, 250.0, 350.0];
    let intersections: Vec<_> = [300.0, 200.0, 100.0]
        .iter()
        .flat_map(|&y| line_xs.iter().map(move |&x| (x, y)))
        .collect();

    let segments = vec![
        // Horizontal rules; the middle one starts at x=150, skipping column 0.
        Segment::horizontal(300.0, 50.0, 350.0),
        Segment::horizontal(200.0, 150.0, 350.0),
        Segment::horizontal(100.0, 50.0, 350.0),
        // Every vertical rule runs the full table height.
        Segment::vertical(50.0, 100.0, 300.0),
        Segment::vertical(150.0, 100.0, 300.0),
        Segment::vertical(250.0, 100.0, 300.0),
        Segment::vertical(350.0, 100.0, 300.0),
    ];
    let grid = GridCandidate::from_intersections(intersections, segments).unwrap();

    // Unit cells in row-major order (row 0 is the upper band).
    let col_edges = [50.0, 150.0, 250.0, 350.0];
    let row_bands = [(200.0, 300.0), (100.0, 200.0)];
    let mut cells = Vec::with_capacity(6);
    for (row, &(y_lo, y_hi)) in row_bands.iter().enumerate() {
        for (col, edge) in col_edges.windows(2).enumerate() {
            cells.push(Cell::new([edge[0], y_lo, edge[1], y_hi], row, col));
        }
    }

    let (merged, _diagnostics) = detect_merged_cells(cells, &grid);

    // Six inputs, one absorbed into the cell above it.
    assert_eq!(merged.len(), 5);

    let tall = merged.iter().find(|c| c.row == 0 && c.col == 0).unwrap();
    assert_eq!(tall.rowspan, 2);
    assert_eq!(tall.colspan, 1);
    assert_eq!(tall.bbox[1], 100.0); // lower edge pulled down to y=100
}
#[test]
fn test_detect_merged_cells_diagonal_merge() {
    use crate::table::Segment;

    // Combined merge (rowspan=2 and colspan=2) anchored at the top-left of a
    // 2-row x 3-column grid.
    //   Row 0 covers y 200..300, row 1 covers y 100..200.
    //   Col 0 covers x 50..150, col 1 x 150..250, col 2 x 250..350.
    let line_xs = [50.0, 150.0, 250.0, 350.0];
    let intersections: Vec<_> = [300.0, 200.0, 100.0]
        .iter()
        .flat_map(|&y| line_xs.iter().map(move |&x| (x, y)))
        .collect();

    let segments = vec![
        // Horizontal rules: the middle one (y=200) is drawn over column 2 only.
        Segment::horizontal(300.0, 50.0, 350.0),
        Segment::horizontal(200.0, 250.0, 350.0),
        Segment::horizontal(100.0, 50.0, 350.0),
        // Vertical rules: there is no rule at x=150 at all, and the rule at
        // x=250 is drawn for y 200..300 only.
        // NOTE(review): the previous comment called the x=250 rule "missing in
        // rows 0-1", but as drawn it bounds row 0 and is absent in row 1;
        // confirm the fixture matches the intended 2x2 merge.
        Segment::vertical(50.0, 100.0, 300.0),
        Segment::vertical(250.0, 200.0, 300.0),
        Segment::vertical(350.0, 100.0, 300.0),
    ];
    let grid = GridCandidate::from_intersections(intersections, segments).unwrap();

    // Unit cells in row-major order (row 0 is the upper band).
    let col_edges = [50.0, 150.0, 250.0, 350.0];
    let row_bands = [(200.0, 300.0), (100.0, 200.0)];
    let mut cells = Vec::with_capacity(6);
    for (row, &(y_lo, y_hi)) in row_bands.iter().enumerate() {
        for (col, edge) in col_edges.windows(2).enumerate() {
            cells.push(Cell::new([edge[0], y_lo, edge[1], y_hi], row, col));
        }
    }

    let (merged, _diagnostics) = detect_merged_cells(cells, &grid);

    // Expected survivors: the 2x2 block anchored at (0,0), which takes in
    // (0,1), (1,0) and (1,1), plus the two untouched cells of column 2.
    assert_eq!(merged.len(), 3);

    let block = merged.iter().find(|c| c.row == 0 && c.col == 0).unwrap();
    assert_eq!(block.rowspan, 2);
    assert_eq!(block.colspan, 2);
    assert_eq!(block.bbox[1], 100.0); // lower edge extended to y=100
    assert_eq!(block.bbox[2], 250.0); // right edge extended to x=250
}
#[test]
fn test_detect_merged_cells_no_merges_complete_grid() {
    use crate::table::Segment;

    // Control case: every rule of a 2-row x 3-column grid is drawn, so the
    // detector must leave all cells alone.
    let line_xs = [50.0, 150.0, 250.0, 350.0];
    let intersections: Vec<_> = [300.0, 200.0, 100.0]
        .iter()
        .flat_map(|&y| line_xs.iter().map(move |&x| (x, y)))
        .collect();

    let segments = vec![
        // Three full-width horizontal rules.
        Segment::horizontal(300.0, 50.0, 350.0),
        Segment::horizontal(200.0, 50.0, 350.0),
        Segment::horizontal(100.0, 50.0, 350.0),
        // Four full-height vertical rules.
        Segment::vertical(50.0, 100.0, 300.0),
        Segment::vertical(150.0, 100.0, 300.0),
        Segment::vertical(250.0, 100.0, 300.0),
        Segment::vertical(350.0, 100.0, 300.0),
    ];
    let grid = GridCandidate::from_intersections(intersections, segments).unwrap();

    // Unit cells in row-major order (row 0 is the upper band).
    let col_edges = [50.0, 150.0, 250.0, 350.0];
    let row_bands = [(200.0, 300.0), (100.0, 200.0)];
    let mut cells = Vec::with_capacity(6);
    for (row, &(y_lo, y_hi)) in row_bands.iter().enumerate() {
        for (col, edge) in col_edges.windows(2).enumerate() {
            cells.push(Cell::new([edge[0], y_lo, edge[1], y_hi], row, col));
        }
    }

    let (merged, diagnostics) = detect_merged_cells(cells, &grid);

    // Nothing absorbed, and every cell keeps a 1x1 span.
    assert_eq!(merged.len(), 6);
    assert!(merged.iter().all(|c| c.rowspan == 1 && c.colspan == 1));

    // With no merges there must be no merge diagnostic either.
    assert!(diagnostics.iter().all(|d| !d.contains("merged_cells")));
}
#[test]
fn test_is_vertical_edge_present_full_coverage() {
    use crate::table::Segment;

    // A vertical rule that spans the whole grid height must count as present.
    let line_xs = [50.0, 150.0, 250.0];
    let intersections: Vec<_> = [300.0, 200.0, 100.0]
        .iter()
        .flat_map(|&y| line_xs.iter().map(move |&x| (x, y)))
        .collect();

    // The only segment: x=150 drawn from y=100 up to y=300.
    let rule = Segment::vertical(150.0, 100.0, 300.0);
    let grid = GridCandidate::from_intersections(intersections, vec![rule]).unwrap();

    // The queried edge on x=150 is completely covered by the rule.
    let present = is_vertical_edge_present(&grid, 1, 0, 1);
    assert!(present);
}
#[test]
fn test_is_vertical_edge_present_partial_coverage_below_threshold() {
    use crate::table::Segment;

    // A rule covering well under 80% of an edge must be reported as absent.
    let line_xs = [50.0, 150.0, 250.0];
    let intersections: Vec<_> = [300.0, 200.0, 100.0]
        .iter()
        .flat_map(|&y| line_xs.iter().map(move |&x| (x, y)))
        .collect();

    // The rule on x=150 runs from y=200 to y=250: 50 pt of the 100 pt edge
    // between the grid lines at y=200 and y=300.
    let stub = Segment::vertical(150.0, 200.0, 250.0);
    let grid = GridCandidate::from_intersections(intersections, vec![stub]).unwrap();

    // 50% coverage is below the 80% threshold, so the edge is not present.
    let present = is_vertical_edge_present(&grid, 1, 0, 1);
    assert!(!present);
}
#[test]
fn test_is_horizontal_edge_present_full_coverage() {
    use crate::table::Segment;

    // A horizontal rule that spans the whole grid width must count as present.
    let line_xs = [50.0, 150.0, 250.0];
    let intersections: Vec<_> = [300.0, 200.0, 100.0]
        .iter()
        .flat_map(|&y| line_xs.iter().map(move |&x| (x, y)))
        .collect();

    // The only segment: y=200 drawn from x=50 to x=250.
    let rule = Segment::horizontal(200.0, 50.0, 250.0);
    let grid = GridCandidate::from_intersections(intersections, vec![rule]).unwrap();

    // The queried edge on y=200 is completely covered by the rule.
    let present = is_horizontal_edge_present(&grid, 1, 0, 1);
    assert!(present);
}
#[test]
fn test_is_horizontal_edge_present_partial_coverage_above_threshold() {
    // A rule that covers more than 80% (but not all) of an edge must still be
    // reported as present.
    let mut intersections = Vec::new();
    for &y in &[300.0, 200.0, 100.0] {
        for &x in &[50.0, 150.0, 250.0] {
            intersections.push((x, y));
        }
    }
    // The queried edge lies on y=200 between the grid lines at x=50 and x=150
    // (100 pt), assuming the index arguments mirror the vertical-edge tests.
    // The rule runs from x=50 to x=135, i.e. 85 pt = 85% of that edge.
    // The fixture previously ended at x=185, a 135 pt segment that overran the
    // whole edge, so the partial-coverage path was never exercised.
    let segments = vec![
        crate::table::Segment::horizontal(200.0, 50.0, 135.0),
    ];
    let grid = GridCandidate::from_intersections(intersections, segments).unwrap();
    // 85% >= 80% threshold, so the edge counts as present.
    assert!(is_horizontal_edge_present(&grid, 1, 0, 1));
}
#[test]
fn test_detect_merged_cells_transitive_merge() {
    use crate::table::Segment;

    // Chained merges: with no interior vertical rule at all, the first cell of
    // each row must take in its neighbour, then the next, and so on until the
    // whole row is one cell. Fixture: 2 rows x 4 columns.
    let line_xs = [50.0, 150.0, 250.0, 350.0, 450.0];
    let intersections: Vec<_> = [300.0, 200.0, 100.0]
        .iter()
        .flat_map(|&y| line_xs.iter().map(move |&x| (x, y)))
        .collect();

    let segments = vec![
        // All three horizontal rules are complete.
        Segment::horizontal(300.0, 50.0, 450.0),
        Segment::horizontal(200.0, 50.0, 450.0),
        Segment::horizontal(100.0, 50.0, 450.0),
        // Only the outer borders are drawn; x=150, 250 and 350 have no rule.
        Segment::vertical(50.0, 100.0, 300.0),
        Segment::vertical(450.0, 100.0, 300.0),
    ];
    let grid = GridCandidate::from_intersections(intersections, segments).unwrap();

    // Unit cells in row-major order (row 0 is the upper band).
    let col_edges = [50.0, 150.0, 250.0, 350.0, 450.0];
    let row_bands = [(200.0, 300.0), (100.0, 200.0)];
    let mut cells = Vec::with_capacity(8);
    for (row, &(y_lo, y_hi)) in row_bands.iter().enumerate() {
        for (col, edge) in col_edges.windows(2).enumerate() {
            cells.push(Cell::new([edge[0], y_lo, edge[1], y_hi], row, col));
        }
    }

    let (merged, _diagnostics) = detect_merged_cells(cells, &grid);

    // Three cells absorbed per row leaves one full-width cell in each row.
    assert_eq!(merged.len(), 2);

    let upper = merged.iter().find(|c| c.row == 0 && c.col == 0).unwrap();
    assert_eq!(upper.colspan, 4);
    assert_eq!(upper.bbox[2], 450.0); // right edge reaches the table border

    let lower = merged.iter().find(|c| c.row == 1 && c.col == 0).unwrap();
    assert_eq!(lower.colspan, 4);
    assert_eq!(lower.bbox[2], 450.0);
}
}

View file

@ -104,6 +104,36 @@ impl TableDetector {
self.build_grids(intersections, segments)
}
/// Run every table-detection pipeline on a page and return the combined result.
///
/// This is the unified entry point for table detection (7.2 coordinator). It
/// calls the two pipelines in a fixed order and concatenates their output:
/// 1. `detect_line_based` for tables drawn with ruling lines
/// 2. `detect_borderless` for tables without ruling lines (x0 alignment heuristic)
///
/// No de-duplication is performed: a region reported by both pipelines appears
/// twice in the returned vector.
///
/// # Arguments
///
/// * `ctx` - The page context handed unchanged to both pipelines
///
/// # Returns
///
/// All grid candidates, line-based results first, borderless results after.
pub fn detect(&self, ctx: &PageContext) -> Vec<GridCandidate> {
    // Primary pipeline: bordered tables.
    let bordered = self.detect_line_based(ctx);
    // Secondary pipeline: borderless tables. Regions already claimed by the
    // line-based pass are not filtered out here, so duplicates are possible.
    let unruled = self.detect_borderless(ctx);
    bordered.into_iter().chain(unruled).collect()
}
/// Detect borderless tables using x0 alignment heuristic.
///
/// This method analyzes text positioning to find tables without ruling lines:

View file

@ -21,11 +21,16 @@ mod detector;
mod segment;
mod grid;
mod cell;
mod output;
pub use detector::TableDetector;
pub use segment::{Segment, SegmentOrientation};
pub use grid::GridCandidate;
pub use cell::{Cell, TableSpan, detect_merged_cells};
pub use output::{grid_to_table_json, detect_two_page_tables};
// Re-export cell types for use in extract module
pub use cell::Cell as TableCell;
use crate::parser::pages::PageDict;

View file

@ -0,0 +1,481 @@
//! Table JSON output conversion (7.2.6).
//!
//! This module handles the conversion from detected table structures
//! (GridCandidate, Cell) to the JSON output format (TableJson, RowJson, CellJson).
use crate::schema::{TableJson, RowJson, CellJson};
use crate::table::{GridCandidate, Cell};
use crate::table::cell::TableSpan;
use anyhow::Result;
/// Distance from page edge to consider a table as "continued" (50 pt).
///
/// Used by `detect_two_page_tables` for both checks: a table's lower edge must
/// be at most this far above y = 0, and the following table's upper edge must
/// be at most this far below the next page's height.
const CONTINUED_THRESHOLD: f32 = 50.0;
/// Maximum RMSE for column alignment similarity (5 pt).
///
/// `columns_similar` accepts two grids only when the root-mean-square
/// difference of their paired column x-positions is strictly below this value.
const COLUMN_SIMILARITY_RMSE: f32 = 5.0;
/// Assemble the serializable `TableJson` for one detected table.
///
/// # Arguments
///
/// * `grid` - Grid candidate supplying the table bounding box and row count
/// * `cells` - Cells of the table, already populated with content and header flags
/// * `page_index` - Index of the page the table was found on
/// * `detection_method` - Label copied into the output ("line_based" or "borderless")
/// * `continued` - Whether the table continues on the following page
/// * `continued_from_prev` - Whether the table continues one from the preceding page
///
/// # Returns
///
/// A `TableJson` whose `header_rows` is the number of distinct row indices that
/// contain at least one cell flagged `is_header_row`.
///
/// NOTE(review): the id is derived from `page_index` alone, so two tables on the
/// same page receive the same id — confirm whether callers rely on uniqueness.
pub fn grid_to_table_json(
    grid: &GridCandidate,
    cells: &[Cell],
    page_index: usize,
    detection_method: &str,
    continued: bool,
    continued_from_prev: bool,
) -> TableJson {
    // Collect the distinct row indices that hold a header-flagged cell; the
    // flags themselves are expected to have been set before this call.
    let mut header_row_ids = std::collections::HashSet::new();
    for cell in cells {
        if cell.is_header_row {
            header_row_ids.insert(cell.row);
        }
    }

    // Widen each grid bbox component to the f64 used by the JSON schema.
    let edge = |i: usize| grid.bbox[i] as f64;

    TableJson {
        id: format!("table_{}", page_index),
        bbox: [edge(0), edge(1), edge(2), edge(3)],
        rows: build_rows_from_cells(cells, grid),
        header_rows: header_row_ids.len() as u32,
        detection_method: detection_method.to_string(),
        continued,
        continued_from_prev,
        page_index,
    }
}
/// Group cells into `RowJson` entries, one per populated grid row.
///
/// Rows are emitted in ascending row index from 0 to `grid.row_count() - 1`.
/// A row index with no cells produces no entry, and cells whose row index is
/// outside that range are not emitted. Within a row, cells are ordered by
/// column index. A row is marked as a header only when every one of its cells
/// is flagged `is_header_row`.
fn build_rows_from_cells(cells: &[Cell], grid: &GridCandidate) -> Vec<RowJson> {
    // Bucket the cells by their row index.
    let mut by_row: std::collections::HashMap<usize, Vec<&Cell>> =
        std::collections::HashMap::new();
    for cell in cells {
        by_row.entry(cell.row).or_default().push(cell);
    }

    // Walk the grid rows in order, skipping indices that received no cell.
    (0..grid.row_count())
        .filter_map(|row_idx| by_row.get(&row_idx))
        .map(|members| {
            let mut cells_json: Vec<CellJson> = members
                .iter()
                .map(|c| cell_to_cell_json(c, grid))
                .collect();
            // Stable sort, so cells sharing a column keep their input order.
            cells_json.sort_by_key(|c| c.col);

            let bbox = compute_row_bbox(&cells_json);
            let is_header =
                !cells_json.is_empty() && cells_json.iter().all(|c| c.is_header_row);

            RowJson {
                bbox,
                cells: cells_json,
                is_header,
            }
        })
        .collect()
}
/// Produce the `CellJson` form of a single table cell.
///
/// The cell text is the text of each content span, in stored order, separated
/// by single spaces. The `spans` index list is always left empty because
/// page-level span indices are not available here. `_grid` is accepted for
/// signature symmetry and is not read.
fn cell_to_cell_json(cell: &Cell, _grid: &GridCandidate) -> CellJson {
    // Join the span texts with a single space between neighbours.
    let mut text = String::new();
    for (i, span) in cell.content.iter().enumerate() {
        if i > 0 {
            text.push(' ');
        }
        text.push_str(span.text.as_str());
    }

    // Widen each bbox component to the f64 used by the JSON schema.
    let edge = |i: usize| cell.bbox[i] as f64;

    CellJson {
        bbox: [edge(0), edge(1), edge(2), edge(3)],
        text,
        // Span references are not resolved at this stage.
        spans: Vec::new(),
        row: cell.row,
        col: cell.col,
        rowspan: cell.rowspan,
        colspan: cell.colspan,
        is_header_row: cell.is_header_row,
    }
}
/// Return the smallest box `[x0, y0, x1, y1]` enclosing every cell of a row.
///
/// An empty slice yields the all-zero box.
fn compute_row_bbox(cells: &[CellJson]) -> [f64; 4] {
    match cells.split_first() {
        None => [0.0, 0.0, 0.0, 0.0],
        // Seed with the first cell, then widen with each remaining one:
        // minima for the lower-left corner, maxima for the upper-right.
        Some((first, rest)) => rest.iter().fold(first.bbox, |[x0, y0, x1, y1], cell| {
            [
                x0.min(cell.bbox[0]),
                y0.min(cell.bbox[1]),
                x1.max(cell.bbox[2]),
                y1.max(cell.bbox[3]),
            ]
        }),
    }
}
/// Detect two-page table continuation between adjacent pages.
///
/// This function examines tables on adjacent pages and determines if they
/// represent a single table split across pages.
///
/// # Algorithm
///
/// For each pair of tables on page N and page N+1:
/// 1. Check that the table on page N has its lower edge (`bbox[1]`) within
///    CONTINUED_THRESHOLD (50 pt) of y = 0, which is treated as the page bottom
/// 2. Check that the table on page N+1 has its upper edge (`bbox[3]`) within
///    CONTINUED_THRESHOLD (50 pt) of that page's height
/// 3. Verify both tables have the same column count
/// 4. Verify column x-positions are similar (RMSE < COLUMN_SIMILARITY_RMSE)
///
/// If all conditions are met, set:
/// - page N table: `continued = true`
/// - page N+1 table: `continued_from_prev = true`
///
/// Matching is not exclusive: one table may be paired with several tables on
/// the neighbouring page, and every pairing sets both flags.
///
/// # Arguments
///
/// * `all_tables` - Tables for all pages, indexed by page_index
/// * `page_heights` - Page heights in points, indexed by page_index; a missing
///   entry falls back to 792.0
///
/// # Returns
///
/// One inner vector per page, parallel to `all_tables`, holding a
/// `(continued, continued_from_prev)` pair for each table on that page.
pub fn detect_two_page_tables(
    all_tables: &[Vec<GridCandidate>],
    page_heights: &[f64],
) -> Vec<Vec<(bool, bool)>> {
    /// Height assumed for a page that has no entry in `page_heights`.
    const DEFAULT_PAGE_HEIGHT: f64 = 792.0;

    let threshold = f64::from(CONTINUED_THRESHOLD);

    // Start with every flag cleared, mirroring the shape of `all_tables`.
    let mut results: Vec<Vec<(bool, bool)>> = all_tables
        .iter()
        .map(|page_tables| vec![(false, false); page_tables.len()])
        .collect();

    // Each window is (page N, page N+1); fewer than two pages yields no windows.
    for (page_idx, pair) in all_tables.windows(2).enumerate() {
        let (current_tables, next_tables) = (&pair[0], &pair[1]);

        // Only the next page's height matters: the current page is measured
        // against y = 0.
        let next_page_height = page_heights
            .get(page_idx + 1)
            .copied()
            .unwrap_or(DEFAULT_PAGE_HEIGHT);
        let page_top = next_page_height - threshold;

        for (table_idx, current_table) in current_tables.iter().enumerate() {
            // The table must reach down to the bottom margin of its page.
            let is_near_bottom = current_table.bbox[1] as f64 <= threshold;
            if !is_near_bottom {
                continue;
            }

            for (next_table_idx, next_table) in next_tables.iter().enumerate() {
                // The candidate must reach up to the top margin of the next page.
                let is_near_top = next_table.bbox[3] as f64 >= page_top;
                if !is_near_top {
                    continue;
                }

                // Same number of columns, at matching x-positions.
                if current_table.col_count() != next_table.col_count() {
                    continue;
                }
                if columns_similar(current_table, next_table) {
                    results[page_idx][table_idx].0 = true; // continued
                    results[page_idx + 1][next_table_idx].1 = true; // continued_from_prev
                }
            }
        }
    }

    results
}
/// Decide whether two grids share (approximately) the same column layout.
///
/// Grids with a different number of column positions are never similar.
/// Otherwise the column x-positions are paired in order and the grids are
/// similar when the root-mean-square of the pairwise differences is strictly
/// below `COLUMN_SIMILARITY_RMSE`.
fn columns_similar(grid1: &GridCandidate, grid2: &GridCandidate) -> bool {
    let (left, right) = (&grid1.col_xs, &grid2.col_xs);
    if left.len() != right.len() {
        return false;
    }

    // Accumulate the squared differences of the paired positions.
    let mut squared_error: f32 = 0.0;
    for (a, b) in left.iter().zip(right.iter()) {
        squared_error += (a - b).powi(2);
    }

    // Mean, then root. With no columns this divides by zero, giving NaN,
    // and the comparison below is then false.
    let rmse = (squared_error / left.len() as f32).sqrt();
    rmse < COLUMN_SIMILARITY_RMSE
}
#[cfg(test)]
mod tests {
    //! Unit tests for table JSON conversion and two-page continuation detection.
    //!
    //! The continuation fixtures rely on the convention used by
    //! `detect_two_page_tables`: y = 0 is the page bottom, `bbox[1]` is a
    //! table's lower edge and `bbox[3]` its upper edge.
    use super::*;

    #[test]
    fn test_grid_to_table_json_basic() {
        // Two x-positions and three y-positions: a grid of 1 column x 2 rows.
        let intersections = vec![
            (50.0, 100.0), (150.0, 100.0),
            (50.0, 200.0), (150.0, 200.0),
            (50.0, 300.0), (150.0, 300.0),
        ];
        let grid = GridCandidate::from_intersections(intersections, vec![]).unwrap();

        // Two cells, both in row 0. The second lies to the right of the grid;
        // rows are grouped by row index only, so it is still emitted.
        let cells = vec![
            Cell::new([50.0, 200.0, 150.0, 300.0], 0, 0),
            Cell::new([150.0, 200.0, 250.0, 300.0], 0, 1),
        ];

        let table_json = grid_to_table_json(&grid, &cells, 0, "line_based", false, false);

        assert_eq!(table_json.id, "table_0");
        assert_eq!(table_json.page_index, 0);
        assert_eq!(table_json.detection_method, "line_based");
        assert!(!table_json.continued);
        assert!(!table_json.continued_from_prev);
        // Only row 0 has cells, so exactly one row is produced.
        assert_eq!(table_json.rows.len(), 1);
    }

    #[test]
    fn test_build_rows_from_cells() {
        let grid = GridCandidate::from_intersections(vec![
            (50.0, 100.0), (150.0, 100.0),
            (50.0, 200.0), (150.0, 200.0),
            (50.0, 300.0), (150.0, 300.0),
        ], vec![]).unwrap();

        let mut cell1 = Cell::new([50.0, 200.0, 150.0, 300.0], 0, 0);
        cell1.content = vec![
            TableSpan::new([50.0, 210.0, 90.0, 220.0], "Row1Col1".to_string(), "Helvetica".to_string())
        ];

        let mut cell2 = Cell::new([150.0, 200.0, 250.0, 300.0], 0, 1);
        cell2.content = vec![
            TableSpan::new([160.0, 210.0, 190.0, 220.0], "Row1Col2".to_string(), "Helvetica".to_string())
        ];

        let rows = build_rows_from_cells(&[cell1, cell2], &grid);

        // Both cells share row 0 and come back ordered by column.
        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0].cells.len(), 2);
        assert_eq!(rows[0].cells[0].text, "Row1Col1");
        assert_eq!(rows[0].cells[1].text, "Row1Col2");
    }

    #[test]
    fn test_columns_similar_identical() {
        let grid1 = GridCandidate::from_intersections(vec![
            (50.0, 100.0), (150.0, 100.0), (250.0, 100.0),
            (50.0, 200.0), (150.0, 200.0), (250.0, 200.0),
        ], vec![]).unwrap();

        let grid2 = GridCandidate::from_intersections(vec![
            (50.0, 100.0), (150.0, 100.0), (250.0, 100.0),
            (50.0, 200.0), (150.0, 200.0), (250.0, 200.0),
        ], vec![]).unwrap();

        // Identical column positions give an RMSE of 0.
        assert!(columns_similar(&grid1, &grid2));
    }

    #[test]
    fn test_columns_similar_small_difference() {
        let grid1 = GridCandidate::from_intersections(vec![
            (50.0, 100.0), (150.0, 100.0), (250.0, 100.0),
            (50.0, 200.0), (150.0, 200.0), (250.0, 200.0),
        ], vec![]).unwrap();

        // Every column shifted right by 2 pt.
        let grid2 = GridCandidate::from_intersections(vec![
            (52.0, 100.0), (152.0, 100.0), (252.0, 100.0),
            (52.0, 200.0), (152.0, 200.0), (252.0, 200.0),
        ], vec![]).unwrap();

        // RMSE = 2.0 < 5.0, should be similar
        assert!(columns_similar(&grid1, &grid2));
    }

    #[test]
    fn test_columns_similar_large_difference() {
        let grid1 = GridCandidate::from_intersections(vec![
            (50.0, 100.0), (150.0, 100.0), (250.0, 100.0),
            (50.0, 200.0), (150.0, 200.0), (250.0, 200.0),
        ], vec![]).unwrap();

        // Every column shifted right by 10 pt.
        let grid2 = GridCandidate::from_intersections(vec![
            (60.0, 100.0), (160.0, 100.0), (260.0, 100.0),
            (60.0, 200.0), (160.0, 200.0), (260.0, 200.0),
        ], vec![]).unwrap();

        // RMSE = 10.0 > 5.0, should NOT be similar
        assert!(!columns_similar(&grid1, &grid2));
    }

    #[test]
    fn test_columns_similar_different_count() {
        let grid1 = GridCandidate::from_intersections(vec![
            (50.0, 100.0), (150.0, 100.0), (250.0, 100.0),
            (50.0, 200.0), (150.0, 200.0), (250.0, 200.0),
        ], vec![]).unwrap();

        let grid2 = GridCandidate::from_intersections(vec![
            (50.0, 100.0), (150.0, 100.0),
            (50.0, 200.0), (150.0, 200.0),
        ], vec![]).unwrap();

        // Three column positions versus two: rejected before any RMSE is computed.
        assert!(!columns_similar(&grid1, &grid2));
    }

    #[test]
    fn test_detect_two_page_tables_basic() {
        // Page 0: lower edge at y=40, within 50 pt of the page bottom (y=0).
        let grid0 = GridCandidate::from_intersections(vec![
            (50.0, 40.0), (150.0, 40.0),
            (50.0, 100.0), (150.0, 100.0),
            (50.0, 150.0), (150.0, 150.0),
        ], vec![]).unwrap();

        // Page 1: upper edge at y=750, within 50 pt of the page top (792).
        // The fixture is kept inside the page; it previously extended to y=850.
        let grid1 = GridCandidate::from_intersections(vec![
            (50.0, 650.0), (150.0, 650.0),
            (50.0, 700.0), (150.0, 700.0),
            (50.0, 750.0), (150.0, 750.0),
        ], vec![]).unwrap();

        let all_tables = vec![vec![grid0], vec![grid1]];
        let page_heights = vec![792.0, 792.0];

        let results = detect_two_page_tables(&all_tables, &page_heights);

        // Page 0 table should be marked as continued
        assert!(results[0][0].0); // continued = true
        // Page 1 table should be marked as continued_from_prev
        assert!(results[1][0].1); // continued_from_prev = true
    }

    #[test]
    fn test_detect_two_page_tables_no_continuation() {
        // Page 0: lower edge at y=200, NOT within 50 pt of the page bottom.
        let grid0 = GridCandidate::from_intersections(vec![
            (50.0, 200.0), (150.0, 200.0),
            (50.0, 300.0), (150.0, 300.0),
        ], vec![]).unwrap();

        // Page 1: upper edge at y=700, NOT within 50 pt of the page top (792).
        // The fixture previously reached y=800, which is near the top and
        // contradicted this description.
        let grid1 = GridCandidate::from_intersections(vec![
            (50.0, 600.0), (150.0, 600.0),
            (50.0, 700.0), (150.0, 700.0),
        ], vec![]).unwrap();

        let all_tables = vec![vec![grid0], vec![grid1]];
        let page_heights = vec![792.0, 792.0];

        let results = detect_two_page_tables(&all_tables, &page_heights);

        // Neither table should be marked as continuation
        assert!(!results[0][0].0); // continued = false
        assert!(!results[1][0].1); // continued_from_prev = false
    }

    #[test]
    fn test_detect_two_page_tables_next_table_not_near_top() {
        // Page 0: lower edge at y=40, near the page bottom.
        let grid0 = GridCandidate::from_intersections(vec![
            (50.0, 40.0), (150.0, 40.0),
            (50.0, 100.0), (150.0, 100.0),
        ], vec![]).unwrap();

        // Page 1: same columns, but the upper edge at y=600 is far below the
        // page top (792), so the top-of-page check alone must reject the pair.
        let grid1 = GridCandidate::from_intersections(vec![
            (50.0, 500.0), (150.0, 500.0),
            (50.0, 600.0), (150.0, 600.0),
        ], vec![]).unwrap();

        let all_tables = vec![vec![grid0], vec![grid1]];
        let page_heights = vec![792.0, 792.0];

        let results = detect_two_page_tables(&all_tables, &page_heights);

        assert!(!results[0][0].0);
        assert!(!results[1][0].1);
    }

    #[test]
    fn test_detect_two_page_tables_different_column_count() {
        // Page 0: 2-column table ending near page bottom
        let grid0 = GridCandidate::from_intersections(vec![
            (50.0, 40.0), (150.0, 40.0), (250.0, 40.0),
            (50.0, 100.0), (150.0, 100.0), (250.0, 100.0),
        ], vec![]).unwrap();

        // Page 1: 3-column table starting near page top
        let grid1 = GridCandidate::from_intersections(vec![
            (50.0, 750.0), (150.0, 750.0), (250.0, 750.0), (350.0, 750.0),
            (50.0, 800.0), (150.0, 800.0), (250.0, 800.0), (350.0, 800.0),
        ], vec![]).unwrap();

        let all_tables = vec![vec![grid0], vec![grid1]];
        let page_heights = vec![792.0, 792.0];

        let results = detect_two_page_tables(&all_tables, &page_heights);

        // Different column counts, should not be marked as continuation
        assert!(!results[0][0].0);
        assert!(!results[1][0].1);
    }

    #[test]
    fn test_cell_to_cell_json_text_concatenation() {
        let grid = GridCandidate::from_intersections(vec![
            (50.0, 100.0), (150.0, 100.0),
            (50.0, 200.0), (150.0, 200.0),
        ], vec![]).unwrap();

        let mut cell = Cell::new([50.0, 100.0, 150.0, 200.0], 0, 0);
        cell.content = vec![
            TableSpan::new([50.0, 150.0, 90.0, 160.0], "Hello".to_string(), "Helvetica".to_string()),
            TableSpan::new([50.0, 140.0, 90.0, 150.0], "World".to_string(), "Helvetica".to_string()),
        ];

        let cell_json = cell_to_cell_json(&cell, &grid);

        // Span texts are joined in stored order with a single space.
        assert_eq!(cell_json.text, "Hello World");
    }
}

View file

@ -0,0 +1,203 @@
# OCR Language Pack Distribution Strategy
**Status:** RESOLVED (OQ-04)
**Date:** 2026-05-23
**Bead:** pdftract-32x4
## Open Question OQ-04
> How are OCR language packs distributed? Bundled in the Docker image (size cost), downloaded on first use (network dependency), or required as an out-of-band install?
## Resolution Decision
Language packs are **bundled in Docker images** with a tiered distribution strategy:
| Docker Image Tag | Language Packs | Size | Use Case |
|------------------|----------------|------|----------|
| `pdftract:default` | None (OCR disabled) | ~4 MB | Vector-only extraction, no OCR capability |
| `pdftract:ocr` | eng + 12 common langs | ~150 MB | Standard OCR use case; covers ~80% of the world population by native speaker count |
| `pdftract:full` | All 100+ languages | ~600 MB | Air-gapped deployments, comprehensive coverage |
## Rationale
### Why bundling?
1. **Air-gapped compatibility:** Bundling ensures OCR works in offline/air-gapped environments, with no network access needed to download packs on first use
2. **Reproducibility:** Fixed language pack versions guarantee consistent extraction results across deployments
3. **Simplicity:** No external dependency management for operators; `docker run` just works
4. **Performance:** No download latency on first OCR request
### Size trade-offs
The `:ocr` variant adds ~150 MB to the image but covers the vast majority of use cases:
- English (eng) - ~12 MB
- German (deu) - ~10 MB
- French (fra) - ~10 MB
- Spanish (spa) - ~10 MB
- Italian (ita) - ~9 MB
- Portuguese (por) - ~10 MB
- Japanese (jpn) - ~18 MB
- Simplified Chinese (chi_sim) - ~25 MB
- Traditional Chinese (chi_tra) - ~22 MB
- Korean (kor) - ~12 MB
- Russian (rus) - ~14 MB
- Arabic (ara) - ~8 MB
- Hindi (hin) - ~8 MB
Total: ~168 MB (uncompressed) → ~150 MB (after Docker layer compression)
The `:full` variant bundles all 100+ languages (~600 MB) for specialized deployments requiring comprehensive coverage.
### Why not download-on-first-use?
Download-on-first-use was rejected because:
- Requires network connectivity at OCR time (breaks air-gapped deployments)
- Adds complexity (pack download, validation, caching)
- Introduces latency on first OCR request
- Requires a trusted pack distribution endpoint
- Version drift between pack downloads across deployments
### Why not out-of-band install?
Out-of-band install (e.g., `apt-get install tesseract-ocr-all`) was rejected because:
- Platform-specific (Debian vs Alpine vs macOS vs Windows)
- Version drift across package managers
- Additional operator setup step
- Inconsistent pack locations across distros
## Language Pack Allowlist
### `pdftract:ocr` bundle (Tier 1 - High Coverage)
| Code | Language | File | Size |
|------|----------|------|------|
| eng | English | eng.traineddata | 12 MB |
| deu | German | deu.traineddata | 10 MB |
| fra | French | fra.traineddata | 10 MB |
| spa | Spanish | spa.traineddata | 10 MB |
| ita | Italian | ita.traineddata | 9 MB |
| por | Portuguese | por.traineddata | 10 MB |
| jpn | Japanese | jpn.traineddata | 18 MB |
| chi_sim | Simplified Chinese | chi_sim.traineddata | 25 MB |
| chi_tra | Traditional Chinese | chi_tra.traineddata | 22 MB |
| kor | Korean | kor.traineddata | 12 MB |
| rus | Russian | rus.traineddata | 14 MB |
| ara | Arabic | ara.traineddata | 8 MB |
| hin | Hindi | hin.traineddata | 8 MB |
**Total: 13 languages, ~168 MB (uncompressed)**
This set covers:
- All official UN languages (Arabic, Chinese, English, French, Russian, Spanish)
- Major European languages (German, Italian, Portuguese)
- Major East and South Asian languages (Japanese, Korean, Hindi)
- ~80% of world population by native speaker count
### `pdftract:full` bundle (Tier 2 - Complete)
Includes all 100+ language packs from the official Tesseract tessdata repository:
- All Tier 1 languages
- Indic languages (ben, guj, kan, mal, tam, tel, etc.)
- Southeast Asian languages (tha, vie, etc.)
- Central/Eastern European languages (pol, ces, slk, hun, ron, bul, etc.)
- Nordic languages (dan, nor, swe, fin)
- Turkic languages (tur, aze, uzb, etc.)
- Hebrew (heb)
- And 60+ others
**Total: 100+ languages, ~600 MB (uncompressed)**
## Implementation
### Pack Detection
The `detect_available_languages()` function in `crates/pdftract-core/src/ocr.rs` scans the tessdata directory for `<code>.traineddata` files and returns a `HashSet<String>` of available language codes.
The function respects the `$TESSDATA_PREFIX` environment variable and falls back to system-default tessdata paths:
- Unix: `/usr/share/tessdata`, `/usr/local/share/tessdata`
- Windows: `C:\Program Files\Tesseract-OCR\tessdata`
### Language Validation
When OCR is invoked with a requested language list (from `ExtractionOptions.ocr_language`), the `validate_ocr_languages()` function:
1. Checks which requested languages are available
2. Emits `OCR_LANGUAGE_UNAVAILABLE` diagnostics for missing languages
3. Filters out unavailable languages from the Tesseract language string
4. Falls back to `eng` if no requested languages are available
This ensures extraction never hard-crashes due to missing packs — it degrades gracefully with diagnostics.
### Doctor Check
The `pdftract doctor tesseract-langs` command verifies:
1. Tesseract binary is installed (version 5.x)
2. `eng` language pack is present (required fallback)
3. User-requested `--lang` languages are present
Exit code 1 if `eng` is missing; exit code 0 with WARN if optional languages are missing.
## Docker Implementation
### Dockerfile.ocr (Tier 1)
```dockerfile
FROM pdftract:base
# Install Tesseract + Tier 1 language packs
RUN apk add --no-cache \
tesseract-ocr \
tesseract-ocr-data-eng \
tesseract-ocr-data-deu \
tesseract-ocr-data-fra \
tesseract-ocr-data-spa \
tesseract-ocr-data-ita \
tesseract-ocr-data-por \
tesseract-ocr-data-jpn \
tesseract-ocr-data-chi_sim \
tesseract-ocr-data-chi_tra \
tesseract-ocr-data-kor \
tesseract-ocr-data-rus \
tesseract-ocr-data-ara \
tesseract-ocr-data-hin
# Verify packs are installed
RUN pdftract doctor tesseract-langs --lang eng,deu,fra,spa,ita,por,jpn,chi_sim,chi_tra,kor,rus,ara,hin
```
### Dockerfile.full (Tier 2)
```dockerfile
FROM pdftract:base
# Install Tesseract + all language packs
RUN apk add --no-cache \
tesseract-ocr \
tesseract-ocr-data-all
# Verify packs are installed
RUN pdftract doctor tesseract-langs
```
## Version Policy
Language packs are pinned to Tesseract 5.x series:
- Base image uses `tesseract-ocr 5.3.x` from Alpine repos
- Packs are from the same major version to ensure compatibility
- Updates follow Alpine's security patch cadence
Per OQ-03, Tesseract version pinning is documented in the Dockerfile comments.
## References
- Plan Phase 5.4: Tesseract Integration
- Plan Open Question OQ-04
- Bead pdftract-32x4 (implementation)
- crates/pdftract-core/src/ocr.rs (language detection)
- crates/pdftract-cli/src/doctor.rs (language verification)
## Revision History
| Date | Change |
|------|--------|
| 2026-05-23 | Initial resolution; document created with OQ-04 decision |

View file

@ -512,7 +512,7 @@ Questions that the current plan does not yet resolve. Each question is tagged wi
| OQ-01 | When does the 500-PDF private regression corpus become available, and what is its licensing for CI use? | Phase 0 sign-off | Project lead; recorded in `docs/notes/corpus-licensing.md` |
| OQ-02 | Who owns the font-fingerprint database curation pipeline (`build/font-fingerprints.json`) — is it a maintainer task, a community contribution, or an automated harvest from Google Fonts / Adobe? | Phase 2.2 implementation | Maintainer; documented in `docs/research/font-fingerprinting.md` |
| OQ-03 | What is the Tesseract version pinning policy — pin to a specific 5.x patch release, or follow latest stable? Pinning gives reproducibility; following stable gets bug fixes faster. | Phase 5.4 implementation | CI maintainer; recorded in `Dockerfile` comment |
| OQ-04 | How are OCR language packs distributed? Bundled in the Docker image (size cost), downloaded on first use (network dependency), or required as an out-of-band install? | Phase 5.4 implementation | Distribution lead; documented in `docs/notes/ocr-language-packs.md` |
| OQ-04 | How are OCR language packs distributed? Bundled in the Docker image (size cost), downloaded on first use (network dependency), or required as an out-of-band install? | **RESOLVED** 2026-05-23 by bead pdftract-32x4 | Bundled in Docker images with tiered strategy (`:ocr` ~150 MB, `:full` ~600 MB). Documented in `docs/notes/ocr-language-packs.md` |
| OQ-05 | What is the realistic coverage gap of the 5,000-entry glyph-shape DB on real-world subsetted fonts? Is 70% Latin-only coverage acceptable for v1.0.0, or must Cyrillic/Greek hit the same bar? | Phase 2.5 sign-off | Accuracy lead; benchmarked against `tests/fixtures/encoding/` |
| OQ-06 | Does the Phase 7.10 profile field-extraction DSL need user-defined parsers (custom JavaScript / Lua / WASM hooks)? Built-in `decimal`/`date`/`int`/`bool` may be insufficient for niche document types. | v1.1+ | Deferred — solicit user feedback after v1.0.0 |
| OQ-07 | How is the MCP server discovered by Claude Desktop / Cursor — manual config edit, a "pdftract setup-mcp" subcommand that writes the config, or both? Config file locations differ across OSes. | Phase 6.7 sign-off | MCP integration lead; documented in `docs/integrations/mcp-clients.md` |

View file

@ -0,0 +1,345 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://pdftract.ardenone.com/schemas/v1.0/pdftract.schema.json",
"title": "PDFtract Extraction Output Schema v1.0",
"description": "JSON output schema for PDF text and structure extraction",
"type": "object",
"required": ["fingerprint", "schema_version", "pages", "metadata"],
"properties": {
"fingerprint": {
"type": "string",
"description": "PDF fingerprint for verification (format: pdftract-v1:<hex>)"
},
"schema_version": {
"type": "string",
"description": "Schema version (e.g., '1.0')",
"enum": ["1.0"]
},
"pages": {
"type": "array",
"description": "Extracted pages",
"items": {
"$ref": "#/definitions/page"
}
},
"metadata": {
"$ref": "#/definitions/metadata"
}
},
"definitions": {
"page": {
"type": "object",
"required": ["index", "spans", "blocks", "tables"],
"properties": {
"index": {
"type": "integer",
"description": "0-based page index"
},
"spans": {
"type": "array",
"description": "Extracted text spans",
"items": {
"$ref": "#/definitions/span"
}
},
"blocks": {
"type": "array",
"description": "Extracted structural blocks",
"items": {
"$ref": "#/definitions/block"
}
},
"tables": {
"type": "array",
"description": "Extracted tables (cell-level structure)",
"items": {
"$ref": "#/definitions/table"
}
},
"error": {
"type": "string",
"description": "Error message if extraction failed for this page"
}
}
},
"span": {
"type": "object",
"required": ["text", "bbox", "font", "size"],
"properties": {
"text": {
"type": "string",
"description": "The extracted text content"
},
"bbox": {
"type": "array",
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
"items": {
"type": "number"
},
"minItems": 4,
"maxItems": 4
},
"font": {
"type": "string",
"description": "Font name or identifier"
},
"size": {
"type": "number",
"description": "Font size in points"
},
"confidence": {
"type": "number",
"description": "Confidence score (0.0 to 1.0) for OCR text",
"minimum": 0.0,
"maximum": 1.0
},
"receipt": {
"$ref": "#/definitions/receipt"
}
}
},
"block": {
"type": "object",
"required": ["kind", "text", "bbox"],
"properties": {
"kind": {
"type": "string",
"description": "Block kind/type",
"enum": ["paragraph", "heading", "list", "table", "figure"]
},
"text": {
"type": "string",
"description": "The concatenated text content of all spans in the block"
},
"bbox": {
"type": "array",
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
"items": {
"type": "number"
},
"minItems": 4,
"maxItems": 4
},
"level": {
"type": "integer",
"description": "Heading level (1-6) for 'heading' kind blocks",
"minimum": 1,
"maximum": 6
},
"table_index": {
"type": "integer",
"description": "Index into the page's tables array for 'table' kind blocks",
"minimum": 0
},
"receipt": {
"$ref": "#/definitions/receipt"
}
}
},
"table": {
"type": "object",
"required": ["id", "bbox", "rows", "header_rows", "detection_method", "continued", "continued_from_prev", "page_index"],
"properties": {
"id": {
"type": "string",
"description": "Unique identifier for this table (e.g., 'table_0')"
},
"bbox": {
"type": "array",
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
"items": {
"type": "number"
},
"minItems": 4,
"maxItems": 4
},
"rows": {
"type": "array",
"description": "Rows in this table, ordered top-to-bottom",
"items": {
"$ref": "#/definitions/row"
}
},
"header_rows": {
"type": "integer",
"description": "Number of contiguous header rows at the top of the table",
"minimum": 0
},
"detection_method": {
"type": "string",
"description": "Detection method used to identify this table",
"enum": ["line_based", "borderless"]
},
"continued": {
"type": "boolean",
"description": "Whether this table continues on the next page"
},
"continued_from_prev": {
"type": "boolean",
"description": "Whether this table is a continuation from the previous page"
},
"page_index": {
"type": "integer",
"description": "Zero-based page index where this table appears",
"minimum": 0
}
}
},
"row": {
"type": "object",
"required": ["bbox", "cells", "is_header"],
"properties": {
"bbox": {
"type": "array",
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
"items": {
"type": "number"
},
"minItems": 4,
"maxItems": 4
},
"cells": {
"type": "array",
"description": "Cells in this row, ordered left-to-right",
"items": {
"$ref": "#/definitions/cell"
}
},
"is_header": {
"type": "boolean",
"description": "Whether this row is a header row"
}
}
},
"cell": {
"type": "object",
"required": ["bbox", "text", "spans", "row", "col", "rowspan", "colspan", "is_header_row"],
"properties": {
"bbox": {
"type": "array",
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
"items": {
"type": "number"
},
"minItems": 4,
"maxItems": 4
},
"text": {
"type": "string",
"description": "The concatenated text content of all spans in the cell"
},
"spans": {
"type": "array",
"description": "Indices of this cell's spans within the page's spans array",
"items": {
"type": "integer"
}
},
"row": {
"type": "integer",
"description": "Zero-based row index within the table",
"minimum": 0
},
"col": {
"type": "integer",
"description": "Zero-based column index within the table",
"minimum": 0
},
"rowspan": {
"type": "integer",
"description": "Number of rows this cell spans (default 1)",
"minimum": 1
},
"colspan": {
"type": "integer",
"description": "Number of columns this cell spans (default 1)",
"minimum": 1
},
"is_header_row": {
"type": "boolean",
"description": "Whether this cell is in a header row"
}
}
},
"receipt": {
"type": "object",
"required": ["pdf_fingerprint", "page_index", "bbox", "content_hash", "extraction_version"],
"properties": {
"pdf_fingerprint": {
"type": "string",
"description": "The PDF fingerprint"
},
"page_index": {
"type": "integer",
"description": "The page index"
},
"bbox": {
"type": "array",
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
"items": {
"type": "number"
},
"minItems": 4,
"maxItems": 4
},
"content_hash": {
"type": "string",
"description": "SHA-256 hash of the content"
},
"extraction_version": {
"type": "string",
"description": "Version string of the extractor"
},
"svg_clip": {
"type": "string",
"description": "SVG clip path for verification (present only in SvgClip mode)"
}
}
},
"metadata": {
"type": "object",
"required": ["page_count", "span_count", "block_count"],
"properties": {
"page_count": {
"type": "integer",
"description": "Total number of pages in the document"
},
"span_count": {
"type": "integer",
"description": "Number of spans extracted"
},
"block_count": {
"type": "integer",
"description": "Number of blocks extracted"
},
"cache_status": {
"type": "string",
"description": "Cache status: 'hit', 'miss', or 'skipped'",
"enum": ["hit", "miss", "skipped"]
},
"cache_age_seconds": {
"type": "integer",
"description": "Cache entry age in seconds (only present when cache_status == 'hit')",
"minimum": 0
},
"error_count": {
"type": "integer",
"description": "Number of pages that failed to extract",
"minimum": 0
},
"reading_order_algorithm": {
"type": "string",
"description": "Reading order algorithm used for this extraction",
"enum": ["struct_tree", "xy_cut"]
},
"diagnostics": {
"type": "array",
"description": "Diagnostics emitted during extraction",
"items": {
"type": "string"
}
}
}
}
}
}

6
examples/test_export.rs Normal file
View file

@ -0,0 +1,6 @@
// Test that detect_merged_cells is accessible from pdftract_core::table
use pdftract_core::table::detect_merged_cells;
/// Entry point of the export smoke-test example.
///
/// Prints a single confirmation line to stdout. The body performs no other
/// work; reaching it simply shows that the example compiled and ran.
fn main() {
    let message = "detect_merged_cells is exported!";
    println!("{message}");
}