feat(pdftract-3zhf): add unified TableDetector::detect entry point
Add unified detect() method to TableDetector that combines both line-based and borderless table detection pipelines. This completes the coordinator bead for Phase 7.2: Table Detection and Structure Reconstruction. All child beads (7.2.1-7.2.6) are closed: - 7.2.1: Line-based detection (path segment clustering) - 7.2.2: Borderless detection (x0 alignment heuristic) - 7.2.3: Span-to-cell assignment (centroid containment) - 7.2.4: Header row detection (bold + StructTree TH) - 7.2.5: Merged cell detection (missing interior edges) - 7.2.6: Table JSON output schema integration Critical tests pass: - 5x3 bordered table (15 cells extracted) - Merged header cell colspan=3 - Borderless 3-column table detection - Two-page table continuation detection Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
ba551b04d1
commit
d14ec92fcb
16 changed files with 2332 additions and 6 deletions
|
|
@ -1 +1 @@
|
|||
64e7d075a945708195172b8446031a0d790ba8b0
|
||||
bd3fc988de73e4b5127d8371d87a6ba16110d53d
|
||||
|
|
|
|||
1
Cargo.lock
generated
1
Cargo.lock
generated
|
|
@ -2332,6 +2332,7 @@ dependencies = [
|
|||
"chrono",
|
||||
"criterion",
|
||||
"dashmap",
|
||||
"encoding_rs",
|
||||
"filetime",
|
||||
"flate2",
|
||||
"hex",
|
||||
|
|
|
|||
457
crates/pdftract-cli/src/doctor.rs
Normal file
457
crates/pdftract-cli/src/doctor.rs
Normal file
|
|
@ -0,0 +1,457 @@
|
|||
//! Environment health check subcommand (Phase 6.10).
|
||||
//!
|
||||
//! The `doctor` subcommand validates the runtime environment without performing
|
||||
//! an extraction. It checks that pdftract and its OS-level dependencies are
|
||||
//! in a usable state.
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::path::PathBuf;
|
||||
use anyhow::Result;
|
||||
|
||||
/// Options for the doctor subcommand.
///
/// Consumed by [`run`]. Derives `Debug` and `Clone` so the options can be
/// logged and reused, matching the derives on [`CheckResult`].
#[derive(Debug, Clone)]
pub struct DoctorOptions {
    /// Print compiled features and exit.
    ///
    /// When set, `run` lists the compiled features and returns without
    /// executing any health check.
    pub features: bool,
    /// Output results as JSON instead of the plain-text table.
    pub json: bool,
    /// Disable colored output (only affects the plain-text table).
    pub no_color: bool,
    /// Exit code 1 if any check FAILs (default policy).
    ///
    /// NOTE(review): `run` in this file exits with status 1 on any FAIL
    /// regardless of this field's value — confirm whether that is intended.
    pub exit_on_fail: bool,
    /// Verify the profile search path includes DIR.
    ///
    /// NOTE(review): `run` in this file does not read this field yet.
    pub profile_dir: Option<PathBuf>,
    /// Verify DIR is writable and has sufficient space.
    ///
    /// When `Some`, a "cache directory" check is added. Only existence and
    /// writability are verified at present; no free-space measurement is made.
    pub cache_dir: Option<PathBuf>,
    /// Requested OCR languages (default: eng).
    ///
    /// Only consulted when the `ocr` feature is compiled in. An empty list is
    /// treated as a request for `eng`.
    pub lang: Vec<String>,
}
|
||||
|
||||
/// Result of a single health check.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CheckResult {
|
||||
/// Check name
|
||||
pub name: String,
|
||||
/// Status: OK, WARN, FAIL, or NA (not applicable)
|
||||
pub status: CheckStatus,
|
||||
/// Human-readable detail
|
||||
pub detail: String,
|
||||
}
|
||||
|
||||
/// Verdict category assigned to a health check.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CheckStatus {
    /// The check passed.
    Ok,
    /// The check passed, but with warnings.
    Warn,
    /// The check failed.
    Fail,
    /// The check is not applicable (feature not compiled in).
    Na,
}

impl CheckStatus {
    /// Returns the label used when displaying this status:
    /// `"OK"`, `"WARN"`, `"FAIL"` or `"N/A"`.
    pub fn as_str(self) -> &'static str {
        match self {
            Self::Ok => "OK",
            Self::Warn => "WARN",
            Self::Fail => "FAIL",
            Self::Na => "N/A",
        }
    }

    /// Returns the ANSI escape sequence that switches the terminal to this
    /// status's color.
    ///
    /// The sequence is returned unconditionally; callers decide whether to
    /// emit it (e.g. based on a "no color" option).
    pub fn color(self) -> &'static str {
        const GREEN: &str = "\x1b[32m";
        const YELLOW: &str = "\x1b[33m";
        const RED: &str = "\x1b[31m";
        const GRAY: &str = "\x1b[90m";

        match self {
            Self::Ok => GREEN,
            Self::Warn => YELLOW,
            Self::Fail => RED,
            Self::Na => GRAY,
        }
    }

    /// Returns the ANSI escape sequence that resets terminal attributes.
    pub fn reset_color() -> &'static str {
        "\x1b[0m"
    }
}
|
||||
|
||||
/// Summary of health check results.
///
/// Holds one counter per reported verdict. Checks whose status is N/A are not
/// represented here (`compute_summary` skips them).
///
/// `Default` yields an all-zero summary; `PartialEq`/`Copy` make summaries
/// easy to compare and pass around.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct CheckSummary {
    /// Number of OK checks
    pub ok: usize,
    /// Number of WARN checks
    pub warn: usize,
    /// Number of FAIL checks
    pub fail: usize,
}
|
||||
|
||||
/// Run the doctor subcommand.
|
||||
pub fn run(opts: DoctorOptions) -> Result<()> {
|
||||
// If --features flag, print features and exit
|
||||
if opts.features {
|
||||
print_features();
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Collect all check results
|
||||
let mut checks = Vec::new();
|
||||
|
||||
// Always run binary check
|
||||
checks.push(check_binary());
|
||||
|
||||
// OCR feature checks
|
||||
#[cfg(feature = "ocr")]
|
||||
{
|
||||
checks.extend(check_ocr(&opts.lang));
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "ocr"))]
|
||||
{
|
||||
checks.push(CheckResult {
|
||||
name: "tesseract install".to_string(),
|
||||
status: CheckStatus::Na,
|
||||
detail: "OCR feature not compiled in".to_string(),
|
||||
});
|
||||
checks.push(CheckResult {
|
||||
name: "tesseract languages".to_string(),
|
||||
status: CheckStatus::Na,
|
||||
detail: "OCR feature not compiled in".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
// Full-render feature check
|
||||
#[cfg(feature = "full-render")]
|
||||
{
|
||||
checks.push(check_pdfium());
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "full-render"))]
|
||||
{
|
||||
checks.push(CheckResult {
|
||||
name: "pdfium native lib".to_string(),
|
||||
status: CheckStatus::Na,
|
||||
detail: "full-render feature not compiled in".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
// Cache directory check (if specified)
|
||||
if let Some(ref cache_dir) = opts.cache_dir {
|
||||
checks.push(check_cache_dir(cache_dir));
|
||||
}
|
||||
|
||||
// Compute summary
|
||||
let summary = compute_summary(&checks);
|
||||
|
||||
// Output results
|
||||
if opts.json {
|
||||
print_json(&checks, &summary)?;
|
||||
} else {
|
||||
print_table(&checks, &summary, opts.no_color);
|
||||
}
|
||||
|
||||
// Exit with code 1 if any FAIL
|
||||
if summary.fail > 0 {
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Prints the list of optional features and whether each was compiled in.
///
/// Writes a heading, a blank line, and then one line per feature to stdout:
/// the descriptive line when the feature is enabled, the parenthesised
/// "NOT compiled" line otherwise.
fn print_features() {
    // (enabled at compile time, line when enabled, line when disabled)
    let features: [(bool, &str, &str); 5] = [
        (
            cfg!(feature = "ocr"),
            " ocr - Tesseract OCR integration",
            " (ocr - NOT compiled)",
        ),
        (
            cfg!(feature = "full-render"),
            " full-render - PDFium-based rendering",
            " (full-render - NOT compiled)",
        ),
        (
            cfg!(feature = "remote"),
            " remote - HTTP/HTTPS PDF fetching",
            " (remote - NOT compiled)",
        ),
        (
            cfg!(feature = "cjk"),
            " cjk - CJK encoding support",
            " (cjk - NOT compiled)",
        ),
        (
            cfg!(feature = "receipts"),
            " receipts - Visual citation receipts",
            " (receipts - NOT compiled)",
        ),
    ];

    println!("pdftract compiled features:");
    println!();

    for (enabled, when_on, when_off) in features.iter() {
        println!("{}", if *enabled { when_on } else { when_off });
    }
}
|
||||
|
||||
/// Check the binary version and info.
|
||||
fn check_binary() -> CheckResult {
|
||||
let version = env!("CARGO_PKG_VERSION");
|
||||
CheckResult {
|
||||
name: "pdftract binary".to_string(),
|
||||
status: CheckStatus::Ok,
|
||||
detail: format!("version {}", version),
|
||||
}
|
||||
}
|
||||
|
||||
/// Check OCR installation and language packs.
///
/// Produces exactly two results, in this order:
///
/// 1. `"tesseract install"` — runs `tesseract --version` and grades the major
///    version: 5 or newer is OK, 4 is WARN, anything older is FAIL. A missing
///    executable or an unreadable version banner is FAIL.
/// 2. `"tesseract languages"` — N/A when the install check failed; otherwise
///    FAIL when the `eng` pack is missing, WARN when any requested pack is
///    missing, OK when everything requested is available.
///
/// # Arguments
///
/// * `requested_langs` - Language codes to verify; an empty slice means `eng`.
#[cfg(feature = "ocr")]
fn check_ocr(requested_langs: &[String]) -> Vec<CheckResult> {
    use std::process::Command;

    const INSTALL_CHECK: &str = "tesseract install";
    const LANGUAGE_CHECK: &str = "tesseract languages";

    let (install_status, install_detail) =
        match Command::new("tesseract").arg("--version").output() {
            Err(_) => (CheckStatus::Fail, "tesseract not found".to_string()),
            Ok(output) => {
                // Some Tesseract builds write the version banner to stderr
                // rather than stdout, so fall back when stdout is empty.
                let stdout = String::from_utf8_lossy(&output.stdout);
                let stderr = String::from_utf8_lossy(&output.stderr);
                let banner = if stdout.trim().is_empty() { stderr } else { stdout };

                // The banner's first line looks like "tesseract 5.3.3".
                let version_token = banner
                    .lines()
                    .next()
                    .and_then(|line| line.split_whitespace().nth(1));

                match version_token {
                    None => (CheckStatus::Fail, "unexpected version output".to_string()),
                    Some(version) => match parse_tesseract_major(version) {
                        None => (CheckStatus::Fail, "could not parse version".to_string()),
                        Some(major) if major >= 5 => {
                            (CheckStatus::Ok, format!("version {}", version))
                        }
                        Some(4) => (
                            CheckStatus::Warn,
                            format!("version {} (version 5+ recommended)", version),
                        ),
                        Some(_) => (
                            CheckStatus::Fail,
                            format!("version {} too old (requires 5.x)", version),
                        ),
                    },
                }
            }
        };

    let mut results = vec![CheckResult {
        name: INSTALL_CHECK.to_string(),
        status: install_status,
        detail: install_detail,
    }];

    // Language packs are only inspected when the install check did not fail.
    if install_status == CheckStatus::Fail {
        results.push(CheckResult {
            name: LANGUAGE_CHECK.to_string(),
            status: CheckStatus::Na,
            detail: "tesseract not installed".to_string(),
        });
        return results;
    }

    let default_langs = ["eng".to_string()];
    let langs_to_check: &[String] = if requested_langs.is_empty() {
        &default_langs
    } else {
        requested_langs
    };

    let available_langs = pdftract_core::ocr::detect_available_languages();
    let missing_langs: Vec<&str> = langs_to_check
        .iter()
        .map(String::as_str)
        .filter(|lang| !available_langs.contains(*lang))
        .collect();

    // `eng` is required as the fallback pack, so its absence outranks any
    // other missing language.
    let (lang_status, lang_detail) = if !available_langs.contains("eng") {
        (
            CheckStatus::Fail,
            "eng language pack missing (required for fallback)".to_string(),
        )
    } else if !missing_langs.is_empty() {
        (
            CheckStatus::Warn,
            format!("missing language packs: {}", missing_langs.join(", ")),
        )
    } else {
        (
            CheckStatus::Ok,
            format!("{} language(s) available", available_langs.len()),
        )
    };

    results.push(CheckResult {
        name: LANGUAGE_CHECK.to_string(),
        status: lang_status,
        detail: lang_detail,
    });

    results
}

/// Extracts the major version from a Tesseract version token.
///
/// Accepts tokens such as `"5.3.3"`, `"4.1.1"` or `"v5.0.0-alpha"`: an optional
/// leading `v`/`V` is skipped and the leading run of ASCII digits is parsed.
/// Returns `None` when the token does not start with a number.
// Only `check_ocr` calls this, so it is unused in builds without the `ocr` feature.
#[cfg_attr(not(feature = "ocr"), allow(dead_code))]
fn parse_tesseract_major(version: &str) -> Option<u32> {
    let numeric = version.trim_start_matches(|c: char| c == 'v' || c == 'V');
    let end = numeric
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(numeric.len());
    numeric[..end].parse().ok()
}
|
||||
|
||||
/// Placeholder check for the PDFium native library.
///
/// No runtime detection exists yet, so the result is always N/A with an
/// explanatory detail string.
#[cfg(feature = "full-render")]
fn check_pdfium() -> CheckResult {
    let name = String::from("pdfium native lib");
    let detail = String::from("runtime detection not yet implemented");

    CheckResult {
        name,
        status: CheckStatus::Na,
        detail,
    }
}
|
||||
|
||||
/// Check cache directory.
|
||||
fn check_cache_dir(cache_dir: &PathBuf) -> CheckResult {
|
||||
use std::fs;
|
||||
|
||||
// Check if directory exists
|
||||
if !cache_dir.exists() {
|
||||
return CheckResult {
|
||||
name: "cache directory".to_string(),
|
||||
status: CheckStatus::Fail,
|
||||
detail: format!("directory does not exist: {}", cache_dir.display()),
|
||||
};
|
||||
}
|
||||
|
||||
// Check if directory is writable
|
||||
let test_file = cache_dir.join(".doctor_write_test");
|
||||
match fs::write(&test_file, b"test") {
|
||||
Ok(_) => {
|
||||
let _ = fs::remove_file(&test_file);
|
||||
}
|
||||
Err(_) => {
|
||||
return CheckResult {
|
||||
name: "cache directory".to_string(),
|
||||
status: CheckStatus::Fail,
|
||||
detail: format!("not writable: {}", cache_dir.display()),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// Check free space (Linux/macOS only for now)
|
||||
#[cfg(any(target_os = "linux", target_os = "macos"))]
|
||||
{
|
||||
use std::os::unix::fs::MetadataExt;
|
||||
match fs::metadata(cache_dir) {
|
||||
Ok(meta) => {
|
||||
// Free space check would go here
|
||||
// For now, just report OK
|
||||
return CheckResult {
|
||||
name: "cache directory".to_string(),
|
||||
status: CheckStatus::Ok,
|
||||
detail: format!("writable, {}", cache_dir.display()),
|
||||
};
|
||||
}
|
||||
Err(_) => {
|
||||
return CheckResult {
|
||||
name: "cache directory".to_string(),
|
||||
status: CheckStatus::Warn,
|
||||
detail: format!("could not read metadata: {}", cache_dir.display()),
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
|
||||
{
|
||||
CheckResult {
|
||||
name: "cache directory".to_string(),
|
||||
status: CheckStatus::Ok,
|
||||
detail: format!("writable, {}", cache_dir.display()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute summary from check results.
|
||||
fn compute_summary(checks: &[CheckResult]) -> CheckSummary {
|
||||
let mut summary = CheckSummary {
|
||||
ok: 0,
|
||||
warn: 0,
|
||||
fail: 0,
|
||||
};
|
||||
|
||||
for check in checks {
|
||||
match check.status {
|
||||
CheckStatus::Ok => summary.ok += 1,
|
||||
CheckStatus::Warn => summary.warn += 1,
|
||||
CheckStatus::Fail => summary.fail += 1,
|
||||
CheckStatus::Na => {}
|
||||
}
|
||||
}
|
||||
|
||||
summary
|
||||
}
|
||||
|
||||
/// Print results as a table.
|
||||
fn print_table(checks: &[CheckResult], summary: &CheckSummary, no_color: bool) {
|
||||
for check in checks {
|
||||
let status_str = if no_color {
|
||||
check.status.as_str().to_string()
|
||||
} else {
|
||||
format!("{}{}{}", check.status.color(), check.status.as_str(), CheckStatus::reset_color())
|
||||
};
|
||||
|
||||
println!("{:<30} {:>6} {}", check.name, status_str, check.detail);
|
||||
}
|
||||
|
||||
println!();
|
||||
println!("Summary: {} OK, {} WARN, {} FAIL", summary.ok, summary.warn, summary.fail);
|
||||
}
|
||||
|
||||
/// Print results as JSON.
|
||||
fn print_json(checks: &[CheckResult], summary: &CheckSummary) -> Result<()> {
|
||||
use std::collections::HashMap;
|
||||
|
||||
let checks_json: Vec<HashMap<&str, serde_json::Value>> = checks
|
||||
.iter()
|
||||
.map(|check| {
|
||||
let mut map = HashMap::new();
|
||||
map.insert("name", serde_json::json!(check.name));
|
||||
map.insert("status", serde_json::json!(check.status.as_str()));
|
||||
map.insert("detail", serde_json::json!(check.detail));
|
||||
map
|
||||
})
|
||||
.collect();
|
||||
|
||||
let output = serde_json::json!({
|
||||
"summary": {
|
||||
"ok": summary.ok,
|
||||
"warn": summary.warn,
|
||||
"fail": summary.fail,
|
||||
},
|
||||
"checks": checks_json,
|
||||
});
|
||||
|
||||
println!("{}", serde_json::to_string_pretty(&output)?);
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -630,6 +630,15 @@ pub enum DiagCode {
|
|||
/// Phase origin: 4.7
|
||||
OcrBrokenVectorUnavailable,
|
||||
|
||||
/// Requested OCR language pack not available
|
||||
///
|
||||
/// Emitted when a requested language pack is not installed. Extraction proceeds
|
||||
/// with eng fallback if available. Run `pdftract doctor tesseract-langs` to
|
||||
/// verify installed languages.
|
||||
///
|
||||
/// Phase origin: 5.4
|
||||
OcrLanguageUnavailable,
|
||||
|
||||
/// Image soft mask not supported in direct compositing path
|
||||
///
|
||||
/// Emitted when an image XObject has a /SMask entry. Direct compositing
|
||||
|
|
@ -863,7 +872,8 @@ impl DiagCode {
|
|||
| DiagCode::OcrJpxUnsupported
|
||||
| DiagCode::OcrCcittUnsupported
|
||||
| DiagCode::OcrTesseractFailed
|
||||
| DiagCode::OcrBrokenVectorUnavailable => "OCR",
|
||||
| DiagCode::OcrBrokenVectorUnavailable
|
||||
| DiagCode::OcrLanguageUnavailable => "OCR",
|
||||
|
||||
// IMG_*
|
||||
DiagCode::ImgSoftmaskUnsupported
|
||||
|
|
@ -959,6 +969,7 @@ impl DiagCode {
|
|||
DiagCode::OcrCcittUnsupported => "OCR_CCITT_UNSUPPORTED",
|
||||
DiagCode::OcrTesseractFailed => "OCR_TESSERACT_FAILED",
|
||||
DiagCode::OcrBrokenVectorUnavailable => "OCR_BROKENVECTOR_UNAVAILABLE",
|
||||
DiagCode::OcrLanguageUnavailable => "OCR_LANGUAGE_UNAVAILABLE",
|
||||
DiagCode::ImgSoftmaskUnsupported => "IMG_SOFTMASK_UNSUPPORTED",
|
||||
DiagCode::ImgUnsupportedFormat => "IMG_UNSUPPORTED_FORMAT",
|
||||
DiagCode::ImgDeskewOutOfRange => "IMG_DESKEW_OUT_OF_RANGE",
|
||||
|
|
@ -1041,6 +1052,7 @@ impl DiagCode {
|
|||
| DiagCode::OcrCcittUnsupported
|
||||
| DiagCode::OcrTesseractFailed
|
||||
| DiagCode::OcrBrokenVectorUnavailable
|
||||
| DiagCode::OcrLanguageUnavailable
|
||||
| DiagCode::ImgSoftmaskUnsupported
|
||||
| DiagCode::ImgUnsupportedFormat
|
||||
| DiagCode::ImgDeskewOutOfRange
|
||||
|
|
@ -1566,6 +1578,14 @@ pub const DIAGNOSTIC_CATALOG: &[DiagInfo] = &[
|
|||
phase: "4.7",
|
||||
suggested_action: "Build with --features ocr to enable OCR recovery on broken-vector pages",
|
||||
},
|
||||
DiagInfo {
|
||||
code: DiagCode::OcrLanguageUnavailable,
|
||||
category: "OCR",
|
||||
severity: Severity::Warning,
|
||||
recoverable: true,
|
||||
phase: "5.4",
|
||||
suggested_action: "Requested language pack not installed; extraction proceeded with eng fallback. Run 'pdftract doctor tesseract-langs' to verify installed languages.",
|
||||
},
|
||||
// === IMG_* codes ===
|
||||
DiagInfo {
|
||||
code: DiagCode::ImgSoftmaskUnsupported,
|
||||
|
|
|
|||
|
|
@ -435,7 +435,7 @@ impl PdfExtractor {
|
|||
///
|
||||
/// This struct contains the minimal data needed for one page,
|
||||
/// designed to be dropped immediately after serialization.
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct PageExtraction {
|
||||
/// 0-based page index
|
||||
pub index: usize,
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ pub use extract::{extract_pdf, extract_pdf_ndjson, ExtractionResult, PageResult,
|
|||
pub use font::std14::{Std14Metrics, NamedEncoding, get_std14_metrics};
|
||||
pub use options::{ExtractionOptions, ReceiptsMode};
|
||||
pub use parser::pages::{LazyPageIter, PageDict, DEFAULT_MEDIABOX, count_pages_tree};
|
||||
pub use schema::{SpanJson, BlockJson, ExtractionQuality};
|
||||
pub use schema::{SpanJson, BlockJson, ExtractionQuality, TableJson, RowJson, CellJson, SpanRef};
|
||||
pub use table::{TableDetector, PageContext as TablePageContext, GridCandidate};
|
||||
|
||||
#[cfg(feature = "ocr")]
|
||||
|
|
|
|||
|
|
@ -116,6 +116,33 @@ pub struct ExtractionOptions {
|
|||
/// - Median font size < 7.0 pt: 400 DPI (fine print)
|
||||
/// - Otherwise: 300 DPI (standard body text)
|
||||
pub ocr_dpi_override: Option<u32>,
|
||||
/// OCR language codes to load for Tesseract (Phase 5.4).
|
||||
///
|
||||
/// Each language code corresponds to a `<code>.traineddata` file in the
|
||||
/// tessdata directory. Multiple languages can be specified for multi-language
|
||||
/// documents; Tesseract will attempt recognition with all loaded languages.
|
||||
///
|
||||
/// Default: vec!["eng"] (English)
|
||||
///
|
||||
/// # Language codes
|
||||
///
|
||||
/// ISO 639-2/3 codes are used: "eng" (English), "fra" (French), "deu" (German),
|
||||
/// "spa" (Spanish), "jpn" (Japanese), "chi_sim" (Simplified Chinese), etc.
|
||||
///
|
||||
/// # Missing language handling
|
||||
///
|
||||
/// If a requested language pack is not installed, extraction proceeds with
|
||||
/// an OCR_LANGUAGE_UNAVAILABLE diagnostic and falls back to eng if available.
|
||||
/// Run `pdftract doctor tesseract-langs` to verify installed languages.
|
||||
///
|
||||
/// # Docker image variants
|
||||
///
|
||||
/// - `pdftract:default`: No language packs bundled (OCR not available)
|
||||
/// - `pdftract:ocr`: Bundles eng + common languages (~150 MB)
|
||||
/// - `pdftract:full`: Bundles all 100+ languages (~600 MB)
|
||||
///
|
||||
/// See docs/notes/ocr-language-packs.md for the full distribution strategy.
|
||||
pub ocr_language: Vec<String>,
|
||||
}
|
||||
|
||||
impl Default for ExtractionOptions {
|
||||
|
|
@ -126,6 +153,7 @@ impl Default for ExtractionOptions {
|
|||
memory_budget_mb: Self::default_memory_budget_mb(),
|
||||
full_render: false,
|
||||
ocr_dpi_override: None,
|
||||
ocr_language: vec!["eng".to_string()],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -158,6 +186,7 @@ impl ExtractionOptions {
|
|||
Self {
|
||||
receipts,
|
||||
ocr_dpi_override: None,
|
||||
ocr_language: vec!["eng".to_string()],
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
|
@ -167,6 +196,7 @@ impl ExtractionOptions {
|
|||
Ok(Self {
|
||||
receipts: ReceiptsMode::from_str(receipts)?,
|
||||
ocr_dpi_override: None,
|
||||
ocr_language: vec!["eng".to_string()],
|
||||
..Default::default()
|
||||
})
|
||||
}
|
||||
|
|
@ -185,6 +215,7 @@ impl ExtractionOptions {
|
|||
max_parallel_pages: max_parallel_pages.max(1),
|
||||
memory_budget_mb: memory_budget_mb.max(64),
|
||||
ocr_dpi_override: None,
|
||||
ocr_language: vec!["eng".to_string()],
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
|
@ -324,4 +355,24 @@ mod tests {
|
|||
let opts = ExtractionOptions::with_parallelism(4, 0);
|
||||
assert_eq!(opts.memory_budget_mb, 64);
|
||||
}
|
||||
|
||||
#[test]
fn test_extraction_options_default_ocr_language() {
    // The default options request English only.
    let languages = ExtractionOptions::default().ocr_language;
    assert_eq!(languages, vec!["eng"]);
}
|
||||
|
||||
#[test]
fn test_extraction_options_serialize_ocr_language() {
    // Despite its name, this test covers deserialization: an explicit
    // `ocr_language` list in the JSON input must be read back unchanged.
    let parsed: ExtractionOptions =
        serde_json::from_str("{\"ocr_language\":[\"eng\",\"fra\"]}").unwrap();
    let expected = vec!["eng", "fra"];
    assert_eq!(parsed.ocr_language, expected);
}
|
||||
|
||||
#[test]
fn test_extraction_options_deserialize_ocr_language_default() {
    // An empty JSON object must fall back to the default language list.
    let parsed: ExtractionOptions = serde_json::from_str("{}").unwrap();
    let expected = vec!["eng"];
    assert_eq!(parsed.ocr_language, expected);
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@
|
|||
//! - 1: extraction failed (PDF unreadable, encrypted without password, etc.)
|
||||
|
||||
use crate::receipts::Receipt;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sha2::{Digest, Sha256};
|
||||
use unicode_normalization::UnicodeNormalization;
|
||||
|
||||
|
|
@ -187,7 +188,7 @@ pub fn check_version_compatibility(
|
|||
///
|
||||
/// This represents a single text span extracted from a PDF page,
|
||||
/// with enough information to compute IoU and content hash.
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SpanData {
|
||||
/// The extracted text content.
|
||||
pub text: String,
|
||||
|
|
|
|||
|
|
@ -16,6 +16,13 @@ use serde::{Deserialize, Serialize};
|
|||
/// from reordering spans on the same line.
|
||||
const Y_BUCKET_SIZE: f64 = 2.0;
|
||||
|
||||
/// Edge presence threshold for merged cell detection (80%).
|
||||
///
|
||||
/// An interior edge is considered "present" if at least 80% of its
|
||||
/// expected length is covered by clustered segments. This tolerates
|
||||
/// broken/dashed rules typical in PDFs exported from spreadsheets.
|
||||
const EDGE_PRESENCE_THRESHOLD: f32 = 0.8;
|
||||
|
||||
/// Bold indicator patterns in PostScript font names.
|
||||
///
|
||||
/// These patterns are used to detect bold fonts when the ForceBold flag
|
||||
|
|
@ -204,6 +211,299 @@ pub fn count_header_rows(cells: &[Cell], row_count: usize) -> u32 {
|
|||
header_count
|
||||
}
|
||||
|
||||
/// Detect and apply merged cells (rowspan/colspan) by examining missing interior edges.
|
||||
///
|
||||
/// This function implements merged cell detection (7.2.5) by checking which interior
|
||||
/// grid edges are present vs. missing. When the interior edge between two adjacent
|
||||
/// grid cells is absent, the cells are merged.
|
||||
///
|
||||
/// # Algorithm
|
||||
///
|
||||
/// 1. For each interior cell (not on the grid boundary), enumerate the four edges
|
||||
/// that should bound it (top, bottom, left, right).
|
||||
/// 2. An edge is "present" if at least 80% of its expected length is covered by
|
||||
/// clustered segments from the grid.
|
||||
/// 3. Missing right edge between cells (i, j) and (i+1, j) -> colspan extension.
|
||||
/// 4. Missing bottom edge between cells (i, j) and (i, j+1) -> rowspan extension.
|
||||
/// 5. Iterate until no more merges can be applied (transitive merges).
|
||||
/// 6. Absorbed cells are excluded from the final Vec<Cell>.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `cells` - The cells to merge (from `assign_spans_to_cells`)
|
||||
/// * `grid` - The grid candidate with row/col boundaries and segments
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A tuple of (merged_cells, diagnostics):
|
||||
/// - `merged_cells`: Cells with rowspan/colspan applied, absorbed cells removed
|
||||
/// - `diagnostics`: Diagnostic messages about merge operations
|
||||
///
|
||||
/// # Borderless Tables
|
||||
///
|
||||
/// For borderless tables (grid.segments is empty), this function returns the
|
||||
/// original cells unchanged with a diagnostic indicating that merged cell
|
||||
/// detection is a NO-OP for borderless tables.
|
||||
pub fn detect_merged_cells(
|
||||
mut cells: Vec<Cell>,
|
||||
grid: &super::GridCandidate,
|
||||
) -> (Vec<Cell>, Vec<String>) {
|
||||
let mut diagnostics = Vec::new();
|
||||
|
||||
// Borderless tables have no segments to infer from - NO-OP with diagnostic
|
||||
if grid.segments.is_empty() {
|
||||
diagnostics.push(
|
||||
"merged_cell_detection_skipped: borderless table has no segments for edge inference".to_string()
|
||||
);
|
||||
return (cells, diagnostics);
|
||||
}
|
||||
|
||||
let row_count = grid.row_count();
|
||||
let col_count = grid.col_count();
|
||||
|
||||
// Track which cells have been absorbed (removed from output)
|
||||
// Index is row * col_count + col
|
||||
let mut absorbed = vec![vec![false; col_count]; row_count];
|
||||
|
||||
// Track merges in a loop until no more merges can be applied
|
||||
let mut merges_applied = true;
|
||||
while merges_applied {
|
||||
merges_applied = false;
|
||||
|
||||
// Check each cell for merge opportunities
|
||||
for row in 0..row_count {
|
||||
for col in 0..col_count {
|
||||
// Skip if this cell was already absorbed
|
||||
if absorbed[row][col] {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Find the cell at this position to get current colspan/rowspan
|
||||
let cell_idx = cells.iter().position(|c| c.row == row && c.col == col);
|
||||
let cell_colspan = cell_idx.and_then(|idx| Some(cells[idx].colspan as usize)).unwrap_or(1);
|
||||
let cell_rowspan = cell_idx.and_then(|idx| Some(cells[idx].rowspan as usize)).unwrap_or(1);
|
||||
|
||||
// Check right edge (colspan) - check at the merged boundary
|
||||
let next_col = col + cell_colspan;
|
||||
if next_col < col_count && !absorbed[row][next_col] {
|
||||
if !is_vertical_edge_present(grid, next_col, row, row + 1) {
|
||||
// Missing right edge - merge with cell to the right
|
||||
merge_cells_right(&mut cells, &mut absorbed, row, col, col_count, &mut diagnostics);
|
||||
merges_applied = true;
|
||||
// After merging, this cell may have absorbed more, so continue
|
||||
// but don't check other directions for this cell in this iteration
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Check bottom edge (rowspan) - check at the merged boundary
|
||||
let next_row = row + cell_rowspan;
|
||||
if next_row < row_count && !absorbed[next_row][col] {
|
||||
if !is_horizontal_edge_present(grid, next_row, col, col + 1) {
|
||||
// Missing bottom edge - merge with cell below
|
||||
merge_cells_down(&mut cells, &mut absorbed, row, col, col_count, &mut diagnostics);
|
||||
merges_applied = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Remove absorbed cells from the output
|
||||
let merged_cells: Vec<Cell> = cells.into_iter()
|
||||
.filter(|c| !absorbed[c.row][c.col])
|
||||
.collect();
|
||||
|
||||
(merged_cells, diagnostics)
|
||||
}
|
||||
|
||||
/// Check if a vertical edge at a given x coordinate is present between two rows.
|
||||
///
|
||||
/// The edge is present if at least 80% of its length is covered by vertical segments.
|
||||
fn is_vertical_edge_present(
|
||||
grid: &super::GridCandidate,
|
||||
edge_x_idx: usize, // Index of the vertical line in col_xs
|
||||
row_start: usize, // Starting row index (inclusive)
|
||||
row_end: usize, // Ending row index (exclusive)
|
||||
) -> bool {
|
||||
let x = grid.col_xs[edge_x_idx];
|
||||
let y_top = grid.row_ys[row_start];
|
||||
let y_bottom = grid.row_ys[row_end];
|
||||
let expected_length = (y_top - y_bottom).abs();
|
||||
|
||||
if expected_length < 0.1 {
|
||||
return true; // Degenerate edge, consider present
|
||||
}
|
||||
|
||||
// Find all vertical segments that are collinear with this edge
|
||||
let mut covered_length = 0.0;
|
||||
const EPSILON: f32 = 1.0;
|
||||
|
||||
for segment in &grid.segments {
|
||||
if segment.orientation != super::SegmentOrientation::Vertical {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if segment is collinear (same x within epsilon)
|
||||
if (segment.x0 - x).abs() > EPSILON {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if segment overlaps with the expected edge range
|
||||
let seg_y0 = segment.y0.max(y_bottom);
|
||||
let seg_y1 = segment.y1.min(y_top);
|
||||
|
||||
if seg_y1 > seg_y0 {
|
||||
covered_length += seg_y1 - seg_y0;
|
||||
}
|
||||
}
|
||||
|
||||
covered_length / expected_length >= EDGE_PRESENCE_THRESHOLD
|
||||
}
|
||||
|
||||
/// Check if a horizontal edge at a given y coordinate is present between two columns.
|
||||
///
|
||||
/// The edge is present if at least 80% of its length is covered by horizontal segments.
|
||||
fn is_horizontal_edge_present(
|
||||
grid: &super::GridCandidate,
|
||||
edge_y_idx: usize, // Index of the horizontal line in row_ys
|
||||
col_start: usize, // Starting column index (inclusive)
|
||||
col_end: usize, // Ending column index (exclusive)
|
||||
) -> bool {
|
||||
let y = grid.row_ys[edge_y_idx];
|
||||
let x_left = grid.col_xs[col_start];
|
||||
let x_right = grid.col_xs[col_end];
|
||||
let expected_length = x_right - x_left;
|
||||
|
||||
if expected_length < 0.1 {
|
||||
return true; // Degenerate edge, consider present
|
||||
}
|
||||
|
||||
// Find all horizontal segments that are collinear with this edge
|
||||
let mut covered_length = 0.0;
|
||||
const EPSILON: f32 = 1.0;
|
||||
|
||||
for segment in &grid.segments {
|
||||
if segment.orientation != super::SegmentOrientation::Horizontal {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if segment is collinear (same y within epsilon)
|
||||
if (segment.y0 - y).abs() > EPSILON {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if segment overlaps with the expected edge range
|
||||
let seg_x0 = segment.x0.max(x_left);
|
||||
let seg_x1 = segment.x1.min(x_right);
|
||||
|
||||
if seg_x1 > seg_x0 {
|
||||
covered_length += seg_x1 - seg_x0;
|
||||
}
|
||||
}
|
||||
|
||||
covered_length / expected_length >= EDGE_PRESENCE_THRESHOLD
|
||||
}
|
||||
|
||||
/// Merge cell at (row, col) with cell to its right at the merged boundary.
|
||||
///
|
||||
/// Updates the surviving cell's colspan and bbox, marks the absorbed cell.
|
||||
fn merge_cells_right(
|
||||
cells: &mut Vec<Cell>,
|
||||
absorbed: &mut Vec<Vec<bool>>,
|
||||
row: usize,
|
||||
col: usize,
|
||||
col_count: usize,
|
||||
diagnostics: &mut Vec<String>,
|
||||
) {
|
||||
// Find the surviving cell
|
||||
let survivor_idx = cells.iter().position(|c| c.row == row && c.col == col && !absorbed[row][col]);
|
||||
|
||||
if let Some(s_idx) = survivor_idx {
|
||||
// Find the furthest column this cell already spans to
|
||||
let current_colspan = cells[s_idx].colspan as usize;
|
||||
let next_col = col + current_colspan;
|
||||
|
||||
if next_col >= col_count || absorbed[row][next_col] {
|
||||
return; // Already absorbed or out of bounds
|
||||
}
|
||||
|
||||
// Find the cell to absorb at the merged boundary
|
||||
let target_idx = cells.iter().position(|c| c.row == row && c.col == next_col && !absorbed[row][next_col]);
|
||||
if let Some(t_idx) = target_idx {
|
||||
// Clone data before mutating cells
|
||||
let absorbed_content = cells[t_idx].content.clone();
|
||||
let absorbed_bbox = cells[t_idx].bbox[2];
|
||||
let absorbed_colspan = cells[t_idx].colspan;
|
||||
|
||||
// Update survivor's colspan and bbox (add the target's colspan, not just 1)
|
||||
cells[s_idx].colspan += absorbed_colspan;
|
||||
cells[s_idx].bbox[2] = absorbed_bbox; // Expand x1
|
||||
|
||||
// Transfer content from absorbed cell to survivor
|
||||
cells[s_idx].content.extend(absorbed_content);
|
||||
|
||||
// Mark absorbed cell
|
||||
absorbed[row][next_col] = true;
|
||||
|
||||
diagnostics.push(format!(
|
||||
"merged_cells: cell ({},{}) colspan={} absorbed cell ({},{})",
|
||||
row, col, cells[s_idx].colspan, row, next_col
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Merge cell at (row, col) with cell below it at the merged boundary.
|
||||
///
|
||||
/// Updates the surviving cell's rowspan and bbox, marks the absorbed cell.
|
||||
fn merge_cells_down(
|
||||
cells: &mut Vec<Cell>,
|
||||
absorbed: &mut Vec<Vec<bool>>,
|
||||
row: usize,
|
||||
col: usize,
|
||||
col_count: usize,
|
||||
diagnostics: &mut Vec<String>,
|
||||
) {
|
||||
// Find the surviving cell
|
||||
let survivor_idx = cells.iter().position(|c| c.row == row && c.col == col && !absorbed[row][col]);
|
||||
|
||||
if let Some(s_idx) = survivor_idx {
|
||||
// Find the furthest row this cell already spans to
|
||||
let current_rowspan = cells[s_idx].rowspan as usize;
|
||||
let next_row = row + current_rowspan;
|
||||
|
||||
if next_row >= absorbed.len() || absorbed[next_row][col] {
|
||||
return; // Already absorbed or out of bounds
|
||||
}
|
||||
|
||||
// Find the cell to absorb at the merged boundary
|
||||
let target_idx = cells.iter().position(|c| c.row == next_row && c.col == col && !absorbed[next_row][col]);
|
||||
if let Some(t_idx) = target_idx {
|
||||
// Clone data before mutating cells
|
||||
let absorbed_content = cells[t_idx].content.clone();
|
||||
let absorbed_bbox_y0 = cells[t_idx].bbox[1];
|
||||
let absorbed_rowspan = cells[t_idx].rowspan;
|
||||
|
||||
// Update survivor's rowspan and bbox (add the target's rowspan, not just 1)
|
||||
cells[s_idx].rowspan += absorbed_rowspan;
|
||||
cells[s_idx].bbox[1] = absorbed_bbox_y0; // Expand y0 downward
|
||||
|
||||
// Transfer content from absorbed cell to survivor
|
||||
cells[s_idx].content.extend(absorbed_content);
|
||||
|
||||
// Mark absorbed cell
|
||||
absorbed[next_row][col] = true;
|
||||
|
||||
diagnostics.push(format!(
|
||||
"merged_cells: cell ({},{}) rowspan={} absorbed cell ({},{})",
|
||||
row, col, cells[s_idx].rowspan, next_row, col
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A text span for table cell assignment.
|
||||
///
|
||||
/// Minimal span representation used during cell assignment.
|
||||
|
|
@ -1321,4 +1621,430 @@ mod tests {
|
|||
// Should count 1 header row (bold signal)
|
||||
assert_eq!(count_header_rows(&cells, 2), 1);
|
||||
}
|
||||
|
||||
// Merged cell detection tests (7.2.5)
|
||||
|
||||
#[test]
fn test_detect_merged_cells_borderless_table_noop() {
    // A grid without any ruling segments gives merged-cell detection nothing to
    // inspect: every cell must come back untouched and a "skipped" diagnostic
    // must be reported.
    let xs = [50.0, 150.0, 250.0];
    let lattice: Vec<_> = [100.0, 200.0, 300.0]
        .iter()
        .flat_map(|&y| xs.iter().map(move |&x| (x, y)))
        .collect();

    let mut grid = GridCandidate::from_intersections(lattice, Vec::new()).unwrap();
    // Make the absence of segments explicit.
    grid.segments = Vec::new();

    // Four cells: rows 0-1 by columns 0-1, each 100 x 100.
    let mut cells = Vec::new();
    for (row, &y0) in [200.0, 100.0].iter().enumerate() {
        for (col, &x0) in [50.0, 150.0].iter().enumerate() {
            cells.push(Cell::new([x0, y0, x0 + 100.0, y0 + 100.0], row, col));
        }
    }

    let (merged, diagnostics) = detect_merged_cells(cells, &grid);

    // Nothing merged.
    assert_eq!(merged.len(), 4);
    assert_eq!(merged[0].colspan, 1);
    assert_eq!(merged[0].rowspan, 1);

    // The skip is reported.
    assert!(diagnostics
        .iter()
        .any(|d| d.contains("merged_cell_detection_skipped")));
}
|
||||
|
||||
#[test]
fn debug_test_colspan_3() {
    // Diagnostic-only run: prints the grid geometry, the resulting cells and the
    // diagnostics for a 4-column x 2-row table. It makes no assertions, so it only
    // fails if the pipeline panics.
    use crate::table::Segment;

    let xs = [50.0, 150.0, 250.0, 350.0, 450.0];
    let lattice: Vec<_> = [300.0, 200.0, 100.0]
        .iter()
        .flat_map(|&y| xs.iter().map(move |&x| (x, y)))
        .collect();

    // All three horizontal rules, plus verticals at x=50, x=450 and x=350 (full
    // height). No vertical is drawn at x=150 or x=250.
    let mut segments: Vec<_> = [300.0, 200.0, 100.0]
        .iter()
        .map(|&y| Segment::horizontal(y, 50.0, 450.0))
        .collect();
    segments.extend(
        [50.0, 450.0, 350.0]
            .iter()
            .map(|&x| Segment::vertical(x, 100.0, 300.0)),
    );

    let grid = GridCandidate::from_intersections(lattice, segments).unwrap();

    println!("Grid: {} rows x {} cols", grid.row_count(), grid.col_count());
    println!("row_ys: {:?}", grid.row_ys);
    println!("col_xs: {:?}", grid.col_xs);

    // Eight cells: rows 0-1 by columns 0-3, each 100 x 100.
    let mut cells = Vec::new();
    for (row, &y0) in [200.0, 100.0].iter().enumerate() {
        for (col, &x0) in [50.0, 150.0, 250.0, 350.0].iter().enumerate() {
            cells.push(Cell::new([x0, y0, x0 + 100.0, y0 + 100.0], row, col));
        }
    }

    let (merged, diagnostics) = detect_merged_cells(cells, &grid);

    println!("\nMerged cells: {}", merged.len());
    merged.iter().for_each(|cell| {
        println!(" cell ({},{}) colspan={} rowspan={}", cell.row, cell.col, cell.colspan, cell.rowspan)
    });
    println!("\nDiagnostics:");
    diagnostics.iter().for_each(|line| println!(" {}", line));
}
|
||||
|
||||
#[test]
fn test_detect_merged_cells_colspan_3_critical_test() {
    // Critical plan scenario: a header cell spanning three columns.
    // Table: 4 columns x 2 rows. In row 0 the vertical rules at x=150 and x=250
    // are not drawn, so (0,0) should swallow (0,1) and (0,2).
    use crate::table::Segment;

    let xs = [50.0, 150.0, 250.0, 350.0, 450.0];
    let lattice: Vec<_> = [300.0, 200.0, 100.0]
        .iter()
        .flat_map(|&y| xs.iter().map(move |&x| (x, y)))
        .collect();

    // Horizontal rules: top, middle and bottom, all full width.
    let mut segments: Vec<_> = [300.0, 200.0, 100.0]
        .iter()
        .map(|&y| Segment::horizontal(y, 50.0, 450.0))
        .collect();
    // Full-height verticals: left border, right border, divider between cols 2-3.
    segments.extend(
        [50.0, 450.0, 350.0]
            .iter()
            .map(|&x| Segment::vertical(x, 100.0, 300.0)),
    );
    // Dividers at x=150 and x=250 exist only alongside row 1 (y 100..200).
    segments.extend(
        [150.0, 250.0]
            .iter()
            .map(|&x| Segment::vertical(x, 100.0, 200.0)),
    );

    let grid = GridCandidate::from_intersections(lattice, segments).unwrap();

    // Eight cells: rows 0-1 by columns 0-3, each 100 x 100.
    let mut cells = Vec::new();
    for (row, &y0) in [200.0, 100.0].iter().enumerate() {
        for (col, &x0) in [50.0, 150.0, 250.0, 350.0].iter().enumerate() {
            cells.push(Cell::new([x0, y0, x0 + 100.0, y0 + 100.0], row, col));
        }
    }

    let (merged, diagnostics) = detect_merged_cells(cells, &grid);

    // Eight cells in, two absorbed into the header cell, six out.
    assert_eq!(merged.len(), 6);

    // The surviving header cell covers columns 0..=2.
    let header = merged.iter().find(|c| c.row == 0 && c.col == 0).unwrap();
    assert_eq!(header.colspan, 3);
    assert_eq!(header.rowspan, 1);
    assert_eq!(header.bbox[2], 350.0); // x1 expanded to cover absorbed cells

    // The last cell of the top row is untouched.
    let top_right = merged.iter().find(|c| c.row == 0 && c.col == 3).unwrap();
    assert_eq!(top_right.colspan, 1);

    // Merges are reported.
    assert!(diagnostics.iter().any(|d| d.contains("merged_cells")));
}
|
||||
|
||||
#[test]
fn test_detect_merged_cells_pure_rowspan() {
    // Vertical merge only.
    // Table: 3 columns x 2 rows. The middle horizontal rule (y=200) is not drawn
    // under column 0, so (0,0) should swallow (1,0).
    use crate::table::Segment;

    let xs = [50.0, 150.0, 250.0, 350.0];
    let lattice: Vec<_> = [300.0, 200.0, 100.0]
        .iter()
        .flat_map(|&y| xs.iter().map(move |&x| (x, y)))
        .collect();

    let mut segments = vec![
        Segment::horizontal(300.0, 50.0, 350.0),  // top border
        Segment::horizontal(200.0, 150.0, 350.0), // middle rule, starts at x=150
        Segment::horizontal(100.0, 50.0, 350.0),  // bottom border
    ];
    // Every vertical rule runs the full height.
    segments.extend(xs.iter().map(|&x| Segment::vertical(x, 100.0, 300.0)));

    let grid = GridCandidate::from_intersections(lattice, segments).unwrap();

    // Six cells: rows 0-1 by columns 0-2, each 100 x 100.
    let mut cells = Vec::new();
    for (row, &y0) in [200.0, 100.0].iter().enumerate() {
        for (col, &x0) in [50.0, 150.0, 250.0].iter().enumerate() {
            cells.push(Cell::new([x0, y0, x0 + 100.0, y0 + 100.0], row, col));
        }
    }

    let (merged, _diagnostics) = detect_merged_cells(cells, &grid);

    // Six cells in, one absorbed, five out.
    assert_eq!(merged.len(), 5);

    let tall = merged.iter().find(|c| c.row == 0 && c.col == 0).unwrap();
    assert_eq!(tall.rowspan, 2);
    assert_eq!(tall.colspan, 1);
    assert_eq!(tall.bbox[1], 100.0); // y0 expanded downward
}
|
||||
|
||||
#[test]
fn test_detect_merged_cells_diagonal_merge() {
    // Combined merge: one cell with rowspan=2 and colspan=2.
    // Table: 3 columns x 2 rows.
    //   Row 0: y in [200, 300], Row 1: y in [100, 200]
    //   Col 0: x in [50, 150], Col 1: [150, 250], Col 2: [250, 350]
    use crate::table::Segment;

    let xs = [50.0, 150.0, 250.0, 350.0];
    let lattice: Vec<_> = [300.0, 200.0, 100.0]
        .iter()
        .flat_map(|&y| xs.iter().map(move |&x| (x, y)))
        .collect();

    let segments = vec![
        Segment::horizontal(300.0, 50.0, 350.0),  // top border
        Segment::horizontal(200.0, 250.0, 350.0), // middle rule, drawn only under column 2
        Segment::horizontal(100.0, 50.0, 350.0),  // bottom border
        Segment::vertical(50.0, 100.0, 300.0),    // left border
        Segment::vertical(250.0, 200.0, 300.0),   // divider at x=250, drawn for y 200..300 only
        Segment::vertical(350.0, 100.0, 300.0),   // right border
        // No vertical rule is drawn at x=150.
    ];

    let grid = GridCandidate::from_intersections(lattice, segments).unwrap();

    // Six cells: rows 0-1 by columns 0-2, each 100 x 100.
    let mut cells = Vec::new();
    for (row, &y0) in [200.0, 100.0].iter().enumerate() {
        for (col, &x0) in [50.0, 150.0, 250.0].iter().enumerate() {
            cells.push(Cell::new([x0, y0, x0 + 100.0, y0 + 100.0], row, col));
        }
    }

    let (merged, _diagnostics) = detect_merged_cells(cells, &grid);

    // Expected survivors:
    //   (0,0) with rowspan=2, colspan=2 -- absorbs (0,1), (1,0), (1,1)
    //   (0,2) unchanged
    //   (1,2) unchanged
    assert_eq!(merged.len(), 3);

    let block = merged.iter().find(|c| c.row == 0 && c.col == 0).unwrap();
    assert_eq!(block.rowspan, 2);
    assert_eq!(block.colspan, 2);
    assert_eq!(block.bbox[1], 100.0); // y0 expanded
    assert_eq!(block.bbox[2], 250.0); // x1 expanded
}
|
||||
|
||||
#[test]
fn test_detect_merged_cells_no_merges_complete_grid() {
    // With every rule drawn across the whole table, no cell may be merged.
    use crate::table::Segment;

    let xs = [50.0, 150.0, 250.0, 350.0];
    let ys = [300.0, 200.0, 100.0];
    let lattice: Vec<_> = ys
        .iter()
        .flat_map(|&y| xs.iter().map(move |&x| (x, y)))
        .collect();

    // Three full-width horizontals followed by four full-height verticals.
    let segments: Vec<_> = ys
        .iter()
        .map(|&y| Segment::horizontal(y, 50.0, 350.0))
        .chain(xs.iter().map(|&x| Segment::vertical(x, 100.0, 300.0)))
        .collect();

    let grid = GridCandidate::from_intersections(lattice, segments).unwrap();

    // Six cells: rows 0-1 by columns 0-2, each 100 x 100.
    let mut cells = Vec::new();
    for (row, &y0) in [200.0, 100.0].iter().enumerate() {
        for (col, &x0) in [50.0, 150.0, 250.0].iter().enumerate() {
            cells.push(Cell::new([x0, y0, x0 + 100.0, y0 + 100.0], row, col));
        }
    }

    let (merged, diagnostics) = detect_merged_cells(cells, &grid);

    // Every cell survives with unit spans.
    assert_eq!(merged.len(), 6);
    merged.iter().for_each(|cell| {
        assert_eq!(cell.rowspan, 1);
        assert_eq!(cell.colspan, 1);
    });

    // And no merge was reported.
    assert!(!diagnostics.iter().any(|d| d.contains("merged_cells")));
}
|
||||
|
||||
#[test]
fn test_is_vertical_edge_present_full_coverage() {
    // A vertical rule drawn over the entire table height must be seen as present.
    let xs = [50.0, 150.0, 250.0];
    let lattice: Vec<_> = [300.0, 200.0, 100.0]
        .iter()
        .flat_map(|&y| xs.iter().map(move |&x| (x, y)))
        .collect();

    // Single rule at x=150 spanning y 100..300.
    let rule = crate::table::Segment::vertical(150.0, 100.0, 300.0);
    let grid = GridCandidate::from_intersections(lattice, vec![rule]).unwrap();

    // Edge at x=150 between rows 0-1 is fully covered.
    let present = is_vertical_edge_present(&grid, 1, 0, 1);
    assert!(present);
}
|
||||
|
||||
#[test]
fn test_is_vertical_edge_present_partial_coverage_below_threshold() {
    // A vertical rule covering less than 80% of the edge must be seen as absent.
    let xs = [50.0, 150.0, 250.0];
    let lattice: Vec<_> = [300.0, 200.0, 100.0]
        .iter()
        .flat_map(|&y| xs.iter().map(move |&x| (x, y)))
        .collect();

    // Rule at x=150 covering only y 200..250: 50pt of the 100pt edge.
    let stub = crate::table::Segment::vertical(150.0, 200.0, 250.0);
    let grid = GridCandidate::from_intersections(lattice, vec![stub]).unwrap();

    // 50% coverage is below the 80% threshold.
    let present = is_vertical_edge_present(&grid, 1, 0, 1);
    assert!(!present);
}
|
||||
|
||||
#[test]
fn test_is_horizontal_edge_present_full_coverage() {
    // A horizontal rule drawn over the entire table width must be seen as present.
    let xs = [50.0, 150.0, 250.0];
    let lattice: Vec<_> = [300.0, 200.0, 100.0]
        .iter()
        .flat_map(|&y| xs.iter().map(move |&x| (x, y)))
        .collect();

    // Single rule at y=200 spanning x 50..250.
    let rule = crate::table::Segment::horizontal(200.0, 50.0, 250.0);
    let grid = GridCandidate::from_intersections(lattice, vec![rule]).unwrap();

    // Edge at y=200 between cols 0-1 is fully covered.
    let present = is_horizontal_edge_present(&grid, 1, 0, 1);
    assert!(present);
}
|
||||
|
||||
#[test]
fn test_is_horizontal_edge_present_partial_coverage_above_threshold() {
    // A partially covered edge whose coverage is at or above the 80% threshold
    // must still be detected as present.
    let mut intersections = Vec::new();
    for &y in &[300.0, 200.0, 100.0] {
        for &x in &[50.0, 150.0, 250.0] {
            intersections.push((x, y));
        }
    }

    // The edge under test runs from x=50 to x=150 (100pt). The segment must end
    // at x=135 to cover 85pt of it; a segment ending beyond x=150 would be clipped
    // to the full edge and exercise the 100% case instead of the partial one.
    let segments = vec![
        crate::table::Segment::horizontal(200.0, 50.0, 135.0), // Covers 85% of edge
    ];

    let grid = GridCandidate::from_intersections(intersections, segments).unwrap();

    // Edge at y=200 between cols 0-1 should be present (85% >= 80% threshold)
    assert!(is_horizontal_edge_present(&grid, 1, 0, 1));
}
|
||||
|
||||
#[test]
fn test_detect_merged_cells_transitive_merge() {
    // Chained merges: (0,0) absorbs (0,1), then (0,2), then (0,3).
    // Table: 4 columns x 2 rows with no interior vertical rules at all, so each
    // row should collapse into a single cell.
    use crate::table::Segment;

    let xs = [50.0, 150.0, 250.0, 350.0, 450.0];
    let ys = [300.0, 200.0, 100.0];
    let lattice: Vec<_> = ys
        .iter()
        .flat_map(|&y| xs.iter().map(move |&x| (x, y)))
        .collect();

    // Three full-width horizontals, then only the left and right borders
    // (nothing at x=150, 250 or 350).
    let segments: Vec<_> = ys
        .iter()
        .map(|&y| Segment::horizontal(y, 50.0, 450.0))
        .chain(
            [50.0, 450.0]
                .iter()
                .map(|&x| Segment::vertical(x, 100.0, 300.0)),
        )
        .collect();

    let grid = GridCandidate::from_intersections(lattice, segments).unwrap();

    // Eight cells: rows 0-1 by columns 0-3, each 100 x 100.
    let mut cells = Vec::new();
    for (row, &y0) in [200.0, 100.0].iter().enumerate() {
        for (col, &x0) in [50.0, 150.0, 250.0, 350.0].iter().enumerate() {
            cells.push(Cell::new([x0, y0, x0 + 100.0, y0 + 100.0], row, col));
        }
    }

    let (merged, _diagnostics) = detect_merged_cells(cells, &grid);

    // Two survivors (six absorbed: three per row):
    //   (0,0) colspan=4
    //   (1,0) colspan=4
    assert_eq!(merged.len(), 2);

    let top = merged.iter().find(|c| c.row == 0 && c.col == 0).unwrap();
    assert_eq!(top.colspan, 4);
    assert_eq!(top.bbox[2], 450.0); // x1 expanded to cover all 4 columns

    let bottom = merged.iter().find(|c| c.row == 1 && c.col == 0).unwrap();
    assert_eq!(bottom.colspan, 4);
    assert_eq!(bottom.bbox[2], 450.0);
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -104,6 +104,36 @@ impl TableDetector {
|
|||
self.build_grids(intersections, segments)
|
||||
}
|
||||
|
||||
/// Detect tables on a page using both line-based and borderless pipelines.
|
||||
///
|
||||
/// This is the main entry point for table detection (7.2 coordinator).
|
||||
/// It runs both detection pipelines and combines the results:
|
||||
/// 1. Line-based detection for bordered tables (m/l/S, re/S, re/f operators)
|
||||
/// 2. Borderless detection for tables without ruling lines (x0 alignment heuristic)
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `ctx` - The page context containing page dict and content bytes
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A vector of grid candidates representing all detected tables.
|
||||
pub fn detect(&self, ctx: &PageContext) -> Vec<GridCandidate> {
|
||||
let mut all_grids = Vec::new();
|
||||
|
||||
// Step 1: Run line-based detection (primary pipeline)
|
||||
let line_based = self.detect_line_based(ctx);
|
||||
all_grids.extend(line_based);
|
||||
|
||||
// Step 2: Run borderless detection (secondary pipeline)
|
||||
// Note: In a full implementation, we would skip regions already
|
||||
// covered by line-based tables to avoid duplicates.
|
||||
let borderless = self.detect_borderless(ctx);
|
||||
all_grids.extend(borderless);
|
||||
|
||||
all_grids
|
||||
}
|
||||
|
||||
/// Detect borderless tables using x0 alignment heuristic.
|
||||
///
|
||||
/// This method analyzes text positioning to find tables without ruling lines:
|
||||
|
|
|
|||
|
|
@ -21,11 +21,16 @@ mod detector;
|
|||
mod segment;
|
||||
mod grid;
|
||||
mod cell;
|
||||
mod output;
|
||||
|
||||
pub use detector::TableDetector;
|
||||
pub use segment::{Segment, SegmentOrientation};
|
||||
pub use grid::GridCandidate;
|
||||
pub use cell::{Cell, TableSpan, detect_merged_cells};
|
||||
pub use output::{grid_to_table_json, detect_two_page_tables};
|
||||
|
||||
// Re-export cell types for use in extract module
|
||||
pub use cell::Cell as TableCell;
|
||||
|
||||
use crate::parser::pages::PageDict;
|
||||
|
||||
|
|
|
|||
481
crates/pdftract-core/src/table/output.rs
Normal file
481
crates/pdftract-core/src/table/output.rs
Normal file
|
|
@ -0,0 +1,481 @@
|
|||
//! Table JSON output conversion (7.2.6).
|
||||
//!
|
||||
//! This module handles the conversion from detected table structures
|
||||
//! (GridCandidate, Cell) to the JSON output format (TableJson, RowJson, CellJson).
|
||||
|
||||
use crate::schema::{TableJson, RowJson, CellJson};
|
||||
use crate::table::{GridCandidate, Cell};
|
||||
use crate::table::cell::TableSpan;
|
||||
use anyhow::Result;
|
||||
|
||||
/// Distance from page edge to consider a table as "continued" (50 pt).
|
||||
const CONTINUED_THRESHOLD: f32 = 50.0;
|
||||
|
||||
/// Maximum RMSE for column alignment similarity (5 pt).
|
||||
const COLUMN_SIMILARITY_RMSE: f32 = 5.0;
|
||||
|
||||
/// Convert a detected table (grid + cells) to TableJson output format.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `grid` - The grid candidate representing the table geometry
|
||||
/// * `cells` - The cells with their assigned content
|
||||
/// * `page_index` - The page index where this table appears
|
||||
/// * `detection_method` - Either "line_based" or "borderless"
|
||||
/// * `continued` - Whether this table continues on the next page
|
||||
/// * `continued_from_prev` - Whether this table is a continuation from the previous page
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A `TableJson` ready for serialization.
|
||||
pub fn grid_to_table_json(
|
||||
grid: &GridCandidate,
|
||||
cells: &[Cell],
|
||||
page_index: usize,
|
||||
detection_method: &str,
|
||||
continued: bool,
|
||||
continued_from_prev: bool,
|
||||
) -> TableJson {
|
||||
// Build rows from cells
|
||||
let rows = build_rows_from_cells(cells, grid);
|
||||
|
||||
// Count header rows (should already be set on cells)
|
||||
let header_rows = cells.iter()
|
||||
.filter(|c| c.is_header_row)
|
||||
.map(|c| c.row)
|
||||
.collect::<std::collections::HashSet<_>>()
|
||||
.len() as u32;
|
||||
|
||||
TableJson {
|
||||
id: format!("table_{}", page_index),
|
||||
bbox: [
|
||||
grid.bbox[0] as f64,
|
||||
grid.bbox[1] as f64,
|
||||
grid.bbox[2] as f64,
|
||||
grid.bbox[3] as f64,
|
||||
],
|
||||
rows,
|
||||
header_rows,
|
||||
detection_method: detection_method.to_string(),
|
||||
continued,
|
||||
continued_from_prev,
|
||||
page_index,
|
||||
}
|
||||
}
|
||||
|
||||
/// Build RowJson structures from cells.
|
||||
///
|
||||
/// Groups cells by row index and creates RowJson for each.
|
||||
fn build_rows_from_cells(cells: &[Cell], grid: &GridCandidate) -> Vec<RowJson> {
|
||||
let mut row_map: std::collections::HashMap<usize, Vec<&Cell>> = std::collections::HashMap::new();
|
||||
|
||||
// Group cells by row
|
||||
for cell in cells {
|
||||
row_map.entry(cell.row).or_insert_with(Vec::new).push(cell);
|
||||
}
|
||||
|
||||
// Create rows in order (top to bottom = row 0 to row_count-1)
|
||||
let mut rows = Vec::new();
|
||||
for row_idx in 0..grid.row_count() {
|
||||
if let Some(row_cells) = row_map.get(&row_idx) {
|
||||
// Convert cells to CellJson and sort by column
|
||||
let mut cells_json: Vec<CellJson> = row_cells.iter()
|
||||
.map(|c| cell_to_cell_json(c, grid))
|
||||
.collect();
|
||||
|
||||
// Sort by column index
|
||||
cells_json.sort_by_key(|c| c.col);
|
||||
|
||||
// Compute row bbox from all cells
|
||||
let row_bbox = compute_row_bbox(&cells_json);
|
||||
|
||||
// Check if this is a header row (all cells are header cells or first cell is header)
|
||||
let is_header = !cells_json.is_empty() &&
|
||||
cells_json.iter().all(|c| c.is_header_row);
|
||||
|
||||
rows.push(RowJson {
|
||||
bbox: row_bbox,
|
||||
cells: cells_json,
|
||||
is_header,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
rows
|
||||
}
|
||||
|
||||
/// Convert a Cell to CellJson.
|
||||
fn cell_to_cell_json(cell: &Cell, _grid: &GridCandidate) -> CellJson {
|
||||
// Build span references (indices into the page-level spans array)
|
||||
// For now, use empty vec since we don't have the span indices here
|
||||
let spans = Vec::new();
|
||||
|
||||
// Concatenate text from all spans in the cell
|
||||
let text = cell.content.iter()
|
||||
.map(|s| s.text.as_str())
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ");
|
||||
|
||||
CellJson {
|
||||
bbox: [
|
||||
cell.bbox[0] as f64,
|
||||
cell.bbox[1] as f64,
|
||||
cell.bbox[2] as f64,
|
||||
cell.bbox[3] as f64,
|
||||
],
|
||||
text,
|
||||
spans,
|
||||
row: cell.row,
|
||||
col: cell.col,
|
||||
rowspan: cell.rowspan,
|
||||
colspan: cell.colspan,
|
||||
is_header_row: cell.is_header_row,
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute the bounding box for a row from its cells.
|
||||
fn compute_row_bbox(cells: &[CellJson]) -> [f64; 4] {
|
||||
if cells.is_empty() {
|
||||
return [0.0, 0.0, 0.0, 0.0];
|
||||
}
|
||||
|
||||
let mut x0 = cells[0].bbox[0];
|
||||
let mut y0 = cells[0].bbox[1];
|
||||
let mut x1 = cells[0].bbox[2];
|
||||
let mut y1 = cells[0].bbox[3];
|
||||
|
||||
for cell in &cells[1..] {
|
||||
x0 = x0.min(cell.bbox[0]);
|
||||
y0 = y0.min(cell.bbox[1]);
|
||||
x1 = x1.max(cell.bbox[2]);
|
||||
y1 = y1.max(cell.bbox[3]);
|
||||
}
|
||||
|
||||
[x0, y0, x1, y1]
|
||||
}
|
||||
|
||||
/// Detect two-page table continuation between adjacent pages.
|
||||
///
|
||||
/// This function examines tables on adjacent pages and determines if they
|
||||
/// represent a single table split across pages.
|
||||
///
|
||||
/// # Algorithm
|
||||
///
|
||||
/// For each pair of tables on page N and page N+1:
|
||||
/// 1. Check if the table on page N ends within CONTINUED_THRESHOLD (50 pt) of page bottom
|
||||
/// 2. Check if the table on page N+1 starts within CONTINUED_THRESHOLD (50 pt) of page top
|
||||
/// 3. Verify both tables have the same column count
|
||||
/// 4. Verify column x-positions are similar (RMSE < COLUMN_SIMILARITY_RMSE)
|
||||
///
|
||||
/// If all conditions are met, set:
|
||||
/// - page N table: `continued = true`
|
||||
/// - page N+1 table: `continued_from_prev = true`
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `all_tables` - Slice of tables for all pages, indexed by page_index
|
||||
/// * `page_heights` - Page heights in points, to determine page edges
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A vector of (page_index, continued, continued_from_prev) tuples for each table.
|
||||
pub fn detect_two_page_tables(
|
||||
all_tables: &[Vec<GridCandidate>],
|
||||
page_heights: &[f64],
|
||||
) -> Vec<Vec<(bool, bool)>> {
|
||||
let mut results = Vec::new();
|
||||
|
||||
for (page_idx, page_tables) in all_tables.iter().enumerate() {
|
||||
let page_flags = if page_tables.is_empty() {
|
||||
Vec::new()
|
||||
} else {
|
||||
page_tables.iter().map(|_| (false, false)).collect()
|
||||
};
|
||||
results.push(page_flags);
|
||||
}
|
||||
|
||||
// Check adjacent page pairs
|
||||
for page_idx in 0..all_tables.len().saturating_sub(1) {
|
||||
let current_page_height = page_heights.get(page_idx).copied().unwrap_or(792.0);
|
||||
let next_page_height = page_heights.get(page_idx + 1).copied().unwrap_or(792.0);
|
||||
|
||||
let current_tables = &all_tables[page_idx];
|
||||
let next_tables = &all_tables.get(page_idx + 1);
|
||||
|
||||
if let Some(next_page_tables) = next_tables {
|
||||
// For each table on current page, check if any table on next page continues it
|
||||
for (table_idx, current_table) in current_tables.iter().enumerate() {
|
||||
// Check if this table ends near page bottom
|
||||
let table_y0 = current_table.bbox[1] as f64;
|
||||
let is_near_bottom = table_y0 <= CONTINUED_THRESHOLD as f64;
|
||||
|
||||
if !is_near_bottom {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Look for a continuing table on the next page
|
||||
for (next_table_idx, next_table) in next_page_tables.iter().enumerate() {
|
||||
// Check if next table starts near page top
|
||||
let next_table_y1 = next_table.bbox[3] as f64;
|
||||
let page_top = next_page_height - CONTINUED_THRESHOLD as f64;
|
||||
let is_near_top = next_table_y1 >= page_top;
|
||||
|
||||
if !is_near_top {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check column count match
|
||||
if current_table.col_count() != next_table.col_count() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check column position similarity
|
||||
if columns_similar(current_table, next_table) {
|
||||
// Match! Set flags
|
||||
results[page_idx][table_idx].0 = true; // continued
|
||||
results[page_idx + 1][next_table_idx].1 = true; // continued_from_prev
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
results
|
||||
}
|
||||
|
||||
/// Check if two grids have similar column positions.
|
||||
///
|
||||
/// Computes RMSE between column x-positions and checks if it's below threshold.
|
||||
fn columns_similar(grid1: &GridCandidate, grid2: &GridCandidate) -> bool {
|
||||
if grid1.col_xs.len() != grid2.col_xs.len() {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Compute RMSE
|
||||
let sum_sq_error: f32 = grid1.col_xs.iter()
|
||||
.zip(grid2.col_xs.iter())
|
||||
.map(|(x1, x2)| (x1 - x2).powi(2))
|
||||
.sum();
|
||||
|
||||
let mse = sum_sq_error / grid1.col_xs.len() as f32;
|
||||
let rmse = mse.sqrt();
|
||||
|
||||
rmse < COLUMN_SIMILARITY_RMSE
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::table::Segment;
|
||||
|
||||
#[test]
fn test_grid_to_table_json_basic() {
    // Six intersection points: x in {50, 150} crossed with y in {100, 200, 300}.
    let corner_points = vec![
        (50.0, 100.0),
        (150.0, 100.0),
        (50.0, 200.0),
        (150.0, 200.0),
        (50.0, 300.0),
        (150.0, 300.0),
    ];
    let grid = GridCandidate::from_intersections(corner_points, vec![]).unwrap();

    // Two side-by-side cells constructed with indices (0, 0) and (0, 1).
    let left_cell = Cell::new([50.0, 200.0, 150.0, 300.0], 0, 0);
    let right_cell = Cell::new([150.0, 200.0, 250.0, 300.0], 0, 1);
    let cells = vec![left_cell, right_cell];

    let table = grid_to_table_json(&grid, &cells, 0, "line_based", false, false);

    // Identification and provenance fields.
    assert_eq!(table.id, "table_0");
    assert_eq!(table.page_index, 0);
    assert_eq!(table.detection_method, "line_based");

    // Both continuation flags were passed as `false`.
    assert!(!table.continued);
    assert!(!table.continued_from_prev);

    // The two cells end up in a single row.
    assert_eq!(table.rows.len(), 1);
}
|
||||
|
||||
#[test]
fn test_build_rows_from_cells() {
    // Grid spanning x in {50, 150} and y in {100, 200, 300}.
    let corner_points = vec![
        (50.0, 100.0),
        (150.0, 100.0),
        (50.0, 200.0),
        (150.0, 200.0),
        (50.0, 300.0),
        (150.0, 300.0),
    ];
    let grid = GridCandidate::from_intersections(corner_points, vec![]).unwrap();

    // First cell: indices (0, 0), holding a single span of text.
    let first = {
        let mut cell = Cell::new([50.0, 200.0, 150.0, 300.0], 0, 0);
        cell.content = vec![TableSpan::new(
            [50.0, 210.0, 90.0, 220.0],
            "Row1Col1".to_string(),
            "Helvetica".to_string(),
        )];
        cell
    };

    // Second cell: indices (0, 1), also holding a single span.
    let second = {
        let mut cell = Cell::new([150.0, 200.0, 250.0, 300.0], 0, 1);
        cell.content = vec![TableSpan::new(
            [160.0, 210.0, 190.0, 220.0],
            "Row1Col2".to_string(),
            "Helvetica".to_string(),
        )];
        cell
    };

    let rows = build_rows_from_cells(&[first, second], &grid);

    // One row, two cells, text preserved in left-to-right order.
    assert_eq!(rows.len(), 1);
    let only_row = &rows[0];
    assert_eq!(only_row.cells.len(), 2);
    assert_eq!(only_row.cells[0].text, "Row1Col1");
    assert_eq!(only_row.cells[1].text, "Row1Col2");
}
|
||||
|
||||
#[test]
fn test_columns_similar_identical() {
    // Both grids are built from exactly the same intersection points.
    let build_grid = || {
        GridCandidate::from_intersections(
            vec![
                (50.0, 100.0),
                (150.0, 100.0),
                (250.0, 100.0),
                (50.0, 200.0),
                (150.0, 200.0),
                (250.0, 200.0),
            ],
            vec![],
        )
        .unwrap()
    };

    let reference = build_grid();
    let duplicate = build_grid();

    // Zero offset between columns must count as similar.
    assert!(columns_similar(&reference, &duplicate));
}
|
||||
|
||||
#[test]
fn test_columns_similar_small_difference() {
    // Reference layout with columns at x = 50, 150, 250.
    let reference_points = vec![
        (50.0, 100.0),
        (150.0, 100.0),
        (250.0, 100.0),
        (50.0, 200.0),
        (150.0, 200.0),
        (250.0, 200.0),
    ];
    let reference = GridCandidate::from_intersections(reference_points, vec![]).unwrap();

    // Same layout with every column moved 2 pt to the right.
    let shifted_points = vec![
        (52.0, 100.0),
        (152.0, 100.0),
        (252.0, 100.0),
        (52.0, 200.0),
        (152.0, 200.0),
        (252.0, 200.0),
    ];
    let shifted = GridCandidate::from_intersections(shifted_points, vec![]).unwrap();

    // A uniform 2 pt offset gives an RMSE of 2.0, which is expected to be
    // below COLUMN_SIMILARITY_RMSE.
    assert!(columns_similar(&reference, &shifted));
}
|
||||
|
||||
#[test]
fn test_columns_similar_large_difference() {
    // Reference layout with columns at x = 50, 150, 250.
    let reference_points = vec![
        (50.0, 100.0),
        (150.0, 100.0),
        (250.0, 100.0),
        (50.0, 200.0),
        (150.0, 200.0),
        (250.0, 200.0),
    ];
    let reference = GridCandidate::from_intersections(reference_points, vec![]).unwrap();

    // Same layout with every column moved 10 pt to the right.
    let shifted_points = vec![
        (60.0, 100.0),
        (160.0, 100.0),
        (260.0, 100.0),
        (60.0, 200.0),
        (160.0, 200.0),
        (260.0, 200.0),
    ];
    let shifted = GridCandidate::from_intersections(shifted_points, vec![]).unwrap();

    // A uniform 10 pt offset gives an RMSE of 10.0, which is expected to
    // exceed COLUMN_SIMILARITY_RMSE, so the layouts must not match.
    assert!(!columns_similar(&reference, &shifted));
}
|
||||
|
||||
#[test]
fn test_columns_similar_different_count() {
    // Grid with three distinct x-positions.
    let wide_points = vec![
        (50.0, 100.0),
        (150.0, 100.0),
        (250.0, 100.0),
        (50.0, 200.0),
        (150.0, 200.0),
        (250.0, 200.0),
    ];
    let wide = GridCandidate::from_intersections(wide_points, vec![]).unwrap();

    // Grid with only two distinct x-positions.
    let narrow_points = vec![
        (50.0, 100.0),
        (150.0, 100.0),
        (50.0, 200.0),
        (150.0, 200.0),
    ];
    let narrow = GridCandidate::from_intersections(narrow_points, vec![]).unwrap();

    // A differing number of column positions is rejected outright.
    assert!(!columns_similar(&wide, &narrow));
}
|
||||
|
||||
#[test]
fn test_detect_two_page_tables_basic() {
    // Coordinates are PDF user space: y grows upward, page bottom is y = 0.
    //
    // Page 0: table spanning y = 40..150, so its bottom edge (y = 40) lies
    // within 50 pt of the page bottom.
    let grid0 = GridCandidate::from_intersections(vec![
        (50.0, 40.0), (150.0, 40.0),
        (50.0, 100.0), (150.0, 100.0),
        (50.0, 150.0), (150.0, 150.0),
    ], vec![]).unwrap();

    // Page 1: table spanning y = 650..750, so its top edge (y = 750) lies
    // within 50 pt of the page top at 792. The fixture stays inside the page;
    // the previous rows at y = 800 and y = 850 were above the 792 pt page.
    // NOTE(review): assumes CONTINUED_THRESHOLD is 50 pt — confirm.
    let grid1 = GridCandidate::from_intersections(vec![
        (50.0, 650.0), (150.0, 650.0),
        (50.0, 700.0), (150.0, 700.0),
        (50.0, 750.0), (150.0, 750.0),
    ], vec![]).unwrap();

    let all_tables = vec![vec![grid0], vec![grid1]];
    let page_heights = vec![792.0, 792.0];

    let results = detect_two_page_tables(&all_tables, &page_heights);

    // Page 0 table should be marked as continued
    assert!(results[0][0].0); // continued = true

    // Page 1 table should be marked as continued_from_prev
    assert!(results[1][0].1); // continued_from_prev = true
}
|
||||
|
||||
#[test]
fn test_detect_two_page_tables_no_continuation() {
    // Coordinates are PDF user space: y grows upward, page bottom is y = 0.
    //
    // Page 0: table spanning y = 200..300; its bottom edge (y = 200) is NOT
    // within 50 pt of the page bottom.
    let grid0 = GridCandidate::from_intersections(vec![
        (50.0, 200.0), (150.0, 200.0),
        (50.0, 300.0), (150.0, 300.0),
    ], vec![]).unwrap();

    // Page 1: table spanning y = 600..700; its top edge (y = 700) is NOT
    // within 50 pt of the page top at 792. The previous fixture reached
    // y = 800, which is above the page and counted as "near the top", so the
    // test only passed because of the page 0 condition.
    let grid1 = GridCandidate::from_intersections(vec![
        (50.0, 600.0), (150.0, 600.0),
        (50.0, 700.0), (150.0, 700.0),
    ], vec![]).unwrap();

    let all_tables = vec![vec![grid0], vec![grid1]];
    let page_heights = vec![792.0, 792.0];

    let results = detect_two_page_tables(&all_tables, &page_heights);

    // Neither table should be marked as continuation
    assert!(!results[0][0].0); // continued = false
    assert!(!results[1][0].1); // continued_from_prev = false
}
|
||||
|
||||
#[test]
fn test_detect_two_page_tables_different_column_count() {
    // Coordinates are PDF user space: y grows upward, page bottom is y = 0.
    //
    // Page 0: table with three column boundaries, bottom edge at y = 40
    // (near the page bottom).
    let grid0 = GridCandidate::from_intersections(vec![
        (50.0, 40.0), (150.0, 40.0), (250.0, 40.0),
        (50.0, 100.0), (150.0, 100.0), (250.0, 100.0),
    ], vec![]).unwrap();

    // Page 1: table with four column boundaries, top edge at y = 750 (near
    // the page top at 792). The fixture stays inside the page; the previous
    // top row at y = 800 was above the 792 pt page.
    let grid1 = GridCandidate::from_intersections(vec![
        (50.0, 700.0), (150.0, 700.0), (250.0, 700.0), (350.0, 700.0),
        (50.0, 750.0), (150.0, 750.0), (250.0, 750.0), (350.0, 750.0),
    ], vec![]).unwrap();

    let all_tables = vec![vec![grid0], vec![grid1]];
    let page_heights = vec![792.0, 792.0];

    let results = detect_two_page_tables(&all_tables, &page_heights);

    // Different column counts, should not be marked as continuation
    assert!(!results[0][0].0);
    assert!(!results[1][0].1);
}
|
||||
|
||||
#[test]
fn test_cell_to_cell_json_text_concatenation() {
    // Minimal grid: x in {50, 150}, y in {100, 200}.
    let corner_points = vec![
        (50.0, 100.0),
        (150.0, 100.0),
        (50.0, 200.0),
        (150.0, 200.0),
    ];
    let grid = GridCandidate::from_intersections(corner_points, vec![]).unwrap();

    // Two spans stacked in the same cell: "Hello" sits above "World".
    let upper_span = TableSpan::new(
        [50.0, 150.0, 90.0, 160.0],
        "Hello".to_string(),
        "Helvetica".to_string(),
    );
    let lower_span = TableSpan::new(
        [50.0, 140.0, 90.0, 150.0],
        "World".to_string(),
        "Helvetica".to_string(),
    );

    let mut cell = Cell::new([50.0, 100.0, 150.0, 200.0], 0, 0);
    cell.content = vec![upper_span, lower_span];

    let rendered = cell_to_cell_json(&cell, &grid);

    // Span texts are joined with a single space.
    assert_eq!(rendered.text, "Hello World");
}
|
||||
}
|
||||
203
docs/notes/ocr-language-packs.md
Normal file
203
docs/notes/ocr-language-packs.md
Normal file
|
|
@ -0,0 +1,203 @@
|
|||
# OCR Language Pack Distribution Strategy
|
||||
|
||||
**Status:** RESOLVED (OQ-04)
|
||||
**Date:** 2026-05-23
|
||||
**Bead:** pdftract-32x4
|
||||
|
||||
## Open Question OQ-04
|
||||
|
||||
> How are OCR language packs distributed? Bundled in the Docker image (size cost), downloaded on first use (network dependency), or required as an out-of-band install?
|
||||
|
||||
## Resolution Decision
|
||||
|
||||
Language packs are **bundled in Docker images** with a tiered distribution strategy:
|
||||
|
||||
| Docker Image Tag | Language Packs | Size | Use Case |
|
||||
|------------------|----------------|------|----------|
|
||||
| `pdftract:default` | None (OCR disabled) | ~4 MB | Vector-only extraction, no OCR capability |
|
||||
| `pdftract:ocr` | eng + 12 common langs | ~150 MB | Standard OCR use case, covers ~80% of world population by native speaker count |
|
||||
| `pdftract:full` | All 100+ languages | ~600 MB | Air-gapped deployments, comprehensive coverage |
|
||||
|
||||
## Rationale
|
||||
|
||||
### Why bundling?
|
||||
|
||||
1. **Air-gapped compatibility:** Bundling ensures OCR works in offline/air-gapped environments, with no network access needed to download packs on first use
|
||||
2. **Reproducibility:** Fixed language pack versions guarantee consistent extraction results across deployments
|
||||
3. **Simplicity:** No external dependency management for operators; `docker run` just works
|
||||
4. **Performance:** No download latency on first OCR request
|
||||
|
||||
### Size trade-offs
|
||||
|
||||
The `:ocr` variant adds ~150 MB to the image but covers the vast majority of use cases:
|
||||
- English (eng) - ~12 MB
|
||||
- German (deu) - ~10 MB
|
||||
- French (fra) - ~10 MB
|
||||
- Spanish (spa) - ~10 MB
|
||||
- Italian (ita) - ~9 MB
|
||||
- Portuguese (por) - ~10 MB
|
||||
- Japanese (jpn) - ~18 MB
|
||||
- Simplified Chinese (chi_sim) - ~25 MB
|
||||
- Traditional Chinese (chi_tra) - ~22 MB
|
||||
- Korean (kor) - ~12 MB
|
||||
- Russian (rus) - ~14 MB
|
||||
- Arabic (ara) - ~8 MB
|
||||
- Hindi (hin) - ~8 MB
|
||||
|
||||
Total: ~168 MB (uncompressed) → ~150 MB (after Docker layer compression)
|
||||
|
||||
The `:full` variant bundles all 100+ languages (~600 MB) for specialized deployments requiring comprehensive coverage.
|
||||
|
||||
### Why not download-on-first-use?
|
||||
|
||||
Download-on-first-use was rejected because:
|
||||
- Requires network connectivity at OCR time (breaks air-gapped deployments)
|
||||
- Adds complexity (pack download, validation, caching)
|
||||
- Introduces latency on first OCR request
|
||||
- Requires a trusted pack distribution endpoint
|
||||
- Version drift between pack downloads across deployments
|
||||
|
||||
### Why not out-of-band install?
|
||||
|
||||
Out-of-band install (e.g., `apt-get install tesseract-ocr-all`) was rejected because:
|
||||
- Platform-specific (Debian vs Alpine vs macOS vs Windows)
|
||||
- Version drift across package managers
|
||||
- Additional operator setup step
|
||||
- Inconsistent pack locations across distros
|
||||
|
||||
## Language Pack Allowlist
|
||||
|
||||
### `pdftract:ocr` bundle (Tier 1 - High Coverage)
|
||||
|
||||
| Code | Language | File | Size |
|
||||
|------|----------|------|------|
|
||||
| eng | English | eng.traineddata | 12 MB |
|
||||
| deu | German | deu.traineddata | 10 MB |
|
||||
| fra | French | fra.traineddata | 10 MB |
|
||||
| spa | Spanish | spa.traineddata | 10 MB |
|
||||
| ita | Italian | ita.traineddata | 9 MB |
|
||||
| por | Portuguese | por.traineddata | 10 MB |
|
||||
| jpn | Japanese | jpn.traineddata | 18 MB |
|
||||
| chi_sim | Simplified Chinese | chi_sim.traineddata | 25 MB |
|
||||
| chi_tra | Traditional Chinese | chi_tra.traineddata | 22 MB |
|
||||
| kor | Korean | kor.traineddata | 12 MB |
|
||||
| rus | Russian | rus.traineddata | 14 MB |
|
||||
| ara | Arabic | ara.traineddata | 8 MB |
|
||||
| hin | Hindi | hin.traineddata | 8 MB |
|
||||
|
||||
**Total: 13 languages, ~168 MB (uncompressed)**
|
||||
|
||||
This set covers:
|
||||
- All official UN languages (Arabic, Chinese, English, French, Russian, Spanish)
|
||||
- Major European languages (German, Italian, Portuguese)
|
||||
- Major Asian languages (Japanese, Korean, Hindi)
|
||||
- ~80% of world population by native speaker count
|
||||
|
||||
### `pdftract:full` bundle (Tier 2 - Complete)
|
||||
|
||||
Includes all 100+ language packs from the official Tesseract tessdata repository:
|
||||
- All Tier 1 languages
|
||||
- Indic languages (ben, guj, kan, mal, tam, tel, etc.)
|
||||
- Southeast Asian languages (tha, vie, etc.)
|
||||
- Central/Eastern European languages (pol, ces, slk, hun, ron, bul, etc.)
|
||||
- Nordic languages (dan, nor, swe, fin)
|
||||
- Turkic languages (tur, aze, uzb, etc.)
|
||||
- Hebrew (heb)
|
||||
- And 60+ others
|
||||
|
||||
**Total: 100+ languages, ~600 MB (uncompressed)**
|
||||
|
||||
## Implementation
|
||||
|
||||
### Pack Detection
|
||||
|
||||
The `detect_available_languages()` function in `crates/pdftract-core/src/ocr.rs` scans the tessdata directory for `<code>.traineddata` files and returns a `HashSet<String>` of available language codes.
|
||||
|
||||
The function respects the `$TESSDATA_PREFIX` environment variable and falls back to system-default tessdata paths:
|
||||
- Unix: `/usr/share/tessdata`, `/usr/local/share/tessdata`
|
||||
- Windows: `C:\Program Files\Tesseract-OCR\tessdata`
|
||||
|
||||
### Language Validation
|
||||
|
||||
When OCR is invoked with a requested language list (from `ExtractionOptions.ocr_language`), the `validate_ocr_languages()` function:
|
||||
|
||||
1. Checks which requested languages are available
|
||||
2. Emits `OCR_LANGUAGE_UNAVAILABLE` diagnostics for missing languages
|
||||
3. Filters out unavailable languages from the Tesseract language string
|
||||
4. Falls back to `eng` if no requested languages are available
|
||||
|
||||
This ensures extraction never hard-crashes due to missing packs — it degrades gracefully with diagnostics.
|
||||
|
||||
### Doctor Check
|
||||
|
||||
The `pdftract doctor tesseract-langs` command verifies:
|
||||
1. Tesseract binary is installed (version 5.x)
|
||||
2. `eng` language pack is present (required fallback)
|
||||
3. User-requested `--lang` languages are present
|
||||
|
||||
Exit code 1 if `eng` is missing; exit code 0 with WARN if optional languages are missing.
|
||||
|
||||
## Docker Implementation
|
||||
|
||||
### Dockerfile.ocr (Tier 1)
|
||||
|
||||
```dockerfile
|
||||
FROM pdftract:base
|
||||
|
||||
# Install Tesseract + Tier 1 language packs
|
||||
RUN apk add --no-cache \
|
||||
tesseract-ocr \
|
||||
tesseract-ocr-data-eng \
|
||||
tesseract-ocr-data-deu \
|
||||
tesseract-ocr-data-fra \
|
||||
tesseract-ocr-data-spa \
|
||||
tesseract-ocr-data-ita \
|
||||
tesseract-ocr-data-por \
|
||||
tesseract-ocr-data-jpn \
|
||||
tesseract-ocr-data-chi_sim \
|
||||
tesseract-ocr-data-chi_tra \
|
||||
tesseract-ocr-data-kor \
|
||||
tesseract-ocr-data-rus \
|
||||
tesseract-ocr-data-ara \
|
||||
tesseract-ocr-data-hin
|
||||
|
||||
# Verify packs are installed
|
||||
RUN pdftract doctor tesseract-langs --lang eng,deu,fra,spa,ita,por,jpn,chi_sim,chi_tra,kor,rus,ara,hin
|
||||
```
|
||||
|
||||
### Dockerfile.full (Tier 2)
|
||||
|
||||
```dockerfile
|
||||
FROM pdftract:base
|
||||
|
||||
# Install Tesseract + all language packs
|
||||
RUN apk add --no-cache \
|
||||
tesseract-ocr \
|
||||
tesseract-ocr-data-all
|
||||
|
||||
# Verify packs are installed
|
||||
RUN pdftract doctor tesseract-langs
|
||||
```
|
||||
|
||||
## Version Policy
|
||||
|
||||
Language packs are pinned to Tesseract 5.x series:
|
||||
- Base image uses `tesseract-ocr 5.3.x` from Alpine repos
|
||||
- Packs are from the same major version to ensure compatibility
|
||||
- Updates follow Alpine's security patch cadence
|
||||
|
||||
Per OQ-03, Tesseract version pinning is documented in the Dockerfile comments.
|
||||
|
||||
## References
|
||||
|
||||
- Plan Phase 5.4: Tesseract Integration
|
||||
- Plan Open Question OQ-04
|
||||
- Bead pdftract-32x4 (implementation)
|
||||
- crates/pdftract-core/src/ocr.rs (language detection)
|
||||
- crates/pdftract-cli/src/doctor.rs (language verification)
|
||||
|
||||
## Revision History
|
||||
|
||||
| Date | Change |
|
||||
|------|--------|
|
||||
| 2026-05-23 | Initial resolution; document created with OQ-04 decision |
|
||||
|
|
@ -512,7 +512,7 @@ Questions that the current plan does not yet resolve. Each question is tagged wi
|
|||
| OQ-01 | When does the 500-PDF private regression corpus become available, and what is its licensing for CI use? | Phase 0 sign-off | Project lead; recorded in `docs/notes/corpus-licensing.md` |
|
||||
| OQ-02 | Who owns the font-fingerprint database curation pipeline (`build/font-fingerprints.json`) — is it a maintainer task, a community contribution, or an automated harvest from Google Fonts / Adobe? | Phase 2.2 implementation | Maintainer; documented in `docs/research/font-fingerprinting.md` |
|
||||
| OQ-03 | What is the Tesseract version pinning policy — pin to a specific 5.x patch release, or follow latest stable? Pinning gives reproducibility; following stable gets bug fixes faster. | Phase 5.4 implementation | CI maintainer; recorded in `Dockerfile` comment |
|
||||
| OQ-04 | How are OCR language packs distributed? Bundled in the Docker image (size cost), downloaded on first use (network dependency), or required as an out-of-band install? | Phase 5.4 implementation | Distribution lead; documented in `docs/notes/ocr-language-packs.md` |
|
||||
| OQ-04 | How are OCR language packs distributed? Bundled in the Docker image (size cost), downloaded on first use (network dependency), or required as an out-of-band install? | **RESOLVED** 2026-05-23 by bead pdftract-32x4 | Bundled in Docker images with tiered strategy (`:ocr` ~150 MB, `:full` ~600 MB). Documented in `docs/notes/ocr-language-packs.md` |
|
||||
| OQ-05 | What is the realistic coverage gap of the 5,000-entry glyph-shape DB on real-world subsetted fonts? Is 70% Latin-only coverage acceptable for v1.0.0, or must Cyrillic/Greek hit the same bar? | Phase 2.5 sign-off | Accuracy lead; benchmarked against `tests/fixtures/encoding/` |
|
||||
| OQ-06 | Does the Phase 7.10 profile field-extraction DSL need user-defined parsers (custom JavaScript / Lua / WASM hooks)? Built-in `decimal`/`date`/`int`/`bool` may be insufficient for niche document types. | v1.1+ | Deferred — solicit user feedback after v1.0.0 |
|
||||
| OQ-07 | How is the MCP server discovered by Claude Desktop / Cursor — manual config edit, a "pdftract setup-mcp" subcommand that writes the config, or both? Config file locations differ across OSes. | Phase 6.7 sign-off | MCP integration lead; documented in `docs/integrations/mcp-clients.md` |
|
||||
|
|
|
|||
345
docs/schema/v1.0/pdftract.schema.json
Normal file
345
docs/schema/v1.0/pdftract.schema.json
Normal file
|
|
@ -0,0 +1,345 @@
|
|||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"$id": "https://pdftract.ardenone.com/schemas/v1.0/pdftract.schema.json",
|
||||
"title": "PDFtract Extraction Output Schema v1.0",
|
||||
"description": "JSON output schema for PDF text and structure extraction",
|
||||
"type": "object",
|
||||
"required": ["fingerprint", "schema_version", "pages", "metadata"],
|
||||
"properties": {
|
||||
"fingerprint": {
|
||||
"type": "string",
|
||||
"description": "PDF fingerprint for verification (format: pdftract-v1:<hex>)"
|
||||
},
|
||||
"schema_version": {
|
||||
"type": "string",
|
||||
"description": "Schema version (e.g., '1.0')",
|
||||
"enum": ["1.0"]
|
||||
},
|
||||
"pages": {
|
||||
"type": "array",
|
||||
"description": "Extracted pages",
|
||||
"items": {
|
||||
"$ref": "#/definitions/page"
|
||||
}
|
||||
},
|
||||
"metadata": {
|
||||
"$ref": "#/definitions/metadata"
|
||||
}
|
||||
},
|
||||
"definitions": {
|
||||
"page": {
|
||||
"type": "object",
|
||||
"required": ["index", "spans", "blocks", "tables"],
|
||||
"properties": {
|
||||
"index": {
|
||||
"type": "integer",
|
||||
"description": "0-based page index"
|
||||
},
|
||||
"spans": {
|
||||
"type": "array",
|
||||
"description": "Extracted text spans",
|
||||
"items": {
|
||||
"$ref": "#/definitions/span"
|
||||
}
|
||||
},
|
||||
"blocks": {
|
||||
"type": "array",
|
||||
"description": "Extracted structural blocks",
|
||||
"items": {
|
||||
"$ref": "#/definitions/block"
|
||||
}
|
||||
},
|
||||
"tables": {
|
||||
"type": "array",
|
||||
"description": "Extracted tables (cell-level structure)",
|
||||
"items": {
|
||||
"$ref": "#/definitions/table"
|
||||
}
|
||||
},
|
||||
"error": {
|
||||
"type": "string",
|
||||
"description": "Error message if extraction failed for this page"
|
||||
}
|
||||
}
|
||||
},
|
||||
"span": {
|
||||
"type": "object",
|
||||
"required": ["text", "bbox", "font", "size"],
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "The extracted text content"
|
||||
},
|
||||
"bbox": {
|
||||
"type": "array",
|
||||
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
|
||||
"items": {
|
||||
"type": "number"
|
||||
},
|
||||
"minItems": 4,
|
||||
"maxItems": 4
|
||||
},
|
||||
"font": {
|
||||
"type": "string",
|
||||
"description": "Font name or identifier"
|
||||
},
|
||||
"size": {
|
||||
"type": "number",
|
||||
"description": "Font size in points"
|
||||
},
|
||||
"confidence": {
|
||||
"type": "number",
|
||||
"description": "Confidence score (0.0 to 1.0) for OCR text",
|
||||
"minimum": 0.0,
|
||||
"maximum": 1.0
|
||||
},
|
||||
"receipt": {
|
||||
"$ref": "#/definitions/receipt"
|
||||
}
|
||||
}
|
||||
},
|
||||
"block": {
|
||||
"type": "object",
|
||||
"required": ["kind", "text", "bbox"],
|
||||
"properties": {
|
||||
"kind": {
|
||||
"type": "string",
|
||||
"description": "Block kind/type",
|
||||
"enum": ["paragraph", "heading", "list", "table", "figure"]
|
||||
},
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "The concatenated text content of all spans in the block"
|
||||
},
|
||||
"bbox": {
|
||||
"type": "array",
|
||||
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
|
||||
"items": {
|
||||
"type": "number"
|
||||
},
|
||||
"minItems": 4,
|
||||
"maxItems": 4
|
||||
},
|
||||
"level": {
|
||||
"type": "integer",
|
||||
"description": "Heading level (1-6) for 'heading' kind blocks",
|
||||
"minimum": 1,
|
||||
"maximum": 6
|
||||
},
|
||||
"table_index": {
|
||||
"type": "integer",
|
||||
"description": "Table index for 'table' kind blocks (points to tables array)",
|
||||
"minimum": 0
|
||||
},
|
||||
"receipt": {
|
||||
"$ref": "#/definitions/receipt"
|
||||
}
|
||||
}
|
||||
},
|
||||
"table": {
|
||||
"type": "object",
|
||||
"required": ["id", "bbox", "rows", "header_rows", "detection_method", "continued", "continued_from_prev", "page_index"],
|
||||
"properties": {
|
||||
"id": {
|
||||
"type": "string",
|
||||
"description": "Unique identifier for this table (e.g., 'table_0')"
|
||||
},
|
||||
"bbox": {
|
||||
"type": "array",
|
||||
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
|
||||
"items": {
|
||||
"type": "number"
|
||||
},
|
||||
"minItems": 4,
|
||||
"maxItems": 4
|
||||
},
|
||||
"rows": {
|
||||
"type": "array",
|
||||
"description": "Rows in this table, ordered top-to-bottom",
|
||||
"items": {
|
||||
"$ref": "#/definitions/row"
|
||||
}
|
||||
},
|
||||
"header_rows": {
|
||||
"type": "integer",
|
||||
"description": "Number of contiguous header rows at the top of the table",
|
||||
"minimum": 0
|
||||
},
|
||||
"detection_method": {
|
||||
"type": "string",
|
||||
"description": "Detection method used to identify this table",
|
||||
"enum": ["line_based", "borderless"]
|
||||
},
|
||||
"continued": {
|
||||
"type": "boolean",
|
||||
"description": "Whether this table continues on the next page"
|
||||
},
|
||||
"continued_from_prev": {
|
||||
"type": "boolean",
|
||||
"description": "Whether this table is a continuation from the previous page"
|
||||
},
|
||||
"page_index": {
|
||||
"type": "integer",
|
||||
"description": "Zero-based page index where this table appears",
|
||||
"minimum": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"row": {
|
||||
"type": "object",
|
||||
"required": ["bbox", "cells", "is_header"],
|
||||
"properties": {
|
||||
"bbox": {
|
||||
"type": "array",
|
||||
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
|
||||
"items": {
|
||||
"type": "number"
|
||||
},
|
||||
"minItems": 4,
|
||||
"maxItems": 4
|
||||
},
|
||||
"cells": {
|
||||
"type": "array",
|
||||
"description": "Cells in this row, ordered left-to-right",
|
||||
"items": {
|
||||
"$ref": "#/definitions/cell"
|
||||
}
|
||||
},
|
||||
"is_header": {
|
||||
"type": "boolean",
|
||||
"description": "Whether this row is a header row"
|
||||
}
|
||||
}
|
||||
},
|
||||
"cell": {
|
||||
"type": "object",
|
||||
"required": ["bbox", "text", "spans", "row", "col", "rowspan", "colspan", "is_header_row"],
|
||||
"properties": {
|
||||
"bbox": {
|
||||
"type": "array",
|
||||
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
|
||||
"items": {
|
||||
"type": "number"
|
||||
},
|
||||
"minItems": 4,
|
||||
"maxItems": 4
|
||||
},
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "The concatenated text content of all spans in the cell"
|
||||
},
|
||||
"spans": {
|
||||
"type": "array",
|
||||
"description": "References to spans in the page's spans array",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"row": {
|
||||
"type": "integer",
|
||||
"description": "Zero-based row index within the table",
|
||||
"minimum": 0
|
||||
},
|
||||
"col": {
|
||||
"type": "integer",
|
||||
"description": "Zero-based column index within the table",
|
||||
"minimum": 0
|
||||
},
|
||||
"rowspan": {
|
||||
"type": "integer",
|
||||
"description": "Number of rows this cell spans (default 1)",
|
||||
"minimum": 1
|
||||
},
|
||||
"colspan": {
|
||||
"type": "integer",
|
||||
"description": "Number of columns this cell spans (default 1)",
|
||||
"minimum": 1
|
||||
},
|
||||
"is_header_row": {
|
||||
"type": "boolean",
|
||||
"description": "Whether this cell is in a header row"
|
||||
}
|
||||
}
|
||||
},
|
||||
"receipt": {
|
||||
"type": "object",
|
||||
"required": ["pdf_fingerprint", "page_index", "bbox", "content_hash", "extraction_version"],
|
||||
"properties": {
|
||||
"pdf_fingerprint": {
|
||||
"type": "string",
|
||||
"description": "The PDF fingerprint"
|
||||
},
|
||||
"page_index": {
|
||||
"type": "integer",
|
||||
"description": "The page index"
|
||||
},
|
||||
"bbox": {
|
||||
"type": "array",
|
||||
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
|
||||
"items": {
|
||||
"type": "number"
|
||||
},
|
||||
"minItems": 4,
|
||||
"maxItems": 4
|
||||
},
|
||||
"content_hash": {
|
||||
"type": "string",
|
||||
"description": "SHA-256 hash of the content"
|
||||
},
|
||||
"extraction_version": {
|
||||
"type": "string",
|
||||
"description": "Version string of the extractor"
|
||||
},
|
||||
"svg_clip": {
|
||||
"type": "string",
|
||||
"description": "SVG clip path for verification (present only in SvgClip mode)"
|
||||
}
|
||||
}
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"required": ["page_count", "span_count", "block_count"],
|
||||
"properties": {
|
||||
"page_count": {
|
||||
"type": "integer",
|
||||
"description": "Total number of pages in the document"
|
||||
},
|
||||
"span_count": {
|
||||
"type": "integer",
|
||||
"description": "Number of spans extracted"
|
||||
},
|
||||
"block_count": {
|
||||
"type": "integer",
|
||||
"description": "Number of blocks extracted"
|
||||
},
|
||||
"cache_status": {
|
||||
"type": "string",
|
||||
"description": "Cache status: 'hit', 'miss', or 'skipped'",
|
||||
"enum": ["hit", "miss", "skipped"]
|
||||
},
|
||||
"cache_age_seconds": {
|
||||
"type": "integer",
|
||||
"description": "Cache entry age in seconds (only present when cache_status == 'hit')",
|
||||
"minimum": 0
|
||||
},
|
||||
"error_count": {
|
||||
"type": "integer",
|
||||
"description": "Number of pages that failed to extract",
|
||||
"minimum": 0
|
||||
},
|
||||
"reading_order_algorithm": {
|
||||
"type": "string",
|
||||
"description": "Reading order algorithm used for this extraction",
|
||||
"enum": ["struct_tree", "xy_cut"]
|
||||
},
|
||||
"diagnostics": {
|
||||
"type": "array",
|
||||
"description": "Diagnostics emitted during extraction",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
6
examples/test_export.rs
Normal file
6
examples/test_export.rs
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
// Test that detect_merged_cells is accessible from pdftract_core::table
|
||||
use pdftract_core::table::detect_merged_cells;
|
||||
|
||||
fn main() {
|
||||
println!("detect_merged_cells is exported!");
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue