feat(profiles): add profile infrastructure and initial fixtures
- Add profile source modules: apply_profile, extraction, extraction_loader, field_extractor, match_eval - Add profiles CLI subcommand (profiles_cmd.rs) - Update all 9 built-in profile YAMLs (invoice, receipt, contract, scientific_paper, slide_deck, form, bank_statement, legal_filing, book_chapter) - Add 50 invoice fixture PDFs - Add 2 receipt fixture PDFs Part of: pdftract-3a310 (Phase 7.10 coordinator)
This commit is contained in:
parent
deeafed7a9
commit
80dbf0f703
74 changed files with 2940 additions and 331 deletions
|
|
@ -1 +1 @@
|
||||||
9cf1ccffa9b1213b83079e66d9a245aadc6d584f
|
deeafed7a94a1e91609a11976ef16ee03a1f5fac
|
||||||
|
|
|
||||||
1
Cargo.lock
generated
1
Cargo.lock
generated
|
|
@ -3267,6 +3267,7 @@ dependencies = [
|
||||||
"criterion",
|
"criterion",
|
||||||
"dashmap",
|
"dashmap",
|
||||||
"digest",
|
"digest",
|
||||||
|
"dirs",
|
||||||
"encoding_rs",
|
"encoding_rs",
|
||||||
"filetime",
|
"filetime",
|
||||||
"flate2",
|
"flate2",
|
||||||
|
|
|
||||||
|
|
@ -72,6 +72,7 @@ clap = { version = "4.5", features = ["derive"] }
|
||||||
crossbeam-channel = "0.5"
|
crossbeam-channel = "0.5"
|
||||||
dirs = "5.0"
|
dirs = "5.0"
|
||||||
hyper = { version = "1.0", features = ["full"] }
|
hyper = { version = "1.0", features = ["full"] }
|
||||||
|
notify = { version = "6", optional = true }
|
||||||
hyper-util = { version = "0.1", features = ["full"] }
|
hyper-util = { version = "0.1", features = ["full"] }
|
||||||
image = "0.24"
|
image = "0.24"
|
||||||
http-body-util = "0.1"
|
http-body-util = "0.1"
|
||||||
|
|
@ -117,7 +118,7 @@ full-render = ["dep:libloading", "pdftract-core/full-render"]
|
||||||
# Remote HTTP source support
|
# Remote HTTP source support
|
||||||
remote = ["dep:ureq"]
|
remote = ["dep:ureq"]
|
||||||
# Document profiles
|
# Document profiles
|
||||||
profiles = ["dep:serde_yaml", "pdftract-core/profiles"]
|
profiles = ["dep:serde_yaml", "pdftract-core/profiles", "dep:notify"]
|
||||||
# HTTP serve mode
|
# HTTP serve mode
|
||||||
serve = []
|
serve = []
|
||||||
# MCP server mode
|
# MCP server mode
|
||||||
|
|
|
||||||
|
|
@ -19,6 +19,7 @@ mod output;
|
||||||
mod pages;
|
mod pages;
|
||||||
mod panic_hook;
|
mod panic_hook;
|
||||||
mod password;
|
mod password;
|
||||||
|
mod profiles_cmd;
|
||||||
mod serve;
|
mod serve;
|
||||||
mod url;
|
mod url;
|
||||||
mod verify_receipt;
|
mod verify_receipt;
|
||||||
|
|
@ -160,6 +161,10 @@ enum Commands {
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
auto: bool,
|
auto: bool,
|
||||||
|
|
||||||
|
/// Force-apply a specific profile (by name or YAML file path)
|
||||||
|
#[arg(long, value_name = "NAME|PATH")]
|
||||||
|
profile: Option<String>,
|
||||||
|
|
||||||
/// Include header blocks in output
|
/// Include header blocks in output
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
include_headers: bool,
|
include_headers: bool,
|
||||||
|
|
@ -238,6 +243,11 @@ enum Commands {
|
||||||
#[command(subcommand)]
|
#[command(subcommand)]
|
||||||
cache_command: CacheCommands,
|
cache_command: CacheCommands,
|
||||||
},
|
},
|
||||||
|
/// Manage document type profiles
|
||||||
|
Profiles {
|
||||||
|
#[command(subcommand)]
|
||||||
|
profiles_command: ProfilesCommands,
|
||||||
|
},
|
||||||
/// Start the HTTP server for extraction
|
/// Start the HTTP server for extraction
|
||||||
///
|
///
|
||||||
/// ## Security Model
|
/// ## Security Model
|
||||||
|
|
@ -311,6 +321,14 @@ enum Commands {
|
||||||
/// Trust X-Forwarded-For header for client IP detection (DANGER: enables IP spoofing if not behind a trusted proxy)
|
/// Trust X-Forwarded-For header for client IP detection (DANGER: enables IP spoofing if not behind a trusted proxy)
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
trust_forwarded_for: bool,
|
trust_forwarded_for: bool,
|
||||||
|
|
||||||
|
/// Directory containing custom profile YAML files (repeatable)
|
||||||
|
#[arg(long, value_name = "DIR")]
|
||||||
|
profile_dir: Option<PathBuf>,
|
||||||
|
|
||||||
|
/// Enable hot-reload for profiles (re-read directory on every request)
|
||||||
|
#[arg(long)]
|
||||||
|
profile_hot_reload: bool,
|
||||||
},
|
},
|
||||||
/// Start the MCP (Model Context Protocol) server
|
/// Start the MCP (Model Context Protocol) server
|
||||||
///
|
///
|
||||||
|
|
@ -452,6 +470,32 @@ enum CacheCommands {
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Subcommand)]
|
||||||
|
enum ProfilesCommands {
|
||||||
|
/// List all available profiles
|
||||||
|
List,
|
||||||
|
/// Show a profile's YAML content
|
||||||
|
Show {
|
||||||
|
/// Profile name or path to YAML file
|
||||||
|
name_or_path: String,
|
||||||
|
},
|
||||||
|
/// Export a built-in profile to stdout
|
||||||
|
Export {
|
||||||
|
/// Name of the built-in profile to export
|
||||||
|
name: String,
|
||||||
|
},
|
||||||
|
/// Install a profile to the user config directory
|
||||||
|
Install {
|
||||||
|
/// Path to the profile YAML file to install
|
||||||
|
path: PathBuf,
|
||||||
|
},
|
||||||
|
/// Validate a profile file
|
||||||
|
Validate {
|
||||||
|
/// Path to the profile YAML file to validate
|
||||||
|
path: PathBuf,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
fn main() -> Result<()> {
|
fn main() -> Result<()> {
|
||||||
// Install panic hook for SecretString redaction in backtraces
|
// Install panic hook for SecretString redaction in backtraces
|
||||||
// This ensures credentials never leak in crash dumps
|
// This ensures credentials never leak in crash dumps
|
||||||
|
|
@ -504,6 +548,7 @@ fn main() -> Result<()> {
|
||||||
no_cache,
|
no_cache,
|
||||||
md_anchors,
|
md_anchors,
|
||||||
auto,
|
auto,
|
||||||
|
profile,
|
||||||
output,
|
output,
|
||||||
include_headers,
|
include_headers,
|
||||||
include_footers,
|
include_footers,
|
||||||
|
|
@ -532,6 +577,7 @@ fn main() -> Result<()> {
|
||||||
no_cache,
|
no_cache,
|
||||||
md_anchors,
|
md_anchors,
|
||||||
auto,
|
auto,
|
||||||
|
profile,
|
||||||
include_headers,
|
include_headers,
|
||||||
include_footers,
|
include_footers,
|
||||||
include_headers_footers,
|
include_headers_footers,
|
||||||
|
|
@ -602,6 +648,12 @@ fn main() -> Result<()> {
|
||||||
std::process::exit(1);
|
std::process::exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Commands::Profiles { profiles_command } => {
|
||||||
|
if let Err(e) = cmd_profiles(profiles_command) {
|
||||||
|
eprintln!("Error: {}", e);
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
Commands::Serve {
|
Commands::Serve {
|
||||||
bind,
|
bind,
|
||||||
cache_dir,
|
cache_dir,
|
||||||
|
|
@ -611,6 +663,8 @@ fn main() -> Result<()> {
|
||||||
max_decompress_gb,
|
max_decompress_gb,
|
||||||
audit_log,
|
audit_log,
|
||||||
trust_forwarded_for,
|
trust_forwarded_for,
|
||||||
|
profile_dir,
|
||||||
|
profile_hot_reload,
|
||||||
} => {
|
} => {
|
||||||
if let Err(e) = cmd_serve(
|
if let Err(e) = cmd_serve(
|
||||||
bind,
|
bind,
|
||||||
|
|
@ -621,6 +675,8 @@ fn main() -> Result<()> {
|
||||||
max_decompress_gb,
|
max_decompress_gb,
|
||||||
audit_log,
|
audit_log,
|
||||||
trust_forwarded_for,
|
trust_forwarded_for,
|
||||||
|
profile_dir,
|
||||||
|
profile_hot_reload,
|
||||||
) {
|
) {
|
||||||
eprintln!("Error: {}", e);
|
eprintln!("Error: {}", e);
|
||||||
std::process::exit(1);
|
std::process::exit(1);
|
||||||
|
|
@ -775,6 +831,7 @@ fn cmd_extract(
|
||||||
no_cache: bool,
|
no_cache: bool,
|
||||||
md_anchors: bool,
|
md_anchors: bool,
|
||||||
auto: bool,
|
auto: bool,
|
||||||
|
profile: Option<String>,
|
||||||
include_headers: bool,
|
include_headers: bool,
|
||||||
include_footers: bool,
|
include_footers: bool,
|
||||||
include_headers_footers: bool,
|
include_headers_footers: bool,
|
||||||
|
|
@ -921,11 +978,12 @@ fn cmd_extract(
|
||||||
eprintln!("Auto-detecting document type...");
|
eprintln!("Auto-detecting document type...");
|
||||||
|
|
||||||
use pdftract_core::profiles::{
|
use pdftract_core::profiles::{
|
||||||
classify, extract_signals_from_results, load_builtins, ProfileType,
|
classify_and_select_profile, extract_signals_from_results, load_extraction_profiles,
|
||||||
|
apply_extraction_tuning, apply_profile_to_metadata,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Load built-in profiles
|
// Load all extraction profiles
|
||||||
let profiles = load_builtins();
|
let profiles = load_extraction_profiles(&[]).unwrap_or_default();
|
||||||
|
|
||||||
if !profiles.is_empty() {
|
if !profiles.is_empty() {
|
||||||
// Perform a lightweight extraction for classification
|
// Perform a lightweight extraction for classification
|
||||||
|
|
@ -940,43 +998,33 @@ fn cmd_extract(
|
||||||
.map(|p| (p.blocks.clone(), p.spans.clone()))
|
.map(|p| (p.blocks.clone(), p.spans.clone()))
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
let signals =
|
let selected_profile = classify_and_select_profile(
|
||||||
extract_signals_from_results(&page_data, has_signature_field, has_form_field);
|
&profiles.iter().map(|p| p.profile.clone()).collect::<Vec<_>>(),
|
||||||
let classification = classify(&signals, &profiles);
|
&page_data,
|
||||||
|
has_signature_field,
|
||||||
|
has_form_field,
|
||||||
|
);
|
||||||
|
|
||||||
match classification.document_type {
|
if let Some((profile, match_result)) = selected_profile {
|
||||||
ProfileType::Unknown => {
|
eprintln!(
|
||||||
eprintln!(
|
"Document type: {} (confidence: {:.2})",
|
||||||
"Document type: unknown (confidence: {:.2})",
|
profile.name, match_result.confidence
|
||||||
classification.confidence
|
);
|
||||||
);
|
|
||||||
eprintln!("Proceeding with default extraction options.");
|
|
||||||
}
|
|
||||||
detected_type => {
|
|
||||||
let type_name = match detected_type {
|
|
||||||
ProfileType::Invoice => "invoice",
|
|
||||||
ProfileType::Receipt => "receipt",
|
|
||||||
ProfileType::Contract => "contract",
|
|
||||||
ProfileType::ScientificPaper => "scientific_paper",
|
|
||||||
ProfileType::SlideDeck => "slide_deck",
|
|
||||||
ProfileType::Form => "form",
|
|
||||||
ProfileType::BankStatement => "bank_statement",
|
|
||||||
ProfileType::LegalFiling => "legal_filing",
|
|
||||||
ProfileType::BookChapter => "book_chapter",
|
|
||||||
ProfileType::Unknown => "unknown",
|
|
||||||
};
|
|
||||||
eprintln!(
|
|
||||||
"Document type: {} (confidence: {:.2})",
|
|
||||||
type_name, classification.confidence
|
|
||||||
);
|
|
||||||
|
|
||||||
// Apply profile-specific extraction options
|
// Apply profile extraction tuning
|
||||||
// For now, just log the detection - profile option overrides
|
if let Some(ref tuning) = profile.extraction {
|
||||||
// will be implemented in Phase 7.10
|
apply_extraction_tuning(tuning, &mut options);
|
||||||
for reason in classification.reasons.iter().take(5) {
|
|
||||||
eprintln!(" - {}", reason);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Store the selected profile for later field extraction
|
||||||
|
// We'll extract fields after the main extraction
|
||||||
|
// For now, just log the match reasons
|
||||||
|
for reason in match_result.reasons.iter().take(5) {
|
||||||
|
eprintln!(" - {}", reason);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
eprintln!("Document type: unknown (confidence: below threshold)");
|
||||||
|
eprintln!("Proceeding with default extraction options.");
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
eprintln!(
|
eprintln!(
|
||||||
|
|
@ -990,6 +1038,46 @@ fn cmd_extract(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Handle --profile flag: load and apply specific profile
|
||||||
|
#[cfg(feature = "profiles")]
|
||||||
|
if let Some(ref profile_name_or_path) = profile {
|
||||||
|
use pdftract_core::profiles::{
|
||||||
|
load_extraction_profiles, apply_extraction_tuning,
|
||||||
|
};
|
||||||
|
|
||||||
|
eprintln!("Applying profile: {}", profile_name_or_path);
|
||||||
|
|
||||||
|
let profiles = load_extraction_profiles(&[]).unwrap_or_default();
|
||||||
|
|
||||||
|
// Find the profile by name or load from path
|
||||||
|
let profile = if std::path::PathBuf::from(profile_name_or_path).exists() {
|
||||||
|
// Load from file path
|
||||||
|
use pdftract_core::profiles::load_profile_file;
|
||||||
|
match load_profile_file(&std::path::PathBuf::from(profile_name_or_path)) {
|
||||||
|
Ok(p) => Some(p),
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("Error loading profile: {}", e);
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Find by name
|
||||||
|
profiles.iter()
|
||||||
|
.find(|p| p.profile.name == *profile_name_or_path)
|
||||||
|
.map(|p| p.profile.clone())
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Some(p) = profile {
|
||||||
|
eprintln!("Loaded profile: {}", p.name);
|
||||||
|
if let Some(ref tuning) = p.extraction {
|
||||||
|
apply_extraction_tuning(tuning, &mut options);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
eprintln!("Error: Profile '{}' not found", profile_name_or_path);
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(not(feature = "profiles"))]
|
#[cfg(not(feature = "profiles"))]
|
||||||
if auto {
|
if auto {
|
||||||
eprintln!("Warning: --auto flag requires the 'profiles' feature to be enabled.");
|
eprintln!("Warning: --auto flag requires the 'profiles' feature to be enabled.");
|
||||||
|
|
@ -997,6 +1085,13 @@ fn cmd_extract(
|
||||||
eprintln!("Proceeding with default extraction options.");
|
eprintln!("Proceeding with default extraction options.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(not(feature = "profiles"))]
|
||||||
|
if profile.is_some() {
|
||||||
|
eprintln!("Warning: --profile flag requires the 'profiles' feature to be enabled.");
|
||||||
|
eprintln!("Build pdftract with: --features profiles");
|
||||||
|
eprintln!("Proceeding with default extraction options.");
|
||||||
|
}
|
||||||
|
|
||||||
// Set markdown anchors option
|
// Set markdown anchors option
|
||||||
options.markdown_anchors = md_anchors;
|
options.markdown_anchors = md_anchors;
|
||||||
if md_anchors {
|
if md_anchors {
|
||||||
|
|
@ -1096,6 +1191,58 @@ fn cmd_extract(
|
||||||
result.metadata.cache_status = Some(cache_status);
|
result.metadata.cache_status = Some(cache_status);
|
||||||
result.metadata.cache_age_seconds = cache_age;
|
result.metadata.cache_age_seconds = cache_age;
|
||||||
|
|
||||||
|
// Extract profile fields if --auto or --profile was used
|
||||||
|
#[cfg(feature = "profiles")]
|
||||||
|
{
|
||||||
|
use pdftract_core::profiles::{
|
||||||
|
load_extraction_profiles, apply_profile_to_metadata,
|
||||||
|
};
|
||||||
|
|
||||||
|
let profile_to_apply = if auto {
|
||||||
|
// Re-run classification to get the selected profile
|
||||||
|
let profiles = load_extraction_profiles(&[]).unwrap_or_default();
|
||||||
|
let page_data: Vec<(Vec<_>, Vec<_>)> = result
|
||||||
|
.pages
|
||||||
|
.iter()
|
||||||
|
.map(|p| (p.blocks.clone(), p.spans.clone()))
|
||||||
|
.collect();
|
||||||
|
let has_signature_field = !result.signatures.is_empty();
|
||||||
|
let has_form_field = !result.form_fields.is_empty();
|
||||||
|
|
||||||
|
use pdftract_core::profiles::classify_and_select_profile;
|
||||||
|
classify_and_select_profile(
|
||||||
|
&profiles.iter().map(|p| p.profile.clone()).collect::<Vec<_>>(),
|
||||||
|
&page_data,
|
||||||
|
has_signature_field,
|
||||||
|
has_form_field,
|
||||||
|
).map(|(p, _)| p)
|
||||||
|
} else if profile.is_some() {
|
||||||
|
// Load the specified profile
|
||||||
|
let profile_name_or_path = profile.as_ref().unwrap();
|
||||||
|
let profiles = load_extraction_profiles(&[]).unwrap_or_default();
|
||||||
|
|
||||||
|
if std::path::PathBuf::from(profile_name_or_path).exists() {
|
||||||
|
use pdftract_core::profiles::load_profile_file;
|
||||||
|
load_profile_file(&std::path::PathBuf::from(profile_name_or_path)).ok()
|
||||||
|
} else {
|
||||||
|
profiles.iter()
|
||||||
|
.find(|p| p.profile.name == *profile_name_or_path)
|
||||||
|
.map(|p| p.profile.clone())
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
// Apply profile to metadata
|
||||||
|
if let Some(p) = profile_to_apply {
|
||||||
|
let (name, version, fields) = apply_profile_to_metadata(&p, &result.pages);
|
||||||
|
// Update the result's metadata with profile information
|
||||||
|
result.metadata.profile_name = Some(name);
|
||||||
|
result.metadata.profile_version = Some(version);
|
||||||
|
result.metadata.profile_fields = fields;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Write each output to its destination
|
// Write each output to its destination
|
||||||
for spec in &output_specs {
|
for spec in &output_specs {
|
||||||
match spec.dest {
|
match spec.dest {
|
||||||
|
|
@ -1803,6 +1950,25 @@ fn cmd_cache(command: CacheCommands) -> Result<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn cmd_profiles(command: ProfilesCommands) -> Result<()> {
|
||||||
|
use profiles_cmd::{ProfilesArgs, ProfilesCommand};
|
||||||
|
|
||||||
|
// Convert ProfilesCommands to profiles_cmd::ProfilesCommand
|
||||||
|
let profiles_command = match command {
|
||||||
|
ProfilesCommands::List => ProfilesCommand::List,
|
||||||
|
ProfilesCommands::Show { name_or_path } => ProfilesCommand::Show { name_or_path },
|
||||||
|
ProfilesCommands::Export { name } => ProfilesCommand::Export { name },
|
||||||
|
ProfilesCommands::Install { path } => ProfilesCommand::Install { path },
|
||||||
|
ProfilesCommands::Validate { path } => ProfilesCommand::Validate { path },
|
||||||
|
};
|
||||||
|
|
||||||
|
let args = ProfilesArgs {
|
||||||
|
command: profiles_command,
|
||||||
|
};
|
||||||
|
|
||||||
|
profiles_cmd::run_profiles(args)
|
||||||
|
}
|
||||||
|
|
||||||
fn cmd_serve(
|
fn cmd_serve(
|
||||||
bind: String,
|
bind: String,
|
||||||
cache_dir: Option<PathBuf>,
|
cache_dir: Option<PathBuf>,
|
||||||
|
|
|
||||||
300
crates/pdftract-cli/src/profiles_cmd.rs
Normal file
300
crates/pdftract-cli/src/profiles_cmd.rs
Normal file
|
|
@ -0,0 +1,300 @@
|
||||||
|
//! Profile management CLI subcommand.
|
||||||
|
//!
|
||||||
|
//! This module implements the `pdftract profiles` command family for managing
|
||||||
|
//! document type profiles (list, show, export, install, validate).
|
||||||
|
|
||||||
|
use anyhow::{Context, Result};
|
||||||
|
use std::fs;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
/// Arguments for the profiles subcommand.
|
||||||
|
pub struct ProfilesArgs {
|
||||||
|
/// Subcommand to run
|
||||||
|
pub command: ProfilesCommand,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The operations offered by the `profiles` command family.
///
/// Each variant carries exactly the input its handler needs; dispatch happens
/// in `run_profiles`.
#[derive(Debug, Clone)]
pub enum ProfilesCommand {
    /// List every profile that can be loaded.
    List,
    /// Print one profile as YAML.
    Show {
        /// Profile name, or a path to a YAML file.
        name_or_path: String,
    },
    /// Write a built-in profile to stdout.
    Export {
        /// Name of the built-in profile.
        name: String,
    },
    /// Copy a profile file into the user config directory.
    Install {
        /// Location of the YAML file to install.
        path: PathBuf,
    },
    /// Check that a profile file is well-formed.
    Validate {
        /// Location of the YAML file to check.
        path: PathBuf,
    },
}
|
||||||
|
|
||||||
|
/// Run the profiles subcommand.
|
||||||
|
pub fn run_profiles(args: ProfilesArgs) -> Result<()> {
|
||||||
|
match args.command {
|
||||||
|
ProfilesCommand::List => run_list(),
|
||||||
|
ProfilesCommand::Show { name_or_path } => run_show(&name_or_path),
|
||||||
|
ProfilesCommand::Export { name } => run_export(&name),
|
||||||
|
ProfilesCommand::Install { path } => run_install(&path),
|
||||||
|
ProfilesCommand::Validate { path } => run_validate(&path),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// List all available profiles.
|
||||||
|
fn run_list() -> Result<()> {
|
||||||
|
#[cfg(feature = "profiles")]
|
||||||
|
{
|
||||||
|
use pdftract_core::profiles::extraction_loader;
|
||||||
|
|
||||||
|
// Load all extraction profiles
|
||||||
|
let profiles = extraction_loader::load_extraction_profiles(&[])?;
|
||||||
|
|
||||||
|
if profiles.is_empty() {
|
||||||
|
println!("No profiles available.");
|
||||||
|
println!();
|
||||||
|
println!("Built-in profiles may not be enabled. Build pdftract with:");
|
||||||
|
println!(" cargo build --features profiles");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("Available profiles ({} total):", profiles.len());
|
||||||
|
println!();
|
||||||
|
|
||||||
|
// Group by origin
|
||||||
|
let mut builtin = Vec::new();
|
||||||
|
let mut user = Vec::new();
|
||||||
|
let mut custom = Vec::new();
|
||||||
|
|
||||||
|
for source in &profiles {
|
||||||
|
match source.source {
|
||||||
|
extraction_loader::ProfileOrigin::BuiltIn => builtin.push(source),
|
||||||
|
extraction_loader::ProfileOrigin::User => user.push(source),
|
||||||
|
extraction_loader::ProfileOrigin::Custom(_) => custom.push(source),
|
||||||
|
extraction_loader::ProfileOrigin::System => {
|
||||||
|
// System profiles - add to a separate group or merge with user
|
||||||
|
user.push(source);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Print built-in profiles
|
||||||
|
if !builtin.is_empty() {
|
||||||
|
println!("Built-in profiles:");
|
||||||
|
for source in builtin {
|
||||||
|
let profile = &source.profile;
|
||||||
|
println!(
|
||||||
|
" {} - Priority: {}{}",
|
||||||
|
profile.name,
|
||||||
|
profile.priority,
|
||||||
|
if source.overrides_builtin {
|
||||||
|
" (overrides built-in)"
|
||||||
|
} else {
|
||||||
|
""
|
||||||
|
}
|
||||||
|
);
|
||||||
|
println!(" {}", profile.description);
|
||||||
|
}
|
||||||
|
println!();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Print user profiles
|
||||||
|
if !user.is_empty() {
|
||||||
|
println!("User profiles:");
|
||||||
|
for source in user {
|
||||||
|
let profile = &source.profile;
|
||||||
|
println!(
|
||||||
|
" {} - Priority: {}{}",
|
||||||
|
profile.name,
|
||||||
|
profile.priority,
|
||||||
|
if source.overrides_builtin {
|
||||||
|
" (overrides built-in)"
|
||||||
|
} else {
|
||||||
|
""
|
||||||
|
}
|
||||||
|
);
|
||||||
|
println!(" {}", profile.description);
|
||||||
|
}
|
||||||
|
println!();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Print custom profiles
|
||||||
|
if !custom.is_empty() {
|
||||||
|
println!("Custom profiles:");
|
||||||
|
for source in custom {
|
||||||
|
let profile = &source.profile;
|
||||||
|
println!(
|
||||||
|
" {} - Priority: {}",
|
||||||
|
profile.name, profile.priority
|
||||||
|
);
|
||||||
|
println!(" {}", profile.description);
|
||||||
|
}
|
||||||
|
println!();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(not(feature = "profiles"))]
|
||||||
|
{
|
||||||
|
println!("Profiles are not enabled.");
|
||||||
|
println!();
|
||||||
|
println!("Build pdftract with the profiles feature:");
|
||||||
|
println!(" cargo build --features profiles");
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Show a profile's YAML content.
|
||||||
|
fn run_show(name_or_path: &str) -> Result<()> {
|
||||||
|
#[cfg(feature = "profiles")]
|
||||||
|
{
|
||||||
|
use pdftract_core::profiles::extraction_loader;
|
||||||
|
|
||||||
|
// Load all profiles to search by name
|
||||||
|
let profiles = extraction_loader::load_extraction_profiles(&[])?;
|
||||||
|
|
||||||
|
// Try to find the profile
|
||||||
|
let profile = extraction_loader::find_profile(name_or_path, &profiles)?;
|
||||||
|
|
||||||
|
// Serialize back to YAML
|
||||||
|
let yaml = serde_yaml::to_string(&profile)
|
||||||
|
.context("Failed to serialize profile to YAML")?;
|
||||||
|
|
||||||
|
println!("{}", yaml);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(not(feature = "profiles"))]
|
||||||
|
{
|
||||||
|
anyhow::bail!("Profiles feature is not enabled. Build with: --features profiles");
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Export a built-in profile to stdout.
|
||||||
|
fn run_export(name: &str) -> Result<()> {
|
||||||
|
#[cfg(feature = "profiles")]
|
||||||
|
{
|
||||||
|
use pdftract_core::profiles::extraction_loader;
|
||||||
|
|
||||||
|
// Load all profiles
|
||||||
|
let profiles = extraction_loader::load_extraction_profiles(&[])?;
|
||||||
|
|
||||||
|
// Find the built-in profile by name
|
||||||
|
let profile = profiles
|
||||||
|
.iter()
|
||||||
|
.find(|s| s.profile.name == name && matches!(s.source, extraction_loader::ProfileOrigin::BuiltIn))
|
||||||
|
.context(format!("Built-in profile '{}' not found", name))?;
|
||||||
|
|
||||||
|
// Serialize to YAML
|
||||||
|
let yaml = serde_yaml::to_string(&profile)
|
||||||
|
.context("Failed to serialize profile to YAML")?;
|
||||||
|
|
||||||
|
println!("{}", yaml);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(not(feature = "profiles"))]
|
||||||
|
{
|
||||||
|
anyhow::bail!("Profiles feature is not enabled. Build with: --features profiles");
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Install a profile to the user config directory.
|
||||||
|
fn run_install(path: &PathBuf) -> Result<()> {
|
||||||
|
#[cfg(feature = "profiles")]
|
||||||
|
{
|
||||||
|
use pdftract_core::profiles::extraction_loader;
|
||||||
|
|
||||||
|
// Check if source file exists
|
||||||
|
if !path.exists() {
|
||||||
|
anyhow::bail!("Profile file not found: {}", path.display());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get XDG config directory
|
||||||
|
let xdg_dir = extraction_loader::get_xdg_profile_dir()
|
||||||
|
.context("Failed to determine XDG config directory")?;
|
||||||
|
|
||||||
|
// Create directory if it doesn't exist
|
||||||
|
fs::create_dir_all(&xdg_dir)
|
||||||
|
.context(format!("Failed to create profile directory: {}", xdg_dir.display()))?;
|
||||||
|
|
||||||
|
// Read the profile to get its name
|
||||||
|
let content = fs::read_to_string(path)
|
||||||
|
.context(format!("Failed to read profile file: {}", path.display()))?;
|
||||||
|
|
||||||
|
// Parse to get the profile name
|
||||||
|
let profile: pdftract_core::profiles::ExtractionProfile = serde_yaml::from_str(&content)
|
||||||
|
.context("Failed to parse profile YAML")?;
|
||||||
|
|
||||||
|
// Destination path
|
||||||
|
let dest = xdg_dir.join(format!("{}.yaml", profile.name));
|
||||||
|
|
||||||
|
// Copy file
|
||||||
|
fs::copy(path, &dest)
|
||||||
|
.context(format!("Failed to copy profile to: {}", dest.display()))?;
|
||||||
|
|
||||||
|
println!("Installed profile '{}' to: {}", profile.name, dest.display());
|
||||||
|
println!();
|
||||||
|
println!("You can now use this profile with:");
|
||||||
|
println!(" pdftract extract --profile {}", profile.name);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(not(feature = "profiles"))]
|
||||||
|
{
|
||||||
|
anyhow::bail!("Profiles feature is not enabled. Build with: --features profiles");
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Validate a profile file.
|
||||||
|
fn run_validate(path: &PathBuf) -> Result<()> {
|
||||||
|
#[cfg(feature = "profiles")]
|
||||||
|
{
|
||||||
|
use pdftract_core::profiles::extraction_loader;
|
||||||
|
|
||||||
|
// Check if file exists
|
||||||
|
if !path.exists() {
|
||||||
|
anyhow::bail!("Profile file not found: {}", path.display());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate the profile
|
||||||
|
match extraction_loader::validate_profile_file(path) {
|
||||||
|
Ok(()) => {
|
||||||
|
println!("Profile '{}' is valid.", path.display());
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
anyhow::bail!("Profile validation failed: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(not(feature = "profiles"))]
|
||||||
|
{
|
||||||
|
anyhow::bail!("Profiles feature is not enabled. Build with: --features profiles");
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Every `ProfilesCommand` variant keeps the payload it was built with,
    /// including across `Clone`.
    #[test]
    fn test_profiles_command_enum() {
        let command = ProfilesCommand::List;
        assert!(matches!(command.clone(), ProfilesCommand::List));

        let show = ProfilesCommand::Show {
            name_or_path: "invoice".to_string(),
        };
        assert!(matches!(
            show.clone(),
            ProfilesCommand::Show { name_or_path } if name_or_path == "invoice"
        ));

        let export = ProfilesCommand::Export {
            name: "invoice".to_string(),
        };
        assert!(matches!(
            export.clone(),
            ProfilesCommand::Export { name } if name == "invoice"
        ));

        let install = ProfilesCommand::Install {
            path: std::path::PathBuf::from("custom.yaml"),
        };
        assert!(matches!(
            install.clone(),
            ProfilesCommand::Install { path } if path == std::path::Path::new("custom.yaml")
        ));

        let validate = ProfilesCommand::Validate {
            path: std::path::PathBuf::from("custom.yaml"),
        };
        assert!(matches!(
            validate.clone(),
            ProfilesCommand::Validate { path } if path == std::path::Path::new("custom.yaml")
        ));
    }
}
|
||||||
|
|
@ -46,6 +46,7 @@ smallvec = "1.13"
|
||||||
encoding_rs = "0.8"
|
encoding_rs = "0.8"
|
||||||
quick-xml = { version = "0.36", optional = true }
|
quick-xml = { version = "0.36", optional = true }
|
||||||
serde_yaml = { version = "0.9", optional = true }
|
serde_yaml = { version = "0.9", optional = true }
|
||||||
|
dirs = "5.0"
|
||||||
chrono = "0.4"
|
chrono = "0.4"
|
||||||
aes = { version = "0.8", optional = true }
|
aes = { version = "0.8", optional = true }
|
||||||
rc4 = { version = "0.1", optional = true }
|
rc4 = { version = "0.1", optional = true }
|
||||||
|
|
|
||||||
|
|
@ -304,6 +304,15 @@ pub struct ExtractionMetadata {
|
||||||
/// Diagnostics emitted during extraction (coverage warnings, etc.)
|
/// Diagnostics emitted during extraction (coverage warnings, etc.)
|
||||||
#[serde(skip_serializing_if = "Vec::is_empty")]
|
#[serde(skip_serializing_if = "Vec::is_empty")]
|
||||||
pub diagnostics: Vec<String>,
|
pub diagnostics: Vec<String>,
|
||||||
|
/// Profile name if a profile was applied (Phase 7.10)
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
pub profile_name: Option<String>,
|
||||||
|
/// Profile version if a profile was applied (Phase 7.10)
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
pub profile_version: Option<String>,
|
||||||
|
/// Extracted fields from profile if a profile was applied (Phase 7.10)
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
pub profile_fields: Option<serde_json::Value>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extract text and structure from a PDF file.
|
/// Extract text and structure from a PDF file.
|
||||||
|
|
@ -931,6 +940,9 @@ pub fn extract_pdf(
|
||||||
error_count,
|
error_count,
|
||||||
reading_order_algorithm: Some(final_reading_order_algorithm.as_str().to_string()),
|
reading_order_algorithm: Some(final_reading_order_algorithm.as_str().to_string()),
|
||||||
diagnostics: all_diagnostics_with_js,
|
diagnostics: all_diagnostics_with_js,
|
||||||
|
profile_name: None,
|
||||||
|
profile_version: None,
|
||||||
|
profile_fields: None,
|
||||||
},
|
},
|
||||||
signatures,
|
signatures,
|
||||||
form_fields,
|
form_fields,
|
||||||
|
|
@ -1812,6 +1824,9 @@ pub fn extract_pdf_ndjson<W: std::io::Write>(
|
||||||
error_count: error_count as usize,
|
error_count: error_count as usize,
|
||||||
reading_order_algorithm: Some(final_reading_order_algorithm.as_str().to_string()),
|
reading_order_algorithm: Some(final_reading_order_algorithm.as_str().to_string()),
|
||||||
diagnostics: all_diagnostics,
|
diagnostics: all_diagnostics,
|
||||||
|
profile_name: None,
|
||||||
|
profile_version: None,
|
||||||
|
profile_fields: None,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -2117,6 +2132,9 @@ where
|
||||||
error_count,
|
error_count,
|
||||||
reading_order_algorithm: Some(final_reading_order_algorithm.as_str().to_string()),
|
reading_order_algorithm: Some(final_reading_order_algorithm.as_str().to_string()),
|
||||||
diagnostics: all_diagnostics,
|
diagnostics: all_diagnostics,
|
||||||
|
profile_name: None,
|
||||||
|
profile_version: None,
|
||||||
|
profile_fields: None,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
259
crates/pdftract-core/src/profiles/apply_profile.rs
Normal file
259
crates/pdftract-core/src/profiles/apply_profile.rs
Normal file
|
|
@ -0,0 +1,259 @@
|
||||||
|
//! Profile application for extraction tuning (Phase 7.10).
|
||||||
|
//!
|
||||||
|
//! Applies profile extraction tuning to ExtractionOptions and manages
|
||||||
|
//! the profile workflow: classification, option override, field extraction,
|
||||||
|
//! and metadata population.
|
||||||
|
|
||||||
|
use super::extraction::{ExtractionProfile, ExtractionTuning};
|
||||||
|
use super::field_extractor;
|
||||||
|
use super::match_eval::{evaluate_match, MatchResult};
|
||||||
|
use super::signals::extract_signals_from_results;
|
||||||
|
use crate::options::{ExtractionOptions, OutputOptions};
|
||||||
|
use crate::schema::{BlockJson, PageJson, SpanJson};
|
||||||
|
use anyhow::Result;
|
||||||
|
use serde_json::json;
|
||||||
|
|
||||||
|
/// Apply a profile's extraction tuning to extraction options.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `tuning` - The extraction tuning from a profile
|
||||||
|
/// * `options` - The base extraction options to modify
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
///
|
||||||
|
/// Modified extraction options with profile-specific overrides applied.
|
||||||
|
///
|
||||||
|
/// # Note
|
||||||
|
///
|
||||||
|
/// Many extraction tuning fields (reading_order, table_detection, etc.) are
|
||||||
|
/// not yet exposed in ExtractionOptions. This function applies what is available
|
||||||
|
/// and logs warnings for unsupported fields.
|
||||||
|
pub fn apply_extraction_tuning(tuning: &ExtractionTuning, options: &mut ExtractionOptions) {
|
||||||
|
// Apply output filtering options (these are supported)
|
||||||
|
if let Some(include_invisible) = tuning.include_invisible {
|
||||||
|
options.output.include_invisible = include_invisible;
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(include_headers_footers) = tuning.include_headers_footers {
|
||||||
|
if include_headers_footers {
|
||||||
|
options.output.include_headers = true;
|
||||||
|
options.output.include_footers = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Log warnings for unsupported fields (for future implementation)
|
||||||
|
if tuning.reading_order.is_some() {
|
||||||
|
eprintln!("Profile warning: reading_order tuning is not yet supported");
|
||||||
|
}
|
||||||
|
|
||||||
|
if tuning.table_detection.is_some() {
|
||||||
|
eprintln!("Profile warning: table_detection tuning is not yet supported");
|
||||||
|
}
|
||||||
|
|
||||||
|
if tuning.readability_threshold.is_some() {
|
||||||
|
eprintln!("Profile warning: readability_threshold tuning is not yet supported");
|
||||||
|
}
|
||||||
|
|
||||||
|
if tuning.force_ocr.is_some() {
|
||||||
|
eprintln!("Profile warning: force_ocr tuning is not yet supported");
|
||||||
|
}
|
||||||
|
|
||||||
|
if tuning.min_block_chars.is_some() {
|
||||||
|
eprintln!("Profile warning: min_block_chars tuning is not yet supported");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Classify a document and select the best matching profile.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `profiles` - All available extraction profiles
|
||||||
|
/// * `page_data` - Page data (blocks, span_indices) for signal extraction
|
||||||
|
/// * `has_signature_field` - Whether document has signature fields
|
||||||
|
/// * `has_form_field` - Whether document has form fields
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
///
|
||||||
|
/// The best matching profile with confidence score, or None if no profile
|
||||||
|
/// matches with confidence >= 0.6.
|
||||||
|
pub fn classify_and_select_profile(
|
||||||
|
profiles: &[ExtractionProfile],
|
||||||
|
page_data: &[(Vec<BlockJson>, Vec<SpanJson>)], // (blocks, spans) per page
|
||||||
|
has_signature_field: bool,
|
||||||
|
has_form_field: bool,
|
||||||
|
) -> Option<(ExtractionProfile, MatchResult)> {
|
||||||
|
// Extract signals from the document
|
||||||
|
let signals = extract_signals_from_results(page_data, has_signature_field, has_form_field);
|
||||||
|
|
||||||
|
// Evaluate each profile
|
||||||
|
let mut best_profile: Option<(ExtractionProfile, MatchResult)> = None;
|
||||||
|
|
||||||
|
for profile in profiles {
|
||||||
|
let result = evaluate_match(&profile.match_expr, &signals);
|
||||||
|
|
||||||
|
// Only consider matches with confidence >= 0.6
|
||||||
|
if result.matched && result.confidence >= 0.6 {
|
||||||
|
match &best_profile {
|
||||||
|
None => {
|
||||||
|
best_profile = Some((profile.clone(), result));
|
||||||
|
}
|
||||||
|
Some((existing_profile, existing_result)) => {
|
||||||
|
// Prefer higher confidence, then higher priority
|
||||||
|
if result.confidence > existing_result.confidence
|
||||||
|
|| (result.confidence == existing_result.confidence
|
||||||
|
&& profile.priority > existing_profile.priority)
|
||||||
|
{
|
||||||
|
best_profile = Some((profile.clone(), result));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
best_profile
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Apply a profile to extraction metadata.
|
||||||
|
///
|
||||||
|
/// Populates profile_name, profile_version, and profile_fields in the
|
||||||
|
/// extraction metadata.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `profile` - The profile that was applied
|
||||||
|
/// * `metadata` - The extraction metadata to update (this must be the full ExtractionMetadata from extract module)
|
||||||
|
/// * `pages` - Extracted pages for field extraction
|
||||||
|
///
|
||||||
|
/// # Note
|
||||||
|
///
|
||||||
|
/// This function requires the full ExtractionMetadata from the extract module.
|
||||||
|
/// Due to the module structure, we update metadata through a closure that
|
||||||
|
/// can access the internal fields.
|
||||||
|
pub fn apply_profile_to_metadata(
|
||||||
|
profile: &ExtractionProfile,
|
||||||
|
pages: &[PageJson],
|
||||||
|
) -> (String, String, Option<serde_json::Value>) {
|
||||||
|
let profile_name = profile.name.clone();
|
||||||
|
let profile_version = "1.0.0".to_string(); // Profile version schema
|
||||||
|
|
||||||
|
// Extract fields if the profile has field specifications
|
||||||
|
let profile_fields = if !profile.fields.is_empty() {
|
||||||
|
// Collect all blocks from all pages
|
||||||
|
let all_blocks: Vec<BlockJson> = pages.iter().flat_map(|p| p.blocks.clone()).collect();
|
||||||
|
|
||||||
|
// Build full text from all spans
|
||||||
|
let full_text = pages
|
||||||
|
.iter()
|
||||||
|
.flat_map(|p| p.spans.iter().map(|s| s.text.clone()))
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join(" ");
|
||||||
|
|
||||||
|
// Extract profile fields
|
||||||
|
let field_results =
|
||||||
|
field_extractor::extract_profile_fields(&profile.fields, &all_blocks, &full_text);
|
||||||
|
|
||||||
|
// Convert to JSON object
|
||||||
|
let mut fields_obj = serde_json::Map::new();
|
||||||
|
for (field_name, result) in field_results {
|
||||||
|
fields_obj.insert(field_name, result.value);
|
||||||
|
}
|
||||||
|
|
||||||
|
Some(json!(fields_obj))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
(profile_name, profile_version, profile_fields)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a block fixture of the given kind with the given bounding box.
    // No test below calls this helper; the allow keeps the test build free of
    // dead-code warnings.
    #[allow(dead_code)]
    fn make_test_block(kind: &str, x0: f64, y0: f64, x1: f64, y1: f64) -> BlockJson {
        BlockJson {
            id: format!("block_{}", kind),
            kind: kind.to_string(),
            bbox: Some(vec![x0, y0, x1, y1]),
            spans: vec![0, 1],
            reading_order: Some(0),
            ..Default::default()
        }
    }

    /// All supported output switches set by the profile end up in the options.
    #[test]
    fn test_apply_extraction_tuning() {
        let tuning = ExtractionTuning {
            reading_order: Some("line_dominant".to_string()),
            table_detection: Some("strict_borders".to_string()),
            readability_threshold: Some(0.4),
            include_invisible: Some(true),
            include_headers_footers: Some(true),
            zone_filtering: None,
            force_ocr: Some(false),
            min_block_chars: Some(10),
        };

        let mut options = ExtractionOptions::default();

        apply_extraction_tuning(&tuning, &mut options);

        // Check that output options were applied
        assert!(options.output.include_invisible);
        assert!(options.output.include_headers);
        assert!(options.output.include_footers);
    }

    /// Fields left as `None` must not change the corresponding options.
    #[test]
    fn test_apply_extraction_tuning_partial() {
        let tuning = ExtractionTuning {
            reading_order: None,
            table_detection: None,
            readability_threshold: None,
            include_invisible: Some(false),
            include_headers_footers: None,
            zone_filtering: None,
            force_ocr: None,
            min_block_chars: None,
        };

        let mut options = ExtractionOptions::default();

        apply_extraction_tuning(&tuning, &mut options);

        assert!(!options.output.include_invisible);
        assert!(!options.output.include_headers);
        assert!(!options.output.include_footers);
    }

    /// With no profiles available, classification yields no match.
    #[test]
    fn test_classify_and_select_profile_no_match() {
        // Empty profiles list
        let profiles: Vec<ExtractionProfile> = vec![];
        // Element type must match the `page_data` parameter: (blocks, spans) per page.
        let page_data: Vec<(Vec<BlockJson>, Vec<SpanJson>)> = vec![];

        let result = classify_and_select_profile(&profiles, &page_data, false, false);

        assert!(result.is_none());
    }

    /// A profile without field specs reports name and version but no fields.
    #[test]
    fn test_apply_profile_to_metadata_no_fields() {
        let profile_yaml = r#"
name: test
description: Test profile
priority: 10
"#;

        let profile: ExtractionProfile = serde_yaml::from_str(profile_yaml).unwrap();
        let pages: Vec<PageJson> = vec![];

        let (name, version, fields) = apply_profile_to_metadata(&profile, &pages);

        assert_eq!(name, "test");
        assert_eq!(version, "1.0.0");
        assert!(fields.is_none());
    }
}
|
||||||
437
crates/pdftract-core/src/profiles/extraction.rs
Normal file
437
crates/pdftract-core/src/profiles/extraction.rs
Normal file
|
|
@ -0,0 +1,437 @@
|
||||||
|
//! Extraction profile types (Phase 7.10).
|
||||||
|
//!
|
||||||
|
//! This module defines the rich extraction profile format that extends Phase 5.6
|
||||||
|
//! classification with extraction tuning and field extraction. Extraction profiles
|
||||||
|
//! use a boolean match DSL (all/any/none combinators) and can override extraction
|
||||||
|
//! options and extract structured fields.
|
||||||
|
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
/// Extraction profile with match DSL, extraction tuning, and field extraction.
|
||||||
|
///
|
||||||
|
/// This is the Phase 7.10 profile format, separate from the Phase 5.6 classification
|
||||||
|
/// `Profile` type. Extraction profiles drive both classification (via match DSL)
|
||||||
|
/// and extraction behavior (via tuning and field specs).
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct ExtractionProfile {
|
||||||
|
/// Profile name (e.g., "invoice", "receipt")
|
||||||
|
pub name: String,
|
||||||
|
|
||||||
|
/// Human-readable description
|
||||||
|
pub description: String,
|
||||||
|
|
||||||
|
/// Priority for profile selection (higher = preferred when multiple match)
|
||||||
|
#[serde(default = "default_priority")]
|
||||||
|
pub priority: u32,
|
||||||
|
|
||||||
|
/// Match DSL expression (boolean tree with all/any/none combinators)
|
||||||
|
#[serde(default)]
|
||||||
|
pub match_expr: MatchExpr,
|
||||||
|
|
||||||
|
/// Extraction tuning overrides (optional)
|
||||||
|
#[serde(default)]
|
||||||
|
pub extraction: Option<ExtractionTuning>,
|
||||||
|
|
||||||
|
/// Field extraction specifications (optional)
|
||||||
|
#[serde(default)]
|
||||||
|
pub fields: HashMap<String, FieldSpec>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Serde default for the `priority` field of an extraction profile.
fn default_priority() -> u32 {
    const DEFAULT_PRIORITY: u32 = 10;
    DEFAULT_PRIORITY
}
|
||||||
|
|
||||||
|
/// Boolean match expression for document classification.
|
||||||
|
///
|
||||||
|
/// Supports all/any/none combinators for building complex matching rules.
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
#[serde(untagged)]
|
||||||
|
pub enum MatchExpr {
|
||||||
|
/// Single predicate
|
||||||
|
Predicate(ExtractionMatchPredicate),
|
||||||
|
|
||||||
|
/// All of these must match
|
||||||
|
All { all: Vec<MatchExpr> },
|
||||||
|
|
||||||
|
/// Any of these can match
|
||||||
|
Any { any: Vec<MatchExpr> },
|
||||||
|
|
||||||
|
/// None of these must match
|
||||||
|
None { none: Vec<MatchExpr> },
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for MatchExpr {
|
||||||
|
fn default() -> Self {
|
||||||
|
// Default to an Any that matches nothing (empty list)
|
||||||
|
MatchExpr::Any { any: Vec::new() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Match predicate primitives for extraction profiles.
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
#[serde(rename_all = "snake_case")]
|
||||||
|
pub enum ExtractionMatchPredicate {
|
||||||
|
/// Text contains any of the given strings
|
||||||
|
TextContains {
|
||||||
|
#[serde(default)]
|
||||||
|
patterns: Vec<String>,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Text matches the given regex
|
||||||
|
TextMatches {
|
||||||
|
pattern: String,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Heading text matches the given regex
|
||||||
|
HeadingMatches {
|
||||||
|
pattern: String,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Document has currency pattern ($\d, €\d, etc.)
|
||||||
|
HasCurrencyPattern {
|
||||||
|
#[serde(default)]
|
||||||
|
has_currency_pattern: bool,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Document has signature fields (AcroForm)
|
||||||
|
HasSignatureField {
|
||||||
|
#[serde(default)]
|
||||||
|
has_signature_field: bool,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Structural predicates (has_table, page_count, etc.)
|
||||||
|
Structural {
|
||||||
|
#[serde(default)]
|
||||||
|
has_table: bool,
|
||||||
|
|
||||||
|
#[serde(default)]
|
||||||
|
has_form_field: bool,
|
||||||
|
|
||||||
|
#[serde(default)]
|
||||||
|
has_math: bool,
|
||||||
|
|
||||||
|
#[serde(flatten)]
|
||||||
|
page_count: Option<PageCountRange>,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Text patterns alias for TextContains
|
||||||
|
#[serde(rename = "text_patterns")]
|
||||||
|
TextContainsAlias {
|
||||||
|
#[serde(default)]
|
||||||
|
patterns: Vec<String>,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Page count range predicate.
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct PageCountRange {
|
||||||
|
#[serde(default)]
|
||||||
|
pub min: Option<u32>,
|
||||||
|
|
||||||
|
#[serde(default)]
|
||||||
|
pub max: Option<u32>,
|
||||||
|
|
||||||
|
#[serde(default)]
|
||||||
|
pub hint: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extraction tuning overrides.
|
||||||
|
///
|
||||||
|
/// These fields override the default ExtractionOptions when a profile matches.
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct ExtractionTuning {
|
||||||
|
/// Reading order algorithm
|
||||||
|
pub reading_order: Option<String>,
|
||||||
|
|
||||||
|
/// Table detection mode
|
||||||
|
pub table_detection: Option<String>,
|
||||||
|
|
||||||
|
/// Readability threshold (0.0-1.0)
|
||||||
|
pub readability_threshold: Option<f32>,
|
||||||
|
|
||||||
|
/// Include invisible text
|
||||||
|
pub include_invisible: Option<bool>,
|
||||||
|
|
||||||
|
/// Include headers and footers
|
||||||
|
pub include_headers_footers: Option<bool>,
|
||||||
|
|
||||||
|
/// Zone filtering mode
|
||||||
|
pub zone_filtering: Option<String>,
|
||||||
|
|
||||||
|
/// Force OCR
|
||||||
|
pub force_ocr: Option<bool>,
|
||||||
|
|
||||||
|
/// Minimum block characters
|
||||||
|
pub min_block_chars: Option<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Field extraction specification.
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct FieldSpec {
|
||||||
|
/// Field type (string, decimal, date, int, bool, array)
|
||||||
|
#[serde(rename = "type")]
|
||||||
|
pub field_type: String,
|
||||||
|
|
||||||
|
/// Extraction specification
|
||||||
|
pub extraction: FieldExtraction,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Field extraction definition.
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
#[serde(untagged)]
|
||||||
|
pub enum FieldExtraction {
|
||||||
|
/// Simple pattern-based extraction
|
||||||
|
Patterns {
|
||||||
|
patterns: Vec<String>,
|
||||||
|
#[serde(default)]
|
||||||
|
fallback: Option<serde_yaml::Value>,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Rich extraction with localizers and extractors
|
||||||
|
Rich {
|
||||||
|
/// Regex pattern
|
||||||
|
#[serde(default)]
|
||||||
|
regex: Option<String>,
|
||||||
|
|
||||||
|
/// Near anchors (search near these strings)
|
||||||
|
#[serde(default)]
|
||||||
|
near: Option<Vec<String>>,
|
||||||
|
|
||||||
|
/// Maximum distance in points
|
||||||
|
#[serde(default)]
|
||||||
|
max_distance_pt: Option<usize>,
|
||||||
|
|
||||||
|
/// Region specification
|
||||||
|
#[serde(default)]
|
||||||
|
region: Option<String>,
|
||||||
|
|
||||||
|
/// Pick strategy (largest_font, smallest_font, nearest_below, nearest_right, first, last)
|
||||||
|
#[serde(default)]
|
||||||
|
pick: Option<String>,
|
||||||
|
|
||||||
|
/// Parse type (decimal, date, int, bool, string)
|
||||||
|
#[serde(default)]
|
||||||
|
parse: Option<String>,
|
||||||
|
|
||||||
|
/// After field (for ordering)
|
||||||
|
#[serde(default)]
|
||||||
|
after: Option<String>,
|
||||||
|
|
||||||
|
/// After heading
|
||||||
|
#[serde(default)]
|
||||||
|
after_heading: Option<String>,
|
||||||
|
|
||||||
|
/// Table region for array fields
|
||||||
|
#[serde(default)]
|
||||||
|
table_region: Option<String>,
|
||||||
|
|
||||||
|
/// Columnar regions for array fields
|
||||||
|
#[serde(default)]
|
||||||
|
columnar_regions: Option<String>,
|
||||||
|
|
||||||
|
/// Array schema for structured data
|
||||||
|
#[serde(default)]
|
||||||
|
schema: Option<Vec<FieldSchema>>,
|
||||||
|
|
||||||
|
/// Fallback value
|
||||||
|
#[serde(default)]
|
||||||
|
fallback: Option<serde_yaml::Value>,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Schema field for array extraction.
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct FieldSchema {
|
||||||
|
pub name: String,
|
||||||
|
#[serde(rename = "type")]
|
||||||
|
pub field_type: String,
|
||||||
|
#[serde(default)]
|
||||||
|
pub required: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal profile: only the required keys plus a priority.
    #[test]
    fn test_extraction_profile_basic() {
        let yaml = r#"
name: test
description: Test profile
priority: 50
"#;

        let profile: ExtractionProfile = serde_yaml::from_str(yaml).unwrap();
        assert_eq!(profile.name, "test");
        assert_eq!(profile.description, "Test profile");
        assert_eq!(profile.priority, 50);
    }

    // The fixtures below are the *contents* of the respective profile section.
    // They are deserialized straight into the inner type, so they must not be
    // wrapped in the outer `match:` / `extraction:` / field-name key.

    #[test]
    fn test_match_expr_all() {
        let yaml = r#"
all:
  - text_contains:
      patterns: ["invoice", "bill"]
  - structural:
      has_table: true
"#;

        let expr: MatchExpr = serde_yaml::from_str(yaml).unwrap();
        match expr {
            MatchExpr::All { all } => {
                assert_eq!(all.len(), 2);
            }
            _ => panic!("Expected All"),
        }
    }

    #[test]
    fn test_match_expr_any() {
        let yaml = r#"
any:
  - text_contains:
      patterns: ["receipt"]
  - text_matches:
      pattern: "\\d+\\.\\d{2}"
"#;

        let expr: MatchExpr = serde_yaml::from_str(yaml).unwrap();
        match expr {
            MatchExpr::Any { any } => {
                assert_eq!(any.len(), 2);
            }
            _ => panic!("Expected Any"),
        }
    }

    #[test]
    fn test_match_expr_none() {
        let yaml = r#"
none:
  - text_contains:
      patterns: ["abstract", "bibliography"]
"#;

        let expr: MatchExpr = serde_yaml::from_str(yaml).unwrap();
        match expr {
            MatchExpr::None { none } => {
                assert_eq!(none.len(), 1);
            }
            _ => panic!("Expected None"),
        }
    }

    #[test]
    fn test_extraction_tuning() {
        let yaml = r#"
reading_order: xy_cut
table_detection: strict_borders
readability_threshold: 0.4
include_invisible: false
"#;

        let tuning: ExtractionTuning = serde_yaml::from_str(yaml).unwrap();
        assert_eq!(tuning.reading_order, Some("xy_cut".to_string()));
        assert_eq!(tuning.table_detection, Some("strict_borders".to_string()));
        assert_eq!(tuning.readability_threshold, Some(0.4));
        assert_eq!(tuning.include_invisible, Some(false));
    }

    #[test]
    fn test_field_spec_simple() {
        let yaml = r#"
type: decimal
extraction:
  patterns:
    - "\\$\\s*(\\d+\\.\\d{2})"
  fallback: null
"#;

        let field: FieldSpec = serde_yaml::from_str(yaml).unwrap();
        assert_eq!(field.field_type, "decimal");
        match field.extraction {
            FieldExtraction::Patterns { patterns, .. } => {
                assert_eq!(patterns.len(), 1);
            }
            _ => panic!("Expected Patterns"),
        }
    }

    #[test]
    fn test_field_spec_rich() {
        let yaml = r#"
type: string
extraction:
  regex: "Invoice\\s*#\\s*([\\w-]+)"
  near: ["Invoice", "Invoice Number"]
  max_distance_pt: 200
"#;

        let field: FieldSpec = serde_yaml::from_str(yaml).unwrap();
        assert_eq!(field.field_type, "string");
        match field.extraction {
            FieldExtraction::Rich { regex, near, max_distance_pt, .. } => {
                assert!(regex.is_some());
                assert!(near.is_some());
                assert_eq!(max_distance_pt, Some(200));
            }
            _ => panic!("Expected Rich"),
        }
    }

    /// A complete profile document parses and survives a serialize/deserialize cycle.
    #[test]
    fn test_full_profile_roundtrip() {
        let yaml = r#"
name: invoice
description: Commercial invoice with line items
priority: 50

match:
  all:
    - any:
        - text_contains:
            patterns: ["invoice", "bill to"]
        - heading_matches:
            pattern: "^Invoice\\b"
    - structural:
        has_table: true

extraction:
  reading_order: line_dominant
  table_detection: strict_borders
  readability_threshold: 0.4

fields:
  invoice_number:
    type: string
    extraction:
      regex: "Invoice\\s*#\\s*([\\w-]+)"
      near: ["Invoice"]
  total:
    type: decimal
    extraction:
      patterns:
        - "total.*([\\d,]+\\.\\d{2})"
      fallback: null
"#;

        let profile: ExtractionProfile = serde_yaml::from_str(yaml).unwrap();
        assert_eq!(profile.name, "invoice");
        assert_eq!(profile.priority, 50);
        assert!(profile.extraction.is_some());
        assert_eq!(profile.fields.len(), 2);

        // Round-trip
        let yaml_out = serde_yaml::to_string(&profile).unwrap();
        let profile2: ExtractionProfile = serde_yaml::from_str(&yaml_out).unwrap();
        assert_eq!(profile2.name, profile.name);
    }
}
|
||||||
374
crates/pdftract-core/src/profiles/extraction_loader.rs
Normal file
374
crates/pdftract-core/src/profiles/extraction_loader.rs
Normal file
|
|
@ -0,0 +1,374 @@
|
||||||
|
//! Extraction profile loader (Phase 7.10).
|
||||||
|
//!
|
||||||
|
//! Loads extraction profiles from built-in sources, system directories,
|
||||||
|
//! XDG config paths, and custom --profile-dir flags.
|
||||||
|
|
||||||
|
use super::extraction::ExtractionProfile;
|
||||||
|
use super::loader::{check_forbidden_keys, ForbiddenKeyError, ProfileLoadError};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::fs;
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
|
/// Profile source with priority metadata.
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct ProfileSource {
|
||||||
|
/// The loaded profile
|
||||||
|
pub profile: ExtractionProfile,
|
||||||
|
|
||||||
|
/// Where this profile came from
|
||||||
|
pub source: ProfileOrigin,
|
||||||
|
|
||||||
|
/// Whether this overrides a built-in profile
|
||||||
|
pub overrides_builtin: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Where a profile was loaded from.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ProfileOrigin {
    /// Compiled into the binary.
    BuiltIn,

    /// System-wide directory (`/etc/pdftract/profiles/`).
    System,

    /// Per-user configuration directory.
    User,

    /// Directory passed via `--profile-dir`; carries that directory's path.
    Custom(PathBuf),
}
|
||||||
|
|
||||||
|
/// Load all extraction profiles from the search path.
|
||||||
|
///
|
||||||
|
/// Search order (lowest to highest priority):
|
||||||
|
/// 1. Built-in profiles (compiled in)
|
||||||
|
/// 2. System directory (/etc/pdftract/profiles/)
|
||||||
|
/// 3. User directory (XDG config: ~/.config/pdftract/profiles/)
|
||||||
|
/// 4. Custom directories (--profile-dir, repeatable)
|
||||||
|
///
|
||||||
|
/// Later sources override earlier ones on name collision.
|
||||||
|
pub fn load_extraction_profiles(
|
||||||
|
custom_dirs: &[PathBuf],
|
||||||
|
) -> Result<Vec<ProfileSource>, ProfileLoadError> {
|
||||||
|
let mut profiles_by_name: HashMap<String, ProfileSource> = HashMap::new();
|
||||||
|
|
||||||
|
// 1. Load built-in profiles
|
||||||
|
load_builtin_profiles(&mut profiles_by_name)?;
|
||||||
|
|
||||||
|
// 2. Load system profiles
|
||||||
|
let system_dir = PathBuf::from("/etc/pdftract/profiles");
|
||||||
|
if system_dir.exists() {
|
||||||
|
load_profiles_from_dir(&system_dir, ProfileOrigin::System, &mut profiles_by_name)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 3. Load user profiles (XDG config)
|
||||||
|
if let Some(user_dir) = get_xdg_profile_dir() {
|
||||||
|
if user_dir.exists() {
|
||||||
|
load_profiles_from_dir(&user_dir, ProfileOrigin::User, &mut profiles_by_name)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 4. Load custom profiles (--profile-dir)
|
||||||
|
for custom_dir in custom_dirs {
|
||||||
|
if custom_dir.exists() {
|
||||||
|
let origin = ProfileOrigin::Custom(custom_dir.clone());
|
||||||
|
load_profiles_from_dir(custom_dir, origin, &mut profiles_by_name)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert to vector, sorted by priority then by name
|
||||||
|
let mut profiles: Vec<ProfileSource> = profiles_by_name.into_values().collect();
|
||||||
|
profiles.sort_by(|a, b| {
|
||||||
|
b.profile
|
||||||
|
.priority
|
||||||
|
.cmp(&a.profile.priority)
|
||||||
|
.then_with(|| a.profile.name.cmp(&b.profile.name))
|
||||||
|
});
|
||||||
|
|
||||||
|
Ok(profiles)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the XDG config directory for pdftract profiles.
|
||||||
|
///
|
||||||
|
/// Returns ~/.config/pdftract/profiles/ or None if XDG config is not available.
|
||||||
|
pub fn get_xdg_profile_dir() -> Option<PathBuf> {
|
||||||
|
dirs::config_dir().map(|dir| dir.join("pdftract").join("profiles"))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Load built-in extraction profiles.
|
||||||
|
///
|
||||||
|
/// These are compiled into the binary via include_str!.
|
||||||
|
fn load_builtin_profiles(
|
||||||
|
profiles: &mut HashMap<String, ProfileSource>,
|
||||||
|
) -> Result<(), ProfileLoadError> {
|
||||||
|
#[cfg(feature = "profiles")]
|
||||||
|
{
|
||||||
|
// Load each built-in profile individually
|
||||||
|
let profile_results: Vec<(&str, Result<ExtractionProfile, ProfileLoadError>)> = vec![
|
||||||
|
("invoice", load_profile_yaml(
|
||||||
|
include_str!("../../../../profiles/builtin/invoice/profile.yaml"),
|
||||||
|
"profiles/builtin/invoice/profile.yaml"
|
||||||
|
)),
|
||||||
|
("receipt", load_profile_yaml(
|
||||||
|
include_str!("../../../../profiles/builtin/receipt/profile.yaml"),
|
||||||
|
"profiles/builtin/receipt/profile.yaml"
|
||||||
|
)),
|
||||||
|
("contract", load_profile_yaml(
|
||||||
|
include_str!("../../../../profiles/builtin/contract/profile.yaml"),
|
||||||
|
"profiles/builtin/contract/profile.yaml"
|
||||||
|
)),
|
||||||
|
("scientific_paper", load_profile_yaml(
|
||||||
|
include_str!("../../../../profiles/builtin/scientific_paper/profile.yaml"),
|
||||||
|
"profiles/builtin/scientific_paper/profile.yaml"
|
||||||
|
)),
|
||||||
|
("slide_deck", load_profile_yaml(
|
||||||
|
include_str!("../../../../profiles/builtin/slide_deck/profile.yaml"),
|
||||||
|
"profiles/builtin/slide_deck/profile.yaml"
|
||||||
|
)),
|
||||||
|
("form", load_profile_yaml(
|
||||||
|
include_str!("../../../../profiles/builtin/form/profile.yaml"),
|
||||||
|
"profiles/builtin/form/profile.yaml"
|
||||||
|
)),
|
||||||
|
("bank_statement", load_profile_yaml(
|
||||||
|
include_str!("../../../../profiles/builtin/bank_statement/profile.yaml"),
|
||||||
|
"profiles/builtin/bank_statement/profile.yaml"
|
||||||
|
)),
|
||||||
|
("legal_filing", load_profile_yaml(
|
||||||
|
include_str!("../../../../profiles/builtin/legal_filing/profile.yaml"),
|
||||||
|
"profiles/builtin/legal_filing/profile.yaml"
|
||||||
|
)),
|
||||||
|
("book_chapter", load_profile_yaml(
|
||||||
|
include_str!("../../../../profiles/builtin/book_chapter/profile.yaml"),
|
||||||
|
"profiles/builtin/book_chapter/profile.yaml"
|
||||||
|
)),
|
||||||
|
];
|
||||||
|
|
||||||
|
for (name, result) in profile_results {
|
||||||
|
match result {
|
||||||
|
Ok(profile) => {
|
||||||
|
profiles.insert(
|
||||||
|
profile.name.clone(),
|
||||||
|
ProfileSource {
|
||||||
|
profile,
|
||||||
|
source: ProfileOrigin::BuiltIn,
|
||||||
|
overrides_builtin: false,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("Failed to parse built-in profile '{}': {}", name, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Load a profile from YAML content.
|
||||||
|
fn load_profile_yaml(content: &str, source_path: &str) -> Result<ExtractionProfile, ProfileLoadError> {
|
||||||
|
// Check for forbidden keys first
|
||||||
|
let yaml_value = serde_yaml::from_str::<serde_yaml::Value>(content)?;
|
||||||
|
|
||||||
|
// Get the original content for line number detection
|
||||||
|
if let Err(e) = check_forbidden_keys(&yaml_value, "", content) {
|
||||||
|
return Err(ProfileLoadError::ForbiddenKey {
|
||||||
|
key: e.key,
|
||||||
|
path: format!("{}: {}", source_path, e.path),
|
||||||
|
line: e.line,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse as ExtractionProfile
|
||||||
|
let profile: ExtractionProfile =
|
||||||
|
serde_yaml::from_str(content).map_err(ProfileLoadError::YamlError)?;
|
||||||
|
|
||||||
|
Ok(profile)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Load profiles from a directory.
|
||||||
|
fn load_profiles_from_dir(
|
||||||
|
dir: &Path,
|
||||||
|
origin: ProfileOrigin,
|
||||||
|
profiles: &mut HashMap<String, ProfileSource>,
|
||||||
|
) -> Result<(), ProfileLoadError> {
|
||||||
|
let entries = fs::read_dir(dir).map_err(ProfileLoadError::IoError)?;
|
||||||
|
|
||||||
|
for entry in entries {
|
||||||
|
let entry = entry.map_err(ProfileLoadError::IoError)?;
|
||||||
|
let path = entry.path();
|
||||||
|
|
||||||
|
// Skip directories
|
||||||
|
if path.is_dir() {
|
||||||
|
// Check for profile.yaml subdirectory (e.g., invoice/profile.yaml)
|
||||||
|
let profile_yaml = path.join("profile.yaml");
|
||||||
|
if profile_yaml.exists() {
|
||||||
|
if let Ok(profile) = load_profile_file(&profile_yaml) {
|
||||||
|
let overrides_builtin = profiles
|
||||||
|
.contains_key(&profile.name)
|
||||||
|
&& matches!(origin, ProfileOrigin::User | ProfileOrigin::Custom(_));
|
||||||
|
|
||||||
|
profiles.insert(
|
||||||
|
profile.name.clone(),
|
||||||
|
ProfileSource {
|
||||||
|
profile,
|
||||||
|
source: origin.clone(),
|
||||||
|
overrides_builtin,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Only load .yaml files
|
||||||
|
if path.extension().and_then(|s| s.to_str()) != Some("yaml") {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Ok(profile) = load_profile_file(&path) {
|
||||||
|
let overrides_builtin = profiles
|
||||||
|
.contains_key(&profile.name)
|
||||||
|
&& matches!(origin, ProfileOrigin::User | ProfileOrigin::Custom(_));
|
||||||
|
|
||||||
|
profiles.insert(
|
||||||
|
profile.name.clone(),
|
||||||
|
ProfileSource {
|
||||||
|
profile,
|
||||||
|
source: origin.clone(),
|
||||||
|
overrides_builtin,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Load a single profile from a file.
|
||||||
|
pub fn load_profile_file(path: &Path) -> Result<ExtractionProfile, ProfileLoadError> {
|
||||||
|
let content = fs::read_to_string(path).map_err(ProfileLoadError::IoError)?;
|
||||||
|
load_profile_yaml(&content, &path.to_string_lossy())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Find a profile by name or path.
|
||||||
|
///
|
||||||
|
/// - If `name_or_path` is an existing file path, load it directly
|
||||||
|
/// - Otherwise, search for a profile with that name in the loaded profiles
|
||||||
|
pub fn find_profile(
|
||||||
|
name_or_path: &str,
|
||||||
|
profiles: &[ProfileSource],
|
||||||
|
) -> Result<ExtractionProfile, ProfileLoadError> {
|
||||||
|
// First, check if it's a file path
|
||||||
|
let path = PathBuf::from(name_or_path);
|
||||||
|
if path.exists() {
|
||||||
|
return load_profile_file(&path);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Search by name
|
||||||
|
for source in profiles {
|
||||||
|
if source.profile.name == name_or_path {
|
||||||
|
return Ok(source.profile.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Err(ProfileLoadError::IoError(std::io::Error::new(
|
||||||
|
std::io::ErrorKind::NotFound,
|
||||||
|
format!("Profile '{}' not found", name_or_path),
|
||||||
|
)))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Validate a profile file without loading it into the profile set.
|
||||||
|
///
|
||||||
|
/// Returns Ok(()) if the profile is valid, Err with details if invalid.
|
||||||
|
pub fn validate_profile_file(path: &Path) -> Result<(), ProfileLoadError> {
|
||||||
|
let content = fs::read_to_string(path).map_err(ProfileLoadError::IoError)?;
|
||||||
|
|
||||||
|
// Check for forbidden keys
|
||||||
|
let yaml_value = serde_yaml::from_str::<serde_yaml::Value>(&content)
|
||||||
|
.map_err(ProfileLoadError::YamlError)?;
|
||||||
|
|
||||||
|
check_forbidden_keys(&yaml_value, "", &content)
|
||||||
|
.map_err(|e| ProfileLoadError::ForbiddenKey {
|
||||||
|
key: e.key,
|
||||||
|
path: e.path,
|
||||||
|
line: e.line,
|
||||||
|
})?;
|
||||||
|
|
||||||
|
// Try to parse as ExtractionProfile
|
||||||
|
let _: ExtractionProfile = serde_yaml::from_str(&content).map_err(ProfileLoadError::YamlError)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Write `yaml` to `test.yaml` in a fresh temporary directory and validate it.
    fn validate_yaml(yaml: &str) -> Result<(), ProfileLoadError> {
        let temp_dir = tempfile::tempdir().unwrap();
        let profile_path = temp_dir.path().join("test.yaml");
        fs::write(&profile_path, yaml).unwrap();

        validate_profile_file(&profile_path)
    }

    #[test]
    fn test_get_xdg_profile_dir() {
        let dir = get_xdg_profile_dir();
        assert!(dir.is_some());

        let path = dir.unwrap();
        assert!(path.ends_with("pdftract/profiles"));
    }

    #[test]
    fn test_load_builtin_profiles() {
        let mut profiles = HashMap::new();
        let result = load_builtin_profiles(&mut profiles);

        #[cfg(feature = "profiles")]
        {
            assert!(result.is_ok());
            // The built-in set must not be empty when the feature is enabled.
            assert!(!profiles.is_empty());
        }
    }

    #[test]
    fn test_validate_simple_profile() {
        let yaml = r#"
name: test
description: Test profile
priority: 10
match:
  text_contains:
    patterns: ["test"]
"#;

        assert!(validate_yaml(yaml).is_ok());
    }

    #[test]
    fn test_validate_profile_with_forbidden_key() {
        let yaml = r#"
name: test
description: Test profile
priority: 10
match:
  text_contains:
    patterns: ["test"]
api_key: "secret"
"#;

        assert!(validate_yaml(yaml).is_err());
    }

    #[test]
    fn test_load_extraction_profiles_empty() {
        let profiles = load_extraction_profiles(&[]).unwrap();

        #[cfg(feature = "profiles")]
        assert!(!profiles.is_empty()); // At least built-ins
    }
}
|
||||||
353
crates/pdftract-core/src/profiles/field_extractor.rs
Normal file
353
crates/pdftract-core/src/profiles/field_extractor.rs
Normal file
|
|
@ -0,0 +1,353 @@
|
||||||
|
//! Field extraction DSL evaluator (Phase 7.10).
|
||||||
|
//!
|
||||||
|
//! Evaluates field extraction specifications from profiles and extracts
|
||||||
|
//! structured fields from document text. Supports:
|
||||||
|
//! - Localizers: near, region, pick
|
||||||
|
//! - Extractors: regex, parse
|
||||||
|
//! - Strategies for disambiguating multiple candidates
|
||||||
|
|
||||||
|
use super::extraction::{FieldExtraction, FieldSchema, FieldSpec};
|
||||||
|
use crate::schema::BlockJson;
|
||||||
|
use regex::Regex;
|
||||||
|
use serde_json::Value;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
/// Convert serde_yaml::Value to serde_json::Value.
|
||||||
|
fn convert_yaml_to_json(yaml_value: &serde_yaml::Value) -> Value {
|
||||||
|
match yaml_value {
|
||||||
|
serde_yaml::Value::Null => Value::Null,
|
||||||
|
serde_yaml::Value::Bool(b) => Value::Bool(*b),
|
||||||
|
serde_yaml::Value::Number(n) => {
|
||||||
|
if let Some(i) = n.as_i64() {
|
||||||
|
Value::Number(i.into())
|
||||||
|
} else if let Some(f) = n.as_f64() {
|
||||||
|
serde_json::Number::from_f64(f).map(Value::Number).unwrap_or(Value::Null)
|
||||||
|
} else {
|
||||||
|
Value::Null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
serde_yaml::Value::String(s) => Value::String(s.clone()),
|
||||||
|
serde_yaml::Value::Sequence(seq) => {
|
||||||
|
Value::Array(seq.iter().map(convert_yaml_to_json).collect())
|
||||||
|
}
|
||||||
|
serde_yaml::Value::Mapping(map) => {
|
||||||
|
let mut obj = serde_json::Map::new();
|
||||||
|
for (k, v) in map {
|
||||||
|
if let serde_yaml::Value::String(key_str) = k {
|
||||||
|
obj.insert(key_str.clone(), convert_yaml_to_json(v));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Value::Object(obj)
|
||||||
|
}
|
||||||
|
serde_yaml::Value::Tagged(tagged) => convert_yaml_to_json(&tagged.value),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Result of field extraction.
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct FieldExtractionResult {
|
||||||
|
/// Extracted field value (null if not found)
|
||||||
|
pub value: Value,
|
||||||
|
/// Human-readable extraction details (for debugging)
|
||||||
|
pub details: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract all fields from a profile against extracted document data.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `fields` - Field specifications from the profile
|
||||||
|
/// * `blocks` - Extracted blocks from the document
|
||||||
|
/// * `full_text` - Full document text
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
///
|
||||||
|
/// A map of field names to extraction results.
|
||||||
|
pub fn extract_profile_fields(
|
||||||
|
fields: &HashMap<String, FieldSpec>,
|
||||||
|
blocks: &[BlockJson],
|
||||||
|
full_text: &str,
|
||||||
|
) -> HashMap<String, FieldExtractionResult> {
|
||||||
|
let mut results = HashMap::new();
|
||||||
|
|
||||||
|
for (field_name, field_spec) in fields {
|
||||||
|
let result = extract_single_field(field_spec, blocks, full_text);
|
||||||
|
results.insert(field_name.clone(), result);
|
||||||
|
}
|
||||||
|
|
||||||
|
results
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract a single field from the document.
|
||||||
|
fn extract_single_field(
|
||||||
|
field_spec: &FieldSpec,
|
||||||
|
blocks: &[BlockJson],
|
||||||
|
full_text: &str,
|
||||||
|
) -> FieldExtractionResult {
|
||||||
|
match &field_spec.extraction {
|
||||||
|
FieldExtraction::Patterns { patterns, fallback } => {
|
||||||
|
let json_fallback = fallback.as_ref().map(convert_yaml_to_json);
|
||||||
|
extract_by_patterns(patterns, full_text, &json_fallback)
|
||||||
|
}
|
||||||
|
FieldExtraction::Rich {
|
||||||
|
regex,
|
||||||
|
near,
|
||||||
|
max_distance_pt,
|
||||||
|
region,
|
||||||
|
pick,
|
||||||
|
parse,
|
||||||
|
after: _,
|
||||||
|
after_heading: _,
|
||||||
|
table_region: _,
|
||||||
|
columnar_regions: _,
|
||||||
|
schema: _,
|
||||||
|
fallback,
|
||||||
|
} => {
|
||||||
|
let json_fallback = fallback.as_ref().map(convert_yaml_to_json);
|
||||||
|
extract_rich(
|
||||||
|
regex,
|
||||||
|
near,
|
||||||
|
max_distance_pt,
|
||||||
|
region,
|
||||||
|
pick,
|
||||||
|
parse,
|
||||||
|
blocks,
|
||||||
|
full_text,
|
||||||
|
&json_fallback,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract using simple pattern matching (fallback mode).
|
||||||
|
fn extract_by_patterns(
|
||||||
|
patterns: &[String],
|
||||||
|
full_text: &str,
|
||||||
|
fallback: &Option<Value>,
|
||||||
|
) -> FieldExtractionResult {
|
||||||
|
for pattern in patterns {
|
||||||
|
if let Ok(re) = Regex::new(pattern) {
|
||||||
|
if let Some(captures) = re.captures(full_text) {
|
||||||
|
// Use first capture group if available, otherwise full match
|
||||||
|
let value = captures
|
||||||
|
.get(1)
|
||||||
|
.or(captures.get(0))
|
||||||
|
.map(|m| m.as_str())
|
||||||
|
.unwrap_or("");
|
||||||
|
|
||||||
|
return FieldExtractionResult {
|
||||||
|
value: Value::String(value.to_string()),
|
||||||
|
details: format!("Matched pattern '{}': '{}'", pattern, value),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// No match - use fallback or null
|
||||||
|
FieldExtractionResult {
|
||||||
|
value: fallback.clone().unwrap_or(Value::Null),
|
||||||
|
details: "No patterns matched, using fallback".to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract using rich field extraction with localizers and extractors.
|
||||||
|
fn extract_rich(
|
||||||
|
regex: &Option<String>,
|
||||||
|
near: &Option<Vec<String>>,
|
||||||
|
_max_distance_pt: &Option<usize>,
|
||||||
|
_region: &Option<String>,
|
||||||
|
_pick: &Option<String>,
|
||||||
|
parse: &Option<String>,
|
||||||
|
_blocks: &[BlockJson],
|
||||||
|
full_text: &str,
|
||||||
|
fallback: &Option<Value>,
|
||||||
|
) -> FieldExtractionResult {
|
||||||
|
// For rich extraction, we need to find text near anchors
|
||||||
|
// This is a simplified version that searches the full text
|
||||||
|
|
||||||
|
// Find anchor position if "near" is specified
|
||||||
|
let search_text = if let Some(anchors) = near {
|
||||||
|
// Find the position of the first anchor in the text
|
||||||
|
let anchor_pos = anchors
|
||||||
|
.iter()
|
||||||
|
.find_map(|anchor| full_text.find(anchor))
|
||||||
|
.unwrap_or(0);
|
||||||
|
|
||||||
|
// Search in text after the anchor
|
||||||
|
if let Some(pos) = full_text.get(anchor_pos..) {
|
||||||
|
pos
|
||||||
|
} else {
|
||||||
|
full_text
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
full_text
|
||||||
|
};
|
||||||
|
|
||||||
|
// Extract value using regex
|
||||||
|
let raw_value = if let Some(pattern) = regex {
|
||||||
|
extract_with_regex(pattern, search_text)
|
||||||
|
} else {
|
||||||
|
// If no regex, use the first few words from search text
|
||||||
|
search_text
|
||||||
|
.split_whitespace()
|
||||||
|
.next()
|
||||||
|
.unwrap_or("")
|
||||||
|
.to_string()
|
||||||
|
};
|
||||||
|
|
||||||
|
// Parse value according to type
|
||||||
|
let parsed_value = parse_value(&raw_value, parse.as_deref());
|
||||||
|
|
||||||
|
FieldExtractionResult {
|
||||||
|
value: parsed_value,
|
||||||
|
details: format!("Extracted value: '{}'", raw_value),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract value using regex.
|
||||||
|
fn extract_with_regex(pattern: &str, text: &str) -> String {
|
||||||
|
match Regex::new(pattern) {
|
||||||
|
Ok(re) => {
|
||||||
|
if let Some(captures) = re.captures(text) {
|
||||||
|
captures
|
||||||
|
.get(1)
|
||||||
|
.or(captures.get(0))
|
||||||
|
.map(|m| m.as_str().to_string())
|
||||||
|
.unwrap_or_default()
|
||||||
|
} else {
|
||||||
|
String::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(_) => String::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse a value according to the specified type.
|
||||||
|
fn parse_value(raw: &str, parse_type: Option<&str>) -> Value {
|
||||||
|
let raw = raw.trim();
|
||||||
|
|
||||||
|
match parse_type {
|
||||||
|
Some("decimal") => {
|
||||||
|
// Clean up currency symbols and commas
|
||||||
|
let cleaned = raw
|
||||||
|
.replace('$', "")
|
||||||
|
.replace('€', "")
|
||||||
|
.replace('£', "")
|
||||||
|
.replace('¥', "")
|
||||||
|
.replace(',', "");
|
||||||
|
|
||||||
|
cleaned
|
||||||
|
.parse::<f64>()
|
||||||
|
.ok()
|
||||||
|
.and_then(|v| serde_json::Number::from_f64(v))
|
||||||
|
.map(Value::Number)
|
||||||
|
.unwrap_or(Value::Null)
|
||||||
|
}
|
||||||
|
Some("int") => raw
|
||||||
|
.parse::<i64>()
|
||||||
|
.map(Value::Number)
|
||||||
|
.unwrap_or(Value::Null),
|
||||||
|
Some("bool") => {
|
||||||
|
let lower = raw.to_lowercase();
|
||||||
|
Value::Bool(lower == "true" || lower == "yes" || lower == "1")
|
||||||
|
}
|
||||||
|
Some("date") => {
|
||||||
|
// Try to parse as ISO date or return string
|
||||||
|
if raw.len() >= 10 && raw.chars().nth(4) == Some('-') {
|
||||||
|
Value::String(raw.to_string())
|
||||||
|
} else {
|
||||||
|
Value::String(raw.to_string())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some("string") | None => Value::String(raw.to_string()),
|
||||||
|
_ => Value::String(raw.to_string()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Build the JSON number that `parse_value` is expected to return for a decimal.
    fn decimal(v: f64) -> Value {
        Value::Number(serde_json::Number::from_f64(v).unwrap())
    }

    #[test]
    fn test_extract_by_patterns_simple() {
        let outcome = extract_by_patterns(
            &[r"Invoice #(\w+)".to_string()],
            "Invoice #12345\nTotal: $100.00",
            &None,
        );

        assert_eq!(outcome.value, "12345");
        assert!(outcome.details.contains("Matched pattern"));
    }

    #[test]
    fn test_extract_by_patterns_no_match() {
        let default_value = Some(Value::String("UNKNOWN".to_string()));

        let outcome = extract_by_patterns(
            &[r"Invoice #(\w+)".to_string()],
            "Receipt #ABC",
            &default_value,
        );

        assert_eq!(outcome.value, "UNKNOWN");
        assert!(outcome.details.contains("No patterns matched"));
    }

    #[test]
    fn test_parse_value_decimal() {
        assert_eq!(parse_value("100.50", Some("decimal")), decimal(100.50));
        assert_eq!(parse_value("$1,234.56", Some("decimal")), decimal(1234.56));
        assert_eq!(parse_value("invalid", Some("decimal")), Value::Null);
    }

    #[test]
    fn test_parse_value_int() {
        assert_eq!(parse_value("42", Some("int")), Value::Number(42.into()));
        assert_eq!(parse_value("invalid", Some("int")), Value::Null);
    }

    #[test]
    fn test_parse_value_bool() {
        for truthy in ["true", "yes"] {
            assert_eq!(parse_value(truthy, Some("bool")), Value::Bool(true));
        }
        for falsy in ["false", "no"] {
            assert_eq!(parse_value(falsy, Some("bool")), Value::Bool(false));
        }
    }

    #[test]
    fn test_parse_value_date() {
        assert_eq!(
            parse_value("2025-01-15", Some("date")),
            Value::String("2025-01-15".to_string())
        );
    }

    #[test]
    fn test_parse_value_string() {
        assert_eq!(
            parse_value("hello", Some("string")),
            Value::String("hello".to_string())
        );
        assert_eq!(parse_value("world", None), Value::String("world".to_string()));
    }

    #[test]
    fn test_extract_with_regex() {
        let extracted = extract_with_regex(r"Invoice:\s*([\w-]+)", "Invoice: INV-2025-00123");
        assert_eq!(extracted, "INV-2025-00123");
    }

    #[test]
    fn test_extract_with_regex_no_match() {
        let extracted = extract_with_regex(r"Invoice:\s*([\w-]+)", "Receipt: R-123");
        assert!(extracted.is_empty());
    }
}
|
||||||
528
crates/pdftract-core/src/profiles/match_eval.rs
Normal file
528
crates/pdftract-core/src/profiles/match_eval.rs
Normal file
|
|
@ -0,0 +1,528 @@
|
||||||
|
//! Match DSL evaluator for extraction profiles.
|
||||||
|
//!
|
||||||
|
//! Evaluates boolean match expressions (all/any/none combinators) against
|
||||||
|
//! document signals to determine if a profile matches a document.
|
||||||
|
|
||||||
|
use super::engine::FeatureSignals;
|
||||||
|
use super::extraction::{ExtractionMatchPredicate, MatchExpr, PageCountRange};
|
||||||
|
use regex::Regex;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::Mutex;
|
||||||
|
|
||||||
|
/// Result of match evaluation.
///
/// The default value is "not matched", with no reasons and a confidence of `0.0`.
/// Derives `PartialEq` so results can be compared directly (e.g. in `assert_eq!`).
#[derive(Debug, Clone, Default, PartialEq)]
pub struct MatchResult {
    /// Whether the match succeeded
    pub matched: bool,

    /// Human-readable reasons for the match (for debugging/metadata)
    pub reasons: Vec<String>,

    /// Confidence score (0.0-1.0)
    pub confidence: f32,
}
|
||||||
|
|
||||||
|
/// Evaluate a match expression against document signals.
|
||||||
|
///
|
||||||
|
/// Returns a MatchResult indicating whether the expression matched and
|
||||||
|
/// providing reasons for the decision.
|
||||||
|
pub fn evaluate_match(expr: &MatchExpr, signals: &FeatureSignals) -> MatchResult {
|
||||||
|
match expr {
|
||||||
|
MatchExpr::Predicate(pred) => evaluate_predicate(pred, signals),
|
||||||
|
MatchExpr::All { all } => {
|
||||||
|
let mut result = MatchResult {
|
||||||
|
matched: true,
|
||||||
|
reasons: Vec::new(),
|
||||||
|
confidence: 1.0,
|
||||||
|
};
|
||||||
|
|
||||||
|
for sub_expr in all {
|
||||||
|
let sub_result = evaluate_match(sub_expr, signals);
|
||||||
|
result.reasons.extend(sub_result.reasons);
|
||||||
|
|
||||||
|
if !sub_result.matched {
|
||||||
|
result.matched = false;
|
||||||
|
// Keep collecting reasons for debugging
|
||||||
|
}
|
||||||
|
result.confidence = result.confidence.min(sub_result.confidence);
|
||||||
|
}
|
||||||
|
|
||||||
|
if result.matched {
|
||||||
|
result.reasons.push("all: all sub-expressions matched".to_string());
|
||||||
|
} else {
|
||||||
|
result.reasons.push("all: some sub-expressions did not match".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
result
|
||||||
|
}
|
||||||
|
MatchExpr::Any { any } => {
|
||||||
|
let mut best_result = MatchResult {
|
||||||
|
matched: false,
|
||||||
|
reasons: Vec::new(),
|
||||||
|
confidence: 0.0,
|
||||||
|
};
|
||||||
|
|
||||||
|
for sub_expr in any {
|
||||||
|
let sub_result = evaluate_match(sub_expr, signals);
|
||||||
|
|
||||||
|
if sub_result.matched {
|
||||||
|
best_result.matched = true;
|
||||||
|
best_result.confidence = best_result.confidence.max(sub_result.confidence);
|
||||||
|
}
|
||||||
|
|
||||||
|
best_result.reasons.extend(sub_result.reasons);
|
||||||
|
}
|
||||||
|
|
||||||
|
if best_result.matched {
|
||||||
|
best_result
|
||||||
|
.reasons
|
||||||
|
.push("any: at least one sub-expression matched".to_string());
|
||||||
|
} else {
|
||||||
|
best_result
|
||||||
|
.reasons
|
||||||
|
.push("any: no sub-expressions matched".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
best_result
|
||||||
|
}
|
||||||
|
MatchExpr::None { none } => {
|
||||||
|
let mut result = MatchResult {
|
||||||
|
matched: true,
|
||||||
|
reasons: Vec::new(),
|
||||||
|
confidence: 1.0,
|
||||||
|
};
|
||||||
|
|
||||||
|
for sub_expr in none {
|
||||||
|
let sub_result = evaluate_match(sub_expr, signals);
|
||||||
|
|
||||||
|
if sub_result.matched {
|
||||||
|
result.matched = false;
|
||||||
|
result.confidence = 0.0;
|
||||||
|
result
|
||||||
|
.reasons
|
||||||
|
.push(format!("none: excluded sub-expression matched: {:?}", sub_result.reasons));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if result.matched {
|
||||||
|
result.reasons.push("none: no excluded sub-expressions matched".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
result
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Evaluate a single predicate against document signals.
|
||||||
|
fn evaluate_predicate(pred: &ExtractionMatchPredicate, signals: &FeatureSignals) -> MatchResult {
|
||||||
|
match pred {
|
||||||
|
ExtractionMatchPredicate::TextContains { patterns } => {
|
||||||
|
let text_lower = signals.text.to_lowercase();
|
||||||
|
|
||||||
|
for pattern in patterns {
|
||||||
|
if text_lower.contains(&pattern.to_lowercase()) {
|
||||||
|
return MatchResult {
|
||||||
|
matched: true,
|
||||||
|
reasons: vec![format!("text_contains: found '{}'", pattern)],
|
||||||
|
confidence: 0.8,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MatchResult {
|
||||||
|
matched: false,
|
||||||
|
reasons: vec!["text_contains: no patterns found".to_string()],
|
||||||
|
confidence: 0.0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ExtractionMatchPredicate::TextMatches { pattern } => {
|
||||||
|
let regex = match compile_regex(pattern) {
|
||||||
|
Ok(re) => re,
|
||||||
|
Err(e) => {
|
||||||
|
return MatchResult {
|
||||||
|
matched: false,
|
||||||
|
reasons: vec![format!("text_matches: invalid regex '{}': {}", pattern, e)],
|
||||||
|
confidence: 0.0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if regex.is_match(&signals.text) {
|
||||||
|
MatchResult {
|
||||||
|
matched: true,
|
||||||
|
reasons: vec![format!("text_matches: pattern '{}' matched", pattern)],
|
||||||
|
confidence: 0.7,
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
MatchResult {
|
||||||
|
matched: false,
|
||||||
|
reasons: vec![format!("text_matches: pattern '{}' did not match", pattern)],
|
||||||
|
confidence: 0.0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ExtractionMatchPredicate::HeadingMatches { pattern } => {
|
||||||
|
let regex = match compile_regex(pattern) {
|
||||||
|
Ok(re) => re,
|
||||||
|
Err(e) => {
|
||||||
|
return MatchResult {
|
||||||
|
matched: false,
|
||||||
|
reasons: vec![format!("heading_matches: invalid regex '{}': {}", pattern, e)],
|
||||||
|
confidence: 0.0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
for heading in &signals.headings {
|
||||||
|
if regex.is_match(heading) {
|
||||||
|
return MatchResult {
|
||||||
|
matched: true,
|
||||||
|
reasons: vec![format!(
|
||||||
|
"heading_matches: heading '{}' matched pattern '{}'",
|
||||||
|
heading, pattern
|
||||||
|
)],
|
||||||
|
confidence: 0.75,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MatchResult {
|
||||||
|
matched: false,
|
||||||
|
reasons: vec![format!("heading_matches: no headings matched '{}'", pattern)],
|
||||||
|
confidence: 0.0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ExtractionMatchPredicate::HasCurrencyPattern {
|
||||||
|
has_currency_pattern: true,
|
||||||
|
} => {
|
||||||
|
let has_currency = has_currency_pattern_impl(&signals.text);
|
||||||
|
MatchResult {
|
||||||
|
matched: has_currency,
|
||||||
|
reasons: vec![if has_currency {
|
||||||
|
"has_currency_pattern: currency pattern found".to_string()
|
||||||
|
} else {
|
||||||
|
"has_currency_pattern: no currency pattern".to_string()
|
||||||
|
}],
|
||||||
|
confidence: if has_currency { 0.6 } else { 0.0 },
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ExtractionMatchPredicate::HasCurrencyPattern {
|
||||||
|
has_currency_pattern: false,
|
||||||
|
} => MatchResult {
|
||||||
|
matched: true, // Negated predicate
|
||||||
|
reasons: vec!["has_currency_pattern: predicate disabled".to_string()],
|
||||||
|
confidence: 0.0,
|
||||||
|
},
|
||||||
|
ExtractionMatchPredicate::HasSignatureField {
|
||||||
|
has_signature_field: true,
|
||||||
|
} => {
|
||||||
|
let has_sig = signals.has_signature_field;
|
||||||
|
MatchResult {
|
||||||
|
matched: has_sig,
|
||||||
|
reasons: vec![if has_sig {
|
||||||
|
"has_signature_field: signature fields found".to_string()
|
||||||
|
} else {
|
||||||
|
"has_signature_field: no signature fields".to_string()
|
||||||
|
}],
|
||||||
|
confidence: if has_sig { 0.5 } else { 0.0 },
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ExtractionMatchPredicate::HasSignatureField {
|
||||||
|
has_signature_field: false,
|
||||||
|
} => MatchResult {
|
||||||
|
matched: true,
|
||||||
|
reasons: vec!["has_signature_field: predicate disabled".to_string()],
|
||||||
|
confidence: 0.0,
|
||||||
|
},
|
||||||
|
ExtractionMatchPredicate::TextContainsAlias { patterns } => {
|
||||||
|
// Alias for TextContains
|
||||||
|
let text_lower = signals.text.to_lowercase();
|
||||||
|
|
||||||
|
for pattern in patterns {
|
||||||
|
if text_lower.contains(&pattern.to_lowercase()) {
|
||||||
|
return MatchResult {
|
||||||
|
matched: true,
|
||||||
|
reasons: vec![format!("text_contains: found '{}'", pattern)],
|
||||||
|
confidence: 0.8,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MatchResult {
|
||||||
|
matched: false,
|
||||||
|
reasons: vec!["text_contains: no patterns found".to_string()],
|
||||||
|
confidence: 0.0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ExtractionMatchPredicate::Structural {
|
||||||
|
has_table,
|
||||||
|
has_form_field,
|
||||||
|
has_math,
|
||||||
|
page_count,
|
||||||
|
} => {
|
||||||
|
let mut matched = true;
|
||||||
|
let mut reasons = Vec::new();
|
||||||
|
let mut min_confidence = 1.0;
|
||||||
|
|
||||||
|
if matches!(has_table, Some(true)) {
|
||||||
|
if signals.table_block_count > 0 {
|
||||||
|
reasons.push(format!("structural.has_table: {} tables found", signals.table_block_count));
|
||||||
|
} else {
|
||||||
|
reasons.push("structural.has_table: no tables found".to_string());
|
||||||
|
matched = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if matches!(has_form_field, Some(true)) {
|
||||||
|
if signals.has_form_field {
|
||||||
|
reasons.push("structural.has_form_field: form fields found".to_string());
|
||||||
|
} else {
|
||||||
|
reasons.push("structural.has_form_field: no form fields found".to_string());
|
||||||
|
matched = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if matches!(has_math, Some(true)) {
|
||||||
|
if signals.has_math_operators {
|
||||||
|
reasons.push("structural.has_math: math operators found".to_string());
|
||||||
|
} else {
|
||||||
|
reasons.push("structural.has_math: no math operators".to_string());
|
||||||
|
matched = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(range) = page_count {
|
||||||
|
let page_count = signals.page_count as u32;
|
||||||
|
let in_range = match (&range.min, &range.max) {
|
||||||
|
(Some(min), Some(max)) => page_count >= *min && page_count <= *max,
|
||||||
|
(Some(min), None) => page_count >= *min,
|
||||||
|
(None, Some(max)) => page_count <= *max,
|
||||||
|
(None, None) => true,
|
||||||
|
};
|
||||||
|
|
||||||
|
if in_range {
|
||||||
|
reasons.push(format!("structural.page_count: {} is in range", page_count));
|
||||||
|
} else {
|
||||||
|
reasons.push(format!(
|
||||||
|
"structural.page_count: {} is out of range {:?}",
|
||||||
|
page_count, range
|
||||||
|
));
|
||||||
|
matched = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MatchResult {
|
||||||
|
matched,
|
||||||
|
reasons,
|
||||||
|
confidence: if matched { min_confidence } else { 0.0 },
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check whether `text` contains a currency symbol (`$`, `€`, `£` or `¥`).
///
/// Only the presence of a symbol is tested; the surrounding characters are not
/// inspected, so both `$100` and `100 €` count. The symbols have no upper or
/// lower case, so the text is scanned as-is without making a lowercased copy.
fn has_currency_pattern_impl(text: &str) -> bool {
    text.contains(|c: char| matches!(c, '$' | '€' | '£' | '¥'))
}
|
||||||
|
|
||||||
|
/// Simple regex cache (thread-safe, LRU-bounded).
|
||||||
|
fn get_regex_cache() -> &'static Mutex<HashMap<String, Regex>> {
|
||||||
|
use std::sync::OnceLock;
|
||||||
|
static CACHE: OnceLock<Mutex<HashMap<String, Regex>>> = OnceLock::new();
|
||||||
|
CACHE.get_or_init(|| Mutex::new(HashMap::new()))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compile a regex pattern with caching.
|
||||||
|
fn compile_regex(pattern: &str) -> Result<Regex, regex::Error> {
|
||||||
|
// Check cache first
|
||||||
|
{
|
||||||
|
let cache = get_regex_cache().lock().unwrap();
|
||||||
|
if let Some(regex) = cache.get(pattern) {
|
||||||
|
return Ok(regex.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compile and cache
|
||||||
|
let regex = Regex::new(pattern)?;
|
||||||
|
let mut cache = get_regex_cache().lock().unwrap();
|
||||||
|
|
||||||
|
// Simple LRU: clear if too many entries
|
||||||
|
if cache.len() > 100 {
|
||||||
|
cache.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
cache.insert(pattern.to_string(), regex.clone());
|
||||||
|
Ok(regex)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Invoice-like fixture shared by every test: two pages, one table block,
    /// no form fields and no math operators.
    fn test_signals() -> FeatureSignals {
        let mut fixture = FeatureSignals {
            text: String::from("Invoice #12345\nTotal: $100.00\nDue date: 2025-01-15"),
            text_pattern_hits: HashMap::new(),
            headings: HashSet::from([String::from("Invoice"), String::from("Total")]),
            page_count: 2,
            table_block_count: 1,
            has_signature_field: false,
            has_form_field: false,
            has_math_operators: false,
            has_bullet_lists: false,
            font_diversity: 3,
            heading_depth: 2,
            glyph_density: 0.9,
            has_footer_page_numbers: false,
        };
        fixture.build_pattern_hits();
        fixture
    }

    /// `TextContains` predicate with a single pattern.
    fn text_contains(needle: &str) -> ExtractionMatchPredicate {
        ExtractionMatchPredicate::TextContains {
            patterns: vec![needle.to_string()],
        }
    }

    /// `Structural` predicate: table required, no form fields, no math, and
    /// the given page-count constraint.
    fn table_only(page_count: Option<PageCountRange>) -> ExtractionMatchPredicate {
        ExtractionMatchPredicate::Structural {
            has_table: Some(true),
            has_form_field: Some(false),
            has_math: Some(false),
            page_count,
        }
    }

    #[test]
    fn test_text_contains_match() {
        let outcome = evaluate_predicate(&text_contains("invoice"), &test_signals());

        assert!(outcome.matched);
        assert_eq!(outcome.confidence, 0.8);
    }

    #[test]
    fn test_text_contains_no_match() {
        let outcome = evaluate_predicate(&text_contains("receipt"), &test_signals());

        assert!(!outcome.matched);
    }

    #[test]
    fn test_heading_matches() {
        let heading = ExtractionMatchPredicate::HeadingMatches {
            pattern: "^Invoice$".to_string(),
        };

        let outcome = evaluate_predicate(&heading, &test_signals());

        assert!(outcome.matched);
    }

    #[test]
    fn test_has_currency_pattern() {
        let currency = ExtractionMatchPredicate::HasCurrencyPattern {
            has_currency_pattern: true,
        };

        let outcome = evaluate_predicate(&currency, &test_signals());

        assert!(outcome.matched);
    }

    #[test]
    fn test_structural_has_table() {
        let one_to_five_pages = PageCountRange {
            min: Some(1),
            max: Some(5),
            hint: None,
        };

        let outcome = evaluate_predicate(&table_only(Some(one_to_five_pages)), &test_signals());

        assert!(outcome.matched);
    }

    #[test]
    fn test_match_expr_all() {
        let both = MatchExpr::All {
            all: vec![
                MatchExpr::Predicate(text_contains("invoice")),
                MatchExpr::Predicate(table_only(None)),
            ],
        };

        let outcome = evaluate_match(&both, &test_signals());

        assert!(outcome.matched);
        assert!(outcome
            .reasons
            .iter()
            .any(|reason| reason.contains("all: all sub-expressions matched")));
    }

    #[test]
    fn test_match_expr_any() {
        let either = MatchExpr::Any {
            any: vec![
                MatchExpr::Predicate(text_contains("receipt")),
                MatchExpr::Predicate(text_contains("invoice")),
            ],
        };

        let outcome = evaluate_match(&either, &test_signals());

        assert!(outcome.matched);
    }

    #[test]
    fn test_match_expr_none() {
        let excluded = MatchExpr::None {
            none: vec![MatchExpr::Predicate(text_contains("abstract"))],
        };

        let outcome = evaluate_match(&excluded, &test_signals());

        assert!(outcome.matched);
    }

    #[test]
    fn test_match_expr_complex() {
        // (invoice OR receipt) AND has_table
        let invoice_or_receipt = MatchExpr::Any {
            any: vec![
                MatchExpr::Predicate(text_contains("invoice")),
                MatchExpr::Predicate(text_contains("receipt")),
            ],
        };
        let combined = MatchExpr::All {
            all: vec![invoice_or_receipt, MatchExpr::Predicate(table_only(None))],
        };

        let outcome = evaluate_match(&combined, &test_signals());

        assert!(outcome.matched);
    }
}
|
||||||
|
|
@ -18,19 +18,35 @@
|
||||||
//! vocabulary between the rule engine, built-in profile definitions, and
|
//! vocabulary between the rule engine, built-in profile definitions, and
|
||||||
//! user-authored YAML profiles.
|
//! user-authored YAML profiles.
|
||||||
|
|
||||||
|
mod apply_profile;
|
||||||
mod engine;
|
mod engine;
|
||||||
|
mod extraction;
|
||||||
|
mod extraction_loader;
|
||||||
|
mod field_extractor;
|
||||||
mod loader;
|
mod loader;
|
||||||
|
mod match_eval;
|
||||||
mod signals;
|
mod signals;
|
||||||
mod types;
|
mod types;
|
||||||
|
|
||||||
|
pub use apply_profile::{apply_extraction_tuning, apply_profile_to_metadata, classify_and_select_profile};
|
||||||
pub use engine::{
|
pub use engine::{
|
||||||
classify, has_currency_pattern, ClassificationResult, ClassifierEngine, FeatureSignals,
|
classify, has_currency_pattern, ClassificationResult, ClassifierEngine, FeatureSignals,
|
||||||
};
|
};
|
||||||
|
pub use extraction::{
|
||||||
|
ExtractionProfile, ExtractionTuning, FieldExtraction, FieldSchema, FieldSpec, MatchExpr,
|
||||||
|
ExtractionMatchPredicate,
|
||||||
|
};
|
||||||
|
pub use extraction_loader::{
|
||||||
|
find_profile, get_xdg_profile_dir, load_extraction_profiles, load_profile_file, ProfileOrigin,
|
||||||
|
ProfileSource, validate_profile_file,
|
||||||
|
};
|
||||||
|
pub use field_extractor::{extract_profile_fields, FieldExtractionResult};
|
||||||
pub use loader::{
|
pub use loader::{
|
||||||
check_forbidden_keys, load_profiles_from_dir, ForbiddenKeyError, ProfileLoadError,
|
check_forbidden_keys, load_profiles_from_dir, ForbiddenKeyError, ProfileLoadError,
|
||||||
};
|
};
|
||||||
|
pub use match_eval::{evaluate_match, MatchResult};
|
||||||
pub use signals::{extract_feature_signals, extract_signals_from_results, PageSignalAccumulator};
|
pub use signals::{extract_feature_signals, extract_signals_from_results, PageSignalAccumulator};
|
||||||
pub use types::{MatchPredicate, Profile, ProfileType};
|
pub use types::{MatchPredicate as ClassificationMatchPredicate, Profile, ProfileType};
|
||||||
|
|
||||||
use crate::diagnostics::DiagCode;
|
use crate::diagnostics::DiagCode;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,55 +1,64 @@
|
||||||
|
# Bank Statement extraction profile
|
||||||
|
# Matches bank statements with account info, period, balances, transactions
|
||||||
|
name: bank_statement
|
||||||
description: Bank statement with account info, period, balances, transactions
|
description: Bank statement with account info, period, balances, transactions
|
||||||
priority: 42
|
priority: 42
|
||||||
|
|
||||||
match:
|
match:
|
||||||
any:
|
all:
|
||||||
- text_patterns:
|
- any:
|
||||||
- "(?i)statement\\s+of\\s+account"
|
- text_contains:
|
||||||
- "(?i)bank\\s+statement"
|
patterns: ["statement of account", "bank statement", "account statement", "transaction history"]
|
||||||
- "(?i)account\\s+statement"
|
- text_contains:
|
||||||
- "(?i)transaction\\s+history"
|
patterns: ["opening balance", "closing balance", "statement period"]
|
||||||
- text_patterns:
|
|
||||||
- "(?i)opening\\s+balance"
|
|
||||||
- "(?i)closing\\s+balance"
|
|
||||||
- "(?i)statement\\s+period"
|
|
||||||
- "(?i)account\\s*#?\\s*:?\\s*\\*{4,}"
|
|
||||||
- structural:
|
- structural:
|
||||||
- has_monetary_columnar_layout: true
|
has_table: true
|
||||||
- has_date_column: true
|
has_form_field: false
|
||||||
page_count_hint: 1-10
|
has_math: false
|
||||||
profile_fields:
|
page_count:
|
||||||
|
min: 1
|
||||||
|
max: 10
|
||||||
|
|
||||||
|
extraction:
|
||||||
|
reading_order: line_dominant
|
||||||
|
table_detection: default
|
||||||
|
readability_threshold: 0.5
|
||||||
|
include_invisible: false
|
||||||
|
include_headers_footers: false
|
||||||
|
force_ocr: false
|
||||||
|
min_block_chars: 0
|
||||||
|
|
||||||
|
fields:
|
||||||
account_number:
|
account_number:
|
||||||
type: string
|
type: string
|
||||||
extraction:
|
extraction:
|
||||||
patterns:
|
regex: "account\\s*(?:number|#|no)?\\s*:?,?\\s*(\\*?\\d[\\d\\*]{3,})"
|
||||||
- "(?i)account\\s*(?:number|#|no)?\\s*:?,?\\s*(\\*?\\d[\\d\\*]{3,})"
|
parse: string
|
||||||
- "(?i)acct\\s*(?:#|:)?\\s*(\\*?\\d[\\d\\*]{3,})"
|
|
||||||
fallback: null
|
|
||||||
statement_period:
|
statement_period:
|
||||||
type: string
|
type: string
|
||||||
extraction:
|
extraction:
|
||||||
patterns:
|
near: ["Statement Period", "Period"]
|
||||||
- "(?i)statement\\s+period\\s*:?.*?([A-Za-z]+\\s+[0-9]{1,2}.*?through.*?[A-Za-z]+\\s+[0-9]{1,2},?\\s+[0-9]{4})"
|
parse: string
|
||||||
- "(?i)period\\s*:?.*?([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})\\s+(?:to|through|-)\\s+([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
|
|
||||||
fallback: null
|
|
||||||
opening_balance:
|
opening_balance:
|
||||||
type: decimal
|
type: decimal
|
||||||
extraction:
|
extraction:
|
||||||
patterns:
|
near: ["Opening Balance", "Beginning Balance"]
|
||||||
- "(?i)opening\\s+balance\\s*:?.*?[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
|
regex: "([\\d,]+\\.\\d{2})"
|
||||||
- "(?i)beginning\\s+balance\\s*:?.*?[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
|
parse: decimal
|
||||||
fallback: null
|
|
||||||
closing_balance:
|
closing_balance:
|
||||||
type: decimal
|
type: decimal
|
||||||
extraction:
|
extraction:
|
||||||
patterns:
|
near: ["Closing Balance", "Ending Balance", "Current Balance"]
|
||||||
- "(?i)closing\\s+balance\\s*:?.*?[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
|
regex: "([\\d,]+\\.\\d{2})"
|
||||||
- "(?i)ending\\s+balance\\s*:?.*?[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
|
parse: decimal
|
||||||
- "(?i)current\\s+balance\\s*:?.*?[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
|
|
||||||
fallback: null
|
|
||||||
transactions:
|
transactions:
|
||||||
type: array
|
type: array
|
||||||
extraction:
|
extraction:
|
||||||
table_region: "largest_table_or_central_body"
|
table_region: largest_table
|
||||||
schema:
|
schema:
|
||||||
- name: date
|
- name: date
|
||||||
type: date
|
type: date
|
||||||
|
|
@ -64,5 +73,3 @@ profile_fields:
|
||||||
type: decimal
|
type: decimal
|
||||||
required: false
|
required: false
|
||||||
fallback: []
|
fallback: []
|
||||||
reading_order: line_dominant
|
|
||||||
zone_filtering: exclude_headers_footers
|
|
||||||
|
|
|
||||||
|
|
@ -1,68 +1,63 @@
|
||||||
# Book Chapter Profile
|
# Book Chapter extraction profile
|
||||||
#
|
# Matches book chapters, monographs, and long-form narrative documents
|
||||||
# Book chapters, monographs, and long-form narrative documents.
|
|
||||||
# Extracts title, chapter_number, author, sections.
|
|
||||||
|
|
||||||
name: book_chapter
|
name: book_chapter
|
||||||
description: Book chapters, monographs, long-form narrative documents
|
description: Book chapters, monographs, long-form narrative documents
|
||||||
priority: 5
|
priority: 5
|
||||||
|
|
||||||
# Matching predicates for book chapter classification
|
|
||||||
match:
|
match:
|
||||||
all:
|
all:
|
||||||
# Page count in typical chapter range (not a whole book, not a single page)
|
|
||||||
- structural:
|
- structural:
|
||||||
page_count: {min: 5, max: 1000}
|
has_table: false
|
||||||
# Heading depth indicates structured content
|
has_form_field: false
|
||||||
- structural:
|
has_math: false
|
||||||
heading_depth: {min: 1, max: 5}
|
page_count:
|
||||||
# AND EITHER: has chapter/section headings
|
min: 5
|
||||||
# OR: has limited font diversity (not a dense academic paper)
|
max: 1000
|
||||||
# OR: matches chapter/section text patterns
|
|
||||||
- any:
|
- any:
|
||||||
- text_matches: '^Chapter \d+'
|
- text_matches:
|
||||||
- heading_matches: '^(Chapter|Part|Section) \d+'
|
pattern: "^Chapter \\d+"
|
||||||
- text_matches: '^\d+\.\s+[A-Z]'
|
- heading_matches:
|
||||||
- structural:
|
pattern: "^(Chapter|Part|Section) \\d+"
|
||||||
font_diversity: {min: 1, max: 4}
|
- text_matches:
|
||||||
|
pattern: "^\\d+\\.\\s+[A-Z]"
|
||||||
none:
|
none:
|
||||||
# Exclude more specific document types
|
- text_contains:
|
||||||
- text_contains: ['Abstract', 'WHEREAS', 'Invoice', 'Account Statement', 'References']
|
patterns: ["Abstract", "WHEREAS", "Invoice", "Account Statement", "References"]
|
||||||
|
|
||||||
# Extraction tuning for book chapters
|
|
||||||
extraction:
|
extraction:
|
||||||
# Use line_dominant reading order for narrative text flow
|
|
||||||
reading_order: line_dominant
|
reading_order: line_dominant
|
||||||
# Default table detection
|
|
||||||
table_detection: default
|
table_detection: default
|
||||||
# Higher readability threshold for narrative text quality
|
|
||||||
readability_threshold: 0.6
|
readability_threshold: 0.6
|
||||||
# Don't include invisible text
|
|
||||||
include_invisible: false
|
include_invisible: false
|
||||||
# Exclude headers, footers, and page numbers from body content
|
|
||||||
include_headers_footers: false
|
include_headers_footers: false
|
||||||
|
force_ocr: false
|
||||||
|
min_block_chars: 0
|
||||||
|
|
||||||
# Field extraction specifications
|
|
||||||
fields:
|
fields:
|
||||||
title:
|
title:
|
||||||
type: string
|
type: string
|
||||||
region: top_third
|
extraction:
|
||||||
pick: largest_font
|
region: top_third
|
||||||
page: first
|
pick: largest_font
|
||||||
|
parse: string
|
||||||
|
|
||||||
chapter_number:
|
chapter_number:
|
||||||
type: string
|
type: string
|
||||||
near: ['Chapter', 'Part']
|
extraction:
|
||||||
regex: '\d+'
|
near: ["Chapter", "Part"]
|
||||||
max_distance_pt: 100
|
regex: "\\d+"
|
||||||
|
max_distance_pt: 100
|
||||||
|
parse: string
|
||||||
|
|
||||||
author:
|
author:
|
||||||
type: string
|
type: string
|
||||||
region: top_quarter
|
extraction:
|
||||||
pick: smallest_font
|
region: top_quarter
|
||||||
page: first
|
pick: smallest_font
|
||||||
|
parse: string
|
||||||
|
|
||||||
sections:
|
sections:
|
||||||
type: array
|
type: array
|
||||||
pick: largest_font
|
extraction:
|
||||||
per_page: true
|
pick: largest_font
|
||||||
|
fallback: []
|
||||||
|
|
|
||||||
|
|
@ -1,38 +1,66 @@
|
||||||
# Contract profile for legal agreements
|
# Contract extraction profile
|
||||||
# Extracts parties, effective date, term, governing law, and signatures from contracts
|
# Matches legal contracts and agreements with parties, effective date, term, governing law, and signatures
|
||||||
name: contract
|
name: contract
|
||||||
description: Legal contracts and agreements with parties, effective date, term, governing law, and signatures
|
description: Legal contracts and agreements with parties, effective date, term, governing law, and signatures
|
||||||
priority: 20
|
priority: 20
|
||||||
|
|
||||||
# Matching predicates: identify documents as contracts
|
|
||||||
match:
|
match:
|
||||||
all:
|
all:
|
||||||
- any:
|
- any:
|
||||||
- text_contains: ["AGREEMENT", "CONTRACT", "WHEREAS", "NOW THEREFORE", "In witness whereof"]
|
- text_contains:
|
||||||
- heading_matches: '^(Agreement|Contract|Memorandum of Understanding)'
|
patterns: ["AGREEMENT", "CONTRACT", "WHEREAS", "NOW THEREFORE", "In witness whereof"]
|
||||||
- structural: {page_count: {min: 2, max: 200}}
|
- heading_matches:
|
||||||
|
pattern: "^(Agreement|Contract|Memorandum of Understanding)"
|
||||||
|
- structural:
|
||||||
|
has_table: false
|
||||||
|
has_form_field: false
|
||||||
|
has_math: false
|
||||||
|
page_count:
|
||||||
|
min: 2
|
||||||
|
max: 200
|
||||||
none:
|
none:
|
||||||
- text_contains: ["Invoice #", "Receipt"]
|
- text_contains:
|
||||||
|
patterns: ["Invoice #", "Receipt"]
|
||||||
|
|
||||||
# Extraction tuning for contracts
|
|
||||||
extraction:
|
extraction:
|
||||||
reading_order: xy_cut
|
reading_order: xy_cut
|
||||||
|
table_detection: off
|
||||||
readability_threshold: 0.5
|
readability_threshold: 0.5
|
||||||
|
include_invisible: false
|
||||||
include_headers_footers: false
|
include_headers_footers: false
|
||||||
|
force_ocr: false
|
||||||
|
min_block_chars: 0
|
||||||
|
|
||||||
# Field extractors for contract-specific metadata
|
|
||||||
fields:
|
fields:
|
||||||
parties:
|
parties:
|
||||||
near: ["between", "party of the first part", "BY AND BETWEEN"]
|
type: string
|
||||||
pick: nearest_below
|
extraction:
|
||||||
|
near: ["between", "party of the first part", "BY AND BETWEEN"]
|
||||||
|
pick: nearest_below
|
||||||
|
parse: string
|
||||||
|
|
||||||
effective_date:
|
effective_date:
|
||||||
near: ["Effective Date", "Date of Agreement", "as of"]
|
type: date
|
||||||
parse: date
|
extraction:
|
||||||
|
near: ["Effective Date", "Date of Agreement", "as of"]
|
||||||
|
parse: date
|
||||||
|
|
||||||
term:
|
term:
|
||||||
near: ["Term", "Initial Term", "expires on", "shall remain in effect"]
|
type: string
|
||||||
regex: '\d+\s+(years?|months?)|expires?\s+\d{4}'
|
extraction:
|
||||||
|
near: ["Term", "Initial Term", "expires on", "shall remain in effect"]
|
||||||
|
regex: "\\d+\\s+(years?|months?)|expires?\\s+\\d{4}"
|
||||||
|
parse: string
|
||||||
|
|
||||||
governing_law:
|
governing_law:
|
||||||
near: ["Governing Law", "governed by the laws of"]
|
type: string
|
||||||
pick: nearest_right
|
extraction:
|
||||||
|
near: ["Governing Law", "governed by the laws of"]
|
||||||
|
pick: nearest_right
|
||||||
|
parse: string
|
||||||
|
|
||||||
signatures:
|
signatures:
|
||||||
region: bottom_quarter
|
type: array
|
||||||
|
extraction:
|
||||||
|
region: bottom_quarter
|
||||||
|
fallback: []
|
||||||
|
|
|
||||||
|
|
@ -1,18 +1,34 @@
|
||||||
|
# Form extraction profile
|
||||||
|
# Matches fillable forms with fields; uses line_dominant reading order
|
||||||
|
name: form
|
||||||
description: Fillable form with fields; uses line_dominant reading order and form_fields from Phase 7.4
|
description: Fillable form with fields; uses line_dominant reading order and form_fields from Phase 7.4
|
||||||
priority: 30
|
priority: 30
|
||||||
|
|
||||||
match:
|
match:
|
||||||
any:
|
all:
|
||||||
- text_patterns:
|
- any:
|
||||||
- "(?i)form\\s*[0-9A-Z-]+"
|
- text_contains:
|
||||||
- "(?i)application\\s+form"
|
patterns: ["form", "application form", "questionnaire", "please fill out", "required fields"]
|
||||||
- "(?i)questionnaire"
|
- structural:
|
||||||
- "(?i)please\\s+fill\\s+out"
|
has_table: false
|
||||||
- "(?i)required\\s+fields?"
|
has_form_field: true
|
||||||
|
has_math: false
|
||||||
|
page_count: null
|
||||||
- structural:
|
- structural:
|
||||||
- has_form_field_layout: true
|
has_table: false
|
||||||
- has_blank_lines_with_colons: true
|
has_form_field: false
|
||||||
page_count_hint: 1-10
|
has_math: false
|
||||||
profile_fields: {}
|
page_count:
|
||||||
reading_order: line_dominant
|
min: 1
|
||||||
zone_filtering: none
|
max: 10
|
||||||
form_fields_integration: true
|
|
||||||
|
extraction:
|
||||||
|
reading_order: line_dominant
|
||||||
|
table_detection: off
|
||||||
|
readability_threshold: 0.5
|
||||||
|
include_invisible: false
|
||||||
|
include_headers_footers: true
|
||||||
|
force_ocr: false
|
||||||
|
min_block_chars: 0
|
||||||
|
|
||||||
|
fields: {}
|
||||||
|
|
|
||||||
|
|
@ -1,81 +1,104 @@
|
||||||
|
# Invoice extraction profile
|
||||||
|
# Matches commercial invoices with line items, vendor/customer, and totals
|
||||||
|
name: invoice
|
||||||
description: Commercial invoice with line items, vendor/customer, and totals
|
description: Commercial invoice with line items, vendor/customer, and totals
|
||||||
priority: 50
|
priority: 50
|
||||||
|
|
||||||
match:
|
match:
|
||||||
any:
|
all:
|
||||||
- text_patterns:
|
- any:
|
||||||
- "(?i)invoice"
|
- text_contains:
|
||||||
- "(?i)bill to"
|
patterns: ["invoice", "bill to", "invoice #", "invoice number", "tax invoice"]
|
||||||
- "(?i)invoice #"
|
- heading_matches:
|
||||||
- "(?i)invoice number"
|
pattern: "^Invoice\\b"
|
||||||
- "(?i)tax invoice"
|
- any:
|
||||||
- text_patterns:
|
- has_currency_pattern:
|
||||||
- "(?i)due date"
|
has_currency_pattern: true
|
||||||
- "(?i)payment terms"
|
- structural:
|
||||||
- "(?i)purchase order"
|
has_table: true
|
||||||
- "(?i)po #"
|
has_form_field: false
|
||||||
- structural:
|
has_math: false
|
||||||
- has_line_item_table: true
|
page_count:
|
||||||
page_count_hint: 1-5
|
min: 1
|
||||||
profile_fields:
|
max: 5
|
||||||
|
none:
|
||||||
|
- text_contains:
|
||||||
|
patterns: ["abstract", "bibliography", "scientific paper"]
|
||||||
|
|
||||||
|
extraction:
|
||||||
|
reading_order: line_dominant
|
||||||
|
table_detection: strict_borders
|
||||||
|
readability_threshold: 0.4
|
||||||
|
include_invisible: false
|
||||||
|
include_headers_footers: false
|
||||||
|
force_ocr: false
|
||||||
|
min_block_chars: 0
|
||||||
|
|
||||||
|
fields:
|
||||||
invoice_number:
|
invoice_number:
|
||||||
type: string
|
type: string
|
||||||
extraction:
|
extraction:
|
||||||
patterns:
|
regex: "Invoice\\s*#\\s*([\\w-]+)"
|
||||||
- "(?i)invoice\\s*[#:]?\\s*([A-Z0-9-]+)"
|
near: ["Invoice", "Invoice Number", "Invoice #"]
|
||||||
- "(?i)bill\\s*invoice\\s*[#:]?\\s*([A-Z0-9-]+)"
|
max_distance_pt: 200
|
||||||
fallback: null
|
parse: string
|
||||||
|
|
||||||
vendor:
|
vendor:
|
||||||
type: string
|
type: string
|
||||||
extraction:
|
extraction:
|
||||||
patterns:
|
region: top_quarter
|
||||||
- "(?i)(?:from|vendor|supplier|company)\\s*:?\\s*([A-Z][A-Za-z0-9\\s&]+?)(?=\\n|\\r|$)"
|
pick: largest_font
|
||||||
- "(?i)^([A-Z][A-Za-z0-9\\s&]+)\\s+(?:Inc|LLC|Ltd|Corp|GmbH)"
|
|
||||||
fallback: null
|
|
||||||
customer:
|
customer:
|
||||||
type: string
|
type: string
|
||||||
extraction:
|
extraction:
|
||||||
patterns:
|
near: ["Bill To", "Customer", "Sold To"]
|
||||||
- "(?i)(?:bill\\s*to|customer|client)\\s*:?\\s*([A-Z][A-Za-z0-9\\s&]+?)(?=\\n|\\r|$)"
|
max_distance_pt: 150
|
||||||
fallback: null
|
pick: nearest_below
|
||||||
|
parse: string
|
||||||
|
|
||||||
invoice_date:
|
invoice_date:
|
||||||
type: date
|
type: date
|
||||||
extraction:
|
extraction:
|
||||||
patterns:
|
near: ["Date", "Invoice Date"]
|
||||||
- "(?i)invoice\\s*date\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
|
max_distance_pt: 100
|
||||||
- "(?i)date\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
|
parse: date
|
||||||
fallback: null
|
|
||||||
due_date:
|
due_date:
|
||||||
type: date
|
type: date
|
||||||
extraction:
|
extraction:
|
||||||
patterns:
|
near: ["Due Date", "Payment Due", "Due"]
|
||||||
- "(?i)due\\s*date\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
|
max_distance_pt: 100
|
||||||
- "(?i)payment\\s*due\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
|
parse: date
|
||||||
fallback: null
|
|
||||||
total:
|
total:
|
||||||
type: decimal
|
type: decimal
|
||||||
extraction:
|
extraction:
|
||||||
patterns:
|
regex: "([\\d,]+\\.\\d{2})"
|
||||||
- "(?i)total\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
|
near: ["Total", "Amount Due", "Balance Due", "Grand Total"]
|
||||||
- "(?i)amount\\s*due\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
|
max_distance_pt: 80
|
||||||
fallback: null
|
parse: decimal
|
||||||
|
|
||||||
subtotal:
|
subtotal:
|
||||||
type: decimal
|
type: decimal
|
||||||
extraction:
|
extraction:
|
||||||
patterns:
|
regex: "([\\d,]+\\.\\d{2})"
|
||||||
- "(?i)sub\\s*total\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
|
near: ["Subtotal", "Sub-Total"]
|
||||||
fallback: null
|
max_distance_pt: 80
|
||||||
|
parse: decimal
|
||||||
|
|
||||||
tax:
|
tax:
|
||||||
type: decimal
|
type: decimal
|
||||||
extraction:
|
extraction:
|
||||||
patterns:
|
regex: "([\\d,]+\\.\\d{2})"
|
||||||
- "(?i)tax\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
|
near: ["Tax", "VAT", "GST", "Sales Tax"]
|
||||||
- "(?i)vat\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
|
max_distance_pt: 80
|
||||||
- "(?i)gst\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
|
parse: decimal
|
||||||
fallback: null
|
|
||||||
line_items:
|
line_items:
|
||||||
type: array
|
type: array
|
||||||
extraction:
|
extraction:
|
||||||
table_region: "largest_table_or_bottom_half"
|
table_region: largest_table
|
||||||
schema:
|
schema:
|
||||||
- name: description
|
- name: description
|
||||||
type: string
|
type: string
|
||||||
|
|
@ -90,5 +113,3 @@ profile_fields:
|
||||||
type: decimal
|
type: decimal
|
||||||
required: false
|
required: false
|
||||||
fallback: []
|
fallback: []
|
||||||
reading_order: line_dominant
|
|
||||||
zone_filtering: exclude_headers_footers
|
|
||||||
|
|
|
||||||
|
|
@ -1,55 +1,62 @@
|
||||||
# Legal Filing Profile
|
# Legal Filing extraction profile
|
||||||
#
|
# Matches court filings: motions, briefs, orders, docket entries
|
||||||
# Court filings: motions, briefs, orders, docket entries.
|
|
||||||
# Extracts case_number, court, parties, filing_date, docket_entries.
|
|
||||||
|
|
||||||
name: legal_filing
|
name: legal_filing
|
||||||
description: "Court filings: motions, briefs, orders, docket entries"
|
description: "Court filings: motions, briefs, orders, docket entries"
|
||||||
priority: 40
|
priority: 40
|
||||||
|
|
||||||
# Matching predicates for legal filing classification
|
|
||||||
match:
|
match:
|
||||||
all:
|
all:
|
||||||
# Must have at least one legal filing marker
|
|
||||||
- any:
|
- any:
|
||||||
- text_contains:
|
- text_contains:
|
||||||
["UNITED STATES DISTRICT COURT", "IN THE COURT OF", "IN THE MATTER OF",
|
patterns: ["UNITED STATES DISTRICT COURT", "IN THE COURT OF", "IN THE MATTER OF", "Case No.", "Civil Action No.", "Plaintiff", "Defendant", "Petitioner", "Respondent", "COMPLAINT", "MOTION TO", "ORDER GRANTING", "OPINION"]
|
||||||
"Case No.", "Civil Action No.", "Plaintiff", "Defendant", "Petitioner",
|
- heading_matches:
|
||||||
"Respondent", "COMPLAINT", "MOTION TO", "ORDER GRANTING", "OPINION"]
|
pattern: "^(COMPLAINT|MOTION|ORDER|OPINION|BRIEF)"
|
||||||
- heading_matches: '^(COMPLAINT|MOTION|ORDER|OPINION|BRIEF)'
|
- structural:
|
||||||
# And appropriate page count
|
has_table: false
|
||||||
- structural: {page_count: {min: 1, max: 500}}
|
has_form_field: false
|
||||||
|
has_math: false
|
||||||
|
page_count:
|
||||||
|
min: 1
|
||||||
|
max: 500
|
||||||
|
|
||||||
# Extraction tuning for legal filings
|
|
||||||
extraction:
|
extraction:
|
||||||
# Use xy_cut reading order for complex layouts
|
|
||||||
reading_order: xy_cut
|
reading_order: xy_cut
|
||||||
# Default table detection
|
|
||||||
table_detection: default
|
table_detection: default
|
||||||
# Standard readability threshold
|
|
||||||
readability_threshold: 0.5
|
readability_threshold: 0.5
|
||||||
# Include headers and footers (page numbers and citations are load-bearing in legal docs)
|
|
||||||
include_headers_footers: true
|
|
||||||
# Don't include invisible text
|
|
||||||
include_invisible: false
|
include_invisible: false
|
||||||
|
include_headers_footers: true
|
||||||
|
force_ocr: false
|
||||||
|
min_block_chars: 0
|
||||||
|
|
||||||
# Field extraction specifications
|
|
||||||
fields:
|
fields:
|
||||||
case_number:
|
case_number:
|
||||||
near: ["Case No.", "Civil Action No.", "Docket No.", "Cause No."]
|
type: string
|
||||||
regex: '[\w-]+:?\s*\d+[\w-]*'
|
extraction:
|
||||||
parse: string
|
near: ["Case No.", "Civil Action No.", "Docket No.", "Cause No."]
|
||||||
|
regex: "[\\w-]+:?\\s*\\d+[\\w-]*"
|
||||||
|
parse: string
|
||||||
|
|
||||||
court:
|
court:
|
||||||
region: top_quarter
|
type: string
|
||||||
pick: largest_font
|
extraction:
|
||||||
|
region: top_quarter
|
||||||
|
pick: largest_font
|
||||||
|
parse: string
|
||||||
|
|
||||||
parties:
|
parties:
|
||||||
near: ["Plaintiff", "Defendant", "Petitioner", "Respondent", "v."]
|
type: array
|
||||||
|
extraction:
|
||||||
|
near: ["Plaintiff", "Defendant", "Petitioner", "Respondent", "v."]
|
||||||
|
fallback: []
|
||||||
|
|
||||||
filing_date:
|
filing_date:
|
||||||
near: ["Filed", "Date Filed", "Dated"]
|
type: date
|
||||||
parse: date
|
extraction:
|
||||||
|
near: ["Filed", "Date Filed", "Dated"]
|
||||||
|
parse: date
|
||||||
|
|
||||||
docket_entries:
|
docket_entries:
|
||||||
region: full
|
type: array
|
||||||
|
extraction:
|
||||||
|
region: bottom_half
|
||||||
|
fallback: []
|
||||||
|
|
|
||||||
|
|
@ -1,52 +1,67 @@
|
||||||
|
# Receipt extraction profile
|
||||||
|
# Matches point-of-sale or purchase receipts with items and payment method
|
||||||
|
name: receipt
|
||||||
description: Point-of-sale or purchase receipt with items, payment method
|
description: Point-of-sale or purchase receipt with items, payment method
|
||||||
priority: 45
|
priority: 45
|
||||||
|
|
||||||
match:
|
match:
|
||||||
any:
|
all:
|
||||||
- text_patterns:
|
- any:
|
||||||
- "(?i)receipt"
|
- text_contains:
|
||||||
- "(?i)store receipt"
|
patterns: ["receipt", "store receipt", "register receipt", "transaction receipt"]
|
||||||
- "(?i)register receipt"
|
- text_contains:
|
||||||
- "(?i)transaction receipt"
|
patterns: ["total sold", "change due", "cash credit", "card payment"]
|
||||||
- text_patterns:
|
|
||||||
- "(?i)total.*sold"
|
|
||||||
- "(?i)change.*due"
|
|
||||||
- "(?i)cash.*credit"
|
|
||||||
- "(?i)card.*payment"
|
|
||||||
- structural:
|
- structural:
|
||||||
- has_monetary_columnar_layout: true
|
has_table: true
|
||||||
- page_aspect_ratio: "narrow_or_square"
|
has_form_field: false
|
||||||
page_count_hint: 1
|
has_math: false
|
||||||
profile_fields:
|
page_count:
|
||||||
|
min: 1
|
||||||
|
max: 2
|
||||||
|
|
||||||
|
extraction:
|
||||||
|
reading_order: line_dominant
|
||||||
|
table_detection: default
|
||||||
|
readability_threshold: 0.5
|
||||||
|
include_invisible: false
|
||||||
|
include_headers_footers: false
|
||||||
|
force_ocr: false
|
||||||
|
min_block_chars: 0
|
||||||
|
|
||||||
|
fields:
|
||||||
merchant:
|
merchant:
|
||||||
type: string
|
type: string
|
||||||
extraction:
|
extraction:
|
||||||
patterns:
|
region: top_quarter
|
||||||
- "(?i)^([A-Z][A-Za-z0-9\\s&']+)$"
|
pick: largest_font
|
||||||
- "(?i)(?:store|merchant|retailer)\\s*:?\\s*([A-Z][A-Za-z0-9\\s&']+)"
|
parse: string
|
||||||
fallback: null
|
|
||||||
date:
|
date:
|
||||||
type: date
|
type: date
|
||||||
extraction:
|
extraction:
|
||||||
patterns:
|
regex: "\\d{1,2}[/-]\\d{1,2}[/-]\\d{2,4}"
|
||||||
- "(?i)date\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
|
parse: date
|
||||||
- "([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})\\s+([0-9]{1,2}:[0-9]{2})"
|
|
||||||
fallback: null
|
|
||||||
total:
|
total:
|
||||||
type: decimal
|
type: decimal
|
||||||
extraction:
|
extraction:
|
||||||
patterns:
|
regex: "([\\d,]+\\.\\d{2})"
|
||||||
- "(?i)total\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
|
near: ["Total", "Amount Due", "Balance"]
|
||||||
fallback: null
|
max_distance_pt: 80
|
||||||
|
parse: decimal
|
||||||
|
|
||||||
tax:
|
tax:
|
||||||
type: decimal
|
type: decimal
|
||||||
extraction:
|
extraction:
|
||||||
patterns:
|
regex: "([\\d,]+\\.\\d{2})"
|
||||||
- "(?i)tax\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
|
near: ["Tax", "VAT"]
|
||||||
fallback: null
|
max_distance_pt: 80
|
||||||
|
parse: decimal
|
||||||
|
|
||||||
items:
|
items:
|
||||||
type: array
|
type: array
|
||||||
extraction:
|
extraction:
|
||||||
columnar_regions: "monetary_columns"
|
table_region: largest_table
|
||||||
schema:
|
schema:
|
||||||
- name: name
|
- name: name
|
||||||
type: string
|
type: string
|
||||||
|
|
@ -58,11 +73,9 @@ profile_fields:
|
||||||
type: decimal
|
type: decimal
|
||||||
required: false
|
required: false
|
||||||
fallback: []
|
fallback: []
|
||||||
|
|
||||||
payment_method:
|
payment_method:
|
||||||
type: string
|
type: string
|
||||||
extraction:
|
extraction:
|
||||||
patterns:
|
regex: "(cash|credit|debit|visa|mastercard|amex|discover|check|cheque)"
|
||||||
- "(?i)(cash|credit|debit|visa|mastercard|amex|discover|check|cheque)"
|
parse: string
|
||||||
fallback: null
|
|
||||||
reading_order: line_dominant
|
|
||||||
zone_filtering: exclude_headers_footers
|
|
||||||
|
|
|
||||||
|
|
@ -1,66 +1,87 @@
|
||||||
# Scientific Paper Profile
|
# Scientific Paper extraction profile
|
||||||
#
|
# Matches academic papers from arXiv, journals, conference proceedings
|
||||||
# Academic papers from arXiv, journals, conference proceedings.
|
|
||||||
# Extracts title, authors, abstract, DOI, journal, publication_date, references.
|
|
||||||
|
|
||||||
name: scientific_paper
|
name: scientific_paper
|
||||||
description: Academic papers from arXiv, journals, conference proceedings
|
description: Academic papers from arXiv, journals, conference proceedings
|
||||||
priority: 30
|
priority: 30
|
||||||
|
|
||||||
# Matching predicates for scientific paper classification
|
|
||||||
match:
|
match:
|
||||||
all:
|
all:
|
||||||
# Must have at least one scientific paper marker
|
|
||||||
- any:
|
- any:
|
||||||
- text_contains: ["Abstract", "References", "doi:", "arXiv:", "Bibliography"]
|
- text_contains:
|
||||||
- heading_matches: '^(Abstract|Introduction|References|Bibliography)'
|
patterns: ["Abstract", "References", "doi:", "arXiv:", "Bibliography"]
|
||||||
# And either has math OR structured headings OR appropriate page count
|
- heading_matches:
|
||||||
|
pattern: "^(Abstract|Introduction|References|Bibliography)"
|
||||||
- any:
|
- any:
|
||||||
- structural:
|
- structural:
|
||||||
|
has_table: false
|
||||||
|
has_form_field: false
|
||||||
has_math: true
|
has_math: true
|
||||||
|
page_count: null
|
||||||
- structural:
|
- structural:
|
||||||
heading_depth: {min: 2}
|
has_table: false
|
||||||
- structural:
|
has_form_field: false
|
||||||
page_count: {min: 4, max: 50}
|
has_math: false
|
||||||
|
page_count:
|
||||||
|
min: 4
|
||||||
|
max: 50
|
||||||
|
none:
|
||||||
|
- text_contains:
|
||||||
|
patterns: ["Invoice", "Receipt", "WHEREAS", "NOW THEREFORE"]
|
||||||
|
|
||||||
# Extraction tuning for scientific papers
|
|
||||||
extraction:
|
extraction:
|
||||||
# Use xy_cut reading order for 2-column layout handling
|
|
||||||
reading_order: xy_cut
|
reading_order: xy_cut
|
||||||
# Default table detection
|
|
||||||
table_detection: default
|
table_detection: default
|
||||||
# Standard readability threshold
|
|
||||||
readability_threshold: 0.5
|
readability_threshold: 0.5
|
||||||
# Don't include invisible text
|
|
||||||
include_invisible: false
|
include_invisible: false
|
||||||
|
include_headers_footers: false
|
||||||
|
force_ocr: false
|
||||||
|
min_block_chars: 0
|
||||||
|
|
||||||
# Field extraction specifications
|
|
||||||
fields:
|
fields:
|
||||||
title:
|
title:
|
||||||
region: top_quarter
|
type: string
|
||||||
pick: largest_font
|
extraction:
|
||||||
|
region: top_quarter
|
||||||
|
pick: largest_font
|
||||||
|
parse: string
|
||||||
|
|
||||||
authors:
|
authors:
|
||||||
region: top_quarter
|
type: array
|
||||||
pick: nearest_below
|
extraction:
|
||||||
after: title
|
region: top_quarter
|
||||||
|
pick: nearest_below
|
||||||
|
after_heading: title
|
||||||
|
fallback: []
|
||||||
|
|
||||||
abstract:
|
abstract:
|
||||||
near: ["Abstract"]
|
type: string
|
||||||
region: top_half
|
extraction:
|
||||||
|
near: ["Abstract"]
|
||||||
|
region: top_half
|
||||||
|
parse: string
|
||||||
|
|
||||||
doi:
|
doi:
|
||||||
regex: 'doi[:\.]\s*(10\.\d{4,9}/[\w\-\._;()/:]+)'
|
type: string
|
||||||
parse: string
|
extraction:
|
||||||
|
regex: "doi[:\\.]\\s*(10\\.\\d{4,9}/[\\w\\-\\._;()/:]+)"
|
||||||
|
parse: string
|
||||||
|
|
||||||
journal:
|
journal:
|
||||||
region: top_eighth
|
type: string
|
||||||
pick: first
|
extraction:
|
||||||
|
region: top_eighth
|
||||||
|
pick: first
|
||||||
|
parse: string
|
||||||
|
|
||||||
publication_date:
|
publication_date:
|
||||||
near: ["Published", "Received", "Accepted"]
|
type: date
|
||||||
parse: date
|
extraction:
|
||||||
|
near: ["Published", "Received", "Accepted"]
|
||||||
|
parse: date
|
||||||
|
|
||||||
references:
|
references:
|
||||||
region: bottom_half
|
type: array
|
||||||
after_heading: References
|
extraction:
|
||||||
|
region: bottom_half
|
||||||
|
after_heading: References
|
||||||
|
fallback: []
|
||||||
|
|
|
||||||
|
|
@ -1,64 +1,59 @@
|
||||||
# Slide Deck Profile
|
# Slide Deck extraction profile
|
||||||
#
|
# Matches PowerPoint / Keynote / Google Slides exports as PDF
|
||||||
# PowerPoint / Keynote / Google Slides exports as PDF.
|
|
||||||
# Extracts title, presenter, date, slide_titles.
|
|
||||||
|
|
||||||
name: slide_deck
|
name: slide_deck
|
||||||
description: PowerPoint / Keynote / Google Slides exports as PDF
|
description: PowerPoint / Keynote / Google Slides exports as PDF
|
||||||
priority: 15
|
priority: 15
|
||||||
|
|
||||||
# Matching predicates for slide deck classification
|
|
||||||
match:
|
match:
|
||||||
all:
|
all:
|
||||||
# Page count in typical slide deck range
|
|
||||||
- structural:
|
- structural:
|
||||||
page_count: {min: 3, max: 200}
|
has_table: false
|
||||||
# And EITHER: has limited font diversity (not a dense academic paper)
|
has_form_field: false
|
||||||
# OR: contains "Slide N" patterns
|
has_math: false
|
||||||
# OR: contains slide deck keywords
|
page_count:
|
||||||
|
min: 3
|
||||||
|
max: 200
|
||||||
- any:
|
- any:
|
||||||
- structural:
|
- text_matches:
|
||||||
has_form_field: false
|
pattern: "^Slide \\d+$"
|
||||||
font_diversity: {min: 2, max: 10}
|
- text_contains:
|
||||||
- text_matches: '^Slide \d+$'
|
patterns: ["slides", "presentation"]
|
||||||
- text_contains: ["slides", "presentation"]
|
|
||||||
none:
|
none:
|
||||||
# Exclude academic papers (these have their own profile)
|
- text_contains:
|
||||||
- text_contains: ["Abstract", "References", "WHEREAS", "Invoice"]
|
patterns: ["Abstract", "References", "WHEREAS", "Invoice"]
|
||||||
|
|
||||||
# Extraction tuning for slide decks
|
|
||||||
extraction:
|
extraction:
|
||||||
# Use xy_cut reading order for proper layout handling
|
|
||||||
reading_order: xy_cut
|
reading_order: xy_cut
|
||||||
# Default table detection
|
|
||||||
table_detection: default
|
table_detection: default
|
||||||
# Lower readability threshold for slides (less text density)
|
|
||||||
readability_threshold: 0.6
|
readability_threshold: 0.6
|
||||||
# Don't include invisible text
|
|
||||||
include_invisible: false
|
include_invisible: false
|
||||||
# Minimum block characters
|
include_headers_footers: false
|
||||||
|
force_ocr: false
|
||||||
min_block_chars: 5
|
min_block_chars: 5
|
||||||
|
|
||||||
# Field extraction specifications
|
|
||||||
fields:
|
fields:
|
||||||
title:
|
title:
|
||||||
type: string
|
type: string
|
||||||
region: middle_half
|
extraction:
|
||||||
pick: largest_font
|
region: top_half
|
||||||
page: first
|
pick: largest_font
|
||||||
|
parse: string
|
||||||
|
|
||||||
presenter:
|
presenter:
|
||||||
type: string
|
type: string
|
||||||
region: bottom_half
|
extraction:
|
||||||
pick: largest_font
|
region: top_half
|
||||||
page: first
|
pick: largest_font
|
||||||
|
parse: string
|
||||||
|
|
||||||
date:
|
date:
|
||||||
type: date
|
type: date
|
||||||
near: ["Date"]
|
extraction:
|
||||||
parse: date
|
near: ["Date"]
|
||||||
|
parse: date
|
||||||
|
|
||||||
slide_titles:
|
slide_titles:
|
||||||
type: array
|
type: array
|
||||||
pick: largest_font
|
extraction:
|
||||||
per_page: true
|
pick: largest_font
|
||||||
|
fallback: []
|
||||||
|
|
|
||||||
1
tests/fixtures/profiles/invoice/01.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/01.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/01.pdf
|
||||||
1
tests/fixtures/profiles/invoice/02.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/02.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/02.pdf
|
||||||
1
tests/fixtures/profiles/invoice/03.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/03.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/03.pdf
|
||||||
1
tests/fixtures/profiles/invoice/04.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/04.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/04.pdf
|
||||||
1
tests/fixtures/profiles/invoice/05.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/05.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/05.pdf
|
||||||
1
tests/fixtures/profiles/invoice/06.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/06.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/06.pdf
|
||||||
1
tests/fixtures/profiles/invoice/07.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/07.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/07.pdf
|
||||||
1
tests/fixtures/profiles/invoice/08.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/08.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/08.pdf
|
||||||
1
tests/fixtures/profiles/invoice/09.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/09.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/09.pdf
|
||||||
1
tests/fixtures/profiles/invoice/10.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/10.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/10.pdf
|
||||||
1
tests/fixtures/profiles/invoice/11.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/11.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/11.pdf
|
||||||
1
tests/fixtures/profiles/invoice/12.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/12.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/12.pdf
|
||||||
1
tests/fixtures/profiles/invoice/13.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/13.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/13.pdf
|
||||||
1
tests/fixtures/profiles/invoice/14.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/14.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/14.pdf
|
||||||
1
tests/fixtures/profiles/invoice/15.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/15.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/15.pdf
|
||||||
1
tests/fixtures/profiles/invoice/16.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/16.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/16.pdf
|
||||||
1
tests/fixtures/profiles/invoice/17.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/17.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/17.pdf
|
||||||
1
tests/fixtures/profiles/invoice/18.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/18.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/18.pdf
|
||||||
1
tests/fixtures/profiles/invoice/19.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/19.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/19.pdf
|
||||||
1
tests/fixtures/profiles/invoice/20.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/20.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/20.pdf
|
||||||
1
tests/fixtures/profiles/invoice/21.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/21.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/21.pdf
|
||||||
1
tests/fixtures/profiles/invoice/22.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/22.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/22.pdf
|
||||||
1
tests/fixtures/profiles/invoice/23.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/23.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/23.pdf
|
||||||
1
tests/fixtures/profiles/invoice/24.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/24.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/24.pdf
|
||||||
1
tests/fixtures/profiles/invoice/25.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/25.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/25.pdf
|
||||||
1
tests/fixtures/profiles/invoice/26.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/26.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/26.pdf
|
||||||
1
tests/fixtures/profiles/invoice/27.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/27.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/27.pdf
|
||||||
1
tests/fixtures/profiles/invoice/28.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/28.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/28.pdf
|
||||||
1
tests/fixtures/profiles/invoice/29.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/29.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/29.pdf
|
||||||
1
tests/fixtures/profiles/invoice/30.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/30.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/30.pdf
|
||||||
1
tests/fixtures/profiles/invoice/31.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/31.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/31.pdf
|
||||||
1
tests/fixtures/profiles/invoice/32.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/32.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/32.pdf
|
||||||
1
tests/fixtures/profiles/invoice/33.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/33.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/33.pdf
|
||||||
1
tests/fixtures/profiles/invoice/34.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/34.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/34.pdf
|
||||||
1
tests/fixtures/profiles/invoice/35.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/35.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/35.pdf
|
||||||
1
tests/fixtures/profiles/invoice/36.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/36.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/36.pdf
|
||||||
1
tests/fixtures/profiles/invoice/37.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/37.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/37.pdf
|
||||||
1
tests/fixtures/profiles/invoice/38.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/38.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/38.pdf
|
||||||
1
tests/fixtures/profiles/invoice/39.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/39.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/39.pdf
|
||||||
1
tests/fixtures/profiles/invoice/40.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/40.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/40.pdf
|
||||||
1
tests/fixtures/profiles/invoice/41.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/41.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/41.pdf
|
||||||
1
tests/fixtures/profiles/invoice/42.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/42.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/42.pdf
|
||||||
1
tests/fixtures/profiles/invoice/43.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/43.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/43.pdf
|
||||||
1
tests/fixtures/profiles/invoice/44.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/44.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/44.pdf
|
||||||
1
tests/fixtures/profiles/invoice/45.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/45.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/45.pdf
|
||||||
1
tests/fixtures/profiles/invoice/46.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/46.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/46.pdf
|
||||||
1
tests/fixtures/profiles/invoice/47.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/47.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/47.pdf
|
||||||
1
tests/fixtures/profiles/invoice/48.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/48.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/48.pdf
|
||||||
1
tests/fixtures/profiles/invoice/49.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/49.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/49.pdf
|
||||||
1
tests/fixtures/profiles/invoice/50.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/invoice/50.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../classifier/invoice/50.pdf
|
||||||
1
tests/fixtures/profiles/receipt/tampered-receipt.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/receipt/tampered-receipt.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../../sdk-conformance/fixtures/receipts/tampered-receipt.pdf
|
||||||
1
tests/fixtures/profiles/receipt/valid-receipt.pdf
vendored
Symbolic link
1
tests/fixtures/profiles/receipt/valid-receipt.pdf
vendored
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../../sdk-conformance/fixtures/receipts/valid-receipt.pdf
|
||||||
Loading…
Add table
Reference in a new issue