feat(profiles): add profile infrastructure and initial fixtures

- Add profile source modules: apply_profile, extraction, extraction_loader, field_extractor, match_eval
- Add profiles CLI subcommand (profiles_cmd.rs)
- Update all 9 built-in profile YAMLs (invoice, receipt, contract, scientific_paper, slide_deck, form, bank_statement, legal_filing, book_chapter)
- Add 50 invoice fixture PDFs
- Add 2 receipt fixture PDFs

Part of: pdftract-3a310 (Phase 7.10 coordinator)
This commit is contained in:
jedarden 2026-05-31 15:10:51 -04:00
parent deeafed7a9
commit 80dbf0f703
74 changed files with 2940 additions and 331 deletions

View file

@ -1 +1 @@
9cf1ccffa9b1213b83079e66d9a245aadc6d584f
deeafed7a94a1e91609a11976ef16ee03a1f5fac

1
Cargo.lock generated
View file

@ -3267,6 +3267,7 @@ dependencies = [
"criterion",
"dashmap",
"digest",
"dirs",
"encoding_rs",
"filetime",
"flate2",

View file

@ -72,6 +72,7 @@ clap = { version = "4.5", features = ["derive"] }
crossbeam-channel = "0.5"
dirs = "5.0"
hyper = { version = "1.0", features = ["full"] }
notify = { version = "6", optional = true }
hyper-util = { version = "0.1", features = ["full"] }
image = "0.24"
http-body-util = "0.1"
@ -117,7 +118,7 @@ full-render = ["dep:libloading", "pdftract-core/full-render"]
# Remote HTTP source support
remote = ["dep:ureq"]
# Document profiles
profiles = ["dep:serde_yaml", "pdftract-core/profiles"]
profiles = ["dep:serde_yaml", "pdftract-core/profiles", "dep:notify"]
# HTTP serve mode
serve = []
# MCP server mode

View file

@ -19,6 +19,7 @@ mod output;
mod pages;
mod panic_hook;
mod password;
mod profiles_cmd;
mod serve;
mod url;
mod verify_receipt;
@ -160,6 +161,10 @@ enum Commands {
#[arg(long)]
auto: bool,
/// Force-apply a specific profile (by name or YAML file path)
#[arg(long, value_name = "NAME|PATH")]
profile: Option<String>,
/// Include header blocks in output
#[arg(long)]
include_headers: bool,
@ -238,6 +243,11 @@ enum Commands {
#[command(subcommand)]
cache_command: CacheCommands,
},
/// Manage document type profiles
Profiles {
#[command(subcommand)]
profiles_command: ProfilesCommands,
},
/// Start the HTTP server for extraction
///
/// ## Security Model
@ -311,6 +321,14 @@ enum Commands {
/// Trust X-Forwarded-For header for client IP detection (DANGER: enables IP spoofing if not behind a trusted proxy)
#[arg(long)]
trust_forwarded_for: bool,
/// Directory containing custom profile YAML files (repeatable)
#[arg(long, value_name = "DIR")]
profile_dir: Option<PathBuf>,
/// Enable hot-reload for profiles (re-read directory on every request)
#[arg(long)]
profile_hot_reload: bool,
},
/// Start the MCP (Model Context Protocol) server
///
@ -452,6 +470,32 @@ enum CacheCommands {
},
}
// CLI-facing mirror of `profiles_cmd::ProfilesCommand`: `cmd_profiles` converts each
// variant one-to-one before dispatching. The `///` comments on the variants and
// fields are picked up by the clap derive as help text, so treat them as
// user-facing strings rather than internal documentation.
#[derive(Subcommand)]
enum ProfilesCommands {
/// List all available profiles
List,
/// Show a profile's YAML content
Show {
/// Profile name or path to YAML file
name_or_path: String,
},
/// Export a built-in profile to stdout
Export {
/// Name of the built-in profile to export
name: String,
},
/// Install a profile to the user config directory
Install {
/// Path to the profile YAML file to install
path: PathBuf,
},
/// Validate a profile file
Validate {
/// Path to the profile YAML file to validate
path: PathBuf,
},
}
fn main() -> Result<()> {
// Install panic hook for SecretString redaction in backtraces
// This ensures credentials never leak in crash dumps
@ -504,6 +548,7 @@ fn main() -> Result<()> {
no_cache,
md_anchors,
auto,
profile,
output,
include_headers,
include_footers,
@ -532,6 +577,7 @@ fn main() -> Result<()> {
no_cache,
md_anchors,
auto,
profile,
include_headers,
include_footers,
include_headers_footers,
@ -602,6 +648,12 @@ fn main() -> Result<()> {
std::process::exit(1);
}
}
Commands::Profiles { profiles_command } => {
if let Err(e) = cmd_profiles(profiles_command) {
eprintln!("Error: {}", e);
std::process::exit(1);
}
}
Commands::Serve {
bind,
cache_dir,
@ -611,6 +663,8 @@ fn main() -> Result<()> {
max_decompress_gb,
audit_log,
trust_forwarded_for,
profile_dir,
profile_hot_reload,
} => {
if let Err(e) = cmd_serve(
bind,
@ -621,6 +675,8 @@ fn main() -> Result<()> {
max_decompress_gb,
audit_log,
trust_forwarded_for,
profile_dir,
profile_hot_reload,
) {
eprintln!("Error: {}", e);
std::process::exit(1);
@ -775,6 +831,7 @@ fn cmd_extract(
no_cache: bool,
md_anchors: bool,
auto: bool,
profile: Option<String>,
include_headers: bool,
include_footers: bool,
include_headers_footers: bool,
@ -921,11 +978,12 @@ fn cmd_extract(
eprintln!("Auto-detecting document type...");
use pdftract_core::profiles::{
classify, extract_signals_from_results, load_builtins, ProfileType,
classify_and_select_profile, extract_signals_from_results, load_extraction_profiles,
apply_extraction_tuning, apply_profile_to_metadata,
};
// Load built-in profiles
let profiles = load_builtins();
// Load all extraction profiles
let profiles = load_extraction_profiles(&[]).unwrap_or_default();
if !profiles.is_empty() {
// Perform a lightweight extraction for classification
@ -940,43 +998,33 @@ fn cmd_extract(
.map(|p| (p.blocks.clone(), p.spans.clone()))
.collect();
let signals =
extract_signals_from_results(&page_data, has_signature_field, has_form_field);
let classification = classify(&signals, &profiles);
let selected_profile = classify_and_select_profile(
&profiles.iter().map(|p| p.profile.clone()).collect::<Vec<_>>(),
&page_data,
has_signature_field,
has_form_field,
);
match classification.document_type {
ProfileType::Unknown => {
eprintln!(
"Document type: unknown (confidence: {:.2})",
classification.confidence
);
eprintln!("Proceeding with default extraction options.");
}
detected_type => {
let type_name = match detected_type {
ProfileType::Invoice => "invoice",
ProfileType::Receipt => "receipt",
ProfileType::Contract => "contract",
ProfileType::ScientificPaper => "scientific_paper",
ProfileType::SlideDeck => "slide_deck",
ProfileType::Form => "form",
ProfileType::BankStatement => "bank_statement",
ProfileType::LegalFiling => "legal_filing",
ProfileType::BookChapter => "book_chapter",
ProfileType::Unknown => "unknown",
};
eprintln!(
"Document type: {} (confidence: {:.2})",
type_name, classification.confidence
);
if let Some((profile, match_result)) = selected_profile {
eprintln!(
"Document type: {} (confidence: {:.2})",
profile.name, match_result.confidence
);
// Apply profile-specific extraction options
// For now, just log the detection - profile option overrides
// will be implemented in Phase 7.10
for reason in classification.reasons.iter().take(5) {
eprintln!(" - {}", reason);
}
// Apply profile extraction tuning
if let Some(ref tuning) = profile.extraction {
apply_extraction_tuning(tuning, &mut options);
}
// Store the selected profile for later field extraction
// We'll extract fields after the main extraction
// For now, just log the match reasons
for reason in match_result.reasons.iter().take(5) {
eprintln!(" - {}", reason);
}
} else {
eprintln!("Document type: unknown (confidence: below threshold)");
eprintln!("Proceeding with default extraction options.");
}
} else {
eprintln!(
@ -990,6 +1038,46 @@ fn cmd_extract(
}
}
// Handle --profile flag: load and apply specific profile
#[cfg(feature = "profiles")]
if let Some(ref profile_name_or_path) = profile {
use pdftract_core::profiles::{
load_extraction_profiles, apply_extraction_tuning,
};
eprintln!("Applying profile: {}", profile_name_or_path);
let profiles = load_extraction_profiles(&[]).unwrap_or_default();
// Find the profile by name or load from path
let profile = if std::path::PathBuf::from(profile_name_or_path).exists() {
// Load from file path
use pdftract_core::profiles::load_profile_file;
match load_profile_file(&std::path::PathBuf::from(profile_name_or_path)) {
Ok(p) => Some(p),
Err(e) => {
eprintln!("Error loading profile: {}", e);
std::process::exit(1);
}
}
} else {
// Find by name
profiles.iter()
.find(|p| p.profile.name == *profile_name_or_path)
.map(|p| p.profile.clone())
};
if let Some(p) = profile {
eprintln!("Loaded profile: {}", p.name);
if let Some(ref tuning) = p.extraction {
apply_extraction_tuning(tuning, &mut options);
}
} else {
eprintln!("Error: Profile '{}' not found", profile_name_or_path);
std::process::exit(1);
}
}
#[cfg(not(feature = "profiles"))]
if auto {
eprintln!("Warning: --auto flag requires the 'profiles' feature to be enabled.");
@ -997,6 +1085,13 @@ fn cmd_extract(
eprintln!("Proceeding with default extraction options.");
}
#[cfg(not(feature = "profiles"))]
if profile.is_some() {
eprintln!("Warning: --profile flag requires the 'profiles' feature to be enabled.");
eprintln!("Build pdftract with: --features profiles");
eprintln!("Proceeding with default extraction options.");
}
// Set markdown anchors option
options.markdown_anchors = md_anchors;
if md_anchors {
@ -1096,6 +1191,58 @@ fn cmd_extract(
result.metadata.cache_status = Some(cache_status);
result.metadata.cache_age_seconds = cache_age;
// Extract profile fields if --auto or --profile was used
#[cfg(feature = "profiles")]
{
use pdftract_core::profiles::{
load_extraction_profiles, apply_profile_to_metadata,
};
let profile_to_apply = if auto {
// Re-run classification to get the selected profile
let profiles = load_extraction_profiles(&[]).unwrap_or_default();
let page_data: Vec<(Vec<_>, Vec<_>)> = result
.pages
.iter()
.map(|p| (p.blocks.clone(), p.spans.clone()))
.collect();
let has_signature_field = !result.signatures.is_empty();
let has_form_field = !result.form_fields.is_empty();
use pdftract_core::profiles::classify_and_select_profile;
classify_and_select_profile(
&profiles.iter().map(|p| p.profile.clone()).collect::<Vec<_>>(),
&page_data,
has_signature_field,
has_form_field,
).map(|(p, _)| p)
} else if profile.is_some() {
// Load the specified profile
let profile_name_or_path = profile.as_ref().unwrap();
let profiles = load_extraction_profiles(&[]).unwrap_or_default();
if std::path::PathBuf::from(profile_name_or_path).exists() {
use pdftract_core::profiles::load_profile_file;
load_profile_file(&std::path::PathBuf::from(profile_name_or_path)).ok()
} else {
profiles.iter()
.find(|p| p.profile.name == *profile_name_or_path)
.map(|p| p.profile.clone())
}
} else {
None
};
// Apply profile to metadata
if let Some(p) = profile_to_apply {
let (name, version, fields) = apply_profile_to_metadata(&p, &result.pages);
// Update the result's metadata with profile information
result.metadata.profile_name = Some(name);
result.metadata.profile_version = Some(version);
result.metadata.profile_fields = fields;
}
}
// Write each output to its destination
for spec in &output_specs {
match spec.dest {
@ -1803,6 +1950,25 @@ fn cmd_cache(command: CacheCommands) -> Result<()> {
Ok(())
}
/// Translate the clap-level `profiles` subcommand into the `profiles_cmd`
/// module's own command type and run it.
fn cmd_profiles(command: ProfilesCommands) -> Result<()> {
    use profiles_cmd::{run_profiles, ProfilesArgs, ProfilesCommand as Inner};

    // The two enums have identical variants; this is a pure re-tagging.
    let command = match command {
        ProfilesCommands::List => Inner::List,
        ProfilesCommands::Show { name_or_path } => Inner::Show { name_or_path },
        ProfilesCommands::Export { name } => Inner::Export { name },
        ProfilesCommands::Install { path } => Inner::Install { path },
        ProfilesCommands::Validate { path } => Inner::Validate { path },
    };

    run_profiles(ProfilesArgs { command })
}
fn cmd_serve(
bind: String,
cache_dir: Option<PathBuf>,

View file

@ -0,0 +1,300 @@
//! Profile management CLI subcommand.
//!
//! This module implements the `pdftract profiles` command family for managing
//! document type profiles (list, show, export, install, validate).
use anyhow::{Context, Result};
use std::fs;
use std::path::PathBuf;
/// Arguments for the profiles subcommand.
///
/// Thin wrapper consumed by [`run_profiles`]; it carries nothing except the
/// subcommand to execute.
pub struct ProfilesArgs {
/// Subcommand to run
pub command: ProfilesCommand,
}
/// Profiles subcommands.
///
/// Each variant is executed by [`run_profiles`], which forwards it to the
/// matching private `run_*` function in this module.
#[derive(Debug, Clone)]
pub enum ProfilesCommand {
/// List all available profiles
List,
/// Show a profile's YAML content
// `name_or_path` is handed unchanged to the loader's `find_profile`.
Show { name_or_path: String },
/// Export a built-in profile to stdout
// Only profiles whose origin is `BuiltIn` are considered for export.
Export { name: String },
/// Install a profile to the user config directory
// The file at `path` is copied to `<profile dir>/<profile name>.yaml`.
Install { path: PathBuf },
/// Validate a profile file
Validate { path: PathBuf },
}
/// Run the profiles subcommand.
pub fn run_profiles(args: ProfilesArgs) -> Result<()> {
match args.command {
ProfilesCommand::List => run_list(),
ProfilesCommand::Show { name_or_path } => run_show(&name_or_path),
ProfilesCommand::Export { name } => run_export(&name),
ProfilesCommand::Install { path } => run_install(&path),
ProfilesCommand::Validate { path } => run_validate(&path),
}
}
/// Print every loadable profile to stdout, grouped by origin.
///
/// Profiles are loaded with no extra search directories. Built-in profiles are
/// printed first, then user profiles (system profiles are listed in the same
/// group), then custom ones. Without the `profiles` feature only a build hint
/// is printed.
fn run_list() -> Result<()> {
    #[cfg(feature = "profiles")]
    {
        use pdftract_core::profiles::extraction_loader::{self, ProfileOrigin};

        let loaded = extraction_loader::load_extraction_profiles(&[])?;

        if loaded.is_empty() {
            println!("No profiles available.");
            println!();
            println!("Built-in profiles may not be enabled. Build pdftract with:");
            println!(" cargo build --features profiles");
            return Ok(());
        }

        println!("Available profiles ({} total):", loaded.len());
        println!();

        // Bucket the entries by where they were loaded from.
        let mut builtin = Vec::new();
        let mut user = Vec::new();
        let mut custom = Vec::new();
        for entry in &loaded {
            let bucket = match entry.source {
                ProfileOrigin::BuiltIn => &mut builtin,
                // System profiles are shown together with user profiles.
                ProfileOrigin::User | ProfileOrigin::System => &mut user,
                ProfileOrigin::Custom(_) => &mut custom,
            };
            bucket.push(entry);
        }

        // (heading, entries, whether the "overrides built-in" marker is shown)
        let sections = [
            ("Built-in profiles:", builtin, true),
            ("User profiles:", user, true),
            ("Custom profiles:", custom, false),
        ];

        for (heading, entries, show_override_marker) in sections {
            if entries.is_empty() {
                continue;
            }

            println!("{}", heading);
            for entry in entries {
                let profile = &entry.profile;
                let marker = if show_override_marker && entry.overrides_builtin {
                    " (overrides built-in)"
                } else {
                    ""
                };
                println!(" {} - Priority: {}{}", profile.name, profile.priority, marker);
                println!(" {}", profile.description);
            }
            println!();
        }
    }

    #[cfg(not(feature = "profiles"))]
    {
        println!("Profiles are not enabled.");
        println!();
        println!("Build pdftract with the profiles feature:");
        println!(" cargo build --features profiles");
    }

    Ok(())
}
/// Resolve a profile by name or path and print it to stdout as YAML.
///
/// Resolution is delegated to the loader's `find_profile`, searching the
/// profiles loaded with no extra directories. Fails when the `profiles`
/// feature is disabled.
fn run_show(name_or_path: &str) -> Result<()> {
    #[cfg(feature = "profiles")]
    {
        use pdftract_core::profiles::extraction_loader::{find_profile, load_extraction_profiles};

        let available = load_extraction_profiles(&[])?;
        let resolved = find_profile(name_or_path, &available)?;

        let rendered = serde_yaml::to_string(&resolved)
            .context("Failed to serialize profile to YAML")?;
        println!("{}", rendered);
    }

    #[cfg(not(feature = "profiles"))]
    {
        anyhow::bail!("Profiles feature is not enabled. Build with: --features profiles");
    }

    Ok(())
}
/// Export a built-in profile to stdout.
///
/// Looks up `name` among the loaded profiles, considering only entries whose
/// origin is `BuiltIn`, and prints the profile definition as YAML.
///
/// Only the profile itself is serialized, not the loader's bookkeeping wrapper
/// (origin, override flag), so the output has the same shape `profiles install`
/// parses.
///
/// # Errors
///
/// Fails if the profiles cannot be loaded, no built-in profile has that name,
/// serialization fails, or the `profiles` feature is disabled.
fn run_export(name: &str) -> Result<()> {
    #[cfg(feature = "profiles")]
    {
        use pdftract_core::profiles::extraction_loader;

        let profiles = extraction_loader::load_extraction_profiles(&[])?;

        let source = profiles
            .iter()
            .find(|s| {
                s.profile.name == name
                    && matches!(s.source, extraction_loader::ProfileOrigin::BuiltIn)
            })
            // Lazily build the message: it is only needed on the failure path.
            .with_context(|| format!("Built-in profile '{}' not found", name))?;

        // Serialize the profile definition, not the surrounding source record.
        let yaml = serde_yaml::to_string(&source.profile)
            .context("Failed to serialize profile to YAML")?;
        println!("{}", yaml);
    }

    #[cfg(not(feature = "profiles"))]
    {
        anyhow::bail!("Profiles feature is not enabled. Build with: --features profiles");
    }

    Ok(())
}
/// Install a profile to the user config directory.
///
/// The file at `path` is parsed to learn the profile's name, then copied
/// verbatim to `<profile dir>/<name>.yaml`, where the profile directory is the
/// one reported by the loader's `get_xdg_profile_dir`.
///
/// The input is read and parsed *before* anything is created on disk, so an
/// unreadable or malformed file leaves the config directory untouched.
///
/// # Errors
///
/// Fails if the file is missing, unreadable, or not a valid profile; if the
/// profile name is empty or contains a path separator (it is used as a file
/// name and must not escape the profile directory); if the profile directory
/// cannot be determined or created; if the copy fails; or if the `profiles`
/// feature is disabled.
fn run_install(path: &PathBuf) -> Result<()> {
    #[cfg(feature = "profiles")]
    {
        use pdftract_core::profiles::extraction_loader;

        if !path.exists() {
            anyhow::bail!("Profile file not found: {}", path.display());
        }

        // Read and parse first: the profile's own `name` decides the file name.
        let content = fs::read_to_string(path)
            .with_context(|| format!("Failed to read profile file: {}", path.display()))?;
        let profile: pdftract_core::profiles::ExtractionProfile =
            serde_yaml::from_str(&content).context("Failed to parse profile YAML")?;

        // The name comes from the (untrusted) YAML and becomes a path component.
        // Reject anything that could point outside the profile directory.
        let has_separator = profile.name.chars().any(|c| matches!(c, '/' | '\\'));
        if profile.name.is_empty() || has_separator {
            anyhow::bail!(
                "Invalid profile name '{}': it must be non-empty and must not contain path separators",
                profile.name
            );
        }

        let xdg_dir = extraction_loader::get_xdg_profile_dir()
            .context("Failed to determine XDG config directory")?;

        fs::create_dir_all(&xdg_dir).with_context(|| {
            format!("Failed to create profile directory: {}", xdg_dir.display())
        })?;

        let dest = xdg_dir.join(format!("{}.yaml", profile.name));

        fs::copy(path, &dest)
            .with_context(|| format!("Failed to copy profile to: {}", dest.display()))?;

        println!("Installed profile '{}' to: {}", profile.name, dest.display());
        println!();
        println!("You can now use this profile with:");
        println!(" pdftract extract --profile {}", profile.name);
    }

    #[cfg(not(feature = "profiles"))]
    {
        anyhow::bail!("Profiles feature is not enabled. Build with: --features profiles");
    }

    Ok(())
}
/// Check that the file at `path` is a valid profile and report the outcome.
///
/// Prints a confirmation on success; returns an error when the file is
/// missing, when the loader's validation rejects it, or when the `profiles`
/// feature is disabled.
fn run_validate(path: &PathBuf) -> Result<()> {
    #[cfg(feature = "profiles")]
    {
        use pdftract_core::profiles::extraction_loader::validate_profile_file;

        if !path.exists() {
            anyhow::bail!("Profile file not found: {}", path.display());
        }

        if let Err(e) = validate_profile_file(path) {
            anyhow::bail!("Profile validation failed: {}", e);
        }

        println!("Profile '{}' is valid.", path.display());
    }

    #[cfg(not(feature = "profiles"))]
    {
        anyhow::bail!("Profiles feature is not enabled. Build with: --features profiles");
    }

    Ok(())
}
// Unit tests for the command enum only; the `run_*` handlers are not exercised here.
#[cfg(test)]
mod tests {
use super::*;
// Smoke test: the List, Show and Export variants can be constructed and
// pattern-matched. Install and Validate are not covered.
#[test]
fn test_profiles_command_enum() {
let command = ProfilesCommand::List;
assert!(matches!(command, ProfilesCommand::List));
let show = ProfilesCommand::Show {
name_or_path: "invoice".to_string(),
};
assert!(matches!(show, ProfilesCommand::Show { .. }));
let export = ProfilesCommand::Export {
name: "invoice".to_string(),
};
assert!(matches!(export, ProfilesCommand::Export { .. }));
}
}

View file

@ -46,6 +46,7 @@ smallvec = "1.13"
encoding_rs = "0.8"
quick-xml = { version = "0.36", optional = true }
serde_yaml = { version = "0.9", optional = true }
dirs = "5.0"
chrono = "0.4"
aes = { version = "0.8", optional = true }
rc4 = { version = "0.1", optional = true }

View file

@ -304,6 +304,15 @@ pub struct ExtractionMetadata {
/// Diagnostics emitted during extraction (coverage warnings, etc.)
#[serde(skip_serializing_if = "Vec::is_empty")]
pub diagnostics: Vec<String>,
/// Profile name if a profile was applied (Phase 7.10)
#[serde(skip_serializing_if = "Option::is_none")]
pub profile_name: Option<String>,
/// Profile version if a profile was applied (Phase 7.10)
#[serde(skip_serializing_if = "Option::is_none")]
pub profile_version: Option<String>,
/// Extracted fields from profile if a profile was applied (Phase 7.10)
#[serde(skip_serializing_if = "Option::is_none")]
pub profile_fields: Option<serde_json::Value>,
}
/// Extract text and structure from a PDF file.
@ -931,6 +940,9 @@ pub fn extract_pdf(
error_count,
reading_order_algorithm: Some(final_reading_order_algorithm.as_str().to_string()),
diagnostics: all_diagnostics_with_js,
profile_name: None,
profile_version: None,
profile_fields: None,
},
signatures,
form_fields,
@ -1812,6 +1824,9 @@ pub fn extract_pdf_ndjson<W: std::io::Write>(
error_count: error_count as usize,
reading_order_algorithm: Some(final_reading_order_algorithm.as_str().to_string()),
diagnostics: all_diagnostics,
profile_name: None,
profile_version: None,
profile_fields: None,
})
}
@ -2117,6 +2132,9 @@ where
error_count,
reading_order_algorithm: Some(final_reading_order_algorithm.as_str().to_string()),
diagnostics: all_diagnostics,
profile_name: None,
profile_version: None,
profile_fields: None,
})
}

View file

@ -0,0 +1,259 @@
//! Profile application for extraction tuning (Phase 7.10).
//!
//! Applies profile extraction tuning to ExtractionOptions and manages
//! the profile workflow: classification, option override, field extraction,
//! and metadata population.
use super::extraction::{ExtractionProfile, ExtractionTuning};
use super::field_extractor;
use super::match_eval::{evaluate_match, MatchResult};
use super::signals::extract_signals_from_results;
use crate::options::{ExtractionOptions, OutputOptions};
use crate::schema::{BlockJson, PageJson, SpanJson};
use anyhow::Result;
use serde_json::json;
/// Apply a profile's extraction tuning to extraction options.
///
/// The options are modified in place; nothing is returned.
///
/// # Arguments
///
/// * `tuning` - The extraction tuning from a profile
/// * `options` - The base extraction options to modify
///
/// # Behavior
///
/// * `include_invisible`: copied to `options.output.include_invisible` when set.
/// * `include_headers_footers`: `Some(true)` switches on both
///   `options.output.include_headers` and `options.output.include_footers`.
///   `Some(false)` and `None` leave the existing values untouched, so a profile
///   can enable headers/footers but never disables ones already requested.
/// * Every other tuning field is not yet wired into `ExtractionOptions`; when
///   one is set a warning is written to stderr and the value is ignored.
pub fn apply_extraction_tuning(tuning: &ExtractionTuning, options: &mut ExtractionOptions) {
    // Supported: output filtering.
    if let Some(include_invisible) = tuning.include_invisible {
        options.output.include_invisible = include_invisible;
    }
    if tuning.include_headers_footers == Some(true) {
        options.output.include_headers = true;
        options.output.include_footers = true;
    }

    // Unsupported (for now): warn for each field the profile actually sets, so
    // profile authors are not left guessing why a setting had no effect.
    let unsupported = [
        ("reading_order", tuning.reading_order.is_some()),
        ("table_detection", tuning.table_detection.is_some()),
        ("readability_threshold", tuning.readability_threshold.is_some()),
        ("zone_filtering", tuning.zone_filtering.is_some()),
        ("force_ocr", tuning.force_ocr.is_some()),
        ("min_block_chars", tuning.min_block_chars.is_some()),
    ];
    for (field, is_set) in unsupported {
        if is_set {
            eprintln!("Profile warning: {} tuning is not yet supported", field);
        }
    }
}
/// Classify a document and select the best matching profile.
///
/// # Arguments
///
/// * `profiles` - All available extraction profiles
/// * `page_data` - One `(blocks, spans)` pair per page, used for signal extraction
/// * `has_signature_field` - Whether document has signature fields
/// * `has_form_field` - Whether document has form fields
///
/// # Returns
///
/// A clone of the best matching profile together with its match result, or
/// `None` if no profile matches with confidence >= 0.6.
///
/// # Selection rule
///
/// Among qualifying profiles the highest confidence wins. On exactly equal
/// confidence the higher `priority` wins. If both are equal, the profile that
/// appears first in `profiles` is kept.
pub fn classify_and_select_profile(
    profiles: &[ExtractionProfile],
    page_data: &[(Vec<BlockJson>, Vec<SpanJson>)], // (blocks, spans) per page
    has_signature_field: bool,
    has_form_field: bool,
) -> Option<(ExtractionProfile, MatchResult)> {
    let signals = extract_signals_from_results(page_data, has_signature_field, has_form_field);

    // Track the leader by reference; the profile is cloned once, for the winner only.
    let mut best: Option<(&ExtractionProfile, MatchResult)> = None;

    for profile in profiles {
        let result = evaluate_match(&profile.match_expr, &signals);

        // Only consider matches with confidence >= 0.6
        if !(result.matched && result.confidence >= 0.6) {
            continue;
        }

        let takes_lead = match &best {
            None => true,
            Some((leader, leader_result)) => {
                result.confidence > leader_result.confidence
                    || (result.confidence == leader_result.confidence
                        && profile.priority > leader.priority)
            }
        };

        if takes_lead {
            best = Some((profile, result));
        }
    }

    best.map(|(profile, result)| (profile.clone(), result))
}
/// Compute the profile-related metadata values for an extraction result.
///
/// This function does not touch any metadata structure itself. It returns the
/// three values and the caller stores them (as `profile_name`,
/// `profile_version` and `profile_fields`).
///
/// # Arguments
///
/// * `profile` - The profile that was applied
/// * `pages` - Extracted pages used for field extraction
///
/// # Returns
///
/// `(name, version, fields)` where:
///
/// * `name` is the profile's name,
/// * `version` is currently always the fixed string `"1.0.0"`,
/// * `fields` is `None` when the profile declares no fields, otherwise a JSON
///   object mapping each field name to its extracted value.
pub fn apply_profile_to_metadata(
    profile: &ExtractionProfile,
    pages: &[PageJson],
) -> (String, String, Option<serde_json::Value>) {
    let profile_name = profile.name.clone();
    let profile_version = "1.0.0".to_string(); // Profile version schema

    // Nothing to extract: skip gathering blocks and text altogether.
    if profile.fields.is_empty() {
        return (profile_name, profile_version, None);
    }

    // Field extraction works on the whole document: all blocks of all pages...
    let all_blocks: Vec<BlockJson> = pages
        .iter()
        .flat_map(|p| p.blocks.iter().cloned())
        .collect();

    // ...and the text of all spans, joined with single spaces. Borrow the span
    // text instead of cloning each string just to concatenate it.
    let full_text = pages
        .iter()
        .flat_map(|p| p.spans.iter().map(|s| s.text.as_str()))
        .collect::<Vec<_>>()
        .join(" ");

    let field_results =
        field_extractor::extract_profile_fields(&profile.fields, &all_blocks, &full_text);

    // Keep only each result's value, keyed by field name.
    let mut fields_obj = serde_json::Map::new();
    for (field_name, result) in field_results {
        fields_obj.insert(field_name, result.value);
    }

    (profile_name, profile_version, Some(json!(fields_obj)))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Supported tuning fields are applied; unsupported ones only warn.
    #[test]
    fn test_apply_extraction_tuning() {
        let tuning = ExtractionTuning {
            reading_order: Some("line_dominant".to_string()),
            table_detection: Some("strict_borders".to_string()),
            readability_threshold: Some(0.4),
            include_invisible: Some(true),
            include_headers_footers: Some(true),
            zone_filtering: None,
            force_ocr: Some(false),
            min_block_chars: Some(10),
        };
        let mut options = ExtractionOptions::default();
        apply_extraction_tuning(&tuning, &mut options);

        // Check that output options were applied
        assert_eq!(options.output.include_invisible, true);
        assert_eq!(options.output.include_headers, true);
        assert_eq!(options.output.include_footers, true);
    }

    /// Unset tuning fields leave the corresponding options at their defaults.
    #[test]
    fn test_apply_extraction_tuning_partial() {
        let tuning = ExtractionTuning {
            reading_order: None,
            table_detection: None,
            readability_threshold: None,
            include_invisible: Some(false),
            include_headers_footers: None,
            zone_filtering: None,
            force_ocr: None,
            min_block_chars: None,
        };
        let mut options = ExtractionOptions::default();
        apply_extraction_tuning(&tuning, &mut options);

        assert_eq!(options.output.include_invisible, false);
        assert_eq!(options.output.include_headers, false);
        assert_eq!(options.output.include_footers, false);
    }

    /// With no profiles to evaluate there is nothing to select.
    #[test]
    fn test_classify_and_select_profile_no_match() {
        // Empty profiles list
        let profiles: Vec<ExtractionProfile> = vec![];
        // Element type must match the function signature: (blocks, spans) per page.
        let page_data: Vec<(Vec<BlockJson>, Vec<SpanJson>)> = vec![];
        let result = classify_and_select_profile(&profiles, &page_data, false, false);
        assert!(result.is_none());
    }

    /// A profile without field specs yields name and version but no fields.
    #[test]
    fn test_apply_profile_to_metadata_no_fields() {
        let profile_yaml = r#"
name: test
description: Test profile
priority: 10
"#;
        let profile: ExtractionProfile = serde_yaml::from_str(profile_yaml).unwrap();
        let pages = vec![];
        let (name, version, fields) = apply_profile_to_metadata(&profile, &pages);
        assert_eq!(name, "test");
        assert_eq!(version, "1.0.0");
        assert!(fields.is_none());
    }
}

View file

@ -0,0 +1,437 @@
//! Extraction profile types (Phase 7.10).
//!
//! This module defines the rich extraction profile format that extends Phase 5.6
//! classification with extraction tuning and field extraction. Extraction profiles
//! use a boolean match DSL (all/any/none combinators) and can override extraction
//! options and extract structured fields.
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Extraction profile with match DSL, extraction tuning, and field extraction.
///
/// This is the Phase 7.10 profile format, separate from the Phase 5.6 classification
/// `Profile` type. Extraction profiles drive both classification (via match DSL)
/// and extraction behavior (via tuning and field specs).
///
/// Only `name` and `description` are required when deserializing; every other
/// key falls back to a default when absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionProfile {
    /// Profile name (e.g., "invoice", "receipt")
    pub name: String,
    /// Human-readable description
    pub description: String,
    /// Priority for profile selection (higher = preferred when multiple match)
    #[serde(default = "default_priority")]
    pub priority: u32,
    /// Match DSL expression (boolean tree with all/any/none combinators).
    ///
    /// Stored under the key `match` in profile files. Without the rename the
    /// key would have to be spelled `match_expr`, and a `match:` block would be
    /// silently ignored in favour of the empty default expression. The old
    /// spelling is still accepted on input via the alias.
    #[serde(default, rename = "match", alias = "match_expr")]
    pub match_expr: MatchExpr,
    /// Extraction tuning overrides (optional)
    #[serde(default)]
    pub extraction: Option<ExtractionTuning>,
    /// Field extraction specifications (optional), keyed by field name
    #[serde(default)]
    pub fields: HashMap<String, FieldSpec>,
}
/// Serde fallback used when a profile does not specify `priority`.
fn default_priority() -> u32 {
    const FALLBACK_PRIORITY: u32 = 10;
    FALLBACK_PRIORITY
}
/// Boolean match expression for document classification.
///
/// Supports all/any/none combinators for building complex matching rules.
///
/// The enum is `#[serde(untagged)]`: when deserializing, serde tries the variants
/// in declaration order and keeps the first one that fits, so a bare predicate is
/// attempted before the `all` / `any` / `none` wrappers. The combinators nest, as
/// each holds a list of further `MatchExpr` values.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum MatchExpr {
/// Single predicate
Predicate(ExtractionMatchPredicate),
/// All of these must match
All { all: Vec<MatchExpr> },
/// Any of these can match
Any { any: Vec<MatchExpr> },
/// None of these must match
None { none: Vec<MatchExpr> },
}
impl Default for MatchExpr {
fn default() -> Self {
// Default to an Any that matches nothing (empty list)
MatchExpr::Any { any: Vec::new() }
}
}
/// Match predicate primitives for extraction profiles.
///
/// Uses serde's default (externally tagged) enum representation with
/// `snake_case` variant names, so in a profile file each predicate is a
/// single-key map such as `text_contains: { patterns: [...] }`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ExtractionMatchPredicate {
/// Text contains any of the given strings
TextContains {
#[serde(default)]
patterns: Vec<String>,
},
/// Text matches the given regex
TextMatches {
pattern: String,
},
/// Heading text matches the given regex
HeadingMatches {
pattern: String,
},
/// Document has currency pattern ($\d, €\d, etc.)
HasCurrencyPattern {
// Defaults to `false` when the key is omitted.
#[serde(default)]
has_currency_pattern: bool,
},
/// Document has signature fields (AcroForm)
HasSignatureField {
// Defaults to `false` when the key is omitted.
#[serde(default)]
has_signature_field: bool,
},
/// Structural predicates (has_table, page_count, etc.)
Structural {
#[serde(default)]
has_table: bool,
#[serde(default)]
has_form_field: bool,
#[serde(default)]
has_math: bool,
// NOTE(review): `flatten` means the `min` / `max` / `hint` keys of
// `PageCountRange` are read from this map directly rather than from a
// nested `page_count` key — confirm this matches the profile files.
#[serde(flatten)]
page_count: Option<PageCountRange>,
},
/// Text patterns alias for TextContains
// Serialized under the tag `text_patterns` instead of the snake_case variant name.
#[serde(rename = "text_patterns")]
TextContainsAlias {
#[serde(default)]
patterns: Vec<String>,
},
}
/// Page count range predicate.
///
/// Every key is optional and deserializes to `None` when absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageCountRange {
// Presumably the lower bound on the page count — TODO confirm inclusivity in the evaluator.
#[serde(default)]
pub min: Option<u32>,
// Presumably the upper bound on the page count — TODO confirm inclusivity in the evaluator.
#[serde(default)]
pub max: Option<u32>,
// Free-form string; its interpretation is not visible in this module.
#[serde(default)]
pub hint: Option<String>,
}
/// Extraction tuning overrides.
///
/// These fields override the default ExtractionOptions when a profile matches.
///
/// Every field is an `Option`, so a profile only needs to list the settings it
/// wants to change; omitted keys deserialize to `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionTuning {
/// Reading order algorithm
pub reading_order: Option<String>,
/// Table detection mode
pub table_detection: Option<String>,
/// Readability threshold (0.0-1.0)
pub readability_threshold: Option<f32>,
/// Include invisible text
pub include_invisible: Option<bool>,
/// Include headers and footers
pub include_headers_footers: Option<bool>,
/// Zone filtering mode
pub zone_filtering: Option<String>,
/// Force OCR
pub force_ocr: Option<bool>,
/// Minimum block characters
pub min_block_chars: Option<usize>,
}
/// Field extraction specification.
///
/// One entry of a profile's `fields` map: the declared type of the field plus
/// the rule describing how to find its value.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FieldSpec {
/// Field type (string, decimal, date, int, bool, array)
// Spelled `type` in profile files; renamed here because `type` is a Rust keyword.
#[serde(rename = "type")]
pub field_type: String,
/// Extraction specification
pub extraction: FieldExtraction,
}
/// Field extraction definition.
///
/// The enum is `untagged`, so serde tries the variants in declaration order:
/// a mapping that carries a `patterns` list becomes [`FieldExtraction::Patterns`],
/// anything else falls through to [`FieldExtraction::Rich`], whose keys are all
/// optional. Keep `Patterns` first — swapping the order would make `Rich`
/// swallow every input.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum FieldExtraction {
/// Simple pattern-based extraction
Patterns {
/// Regex patterns, tried in order against the document text.
patterns: Vec<String>,
/// Value to use when no pattern matches.
#[serde(default)]
fallback: Option<serde_yaml::Value>,
},
/// Rich extraction with localizers and extractors
///
/// NOTE(review): the field evaluator added in this commit only consumes
/// `regex`, `near` and `parse`; `max_distance_pt`, `region`, `pick`, `after`,
/// `after_heading`, `table_region`, `columnar_regions` and `schema` are parsed
/// but ignored there.
Rich {
/// Regex pattern
#[serde(default)]
regex: Option<String>,
/// Near anchors (search near these strings)
#[serde(default)]
near: Option<Vec<String>>,
/// Maximum distance in points
#[serde(default)]
max_distance_pt: Option<usize>,
/// Region specification
#[serde(default)]
region: Option<String>,
/// Pick strategy (largest_font, smallest_font, nearest_below, nearest_right, first, last)
#[serde(default)]
pick: Option<String>,
/// Parse type (decimal, date, int, bool, string)
#[serde(default)]
parse: Option<String>,
/// After field (for ordering)
#[serde(default)]
after: Option<String>,
/// After heading
#[serde(default)]
after_heading: Option<String>,
/// Table region for array fields
#[serde(default)]
table_region: Option<String>,
/// Columnar regions for array fields
#[serde(default)]
columnar_regions: Option<String>,
/// Array schema for structured data
#[serde(default)]
schema: Option<Vec<FieldSchema>>,
/// Fallback value
#[serde(default)]
fallback: Option<serde_yaml::Value>,
},
}
/// Schema field for array extraction.
///
/// Describes one named column/member of an array-typed field.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FieldSchema {
/// Name of the member.
pub name: String,
/// Declared type of the member; serialized under the YAML key `type`.
#[serde(rename = "type")]
pub field_type: String,
/// Whether the member is mandatory; defaults to `false` when omitted.
#[serde(default)]
pub required: bool,
}
// Deserialization tests for the profile data model: each test feeds a YAML
// snippet to serde_yaml and checks the resulting Rust value.
#[cfg(test)]
mod tests {
use super::*;
// A minimal profile (name, description, priority only) must deserialize.
#[test]
fn test_extraction_profile_basic() {
let yaml = r#"
name: test
description: Test profile
priority: 50
"#;
let profile: ExtractionProfile = serde_yaml::from_str(yaml).unwrap();
assert_eq!(profile.name, "test");
assert_eq!(profile.description, "Test profile");
assert_eq!(profile.priority, 50);
}
// `all:` combinator with two sub-expressions.
// NOTE(review): the snippet is wrapped in a top-level `match:` key while the
// target type is `MatchExpr` itself — confirm this deserializes as intended.
#[test]
fn test_match_expr_all() {
let yaml = r#"
match:
all:
- text_contains:
patterns: ["invoice", "bill"]
- structural:
has_table: true
"#;
let expr: MatchExpr = serde_yaml::from_str(yaml).unwrap();
match expr {
MatchExpr::All { all } => {
assert_eq!(all.len(), 2);
}
_ => panic!("Expected All"),
}
}
// `any:` combinator with two sub-expressions (same `match:` wrapper caveat as above).
#[test]
fn test_match_expr_any() {
let yaml = r#"
match:
any:
- text_contains:
patterns: ["receipt"]
- text_matches:
pattern: "\\d+\\.\\d{2}"
"#;
let expr: MatchExpr = serde_yaml::from_str(yaml).unwrap();
match expr {
MatchExpr::Any { any } => {
assert_eq!(any.len(), 2);
}
_ => panic!("Expected Any"),
}
}
// `none:` combinator with a single excluded sub-expression (same `match:` wrapper caveat).
#[test]
fn test_match_expr_none() {
let yaml = r#"
match:
none:
- text_contains:
patterns: ["abstract", "bibliography"]
"#;
let expr: MatchExpr = serde_yaml::from_str(yaml).unwrap();
match expr {
MatchExpr::None { none } => {
assert_eq!(none.len(), 1);
}
_ => panic!("Expected None"),
}
}
// Partial tuning block: only the listed settings are expected to be `Some`.
// NOTE(review): the snippet starts with an `extraction:` key although the
// target type is `ExtractionTuning` — confirm the nesting matches the type.
#[test]
fn test_extraction_tuning() {
let yaml = r#"
extraction:
reading_order: xy_cut
table_detection: strict_borders
readability_threshold: 0.4
include_invisible: false
"#;
let tuning: ExtractionTuning = serde_yaml::from_str(yaml).unwrap();
assert_eq!(tuning.reading_order, Some("xy_cut".to_string()));
assert_eq!(tuning.table_detection, Some("strict_borders".to_string()));
assert_eq!(tuning.readability_threshold, Some(0.4));
assert_eq!(tuning.include_invisible, Some(false));
}
// A spec with a `patterns` list must select the `Patterns` variant.
// NOTE(review): the snippet is keyed by the field name (`total:`) while the
// target type is `FieldSpec` — confirm the nesting matches the type.
#[test]
fn test_field_spec_simple() {
let yaml = r#"
total:
type: decimal
extraction:
patterns:
- "\\$\\s*(\\d+\\.\\d{2})"
fallback: null
"#;
let field: FieldSpec = serde_yaml::from_str(yaml).unwrap();
assert_eq!(field.field_type, "decimal");
match field.extraction {
FieldExtraction::Patterns { patterns, .. } => {
assert_eq!(patterns.len(), 1);
}
_ => panic!("Expected Patterns"),
}
}
// A spec without `patterns` must fall through to the `Rich` variant
// (same field-name wrapper caveat as the previous test).
#[test]
fn test_field_spec_rich() {
let yaml = r#"
invoice_number:
type: string
extraction:
regex: "Invoice\\s*#\\s*([\\w-]+)"
near: ["Invoice", "Invoice Number"]
max_distance_pt: 200
"#;
let field: FieldSpec = serde_yaml::from_str(yaml).unwrap();
assert_eq!(field.field_type, "string");
match field.extraction {
FieldExtraction::Rich { regex, near, max_distance_pt, .. } => {
assert!(regex.is_some());
assert!(near.is_some());
assert_eq!(max_distance_pt, Some(200));
}
_ => panic!("Expected Rich"),
}
}
// Full profile: parse, serialize back to YAML, parse again, compare names.
#[test]
fn test_full_profile_roundtrip() {
let yaml = r#"
name: invoice
description: Commercial invoice with line items
priority: 50
match:
all:
- any:
- text_contains:
patterns: ["invoice", "bill to"]
- heading_matches:
pattern: "^Invoice\\b"
- structural:
has_table: true
extraction:
reading_order: line_dominant
table_detection: strict_borders
readability_threshold: 0.4
fields:
invoice_number:
type: string
extraction:
regex: "Invoice\\s*#\\s*([\\w-]+)"
near: ["Invoice"]
total:
type: decimal
extraction:
patterns:
- "total.*([\\d,]+\\.\\d{2})"
fallback: null
"#;
let profile: ExtractionProfile = serde_yaml::from_str(yaml).unwrap();
assert_eq!(profile.name, "invoice");
assert_eq!(profile.priority, 50);
assert!(profile.extraction.is_some());
assert_eq!(profile.fields.len(), 2);
// Round-trip
let yaml_out = serde_yaml::to_string(&profile).unwrap();
let profile2: ExtractionProfile = serde_yaml::from_str(&yaml_out).unwrap();
assert_eq!(profile2.name, profile.name);
}
}

View file

@ -0,0 +1,374 @@
//! Extraction profile loader (Phase 7.10).
//!
//! Loads extraction profiles from built-in sources, system directories,
//! XDG config paths, and custom --profile-dir flags.
use super::extraction::ExtractionProfile;
use super::loader::{check_forbidden_keys, ForbiddenKeyError, ProfileLoadError};
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
/// Profile source with priority metadata.
///
/// Pairs a parsed profile with provenance information so callers can tell
/// where the definition that won a name collision came from.
#[derive(Debug, Clone)]
pub struct ProfileSource {
/// The loaded profile
pub profile: ExtractionProfile,
/// Where this profile came from
pub source: ProfileOrigin,
/// Whether this overrides a built-in profile
///
/// Always `false` for the built-in profiles themselves.
pub overrides_builtin: bool,
}
/// Origin of a profile.
///
/// The variants are listed in the order the loader visits them, which is
/// lowest to highest precedence on a name collision.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ProfileOrigin {
/// Built-in profile (compiled into binary)
BuiltIn,
/// System-wide profile (/etc/pdftract/profiles/)
System,
/// User profile (XDG config directory)
User,
/// Custom profile directory (--profile-dir)
///
/// Carries the directory the profile was loaded from.
Custom(PathBuf),
}
/// Collect every extraction profile visible on the search path.
///
/// Sources are visited from lowest to highest precedence; a profile loaded
/// later replaces an earlier one that has the same name:
/// 1. Built-in profiles (compiled in)
/// 2. System directory (/etc/pdftract/profiles/)
/// 3. User directory (XDG config: ~/.config/pdftract/profiles/)
/// 4. Custom directories (--profile-dir, repeatable), in the order given
///
/// Directories that do not exist are skipped silently.
///
/// # Returns
///
/// The surviving profiles, ordered by descending `priority` and then by
/// ascending name.
///
/// # Errors
///
/// Propagates any [`ProfileLoadError`] reported by the individual loaders.
pub fn load_extraction_profiles(
    custom_dirs: &[PathBuf],
) -> Result<Vec<ProfileSource>, ProfileLoadError> {
    let mut by_name: HashMap<String, ProfileSource> = HashMap::new();

    // Lowest precedence: profiles embedded in the binary.
    load_builtin_profiles(&mut by_name)?;

    // System-wide directory.
    let system_dir = PathBuf::from("/etc/pdftract/profiles");
    if system_dir.exists() {
        load_profiles_from_dir(&system_dir, ProfileOrigin::System, &mut by_name)?;
    }

    // Per-user directory under the platform config dir.
    match get_xdg_profile_dir() {
        Some(user_dir) if user_dir.exists() => {
            load_profiles_from_dir(&user_dir, ProfileOrigin::User, &mut by_name)?;
        }
        _ => {}
    }

    // Highest precedence: explicitly requested directories, in argument order.
    for dir in custom_dirs.iter().filter(|dir| dir.exists()) {
        load_profiles_from_dir(dir, ProfileOrigin::Custom(dir.clone()), &mut by_name)?;
    }

    // Highest priority first; names break ties so the order is deterministic.
    let mut ordered: Vec<ProfileSource> = by_name.into_values().collect();
    ordered.sort_by(|lhs, rhs| {
        rhs.profile
            .priority
            .cmp(&lhs.profile.priority)
            .then_with(|| lhs.profile.name.cmp(&rhs.profile.name))
    });
    Ok(ordered)
}
/// Locate the per-user profile directory.
///
/// Resolves to `<config dir>/pdftract/profiles` (e.g.
/// `~/.config/pdftract/profiles/`), where the config dir is whatever
/// `dirs::config_dir()` reports. Returns `None` when no config directory is
/// available. The path is not checked for existence.
pub fn get_xdg_profile_dir() -> Option<PathBuf> {
    let config_root = dirs::config_dir()?;
    Some(config_root.join("pdftract").join("profiles"))
}
/// Register the profiles that ship inside the binary.
///
/// The YAML sources are embedded with `include_str!` and only compiled in
/// when the `profiles` feature is enabled; without it this is a no-op.
/// A built-in that fails to parse is reported on stderr and skipped, so the
/// function currently always returns `Ok(())`.
fn load_builtin_profiles(
    profiles: &mut HashMap<String, ProfileSource>,
) -> Result<(), ProfileLoadError> {
    #[cfg(feature = "profiles")]
    {
        // (display name, embedded YAML, logical path used in error messages)
        const BUILTINS: [(&str, &str, &str); 9] = [
            (
                "invoice",
                include_str!("../../../../profiles/builtin/invoice/profile.yaml"),
                "profiles/builtin/invoice/profile.yaml",
            ),
            (
                "receipt",
                include_str!("../../../../profiles/builtin/receipt/profile.yaml"),
                "profiles/builtin/receipt/profile.yaml",
            ),
            (
                "contract",
                include_str!("../../../../profiles/builtin/contract/profile.yaml"),
                "profiles/builtin/contract/profile.yaml",
            ),
            (
                "scientific_paper",
                include_str!("../../../../profiles/builtin/scientific_paper/profile.yaml"),
                "profiles/builtin/scientific_paper/profile.yaml",
            ),
            (
                "slide_deck",
                include_str!("../../../../profiles/builtin/slide_deck/profile.yaml"),
                "profiles/builtin/slide_deck/profile.yaml",
            ),
            (
                "form",
                include_str!("../../../../profiles/builtin/form/profile.yaml"),
                "profiles/builtin/form/profile.yaml",
            ),
            (
                "bank_statement",
                include_str!("../../../../profiles/builtin/bank_statement/profile.yaml"),
                "profiles/builtin/bank_statement/profile.yaml",
            ),
            (
                "legal_filing",
                include_str!("../../../../profiles/builtin/legal_filing/profile.yaml"),
                "profiles/builtin/legal_filing/profile.yaml",
            ),
            (
                "book_chapter",
                include_str!("../../../../profiles/builtin/book_chapter/profile.yaml"),
                "profiles/builtin/book_chapter/profile.yaml",
            ),
        ];

        for (name, yaml, source_path) in BUILTINS {
            match load_profile_yaml(yaml, source_path) {
                Ok(profile) => {
                    // Keyed by the name declared inside the YAML, not the table label.
                    let key = profile.name.clone();
                    let entry = ProfileSource {
                        profile,
                        source: ProfileOrigin::BuiltIn,
                        overrides_builtin: false,
                    };
                    profiles.insert(key, entry);
                }
                Err(e) => {
                    eprintln!("Failed to parse built-in profile '{}': {}", name, e);
                }
            }
        }
    }
    Ok(())
}
/// Parse an extraction profile from YAML text.
///
/// The text is first parsed into a generic YAML value and screened with
/// `check_forbidden_keys`; only then is it deserialized into an
/// [`ExtractionProfile`].
///
/// # Arguments
///
/// * `content` - Raw YAML text (also handed to the key check, which reports a line)
/// * `source_path` - Label prepended to the key path in forbidden-key errors
///
/// # Errors
///
/// Returns a YAML error when either parse fails, or
/// `ProfileLoadError::ForbiddenKey` when the key check rejects the document.
fn load_profile_yaml(content: &str, source_path: &str) -> Result<ExtractionProfile, ProfileLoadError> {
    let raw: serde_yaml::Value = serde_yaml::from_str(content)?;

    check_forbidden_keys(&raw, "", content).map_err(|violation| {
        ProfileLoadError::ForbiddenKey {
            key: violation.key,
            path: format!("{}: {}", source_path, violation.path),
            line: violation.line,
        }
    })?;

    serde_yaml::from_str::<ExtractionProfile>(content).map_err(ProfileLoadError::YamlError)
}
/// Load profiles from a directory.
///
/// Two layouts are recognised for each directory entry:
/// * a sub-directory containing `profile.yaml` (e.g. `invoice/profile.yaml`)
/// * a plain file with the `.yaml` extension
///
/// Anything else is ignored. A profile that loads successfully replaces any
/// previously registered profile with the same name.
///
/// Loading is best-effort per file: a profile that cannot be read or parsed
/// (including one rejected for forbidden keys) is reported on stderr and
/// skipped, so one broken file does not hide the rest of the directory.
///
/// `overrides_builtin` is set only when the replaced entry was a built-in, or
/// was itself already shadowing one; replacing an unrelated user-defined
/// profile does not count as overriding a built-in.
///
/// # Errors
///
/// Returns `ProfileLoadError::IoError` when the directory itself cannot be
/// listed or an entry cannot be read.
fn load_profiles_from_dir(
    dir: &Path,
    origin: ProfileOrigin,
    profiles: &mut HashMap<String, ProfileSource>,
) -> Result<(), ProfileLoadError> {
    for entry in fs::read_dir(dir).map_err(ProfileLoadError::IoError)? {
        let path = entry.map_err(ProfileLoadError::IoError)?.path();

        // Resolve the entry to the YAML file that should be loaded, if any.
        let candidate = if path.is_dir() {
            let nested = path.join("profile.yaml");
            if !nested.exists() {
                continue;
            }
            nested
        } else if path.extension().and_then(|ext| ext.to_str()) == Some("yaml") {
            path
        } else {
            continue;
        };

        match load_profile_file(&candidate) {
            Ok(profile) => {
                let overrides_builtin = matches!(
                    profiles.get(&profile.name),
                    Some(existing)
                        if existing.source == ProfileOrigin::BuiltIn || existing.overrides_builtin
                );
                profiles.insert(
                    profile.name.clone(),
                    ProfileSource {
                        profile,
                        source: origin.clone(),
                        overrides_builtin,
                    },
                );
            }
            Err(e) => {
                // Best-effort: keep going, but do not hide the failure.
                eprintln!("Skipping profile '{}': {}", candidate.display(), e);
            }
        }
    }
    Ok(())
}
/// Read a profile from `path` and parse it.
///
/// The file path doubles as the source label in forbidden-key errors.
///
/// # Errors
///
/// Returns `ProfileLoadError::IoError` when the file cannot be read, or
/// whatever the YAML loader reports for invalid content.
pub fn load_profile_file(path: &Path) -> Result<ExtractionProfile, ProfileLoadError> {
    fs::read_to_string(path)
        .map_err(ProfileLoadError::IoError)
        .and_then(|yaml| load_profile_yaml(&yaml, &path.to_string_lossy()))
}
/// Find a profile by name or path.
///
/// - If `name_or_path` is an existing file path, load it directly
/// - Otherwise, search for a profile with that name in the loaded profiles
pub fn find_profile(
name_or_path: &str,
profiles: &[ProfileSource],
) -> Result<ExtractionProfile, ProfileLoadError> {
// First, check if it's a file path
let path = PathBuf::from(name_or_path);
if path.exists() {
return load_profile_file(&path);
}
// Search by name
for source in profiles {
if source.profile.name == name_or_path {
return Ok(source.profile.clone());
}
}
Err(ProfileLoadError::IoError(std::io::Error::new(
std::io::ErrorKind::NotFound,
format!("Profile '{}' not found", name_or_path),
)))
}
/// Check that a profile file is well-formed without registering it.
///
/// Runs the same three steps as loading — generic YAML parse, forbidden-key
/// screening, typed deserialization — and discards the parsed profile.
///
/// # Errors
///
/// Returns `IoError` when the file cannot be read, `YamlError` when either
/// parse fails, or `ForbiddenKey` when the key check rejects the document.
pub fn validate_profile_file(path: &Path) -> Result<(), ProfileLoadError> {
    let content = fs::read_to_string(path).map_err(ProfileLoadError::IoError)?;

    let raw: serde_yaml::Value =
        serde_yaml::from_str(&content).map_err(ProfileLoadError::YamlError)?;

    if let Err(violation) = check_forbidden_keys(&raw, "", &content) {
        return Err(ProfileLoadError::ForbiddenKey {
            key: violation.key,
            path: violation.path,
            line: violation.line,
        });
    }

    // Typed parse; the value itself is not needed.
    serde_yaml::from_str::<ExtractionProfile>(&content)
        .map(|_| ())
        .map_err(ProfileLoadError::YamlError)
}
// Tests for the profile loader: search-path helpers, built-in loading and
// file validation.
#[cfg(test)]
mod tests {
use super::*;
// The user profile directory must end in `pdftract/profiles`.
// NOTE(review): asserts that a config dir exists, so this depends on the
// environment the test runs in (e.g. HOME being set) — confirm for CI.
#[test]
fn test_get_xdg_profile_dir() {
let dir = get_xdg_profile_dir();
assert!(dir.is_some());
let path = dir.unwrap();
assert!(path.ends_with("pdftract/profiles"));
}
// With the `profiles` feature on, at least one built-in must load.
#[test]
fn test_load_builtin_profiles() {
let mut profiles = HashMap::new();
let result = load_builtin_profiles(&mut profiles);
#[cfg(feature = "profiles")]
{
assert!(result.is_ok());
// Should have loaded some profiles
assert!(!profiles.is_empty());
}
}
// A small, well-formed profile written to a temp file must validate.
#[test]
fn test_validate_simple_profile() {
let yaml = r#"
name: test
description: Test profile
priority: 10
match:
text_contains:
patterns: ["test"]
"#;
let temp_dir = tempfile::tempdir().unwrap();
let profile_path = temp_dir.path().join("test.yaml");
fs::write(&profile_path, yaml).unwrap();
let result = validate_profile_file(&profile_path);
assert!(result.is_ok());
}
// The same profile with an `api_key` entry must be rejected.
#[test]
fn test_validate_profile_with_forbidden_key() {
let yaml = r#"
name: test
description: Test profile
priority: 10
match:
text_contains:
patterns: ["test"]
api_key: "secret"
"#;
let temp_dir = tempfile::tempdir().unwrap();
let profile_path = temp_dir.path().join("test.yaml");
fs::write(&profile_path, yaml).unwrap();
let result = validate_profile_file(&profile_path);
assert!(result.is_err());
}
// Loading with no custom directories must still succeed.
// NOTE(review): this also reads the real system and user profile
// directories of the machine running the test.
#[test]
fn test_load_extraction_profiles_empty() {
let profiles = load_extraction_profiles(&[]).unwrap();
#[cfg(feature = "profiles")]
assert!(!profiles.is_empty()); // At least built-ins
}
}

View file

@ -0,0 +1,353 @@
//! Field extraction DSL evaluator (Phase 7.10).
//!
//! Evaluates field extraction specifications from profiles and extracts
//! structured fields from document text. Supports:
//! - Localizers: near, region, pick
//! - Extractors: regex, parse
//! - Strategies for disambiguating multiple candidates
use super::extraction::{FieldExtraction, FieldSchema, FieldSpec};
use crate::schema::BlockJson;
use regex::Regex;
use serde_json::Value;
use std::collections::HashMap;
/// Convert serde_yaml::Value to serde_json::Value.
///
/// Conversion rules:
/// * Numbers are kept exact where possible: signed integers first, then
///   unsigned integers (so values above `i64::MAX` are not degraded to a
///   float), then floats. Non-finite floats have no JSON form and become null.
/// * Mapping keys must be strings in JSON. String keys are used as-is, bool
///   and number keys are rendered with their textual form, and any other key
///   kind (null, sequence, mapping, tagged) is skipped together with its value.
/// * Tagged values are converted as their inner value; the tag is dropped.
fn convert_yaml_to_json(yaml_value: &serde_yaml::Value) -> Value {
    match yaml_value {
        serde_yaml::Value::Null => Value::Null,
        serde_yaml::Value::Bool(b) => Value::Bool(*b),
        serde_yaml::Value::Number(n) => {
            if let Some(i) = n.as_i64() {
                Value::Number(i.into())
            } else if let Some(u) = n.as_u64() {
                // Large positive integers that do not fit in i64.
                Value::Number(u.into())
            } else {
                n.as_f64()
                    .and_then(serde_json::Number::from_f64)
                    .map(Value::Number)
                    .unwrap_or(Value::Null)
            }
        }
        serde_yaml::Value::String(s) => Value::String(s.clone()),
        serde_yaml::Value::Sequence(seq) => {
            Value::Array(seq.iter().map(convert_yaml_to_json).collect())
        }
        serde_yaml::Value::Mapping(map) => {
            let mut obj = serde_json::Map::new();
            for (k, v) in map {
                let key = match k {
                    serde_yaml::Value::String(key_str) => key_str.clone(),
                    serde_yaml::Value::Bool(b) => b.to_string(),
                    serde_yaml::Value::Number(n) => n.to_string(),
                    // No sensible JSON object key for these.
                    _ => continue,
                };
                obj.insert(key, convert_yaml_to_json(v));
            }
            Value::Object(obj)
        }
        serde_yaml::Value::Tagged(tagged) => convert_yaml_to_json(&tagged.value),
    }
}
/// Result of field extraction.
///
/// Produced once per profile field; the `details` string is diagnostic only.
#[derive(Debug, Clone)]
pub struct FieldExtractionResult {
/// Extracted field value (null if not found)
pub value: Value,
/// Human-readable extraction details (for debugging)
pub details: String,
}
/// Run every field specification of a profile against a document.
///
/// # Arguments
///
/// * `fields` - Field specifications from the profile, keyed by field name
/// * `blocks` - Extracted blocks from the document
/// * `full_text` - Full document text
///
/// # Returns
///
/// One [`FieldExtractionResult`] per entry in `fields`, under the same key.
pub fn extract_profile_fields(
    fields: &HashMap<String, FieldSpec>,
    blocks: &[BlockJson],
    full_text: &str,
) -> HashMap<String, FieldExtractionResult> {
    fields
        .iter()
        .map(|(name, spec)| {
            let outcome = extract_single_field(spec, blocks, full_text);
            (name.clone(), outcome)
        })
        .collect()
}
/// Extract a single field from the document.
fn extract_single_field(
field_spec: &FieldSpec,
blocks: &[BlockJson],
full_text: &str,
) -> FieldExtractionResult {
match &field_spec.extraction {
FieldExtraction::Patterns { patterns, fallback } => {
let json_fallback = fallback.as_ref().map(convert_yaml_to_json);
extract_by_patterns(patterns, full_text, &json_fallback)
}
FieldExtraction::Rich {
regex,
near,
max_distance_pt,
region,
pick,
parse,
after: _,
after_heading: _,
table_region: _,
columnar_regions: _,
schema: _,
fallback,
} => {
let json_fallback = fallback.as_ref().map(convert_yaml_to_json);
extract_rich(
regex,
near,
max_distance_pt,
region,
pick,
parse,
blocks,
full_text,
&json_fallback,
)
}
}
}
/// Extract a value by trying a list of regex patterns in order.
///
/// The first pattern that compiles and matches `full_text` wins. Its first
/// capture group is returned when it participated in the match, otherwise the
/// whole match. Patterns that fail to compile are skipped. When nothing
/// matches, the result carries `fallback`, or null if there is none.
fn extract_by_patterns(
    patterns: &[String],
    full_text: &str,
    fallback: &Option<Value>,
) -> FieldExtractionResult {
    let first_hit = patterns.iter().find_map(|pattern| {
        let re = Regex::new(pattern).ok()?;
        let caps = re.captures(full_text)?;
        // Prefer capture group 1; fall back to the full match.
        let text = caps.get(1).or(caps.get(0)).map_or("", |m| m.as_str());
        Some((pattern, text.to_string()))
    });

    match first_hit {
        Some((pattern, text)) => FieldExtractionResult {
            details: format!("Matched pattern '{}': '{}'", pattern, text),
            value: Value::String(text),
        },
        None => FieldExtractionResult {
            value: fallback.clone().unwrap_or(Value::Null),
            details: "No patterns matched, using fallback".to_string(),
        },
    }
}
/// Extract using rich field extraction with localizers and extractors.
fn extract_rich(
regex: &Option<String>,
near: &Option<Vec<String>>,
_max_distance_pt: &Option<usize>,
_region: &Option<String>,
_pick: &Option<String>,
parse: &Option<String>,
_blocks: &[BlockJson],
full_text: &str,
fallback: &Option<Value>,
) -> FieldExtractionResult {
// For rich extraction, we need to find text near anchors
// This is a simplified version that searches the full text
// Find anchor position if "near" is specified
let search_text = if let Some(anchors) = near {
// Find the position of the first anchor in the text
let anchor_pos = anchors
.iter()
.find_map(|anchor| full_text.find(anchor))
.unwrap_or(0);
// Search in text after the anchor
if let Some(pos) = full_text.get(anchor_pos..) {
pos
} else {
full_text
}
} else {
full_text
};
// Extract value using regex
let raw_value = if let Some(pattern) = regex {
extract_with_regex(pattern, search_text)
} else {
// If no regex, use the first few words from search text
search_text
.split_whitespace()
.next()
.unwrap_or("")
.to_string()
};
// Parse value according to type
let parsed_value = parse_value(&raw_value, parse.as_deref());
FieldExtractionResult {
value: parsed_value,
details: format!("Extracted value: '{}'", raw_value),
}
}
/// Apply `pattern` to `text` and return the captured text.
///
/// Yields capture group 1 when it participated in the match, otherwise the
/// whole match. An invalid pattern or a failed match both produce an empty
/// string.
fn extract_with_regex(pattern: &str, text: &str) -> String {
    Regex::new(pattern)
        .ok()
        .and_then(|re| {
            re.captures(text).map(|caps| {
                caps.get(1)
                    .or(caps.get(0))
                    .map(|m| m.as_str().to_string())
                    .unwrap_or_default()
            })
        })
        .unwrap_or_default()
}
/// Parse a value according to the specified type.
///
/// `raw` is trimmed first. Supported `parse_type` values:
/// * `"decimal"` — strips `$`, `€`, `£`, `¥` and `,`, then parses as `f64`;
///   null when the text is not a finite number.
/// * `"int"` — parses as `i64`; null on failure.
/// * `"bool"` — `true` for `true` / `yes` / `1` (case-insensitive), `false`
///   for anything else.
/// * `"date"` — returned as a string unchanged; no normalization is done yet.
/// * `"string"`, `None`, or any unknown type — returned as a string.
fn parse_value(raw: &str, parse_type: Option<&str>) -> Value {
    let raw = raw.trim();
    match parse_type {
        Some("decimal") => {
            // Drop currency symbols and thousands separators in one pass.
            let cleaned: String = raw
                .chars()
                .filter(|c| !matches!(c, '$' | '€' | '£' | '¥' | ','))
                .collect();
            cleaned
                .parse::<f64>()
                .ok()
                .and_then(serde_json::Number::from_f64)
                .map(Value::Number)
                .unwrap_or(Value::Null)
        }
        // `Value::Number` wraps `serde_json::Number`, so convert the i64 first.
        Some("int") => raw
            .parse::<i64>()
            .map_or(Value::Null, |n| Value::Number(n.into())),
        Some("bool") => Value::Bool(matches!(
            raw.to_lowercase().as_str(),
            "true" | "yes" | "1"
        )),
        Some("date") | Some("string") | None => Value::String(raw.to_string()),
        // Unknown parse types degrade to plain strings.
        Some(_) => Value::String(raw.to_string()),
    }
}
// Unit tests for the pattern extractor, the regex helper and `parse_value`.
#[cfg(test)]
mod tests {
use super::*;
// The first capture group of a matching pattern is returned.
#[test]
fn test_extract_by_patterns_simple() {
let full_text = "Invoice #12345\nTotal: $100.00";
let patterns = vec![r"Invoice #(\w+)".to_string()];
let result = extract_by_patterns(&patterns, full_text, &None);
assert_eq!(result.value, "12345");
assert!(result.details.contains("Matched pattern"));
}
// When nothing matches, the supplied fallback is returned.
#[test]
fn test_extract_by_patterns_no_match() {
let full_text = "Receipt #ABC";
let patterns = vec![r"Invoice #(\w+)".to_string()];
let fallback = Some(Value::String("UNKNOWN".to_string()));
let result = extract_by_patterns(&patterns, full_text, &fallback);
assert_eq!(result.value, "UNKNOWN");
assert!(result.details.contains("No patterns matched"));
}
// Decimals: plain, with currency symbol and thousands separator, and invalid.
#[test]
fn test_parse_value_decimal() {
assert_eq!(
parse_value("100.50", Some("decimal")),
Value::Number(serde_json::Number::from_f64(100.50).unwrap())
);
assert_eq!(
parse_value("$1,234.56", Some("decimal")),
Value::Number(serde_json::Number::from_f64(1234.56).unwrap())
);
assert_eq!(parse_value("invalid", Some("decimal")), Value::Null);
}
// Integers: valid input becomes a number, invalid input becomes null.
#[test]
fn test_parse_value_int() {
assert_eq!(parse_value("42", Some("int")), Value::Number(42.into()));
assert_eq!(parse_value("invalid", Some("int")), Value::Null);
}
// Booleans: `true`/`yes` are true, `false`/`no` are false.
#[test]
fn test_parse_value_bool() {
assert_eq!(parse_value("true", Some("bool")), Value::Bool(true));
assert_eq!(parse_value("yes", Some("bool")), Value::Bool(true));
assert_eq!(parse_value("false", Some("bool")), Value::Bool(false));
assert_eq!(parse_value("no", Some("bool")), Value::Bool(false));
}
// Dates are passed through as strings.
#[test]
fn test_parse_value_date() {
let result = parse_value("2025-01-15", Some("date"));
assert_eq!(result, Value::String("2025-01-15".to_string()));
}
// Explicit `string` and a missing parse type both yield a string.
#[test]
fn test_parse_value_string() {
assert_eq!(
parse_value("hello", Some("string")),
Value::String("hello".to_string())
);
assert_eq!(parse_value("world", None), Value::String("world".to_string()));
}
// The regex helper returns capture group 1.
#[test]
fn test_extract_with_regex() {
let text = "Invoice: INV-2025-00123";
let pattern = r"Invoice:\s*([\w-]+)";
let result = extract_with_regex(pattern, text);
assert_eq!(result, "INV-2025-00123");
}
// The regex helper returns an empty string when nothing matches.
#[test]
fn test_extract_with_regex_no_match() {
let text = "Receipt: R-123";
let pattern = r"Invoice:\s*([\w-]+)";
let result = extract_with_regex(pattern, text);
assert!(result.is_empty());
}
}

View file

@ -0,0 +1,528 @@
//! Match DSL evaluator for extraction profiles.
//!
//! Evaluates boolean match expressions (all/any/none combinators) against
//! document signals to determine if a profile matches a document.
use super::engine::FeatureSignals;
use super::extraction::{ExtractionMatchPredicate, MatchExpr, PageCountRange};
use regex::Regex;
use std::collections::HashMap;
use std::sync::Mutex;
/// Result of match evaluation.
///
/// The derived `Default` is "not matched, no reasons, confidence 0.0".
#[derive(Debug, Clone, Default)]
pub struct MatchResult {
/// Whether the match succeeded
pub matched: bool,
/// Human-readable reasons for the match (for debugging/metadata)
///
/// Combinators append the reasons of their sub-expressions plus one summary line.
pub reasons: Vec<String>,
/// Confidence score (0.0-1.0)
pub confidence: f32,
}
/// Recursively evaluate a match expression against document signals.
///
/// * A bare predicate is delegated to `evaluate_predicate`.
/// * `all` matches when every sub-expression matches; its confidence is the
///   minimum of the sub-confidences (1.0 for an empty list). Every
///   sub-expression is evaluated, even after a failure, so that all reasons
///   are collected.
/// * `any` matches when at least one sub-expression matches; its confidence
///   is the maximum confidence among the matching ones.
/// * `none` matches when no sub-expression matches (confidence 1.0); each
///   offending sub-expression is recorded and the confidence drops to 0.0.
///
/// Every combinator appends one summary line to `reasons`, except `none`
/// when it fails.
pub fn evaluate_match(expr: &MatchExpr, signals: &FeatureSignals) -> MatchResult {
    match expr {
        MatchExpr::Predicate(pred) => evaluate_predicate(pred, signals),
        MatchExpr::All { all } => {
            let mut matched = true;
            let mut confidence: f32 = 1.0;
            let mut reasons = Vec::new();
            for child in all {
                let outcome = evaluate_match(child, signals);
                reasons.extend(outcome.reasons);
                // No short-circuit: keep gathering reasons for debugging.
                matched &= outcome.matched;
                confidence = confidence.min(outcome.confidence);
            }
            let summary = if matched {
                "all: all sub-expressions matched"
            } else {
                "all: some sub-expressions did not match"
            };
            reasons.push(summary.to_string());
            MatchResult {
                matched,
                reasons,
                confidence,
            }
        }
        MatchExpr::Any { any } => {
            let mut matched = false;
            let mut confidence: f32 = 0.0;
            let mut reasons = Vec::new();
            for child in any {
                let outcome = evaluate_match(child, signals);
                if outcome.matched {
                    matched = true;
                    confidence = confidence.max(outcome.confidence);
                }
                reasons.extend(outcome.reasons);
            }
            let summary = if matched {
                "any: at least one sub-expression matched"
            } else {
                "any: no sub-expressions matched"
            };
            reasons.push(summary.to_string());
            MatchResult {
                matched,
                reasons,
                confidence,
            }
        }
        MatchExpr::None { none } => {
            let mut excluded_hit = false;
            let mut reasons = Vec::new();
            for child in none {
                let outcome = evaluate_match(child, signals);
                if outcome.matched {
                    excluded_hit = true;
                    reasons.push(format!(
                        "none: excluded sub-expression matched: {:?}",
                        outcome.reasons
                    ));
                }
            }
            if !excluded_hit {
                reasons.push("none: no excluded sub-expressions matched".to_string());
            }
            MatchResult {
                matched: !excluded_hit,
                reasons,
                confidence: if excluded_hit { 0.0 } else { 1.0 },
            }
        }
    }
}
/// Evaluate a single predicate against document signals.
///
/// Each predicate yields a [`MatchResult`] with exactly the reasons that
/// explain its outcome. Confidence values are fixed per predicate kind:
///
/// | predicate                          | confidence on match |
/// |------------------------------------|---------------------|
/// | `text_contains` / `text_patterns`  | 0.8                 |
/// | `heading_matches`                  | 0.75                |
/// | `text_matches`                     | 0.7                 |
/// | `has_currency_pattern: true`       | 0.6                 |
/// | `has_signature_field: true`        | 0.5                 |
/// | `structural`                       | 1.0                 |
///
/// A failed predicate always has confidence 0.0. The boolean predicates
/// written with `false` are treated as disabled: they match with confidence
/// 0.0. An invalid regex makes the predicate fail with an explanatory reason
/// rather than raising an error.
fn evaluate_predicate(pred: &ExtractionMatchPredicate, signals: &FeatureSignals) -> MatchResult {
    // Small constructors keep the arms below readable.
    let hit = |reason: String, confidence: f32| MatchResult {
        matched: true,
        reasons: vec![reason],
        confidence,
    };
    let miss = |reason: String| MatchResult {
        matched: false,
        reasons: vec![reason],
        confidence: 0.0,
    };

    match pred {
        // `text_patterns` is an alias of `text_contains`; both share one arm.
        ExtractionMatchPredicate::TextContains { patterns }
        | ExtractionMatchPredicate::TextContainsAlias { patterns } => {
            // Case-insensitive substring search.
            let text_lower = signals.text.to_lowercase();
            match patterns
                .iter()
                .find(|pattern| text_lower.contains(&pattern.to_lowercase()))
            {
                Some(pattern) => hit(format!("text_contains: found '{}'", pattern), 0.8),
                None => miss("text_contains: no patterns found".to_string()),
            }
        }
        ExtractionMatchPredicate::TextMatches { pattern } => match compile_regex(pattern) {
            Err(e) => miss(format!("text_matches: invalid regex '{}': {}", pattern, e)),
            Ok(regex) if regex.is_match(&signals.text) => {
                hit(format!("text_matches: pattern '{}' matched", pattern), 0.7)
            }
            Ok(_) => miss(format!("text_matches: pattern '{}' did not match", pattern)),
        },
        ExtractionMatchPredicate::HeadingMatches { pattern } => {
            let regex = match compile_regex(pattern) {
                Ok(re) => re,
                Err(e) => {
                    return miss(format!(
                        "heading_matches: invalid regex '{}': {}",
                        pattern, e
                    ));
                }
            };
            for heading in &signals.headings {
                if regex.is_match(heading) {
                    return hit(
                        format!(
                            "heading_matches: heading '{}' matched pattern '{}'",
                            heading, pattern
                        ),
                        0.75,
                    );
                }
            }
            miss(format!("heading_matches: no headings matched '{}'", pattern))
        }
        ExtractionMatchPredicate::HasCurrencyPattern {
            has_currency_pattern,
        } => {
            if !*has_currency_pattern {
                // `false` disables the check instead of negating it.
                return MatchResult {
                    matched: true,
                    reasons: vec!["has_currency_pattern: predicate disabled".to_string()],
                    confidence: 0.0,
                };
            }
            if has_currency_pattern_impl(&signals.text) {
                hit("has_currency_pattern: currency pattern found".to_string(), 0.6)
            } else {
                miss("has_currency_pattern: no currency pattern".to_string())
            }
        }
        ExtractionMatchPredicate::HasSignatureField {
            has_signature_field,
        } => {
            if !*has_signature_field {
                // `false` disables the check instead of negating it.
                return MatchResult {
                    matched: true,
                    reasons: vec!["has_signature_field: predicate disabled".to_string()],
                    confidence: 0.0,
                };
            }
            if signals.has_signature_field {
                hit("has_signature_field: signature fields found".to_string(), 0.5)
            } else {
                miss("has_signature_field: no signature fields".to_string())
            }
        }
        ExtractionMatchPredicate::Structural {
            has_table,
            has_form_field,
            has_math,
            page_count,
        } => {
            // Only requirements that are explicitly `true` are checked; all of
            // them must hold for the predicate to match.
            // NOTE(review): the flags are matched as `Option<bool>` here, while the
            // visible declaration of `Structural` uses plain `bool` — confirm which
            // field type is intended.
            let mut matched = true;
            let mut reasons = Vec::new();

            if matches!(has_table, Some(true)) {
                if signals.table_block_count > 0 {
                    reasons.push(format!(
                        "structural.has_table: {} tables found",
                        signals.table_block_count
                    ));
                } else {
                    reasons.push("structural.has_table: no tables found".to_string());
                    matched = false;
                }
            }
            if matches!(has_form_field, Some(true)) {
                if signals.has_form_field {
                    reasons.push("structural.has_form_field: form fields found".to_string());
                } else {
                    reasons.push("structural.has_form_field: no form fields found".to_string());
                    matched = false;
                }
            }
            if matches!(has_math, Some(true)) {
                if signals.has_math_operators {
                    reasons.push("structural.has_math: math operators found".to_string());
                } else {
                    reasons.push("structural.has_math: no math operators".to_string());
                    matched = false;
                }
            }
            if let Some(range) = page_count {
                let page_count = signals.page_count as u32;
                // Bounds are inclusive; a missing bound is unbounded.
                let in_range = match (&range.min, &range.max) {
                    (Some(min), Some(max)) => page_count >= *min && page_count <= *max,
                    (Some(min), None) => page_count >= *min,
                    (None, Some(max)) => page_count <= *max,
                    (None, None) => true,
                };
                if in_range {
                    reasons.push(format!("structural.page_count: {} is in range", page_count));
                } else {
                    reasons.push(format!(
                        "structural.page_count: {} is out of range {:?}",
                        page_count, range
                    ));
                    matched = false;
                }
            }
            MatchResult {
                matched,
                reasons,
                confidence: if matched { 1.0 } else { 0.0 },
            }
        }
    }
}
/// Check if text contains a currency pattern ($\d, €\d, £\d, ¥\d).
///
/// Returns `true` only when one of the symbols `$`, `€`, `£`, `¥` is
/// immediately followed by an ASCII digit. A bare symbol (for example a shell
/// variable such as `$HOME`) does not count as a currency amount.
fn has_currency_pattern_impl(text: &str) -> bool {
    // Walk adjacent character pairs: (symbol, following character).
    text.chars()
        .zip(text.chars().skip(1))
        .any(|(symbol, next)| matches!(symbol, '$' | '€' | '£' | '¥') && next.is_ascii_digit())
}
/// Access the process-wide cache of compiled regexes.
///
/// The map is created on first use and lives for the rest of the process.
/// It is keyed by the pattern source text and guarded by a mutex; this
/// accessor does no size limiting itself.
fn get_regex_cache() -> &'static Mutex<HashMap<String, Regex>> {
    static CACHE: std::sync::OnceLock<Mutex<HashMap<String, Regex>>> = std::sync::OnceLock::new();
    CACHE.get_or_init(Mutex::default)
}
/// Compile a regex pattern with caching.
///
/// Compiled patterns are kept in the shared cache so repeated evaluations of
/// the same profile do not recompile them. The cache is bounded crudely: once
/// it holds `MAX_CACHED_PATTERNS` entries it is emptied before the next insert
/// (this is a wholesale reset, not LRU eviction).
///
/// A poisoned lock is recovered instead of panicking — the map only holds
/// disposable, recomputable data, so a panic in another thread cannot leave it
/// in a harmful state.
///
/// # Errors
///
/// Returns the `regex::Error` produced by compiling an invalid pattern;
/// failed patterns are not cached.
fn compile_regex(pattern: &str) -> Result<Regex, regex::Error> {
    const MAX_CACHED_PATTERNS: usize = 100;

    // Fast path: cache hit. The guard is released at the end of this block so
    // compilation below happens without holding the lock.
    {
        let cache = get_regex_cache()
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        if let Some(regex) = cache.get(pattern) {
            return Ok(regex.clone());
        }
    }

    // Slow path: compile, then publish.
    let regex = Regex::new(pattern)?;
    let mut cache = get_regex_cache()
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    if cache.len() >= MAX_CACHED_PATTERNS {
        cache.clear();
    }
    cache.insert(pattern.to_string(), regex.clone());
    Ok(regex)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shared fixture: a two-page, invoice-like document with one table,
    /// a currency amount in the text, and "Invoice"/"Total" headings.
    fn test_signals() -> FeatureSignals {
        let mut fixture = FeatureSignals {
            text: String::from("Invoice #12345\nTotal: $100.00\nDue date: 2025-01-15"),
            text_pattern_hits: HashMap::new(),
            headings: ["Invoice", "Total"].iter().map(|h| h.to_string()).collect(),
            page_count: 2,
            table_block_count: 1,
            has_signature_field: false,
            has_form_field: false,
            has_math_operators: false,
            has_bullet_lists: false,
            font_diversity: 3,
            heading_depth: 2,
            glyph_density: 0.9,
            has_footer_page_numbers: false,
        };
        fixture.build_pattern_hits();
        fixture
    }

    /// Builds a `TextContains` predicate from string slices.
    fn contains_pred(needles: &[&str]) -> ExtractionMatchPredicate {
        ExtractionMatchPredicate::TextContains {
            patterns: needles.iter().map(|n| n.to_string()).collect(),
        }
    }

    /// Builds the `Structural` predicate used throughout these tests:
    /// table required, form fields and math set to `Some(false)`.
    fn table_pred(page_count: Option<PageCountRange>) -> ExtractionMatchPredicate {
        ExtractionMatchPredicate::Structural {
            has_table: Some(true),
            has_form_field: Some(false),
            has_math: Some(false),
            page_count,
        }
    }

    /// Wraps a predicate as a leaf match expression.
    fn leaf(predicate: ExtractionMatchPredicate) -> MatchExpr {
        MatchExpr::Predicate(predicate)
    }

    #[test]
    fn test_text_contains_match() {
        let signals = test_signals();
        let outcome = evaluate_predicate(&contains_pred(&["invoice"]), &signals);
        assert!(outcome.matched);
        assert_eq!(outcome.confidence, 0.8);
    }

    #[test]
    fn test_text_contains_no_match() {
        let signals = test_signals();
        let outcome = evaluate_predicate(&contains_pred(&["receipt"]), &signals);
        assert!(!outcome.matched);
    }

    #[test]
    fn test_heading_matches() {
        let signals = test_signals();
        let heading = ExtractionMatchPredicate::HeadingMatches {
            pattern: String::from("^Invoice$"),
        };
        let outcome = evaluate_predicate(&heading, &signals);
        assert!(outcome.matched);
    }

    #[test]
    fn test_has_currency_pattern() {
        let signals = test_signals();
        let currency = ExtractionMatchPredicate::HasCurrencyPattern {
            has_currency_pattern: true,
        };
        let outcome = evaluate_predicate(&currency, &signals);
        assert!(outcome.matched);
    }

    #[test]
    fn test_structural_has_table() {
        let signals = test_signals();
        let one_to_five_pages = PageCountRange {
            min: Some(1),
            max: Some(5),
            hint: None,
        };
        let outcome = evaluate_predicate(&table_pred(Some(one_to_five_pages)), &signals);
        assert!(outcome.matched);
    }

    #[test]
    fn test_match_expr_all() {
        let signals = test_signals();
        let conjunction = MatchExpr::All {
            all: vec![leaf(contains_pred(&["invoice"])), leaf(table_pred(None))],
        };
        let outcome = evaluate_match(&conjunction, &signals);
        assert!(outcome.matched);
        let summary_present = outcome
            .reasons
            .iter()
            .any(|reason| reason.contains("all: all sub-expressions matched"));
        assert!(summary_present);
    }

    #[test]
    fn test_match_expr_any() {
        let signals = test_signals();
        // The first alternative misses; the second one hits.
        let disjunction = MatchExpr::Any {
            any: vec![
                leaf(contains_pred(&["receipt"])),
                leaf(contains_pred(&["invoice"])),
            ],
        };
        let outcome = evaluate_match(&disjunction, &signals);
        assert!(outcome.matched);
    }

    #[test]
    fn test_match_expr_none() {
        let signals = test_signals();
        let exclusion = MatchExpr::None {
            none: vec![leaf(contains_pred(&["abstract"]))],
        };
        let outcome = evaluate_match(&exclusion, &signals);
        assert!(outcome.matched);
    }

    #[test]
    fn test_match_expr_complex() {
        let signals = test_signals();
        // (invoice OR receipt) AND has_table
        let invoice_or_receipt = MatchExpr::Any {
            any: vec![
                leaf(contains_pred(&["invoice"])),
                leaf(contains_pred(&["receipt"])),
            ],
        };
        let combined = MatchExpr::All {
            all: vec![invoice_or_receipt, leaf(table_pred(None))],
        };
        let outcome = evaluate_match(&combined, &signals);
        assert!(outcome.matched);
    }
}

View file

@ -18,19 +18,35 @@
//! vocabulary between the rule engine, built-in profile definitions, and
//! user-authored YAML profiles.
mod apply_profile;
mod engine;
mod extraction;
mod extraction_loader;
mod field_extractor;
mod loader;
mod match_eval;
mod signals;
mod types;
pub use apply_profile::{apply_extraction_tuning, apply_profile_to_metadata, classify_and_select_profile};
pub use engine::{
classify, has_currency_pattern, ClassificationResult, ClassifierEngine, FeatureSignals,
};
pub use extraction::{
ExtractionProfile, ExtractionTuning, FieldExtraction, FieldSchema, FieldSpec, MatchExpr,
ExtractionMatchPredicate,
};
pub use extraction_loader::{
find_profile, get_xdg_profile_dir, load_extraction_profiles, load_profile_file, ProfileOrigin,
ProfileSource, validate_profile_file,
};
pub use field_extractor::{extract_profile_fields, FieldExtractionResult};
pub use loader::{
check_forbidden_keys, load_profiles_from_dir, ForbiddenKeyError, ProfileLoadError,
};
pub use match_eval::{evaluate_match, MatchResult};
pub use signals::{extract_feature_signals, extract_signals_from_results, PageSignalAccumulator};
pub use types::{MatchPredicate, Profile, ProfileType};
pub use types::{MatchPredicate as ClassificationMatchPredicate, Profile, ProfileType};
use crate::diagnostics::DiagCode;

View file

@ -1,55 +1,64 @@
# Bank Statement extraction profile
# Matches bank statements with account info, period, balances, transactions
name: bank_statement
description: Bank statement with account info, period, balances, transactions
priority: 42
match:
any:
- text_patterns:
- "(?i)statement\\s+of\\s+account"
- "(?i)bank\\s+statement"
- "(?i)account\\s+statement"
- "(?i)transaction\\s+history"
- text_patterns:
- "(?i)opening\\s+balance"
- "(?i)closing\\s+balance"
- "(?i)statement\\s+period"
- "(?i)account\\s*#?\\s*:?\\s*\\*{4,}"
all:
- any:
- text_contains:
patterns: ["statement of account", "bank statement", "account statement", "transaction history"]
- text_contains:
patterns: ["opening balance", "closing balance", "statement period"]
- structural:
- has_monetary_columnar_layout: true
- has_date_column: true
page_count_hint: 1-10
profile_fields:
has_table: true
has_form_field: false
has_math: false
page_count:
min: 1
max: 10
extraction:
reading_order: line_dominant
table_detection: default
readability_threshold: 0.5
include_invisible: false
include_headers_footers: false
force_ocr: false
min_block_chars: 0
fields:
account_number:
type: string
extraction:
patterns:
- "(?i)account\\s*(?:number|#|no)?\\s*:?,?\\s*(\\*?\\d[\\d\\*]{3,})"
- "(?i)acct\\s*(?:#|:)?\\s*(\\*?\\d[\\d\\*]{3,})"
fallback: null
regex: "account\\s*(?:number|#|no)?\\s*:?,?\\s*(\\*?\\d[\\d\\*]{3,})"
parse: string
statement_period:
type: string
extraction:
patterns:
- "(?i)statement\\s+period\\s*:?.*?([A-Za-z]+\\s+[0-9]{1,2}.*?through.*?[A-Za-z]+\\s+[0-9]{1,2},?\\s+[0-9]{4})"
- "(?i)period\\s*:?.*?([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})\\s+(?:to|through|-)\\s+([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
fallback: null
near: ["Statement Period", "Period"]
parse: string
opening_balance:
type: decimal
extraction:
patterns:
- "(?i)opening\\s+balance\\s*:?.*?[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
- "(?i)beginning\\s+balance\\s*:?.*?[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
fallback: null
near: ["Opening Balance", "Beginning Balance"]
regex: "([\\d,]+\\.\\d{2})"
parse: decimal
closing_balance:
type: decimal
extraction:
patterns:
- "(?i)closing\\s+balance\\s*:?.*?[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
- "(?i)ending\\s+balance\\s*:?.*?[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
- "(?i)current\\s+balance\\s*:?.*?[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
fallback: null
near: ["Closing Balance", "Ending Balance", "Current Balance"]
regex: "([\\d,]+\\.\\d{2})"
parse: decimal
transactions:
type: array
extraction:
table_region: "largest_table_or_central_body"
table_region: largest_table
schema:
- name: date
type: date
@ -64,5 +73,3 @@ profile_fields:
type: decimal
required: false
fallback: []
reading_order: line_dominant
zone_filtering: exclude_headers_footers

View file

@ -1,68 +1,63 @@
# Book Chapter Profile
#
# Book chapters, monographs, and long-form narrative documents.
# Extracts title, chapter_number, author, sections.
# Book Chapter extraction profile
# Matches book chapters, monographs, and long-form narrative documents
name: book_chapter
description: Book chapters, monographs, long-form narrative documents
priority: 5
# Matching predicates for book chapter classification
match:
all:
# Page count in typical chapter range (not a whole book, not a single page)
- structural:
page_count: {min: 5, max: 1000}
# Heading depth indicates structured content
- structural:
heading_depth: {min: 1, max: 5}
# AND EITHER: has chapter/section headings
# OR: has limited font diversity (not a dense academic paper)
# OR: matches chapter/section text patterns
has_table: false
has_form_field: false
has_math: false
page_count:
min: 5
max: 1000
- any:
- text_matches: '^Chapter \d+'
- heading_matches: '^(Chapter|Part|Section) \d+'
- text_matches: '^\d+\.\s+[A-Z]'
- structural:
font_diversity: {min: 1, max: 4}
- text_matches:
pattern: "^Chapter \\d+"
- heading_matches:
pattern: "^(Chapter|Part|Section) \\d+"
- text_matches:
pattern: "^\\d+\\.\\s+[A-Z]"
none:
# Exclude more specific document types
- text_contains: ['Abstract', 'WHEREAS', 'Invoice', 'Account Statement', 'References']
- text_contains:
patterns: ["Abstract", "WHEREAS", "Invoice", "Account Statement", "References"]
# Extraction tuning for book chapters
extraction:
# Use line_dominant reading order for narrative text flow
reading_order: line_dominant
# Default table detection
table_detection: default
# Higher readability threshold for narrative text quality
readability_threshold: 0.6
# Don't include invisible text
include_invisible: false
# Exclude headers, footers, and page numbers from body content
include_headers_footers: false
force_ocr: false
min_block_chars: 0
# Field extraction specifications
fields:
title:
type: string
region: top_third
pick: largest_font
page: first
extraction:
region: top_third
pick: largest_font
parse: string
chapter_number:
type: string
near: ['Chapter', 'Part']
regex: '\d+'
max_distance_pt: 100
extraction:
near: ["Chapter", "Part"]
regex: "\\d+"
max_distance_pt: 100
parse: string
author:
type: string
region: top_quarter
pick: smallest_font
page: first
extraction:
region: top_quarter
pick: smallest_font
parse: string
sections:
type: array
pick: largest_font
per_page: true
extraction:
pick: largest_font
fallback: []

View file

@ -1,38 +1,66 @@
# Contract profile for legal agreements
# Extracts parties, effective date, term, governing law, and signatures from contracts
# Contract extraction profile
# Matches legal contracts and agreements with parties, effective date, term, governing law, and signatures
name: contract
description: Legal contracts and agreements with parties, effective date, term, governing law, and signatures
priority: 20
# Matching predicates: identify documents as contracts
match:
all:
- any:
- text_contains: ["AGREEMENT", "CONTRACT", "WHEREAS", "NOW THEREFORE", "In witness whereof"]
- heading_matches: '^(Agreement|Contract|Memorandum of Understanding)'
- structural: {page_count: {min: 2, max: 200}}
- text_contains:
patterns: ["AGREEMENT", "CONTRACT", "WHEREAS", "NOW THEREFORE", "In witness whereof"]
- heading_matches:
pattern: "^(Agreement|Contract|Memorandum of Understanding)"
- structural:
has_table: false
has_form_field: false
has_math: false
page_count:
min: 2
max: 200
none:
- text_contains: ["Invoice #", "Receipt"]
- text_contains:
patterns: ["Invoice #", "Receipt"]
# Extraction tuning for contracts
extraction:
reading_order: xy_cut
table_detection: off
readability_threshold: 0.5
include_invisible: false
include_headers_footers: false
force_ocr: false
min_block_chars: 0
# Field extractors for contract-specific metadata
fields:
parties:
near: ["between", "party of the first part", "BY AND BETWEEN"]
pick: nearest_below
type: string
extraction:
near: ["between", "party of the first part", "BY AND BETWEEN"]
pick: nearest_below
parse: string
effective_date:
near: ["Effective Date", "Date of Agreement", "as of"]
parse: date
type: date
extraction:
near: ["Effective Date", "Date of Agreement", "as of"]
parse: date
term:
near: ["Term", "Initial Term", "expires on", "shall remain in effect"]
regex: '\d+\s+(years?|months?)|expires?\s+\d{4}'
type: string
extraction:
near: ["Term", "Initial Term", "expires on", "shall remain in effect"]
regex: "\\d+\\s+(years?|months?)|expires?\\s+\\d{4}"
parse: string
governing_law:
near: ["Governing Law", "governed by the laws of"]
pick: nearest_right
type: string
extraction:
near: ["Governing Law", "governed by the laws of"]
pick: nearest_right
parse: string
signatures:
region: bottom_quarter
type: array
extraction:
region: bottom_quarter
fallback: []

View file

@ -1,18 +1,34 @@
# Form extraction profile
# Matches fillable forms with fields; uses line_dominant reading order
name: form
description: Fillable form with fields; uses line_dominant reading order and form_fields from Phase 7.4
priority: 30
match:
any:
- text_patterns:
- "(?i)form\\s*[0-9A-Z-]+"
- "(?i)application\\s+form"
- "(?i)questionnaire"
- "(?i)please\\s+fill\\s+out"
- "(?i)required\\s+fields?"
all:
- any:
- text_contains:
patterns: ["form", "application form", "questionnaire", "please fill out", "required fields"]
- structural:
has_table: false
has_form_field: true
has_math: false
page_count: null
- structural:
- has_form_field_layout: true
- has_blank_lines_with_colons: true
page_count_hint: 1-10
profile_fields: {}
reading_order: line_dominant
zone_filtering: none
form_fields_integration: true
has_table: false
has_form_field: false
has_math: false
page_count:
min: 1
max: 10
extraction:
reading_order: line_dominant
table_detection: off
readability_threshold: 0.5
include_invisible: false
include_headers_footers: true
force_ocr: false
min_block_chars: 0
fields: {}

View file

@ -1,81 +1,104 @@
# Invoice extraction profile
# Matches commercial invoices with line items, vendor/customer, and totals
name: invoice
description: Commercial invoice with line items, vendor/customer, and totals
priority: 50
match:
any:
- text_patterns:
- "(?i)invoice"
- "(?i)bill to"
- "(?i)invoice #"
- "(?i)invoice number"
- "(?i)tax invoice"
- text_patterns:
- "(?i)due date"
- "(?i)payment terms"
- "(?i)purchase order"
- "(?i)po #"
- structural:
- has_line_item_table: true
page_count_hint: 1-5
profile_fields:
all:
- any:
- text_contains:
patterns: ["invoice", "bill to", "invoice #", "invoice number", "tax invoice"]
- heading_matches:
pattern: "^Invoice\\b"
- any:
- has_currency_pattern:
has_currency_pattern: true
- structural:
has_table: true
has_form_field: false
has_math: false
page_count:
min: 1
max: 5
none:
- text_contains:
patterns: ["abstract", "bibliography", "scientific paper"]
extraction:
reading_order: line_dominant
table_detection: strict_borders
readability_threshold: 0.4
include_invisible: false
include_headers_footers: false
force_ocr: false
min_block_chars: 0
fields:
invoice_number:
type: string
extraction:
patterns:
- "(?i)invoice\\s*[#:]?\\s*([A-Z0-9-]+)"
- "(?i)bill\\s*invoice\\s*[#:]?\\s*([A-Z0-9-]+)"
fallback: null
regex: "Invoice\\s*#\\s*([\\w-]+)"
near: ["Invoice", "Invoice Number", "Invoice #"]
max_distance_pt: 200
parse: string
vendor:
type: string
extraction:
patterns:
- "(?i)(?:from|vendor|supplier|company)\\s*:?\\s*([A-Z][A-Za-z0-9\\s&]+?)(?=\\n|\\r|$)"
- "(?i)^([A-Z][A-Za-z0-9\\s&]+)\\s+(?:Inc|LLC|Ltd|Corp|GmbH)"
fallback: null
region: top_quarter
pick: largest_font
customer:
type: string
extraction:
patterns:
- "(?i)(?:bill\\s*to|customer|client)\\s*:?\\s*([A-Z][A-Za-z0-9\\s&]+?)(?=\\n|\\r|$)"
fallback: null
near: ["Bill To", "Customer", "Sold To"]
max_distance_pt: 150
pick: nearest_below
parse: string
invoice_date:
type: date
extraction:
patterns:
- "(?i)invoice\\s*date\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
- "(?i)date\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
fallback: null
near: ["Date", "Invoice Date"]
max_distance_pt: 100
parse: date
due_date:
type: date
extraction:
patterns:
- "(?i)due\\s*date\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
- "(?i)payment\\s*due\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
fallback: null
near: ["Due Date", "Payment Due", "Due"]
max_distance_pt: 100
parse: date
total:
type: decimal
extraction:
patterns:
- "(?i)total\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
- "(?i)amount\\s*due\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
fallback: null
regex: "([\\d,]+\\.\\d{2})"
near: ["Total", "Amount Due", "Balance Due", "Grand Total"]
max_distance_pt: 80
parse: decimal
subtotal:
type: decimal
extraction:
patterns:
- "(?i)sub\\s*total\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
fallback: null
regex: "([\\d,]+\\.\\d{2})"
near: ["Subtotal", "Sub-Total"]
max_distance_pt: 80
parse: decimal
tax:
type: decimal
extraction:
patterns:
- "(?i)tax\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
- "(?i)vat\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
- "(?i)gst\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
fallback: null
regex: "([\\d,]+\\.\\d{2})"
near: ["Tax", "VAT", "GST", "Sales Tax"]
max_distance_pt: 80
parse: decimal
line_items:
type: array
extraction:
table_region: "largest_table_or_bottom_half"
table_region: largest_table
schema:
- name: description
type: string
@ -90,5 +113,3 @@ profile_fields:
type: decimal
required: false
fallback: []
reading_order: line_dominant
zone_filtering: exclude_headers_footers

View file

@ -1,55 +1,62 @@
# Legal Filing Profile
#
# Court filings: motions, briefs, orders, docket entries.
# Extracts case_number, court, parties, filing_date, docket_entries.
# Legal Filing extraction profile
# Matches court filings: motions, briefs, orders, docket entries
name: legal_filing
description: "Court filings: motions, briefs, orders, docket entries"
description: "Court filings: motions, briefs, orders, docket entries"
priority: 40
# Matching predicates for legal filing classification
match:
all:
# Must have at least one legal filing marker
- any:
- text_contains:
["UNITED STATES DISTRICT COURT", "IN THE COURT OF", "IN THE MATTER OF",
"Case No.", "Civil Action No.", "Plaintiff", "Defendant", "Petitioner",
"Respondent", "COMPLAINT", "MOTION TO", "ORDER GRANTING", "OPINION"]
- heading_matches: '^(COMPLAINT|MOTION|ORDER|OPINION|BRIEF)'
# And appropriate page count
- structural: {page_count: {min: 1, max: 500}}
patterns: ["UNITED STATES DISTRICT COURT", "IN THE COURT OF", "IN THE MATTER OF", "Case No.", "Civil Action No.", "Plaintiff", "Defendant", "Petitioner", "Respondent", "COMPLAINT", "MOTION TO", "ORDER GRANTING", "OPINION"]
- heading_matches:
pattern: "^(COMPLAINT|MOTION|ORDER|OPINION|BRIEF)"
- structural:
has_table: false
has_form_field: false
has_math: false
page_count:
min: 1
max: 500
# Extraction tuning for legal filings
extraction:
# Use xy_cut reading order for complex layouts
reading_order: xy_cut
# Default table detection
table_detection: default
# Standard readability threshold
readability_threshold: 0.5
# Include headers and footers (page numbers and citations are load-bearing in legal docs)
include_headers_footers: true
# Don't include invisible text
include_invisible: false
include_headers_footers: true
force_ocr: false
min_block_chars: 0
# Field extraction specifications
fields:
case_number:
near: ["Case No.", "Civil Action No.", "Docket No.", "Cause No."]
regex: '[\w-]+:?\s*\d+[\w-]*'
parse: string
type: string
extraction:
near: ["Case No.", "Civil Action No.", "Docket No.", "Cause No."]
regex: "[\\w-]+:?\\s*\\d+[\\w-]*"
parse: string
court:
region: top_quarter
pick: largest_font
type: string
extraction:
region: top_quarter
pick: largest_font
parse: string
parties:
near: ["Plaintiff", "Defendant", "Petitioner", "Respondent", "v."]
type: array
extraction:
near: ["Plaintiff", "Defendant", "Petitioner", "Respondent", "v."]
fallback: []
filing_date:
near: ["Filed", "Date Filed", "Dated"]
parse: date
type: date
extraction:
near: ["Filed", "Date Filed", "Dated"]
parse: date
docket_entries:
region: full
type: array
extraction:
region: bottom_half
fallback: []

View file

@ -1,52 +1,67 @@
# Receipt extraction profile
# Matches point-of-sale or purchase receipts with items and payment method
name: receipt
description: Point-of-sale or purchase receipt with items, payment method
priority: 45
match:
any:
- text_patterns:
- "(?i)receipt"
- "(?i)store receipt"
- "(?i)register receipt"
- "(?i)transaction receipt"
- text_patterns:
- "(?i)total.*sold"
- "(?i)change.*due"
- "(?i)cash.*credit"
- "(?i)card.*payment"
all:
- any:
- text_contains:
patterns: ["receipt", "store receipt", "register receipt", "transaction receipt"]
- text_contains:
patterns: ["total sold", "change due", "cash credit", "card payment"]
- structural:
- has_monetary_columnar_layout: true
- page_aspect_ratio: "narrow_or_square"
page_count_hint: 1
profile_fields:
has_table: true
has_form_field: false
has_math: false
page_count:
min: 1
max: 2
extraction:
reading_order: line_dominant
table_detection: default
readability_threshold: 0.5
include_invisible: false
include_headers_footers: false
force_ocr: false
min_block_chars: 0
fields:
merchant:
type: string
extraction:
patterns:
- "(?i)^([A-Z][A-Za-z0-9\\s&']+)$"
- "(?i)(?:store|merchant|retailer)\\s*:?\\s*([A-Z][A-Za-z0-9\\s&']+)"
fallback: null
region: top_quarter
pick: largest_font
parse: string
date:
type: date
extraction:
patterns:
- "(?i)date\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
- "([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})\\s+([0-9]{1,2}:[0-9]{2})"
fallback: null
regex: "\\d{1,2}[/-]\\d{1,2}[/-]\\d{2,4}"
parse: date
total:
type: decimal
extraction:
patterns:
- "(?i)total\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
fallback: null
regex: "([\\d,]+\\.\\d{2})"
near: ["Total", "Amount Due", "Balance"]
max_distance_pt: 80
parse: decimal
tax:
type: decimal
extraction:
patterns:
- "(?i)tax\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
fallback: null
regex: "([\\d,]+\\.\\d{2})"
near: ["Tax", "VAT"]
max_distance_pt: 80
parse: decimal
items:
type: array
extraction:
columnar_regions: "monetary_columns"
table_region: largest_table
schema:
- name: name
type: string
@ -58,11 +73,9 @@ profile_fields:
type: decimal
required: false
fallback: []
payment_method:
type: string
extraction:
patterns:
- "(?i)(cash|credit|debit|visa|mastercard|amex|discover|check|cheque)"
fallback: null
reading_order: line_dominant
zone_filtering: exclude_headers_footers
regex: "(cash|credit|debit|visa|mastercard|amex|discover|check|cheque)"
parse: string

View file

@ -1,66 +1,87 @@
# Scientific Paper Profile
#
# Academic papers from arXiv, journals, conference proceedings.
# Extracts title, authors, abstract, DOI, journal, publication_date, references.
# Scientific Paper extraction profile
# Matches academic papers from arXiv, journals, conference proceedings
name: scientific_paper
description: Academic papers from arXiv, journals, conference proceedings
priority: 30
# Matching predicates for scientific paper classification
match:
all:
# Must have at least one scientific paper marker
- any:
- text_contains: ["Abstract", "References", "doi:", "arXiv:", "Bibliography"]
- heading_matches: '^(Abstract|Introduction|References|Bibliography)'
# And either has math OR structured headings OR appropriate page count
- text_contains:
patterns: ["Abstract", "References", "doi:", "arXiv:", "Bibliography"]
- heading_matches:
pattern: "^(Abstract|Introduction|References|Bibliography)"
- any:
- structural:
has_table: false
has_form_field: false
has_math: true
page_count: null
- structural:
heading_depth: {min: 2}
- structural:
page_count: {min: 4, max: 50}
has_table: false
has_form_field: false
has_math: false
page_count:
min: 4
max: 50
none:
- text_contains:
patterns: ["Invoice", "Receipt", "WHEREAS", "NOW THEREFORE"]
# Extraction tuning for scientific papers
extraction:
# Use xy_cut reading order for 2-column layout handling
reading_order: xy_cut
# Default table detection
table_detection: default
# Standard readability threshold
readability_threshold: 0.5
# Don't include invisible text
include_invisible: false
include_headers_footers: false
force_ocr: false
min_block_chars: 0
# Field extraction specifications
fields:
title:
region: top_quarter
pick: largest_font
type: string
extraction:
region: top_quarter
pick: largest_font
parse: string
authors:
region: top_quarter
pick: nearest_below
after: title
type: array
extraction:
region: top_quarter
pick: nearest_below
after_heading: title
fallback: []
abstract:
near: ["Abstract"]
region: top_half
type: string
extraction:
near: ["Abstract"]
region: top_half
parse: string
doi:
regex: 'doi[:\.]\s*(10\.\d{4,9}/[\w\-\._;()/:]+)'
parse: string
type: string
extraction:
regex: "doi[:\\.]\\s*(10\\.\\d{4,9}/[\\w\\-\\._;()/:]+)"
parse: string
journal:
region: top_eighth
pick: first
type: string
extraction:
region: top_eighth
pick: first
parse: string
publication_date:
near: ["Published", "Received", "Accepted"]
parse: date
type: date
extraction:
near: ["Published", "Received", "Accepted"]
parse: date
references:
region: bottom_half
after_heading: References
type: array
extraction:
region: bottom_half
after_heading: References
fallback: []

View file

@ -1,64 +1,59 @@
# Slide Deck Profile
#
# PowerPoint / Keynote / Google Slides exports as PDF.
# Extracts title, presenter, date, slide_titles.
# Slide Deck extraction profile
# Matches PowerPoint / Keynote / Google Slides exports as PDF
name: slide_deck
description: PowerPoint / Keynote / Google Slides exports as PDF
priority: 15
# Matching predicates for slide deck classification
match:
all:
# Page count in typical slide deck range
- structural:
page_count: {min: 3, max: 200}
# And EITHER: has limited font diversity (not a dense academic paper)
# OR: contains "Slide N" patterns
# OR: contains slide deck keywords
has_table: false
has_form_field: false
has_math: false
page_count:
min: 3
max: 200
- any:
- structural:
has_form_field: false
font_diversity: {min: 2, max: 10}
- text_matches: '^Slide \d+$'
- text_contains: ["slides", "presentation"]
- text_matches:
pattern: "^Slide \\d+$"
- text_contains:
patterns: ["slides", "presentation"]
none:
# Exclude academic papers (these have their own profile)
- text_contains: ["Abstract", "References", "WHEREAS", "Invoice"]
- text_contains:
patterns: ["Abstract", "References", "WHEREAS", "Invoice"]
# Extraction tuning for slide decks
extraction:
# Use xy_cut reading order for proper layout handling
reading_order: xy_cut
# Default table detection
table_detection: default
# Lower readability threshold for slides (less text density)
readability_threshold: 0.6
# Don't include invisible text
include_invisible: false
# Minimum block characters
include_headers_footers: false
force_ocr: false
min_block_chars: 5
# Field extraction specifications
fields:
title:
type: string
region: middle_half
pick: largest_font
page: first
extraction:
region: top_half
pick: largest_font
parse: string
presenter:
type: string
region: bottom_half
pick: largest_font
page: first
extraction:
region: top_half
pick: largest_font
parse: string
date:
type: date
near: ["Date"]
parse: date
extraction:
near: ["Date"]
parse: date
slide_titles:
type: array
pick: largest_font
per_page: true
extraction:
pick: largest_font
fallback: []

1
tests/fixtures/profiles/invoice/01.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/01.pdf

1
tests/fixtures/profiles/invoice/02.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/02.pdf

1
tests/fixtures/profiles/invoice/03.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/03.pdf

1
tests/fixtures/profiles/invoice/04.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/04.pdf

1
tests/fixtures/profiles/invoice/05.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/05.pdf

1
tests/fixtures/profiles/invoice/06.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/06.pdf

1
tests/fixtures/profiles/invoice/07.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/07.pdf

1
tests/fixtures/profiles/invoice/08.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/08.pdf

1
tests/fixtures/profiles/invoice/09.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/09.pdf

1
tests/fixtures/profiles/invoice/10.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/10.pdf

1
tests/fixtures/profiles/invoice/11.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/11.pdf

1
tests/fixtures/profiles/invoice/12.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/12.pdf

1
tests/fixtures/profiles/invoice/13.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/13.pdf

1
tests/fixtures/profiles/invoice/14.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/14.pdf

1
tests/fixtures/profiles/invoice/15.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/15.pdf

1
tests/fixtures/profiles/invoice/16.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/16.pdf

1
tests/fixtures/profiles/invoice/17.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/17.pdf

1
tests/fixtures/profiles/invoice/18.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/18.pdf

1
tests/fixtures/profiles/invoice/19.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/19.pdf

1
tests/fixtures/profiles/invoice/20.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/20.pdf

1
tests/fixtures/profiles/invoice/21.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/21.pdf

1
tests/fixtures/profiles/invoice/22.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/22.pdf

1
tests/fixtures/profiles/invoice/23.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/23.pdf

1
tests/fixtures/profiles/invoice/24.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/24.pdf

1
tests/fixtures/profiles/invoice/25.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/25.pdf

1
tests/fixtures/profiles/invoice/26.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/26.pdf

1
tests/fixtures/profiles/invoice/27.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/27.pdf

1
tests/fixtures/profiles/invoice/28.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/28.pdf

1
tests/fixtures/profiles/invoice/29.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/29.pdf

1
tests/fixtures/profiles/invoice/30.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/30.pdf

1
tests/fixtures/profiles/invoice/31.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/31.pdf

1
tests/fixtures/profiles/invoice/32.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/32.pdf

1
tests/fixtures/profiles/invoice/33.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/33.pdf

1
tests/fixtures/profiles/invoice/34.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/34.pdf

1
tests/fixtures/profiles/invoice/35.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/35.pdf

1
tests/fixtures/profiles/invoice/36.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/36.pdf

1
tests/fixtures/profiles/invoice/37.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/37.pdf

1
tests/fixtures/profiles/invoice/38.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/38.pdf

1
tests/fixtures/profiles/invoice/39.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/39.pdf

1
tests/fixtures/profiles/invoice/40.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/40.pdf

1
tests/fixtures/profiles/invoice/41.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/41.pdf

1
tests/fixtures/profiles/invoice/42.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/42.pdf

1
tests/fixtures/profiles/invoice/43.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/43.pdf

1
tests/fixtures/profiles/invoice/44.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/44.pdf

1
tests/fixtures/profiles/invoice/45.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/45.pdf

1
tests/fixtures/profiles/invoice/46.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/46.pdf

1
tests/fixtures/profiles/invoice/47.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/47.pdf

1
tests/fixtures/profiles/invoice/48.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/48.pdf

1
tests/fixtures/profiles/invoice/49.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/49.pdf

1
tests/fixtures/profiles/invoice/50.pdf vendored Symbolic link
View file

@ -0,0 +1 @@
../../classifier/invoice/50.pdf

View file

@ -0,0 +1 @@
../../../sdk-conformance/fixtures/receipts/tampered-receipt.pdf

View file

@ -0,0 +1 @@
../../../sdk-conformance/fixtures/receipts/valid-receipt.pdf