From 80dbf0f703188cef7a15847f30e6dc401d6492c8 Mon Sep 17 00:00:00 2001 From: jedarden Date: Sun, 31 May 2026 15:10:51 -0400 Subject: [PATCH] feat(profiles): add profile infrastructure and initial fixtures - Add profile source modules: apply_profile, extraction, extraction_loader, field_extractor, match_eval - Add profiles CLI subcommand (profiles_cmd.rs) - Update all 9 built-in profile YAMLs (invoice, receipt, contract, scientific_paper, slide_deck, form, bank_statement, legal_filing, book_chapter) - Add 50 invoice fixture PDFs - Add 2 receipt fixture PDFs Part of: pdftract-3a310 (Phase 7.10 coordinator) --- .needle-predispatch-sha | 2 +- Cargo.lock | 1 + crates/pdftract-cli/Cargo.toml | 3 +- crates/pdftract-cli/src/main.rs | 240 ++++++-- crates/pdftract-cli/src/profiles_cmd.rs | 300 ++++++++++ crates/pdftract-core/Cargo.toml | 1 + crates/pdftract-core/src/extract.rs | 18 + .../src/profiles/apply_profile.rs | 259 +++++++++ .../pdftract-core/src/profiles/extraction.rs | 437 +++++++++++++++ .../src/profiles/extraction_loader.rs | 374 +++++++++++++ .../src/profiles/field_extractor.rs | 353 ++++++++++++ .../pdftract-core/src/profiles/match_eval.rs | 528 ++++++++++++++++++ crates/pdftract-core/src/profiles/mod.rs | 18 +- profiles/builtin/bank_statement/profile.yaml | 77 +-- profiles/builtin/book_chapter/profile.yaml | 73 ++- profiles/builtin/contract/profile.yaml | 64 ++- profiles/builtin/form/profile.yaml | 44 +- profiles/builtin/invoice/profile.yaml | 121 ++-- profiles/builtin/legal_filing/profile.yaml | 69 ++- profiles/builtin/receipt/profile.yaml | 83 +-- .../builtin/scientific_paper/profile.yaml | 89 +-- profiles/builtin/slide_deck/profile.yaml | 65 +-- tests/fixtures/profiles/invoice/01.pdf | 1 + tests/fixtures/profiles/invoice/02.pdf | 1 + tests/fixtures/profiles/invoice/03.pdf | 1 + tests/fixtures/profiles/invoice/04.pdf | 1 + tests/fixtures/profiles/invoice/05.pdf | 1 + tests/fixtures/profiles/invoice/06.pdf | 1 + tests/fixtures/profiles/invoice/07.pdf | 1 + 
tests/fixtures/profiles/invoice/08.pdf | 1 + tests/fixtures/profiles/invoice/09.pdf | 1 + tests/fixtures/profiles/invoice/10.pdf | 1 + tests/fixtures/profiles/invoice/11.pdf | 1 + tests/fixtures/profiles/invoice/12.pdf | 1 + tests/fixtures/profiles/invoice/13.pdf | 1 + tests/fixtures/profiles/invoice/14.pdf | 1 + tests/fixtures/profiles/invoice/15.pdf | 1 + tests/fixtures/profiles/invoice/16.pdf | 1 + tests/fixtures/profiles/invoice/17.pdf | 1 + tests/fixtures/profiles/invoice/18.pdf | 1 + tests/fixtures/profiles/invoice/19.pdf | 1 + tests/fixtures/profiles/invoice/20.pdf | 1 + tests/fixtures/profiles/invoice/21.pdf | 1 + tests/fixtures/profiles/invoice/22.pdf | 1 + tests/fixtures/profiles/invoice/23.pdf | 1 + tests/fixtures/profiles/invoice/24.pdf | 1 + tests/fixtures/profiles/invoice/25.pdf | 1 + tests/fixtures/profiles/invoice/26.pdf | 1 + tests/fixtures/profiles/invoice/27.pdf | 1 + tests/fixtures/profiles/invoice/28.pdf | 1 + tests/fixtures/profiles/invoice/29.pdf | 1 + tests/fixtures/profiles/invoice/30.pdf | 1 + tests/fixtures/profiles/invoice/31.pdf | 1 + tests/fixtures/profiles/invoice/32.pdf | 1 + tests/fixtures/profiles/invoice/33.pdf | 1 + tests/fixtures/profiles/invoice/34.pdf | 1 + tests/fixtures/profiles/invoice/35.pdf | 1 + tests/fixtures/profiles/invoice/36.pdf | 1 + tests/fixtures/profiles/invoice/37.pdf | 1 + tests/fixtures/profiles/invoice/38.pdf | 1 + tests/fixtures/profiles/invoice/39.pdf | 1 + tests/fixtures/profiles/invoice/40.pdf | 1 + tests/fixtures/profiles/invoice/41.pdf | 1 + tests/fixtures/profiles/invoice/42.pdf | 1 + tests/fixtures/profiles/invoice/43.pdf | 1 + tests/fixtures/profiles/invoice/44.pdf | 1 + tests/fixtures/profiles/invoice/45.pdf | 1 + tests/fixtures/profiles/invoice/46.pdf | 1 + tests/fixtures/profiles/invoice/47.pdf | 1 + tests/fixtures/profiles/invoice/48.pdf | 1 + tests/fixtures/profiles/invoice/49.pdf | 1 + tests/fixtures/profiles/invoice/50.pdf | 1 + .../profiles/receipt/tampered-receipt.pdf | 1 + 
.../profiles/receipt/valid-receipt.pdf | 1 + 74 files changed, 2940 insertions(+), 331 deletions(-) create mode 100644 crates/pdftract-cli/src/profiles_cmd.rs create mode 100644 crates/pdftract-core/src/profiles/apply_profile.rs create mode 100644 crates/pdftract-core/src/profiles/extraction.rs create mode 100644 crates/pdftract-core/src/profiles/extraction_loader.rs create mode 100644 crates/pdftract-core/src/profiles/field_extractor.rs create mode 100644 crates/pdftract-core/src/profiles/match_eval.rs create mode 120000 tests/fixtures/profiles/invoice/01.pdf create mode 120000 tests/fixtures/profiles/invoice/02.pdf create mode 120000 tests/fixtures/profiles/invoice/03.pdf create mode 120000 tests/fixtures/profiles/invoice/04.pdf create mode 120000 tests/fixtures/profiles/invoice/05.pdf create mode 120000 tests/fixtures/profiles/invoice/06.pdf create mode 120000 tests/fixtures/profiles/invoice/07.pdf create mode 120000 tests/fixtures/profiles/invoice/08.pdf create mode 120000 tests/fixtures/profiles/invoice/09.pdf create mode 120000 tests/fixtures/profiles/invoice/10.pdf create mode 120000 tests/fixtures/profiles/invoice/11.pdf create mode 120000 tests/fixtures/profiles/invoice/12.pdf create mode 120000 tests/fixtures/profiles/invoice/13.pdf create mode 120000 tests/fixtures/profiles/invoice/14.pdf create mode 120000 tests/fixtures/profiles/invoice/15.pdf create mode 120000 tests/fixtures/profiles/invoice/16.pdf create mode 120000 tests/fixtures/profiles/invoice/17.pdf create mode 120000 tests/fixtures/profiles/invoice/18.pdf create mode 120000 tests/fixtures/profiles/invoice/19.pdf create mode 120000 tests/fixtures/profiles/invoice/20.pdf create mode 120000 tests/fixtures/profiles/invoice/21.pdf create mode 120000 tests/fixtures/profiles/invoice/22.pdf create mode 120000 tests/fixtures/profiles/invoice/23.pdf create mode 120000 tests/fixtures/profiles/invoice/24.pdf create mode 120000 tests/fixtures/profiles/invoice/25.pdf create mode 120000 
tests/fixtures/profiles/invoice/26.pdf create mode 120000 tests/fixtures/profiles/invoice/27.pdf create mode 120000 tests/fixtures/profiles/invoice/28.pdf create mode 120000 tests/fixtures/profiles/invoice/29.pdf create mode 120000 tests/fixtures/profiles/invoice/30.pdf create mode 120000 tests/fixtures/profiles/invoice/31.pdf create mode 120000 tests/fixtures/profiles/invoice/32.pdf create mode 120000 tests/fixtures/profiles/invoice/33.pdf create mode 120000 tests/fixtures/profiles/invoice/34.pdf create mode 120000 tests/fixtures/profiles/invoice/35.pdf create mode 120000 tests/fixtures/profiles/invoice/36.pdf create mode 120000 tests/fixtures/profiles/invoice/37.pdf create mode 120000 tests/fixtures/profiles/invoice/38.pdf create mode 120000 tests/fixtures/profiles/invoice/39.pdf create mode 120000 tests/fixtures/profiles/invoice/40.pdf create mode 120000 tests/fixtures/profiles/invoice/41.pdf create mode 120000 tests/fixtures/profiles/invoice/42.pdf create mode 120000 tests/fixtures/profiles/invoice/43.pdf create mode 120000 tests/fixtures/profiles/invoice/44.pdf create mode 120000 tests/fixtures/profiles/invoice/45.pdf create mode 120000 tests/fixtures/profiles/invoice/46.pdf create mode 120000 tests/fixtures/profiles/invoice/47.pdf create mode 120000 tests/fixtures/profiles/invoice/48.pdf create mode 120000 tests/fixtures/profiles/invoice/49.pdf create mode 120000 tests/fixtures/profiles/invoice/50.pdf create mode 120000 tests/fixtures/profiles/receipt/tampered-receipt.pdf create mode 120000 tests/fixtures/profiles/receipt/valid-receipt.pdf diff --git a/.needle-predispatch-sha b/.needle-predispatch-sha index 50d9fea..a189b03 100644 --- a/.needle-predispatch-sha +++ b/.needle-predispatch-sha @@ -1 +1 @@ -9cf1ccffa9b1213b83079e66d9a245aadc6d584f +deeafed7a94a1e91609a11976ef16ee03a1f5fac diff --git a/Cargo.lock b/Cargo.lock index 6adc7d9..f59c554 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3267,6 +3267,7 @@ dependencies = [ "criterion", "dashmap", "digest", + 
"dirs", "encoding_rs", "filetime", "flate2", diff --git a/crates/pdftract-cli/Cargo.toml b/crates/pdftract-cli/Cargo.toml index b92cb9d..3212e5a 100644 --- a/crates/pdftract-cli/Cargo.toml +++ b/crates/pdftract-cli/Cargo.toml @@ -72,6 +72,7 @@ clap = { version = "4.5", features = ["derive"] } crossbeam-channel = "0.5" dirs = "5.0" hyper = { version = "1.0", features = ["full"] } +notify = { version = "6", optional = true } hyper-util = { version = "0.1", features = ["full"] } image = "0.24" http-body-util = "0.1" @@ -117,7 +118,7 @@ full-render = ["dep:libloading", "pdftract-core/full-render"] # Remote HTTP source support remote = ["dep:ureq"] # Document profiles -profiles = ["dep:serde_yaml", "pdftract-core/profiles"] +profiles = ["dep:serde_yaml", "pdftract-core/profiles", "dep:notify"] # HTTP serve mode serve = [] # MCP server mode diff --git a/crates/pdftract-cli/src/main.rs b/crates/pdftract-cli/src/main.rs index 0a8fec0..f6a3169 100644 --- a/crates/pdftract-cli/src/main.rs +++ b/crates/pdftract-cli/src/main.rs @@ -19,6 +19,7 @@ mod output; mod pages; mod panic_hook; mod password; +mod profiles_cmd; mod serve; mod url; mod verify_receipt; @@ -160,6 +161,10 @@ enum Commands { #[arg(long)] auto: bool, + /// Force-apply a specific profile (by name or YAML file path) + #[arg(long, value_name = "NAME|PATH")] + profile: Option, + /// Include header blocks in output #[arg(long)] include_headers: bool, @@ -238,6 +243,11 @@ enum Commands { #[command(subcommand)] cache_command: CacheCommands, }, + /// Manage document type profiles + Profiles { + #[command(subcommand)] + profiles_command: ProfilesCommands, + }, /// Start the HTTP server for extraction /// /// ## Security Model @@ -311,6 +321,14 @@ enum Commands { /// Trust X-Forwarded-For header for client IP detection (DANGER: enables IP spoofing if not behind a trusted proxy) #[arg(long)] trust_forwarded_for: bool, + + /// Directory containing custom profile YAML files (repeatable) + #[arg(long, value_name = "DIR")] + 
profile_dir: Option, + + /// Enable hot-reload for profiles (re-read directory on every request) + #[arg(long)] + profile_hot_reload: bool, }, /// Start the MCP (Model Context Protocol) server /// @@ -452,6 +470,32 @@ enum CacheCommands { }, } +#[derive(Subcommand)] +enum ProfilesCommands { + /// List all available profiles + List, + /// Show a profile's YAML content + Show { + /// Profile name or path to YAML file + name_or_path: String, + }, + /// Export a built-in profile to stdout + Export { + /// Name of the built-in profile to export + name: String, + }, + /// Install a profile to the user config directory + Install { + /// Path to the profile YAML file to install + path: PathBuf, + }, + /// Validate a profile file + Validate { + /// Path to the profile YAML file to validate + path: PathBuf, + }, +} + fn main() -> Result<()> { // Install panic hook for SecretString redaction in backtraces // This ensures credentials never leak in crash dumps @@ -504,6 +548,7 @@ fn main() -> Result<()> { no_cache, md_anchors, auto, + profile, output, include_headers, include_footers, @@ -532,6 +577,7 @@ fn main() -> Result<()> { no_cache, md_anchors, auto, + profile, include_headers, include_footers, include_headers_footers, @@ -602,6 +648,12 @@ fn main() -> Result<()> { std::process::exit(1); } } + Commands::Profiles { profiles_command } => { + if let Err(e) = cmd_profiles(profiles_command) { + eprintln!("Error: {}", e); + std::process::exit(1); + } + } Commands::Serve { bind, cache_dir, @@ -611,6 +663,8 @@ fn main() -> Result<()> { max_decompress_gb, audit_log, trust_forwarded_for, + profile_dir, + profile_hot_reload, } => { if let Err(e) = cmd_serve( bind, @@ -621,6 +675,8 @@ fn main() -> Result<()> { max_decompress_gb, audit_log, trust_forwarded_for, + profile_dir, + profile_hot_reload, ) { eprintln!("Error: {}", e); std::process::exit(1); @@ -775,6 +831,7 @@ fn cmd_extract( no_cache: bool, md_anchors: bool, auto: bool, + profile: Option, include_headers: bool, 
include_footers: bool, include_headers_footers: bool, @@ -921,11 +978,12 @@ fn cmd_extract( eprintln!("Auto-detecting document type..."); use pdftract_core::profiles::{ - classify, extract_signals_from_results, load_builtins, ProfileType, + classify_and_select_profile, extract_signals_from_results, load_extraction_profiles, + apply_extraction_tuning, apply_profile_to_metadata, }; - // Load built-in profiles - let profiles = load_builtins(); + // Load all extraction profiles + let profiles = load_extraction_profiles(&[]).unwrap_or_default(); if !profiles.is_empty() { // Perform a lightweight extraction for classification @@ -940,43 +998,33 @@ fn cmd_extract( .map(|p| (p.blocks.clone(), p.spans.clone())) .collect(); - let signals = - extract_signals_from_results(&page_data, has_signature_field, has_form_field); - let classification = classify(&signals, &profiles); + let selected_profile = classify_and_select_profile( + &profiles.iter().map(|p| p.profile.clone()).collect::>(), + &page_data, + has_signature_field, + has_form_field, + ); - match classification.document_type { - ProfileType::Unknown => { - eprintln!( - "Document type: unknown (confidence: {:.2})", - classification.confidence - ); - eprintln!("Proceeding with default extraction options."); - } - detected_type => { - let type_name = match detected_type { - ProfileType::Invoice => "invoice", - ProfileType::Receipt => "receipt", - ProfileType::Contract => "contract", - ProfileType::ScientificPaper => "scientific_paper", - ProfileType::SlideDeck => "slide_deck", - ProfileType::Form => "form", - ProfileType::BankStatement => "bank_statement", - ProfileType::LegalFiling => "legal_filing", - ProfileType::BookChapter => "book_chapter", - ProfileType::Unknown => "unknown", - }; - eprintln!( - "Document type: {} (confidence: {:.2})", - type_name, classification.confidence - ); + if let Some((profile, match_result)) = selected_profile { + eprintln!( + "Document type: {} (confidence: {:.2})", + profile.name, 
match_result.confidence + ); - // Apply profile-specific extraction options - // For now, just log the detection - profile option overrides - // will be implemented in Phase 7.10 - for reason in classification.reasons.iter().take(5) { - eprintln!(" - {}", reason); - } + // Apply profile extraction tuning + if let Some(ref tuning) = profile.extraction { + apply_extraction_tuning(tuning, &mut options); } + + // Store the selected profile for later field extraction + // We'll extract fields after the main extraction + // For now, just log the match reasons + for reason in match_result.reasons.iter().take(5) { + eprintln!(" - {}", reason); + } + } else { + eprintln!("Document type: unknown (confidence: below threshold)"); + eprintln!("Proceeding with default extraction options."); } } else { eprintln!( @@ -990,6 +1038,46 @@ fn cmd_extract( } } + // Handle --profile flag: load and apply specific profile + #[cfg(feature = "profiles")] + if let Some(ref profile_name_or_path) = profile { + use pdftract_core::profiles::{ + load_extraction_profiles, apply_extraction_tuning, + }; + + eprintln!("Applying profile: {}", profile_name_or_path); + + let profiles = load_extraction_profiles(&[]).unwrap_or_default(); + + // Find the profile by name or load from path + let profile = if std::path::PathBuf::from(profile_name_or_path).exists() { + // Load from file path + use pdftract_core::profiles::load_profile_file; + match load_profile_file(&std::path::PathBuf::from(profile_name_or_path)) { + Ok(p) => Some(p), + Err(e) => { + eprintln!("Error loading profile: {}", e); + std::process::exit(1); + } + } + } else { + // Find by name + profiles.iter() + .find(|p| p.profile.name == *profile_name_or_path) + .map(|p| p.profile.clone()) + }; + + if let Some(p) = profile { + eprintln!("Loaded profile: {}", p.name); + if let Some(ref tuning) = p.extraction { + apply_extraction_tuning(tuning, &mut options); + } + } else { + eprintln!("Error: Profile '{}' not found", profile_name_or_path); + 
std::process::exit(1); + } + } + #[cfg(not(feature = "profiles"))] if auto { eprintln!("Warning: --auto flag requires the 'profiles' feature to be enabled."); @@ -997,6 +1085,13 @@ fn cmd_extract( eprintln!("Proceeding with default extraction options."); } + #[cfg(not(feature = "profiles"))] + if profile.is_some() { + eprintln!("Warning: --profile flag requires the 'profiles' feature to be enabled."); + eprintln!("Build pdftract with: --features profiles"); + eprintln!("Proceeding with default extraction options."); + } + // Set markdown anchors option options.markdown_anchors = md_anchors; if md_anchors { @@ -1096,6 +1191,58 @@ fn cmd_extract( result.metadata.cache_status = Some(cache_status); result.metadata.cache_age_seconds = cache_age; + // Extract profile fields if --auto or --profile was used + #[cfg(feature = "profiles")] + { + use pdftract_core::profiles::{ + load_extraction_profiles, apply_profile_to_metadata, + }; + + let profile_to_apply = if auto { + // Re-run classification to get the selected profile + let profiles = load_extraction_profiles(&[]).unwrap_or_default(); + let page_data: Vec<(Vec<_>, Vec<_>)> = result + .pages + .iter() + .map(|p| (p.blocks.clone(), p.spans.clone())) + .collect(); + let has_signature_field = !result.signatures.is_empty(); + let has_form_field = !result.form_fields.is_empty(); + + use pdftract_core::profiles::classify_and_select_profile; + classify_and_select_profile( + &profiles.iter().map(|p| p.profile.clone()).collect::>(), + &page_data, + has_signature_field, + has_form_field, + ).map(|(p, _)| p) + } else if profile.is_some() { + // Load the specified profile + let profile_name_or_path = profile.as_ref().unwrap(); + let profiles = load_extraction_profiles(&[]).unwrap_or_default(); + + if std::path::PathBuf::from(profile_name_or_path).exists() { + use pdftract_core::profiles::load_profile_file; + load_profile_file(&std::path::PathBuf::from(profile_name_or_path)).ok() + } else { + profiles.iter() + .find(|p| 
p.profile.name == *profile_name_or_path) + .map(|p| p.profile.clone()) + } + } else { + None + }; + + // Apply profile to metadata + if let Some(p) = profile_to_apply { + let (name, version, fields) = apply_profile_to_metadata(&p, &result.pages); + // Update the result's metadata with profile information + result.metadata.profile_name = Some(name); + result.metadata.profile_version = Some(version); + result.metadata.profile_fields = fields; + } + } + // Write each output to its destination for spec in &output_specs { match spec.dest { @@ -1803,6 +1950,25 @@ fn cmd_cache(command: CacheCommands) -> Result<()> { Ok(()) } +fn cmd_profiles(command: ProfilesCommands) -> Result<()> { + use profiles_cmd::{ProfilesArgs, ProfilesCommand}; + + // Convert ProfilesCommands to profiles_cmd::ProfilesCommand + let profiles_command = match command { + ProfilesCommands::List => ProfilesCommand::List, + ProfilesCommands::Show { name_or_path } => ProfilesCommand::Show { name_or_path }, + ProfilesCommands::Export { name } => ProfilesCommand::Export { name }, + ProfilesCommands::Install { path } => ProfilesCommand::Install { path }, + ProfilesCommands::Validate { path } => ProfilesCommand::Validate { path }, + }; + + let args = ProfilesArgs { + command: profiles_command, + }; + + profiles_cmd::run_profiles(args) +} + fn cmd_serve( bind: String, cache_dir: Option, diff --git a/crates/pdftract-cli/src/profiles_cmd.rs b/crates/pdftract-cli/src/profiles_cmd.rs new file mode 100644 index 0000000..2d95002 --- /dev/null +++ b/crates/pdftract-cli/src/profiles_cmd.rs @@ -0,0 +1,300 @@ +//! Profile management CLI subcommand. +//! +//! This module implements the `pdftract profiles` command family for managing +//! document type profiles (list, show, export, install, validate). + +use anyhow::{Context, Result}; +use std::fs; +use std::path::PathBuf; + +/// Arguments for the profiles subcommand. 
+pub struct ProfilesArgs {
+    /// Subcommand to run
+    pub command: ProfilesCommand,
+}
+
+/// Profiles subcommands.
+///
+/// One variant per `pdftract profiles <subcommand>`; `run_profiles` maps each
+/// variant to its handler.
+#[derive(Debug, Clone)]
+pub enum ProfilesCommand {
+    /// List all available profiles
+    List,
+    /// Show a profile's YAML content
+    Show { name_or_path: String },
+    /// Export a built-in profile to stdout
+    Export { name: String },
+    /// Install a profile to the user config directory
+    Install { path: PathBuf },
+    /// Validate a profile file
+    Validate { path: PathBuf },
+}
+
+/// Run the profiles subcommand.
+///
+/// Dispatches `args.command` to the matching handler and returns that
+/// handler's result unchanged.
+pub fn run_profiles(args: ProfilesArgs) -> Result<()> {
+    match args.command {
+        ProfilesCommand::List => run_list(),
+        ProfilesCommand::Show { name_or_path } => run_show(&name_or_path),
+        ProfilesCommand::Export { name } => run_export(&name),
+        ProfilesCommand::Install { path } => run_install(&path),
+        ProfilesCommand::Validate { path } => run_validate(&path),
+    }
+}
+
+/// List all available profiles.
+///
+/// Loads every profile through `load_extraction_profiles(&[])`, groups the
+/// entries by origin and prints name, priority and description for each one.
+/// System profiles are listed in the "User profiles" group. The
+/// "(overrides built-in)" marker is printed for the built-in and user groups
+/// only, never for custom profiles.
+///
+/// Without the `profiles` feature this only prints a build hint.
+///
+/// # Errors
+///
+/// Propagates any error returned by `load_extraction_profiles`.
+fn run_list() -> Result<()> {
+    #[cfg(feature = "profiles")]
+    {
+        use pdftract_core::profiles::extraction_loader::{self, ProfileOrigin};
+
+        // Load all extraction profiles
+        let profiles = extraction_loader::load_extraction_profiles(&[])?;
+
+        if profiles.is_empty() {
+            println!("No profiles available.");
+            println!();
+            println!("Built-in profiles may not be enabled. Build pdftract with:");
+            println!(" cargo build --features profiles");
+            return Ok(());
+        }
+
+        println!("Available profiles ({} total):", profiles.len());
+        println!();
+
+        // Group by origin
+        let mut builtin = Vec::new();
+        let mut user = Vec::new();
+        let mut custom = Vec::new();
+
+        for source in &profiles {
+            match source.source {
+                ProfileOrigin::BuiltIn => builtin.push(source),
+                // System profiles have no group of their own; show them with user profiles.
+                ProfileOrigin::User | ProfileOrigin::System => user.push(source),
+                ProfileOrigin::Custom(_) => custom.push(source),
+            }
+        }
+
+        // (heading, entries, whether the override marker is shown for this group)
+        let groups = [
+            ("Built-in profiles:", builtin, true),
+            ("User profiles:", user, true),
+            ("Custom profiles:", custom, false),
+        ];
+
+        for (heading, entries, show_override) in groups.iter() {
+            if entries.is_empty() {
+                continue;
+            }
+
+            println!("{}", heading);
+            for source in entries {
+                let profile = &source.profile;
+                let marker = if *show_override && source.overrides_builtin {
+                    " (overrides built-in)"
+                } else {
+                    ""
+                };
+                println!(" {} - Priority: {}{}", profile.name, profile.priority, marker);
+                println!(" {}", profile.description);
+            }
+            println!();
+        }
+    }
+
+    #[cfg(not(feature = "profiles"))]
+    {
+        println!("Profiles are not enabled.");
+        println!();
+        println!("Build pdftract with the profiles feature:");
+        println!(" cargo build --features profiles");
+    }
+
+    Ok(())
+}
+
+/// Show a profile's YAML content.
+///
+/// Loads all profiles, resolves `name_or_path` through
+/// `extraction_loader::find_profile`, serializes the result back to YAML and
+/// prints it to stdout.
+///
+/// # Errors
+///
+/// Fails when profiles cannot be loaded, the profile cannot be found, YAML
+/// serialization fails, or the binary was built without the `profiles` feature.
+fn run_show(name_or_path: &str) -> Result<()> {
+    #[cfg(feature = "profiles")]
+    {
+        use pdftract_core::profiles::extraction_loader;
+
+        // Load all profiles to search by name
+        let profiles = extraction_loader::load_extraction_profiles(&[])?;
+
+        // Try to find the profile
+        let profile = extraction_loader::find_profile(name_or_path, &profiles)?;
+
+        // Serialize back to YAML
+        let yaml =
+            serde_yaml::to_string(&profile).context("Failed to serialize profile to YAML")?;
+
+        println!("{}", yaml);
+    }
+
+    #[cfg(not(feature = "profiles"))]
+    {
+        anyhow::bail!("Profiles feature is not enabled. Build with: --features profiles");
+    }
+
+    Ok(())
+}
+
+/// Export a built-in profile to stdout.
+///
+/// Only profiles whose origin is `ProfileOrigin::BuiltIn` are considered, so a
+/// user profile that shadows a built-in name is never exported here.
+///
+/// The YAML written is the profile definition itself (the `profile` field of
+/// the loader entry), not the loader entry with its origin bookkeeping, so the
+/// output can be saved and passed back to `profiles install`.
+///
+/// # Errors
+///
+/// Fails when profiles cannot be loaded, no built-in profile is called `name`,
+/// YAML serialization fails, or the `profiles` feature is disabled.
+fn run_export(name: &str) -> Result<()> {
+    #[cfg(feature = "profiles")]
+    {
+        use pdftract_core::profiles::extraction_loader::{self, ProfileOrigin};
+
+        // Load all profiles
+        let profiles = extraction_loader::load_extraction_profiles(&[])?;
+
+        // Find the built-in profile by name
+        let entry = profiles
+            .iter()
+            .find(|s| s.profile.name == name && matches!(s.source, ProfileOrigin::BuiltIn))
+            // Build the message lazily: it is only needed on the error path.
+            .with_context(|| format!("Built-in profile '{}' not found", name))?;
+
+        // Serialize the profile definition, not the surrounding loader entry.
+        let yaml = serde_yaml::to_string(&entry.profile)
+            .context("Failed to serialize profile to YAML")?;
+
+        println!("{}", yaml);
+    }
+
+    #[cfg(not(feature = "profiles"))]
+    {
+        anyhow::bail!("Profiles feature is not enabled. Build with: --features profiles");
+    }
+
+    Ok(())
+}
+
+/// Install a profile to the user config directory.
+///
+/// Parses the YAML at `path` to learn the profile's name, then copies the file
+/// to `<profile dir>/<name>.yaml`, where the profile directory comes from
+/// `extraction_loader::get_xdg_profile_dir` and is created if missing.
+///
+/// The profile name comes from the file being installed, so it is treated as
+/// untrusted: it must be non-empty and `<name>.yaml` must be a single plain
+/// path component. Names containing separators, or absolute paths, are
+/// rejected so that the copy can never land outside the profile directory.
+///
+/// # Errors
+///
+/// Fails when the source file is missing, unreadable or not a valid profile,
+/// when the name is not a safe file name, when the profile directory cannot be
+/// determined or created, when the copy fails, or when the `profiles` feature
+/// is disabled.
+fn run_install(path: &PathBuf) -> Result<()> {
+    #[cfg(feature = "profiles")]
+    {
+        use pdftract_core::profiles::extraction_loader;
+
+        // Check if source file exists
+        if !path.exists() {
+            anyhow::bail!("Profile file not found: {}", path.display());
+        }
+
+        // Get XDG config directory
+        let xdg_dir = extraction_loader::get_xdg_profile_dir()
+            .context("Failed to determine XDG config directory")?;
+
+        // Create directory if it doesn't exist
+        fs::create_dir_all(&xdg_dir).with_context(|| {
+            format!("Failed to create profile directory: {}", xdg_dir.display())
+        })?;
+
+        // Read the profile to get its name
+        let content = fs::read_to_string(path)
+            .with_context(|| format!("Failed to read profile file: {}", path.display()))?;
+
+        // Parse to get the profile name
+        let profile: pdftract_core::profiles::ExtractionProfile =
+            serde_yaml::from_str(&content).context("Failed to parse profile YAML")?;
+
+        // Refuse names that would make the destination leave `xdg_dir`
+        // ("../x", "a/b", "/abs"): the file name must be exactly one normal
+        // path component.
+        let file_name = format!("{}.yaml", profile.name);
+        let mut components = std::path::Path::new(&file_name).components();
+        let is_plain_file_name = matches!(
+            (components.next(), components.next()),
+            (Some(std::path::Component::Normal(_)), None)
+        );
+        if profile.name.is_empty() || !is_plain_file_name {
+            anyhow::bail!(
+                "Invalid profile name '{}': the name must not be empty or contain path separators",
+                profile.name
+            );
+        }
+
+        // Destination path
+        let dest = xdg_dir.join(file_name);
+
+        // Copy file
+        fs::copy(path, &dest)
+            .with_context(|| format!("Failed to copy profile to: {}", dest.display()))?;
+
+        println!("Installed profile '{}' to: {}", profile.name, dest.display());
+        println!();
+        println!("You can now use this profile with:");
+        println!(" pdftract extract --profile {}", profile.name);
+    }
+
+    #[cfg(not(feature = "profiles"))]
+    {
+        anyhow::bail!("Profiles feature is not enabled. Build with: --features profiles");
+    }
+
+    Ok(())
+}
+
+/// Validate a profile file.
+///
+/// Checks that `path` exists and runs `extraction_loader::validate_profile_file`
+/// on it, printing a confirmation on success.
+///
+/// # Errors
+///
+/// Fails when the file is missing, when validation reports a problem (the
+/// message is prefixed with "Profile validation failed"), or when the
+/// `profiles` feature is disabled.
+fn run_validate(path: &PathBuf) -> Result<()> {
+    #[cfg(feature = "profiles")]
+    {
+        use pdftract_core::profiles::extraction_loader;
+
+        // Check if file exists
+        if !path.exists() {
+            anyhow::bail!("Profile file not found: {}", path.display());
+        }
+
+        // Validate the profile; on success fall through to the shared `Ok(())`
+        // below so that it stays reachable when the feature is enabled.
+        if let Err(e) = extraction_loader::validate_profile_file(path) {
+            anyhow::bail!("Profile validation failed: {}", e);
+        }
+
+        println!("Profile '{}' is valid.", path.display());
+    }
+
+    #[cfg(not(feature = "profiles"))]
+    {
+        anyhow::bail!("Profiles feature is not enabled. Build with: --features profiles");
+    }
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Every subcommand variant can be constructed and pattern-matched.
+    #[test]
+    fn test_profiles_command_enum() {
+        let command = ProfilesCommand::List;
+        assert!(matches!(command, ProfilesCommand::List));
+
+        let show = ProfilesCommand::Show {
+            name_or_path: "invoice".to_string(),
+        };
+        assert!(matches!(show, ProfilesCommand::Show { .. }));
+
+        let export = ProfilesCommand::Export {
+            name: "invoice".to_string(),
+        };
+        assert!(matches!(export, ProfilesCommand::Export { .. }));
+
+        let install = ProfilesCommand::Install {
+            path: PathBuf::from("invoice.yaml"),
+        };
+        assert!(matches!(install, ProfilesCommand::Install { .. }));
+
+        let validate = ProfilesCommand::Validate {
+            path: PathBuf::from("invoice.yaml"),
+        };
+        assert!(matches!(validate, ProfilesCommand::Validate { .. }));
+    }
+}
diff --git a/crates/pdftract-core/Cargo.toml b/crates/pdftract-core/Cargo.toml
index 0947460..b551b09 100644
--- a/crates/pdftract-core/Cargo.toml
+++ b/crates/pdftract-core/Cargo.toml
@@ -46,6 +46,7 @@ smallvec = "1.13"
 encoding_rs = "0.8"
 quick-xml = { version = "0.36", optional = true }
 serde_yaml = { version = "0.9", optional = true }
+dirs = "5.0"
 chrono = "0.4"
 aes = { version = "0.8", optional = true }
 rc4 = { version = "0.1", optional = true }
diff --git a/crates/pdftract-core/src/extract.rs b/crates/pdftract-core/src/extract.rs
index 9e91382..f37eec3 100644
--- a/crates/pdftract-core/src/extract.rs
+++ b/crates/pdftract-core/src/extract.rs
@@ -304,6 +304,15 @@ pub struct ExtractionMetadata {
     /// Diagnostics emitted during extraction (coverage warnings, etc.)
#[serde(skip_serializing_if = "Vec::is_empty")] pub diagnostics: Vec, + /// Profile name if a profile was applied (Phase 7.10) + #[serde(skip_serializing_if = "Option::is_none")] + pub profile_name: Option, + /// Profile version if a profile was applied (Phase 7.10) + #[serde(skip_serializing_if = "Option::is_none")] + pub profile_version: Option, + /// Extracted fields from profile if a profile was applied (Phase 7.10) + #[serde(skip_serializing_if = "Option::is_none")] + pub profile_fields: Option, } /// Extract text and structure from a PDF file. @@ -931,6 +940,9 @@ pub fn extract_pdf( error_count, reading_order_algorithm: Some(final_reading_order_algorithm.as_str().to_string()), diagnostics: all_diagnostics_with_js, + profile_name: None, + profile_version: None, + profile_fields: None, }, signatures, form_fields, @@ -1812,6 +1824,9 @@ pub fn extract_pdf_ndjson( error_count: error_count as usize, reading_order_algorithm: Some(final_reading_order_algorithm.as_str().to_string()), diagnostics: all_diagnostics, + profile_name: None, + profile_version: None, + profile_fields: None, }) } @@ -2117,6 +2132,9 @@ where error_count, reading_order_algorithm: Some(final_reading_order_algorithm.as_str().to_string()), diagnostics: all_diagnostics, + profile_name: None, + profile_version: None, + profile_fields: None, }) } diff --git a/crates/pdftract-core/src/profiles/apply_profile.rs b/crates/pdftract-core/src/profiles/apply_profile.rs new file mode 100644 index 0000000..fbb3424 --- /dev/null +++ b/crates/pdftract-core/src/profiles/apply_profile.rs @@ -0,0 +1,259 @@ +//! Profile application for extraction tuning (Phase 7.10). +//! +//! Applies profile extraction tuning to ExtractionOptions and manages +//! the profile workflow: classification, option override, field extraction, +//! and metadata population. 
+
+use super::extraction::{ExtractionProfile, ExtractionTuning};
+use super::field_extractor;
+use super::match_eval::{evaluate_match, MatchResult};
+use super::signals::extract_signals_from_results;
+use crate::options::{ExtractionOptions, OutputOptions};
+use crate::schema::{BlockJson, PageJson, SpanJson};
+use anyhow::Result;
+use serde_json::json;
+
+/// Apply a profile's extraction tuning to extraction options.
+///
+/// `options` is modified in place; nothing is returned.
+///
+/// # Arguments
+///
+/// * `tuning` - The extraction tuning from a profile
+/// * `options` - The base extraction options to modify
+///
+/// # Behavior
+///
+/// * `include_invisible` is copied to `options.output.include_invisible`
+///   whenever the profile sets it.
+/// * `include_headers_footers: true` switches on both
+///   `options.output.include_headers` and `options.output.include_footers`.
+///   `false` (or unset) leaves both flags untouched, so a profile can enable
+///   headers/footers but never disable them.
+/// * `reading_order`, `table_detection`, `readability_threshold`, `force_ocr`
+///   and `min_block_chars` are not applied; a warning is written to stderr for
+///   each one the profile sets.
+pub fn apply_extraction_tuning(tuning: &ExtractionTuning, options: &mut ExtractionOptions) {
+    // Apply output filtering options (these are supported)
+    if let Some(include_invisible) = tuning.include_invisible {
+        options.output.include_invisible = include_invisible;
+    }
+
+    // Only an explicit `true` has an effect; `Some(false)` is deliberately a no-op here.
+    if tuning.include_headers_footers == Some(true) {
+        options.output.include_headers = true;
+        options.output.include_footers = true;
+    }
+
+    // Log warnings for unsupported fields (for future implementation).
+    // The order matches the order in which the warnings were always emitted.
+    let unsupported = [
+        ("reading_order", tuning.reading_order.is_some()),
+        ("table_detection", tuning.table_detection.is_some()),
+        ("readability_threshold", tuning.readability_threshold.is_some()),
+        ("force_ocr", tuning.force_ocr.is_some()),
+        ("min_block_chars", tuning.min_block_chars.is_some()),
+    ];
+    for (field, is_set) in unsupported.iter() {
+        if *is_set {
+            eprintln!("Profile warning: {} tuning is not yet supported", field);
+        }
+    }
+}
+
+/// Classify a document and select the best matching profile.
+///
+/// # Arguments
+///
+/// * `profiles` - All available extraction profiles
+/// * `page_data` - One `(blocks, spans)` pair per page, used for signal extraction
+/// * `has_signature_field` - Whether document has signature fields
+/// * `has_form_field` - Whether document has form fields
+///
+/// # Returns
+///
+/// A clone of the best matching profile together with its match result, or
+/// `None` if no profile matches with confidence >= 0.6.
+///
+/// Candidates are ranked by confidence; on an exact confidence tie the profile
+/// with the higher `priority` wins, and if that ties too the profile that
+/// appears first in `profiles` is kept.
+pub fn classify_and_select_profile(
+    profiles: &[ExtractionProfile],
+    page_data: &[(Vec<BlockJson>, Vec<SpanJson>)], // (blocks, spans) per page
+    has_signature_field: bool,
+    has_form_field: bool,
+) -> Option<(ExtractionProfile, MatchResult)> {
+    // Extract signals from the document
+    let signals = extract_signals_from_results(page_data, has_signature_field, has_form_field);
+
+    // Track the winner by reference; only the final choice is cloned.
+    let mut best: Option<(&ExtractionProfile, MatchResult)> = None;
+
+    for profile in profiles {
+        let result = evaluate_match(&profile.match_expr, &signals);
+
+        // Only consider matches with confidence >= 0.6
+        if result.matched && result.confidence >= 0.6 {
+            let replaces_current = match &best {
+                None => true,
+                // Prefer higher confidence, then higher priority
+                Some((current, current_result)) => {
+                    result.confidence > current_result.confidence
+                        || (result.confidence == current_result.confidence
+                            && profile.priority > current.priority)
+                }
+            };
+
+            if replaces_current {
+                best = Some((profile, result));
+            }
+        }
+    }
+
+    best.map(|(profile, result)| (profile.clone(), result))
+}
+
+/// Apply a profile to extraction metadata.
+///
+/// Populates profile_name, profile_version, and profile_fields in the
+/// extraction metadata.
+/// +/// # Arguments +/// +/// * `profile` - The profile that was applied +/// * `metadata` - The extraction metadata to update (this must be the full ExtractionMetadata from extract module) +/// * `pages` - Extracted pages for field extraction +/// +/// # Note +/// +/// This function requires the full ExtractionMetadata from the extract module. +/// Due to the module structure, we update metadata through a closure that +/// can access the internal fields. +pub fn apply_profile_to_metadata( + profile: &ExtractionProfile, + pages: &[PageJson], +) -> (String, String, Option) { + let profile_name = profile.name.clone(); + let profile_version = "1.0.0".to_string(); // Profile version schema + + // Extract fields if the profile has field specifications + let profile_fields = if !profile.fields.is_empty() { + // Collect all blocks from all pages + let all_blocks: Vec = pages.iter().flat_map(|p| p.blocks.clone()).collect(); + + // Build full text from all spans + let full_text = pages + .iter() + .flat_map(|p| p.spans.iter().map(|s| s.text.clone())) + .collect::>() + .join(" "); + + // Extract profile fields + let field_results = + field_extractor::extract_profile_fields(&profile.fields, &all_blocks, &full_text); + + // Convert to JSON object + let mut fields_obj = serde_json::Map::new(); + for (field_name, result) in field_results { + fields_obj.insert(field_name, result.value); + } + + Some(json!(fields_obj)) + } else { + None + }; + + (profile_name, profile_version, profile_fields) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::options::ReceiptsMode; + + fn make_test_block(kind: &str, x0: f64, y0: f64, x1: f64, y1: f64) -> BlockJson { + BlockJson { + id: format!("block_{}", kind), + kind: kind.to_string(), + bbox: Some(vec![x0, y0, x1, y1]), + spans: vec![0, 1], + reading_order: Some(0), + ..Default::default() + } + } + + #[test] + fn test_apply_extraction_tuning() { + let tuning = ExtractionTuning { + reading_order: Some("line_dominant".to_string()), + 
table_detection: Some("strict_borders".to_string()), + readability_threshold: Some(0.4), + include_invisible: Some(true), + include_headers_footers: Some(true), + zone_filtering: None, + force_ocr: Some(false), + min_block_chars: Some(10), + }; + + let mut options = ExtractionOptions::default(); + + apply_extraction_tuning(&tuning, &mut options); + + // Check that output options were applied + assert_eq!(options.output.include_invisible, true); + assert_eq!(options.output.include_headers, true); + assert_eq!(options.output.include_footers, true); + } + + #[test] + fn test_apply_extraction_tuning_partial() { + let tuning = ExtractionTuning { + reading_order: None, + table_detection: None, + readability_threshold: None, + include_invisible: Some(false), + include_headers_footers: None, + zone_filtering: None, + force_ocr: None, + min_block_chars: None, + }; + + let mut options = ExtractionOptions::default(); + + apply_extraction_tuning(&tuning, &mut options); + + assert_eq!(options.output.include_invisible, false); + assert_eq!(options.output.include_headers, false); + assert_eq!(options.output.include_footers, false); + } + + #[test] + fn test_classify_and_select_profile_no_match() { + // Empty profiles list + let profiles: Vec = vec![]; + let page_data: Vec<(Vec, Vec)> = vec![]; + + let result = classify_and_select_profile(&profiles, &page_data, false, false); + + assert!(result.is_none()); + } + + #[test] + fn test_apply_profile_to_metadata_no_fields() { + let profile_yaml = r#" +name: test +description: Test profile +priority: 10 +"#; + + let profile: ExtractionProfile = serde_yaml::from_str(profile_yaml).unwrap(); + let pages = vec![]; + + let (name, version, fields) = apply_profile_to_metadata(&profile, &pages); + + assert_eq!(name, "test"); + assert_eq!(version, "1.0.0"); + assert!(fields.is_none()); + } +} diff --git a/crates/pdftract-core/src/profiles/extraction.rs b/crates/pdftract-core/src/profiles/extraction.rs new file mode 100644 index 0000000..4ac84f8 
--- /dev/null
+++ b/crates/pdftract-core/src/profiles/extraction.rs
@@ -0,0 +1,437 @@
+//! Extraction profile types (Phase 7.10).
+//!
+//! This module defines the rich extraction profile format that extends Phase 5.6
+//! classification with extraction tuning and field extraction. Extraction profiles
+//! use a boolean match DSL (all/any/none combinators) and can override extraction
+//! options and extract structured fields.
+
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+
+/// Extraction profile with match DSL, extraction tuning, and field extraction.
+///
+/// This is the Phase 7.10 profile format, separate from the Phase 5.6 classification
+/// `Profile` type. Extraction profiles drive both classification (via match DSL)
+/// and extraction behavior (via tuning and field specs).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ExtractionProfile {
+    /// Profile name (e.g., "invoice", "receipt")
+    pub name: String,
+
+    /// Human-readable description
+    pub description: String,
+
+    /// Priority for profile selection (higher = preferred when multiple match)
+    #[serde(default = "default_priority")]
+    pub priority: u32,
+
+    /// Match DSL tree (all/any/none). YAML key is `match`; unrenamed, serde ignored it silently.
+    #[serde(default, rename = "match", alias = "match_expr")]
+    pub match_expr: MatchExpr,
+
+    /// Extraction tuning overrides (optional)
+    #[serde(default)]
+    pub extraction: Option<ExtractionTuning>,
+
+    /// Field extraction specifications (optional), keyed by field name
+    #[serde(default)]
+    pub fields: HashMap<String, FieldSpec>,
+}
+
+fn default_priority() -> u32 {
+    10
+}
+
+/// Boolean match expression for document classification.
+///
+/// Supports all/any/none combinators for building complex matching rules.
+#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(untagged)] +pub enum MatchExpr { + /// Single predicate + Predicate(ExtractionMatchPredicate), + + /// All of these must match + All { all: Vec }, + + /// Any of these can match + Any { any: Vec }, + + /// None of these must match + None { none: Vec }, +} + +impl Default for MatchExpr { + fn default() -> Self { + // Default to an Any that matches nothing (empty list) + MatchExpr::Any { any: Vec::new() } + } +} + +/// Match predicate primitives for extraction profiles. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ExtractionMatchPredicate { + /// Text contains any of the given strings + TextContains { + #[serde(default)] + patterns: Vec, + }, + + /// Text matches the given regex + TextMatches { + pattern: String, + }, + + /// Heading text matches the given regex + HeadingMatches { + pattern: String, + }, + + /// Document has currency pattern ($\d, €\d, etc.) + HasCurrencyPattern { + #[serde(default)] + has_currency_pattern: bool, + }, + + /// Document has signature fields (AcroForm) + HasSignatureField { + #[serde(default)] + has_signature_field: bool, + }, + + /// Structural predicates (has_table, page_count, etc.) + Structural { + #[serde(default)] + has_table: bool, + + #[serde(default)] + has_form_field: bool, + + #[serde(default)] + has_math: bool, + + #[serde(flatten)] + page_count: Option, + }, + + /// Text patterns alias for TextContains + #[serde(rename = "text_patterns")] + TextContainsAlias { + #[serde(default)] + patterns: Vec, + }, +} + +/// Page count range predicate. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PageCountRange { + #[serde(default)] + pub min: Option, + + #[serde(default)] + pub max: Option, + + #[serde(default)] + pub hint: Option, +} + +/// Extraction tuning overrides. +/// +/// These fields override the default ExtractionOptions when a profile matches. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ExtractionTuning { + /// Reading order algorithm + pub reading_order: Option, + + /// Table detection mode + pub table_detection: Option, + + /// Readability threshold (0.0-1.0) + pub readability_threshold: Option, + + /// Include invisible text + pub include_invisible: Option, + + /// Include headers and footers + pub include_headers_footers: Option, + + /// Zone filtering mode + pub zone_filtering: Option, + + /// Force OCR + pub force_ocr: Option, + + /// Minimum block characters + pub min_block_chars: Option, +} + +/// Field extraction specification. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FieldSpec { + /// Field type (string, decimal, date, int, bool, array) + #[serde(rename = "type")] + pub field_type: String, + + /// Extraction specification + pub extraction: FieldExtraction, +} + +/// Field extraction definition. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(untagged)] +pub enum FieldExtraction { + /// Simple pattern-based extraction + Patterns { + patterns: Vec, + #[serde(default)] + fallback: Option, + }, + + /// Rich extraction with localizers and extractors + Rich { + /// Regex pattern + #[serde(default)] + regex: Option, + + /// Near anchors (search near these strings) + #[serde(default)] + near: Option>, + + /// Maximum distance in points + #[serde(default)] + max_distance_pt: Option, + + /// Region specification + #[serde(default)] + region: Option, + + /// Pick strategy (largest_font, smallest_font, nearest_below, nearest_right, first, last) + #[serde(default)] + pick: Option, + + /// Parse type (decimal, date, int, bool, string) + #[serde(default)] + parse: Option, + + /// After field (for ordering) + #[serde(default)] + after: Option, + + /// After heading + #[serde(default)] + after_heading: Option, + + /// Table region for array fields + #[serde(default)] + table_region: Option, + + /// Columnar regions for array fields + #[serde(default)] + 
columnar_regions: Option, + + /// Array schema for structured data + #[serde(default)] + schema: Option>, + + /// Fallback value + #[serde(default)] + fallback: Option, + }, +} + +/// Schema field for array extraction. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FieldSchema { + pub name: String, + #[serde(rename = "type")] + pub field_type: String, + #[serde(default)] + pub required: bool, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_extraction_profile_basic() { + let yaml = r#" +name: test +description: Test profile +priority: 50 +"#; + + let profile: ExtractionProfile = serde_yaml::from_str(yaml).unwrap(); + assert_eq!(profile.name, "test"); + assert_eq!(profile.description, "Test profile"); + assert_eq!(profile.priority, 50); + } + + #[test] + fn test_match_expr_all() { + let yaml = r#" +match: + all: + - text_contains: + patterns: ["invoice", "bill"] + - structural: + has_table: true +"#; + + let expr: MatchExpr = serde_yaml::from_str(yaml).unwrap(); + match expr { + MatchExpr::All { all } => { + assert_eq!(all.len(), 2); + } + _ => panic!("Expected All"), + } + } + + #[test] + fn test_match_expr_any() { + let yaml = r#" +match: + any: + - text_contains: + patterns: ["receipt"] + - text_matches: + pattern: "\\d+\\.\\d{2}" +"#; + + let expr: MatchExpr = serde_yaml::from_str(yaml).unwrap(); + match expr { + MatchExpr::Any { any } => { + assert_eq!(any.len(), 2); + } + _ => panic!("Expected Any"), + } + } + + #[test] + fn test_match_expr_none() { + let yaml = r#" +match: + none: + - text_contains: + patterns: ["abstract", "bibliography"] +"#; + + let expr: MatchExpr = serde_yaml::from_str(yaml).unwrap(); + match expr { + MatchExpr::None { none } => { + assert_eq!(none.len(), 1); + } + _ => panic!("Expected None"), + } + } + + #[test] + fn test_extraction_tuning() { + let yaml = r#" +extraction: + reading_order: xy_cut + table_detection: strict_borders + readability_threshold: 0.4 + include_invisible: false +"#; + + let 
tuning: ExtractionTuning = serde_yaml::from_str(yaml).unwrap(); + assert_eq!(tuning.reading_order, Some("xy_cut".to_string())); + assert_eq!(tuning.table_detection, Some("strict_borders".to_string())); + assert_eq!(tuning.readability_threshold, Some(0.4)); + assert_eq!(tuning.include_invisible, Some(false)); + } + + #[test] + fn test_field_spec_simple() { + let yaml = r#" +total: + type: decimal + extraction: + patterns: + - "\\$\\s*(\\d+\\.\\d{2})" + fallback: null +"#; + + let field: FieldSpec = serde_yaml::from_str(yaml).unwrap(); + assert_eq!(field.field_type, "decimal"); + match field.extraction { + FieldExtraction::Patterns { patterns, .. } => { + assert_eq!(patterns.len(), 1); + } + _ => panic!("Expected Patterns"), + } + } + + #[test] + fn test_field_spec_rich() { + let yaml = r#" +invoice_number: + type: string + extraction: + regex: "Invoice\\s*#\\s*([\\w-]+)" + near: ["Invoice", "Invoice Number"] + max_distance_pt: 200 +"#; + + let field: FieldSpec = serde_yaml::from_str(yaml).unwrap(); + assert_eq!(field.field_type, "string"); + match field.extraction { + FieldExtraction::Rich { regex, near, max_distance_pt, .. 
} => { + assert!(regex.is_some()); + assert!(near.is_some()); + assert_eq!(max_distance_pt, Some(200)); + } + _ => panic!("Expected Rich"), + } + } + + #[test] + fn test_full_profile_roundtrip() { + let yaml = r#" +name: invoice +description: Commercial invoice with line items +priority: 50 + +match: + all: + - any: + - text_contains: + patterns: ["invoice", "bill to"] + - heading_matches: + pattern: "^Invoice\\b" + - structural: + has_table: true + +extraction: + reading_order: line_dominant + table_detection: strict_borders + readability_threshold: 0.4 + +fields: + invoice_number: + type: string + extraction: + regex: "Invoice\\s*#\\s*([\\w-]+)" + near: ["Invoice"] + total: + type: decimal + extraction: + patterns: + - "total.*([\\d,]+\\.\\d{2})" + fallback: null +"#; + + let profile: ExtractionProfile = serde_yaml::from_str(yaml).unwrap(); + assert_eq!(profile.name, "invoice"); + assert_eq!(profile.priority, 50); + assert!(profile.extraction.is_some()); + assert_eq!(profile.fields.len(), 2); + + // Round-trip + let yaml_out = serde_yaml::to_string(&profile).unwrap(); + let profile2: ExtractionProfile = serde_yaml::from_str(&yaml_out).unwrap(); + assert_eq!(profile2.name, profile.name); + } +} diff --git a/crates/pdftract-core/src/profiles/extraction_loader.rs b/crates/pdftract-core/src/profiles/extraction_loader.rs new file mode 100644 index 0000000..3c59142 --- /dev/null +++ b/crates/pdftract-core/src/profiles/extraction_loader.rs @@ -0,0 +1,374 @@ +//! Extraction profile loader (Phase 7.10). +//! +//! Loads extraction profiles from built-in sources, system directories, +//! XDG config paths, and custom --profile-dir flags. + +use super::extraction::ExtractionProfile; +use super::loader::{check_forbidden_keys, ForbiddenKeyError, ProfileLoadError}; +use std::collections::HashMap; +use std::fs; +use std::path::{Path, PathBuf}; + +/// Profile source with priority metadata. 
+#[derive(Debug, Clone)]
+pub struct ProfileSource {
+    /// The parsed extraction profile
+    pub profile: ExtractionProfile,
+
+    /// Search-path location this profile was read from
+    pub source: ProfileOrigin,
+
+    /// Whether this entry overrides a built-in profile (always `false` for
+    /// the built-ins themselves)
+    pub overrides_builtin: bool,
+}
+
+/// Where a profile was loaded from.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum ProfileOrigin {
+    /// Compiled into the binary
+    BuiltIn,
+
+    /// System-wide directory (/etc/pdftract/profiles/)
+    System,
+
+    /// Per-user configuration directory
+    User,
+
+    /// Directory passed via --profile-dir
+    Custom(PathBuf),
+}
+
+/// Load every extraction profile reachable through the search path.
+///
+/// Sources are visited from lowest to highest precedence:
+/// 1. Built-in profiles (compiled in)
+/// 2. System directory (/etc/pdftract/profiles/)
+/// 3. User directory (see `get_xdg_profile_dir`)
+/// 4. Custom directories (--profile-dir, in the order given)
+///
+/// A profile loaded later replaces an earlier one with the same name.
+/// Directories that do not exist are skipped. The result is ordered by
+/// descending priority, then ascending name.
+pub fn load_extraction_profiles(
+    custom_dirs: &[PathBuf],
+) -> Result<Vec<ProfileSource>, ProfileLoadError> {
+    let mut by_name: HashMap<String, ProfileSource> = HashMap::new();
+
+    // Built-ins form the base layer that directories may override
+    load_builtin_profiles(&mut by_name)?;
+
+    // Directory layers, lowest precedence first
+    let mut search_path: Vec<(PathBuf, ProfileOrigin)> = Vec::new();
+    search_path.push((PathBuf::from("/etc/pdftract/profiles"), ProfileOrigin::System));
+    if let Some(user_dir) = get_xdg_profile_dir() {
+        search_path.push((user_dir, ProfileOrigin::User));
+    }
+    for dir in custom_dirs {
+        search_path.push((dir.clone(), ProfileOrigin::Custom(dir.clone())));
+    }
+
+    for (dir, origin) in search_path {
+        if dir.exists() {
+            load_profiles_from_dir(&dir, origin, &mut by_name)?;
+        }
+    }
+
+    // Highest priority first; names break ties so the order is deterministic
+    let mut ordered: Vec<ProfileSource> = by_name.into_values().collect();
+    ordered.sort_by(|lhs, rhs| {
+        let lhs_key = (std::cmp::Reverse(lhs.profile.priority), &lhs.profile.name);
+        let rhs_key = (std::cmp::Reverse(rhs.profile.priority), &rhs.profile.name);
+        lhs_key.cmp(&rhs_key)
+    });
+
+    Ok(ordered)
+}
+
+/// Directory that holds the current user's pdftract profiles.
+///
+/// This is `<config dir>/pdftract/profiles`, where the config dir is whatever
+/// `dirs::config_dir()` reports. Returns `None` when no config dir is known.
+pub fn get_xdg_profile_dir() -> Option<PathBuf> {
+    let config_root = dirs::config_dir()?;
+    Some(config_root.join("pdftract").join("profiles"))
+}
+
+/// Register the profiles that are embedded via `include_str!`.
+///
+/// Only active with the `profiles` feature. A built-in that fails to parse is
+/// reported on stderr and skipped; the function itself always returns `Ok`.
+fn load_builtin_profiles(
+    profiles: &mut HashMap<String, ProfileSource>,
+) -> Result<(), ProfileLoadError> {
+    #[cfg(feature = "profiles")]
+    {
+        // (name used in diagnostics, path used in error messages, YAML text)
+        const BUILTIN_SOURCES: [(&str, &str, &str); 9] = [
+            (
+                "invoice",
+                "profiles/builtin/invoice/profile.yaml",
+                include_str!("../../../../profiles/builtin/invoice/profile.yaml"),
+            ),
+            (
+                "receipt",
+                "profiles/builtin/receipt/profile.yaml",
+                include_str!("../../../../profiles/builtin/receipt/profile.yaml"),
+            ),
+            (
+                "contract",
+                "profiles/builtin/contract/profile.yaml",
+                include_str!("../../../../profiles/builtin/contract/profile.yaml"),
+            ),
+            (
+                "scientific_paper",
+                "profiles/builtin/scientific_paper/profile.yaml",
+                include_str!("../../../../profiles/builtin/scientific_paper/profile.yaml"),
+            ),
+            (
+                "slide_deck",
+                "profiles/builtin/slide_deck/profile.yaml",
+                include_str!("../../../../profiles/builtin/slide_deck/profile.yaml"),
+            ),
+            (
+                "form",
+                "profiles/builtin/form/profile.yaml",
+                include_str!("../../../../profiles/builtin/form/profile.yaml"),
+            ),
+            (
+                "bank_statement",
+                "profiles/builtin/bank_statement/profile.yaml",
+                include_str!("../../../../profiles/builtin/bank_statement/profile.yaml"),
+            ),
+            (
+                "legal_filing",
+                "profiles/builtin/legal_filing/profile.yaml",
+                include_str!("../../../../profiles/builtin/legal_filing/profile.yaml"),
+            ),
+            (
+                "book_chapter",
+                "profiles/builtin/book_chapter/profile.yaml",
+                include_str!("../../../../profiles/builtin/book_chapter/profile.yaml"),
+            ),
+        ];
+
+        for &(name, source_path, content) in BUILTIN_SOURCES.iter() {
+            match load_profile_yaml(content, source_path) {
+                Ok(profile) => {
+                    let entry = ProfileSource {
+                        source: ProfileOrigin::BuiltIn,
+                        overrides_builtin: false,
+                        profile,
+                    };
+                    // Keyed by the name inside the YAML, not the table name
+                    profiles.insert(entry.profile.name.clone(), entry);
+                }
+                Err(e) => {
+                    eprintln!("Failed to parse built-in profile '{}': {}", name, e);
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Parse YAML text into an `ExtractionProfile`.
+///
+/// The text is first parsed generically and screened with
+/// `check_forbidden_keys`; a hit is returned as `ForbiddenKey` with
+/// `source_path` prefixed to the key path. Only then is it parsed into the
+/// typed profile.
+fn load_profile_yaml(content: &str, source_path: &str) -> Result<ExtractionProfile, ProfileLoadError> {
+    let yaml_value: serde_yaml::Value = serde_yaml::from_str(content)?;
+
+    // The raw text is passed along so the checker can report a line number
+    check_forbidden_keys(&yaml_value, "", content).map_err(|e| ProfileLoadError::ForbiddenKey {
+        key: e.key,
+        path: format!("{}: {}", source_path, e.path),
+        line: e.line,
+    })?;
+
+    serde_yaml::from_str::<ExtractionProfile>(content).map_err(ProfileLoadError::YamlError)
+}
+
+/// Load profiles from a directory.
+///
+/// Two layouts are recognised for each directory entry:
+/// - a sub-directory containing `profile.yaml` (e.g. `invoice/profile.yaml`)
+/// - a plain file with the `.yaml` extension
+///
+/// Anything else is ignored. A profile that fails to load is reported on
+/// stderr and skipped, so one broken file does not hide the others; only
+/// errors from reading the directory itself are returned.
+fn load_profiles_from_dir(
+    dir: &Path,
+    origin: ProfileOrigin,
+    profiles: &mut HashMap<String, ProfileSource>,
+) -> Result<(), ProfileLoadError> {
+    let entries = fs::read_dir(dir).map_err(ProfileLoadError::IoError)?;
+
+    for entry in entries {
+        let entry = entry.map_err(ProfileLoadError::IoError)?;
+        let path = entry.path();
+
+        // Resolve the entry to the YAML file it stands for, if any
+        let candidate = if path.is_dir() {
+            let nested = path.join("profile.yaml");
+            if !nested.exists() {
+                continue;
+            }
+            nested
+        } else if path.extension().and_then(|s| s.to_str()) == Some("yaml") {
+            path
+        } else {
+            continue;
+        };
+
+        match load_profile_file(&candidate) {
+            Ok(profile) => register_profile(profile, &origin, profiles),
+            // Previously dropped without a trace, which made a rejected or
+            // malformed user profile indistinguishable from a missing one.
+            Err(e) => eprintln!("Skipping profile '{}': {}", candidate.display(), e),
+        }
+    }
+
+    Ok(())
+}
+
+/// Insert `profile` into `profiles`, replacing any entry with the same name.
+fn register_profile(
+    profile: ExtractionProfile,
+    origin: &ProfileOrigin,
+    profiles: &mut HashMap<String, ProfileSource>,
+) {
+    // NOTE(review): this is true for a collision with ANY earlier profile of
+    // the same name (not only a built-in one), and never for System origin;
+    // kept as-is, confirm that is the intended meaning of the flag.
+    let overrides_builtin = profiles.contains_key(&profile.name)
+        && matches!(origin, ProfileOrigin::User | ProfileOrigin::Custom(_));
+
+    profiles.insert(
+        profile.name.clone(),
+        ProfileSource {
+            profile,
+            source: origin.clone(),
+            overrides_builtin,
+        },
+    );
+}
+
+/// Load a single profile from a file.
+///
+/// Returns `IoError` if the file cannot be read, otherwise whatever
+/// `load_profile_yaml` reports for its contents.
+pub fn load_profile_file(path: &Path) -> Result<ExtractionProfile, ProfileLoadError> {
+    let content = fs::read_to_string(path).map_err(ProfileLoadError::IoError)?;
+    load_profile_yaml(&content, &path.to_string_lossy())
+}
+
+/// Find a profile by name or path.
+///
+/// - If `name_or_path` is an existing regular file, load it directly
+/// - Otherwise, search for a profile with that name in the loaded profiles
+///
+/// Returns an `IoError` of kind `NotFound` when neither lookup succeeds.
+pub fn find_profile(
+    name_or_path: &str,
+    profiles: &[ProfileSource],
+) -> Result<ExtractionProfile, ProfileLoadError> {
+    // `is_file`, not `exists`: a directory that happens to share the name of
+    // a profile (e.g. `./invoice/`) must not shadow the name lookup and fail
+    // with a read error.
+    let path = Path::new(name_or_path);
+    if path.is_file() {
+        return load_profile_file(path);
+    }
+
+    profiles
+        .iter()
+        .find(|source| source.profile.name == name_or_path)
+        .map(|source| source.profile.clone())
+        .ok_or_else(|| {
+            ProfileLoadError::IoError(std::io::Error::new(
+                std::io::ErrorKind::NotFound,
+                format!("Profile '{}' not found", name_or_path),
+            ))
+        })
+}
+
+/// Validate a profile file without loading it into the profile set.
+///
+/// Runs the same steps as loading (read, forbidden-key check, typed parse)
+/// and discards the result. Returns Ok(()) if the profile is valid, Err with
+/// details if invalid.
+pub fn validate_profile_file(path: &Path) -> Result<(), ProfileLoadError> {
+    let content = fs::read_to_string(path).map_err(ProfileLoadError::IoError)?;
+
+    // Check for forbidden keys
+    let yaml_value = serde_yaml::from_str::<serde_yaml::Value>(&content)
+        .map_err(ProfileLoadError::YamlError)?;
+
+    check_forbidden_keys(&yaml_value, "", &content)
+        .map_err(|e| ProfileLoadError::ForbiddenKey {
+            key: e.key,
+            path: e.path,
+            line: e.line,
+        })?;
+
+    // Try to parse as ExtractionProfile
+    let _: ExtractionProfile = serde_yaml::from_str(&content).map_err(ProfileLoadError::YamlError)?;
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_get_xdg_profile_dir() {
+        let dir = get_xdg_profile_dir();
+        assert!(dir.is_some());
+        let path = dir.unwrap();
+        assert!(path.ends_with("pdftract/profiles"));
+    }
+
+    #[test]
+    fn test_load_builtin_profiles() {
+        let mut profiles = HashMap::new();
+        let result = load_builtin_profiles(&mut profiles);
+
+        #[cfg(feature = "profiles")]
+        {
+            assert!(result.is_ok());
+            // Should have loaded some profiles
+            assert!(!profiles.is_empty());
+        }
+    }
+
+    #[test]
+    fn test_validate_simple_profile() {
+        let yaml = r#"
+name: test
+description: Test profile
+priority: 10
+match:
+  text_contains:
+    patterns: ["test"]
+"#;
+
+        let temp_dir = tempfile::tempdir().unwrap();
+        let profile_path = temp_dir.path().join("test.yaml");
+        fs::write(&profile_path, yaml).unwrap();
+
+        let result = validate_profile_file(&profile_path);
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_validate_profile_with_forbidden_key() {
+        let yaml = r#"
+name: test
+description: Test profile
+priority: 10
+match:
+  text_contains:
+    patterns: ["test"]
+api_key: "secret"
+"#;
+
+        let temp_dir = tempfile::tempdir().unwrap();
+        let profile_path = temp_dir.path().join("test.yaml");
+        fs::write(&profile_path, yaml).unwrap();
+
+        let result = validate_profile_file(&profile_path);
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_find_profile_name_shadowed_by_directory() {
+        // The lookup key is also the path of an existing directory; the
+        // profile must still be found by name.
+        let temp_dir = tempfile::tempdir().unwrap();
+        let name = temp_dir.path().to_string_lossy().into_owned();
+        let profiles = vec![ProfileSource {
+            profile: ExtractionProfile {
+                name: name.clone(),
+                description: "shadowed".to_string(),
+                priority: 10,
+                match_expr: Default::default(),
+                extraction: None,
+                fields: HashMap::new(),
+            },
+            source: ProfileOrigin::BuiltIn,
+            overrides_builtin: false,
+        }];
+
+        let found = find_profile(&name, &profiles).unwrap();
+        assert_eq!(found.name, name);
+    }
+
+    #[test]
+    fn test_load_extraction_profiles_empty() {
+        let profiles = load_extraction_profiles(&[]).unwrap();
+        #[cfg(feature = "profiles")]
+        assert!(!profiles.is_empty()); // At least built-ins
+    }
+}
diff --git a/crates/pdftract-core/src/profiles/field_extractor.rs b/crates/pdftract-core/src/profiles/field_extractor.rs
new file mode 100644
index 0000000..2a8d24a
--- /dev/null
+++ b/crates/pdftract-core/src/profiles/field_extractor.rs
@@ -0,0 +1,353 @@
+//! Field extraction DSL evaluator (Phase 7.10).
+//!
+//! Evaluates field extraction specifications from profiles and extracts
+//! structured fields from document text. Supports:
+//! - Localizers: near, region, pick
+//! - Extractors: regex, parse
+//! - Strategies for disambiguating multiple candidates
+
+use super::extraction::{FieldExtraction, FieldSchema, FieldSpec};
+use crate::schema::BlockJson;
+use regex::Regex;
+use serde_json::Value;
+use std::collections::HashMap;
+
+/// Convert serde_yaml::Value to serde_json::Value.
+///
+/// Scalars, sequences and mappings are converted recursively. YAML tags are
+/// dropped and the tagged value is converted on its own. Numbers that JSON
+/// cannot represent (NaN, infinities) become `null`. String, bool and numeric
+/// mapping keys are kept (the latter two in their textual form); keys of any
+/// other kind cannot be expressed in JSON and their entries are skipped.
+fn convert_yaml_to_json(yaml_value: &serde_yaml::Value) -> Value {
+    match yaml_value {
+        serde_yaml::Value::Null => Value::Null,
+        serde_yaml::Value::Bool(b) => Value::Bool(*b),
+        serde_yaml::Value::Number(n) => {
+            if let Some(i) = n.as_i64() {
+                Value::Number(i.into())
+            } else if let Some(u) = n.as_u64() {
+                // Values above i64::MAX: keep them exact instead of letting
+                // them fall through to the lossy f64 branch.
+                Value::Number(u.into())
+            } else if let Some(f) = n.as_f64() {
+                serde_json::Number::from_f64(f).map(Value::Number).unwrap_or(Value::Null)
+            } else {
+                Value::Null
+            }
+        }
+        serde_yaml::Value::String(s) => Value::String(s.clone()),
+        serde_yaml::Value::Sequence(seq) => {
+            Value::Array(seq.iter().map(convert_yaml_to_json).collect())
+        }
+        serde_yaml::Value::Mapping(map) => {
+            let mut obj = serde_json::Map::new();
+            for (k, v) in map {
+                // JSON object keys must be strings
+                let key = match k {
+                    serde_yaml::Value::String(key_str) => key_str.clone(),
+                    serde_yaml::Value::Bool(b) => b.to_string(),
+                    serde_yaml::Value::Number(n) => n.to_string(),
+                    _ => continue,
+                };
+                obj.insert(key, convert_yaml_to_json(v));
+            }
+            Value::Object(obj)
+        }
+        serde_yaml::Value::Tagged(tagged) => convert_yaml_to_json(&tagged.value),
+    }
+}
+
+/// Result of field extraction.
+#[derive(Debug, Clone)]
+pub struct FieldExtractionResult {
+    /// Extracted field value (null if not found)
+    pub value: Value,
+    /// Human-readable extraction details (for debugging)
+    pub details: String,
+}
+
+/// Extract all fields from a profile against extracted document data.
+///
+/// # Arguments
+///
+/// * `fields` - Field specifications from the profile, keyed by field name
+/// * `blocks` - Extracted blocks from the document
+/// * `full_text` - Full document text
+///
+/// # Returns
+///
+/// A map with one entry per key of `fields`, holding that field's
+/// extraction result.
+pub fn extract_profile_fields(
+    fields: &HashMap<String, FieldSpec>,
+    blocks: &[BlockJson],
+    full_text: &str,
+) -> HashMap<String, FieldExtractionResult> {
+    fields
+        .iter()
+        .map(|(field_name, field_spec)| {
+            (
+                field_name.clone(),
+                extract_single_field(field_spec, blocks, full_text),
+            )
+        })
+        .collect()
+}
+
+/// Extract a single field from the document.
+///
+/// Dispatches on the kind of extraction the spec declares. The YAML fallback
+/// value, if any, is converted to JSON before it is handed on. For the rich
+/// form only `regex`, `near`, `max_distance_pt`, `region`, `pick` and `parse`
+/// are forwarded; the remaining options are currently not evaluated.
+fn extract_single_field(
+    field_spec: &FieldSpec,
+    blocks: &[BlockJson],
+    full_text: &str,
+) -> FieldExtractionResult {
+    match &field_spec.extraction {
+        FieldExtraction::Patterns { patterns, fallback } => {
+            let json_fallback = fallback.as_ref().map(convert_yaml_to_json);
+            extract_by_patterns(patterns, full_text, &json_fallback)
+        }
+        FieldExtraction::Rich {
+            regex,
+            near,
+            max_distance_pt,
+            region,
+            pick,
+            parse,
+            after: _,
+            after_heading: _,
+            table_region: _,
+            columnar_regions: _,
+            schema: _,
+            fallback,
+        } => {
+            let json_fallback = fallback.as_ref().map(convert_yaml_to_json);
+            extract_rich(
+                regex,
+                near,
+                max_distance_pt,
+                region,
+                pick,
+                parse,
+                blocks,
+                full_text,
+                &json_fallback,
+            )
+        }
+    }
+}
+
+/// Extract using simple pattern matching (fallback mode).
+///
+/// Patterns are tried in order against `full_text`; the first one that
+/// matches wins. The value is capture group 1 when it took part in the match,
+/// otherwise the whole match, and is always returned as a JSON string.
+/// When nothing matches, `fallback` (or `null`) is returned. Patterns that
+/// are not valid regular expressions are skipped and named in `details`.
+fn extract_by_patterns(
+    patterns: &[String],
+    full_text: &str,
+    fallback: &Option<Value>,
+) -> FieldExtractionResult {
+    // Remember broken patterns so a profile typo is not reported as "no match"
+    let mut invalid: Vec<&str> = Vec::new();
+
+    for pattern in patterns {
+        let re = match Regex::new(pattern) {
+            Ok(re) => re,
+            Err(_) => {
+                invalid.push(pattern.as_str());
+                continue;
+            }
+        };
+
+        if let Some(captures) = re.captures(full_text) {
+            // Use first capture group if available, otherwise full match
+            let value = captures
+                .get(1)
+                .or_else(|| captures.get(0))
+                .map_or("", |m| m.as_str());
+
+            return FieldExtractionResult {
+                value: Value::String(value.to_string()),
+                details: format!("Matched pattern '{}': '{}'", pattern, value),
+            };
+        }
+    }
+
+    // No match - use fallback or null
+    let mut details = "No patterns matched, using fallback".to_string();
+    if !invalid.is_empty() {
+        details.push_str(&format!(" (skipped invalid regex: {})", invalid.join(", ")));
+    }
+
+    FieldExtractionResult {
+        value: fallback.clone().unwrap_or(Value::Null),
+        details,
+    }
+}
+
+/// Extract using rich field extraction with localizers and extractors.
+fn extract_rich( + regex: &Option, + near: &Option>, + _max_distance_pt: &Option, + _region: &Option, + _pick: &Option, + parse: &Option, + _blocks: &[BlockJson], + full_text: &str, + fallback: &Option, +) -> FieldExtractionResult { + // For rich extraction, we need to find text near anchors + // This is a simplified version that searches the full text + + // Find anchor position if "near" is specified + let search_text = if let Some(anchors) = near { + // Find the position of the first anchor in the text + let anchor_pos = anchors + .iter() + .find_map(|anchor| full_text.find(anchor)) + .unwrap_or(0); + + // Search in text after the anchor + if let Some(pos) = full_text.get(anchor_pos..) { + pos + } else { + full_text + } + } else { + full_text + }; + + // Extract value using regex + let raw_value = if let Some(pattern) = regex { + extract_with_regex(pattern, search_text) + } else { + // If no regex, use the first few words from search text + search_text + .split_whitespace() + .next() + .unwrap_or("") + .to_string() + }; + + // Parse value according to type + let parsed_value = parse_value(&raw_value, parse.as_deref()); + + FieldExtractionResult { + value: parsed_value, + details: format!("Extracted value: '{}'", raw_value), + } +} + +/// Extract value using regex. +fn extract_with_regex(pattern: &str, text: &str) -> String { + match Regex::new(pattern) { + Ok(re) => { + if let Some(captures) = re.captures(text) { + captures + .get(1) + .or(captures.get(0)) + .map(|m| m.as_str().to_string()) + .unwrap_or_default() + } else { + String::new() + } + } + Err(_) => String::new(), + } +} + +/// Parse a value according to the specified type. 
+///
+/// `raw` is trimmed first. Supported `parse_type` values:
+/// - `"decimal"`: strips `$ € £ ¥` and `,`, then parses a float
+/// - `"int"`: parses a signed or unsigned 64-bit integer
+/// - `"bool"`: `true`/`yes`/`1` and `false`/`no`/`0`, case-insensitive
+/// - `"date"`, `"string"`, `None`, or anything else: the trimmed text as-is
+///
+/// Text that cannot be parsed as the requested decimal, int or bool yields
+/// `Value::Null`.
+fn parse_value(raw: &str, parse_type: Option<&str>) -> Value {
+    let raw = raw.trim();
+
+    match parse_type {
+        Some("decimal") => {
+            // Clean up currency symbols and thousands separators
+            let cleaned: String = raw
+                .chars()
+                .filter(|c| !matches!(c, '$' | '€' | '£' | '¥' | ','))
+                .collect();
+
+            cleaned
+                .parse::<f64>()
+                .ok()
+                // `from_f64` rejects NaN and infinities, which JSON cannot hold
+                .and_then(serde_json::Number::from_f64)
+                .map(Value::Number)
+                .unwrap_or(Value::Null)
+        }
+        // Parse as an integer explicitly so "1.5" or "1e3" is not accepted
+        // for an `int` field; u64 covers values above i64::MAX.
+        Some("int") => raw
+            .parse::<i64>()
+            .map(|v| Value::Number(v.into()))
+            .or_else(|_| raw.parse::<u64>().map(|v| Value::Number(v.into())))
+            .unwrap_or(Value::Null),
+        // Unrecognised text is "unknown", not "false"
+        Some("bool") => match raw.to_lowercase().as_str() {
+            "true" | "yes" | "1" => Value::Bool(true),
+            "false" | "no" | "0" => Value::Bool(false),
+            _ => Value::Null,
+        },
+        // Dates are not validated or normalised; like strings and unknown
+        // types they are passed through unchanged.
+        _ => Value::String(raw.to_string()),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_extract_by_patterns_simple() {
+        let full_text = "Invoice #12345\nTotal: $100.00";
+        let patterns = vec![r"Invoice #(\w+)".to_string()];
+
+        let result = extract_by_patterns(&patterns, full_text, &None);
+
+        assert_eq!(result.value, "12345");
+        assert!(result.details.contains("Matched pattern"));
+    }
+
+    #[test]
+    fn test_extract_by_patterns_no_match() {
+        let full_text = "Receipt #ABC";
+        let patterns = vec![r"Invoice #(\w+)".to_string()];
+        let fallback = Some(Value::String("UNKNOWN".to_string()));
+
+        let result = extract_by_patterns(&patterns, full_text, &fallback);
+
+        assert_eq!(result.value, "UNKNOWN");
+        assert!(result.details.contains("No patterns matched"));
+    }
+
+    #[test]
+    fn test_parse_value_decimal() {
+        assert_eq!(
+            parse_value("100.50", Some("decimal")),
+            Value::Number(serde_json::Number::from_f64(100.50).unwrap())
+        );
+        assert_eq!(
+            parse_value("$1,234.56", Some("decimal")),
+            Value::Number(serde_json::Number::from_f64(1234.56).unwrap())
+        );
+        assert_eq!(parse_value("invalid", Some("decimal")), Value::Null);
+    }
+
+    #[test]
+    fn test_parse_value_int() {
+        assert_eq!(parse_value("42", Some("int")), Value::Number(42.into()));
+        assert_eq!(parse_value("invalid", Some("int")), Value::Null);
+        assert_eq!(parse_value("1.5", Some("int")), Value::Null);
+    }
+
+    #[test]
+    fn test_parse_value_bool() {
+        assert_eq!(parse_value("true", Some("bool")), Value::Bool(true));
+        assert_eq!(parse_value("yes", Some("bool")), Value::Bool(true));
+        assert_eq!(parse_value("false", Some("bool")), Value::Bool(false));
+        assert_eq!(parse_value("no", Some("bool")), Value::Bool(false));
+        assert_eq!(parse_value("maybe", Some("bool")), Value::Null);
+    }
+
+    #[test]
+    fn test_parse_value_date() {
+        let result = parse_value("2025-01-15", Some("date"));
+        assert_eq!(result, Value::String("2025-01-15".to_string()));
+    }
+
+    #[test]
+    fn test_parse_value_string() {
+        assert_eq!(
+            parse_value("hello", Some("string")),
+            Value::String("hello".to_string())
+        );
+        assert_eq!(parse_value("world", None), Value::String("world".to_string()));
+    }
+
+    #[test]
+    fn test_extract_with_regex() {
+        let text = "Invoice: INV-2025-00123";
+        let pattern = r"Invoice:\s*([\w-]+)";
+
+        let result = extract_with_regex(pattern, text);
+        assert_eq!(result, "INV-2025-00123");
+    }
+
+    #[test]
+    fn test_extract_with_regex_no_match() {
+        let text = "Receipt: R-123";
+        let pattern = r"Invoice:\s*([\w-]+)";
+
+        let result = extract_with_regex(pattern, text);
+        assert!(result.is_empty());
+    }
+}
diff --git a/crates/pdftract-core/src/profiles/match_eval.rs b/crates/pdftract-core/src/profiles/match_eval.rs
new file mode 100644
index 0000000..0f33e9e
--- /dev/null
+++ b/crates/pdftract-core/src/profiles/match_eval.rs
@@ -0,0 +1,528 @@
+//! Match DSL evaluator for extraction profiles.
+//!
+//! Evaluates boolean match expressions (all/any/none combinators) against
+//! document signals to determine if a profile matches a document.
+ +use super::engine::FeatureSignals; +use super::extraction::{ExtractionMatchPredicate, MatchExpr, PageCountRange}; +use regex::Regex; +use std::collections::HashMap; +use std::sync::Mutex; + +/// Result of match evaluation. +#[derive(Debug, Clone, Default)] +pub struct MatchResult { + /// Whether the match succeeded + pub matched: bool, + + /// Human-readable reasons for the match (for debugging/metadata) + pub reasons: Vec, + + /// Confidence score (0.0-1.0) + pub confidence: f32, +} + +/// Evaluate a match expression against document signals. +/// +/// Returns a MatchResult indicating whether the expression matched and +/// providing reasons for the decision. +pub fn evaluate_match(expr: &MatchExpr, signals: &FeatureSignals) -> MatchResult { + match expr { + MatchExpr::Predicate(pred) => evaluate_predicate(pred, signals), + MatchExpr::All { all } => { + let mut result = MatchResult { + matched: true, + reasons: Vec::new(), + confidence: 1.0, + }; + + for sub_expr in all { + let sub_result = evaluate_match(sub_expr, signals); + result.reasons.extend(sub_result.reasons); + + if !sub_result.matched { + result.matched = false; + // Keep collecting reasons for debugging + } + result.confidence = result.confidence.min(sub_result.confidence); + } + + if result.matched { + result.reasons.push("all: all sub-expressions matched".to_string()); + } else { + result.reasons.push("all: some sub-expressions did not match".to_string()); + } + + result + } + MatchExpr::Any { any } => { + let mut best_result = MatchResult { + matched: false, + reasons: Vec::new(), + confidence: 0.0, + }; + + for sub_expr in any { + let sub_result = evaluate_match(sub_expr, signals); + + if sub_result.matched { + best_result.matched = true; + best_result.confidence = best_result.confidence.max(sub_result.confidence); + } + + best_result.reasons.extend(sub_result.reasons); + } + + if best_result.matched { + best_result + .reasons + .push("any: at least one sub-expression matched".to_string()); + } 
else { + best_result + .reasons + .push("any: no sub-expressions matched".to_string()); + } + + best_result + } + MatchExpr::None { none } => { + let mut result = MatchResult { + matched: true, + reasons: Vec::new(), + confidence: 1.0, + }; + + for sub_expr in none { + let sub_result = evaluate_match(sub_expr, signals); + + if sub_result.matched { + result.matched = false; + result.confidence = 0.0; + result + .reasons + .push(format!("none: excluded sub-expression matched: {:?}", sub_result.reasons)); + } + } + + if result.matched { + result.reasons.push("none: no excluded sub-expressions matched".to_string()); + } + + result + } + } +} + +/// Evaluate a single predicate against document signals. +fn evaluate_predicate(pred: &ExtractionMatchPredicate, signals: &FeatureSignals) -> MatchResult { + match pred { + ExtractionMatchPredicate::TextContains { patterns } => { + let text_lower = signals.text.to_lowercase(); + + for pattern in patterns { + if text_lower.contains(&pattern.to_lowercase()) { + return MatchResult { + matched: true, + reasons: vec![format!("text_contains: found '{}'", pattern)], + confidence: 0.8, + }; + } + } + + MatchResult { + matched: false, + reasons: vec!["text_contains: no patterns found".to_string()], + confidence: 0.0, + } + } + ExtractionMatchPredicate::TextMatches { pattern } => { + let regex = match compile_regex(pattern) { + Ok(re) => re, + Err(e) => { + return MatchResult { + matched: false, + reasons: vec![format!("text_matches: invalid regex '{}': {}", pattern, e)], + confidence: 0.0, + } + } + }; + + if regex.is_match(&signals.text) { + MatchResult { + matched: true, + reasons: vec![format!("text_matches: pattern '{}' matched", pattern)], + confidence: 0.7, + } + } else { + MatchResult { + matched: false, + reasons: vec![format!("text_matches: pattern '{}' did not match", pattern)], + confidence: 0.0, + } + } + } + ExtractionMatchPredicate::HeadingMatches { pattern } => { + let regex = match compile_regex(pattern) { + Ok(re) => 
re, + Err(e) => { + return MatchResult { + matched: false, + reasons: vec![format!("heading_matches: invalid regex '{}': {}", pattern, e)], + confidence: 0.0, + } + } + }; + + for heading in &signals.headings { + if regex.is_match(heading) { + return MatchResult { + matched: true, + reasons: vec![format!( + "heading_matches: heading '{}' matched pattern '{}'", + heading, pattern + )], + confidence: 0.75, + }; + } + } + + MatchResult { + matched: false, + reasons: vec![format!("heading_matches: no headings matched '{}'", pattern)], + confidence: 0.0, + } + } + ExtractionMatchPredicate::HasCurrencyPattern { + has_currency_pattern: true, + } => { + let has_currency = has_currency_pattern_impl(&signals.text); + MatchResult { + matched: has_currency, + reasons: vec![if has_currency { + "has_currency_pattern: currency pattern found".to_string() + } else { + "has_currency_pattern: no currency pattern".to_string() + }], + confidence: if has_currency { 0.6 } else { 0.0 }, + } + } + ExtractionMatchPredicate::HasCurrencyPattern { + has_currency_pattern: false, + } => MatchResult { + matched: true, // Negated predicate + reasons: vec!["has_currency_pattern: predicate disabled".to_string()], + confidence: 0.0, + }, + ExtractionMatchPredicate::HasSignatureField { + has_signature_field: true, + } => { + let has_sig = signals.has_signature_field; + MatchResult { + matched: has_sig, + reasons: vec![if has_sig { + "has_signature_field: signature fields found".to_string() + } else { + "has_signature_field: no signature fields".to_string() + }], + confidence: if has_sig { 0.5 } else { 0.0 }, + } + } + ExtractionMatchPredicate::HasSignatureField { + has_signature_field: false, + } => MatchResult { + matched: true, + reasons: vec!["has_signature_field: predicate disabled".to_string()], + confidence: 0.0, + }, + ExtractionMatchPredicate::TextContainsAlias { patterns } => { + // Alias for TextContains + let text_lower = signals.text.to_lowercase(); + + for pattern in patterns { + if 
text_lower.contains(&pattern.to_lowercase()) { + return MatchResult { + matched: true, + reasons: vec![format!("text_contains: found '{}'", pattern)], + confidence: 0.8, + }; + } + } + + MatchResult { + matched: false, + reasons: vec!["text_contains: no patterns found".to_string()], + confidence: 0.0, + } + } + ExtractionMatchPredicate::Structural { + has_table, + has_form_field, + has_math, + page_count, + } => { + let mut matched = true; + let mut reasons = Vec::new(); + let mut min_confidence = 1.0; + + if matches!(has_table, Some(true)) { + if signals.table_block_count > 0 { + reasons.push(format!("structural.has_table: {} tables found", signals.table_block_count)); + } else { + reasons.push("structural.has_table: no tables found".to_string()); + matched = false; + } + } + + if matches!(has_form_field, Some(true)) { + if signals.has_form_field { + reasons.push("structural.has_form_field: form fields found".to_string()); + } else { + reasons.push("structural.has_form_field: no form fields found".to_string()); + matched = false; + } + } + + if matches!(has_math, Some(true)) { + if signals.has_math_operators { + reasons.push("structural.has_math: math operators found".to_string()); + } else { + reasons.push("structural.has_math: no math operators".to_string()); + matched = false; + } + } + + if let Some(range) = page_count { + let page_count = signals.page_count as u32; + let in_range = match (&range.min, &range.max) { + (Some(min), Some(max)) => page_count >= *min && page_count <= *max, + (Some(min), None) => page_count >= *min, + (None, Some(max)) => page_count <= *max, + (None, None) => true, + }; + + if in_range { + reasons.push(format!("structural.page_count: {} is in range", page_count)); + } else { + reasons.push(format!( + "structural.page_count: {} is out of range {:?}", + page_count, range + )); + matched = false; + } + } + + MatchResult { + matched, + reasons, + confidence: if matched { min_confidence } else { 0.0 }, + } + } + } +} + +/// Check if text 
contains a currency pattern ($\d, €\d, £\d, ¥\d, etc.).
+///
+/// Only the presence of a currency symbol is tested; the character that
+/// follows the symbol is not inspected.
+fn has_currency_pattern_impl(text: &str) -> bool {
+    // These symbols have no case mapping, so the text is scanned as-is rather
+    // than allocating a lowercased copy of the whole document first.
+    text.chars().any(|c| matches!(c, '$' | '€' | '£' | '¥'))
+}
+
+/// Process-wide cache of compiled regexes, keyed by pattern source.
+///
+/// The map is created lazily on first use and guarded by a mutex. Its size is
+/// bounded by `compile_regex`, which empties it once it is full.
+fn get_regex_cache() -> &'static Mutex<HashMap<String, Regex>> {
+    use std::sync::OnceLock;
+    static CACHE: OnceLock<Mutex<HashMap<String, Regex>>> = OnceLock::new();
+    CACHE.get_or_init(|| Mutex::new(HashMap::new()))
+}
+
+/// Compile a regex pattern with caching.
+///
+/// Returns a clone of the cached regex when `pattern` has been compiled
+/// before; otherwise compiles it, stores it, and returns it. Compilation
+/// happens outside the lock, so two threads may compile the same new pattern
+/// concurrently; the later insert simply overwrites the earlier one.
+///
+/// # Errors
+///
+/// Returns the `regex::Error` from `Regex::new` when `pattern` is invalid.
+/// Invalid patterns are not cached.
+fn compile_regex(pattern: &str) -> Result<Regex, regex::Error> {
+    // Maximum number of compiled patterns retained before the cache is reset.
+    const MAX_CACHED_PATTERNS: usize = 100;
+
+    // The cache is only a memo table, so whatever a panicking thread left
+    // behind is still safe to reuse. Recover the guard instead of propagating
+    // the poison as a panic on every later call.
+    let lock_cache = || {
+        get_regex_cache()
+            .lock()
+            .unwrap_or_else(|poisoned| poisoned.into_inner())
+    };
+
+    // Fast path. The guard is a temporary and is released at the end of this
+    // statement, before any compilation work starts.
+    let cached = lock_cache().get(pattern).cloned();
+    if let Some(regex) = cached {
+        return Ok(regex);
+    }
+
+    // Compile and cache
+    let regex = Regex::new(pattern)?;
+    let mut cache = lock_cache();
+
+    // Not an LRU: when the cache is full it is emptied wholesale, which keeps
+    // memory bounded without tracking recency.
+    if cache.len() >= MAX_CACHED_PATTERNS {
+        cache.clear();
+    }
+
+    cache.insert(pattern.to_string(), regex.clone());
+    Ok(regex)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    // `use super::*` brings in `HashMap` but not `HashSet`.
+    use std::collections::HashSet;
+
+    fn test_signals() -> FeatureSignals {
+        let mut signals = FeatureSignals {
+            text: "Invoice #12345\nTotal: $100.00\nDue date: 2025-01-15".to_string(),
+            text_pattern_hits: HashMap::new(),
+            headings: HashSet::from(["Invoice".to_string(), "Total".to_string()]),
+            page_count: 2,
+            table_block_count: 1,
+            has_signature_field: false,
+            has_form_field: false,
+            has_math_operators: false,
+            has_bullet_lists: false,
+            font_diversity: 3,
+            heading_depth: 2,
+            glyph_density: 0.9,
+            has_footer_page_numbers: false,
+        };
+        signals.build_pattern_hits();
+        signals
+    }
+
+    #[test]
+    fn test_text_contains_match() {
+        let signals = test_signals();
+        let pred = ExtractionMatchPredicate::TextContains {
+            patterns: vec!["invoice".to_string()],
+        };
+
+        let result = evaluate_predicate(&pred, &signals);
+        assert!(result.matched);
+        assert_eq!(result.confidence, 0.8);
+    }
+
+    #[test]
+    fn
test_text_contains_no_match() { + let signals = test_signals(); + let pred = ExtractionMatchPredicate::TextContains { + patterns: vec!["receipt".to_string()], + }; + + let result = evaluate_predicate(&pred, &signals); + assert!(!result.matched); + } + + #[test] + fn test_heading_matches() { + let signals = test_signals(); + let pred = ExtractionMatchPredicate::HeadingMatches { + pattern: "^Invoice$".to_string(), + }; + + let result = evaluate_predicate(&pred, &signals); + assert!(result.matched); + } + + #[test] + fn test_has_currency_pattern() { + let signals = test_signals(); + let pred = ExtractionMatchPredicate::HasCurrencyPattern { + has_currency_pattern: true, + }; + + let result = evaluate_predicate(&pred, &signals); + assert!(result.matched); + } + + #[test] + fn test_structural_has_table() { + let signals = test_signals(); + let pred = ExtractionMatchPredicate::Structural { + has_table: Some(true), + has_form_field: Some(false), + has_math: Some(false), + page_count: Some(PageCountRange { + min: Some(1), + max: Some(5), + hint: None, + }), + }; + + let result = evaluate_predicate(&pred, &signals); + assert!(result.matched); + } + + #[test] + fn test_match_expr_all() { + let signals = test_signals(); + let expr = MatchExpr::All { + all: vec![ + MatchExpr::Predicate(ExtractionMatchPredicate::TextContains { + patterns: vec!["invoice".to_string()], + }), + MatchExpr::Predicate(ExtractionMatchPredicate::Structural { + has_table: Some(true), + has_form_field: Some(false), + has_math: Some(false), + page_count: None, + }), + ], + }; + + let result = evaluate_match(&expr, &signals); + assert!(result.matched); + assert!(result.reasons.iter().any(|r| r.contains("all: all sub-expressions matched"))); + } + + #[test] + fn test_match_expr_any() { + let signals = test_signals(); + let expr = MatchExpr::Any { + any: vec![ + MatchExpr::Predicate(ExtractionMatchPredicate::TextContains { + patterns: vec!["receipt".to_string()], + }), + 
MatchExpr::Predicate(ExtractionMatchPredicate::TextContains { + patterns: vec!["invoice".to_string()], + }), + ], + }; + + let result = evaluate_match(&expr, &signals); + assert!(result.matched); + } + + #[test] + fn test_match_expr_none() { + let signals = test_signals(); + let expr = MatchExpr::None { + none: vec![MatchExpr::Predicate(ExtractionMatchPredicate::TextContains { + patterns: vec!["abstract".to_string()], + })], + }; + + let result = evaluate_match(&expr, &signals); + assert!(result.matched); + } + + #[test] + fn test_match_expr_complex() { + let signals = test_signals(); + // (invoice OR receipt) AND has_table + let expr = MatchExpr::All { + all: vec![ + MatchExpr::Any { + any: vec![ + MatchExpr::Predicate(ExtractionMatchPredicate::TextContains { + patterns: vec!["invoice".to_string()], + }), + MatchExpr::Predicate(ExtractionMatchPredicate::TextContains { + patterns: vec!["receipt".to_string()], + }), + ], + }, + MatchExpr::Predicate(ExtractionMatchPredicate::Structural { + has_table: Some(true), + has_form_field: Some(false), + has_math: Some(false), + page_count: None, + }), + ], + }; + + let result = evaluate_match(&expr, &signals); + assert!(result.matched); + } +} diff --git a/crates/pdftract-core/src/profiles/mod.rs b/crates/pdftract-core/src/profiles/mod.rs index 2d4fc22..c75f84a 100644 --- a/crates/pdftract-core/src/profiles/mod.rs +++ b/crates/pdftract-core/src/profiles/mod.rs @@ -18,19 +18,35 @@ //! vocabulary between the rule engine, built-in profile definitions, and //! user-authored YAML profiles. 
+mod apply_profile; mod engine; +mod extraction; +mod extraction_loader; +mod field_extractor; mod loader; +mod match_eval; mod signals; mod types; +pub use apply_profile::{apply_extraction_tuning, apply_profile_to_metadata, classify_and_select_profile}; pub use engine::{ classify, has_currency_pattern, ClassificationResult, ClassifierEngine, FeatureSignals, }; +pub use extraction::{ + ExtractionProfile, ExtractionTuning, FieldExtraction, FieldSchema, FieldSpec, MatchExpr, + ExtractionMatchPredicate, +}; +pub use extraction_loader::{ + find_profile, get_xdg_profile_dir, load_extraction_profiles, load_profile_file, ProfileOrigin, + ProfileSource, validate_profile_file, +}; +pub use field_extractor::{extract_profile_fields, FieldExtractionResult}; pub use loader::{ check_forbidden_keys, load_profiles_from_dir, ForbiddenKeyError, ProfileLoadError, }; +pub use match_eval::{evaluate_match, MatchResult}; pub use signals::{extract_feature_signals, extract_signals_from_results, PageSignalAccumulator}; -pub use types::{MatchPredicate, Profile, ProfileType}; +pub use types::{MatchPredicate as ClassificationMatchPredicate, Profile, ProfileType}; use crate::diagnostics::DiagCode; diff --git a/profiles/builtin/bank_statement/profile.yaml b/profiles/builtin/bank_statement/profile.yaml index 6f91404..077f861 100644 --- a/profiles/builtin/bank_statement/profile.yaml +++ b/profiles/builtin/bank_statement/profile.yaml @@ -1,55 +1,64 @@ +# Bank Statement extraction profile +# Matches bank statements with account info, period, balances, transactions +name: bank_statement description: Bank statement with account info, period, balances, transactions priority: 42 + match: - any: - - text_patterns: - - "(?i)statement\\s+of\\s+account" - - "(?i)bank\\s+statement" - - "(?i)account\\s+statement" - - "(?i)transaction\\s+history" - - text_patterns: - - "(?i)opening\\s+balance" - - "(?i)closing\\s+balance" - - "(?i)statement\\s+period" - - "(?i)account\\s*#?\\s*:?\\s*\\*{4,}" + all: + - any: + 
- text_contains: + patterns: ["statement of account", "bank statement", "account statement", "transaction history"] + - text_contains: + patterns: ["opening balance", "closing balance", "statement period"] - structural: - - has_monetary_columnar_layout: true - - has_date_column: true - page_count_hint: 1-10 -profile_fields: + has_table: true + has_form_field: false + has_math: false + page_count: + min: 1 + max: 10 + +extraction: + reading_order: line_dominant + table_detection: default + readability_threshold: 0.5 + include_invisible: false + include_headers_footers: false + force_ocr: false + min_block_chars: 0 + +fields: account_number: type: string extraction: - patterns: - - "(?i)account\\s*(?:number|#|no)?\\s*:?,?\\s*(\\*?\\d[\\d\\*]{3,})" - - "(?i)acct\\s*(?:#|:)?\\s*(\\*?\\d[\\d\\*]{3,})" - fallback: null + regex: "account\\s*(?:number|#|no)?\\s*:?,?\\s*(\\*?\\d[\\d\\*]{3,})" + parse: string + statement_period: type: string extraction: - patterns: - - "(?i)statement\\s+period\\s*:?.*?([A-Za-z]+\\s+[0-9]{1,2}.*?through.*?[A-Za-z]+\\s+[0-9]{1,2},?\\s+[0-9]{4})" - - "(?i)period\\s*:?.*?([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})\\s+(?:to|through|-)\\s+([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})" - fallback: null + near: ["Statement Period", "Period"] + parse: string + opening_balance: type: decimal extraction: - patterns: - - "(?i)opening\\s+balance\\s*:?.*?[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)" - - "(?i)beginning\\s+balance\\s*:?.*?[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)" - fallback: null + near: ["Opening Balance", "Beginning Balance"] + regex: "([\\d,]+\\.\\d{2})" + parse: decimal + closing_balance: type: decimal extraction: - patterns: - - "(?i)closing\\s+balance\\s*:?.*?[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)" - - "(?i)ending\\s+balance\\s*:?.*?[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)" - - "(?i)current\\s+balance\\s*:?.*?[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)" - fallback: null + near: ["Closing Balance", "Ending Balance", "Current Balance"] + regex: "([\\d,]+\\.\\d{2})" + parse: decimal + 
transactions: type: array extraction: - table_region: "largest_table_or_central_body" + table_region: largest_table schema: - name: date type: date @@ -64,5 +73,3 @@ profile_fields: type: decimal required: false fallback: [] -reading_order: line_dominant -zone_filtering: exclude_headers_footers diff --git a/profiles/builtin/book_chapter/profile.yaml b/profiles/builtin/book_chapter/profile.yaml index 85b50ce..c7f70ea 100644 --- a/profiles/builtin/book_chapter/profile.yaml +++ b/profiles/builtin/book_chapter/profile.yaml @@ -1,68 +1,63 @@ -# Book Chapter Profile -# -# Book chapters, monographs, and long-form narrative documents. -# Extracts title, chapter_number, author, sections. - +# Book Chapter extraction profile +# Matches book chapters, monographs, and long-form narrative documents name: book_chapter description: Book chapters, monographs, long-form narrative documents priority: 5 -# Matching predicates for book chapter classification match: all: - # Page count in typical chapter range (not a whole book, not a single page) - structural: - page_count: {min: 5, max: 1000} - # Heading depth indicates structured content - - structural: - heading_depth: {min: 1, max: 5} - # AND EITHER: has chapter/section headings - # OR: has limited font diversity (not a dense academic paper) - # OR: matches chapter/section text patterns + has_table: false + has_form_field: false + has_math: false + page_count: + min: 5 + max: 1000 - any: - - text_matches: '^Chapter \d+' - - heading_matches: '^(Chapter|Part|Section) \d+' - - text_matches: '^\d+\.\s+[A-Z]' - - structural: - font_diversity: {min: 1, max: 4} + - text_matches: + pattern: "^Chapter \\d+" + - heading_matches: + pattern: "^(Chapter|Part|Section) \\d+" + - text_matches: + pattern: "^\\d+\\.\\s+[A-Z]" none: - # Exclude more specific document types - - text_contains: ['Abstract', 'WHEREAS', 'Invoice', 'Account Statement', 'References'] + - text_contains: + patterns: ["Abstract", "WHEREAS", "Invoice", "Account Statement", 
"References"] -# Extraction tuning for book chapters extraction: - # Use line_dominant reading order for narrative text flow reading_order: line_dominant - # Default table detection table_detection: default - # Higher readability threshold for narrative text quality readability_threshold: 0.6 - # Don't include invisible text include_invisible: false - # Exclude headers, footers, and page numbers from body content include_headers_footers: false + force_ocr: false + min_block_chars: 0 -# Field extraction specifications fields: title: type: string - region: top_third - pick: largest_font - page: first + extraction: + region: top_third + pick: largest_font + parse: string chapter_number: type: string - near: ['Chapter', 'Part'] - regex: '\d+' - max_distance_pt: 100 + extraction: + near: ["Chapter", "Part"] + regex: "\\d+" + max_distance_pt: 100 + parse: string author: type: string - region: top_quarter - pick: smallest_font - page: first + extraction: + region: top_quarter + pick: smallest_font + parse: string sections: type: array - pick: largest_font - per_page: true + extraction: + pick: largest_font + fallback: [] diff --git a/profiles/builtin/contract/profile.yaml b/profiles/builtin/contract/profile.yaml index 498a2f5..db24377 100644 --- a/profiles/builtin/contract/profile.yaml +++ b/profiles/builtin/contract/profile.yaml @@ -1,38 +1,66 @@ -# Contract profile for legal agreements -# Extracts parties, effective date, term, governing law, and signatures from contracts +# Contract extraction profile +# Matches legal contracts and agreements with parties, effective date, term, governing law, and signatures name: contract description: Legal contracts and agreements with parties, effective date, term, governing law, and signatures priority: 20 -# Matching predicates: identify documents as contracts match: all: - any: - - text_contains: ["AGREEMENT", "CONTRACT", "WHEREAS", "NOW THEREFORE", "In witness whereof"] - - heading_matches: '^(Agreement|Contract|Memorandum of 
Understanding)' - - structural: {page_count: {min: 2, max: 200}} + - text_contains: + patterns: ["AGREEMENT", "CONTRACT", "WHEREAS", "NOW THEREFORE", "In witness whereof"] + - heading_matches: + pattern: "^(Agreement|Contract|Memorandum of Understanding)" + - structural: + has_table: false + has_form_field: false + has_math: false + page_count: + min: 2 + max: 200 none: - - text_contains: ["Invoice #", "Receipt"] + - text_contains: + patterns: ["Invoice #", "Receipt"] -# Extraction tuning for contracts extraction: reading_order: xy_cut + table_detection: off readability_threshold: 0.5 + include_invisible: false include_headers_footers: false + force_ocr: false + min_block_chars: 0 -# Field extractors for contract-specific metadata fields: parties: - near: ["between", "party of the first part", "BY AND BETWEEN"] - pick: nearest_below + type: string + extraction: + near: ["between", "party of the first part", "BY AND BETWEEN"] + pick: nearest_below + parse: string + effective_date: - near: ["Effective Date", "Date of Agreement", "as of"] - parse: date + type: date + extraction: + near: ["Effective Date", "Date of Agreement", "as of"] + parse: date + term: - near: ["Term", "Initial Term", "expires on", "shall remain in effect"] - regex: '\d+\s+(years?|months?)|expires?\s+\d{4}' + type: string + extraction: + near: ["Term", "Initial Term", "expires on", "shall remain in effect"] + regex: "\\d+\\s+(years?|months?)|expires?\\s+\\d{4}" + parse: string + governing_law: - near: ["Governing Law", "governed by the laws of"] - pick: nearest_right + type: string + extraction: + near: ["Governing Law", "governed by the laws of"] + pick: nearest_right + parse: string + signatures: - region: bottom_quarter + type: array + extraction: + region: bottom_quarter + fallback: [] diff --git a/profiles/builtin/form/profile.yaml b/profiles/builtin/form/profile.yaml index 5028e88..8cc55d0 100644 --- a/profiles/builtin/form/profile.yaml +++ b/profiles/builtin/form/profile.yaml @@ -1,18 +1,34 
@@ +# Form extraction profile +# Matches fillable forms with fields; uses line_dominant reading order +name: form description: Fillable form with fields; uses line_dominant reading order and form_fields from Phase 7.4 priority: 30 + match: - any: - - text_patterns: - - "(?i)form\\s*[0-9A-Z-]+" - - "(?i)application\\s+form" - - "(?i)questionnaire" - - "(?i)please\\s+fill\\s+out" - - "(?i)required\\s+fields?" + all: + - any: + - text_contains: + patterns: ["form", "application form", "questionnaire", "please fill out", "required fields"] + - structural: + has_table: false + has_form_field: true + has_math: false + page_count: null - structural: - - has_form_field_layout: true - - has_blank_lines_with_colons: true - page_count_hint: 1-10 -profile_fields: {} -reading_order: line_dominant -zone_filtering: none -form_fields_integration: true + has_table: false + has_form_field: false + has_math: false + page_count: + min: 1 + max: 10 + +extraction: + reading_order: line_dominant + table_detection: off + readability_threshold: 0.5 + include_invisible: false + include_headers_footers: true + force_ocr: false + min_block_chars: 0 + +fields: {} diff --git a/profiles/builtin/invoice/profile.yaml b/profiles/builtin/invoice/profile.yaml index f77e55d..56a9046 100644 --- a/profiles/builtin/invoice/profile.yaml +++ b/profiles/builtin/invoice/profile.yaml @@ -1,81 +1,104 @@ +# Invoice extraction profile +# Matches commercial invoices with line items, vendor/customer, and totals +name: invoice description: Commercial invoice with line items, vendor/customer, and totals priority: 50 + match: - any: - - text_patterns: - - "(?i)invoice" - - "(?i)bill to" - - "(?i)invoice #" - - "(?i)invoice number" - - "(?i)tax invoice" - - text_patterns: - - "(?i)due date" - - "(?i)payment terms" - - "(?i)purchase order" - - "(?i)po #" - - structural: - - has_line_item_table: true - page_count_hint: 1-5 -profile_fields: + all: + - any: + - text_contains: + patterns: ["invoice", "bill to", "invoice 
#", "invoice number", "tax invoice"] + - heading_matches: + pattern: "^Invoice\\b" + - any: + - has_currency_pattern: + has_currency_pattern: true + - structural: + has_table: true + has_form_field: false + has_math: false + page_count: + min: 1 + max: 5 + none: + - text_contains: + patterns: ["abstract", "bibliography", "scientific paper"] + +extraction: + reading_order: line_dominant + table_detection: strict_borders + readability_threshold: 0.4 + include_invisible: false + include_headers_footers: false + force_ocr: false + min_block_chars: 0 + +fields: invoice_number: type: string extraction: - patterns: - - "(?i)invoice\\s*[#:]?\\s*([A-Z0-9-]+)" - - "(?i)bill\\s*invoice\\s*[#:]?\\s*([A-Z0-9-]+)" - fallback: null + regex: "Invoice\\s*#\\s*([\\w-]+)" + near: ["Invoice", "Invoice Number", "Invoice #"] + max_distance_pt: 200 + parse: string + vendor: type: string extraction: - patterns: - - "(?i)(?:from|vendor|supplier|company)\\s*:?\\s*([A-Z][A-Za-z0-9\\s&]+?)(?=\\n|\\r|$)" - - "(?i)^([A-Z][A-Za-z0-9\\s&]+)\\s+(?:Inc|LLC|Ltd|Corp|GmbH)" - fallback: null + region: top_quarter + pick: largest_font + customer: type: string extraction: - patterns: - - "(?i)(?:bill\\s*to|customer|client)\\s*:?\\s*([A-Z][A-Za-z0-9\\s&]+?)(?=\\n|\\r|$)" - fallback: null + near: ["Bill To", "Customer", "Sold To"] + max_distance_pt: 150 + pick: nearest_below + parse: string + invoice_date: type: date extraction: - patterns: - - "(?i)invoice\\s*date\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})" - - "(?i)date\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})" - fallback: null + near: ["Date", "Invoice Date"] + max_distance_pt: 100 + parse: date + due_date: type: date extraction: - patterns: - - "(?i)due\\s*date\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})" - - "(?i)payment\\s*due\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})" - fallback: null + near: ["Due Date", "Payment Due", "Due"] + max_distance_pt: 100 + parse: date + total: type: decimal extraction: - patterns: - - 
"(?i)total\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)" - - "(?i)amount\\s*due\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)" - fallback: null + regex: "([\\d,]+\\.\\d{2})" + near: ["Total", "Amount Due", "Balance Due", "Grand Total"] + max_distance_pt: 80 + parse: decimal + subtotal: type: decimal extraction: - patterns: - - "(?i)sub\\s*total\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)" - fallback: null + regex: "([\\d,]+\\.\\d{2})" + near: ["Subtotal", "Sub-Total"] + max_distance_pt: 80 + parse: decimal + tax: type: decimal extraction: - patterns: - - "(?i)tax\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)" - - "(?i)vat\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)" - - "(?i)gst\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)" - fallback: null + regex: "([\\d,]+\\.\\d{2})" + near: ["Tax", "VAT", "GST", "Sales Tax"] + max_distance_pt: 80 + parse: decimal + line_items: type: array extraction: - table_region: "largest_table_or_bottom_half" + table_region: largest_table schema: - name: description type: string @@ -90,5 +113,3 @@ profile_fields: type: decimal required: false fallback: [] -reading_order: line_dominant -zone_filtering: exclude_headers_footers diff --git a/profiles/builtin/legal_filing/profile.yaml b/profiles/builtin/legal_filing/profile.yaml index 7c65eb2..a64cf76 100644 --- a/profiles/builtin/legal_filing/profile.yaml +++ b/profiles/builtin/legal_filing/profile.yaml @@ -1,55 +1,62 @@ -# Legal Filing Profile -# -# Court filings: motions, briefs, orders, docket entries. -# Extracts case_number, court, parties, filing_date, docket_entries. 
- +# Legal Filing extraction profile +# Matches court filings: motions, briefs, orders, docket entries name: legal_filing -description: "Court filings: motions, briefs, orders, docket entries" +description: "Court filings: motions, briefs, orders, docket entries" priority: 40 -# Matching predicates for legal filing classification match: all: - # Must have at least one legal filing marker - any: - text_contains: - ["UNITED STATES DISTRICT COURT", "IN THE COURT OF", "IN THE MATTER OF", - "Case No.", "Civil Action No.", "Plaintiff", "Defendant", "Petitioner", - "Respondent", "COMPLAINT", "MOTION TO", "ORDER GRANTING", "OPINION"] - - heading_matches: '^(COMPLAINT|MOTION|ORDER|OPINION|BRIEF)' - # And appropriate page count - - structural: {page_count: {min: 1, max: 500}} + patterns: ["UNITED STATES DISTRICT COURT", "IN THE COURT OF", "IN THE MATTER OF", "Case No.", "Civil Action No.", "Plaintiff", "Defendant", "Petitioner", "Respondent", "COMPLAINT", "MOTION TO", "ORDER GRANTING", "OPINION"] + - heading_matches: + pattern: "^(COMPLAINT|MOTION|ORDER|OPINION|BRIEF)" + - structural: + has_table: false + has_form_field: false + has_math: false + page_count: + min: 1 + max: 500 -# Extraction tuning for legal filings extraction: - # Use xy_cut reading order for complex layouts reading_order: xy_cut - # Default table detection table_detection: default - # Standard readability threshold readability_threshold: 0.5 - # Include headers and footers (page numbers and citations are load-bearing in legal docs) - include_headers_footers: true - # Don't include invisible text include_invisible: false + include_headers_footers: true + force_ocr: false + min_block_chars: 0 -# Field extraction specifications fields: case_number: - near: ["Case No.", "Civil Action No.", "Docket No.", "Cause No."] - regex: '[\w-]+:?\s*\d+[\w-]*' - parse: string + type: string + extraction: + near: ["Case No.", "Civil Action No.", "Docket No.", "Cause No."] + regex: "[\\w-]+:?\\s*\\d+[\\w-]*" + parse: string
court: - region: top_quarter - pick: largest_font + type: string + extraction: + region: top_quarter + pick: largest_font + parse: string parties: - near: ["Plaintiff", "Defendant", "Petitioner", "Respondent", "v."] + type: array + extraction: + near: ["Plaintiff", "Defendant", "Petitioner", "Respondent", "v."] + fallback: [] filing_date: - near: ["Filed", "Date Filed", "Dated"] - parse: date + type: date + extraction: + near: ["Filed", "Date Filed", "Dated"] + parse: date docket_entries: - region: full + type: array + extraction: + region: bottom_half + fallback: [] diff --git a/profiles/builtin/receipt/profile.yaml b/profiles/builtin/receipt/profile.yaml index 831cf60..a6108e5 100644 --- a/profiles/builtin/receipt/profile.yaml +++ b/profiles/builtin/receipt/profile.yaml @@ -1,52 +1,67 @@ +# Receipt extraction profile +# Matches point-of-sale or purchase receipts with items and payment method +name: receipt description: Point-of-sale or purchase receipt with items, payment method priority: 45 + match: - any: - - text_patterns: - - "(?i)receipt" - - "(?i)store receipt" - - "(?i)register receipt" - - "(?i)transaction receipt" - - text_patterns: - - "(?i)total.*sold" - - "(?i)change.*due" - - "(?i)cash.*credit" - - "(?i)card.*payment" + all: + - any: + - text_contains: + patterns: ["receipt", "store receipt", "register receipt", "transaction receipt"] + - text_contains: + patterns: ["total sold", "change due", "cash credit", "card payment"] - structural: - - has_monetary_columnar_layout: true - - page_aspect_ratio: "narrow_or_square" - page_count_hint: 1 -profile_fields: + has_table: true + has_form_field: false + has_math: false + page_count: + min: 1 + max: 2 + +extraction: + reading_order: line_dominant + table_detection: default + readability_threshold: 0.5 + include_invisible: false + include_headers_footers: false + force_ocr: false + min_block_chars: 0 + +fields: merchant: type: string extraction: - patterns: - - "(?i)^([A-Z][A-Za-z0-9\\s&']+)$" - -
"(?i)(?:store|merchant|retailer)\\s*:?\\s*([A-Z][A-Za-z0-9\\s&']+)" - fallback: null + region: top_quarter + pick: largest_font + parse: string + date: type: date extraction: - patterns: - - "(?i)date\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})" - - "([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})\\s+([0-9]{1,2}:[0-9]{2})" - fallback: null + regex: "\\d{1,2}[/-]\\d{1,2}[/-]\\d{2,4}" + parse: date + total: type: decimal extraction: - patterns: - - "(?i)total\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)" - fallback: null + regex: "([\\d,]+\\.\\d{2})" + near: ["Total", "Amount Due", "Balance"] + max_distance_pt: 80 + parse: decimal + tax: type: decimal extraction: - patterns: - - "(?i)tax\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)" - fallback: null + regex: "([\\d,]+\\.\\d{2})" + near: ["Tax", "VAT"] + max_distance_pt: 80 + parse: decimal + items: type: array extraction: - columnar_regions: "monetary_columns" + table_region: largest_table schema: - name: name type: string @@ -58,11 +73,9 @@ profile_fields: type: decimal required: false fallback: [] + payment_method: type: string extraction: - patterns: - - "(?i)(cash|credit|debit|visa|mastercard|amex|discover|check|cheque)" - fallback: null -reading_order: line_dominant -zone_filtering: exclude_headers_footers + regex: "(?i)(cash|credit|debit|visa|mastercard|amex|discover|check|cheque)" + parse: string diff --git a/profiles/builtin/scientific_paper/profile.yaml b/profiles/builtin/scientific_paper/profile.yaml index eec7947..58edbe0 100644 --- a/profiles/builtin/scientific_paper/profile.yaml +++ b/profiles/builtin/scientific_paper/profile.yaml @@ -1,66 +1,87 @@ -# Scientific Paper Profile -# -# Academic papers from arXiv, journals, conference proceedings. -# Extracts title, authors, abstract, DOI, journal, publication_date, references.
- +# Scientific Paper extraction profile +# Matches academic papers from arXiv, journals, conference proceedings name: scientific_paper description: Academic papers from arXiv, journals, conference proceedings priority: 30 -# Matching predicates for scientific paper classification match: all: - # Must have at least one scientific paper marker - any: - - text_contains: ["Abstract", "References", "doi:", "arXiv:", "Bibliography"] - - heading_matches: '^(Abstract|Introduction|References|Bibliography)' - # And either has math OR structured headings OR appropriate page count + - text_contains: + patterns: ["Abstract", "References", "doi:", "arXiv:", "Bibliography"] + - heading_matches: + pattern: "^(Abstract|Introduction|References|Bibliography)" - any: - structural: + has_table: false + has_form_field: false has_math: true + page_count: null - structural: - heading_depth: {min: 2} - - structural: - page_count: {min: 4, max: 50} + has_table: false + has_form_field: false + has_math: false + page_count: + min: 4 + max: 50 + none: + - text_contains: + patterns: ["Invoice", "Receipt", "WHEREAS", "NOW THEREFORE"] -# Extraction tuning for scientific papers extraction: - # Use xy_cut reading order for 2-column layout handling reading_order: xy_cut - # Default table detection table_detection: default - # Standard readability threshold readability_threshold: 0.5 - # Don't include invisible text include_invisible: false + include_headers_footers: false + force_ocr: false + min_block_chars: 0 -# Field extraction specifications fields: title: - region: top_quarter - pick: largest_font + type: string + extraction: + region: top_quarter + pick: largest_font + parse: string authors: - region: top_quarter - pick: nearest_below - after: title + type: array + extraction: + region: top_quarter + pick: nearest_below + after_heading: title + fallback: [] abstract: - near: ["Abstract"] - region: top_half + type: string + extraction: + near: ["Abstract"] + region: top_half + parse: string 
doi: - regex: 'doi[:\.]\s*(10\.\d{4,9}/[\w\-\._;()/:]+)' - parse: string + type: string + extraction: + regex: "doi[:\\.]\\s*(10\\.\\d{4,9}/[\\w\\-\\._;()/:]+)" + parse: string journal: - region: top_eighth - pick: first + type: string + extraction: + region: top_eighth + pick: first + parse: string publication_date: - near: ["Published", "Received", "Accepted"] - parse: date + type: date + extraction: + near: ["Published", "Received", "Accepted"] + parse: date references: - region: bottom_half - after_heading: References + type: array + extraction: + region: bottom_half + after_heading: References + fallback: [] diff --git a/profiles/builtin/slide_deck/profile.yaml b/profiles/builtin/slide_deck/profile.yaml index 2d80bf4..853695b 100644 --- a/profiles/builtin/slide_deck/profile.yaml +++ b/profiles/builtin/slide_deck/profile.yaml @@ -1,64 +1,59 @@ -# Slide Deck Profile -# -# PowerPoint / Keynote / Google Slides exports as PDF. -# Extracts title, presenter, date, slide_titles. - +# Slide Deck extraction profile +# Matches PowerPoint / Keynote / Google Slides exports as PDF name: slide_deck description: PowerPoint / Keynote / Google Slides exports as PDF priority: 15 -# Matching predicates for slide deck classification match: all: - # Page count in typical slide deck range - structural: - page_count: {min: 3, max: 200} - # And EITHER: has limited font diversity (not a dense academic paper) - # OR: contains "Slide N" patterns - # OR: contains slide deck keywords + has_table: false + has_form_field: false + has_math: false + page_count: + min: 3 + max: 200 - any: - - structural: - has_form_field: false - font_diversity: {min: 2, max: 10} - - text_matches: '^Slide \d+$' - - text_contains: ["slides", "presentation"] + - text_matches: + pattern: "^Slide \\d+$" + - text_contains: + patterns: ["slides", "presentation"] none: - # Exclude academic papers (these have their own profile) - - text_contains: ["Abstract", "References", "WHEREAS", "Invoice"] + - text_contains: + 
patterns: ["Abstract", "References", "WHEREAS", "Invoice"] -# Extraction tuning for slide decks extraction: - # Use xy_cut reading order for proper layout handling reading_order: xy_cut - # Default table detection table_detection: default - # Lower readability threshold for slides (less text density) readability_threshold: 0.6 - # Don't include invisible text include_invisible: false - # Minimum block characters + include_headers_footers: false + force_ocr: false min_block_chars: 5 -# Field extraction specifications fields: title: type: string - region: middle_half - pick: largest_font - page: first + extraction: + region: top_half + pick: largest_font + parse: string presenter: type: string - region: bottom_half - pick: largest_font - page: first + extraction: + region: bottom_half + pick: largest_font + parse: string date: type: date - near: ["Date"] - parse: date + extraction: + near: ["Date"] + parse: date slide_titles: type: array - pick: largest_font - per_page: true + extraction: + pick: largest_font + fallback: [] diff --git a/tests/fixtures/profiles/invoice/01.pdf b/tests/fixtures/profiles/invoice/01.pdf new file mode 120000 index 0000000..d8b13d6 --- /dev/null +++ b/tests/fixtures/profiles/invoice/01.pdf @@ -0,0 +1 @@ +../../classifier/invoice/01.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/02.pdf b/tests/fixtures/profiles/invoice/02.pdf new file mode 120000 index 0000000..306c339 --- /dev/null +++ b/tests/fixtures/profiles/invoice/02.pdf @@ -0,0 +1 @@ +../../classifier/invoice/02.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/03.pdf b/tests/fixtures/profiles/invoice/03.pdf new file mode 120000 index 0000000..702c270 --- /dev/null +++ b/tests/fixtures/profiles/invoice/03.pdf @@ -0,0 +1 @@ +../../classifier/invoice/03.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/04.pdf b/tests/fixtures/profiles/invoice/04.pdf new file mode 120000 index 0000000..7c17a48 --- /dev/null
+++ b/tests/fixtures/profiles/invoice/04.pdf @@ -0,0 +1 @@ +../../classifier/invoice/04.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/05.pdf b/tests/fixtures/profiles/invoice/05.pdf new file mode 120000 index 0000000..e64cc74 --- /dev/null +++ b/tests/fixtures/profiles/invoice/05.pdf @@ -0,0 +1 @@ +../../classifier/invoice/05.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/06.pdf b/tests/fixtures/profiles/invoice/06.pdf new file mode 120000 index 0000000..66244f9 --- /dev/null +++ b/tests/fixtures/profiles/invoice/06.pdf @@ -0,0 +1 @@ +../../classifier/invoice/06.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/07.pdf b/tests/fixtures/profiles/invoice/07.pdf new file mode 120000 index 0000000..223285c --- /dev/null +++ b/tests/fixtures/profiles/invoice/07.pdf @@ -0,0 +1 @@ +../../classifier/invoice/07.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/08.pdf b/tests/fixtures/profiles/invoice/08.pdf new file mode 120000 index 0000000..dc931bb --- /dev/null +++ b/tests/fixtures/profiles/invoice/08.pdf @@ -0,0 +1 @@ +../../classifier/invoice/08.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/09.pdf b/tests/fixtures/profiles/invoice/09.pdf new file mode 120000 index 0000000..2dd9db9 --- /dev/null +++ b/tests/fixtures/profiles/invoice/09.pdf @@ -0,0 +1 @@ +../../classifier/invoice/09.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/10.pdf b/tests/fixtures/profiles/invoice/10.pdf new file mode 120000 index 0000000..07d5cdc --- /dev/null +++ b/tests/fixtures/profiles/invoice/10.pdf @@ -0,0 +1 @@ +../../classifier/invoice/10.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/11.pdf b/tests/fixtures/profiles/invoice/11.pdf new file mode 120000 index 0000000..25d9fe6 --- /dev/null +++ b/tests/fixtures/profiles/invoice/11.pdf @@ -0,0 +1 @@ +../../classifier/invoice/11.pdf \ No 
newline at end of file diff --git a/tests/fixtures/profiles/invoice/12.pdf b/tests/fixtures/profiles/invoice/12.pdf new file mode 120000 index 0000000..ae765c1 --- /dev/null +++ b/tests/fixtures/profiles/invoice/12.pdf @@ -0,0 +1 @@ +../../classifier/invoice/12.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/13.pdf b/tests/fixtures/profiles/invoice/13.pdf new file mode 120000 index 0000000..fb700a6 --- /dev/null +++ b/tests/fixtures/profiles/invoice/13.pdf @@ -0,0 +1 @@ +../../classifier/invoice/13.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/14.pdf b/tests/fixtures/profiles/invoice/14.pdf new file mode 120000 index 0000000..a8afbe5 --- /dev/null +++ b/tests/fixtures/profiles/invoice/14.pdf @@ -0,0 +1 @@ +../../classifier/invoice/14.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/15.pdf b/tests/fixtures/profiles/invoice/15.pdf new file mode 120000 index 0000000..1bb1bd8 --- /dev/null +++ b/tests/fixtures/profiles/invoice/15.pdf @@ -0,0 +1 @@ +../../classifier/invoice/15.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/16.pdf b/tests/fixtures/profiles/invoice/16.pdf new file mode 120000 index 0000000..4215735 --- /dev/null +++ b/tests/fixtures/profiles/invoice/16.pdf @@ -0,0 +1 @@ +../../classifier/invoice/16.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/17.pdf b/tests/fixtures/profiles/invoice/17.pdf new file mode 120000 index 0000000..f5e1e81 --- /dev/null +++ b/tests/fixtures/profiles/invoice/17.pdf @@ -0,0 +1 @@ +../../classifier/invoice/17.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/18.pdf b/tests/fixtures/profiles/invoice/18.pdf new file mode 120000 index 0000000..5eabb98 --- /dev/null +++ b/tests/fixtures/profiles/invoice/18.pdf @@ -0,0 +1 @@ +../../classifier/invoice/18.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/19.pdf 
b/tests/fixtures/profiles/invoice/19.pdf new file mode 120000 index 0000000..5577049 --- /dev/null +++ b/tests/fixtures/profiles/invoice/19.pdf @@ -0,0 +1 @@ +../../classifier/invoice/19.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/20.pdf b/tests/fixtures/profiles/invoice/20.pdf new file mode 120000 index 0000000..3586698 --- /dev/null +++ b/tests/fixtures/profiles/invoice/20.pdf @@ -0,0 +1 @@ +../../classifier/invoice/20.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/21.pdf b/tests/fixtures/profiles/invoice/21.pdf new file mode 120000 index 0000000..3014706 --- /dev/null +++ b/tests/fixtures/profiles/invoice/21.pdf @@ -0,0 +1 @@ +../../classifier/invoice/21.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/22.pdf b/tests/fixtures/profiles/invoice/22.pdf new file mode 120000 index 0000000..0abf9ec --- /dev/null +++ b/tests/fixtures/profiles/invoice/22.pdf @@ -0,0 +1 @@ +../../classifier/invoice/22.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/23.pdf b/tests/fixtures/profiles/invoice/23.pdf new file mode 120000 index 0000000..aef2edf --- /dev/null +++ b/tests/fixtures/profiles/invoice/23.pdf @@ -0,0 +1 @@ +../../classifier/invoice/23.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/24.pdf b/tests/fixtures/profiles/invoice/24.pdf new file mode 120000 index 0000000..a1e048f --- /dev/null +++ b/tests/fixtures/profiles/invoice/24.pdf @@ -0,0 +1 @@ +../../classifier/invoice/24.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/25.pdf b/tests/fixtures/profiles/invoice/25.pdf new file mode 120000 index 0000000..9352fc9 --- /dev/null +++ b/tests/fixtures/profiles/invoice/25.pdf @@ -0,0 +1 @@ +../../classifier/invoice/25.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/26.pdf b/tests/fixtures/profiles/invoice/26.pdf new file mode 120000 index 0000000..b18cec5 --- /dev/null +++ 
b/tests/fixtures/profiles/invoice/26.pdf @@ -0,0 +1 @@ +../../classifier/invoice/26.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/27.pdf b/tests/fixtures/profiles/invoice/27.pdf new file mode 120000 index 0000000..f528760 --- /dev/null +++ b/tests/fixtures/profiles/invoice/27.pdf @@ -0,0 +1 @@ +../../classifier/invoice/27.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/28.pdf b/tests/fixtures/profiles/invoice/28.pdf new file mode 120000 index 0000000..67326f8 --- /dev/null +++ b/tests/fixtures/profiles/invoice/28.pdf @@ -0,0 +1 @@ +../../classifier/invoice/28.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/29.pdf b/tests/fixtures/profiles/invoice/29.pdf new file mode 120000 index 0000000..61116c9 --- /dev/null +++ b/tests/fixtures/profiles/invoice/29.pdf @@ -0,0 +1 @@ +../../classifier/invoice/29.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/30.pdf b/tests/fixtures/profiles/invoice/30.pdf new file mode 120000 index 0000000..67b5460 --- /dev/null +++ b/tests/fixtures/profiles/invoice/30.pdf @@ -0,0 +1 @@ +../../classifier/invoice/30.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/31.pdf b/tests/fixtures/profiles/invoice/31.pdf new file mode 120000 index 0000000..ee68233 --- /dev/null +++ b/tests/fixtures/profiles/invoice/31.pdf @@ -0,0 +1 @@ +../../classifier/invoice/31.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/32.pdf b/tests/fixtures/profiles/invoice/32.pdf new file mode 120000 index 0000000..50bfcdf --- /dev/null +++ b/tests/fixtures/profiles/invoice/32.pdf @@ -0,0 +1 @@ +../../classifier/invoice/32.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/33.pdf b/tests/fixtures/profiles/invoice/33.pdf new file mode 120000 index 0000000..ec37664 --- /dev/null +++ b/tests/fixtures/profiles/invoice/33.pdf @@ -0,0 +1 @@ +../../classifier/invoice/33.pdf \ No newline 
at end of file diff --git a/tests/fixtures/profiles/invoice/34.pdf b/tests/fixtures/profiles/invoice/34.pdf new file mode 120000 index 0000000..c1ca2e4 --- /dev/null +++ b/tests/fixtures/profiles/invoice/34.pdf @@ -0,0 +1 @@ +../../classifier/invoice/34.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/35.pdf b/tests/fixtures/profiles/invoice/35.pdf new file mode 120000 index 0000000..ab688dd --- /dev/null +++ b/tests/fixtures/profiles/invoice/35.pdf @@ -0,0 +1 @@ +../../classifier/invoice/35.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/36.pdf b/tests/fixtures/profiles/invoice/36.pdf new file mode 120000 index 0000000..0498f5d --- /dev/null +++ b/tests/fixtures/profiles/invoice/36.pdf @@ -0,0 +1 @@ +../../classifier/invoice/36.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/37.pdf b/tests/fixtures/profiles/invoice/37.pdf new file mode 120000 index 0000000..81b0dcf --- /dev/null +++ b/tests/fixtures/profiles/invoice/37.pdf @@ -0,0 +1 @@ +../../classifier/invoice/37.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/38.pdf b/tests/fixtures/profiles/invoice/38.pdf new file mode 120000 index 0000000..b5a6256 --- /dev/null +++ b/tests/fixtures/profiles/invoice/38.pdf @@ -0,0 +1 @@ +../../classifier/invoice/38.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/39.pdf b/tests/fixtures/profiles/invoice/39.pdf new file mode 120000 index 0000000..25d350e --- /dev/null +++ b/tests/fixtures/profiles/invoice/39.pdf @@ -0,0 +1 @@ +../../classifier/invoice/39.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/40.pdf b/tests/fixtures/profiles/invoice/40.pdf new file mode 120000 index 0000000..caecc69 --- /dev/null +++ b/tests/fixtures/profiles/invoice/40.pdf @@ -0,0 +1 @@ +../../classifier/invoice/40.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/41.pdf 
b/tests/fixtures/profiles/invoice/41.pdf new file mode 120000 index 0000000..db049c1 --- /dev/null +++ b/tests/fixtures/profiles/invoice/41.pdf @@ -0,0 +1 @@ +../../classifier/invoice/41.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/42.pdf b/tests/fixtures/profiles/invoice/42.pdf new file mode 120000 index 0000000..0eeebdc --- /dev/null +++ b/tests/fixtures/profiles/invoice/42.pdf @@ -0,0 +1 @@ +../../classifier/invoice/42.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/43.pdf b/tests/fixtures/profiles/invoice/43.pdf new file mode 120000 index 0000000..a2ca931 --- /dev/null +++ b/tests/fixtures/profiles/invoice/43.pdf @@ -0,0 +1 @@ +../../classifier/invoice/43.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/44.pdf b/tests/fixtures/profiles/invoice/44.pdf new file mode 120000 index 0000000..87bd0cb --- /dev/null +++ b/tests/fixtures/profiles/invoice/44.pdf @@ -0,0 +1 @@ +../../classifier/invoice/44.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/45.pdf b/tests/fixtures/profiles/invoice/45.pdf new file mode 120000 index 0000000..6da3e39 --- /dev/null +++ b/tests/fixtures/profiles/invoice/45.pdf @@ -0,0 +1 @@ +../../classifier/invoice/45.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/46.pdf b/tests/fixtures/profiles/invoice/46.pdf new file mode 120000 index 0000000..cb2a2de --- /dev/null +++ b/tests/fixtures/profiles/invoice/46.pdf @@ -0,0 +1 @@ +../../classifier/invoice/46.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/47.pdf b/tests/fixtures/profiles/invoice/47.pdf new file mode 120000 index 0000000..8a2e335 --- /dev/null +++ b/tests/fixtures/profiles/invoice/47.pdf @@ -0,0 +1 @@ +../../classifier/invoice/47.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/48.pdf b/tests/fixtures/profiles/invoice/48.pdf new file mode 120000 index 0000000..f1b7859 --- /dev/null +++ 
b/tests/fixtures/profiles/invoice/48.pdf @@ -0,0 +1 @@ +../../classifier/invoice/48.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/49.pdf b/tests/fixtures/profiles/invoice/49.pdf new file mode 120000 index 0000000..692ad49 --- /dev/null +++ b/tests/fixtures/profiles/invoice/49.pdf @@ -0,0 +1 @@ +../../classifier/invoice/49.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/invoice/50.pdf b/tests/fixtures/profiles/invoice/50.pdf new file mode 120000 index 0000000..0a51c9b --- /dev/null +++ b/tests/fixtures/profiles/invoice/50.pdf @@ -0,0 +1 @@ +../../classifier/invoice/50.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/receipt/tampered-receipt.pdf b/tests/fixtures/profiles/receipt/tampered-receipt.pdf new file mode 120000 index 0000000..07bdd69 --- /dev/null +++ b/tests/fixtures/profiles/receipt/tampered-receipt.pdf @@ -0,0 +1 @@ +../../../sdk-conformance/fixtures/receipts/tampered-receipt.pdf \ No newline at end of file diff --git a/tests/fixtures/profiles/receipt/valid-receipt.pdf b/tests/fixtures/profiles/receipt/valid-receipt.pdf new file mode 120000 index 0000000..237ef2f --- /dev/null +++ b/tests/fixtures/profiles/receipt/valid-receipt.pdf @@ -0,0 +1 @@ +../../../sdk-conformance/fixtures/receipts/valid-receipt.pdf \ No newline at end of file