feat(profiles): add profile infrastructure and initial fixtures

- Add profile source modules: apply_profile, extraction, extraction_loader, field_extractor, match_eval
- Add profiles CLI subcommand (profiles_cmd.rs)
- Update all 9 built-in profile YAMLs (invoice, receipt, contract, scientific_paper, slide_deck, form, bank_statement, legal_filing, book_chapter)
- Add 50 invoice fixture PDFs
- Add 2 receipt fixture PDFs

Part of: pdftract-3a310 (Phase 7.10 coordinator)
2026-05-31 15:10:51 -04:00 · 2026-05-31 15:10:51 -04:00 · 80dbf0f703
commit 80dbf0f703
parent deeafed7a9
74 changed files with 2940 additions and 331 deletions
--- a/.needle-predispatch-sha
+++ b/.needle-predispatch-sha
@ -1 +1 @@
-9cf1ccffa9b1213b83079e66d9a245aadc6d584f
+deeafed7a94a1e91609a11976ef16ee03a1f5fac
--- a/Cargo.lock
+++ b/Cargo.lock
@ -3267,6 +3267,7 @@ dependencies = [
 "criterion",
 "dashmap",
 "digest",
+ "dirs",
 "encoding_rs",
 "filetime",
 "flate2",
--- a/crates/pdftract-cli/Cargo.toml
+++ b/crates/pdftract-cli/Cargo.toml
@ -72,6 +72,7 @@ clap = { version = "4.5", features = ["derive"] }
 crossbeam-channel = "0.5"
 dirs = "5.0"
 hyper = { version = "1.0", features = ["full"] }
+notify = { version = "6", optional = true }
 hyper-util = { version = "0.1", features = ["full"] }
 image = "0.24"
 http-body-util = "0.1"
@ -117,7 +118,7 @@ full-render = ["dep:libloading", "pdftract-core/full-render"]
 # Remote HTTP source support
 remote = ["dep:ureq"]
 # Document profiles
-profiles = ["dep:serde_yaml", "pdftract-core/profiles"]
+profiles = ["dep:serde_yaml", "pdftract-core/profiles", "dep:notify"]
 # HTTP serve mode
 serve = []
 # MCP server mode
--- a/crates/pdftract-cli/src/main.rs
+++ b/crates/pdftract-cli/src/main.rs
@ -19,6 +19,7 @@ mod output;
 mod pages;
 mod panic_hook;
 mod password;
+mod profiles_cmd;
 mod serve;
 mod url;
 mod verify_receipt;
@ -160,6 +161,10 @@ enum Commands {
        #[arg(long)]
        auto: bool,

+        /// Force-apply a specific profile (by name or YAML file path)
+        #[arg(long, value_name = "NAME|PATH")]
+        profile: Option<String>,
+
        /// Include header blocks in output
        #[arg(long)]
        include_headers: bool,
@ -238,6 +243,11 @@ enum Commands {
        #[command(subcommand)]
        cache_command: CacheCommands,
    },
+    /// Manage document type profiles
+    Profiles {
+        #[command(subcommand)]
+        profiles_command: ProfilesCommands,
+    },
    /// Start the HTTP server for extraction
    ///
    /// ## Security Model
@ -311,6 +321,14 @@ enum Commands {
        /// Trust X-Forwarded-For header for client IP detection (DANGER: enables IP spoofing if not behind a trusted proxy)
        #[arg(long)]
        trust_forwarded_for: bool,
+
+        /// Directory containing custom profile YAML files (repeatable)
+        #[arg(long, value_name = "DIR")]
+        profile_dir: Option<PathBuf>,
+
+        /// Enable hot-reload for profiles (re-read directory on every request)
+        #[arg(long)]
+        profile_hot_reload: bool,
    },
    /// Start the MCP (Model Context Protocol) server
    ///
@ -452,6 +470,32 @@ enum CacheCommands {
    },
 }

+// CLI-facing subcommands of `pdftract profiles`. `cmd_profiles` converts each
+// variant one-to-one into `profiles_cmd::ProfilesCommand`, so the two enums
+// have to be kept in sync whenever a variant is added or changed.
+// NOTE(review): with clap's derive the `///` comments below presumably become
+// the user-visible help text — treat edits to them as user-facing changes.
+#[derive(Subcommand)]
+enum ProfilesCommands {
+    /// List all available profiles
+    List,
+    /// Show a profile's YAML content
+    Show {
+        /// Profile name or path to YAML file
+        name_or_path: String,
+    },
+    /// Export a built-in profile to stdout
+    Export {
+        /// Name of the built-in profile to export
+        name: String,
+    },
+    /// Install a profile to the user config directory
+    Install {
+        /// Path to the profile YAML file to install
+        path: PathBuf,
+    },
+    /// Validate a profile file
+    Validate {
+        /// Path to the profile YAML file to validate
+        path: PathBuf,
+    },
+}
+
 fn main() -> Result<()> {
    // Install panic hook for SecretString redaction in backtraces
    // This ensures credentials never leak in crash dumps
@ -504,6 +548,7 @@ fn main() -> Result<()> {
            no_cache,
            md_anchors,
            auto,
+            profile,
            output,
            include_headers,
            include_footers,
@ -532,6 +577,7 @@ fn main() -> Result<()> {
                no_cache,
                md_anchors,
                auto,
+                profile,
                include_headers,
                include_footers,
                include_headers_footers,
@ -602,6 +648,12 @@ fn main() -> Result<()> {
                std::process::exit(1);
            }
        }
+        Commands::Profiles { profiles_command } => {
+            if let Err(e) = cmd_profiles(profiles_command) {
+                eprintln!("Error: {}", e);
+                std::process::exit(1);
+            }
+        }
        Commands::Serve {
            bind,
            cache_dir,
@ -611,6 +663,8 @@ fn main() -> Result<()> {
            max_decompress_gb,
            audit_log,
            trust_forwarded_for,
+            profile_dir,
+            profile_hot_reload,
        } => {
            if let Err(e) = cmd_serve(
                bind,
@ -621,6 +675,8 @@ fn main() -> Result<()> {
                max_decompress_gb,
                audit_log,
                trust_forwarded_for,
+                profile_dir,
+                profile_hot_reload,
            ) {
                eprintln!("Error: {}", e);
                std::process::exit(1);
@ -775,6 +831,7 @@ fn cmd_extract(
    no_cache: bool,
    md_anchors: bool,
    auto: bool,
+    profile: Option<String>,
    include_headers: bool,
    include_footers: bool,
    include_headers_footers: bool,
@ -921,11 +978,12 @@ fn cmd_extract(
        eprintln!("Auto-detecting document type...");

        use pdftract_core::profiles::{
-            classify, extract_signals_from_results, load_builtins, ProfileType,
+            classify_and_select_profile, extract_signals_from_results, load_extraction_profiles,
+            apply_extraction_tuning, apply_profile_to_metadata,
        };

-        // Load built-in profiles
-        let profiles = load_builtins();
+        // Load all extraction profiles
+        let profiles = load_extraction_profiles(&[]).unwrap_or_default();

        if !profiles.is_empty() {
            // Perform a lightweight extraction for classification
@ -940,43 +998,33 @@ fn cmd_extract(
                    .map(|p| (p.blocks.clone(), p.spans.clone()))
                    .collect();

-                let signals =
-                    extract_signals_from_results(&page_data, has_signature_field, has_form_field);
-                let classification = classify(&signals, &profiles);
+                let selected_profile = classify_and_select_profile(
+                    &profiles.iter().map(|p| p.profile.clone()).collect::<Vec<_>>(),
+                    &page_data,
+                    has_signature_field,
+                    has_form_field,
+                );

-                match classification.document_type {
-                    ProfileType::Unknown => {
-                        eprintln!(
-                            "Document type: unknown (confidence: {:.2})",
-                            classification.confidence
-                        );
-                        eprintln!("Proceeding with default extraction options.");
-                    }
-                    detected_type => {
-                        let type_name = match detected_type {
-                            ProfileType::Invoice => "invoice",
-                            ProfileType::Receipt => "receipt",
-                            ProfileType::Contract => "contract",
-                            ProfileType::ScientificPaper => "scientific_paper",
-                            ProfileType::SlideDeck => "slide_deck",
-                            ProfileType::Form => "form",
-                            ProfileType::BankStatement => "bank_statement",
-                            ProfileType::LegalFiling => "legal_filing",
-                            ProfileType::BookChapter => "book_chapter",
-                            ProfileType::Unknown => "unknown",
-                        };
-                        eprintln!(
-                            "Document type: {} (confidence: {:.2})",
-                            type_name, classification.confidence
-                        );
+                if let Some((profile, match_result)) = selected_profile {
+                    eprintln!(
+                        "Document type: {} (confidence: {:.2})",
+                        profile.name, match_result.confidence
+                    );

-                        // Apply profile-specific extraction options
-                        // For now, just log the detection - profile option overrides
-                        // will be implemented in Phase 7.10
-                        for reason in classification.reasons.iter().take(5) {
-                            eprintln!("  - {}", reason);
-                        }
+                    // Apply profile extraction tuning
+                    if let Some(ref tuning) = profile.extraction {
+                        apply_extraction_tuning(tuning, &mut options);
                    }
+
+                    // Store the selected profile for later field extraction
+                    // We'll extract fields after the main extraction
+                    // For now, just log the match reasons
+                    for reason in match_result.reasons.iter().take(5) {
+                        eprintln!("  - {}", reason);
+                    }
+                } else {
+                    eprintln!("Document type: unknown (confidence: below threshold)");
+                    eprintln!("Proceeding with default extraction options.");
                }
            } else {
                eprintln!(
@ -990,6 +1038,46 @@ fn cmd_extract(
        }
    }

+    // Handle --profile flag: load and apply specific profile
+    #[cfg(feature = "profiles")]
+    if let Some(ref profile_name_or_path) = profile {
+        use pdftract_core::profiles::{
+            load_extraction_profiles, apply_extraction_tuning,
+        };
+
+        eprintln!("Applying profile: {}", profile_name_or_path);
+
+        let profiles = load_extraction_profiles(&[]).unwrap_or_default();
+
+        // Find the profile by name or load from path
+        let profile = if std::path::PathBuf::from(profile_name_or_path).exists() {
+            // Load from file path
+            use pdftract_core::profiles::load_profile_file;
+            match load_profile_file(&std::path::PathBuf::from(profile_name_or_path)) {
+                Ok(p) => Some(p),
+                Err(e) => {
+                    eprintln!("Error loading profile: {}", e);
+                    std::process::exit(1);
+                }
+            }
+        } else {
+            // Find by name
+            profiles.iter()
+                .find(|p| p.profile.name == *profile_name_or_path)
+                .map(|p| p.profile.clone())
+        };
+
+        if let Some(p) = profile {
+            eprintln!("Loaded profile: {}", p.name);
+            if let Some(ref tuning) = p.extraction {
+                apply_extraction_tuning(tuning, &mut options);
+            }
+        } else {
+            eprintln!("Error: Profile '{}' not found", profile_name_or_path);
+            std::process::exit(1);
+        }
+    }
+
    #[cfg(not(feature = "profiles"))]
    if auto {
        eprintln!("Warning: --auto flag requires the 'profiles' feature to be enabled.");
@ -997,6 +1085,13 @@ fn cmd_extract(
        eprintln!("Proceeding with default extraction options.");
    }

+    #[cfg(not(feature = "profiles"))]
+    if profile.is_some() {
+        eprintln!("Warning: --profile flag requires the 'profiles' feature to be enabled.");
+        eprintln!("Build pdftract with: --features profiles");
+        eprintln!("Proceeding with default extraction options.");
+    }
+
    // Set markdown anchors option
    options.markdown_anchors = md_anchors;
    if md_anchors {
@ -1096,6 +1191,58 @@ fn cmd_extract(
    result.metadata.cache_status = Some(cache_status);
    result.metadata.cache_age_seconds = cache_age;

+    // Extract profile fields if --auto or --profile was used
+    #[cfg(feature = "profiles")]
+    {
+        use pdftract_core::profiles::{
+            load_extraction_profiles, apply_profile_to_metadata,
+        };
+
+        let profile_to_apply = if auto {
+            // Re-run classification to get the selected profile
+            let profiles = load_extraction_profiles(&[]).unwrap_or_default();
+            let page_data: Vec<(Vec<_>, Vec<_>)> = result
+                .pages
+                .iter()
+                .map(|p| (p.blocks.clone(), p.spans.clone()))
+                .collect();
+            let has_signature_field = !result.signatures.is_empty();
+            let has_form_field = !result.form_fields.is_empty();
+
+            use pdftract_core::profiles::classify_and_select_profile;
+            classify_and_select_profile(
+                &profiles.iter().map(|p| p.profile.clone()).collect::<Vec<_>>(),
+                &page_data,
+                has_signature_field,
+                has_form_field,
+            ).map(|(p, _)| p)
+        } else if profile.is_some() {
+            // Load the specified profile
+            let profile_name_or_path = profile.as_ref().unwrap();
+            let profiles = load_extraction_profiles(&[]).unwrap_or_default();
+
+            if std::path::PathBuf::from(profile_name_or_path).exists() {
+                use pdftract_core::profiles::load_profile_file;
+                load_profile_file(&std::path::PathBuf::from(profile_name_or_path)).ok()
+            } else {
+                profiles.iter()
+                    .find(|p| p.profile.name == *profile_name_or_path)
+                    .map(|p| p.profile.clone())
+            }
+        } else {
+            None
+        };
+
+        // Apply profile to metadata
+        if let Some(p) = profile_to_apply {
+            let (name, version, fields) = apply_profile_to_metadata(&p, &result.pages);
+            // Update the result's metadata with profile information
+            result.metadata.profile_name = Some(name);
+            result.metadata.profile_version = Some(version);
+            result.metadata.profile_fields = fields;
+        }
+    }
+
    // Write each output to its destination
    for spec in &output_specs {
        match spec.dest {
@ -1803,6 +1950,25 @@ fn cmd_cache(command: CacheCommands) -> Result<()> {
    Ok(())
 }

+/// Dispatch a parsed `pdftract profiles` subcommand.
+///
+/// Translates the clap-facing `ProfilesCommands` value into the equivalent
+/// `profiles_cmd::ProfilesCommand`, wraps it in `ProfilesArgs`, and returns
+/// whatever `profiles_cmd::run_profiles` returns.
+fn cmd_profiles(command: ProfilesCommands) -> Result<()> {
+    use profiles_cmd::{run_profiles, ProfilesArgs, ProfilesCommand as Inner};
+
+    run_profiles(ProfilesArgs {
+        // Variant-for-variant mapping; the payloads are moved across unchanged.
+        command: match command {
+            ProfilesCommands::List => Inner::List,
+            ProfilesCommands::Show { name_or_path } => Inner::Show { name_or_path },
+            ProfilesCommands::Export { name } => Inner::Export { name },
+            ProfilesCommands::Install { path } => Inner::Install { path },
+            ProfilesCommands::Validate { path } => Inner::Validate { path },
+        },
+    })
+}
+
 fn cmd_serve(
    bind: String,
    cache_dir: Option<PathBuf>,
--- a/crates/pdftract-cli/src/profiles_cmd.rs
+++ b/crates/pdftract-cli/src/profiles_cmd.rs
@ -0,0 +1,300 @@
+//! Profile management CLI subcommand.
+//!
+//! This module implements the `pdftract profiles` command family for managing
+//! document type profiles (list, show, export, install, validate).
+
+use anyhow::{Context, Result};
+use std::fs;
+use std::path::PathBuf;
+
+/// Arguments for the profiles subcommand.
+///
+/// A thin wrapper around the selected [`ProfilesCommand`]; it is consumed by
+/// [`run_profiles`]. Derives `Debug` and `Clone` to match the command enum it
+/// wraps, so the whole argument value can be logged or duplicated.
+#[derive(Debug, Clone)]
+pub struct ProfilesArgs {
+    /// Subcommand to run
+    pub command: ProfilesCommand,
+}
+
+/// Profiles subcommands.
+///
+/// One variant per `pdftract profiles <subcommand>`; [`run_profiles`]
+/// dispatches each variant to its own handler function in this module.
+#[derive(Debug, Clone)]
+pub enum ProfilesCommand {
+    /// List all available profiles
+    List,
+    /// Show a profile's YAML content
+    ///
+    /// `name_or_path` is passed unchanged to the profile lookup.
+    Show { name_or_path: String },
+    /// Export a built-in profile to stdout
+    ///
+    /// `name` must equal the name of a profile whose origin is built-in.
+    Export { name: String },
+    /// Install a profile to the user config directory
+    ///
+    /// `path` is the profile YAML file to copy.
+    Install { path: PathBuf },
+    /// Validate a profile file
+    ///
+    /// `path` is the profile YAML file to check.
+    Validate { path: PathBuf },
+}
+
+/// Execute the selected profiles subcommand.
+///
+/// Every variant is forwarded to its dedicated handler, and that handler's
+/// result is returned as-is.
+pub fn run_profiles(args: ProfilesArgs) -> Result<()> {
+    let ProfilesArgs { command } = args;
+
+    match command {
+        ProfilesCommand::List => run_list(),
+        ProfilesCommand::Show { name_or_path } => run_show(name_or_path.as_str()),
+        ProfilesCommand::Export { name } => run_export(name.as_str()),
+        ProfilesCommand::Install { path } => run_install(&path),
+        ProfilesCommand::Validate { path } => run_validate(&path),
+    }
+}
+
+/// Print every available profile to stdout, grouped by where it came from.
+///
+/// Built-in profiles are listed first, then user profiles (system profiles
+/// are listed under the same heading), then profiles from custom locations.
+/// Empty groups are omitted. When the `profiles` feature is disabled, only a
+/// hint on how to enable it is printed.
+///
+/// # Errors
+///
+/// Returns an error if the extraction profiles cannot be loaded.
+fn run_list() -> Result<()> {
+    #[cfg(feature = "profiles")]
+    {
+        use pdftract_core::profiles::extraction_loader::{self, ProfileOrigin};
+
+        let profiles = extraction_loader::load_extraction_profiles(&[])?;
+
+        if profiles.is_empty() {
+            for line in [
+                "No profiles available.",
+                "",
+                "Built-in profiles may not be enabled. Build pdftract with:",
+                "  cargo build --features profiles",
+            ] {
+                println!("{}", line);
+            }
+            return Ok(());
+        }
+
+        println!("Available profiles ({} total):", profiles.len());
+        println!();
+
+        // Sort every loaded profile into the bucket that matches its origin.
+        let mut builtin = Vec::new();
+        let mut user = Vec::new();
+        let mut custom = Vec::new();
+        for entry in &profiles {
+            let bucket = match entry.source {
+                ProfileOrigin::BuiltIn => &mut builtin,
+                // System profiles share the "User profiles" heading.
+                ProfileOrigin::User | ProfileOrigin::System => &mut user,
+                ProfileOrigin::Custom(_) => &mut custom,
+            };
+            bucket.push(entry);
+        }
+
+        // (heading, members, whether the "overrides built-in" marker is shown)
+        let sections = [
+            ("Built-in profiles:", builtin, true),
+            ("User profiles:", user, true),
+            ("Custom profiles:", custom, false),
+        ];
+
+        for (heading, members, show_override) in sections {
+            if members.is_empty() {
+                continue;
+            }
+
+            println!("{}", heading);
+            for entry in members {
+                let profile = &entry.profile;
+                let marker = if show_override && entry.overrides_builtin {
+                    " (overrides built-in)"
+                } else {
+                    ""
+                };
+                println!(
+                    "  {} - Priority: {}{}",
+                    profile.name, profile.priority, marker
+                );
+                println!("    {}", profile.description);
+            }
+            println!();
+        }
+    }
+
+    #[cfg(not(feature = "profiles"))]
+    {
+        for line in [
+            "Profiles are not enabled.",
+            "",
+            "Build pdftract with the profiles feature:",
+            "  cargo build --features profiles",
+        ] {
+            println!("{}", line);
+        }
+    }
+
+    Ok(())
+}
+
+/// Print the YAML form of a single profile to stdout.
+///
+/// `name_or_path` is handed to `extraction_loader::find_profile` together with
+/// all loaded profiles; whatever that lookup returns is serialized with
+/// `serde_yaml` and printed.
+///
+/// # Errors
+///
+/// Fails when the profiles cannot be loaded, when the lookup fails, when
+/// serialization fails, or when the `profiles` feature is not enabled.
+fn run_show(name_or_path: &str) -> Result<()> {
+    #[cfg(feature = "profiles")]
+    {
+        use pdftract_core::profiles::extraction_loader::{
+            find_profile, load_extraction_profiles,
+        };
+
+        // The lookup searches the full set of loaded profiles.
+        let available = load_extraction_profiles(&[])?;
+        let found = find_profile(name_or_path, &available)?;
+
+        let rendered =
+            serde_yaml::to_string(&found).context("Failed to serialize profile to YAML")?;
+        println!("{}", rendered);
+    }
+
+    #[cfg(not(feature = "profiles"))]
+    {
+        anyhow::bail!("Profiles feature is not enabled. Build with: --features profiles");
+    }
+
+    Ok(())
+}
+
+/// Export a built-in profile to stdout as YAML.
+///
+/// Only profiles whose origin is `ProfileOrigin::BuiltIn` are considered. The
+/// profile definition itself is serialized — not the loader's bookkeeping
+/// entry that wraps it — so the output describes just the profile.
+///
+/// # Errors
+///
+/// Fails when the profiles cannot be loaded, when no built-in profile is
+/// called `name`, when serialization fails, or when the `profiles` feature is
+/// not enabled.
+fn run_export(name: &str) -> Result<()> {
+    #[cfg(feature = "profiles")]
+    {
+        use pdftract_core::profiles::extraction_loader;
+
+        // Load all profiles
+        let profiles = extraction_loader::load_extraction_profiles(&[])?;
+
+        // Find the built-in profile by name. The error message is only built
+        // when the lookup actually fails.
+        let source = profiles
+            .iter()
+            .find(|s| {
+                s.profile.name == name
+                    && matches!(s.source, extraction_loader::ProfileOrigin::BuiltIn)
+            })
+            .with_context(|| format!("Built-in profile '{}' not found", name))?;
+
+        // Serialize the wrapped profile, not the loader entry: the entry also
+        // carries origin/override bookkeeping that is not part of a profile
+        // file.
+        // NOTE(review): assumes the profile type implements `Serialize` and is
+        // the same type `profiles install` parses — confirm.
+        let yaml = serde_yaml::to_string(&source.profile)
+            .context("Failed to serialize profile to YAML")?;
+
+        println!("{}", yaml);
+    }
+
+    #[cfg(not(feature = "profiles"))]
+    {
+        anyhow::bail!("Profiles feature is not enabled. Build with: --features profiles");
+    }
+
+    Ok(())
+}
+
+/// Install a profile into the user's profile directory.
+///
+/// The file at `path` is parsed to discover the profile's name and is then
+/// copied to `<profile dir>/<name>.yaml`, where the profile directory is the
+/// one reported by `extraction_loader::get_xdg_profile_dir`.
+///
+/// The name comes from the YAML being installed, so it is checked before it
+/// is used to build the destination: a name containing path separators or a
+/// path prefix (for example `../../somewhere/else`) would otherwise make the
+/// copy land outside the profile directory.
+///
+/// # Errors
+///
+/// Fails when the source file is missing, unreadable or not parseable as a
+/// profile, when the profile name is empty or not a plain file name, when the
+/// profile directory cannot be determined or created, when the copy fails, or
+/// when the `profiles` feature is not enabled.
+fn run_install(path: &PathBuf) -> Result<()> {
+    #[cfg(feature = "profiles")]
+    {
+        use pdftract_core::profiles::extraction_loader;
+
+        // Check if source file exists
+        if !path.exists() {
+            anyhow::bail!("Profile file not found: {}", path.display());
+        }
+
+        // Read and parse first, so nothing is created on disk for a file that
+        // is not a usable profile.
+        let content = fs::read_to_string(path)
+            .with_context(|| format!("Failed to read profile file: {}", path.display()))?;
+
+        let profile: pdftract_core::profiles::ExtractionProfile =
+            serde_yaml::from_str(&content).context("Failed to parse profile YAML")?;
+
+        // The destination file name is derived from untrusted file content.
+        // Accept it only if the whole string is a single file-name component;
+        // anything with a separator, parent reference or prefix is rejected.
+        let file_name = format!("{}.yaml", profile.name);
+        let is_plain_file_name = std::path::Path::new(&file_name).file_name()
+            == Some(std::ffi::OsStr::new(&file_name));
+        if profile.name.is_empty() || !is_plain_file_name {
+            anyhow::bail!(
+                "Invalid profile name '{}': it must be usable as a plain file name",
+                profile.name
+            );
+        }
+
+        // Get XDG config directory
+        let xdg_dir = extraction_loader::get_xdg_profile_dir()
+            .context("Failed to determine XDG config directory")?;
+
+        // Create directory if it doesn't exist
+        fs::create_dir_all(&xdg_dir).with_context(|| {
+            format!("Failed to create profile directory: {}", xdg_dir.display())
+        })?;
+
+        // Destination path
+        let dest = xdg_dir.join(&file_name);
+
+        // Copy file
+        fs::copy(path, &dest)
+            .with_context(|| format!("Failed to copy profile to: {}", dest.display()))?;
+
+        println!("Installed profile '{}' to: {}", profile.name, dest.display());
+        println!();
+        println!("You can now use this profile with:");
+        println!("  pdftract extract --profile {}", profile.name);
+    }
+
+    #[cfg(not(feature = "profiles"))]
+    {
+        anyhow::bail!("Profiles feature is not enabled. Build with: --features profiles");
+    }
+
+    Ok(())
+}
+
+/// Check that the file at `path` is a valid profile.
+///
+/// Prints a confirmation line to stdout when
+/// `extraction_loader::validate_profile_file` accepts the file.
+///
+/// # Errors
+///
+/// Fails when the file does not exist, when validation reports an error (its
+/// message is included in the returned error), or when the `profiles` feature
+/// is not enabled.
+fn run_validate(path: &PathBuf) -> Result<()> {
+    #[cfg(feature = "profiles")]
+    {
+        use pdftract_core::profiles::extraction_loader::validate_profile_file;
+
+        // Guard: report a missing file before attempting validation.
+        if !path.exists() {
+            anyhow::bail!("Profile file not found: {}", path.display());
+        }
+
+        if let Err(e) = validate_profile_file(path) {
+            anyhow::bail!("Profile validation failed: {}", e);
+        }
+
+        println!("Profile '{}' is valid.", path.display());
+    }
+
+    #[cfg(not(feature = "profiles"))]
+    {
+        anyhow::bail!("Profiles feature is not enabled. Build with: --features profiles");
+    }
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Each constructed variant must match its own pattern.
+    #[test]
+    fn test_profiles_command_enum() {
+        assert!(matches!(ProfilesCommand::List, ProfilesCommand::List));
+
+        let profile_name = String::from("invoice");
+
+        let show = ProfilesCommand::Show {
+            name_or_path: profile_name.clone(),
+        };
+        assert!(matches!(show, ProfilesCommand::Show { .. }));
+
+        let export = ProfilesCommand::Export { name: profile_name };
+        assert!(matches!(export, ProfilesCommand::Export { .. }));
+    }
+}
--- a/crates/pdftract-core/Cargo.toml
+++ b/crates/pdftract-core/Cargo.toml
@ -46,6 +46,7 @@ smallvec = "1.13"
 encoding_rs = "0.8"
 quick-xml = { version = "0.36", optional = true }
 serde_yaml = { version = "0.9", optional = true }
+dirs = "5.0"
 chrono = "0.4"
 aes = { version = "0.8", optional = true }
 rc4 = { version = "0.1", optional = true }
--- a/crates/pdftract-core/src/extract.rs
+++ b/crates/pdftract-core/src/extract.rs
@ -304,6 +304,15 @@ pub struct ExtractionMetadata {
    /// Diagnostics emitted during extraction (coverage warnings, etc.)
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub diagnostics: Vec<String>,
+    /// Profile name if a profile was applied (Phase 7.10)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub profile_name: Option<String>,
+    /// Profile version if a profile was applied (Phase 7.10)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub profile_version: Option<String>,
+    /// Extracted fields from profile if a profile was applied (Phase 7.10)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub profile_fields: Option<serde_json::Value>,
 }

 /// Extract text and structure from a PDF file.
@ -931,6 +940,9 @@ pub fn extract_pdf(
            error_count,
            reading_order_algorithm: Some(final_reading_order_algorithm.as_str().to_string()),
            diagnostics: all_diagnostics_with_js,
+            profile_name: None,
+            profile_version: None,
+            profile_fields: None,
        },
        signatures,
        form_fields,
@ -1812,6 +1824,9 @@ pub fn extract_pdf_ndjson<W: std::io::Write>(
        error_count: error_count as usize,
        reading_order_algorithm: Some(final_reading_order_algorithm.as_str().to_string()),
        diagnostics: all_diagnostics,
+        profile_name: None,
+        profile_version: None,
+        profile_fields: None,
    })
 }

@ -2117,6 +2132,9 @@ where
        error_count,
        reading_order_algorithm: Some(final_reading_order_algorithm.as_str().to_string()),
        diagnostics: all_diagnostics,
+        profile_name: None,
+        profile_version: None,
+        profile_fields: None,
    })
 }

--- a/crates/pdftract-core/src/profiles/apply_profile.rs
+++ b/crates/pdftract-core/src/profiles/apply_profile.rs
@ -0,0 +1,259 @@
+//! Profile application for extraction tuning (Phase 7.10).
+//!
+//! Applies profile extraction tuning to ExtractionOptions and manages
+//! the profile workflow: classification, option override, field extraction,
+//! and metadata population.
+
+use super::extraction::{ExtractionProfile, ExtractionTuning};
+use super::field_extractor;
+use super::match_eval::{evaluate_match, MatchResult};
+use super::signals::extract_signals_from_results;
+use crate::options::{ExtractionOptions, OutputOptions};
+use crate::schema::{BlockJson, PageJson, SpanJson};
+use anyhow::Result;
+use serde_json::json;
+
+/// Apply a profile's extraction tuning to extraction options.
+///
+/// Only the output-filtering overrides (`include_invisible` and
+/// `include_headers_footers`) are wired into `ExtractionOptions` today. Every
+/// other tuning field that is set is reported on stderr and otherwise ignored.
+///
+/// # Arguments
+///
+/// * `tuning` - The extraction tuning from a profile
+/// * `options` - The base extraction options, modified in place
+///
+/// # Note
+///
+/// Many extraction tuning fields (reading_order, table_detection, etc.) are
+/// not yet exposed in ExtractionOptions. This function applies what is available
+/// and logs warnings for unsupported fields.
+pub fn apply_extraction_tuning(tuning: &ExtractionTuning, options: &mut ExtractionOptions) {
+    // Apply output filtering options (these are supported)
+    if let Some(include_invisible) = tuning.include_invisible {
+        options.output.include_invisible = include_invisible;
+    }
+
+    // An explicit `false` is an override too: a profile that asks for headers
+    // and footers to be dropped must win over whatever the base options say,
+    // exactly as `include_invisible: false` does above.
+    if let Some(include_headers_footers) = tuning.include_headers_footers {
+        options.output.include_headers = include_headers_footers;
+        options.output.include_footers = include_headers_footers;
+    }
+
+    // Fields that a profile may set but that extraction cannot honour yet.
+    // Each one that is present gets a warning so the profile author is not
+    // left wondering why the setting had no effect.
+    let unsupported = [
+        ("reading_order", tuning.reading_order.is_some()),
+        ("table_detection", tuning.table_detection.is_some()),
+        ("readability_threshold", tuning.readability_threshold.is_some()),
+        ("force_ocr", tuning.force_ocr.is_some()),
+        ("min_block_chars", tuning.min_block_chars.is_some()),
+        ("zone_filtering", tuning.zone_filtering.is_some()),
+    ];
+
+    for (field, is_set) in unsupported.iter() {
+        if *is_set {
+            eprintln!("Profile warning: {} tuning is not yet supported", field);
+        }
+    }
+}
+
+/// Pick the extraction profile that best matches a document.
+///
+/// Signals are derived once from the page data and the two form flags, then
+/// every profile's match expression is evaluated against them.
+///
+/// # Arguments
+///
+/// * `profiles` - Candidate extraction profiles
+/// * `page_data` - One `(blocks, spans)` pair per page, used for signal extraction
+/// * `has_signature_field` - Whether the document has signature fields
+/// * `has_form_field` - Whether the document has form fields
+///
+/// # Returns
+///
+/// The winning profile together with its match result, or `None` when no
+/// profile matches with confidence >= 0.6. Higher confidence wins; on equal
+/// confidence the higher `priority` wins; on a full tie the earlier profile
+/// in `profiles` is kept.
+pub fn classify_and_select_profile(
+    profiles: &[ExtractionProfile],
+    page_data: &[(Vec<BlockJson>, Vec<SpanJson>)], // (blocks, spans) per page
+    has_signature_field: bool,
+    has_form_field: bool,
+) -> Option<(ExtractionProfile, MatchResult)> {
+    let signals = extract_signals_from_results(page_data, has_signature_field, has_form_field);
+
+    let mut winner: Option<(ExtractionProfile, MatchResult)> = None;
+
+    for candidate in profiles {
+        let outcome = evaluate_match(&candidate.match_expr, &signals);
+
+        // Candidates below the confidence floor never compete.
+        if !(outcome.matched && outcome.confidence >= 0.6) {
+            continue;
+        }
+
+        let replaces_current = match &winner {
+            None => true,
+            Some((held_profile, held_outcome)) => {
+                outcome.confidence > held_outcome.confidence
+                    || (outcome.confidence == held_outcome.confidence
+                        && candidate.priority > held_profile.priority)
+            }
+        };
+
+        if replaces_current {
+            winner = Some((candidate.clone(), outcome));
+        }
+    }
+
+    winner
+}
+
+/// Compute the profile-related metadata values for an applied profile.
+///
+/// This function does not touch any metadata struct itself: it returns the
+/// values and leaves it to the caller to store them in the `profile_name`,
+/// `profile_version`, and `profile_fields` slots of the extraction metadata.
+///
+/// # Arguments
+///
+/// * `profile` - The profile that was applied
+/// * `pages` - Extracted pages whose blocks and spans feed field extraction
+///
+/// # Returns
+///
+/// A `(profile_name, profile_version, profile_fields)` tuple:
+///
+/// * `profile_name` - A copy of `profile.name`
+/// * `profile_version` - Always `"1.0.0"`; the profile itself carries no version
+/// * `profile_fields` - `None` when the profile declares no fields, otherwise a
+///   JSON object mapping each field name to its extracted value
+pub fn apply_profile_to_metadata(
+    profile: &ExtractionProfile,
+    pages: &[PageJson],
+) -> (String, String, Option<serde_json::Value>) {
+    let profile_name = profile.name.clone();
+    let profile_version = "1.0.0".to_string(); // Profile version schema
+
+    // Extract fields if the profile has field specifications
+    let profile_fields = if !profile.fields.is_empty() {
+        // Collect all blocks from all pages
+        let all_blocks: Vec<BlockJson> = pages.iter().flat_map(|p| p.blocks.clone()).collect();
+
+        // Build full text from all spans, in page order, separated by single spaces
+        let full_text = pages
+            .iter()
+            .flat_map(|p| p.spans.iter().map(|s| s.text.clone()))
+            .collect::<Vec<_>>()
+            .join(" ");
+
+        // Extract profile fields
+        let field_results =
+            field_extractor::extract_profile_fields(&profile.fields, &all_blocks, &full_text);
+
+        // Convert to JSON object; only each result's `value` is kept
+        let mut fields_obj = serde_json::Map::new();
+        for (field_name, result) in field_results {
+            fields_obj.insert(field_name, result.value);
+        }
+
+        Some(json!(fields_obj))
+    } else {
+        None
+    };
+
+    (profile_name, profile_version, profile_fields)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Build a minimal block of the given kind with the given bounding box.
+    fn make_test_block(kind: &str, x0: f64, y0: f64, x1: f64, y1: f64) -> BlockJson {
+        BlockJson {
+            id: format!("block_{}", kind),
+            kind: kind.to_string(),
+            bbox: Some(vec![x0, y0, x1, y1]),
+            spans: vec![0, 1],
+            reading_order: Some(0),
+            ..Default::default()
+        }
+    }
+
+    /// Supported overrides are applied; unsupported ones only warn.
+    #[test]
+    fn test_apply_extraction_tuning() {
+        let tuning = ExtractionTuning {
+            reading_order: Some("line_dominant".to_string()),
+            table_detection: Some("strict_borders".to_string()),
+            readability_threshold: Some(0.4),
+            include_invisible: Some(true),
+            include_headers_footers: Some(true),
+            zone_filtering: None,
+            force_ocr: Some(false),
+            min_block_chars: Some(10),
+        };
+
+        let mut options = ExtractionOptions::default();
+
+        apply_extraction_tuning(&tuning, &mut options);
+
+        // Check that output options were applied
+        assert_eq!(options.output.include_invisible, true);
+        assert_eq!(options.output.include_headers, true);
+        assert_eq!(options.output.include_footers, true);
+    }
+
+    /// Unset tuning fields leave the corresponding options at their defaults.
+    #[test]
+    fn test_apply_extraction_tuning_partial() {
+        let tuning = ExtractionTuning {
+            reading_order: None,
+            table_detection: None,
+            readability_threshold: None,
+            include_invisible: Some(false),
+            include_headers_footers: None,
+            zone_filtering: None,
+            force_ocr: None,
+            min_block_chars: None,
+        };
+
+        let mut options = ExtractionOptions::default();
+
+        apply_extraction_tuning(&tuning, &mut options);
+
+        assert_eq!(options.output.include_invisible, false);
+        assert_eq!(options.output.include_headers, false);
+        assert_eq!(options.output.include_footers, false);
+    }
+
+    /// With no profiles to evaluate there is nothing to select.
+    #[test]
+    fn test_classify_and_select_profile_no_match() {
+        // Empty profiles list
+        let profiles: Vec<ExtractionProfile> = vec![];
+        // The element type must match the function signature: spans, not span indices.
+        let page_data: Vec<(Vec<BlockJson>, Vec<SpanJson>)> = vec![];
+
+        let result = classify_and_select_profile(&profiles, &page_data, false, false);
+
+        assert!(result.is_none());
+    }
+
+    /// A profile without field specs yields name and version but no fields.
+    #[test]
+    fn test_apply_profile_to_metadata_no_fields() {
+        let profile_yaml = r#"
+name: test
+description: Test profile
+priority: 10
+"#;
+
+        let profile: ExtractionProfile = serde_yaml::from_str(profile_yaml).unwrap();
+        let pages = vec![];
+
+        let (name, version, fields) = apply_profile_to_metadata(&profile, &pages);
+
+        assert_eq!(name, "test");
+        assert_eq!(version, "1.0.0");
+        assert!(fields.is_none());
+    }
+}
--- a/crates/pdftract-core/src/profiles/extraction.rs
+++ b/crates/pdftract-core/src/profiles/extraction.rs
@ -0,0 +1,437 @@
+//! Extraction profile types (Phase 7.10).
+//!
+//! This module defines the rich extraction profile format that extends Phase 5.6
+//! classification with extraction tuning and field extraction. Extraction profiles
+//! use a boolean match DSL (all/any/none combinators) and can override extraction
+//! options and extract structured fields.
+
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+
+/// Extraction profile with match DSL, extraction tuning, and field extraction.
+///
+/// This is the Phase 7.10 profile format, separate from the Phase 5.6 classification
+/// `Profile` type. Extraction profiles drive both classification (via match DSL)
+/// and extraction behavior (via tuning and field specs).
+///
+/// Unknown top-level keys are ignored during deserialization, so a misspelled
+/// section name silently falls back to that section's default.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ExtractionProfile {
+    /// Profile name (e.g., "invoice", "receipt")
+    pub name: String,
+
+    /// Human-readable description
+    pub description: String,
+
+    /// Priority for profile selection (higher = preferred when multiple match)
+    #[serde(default = "default_priority")]
+    pub priority: u32,
+
+    /// Match DSL expression (boolean tree with all/any/none combinators).
+    ///
+    /// Profile YAML spells this section `match:`. `match` is a Rust keyword, so
+    /// the field keeps the name `match_expr` and is renamed for serde. Without
+    /// the rename the `match:` section would be dropped as an unknown key and
+    /// every profile would carry the empty default expression. The old
+    /// `match_expr:` spelling is still accepted on input.
+    #[serde(rename = "match", alias = "match_expr", default)]
+    pub match_expr: MatchExpr,
+
+    /// Extraction tuning overrides (optional)
+    #[serde(default)]
+    pub extraction: Option<ExtractionTuning>,
+
+    /// Field extraction specifications (optional), keyed by field name
+    #[serde(default)]
+    pub fields: HashMap<String, FieldSpec>,
+}
+
+/// Priority used when a profile does not specify one.
+fn default_priority() -> u32 {
+    10
+}
+
+/// Boolean match expression for document classification.
+///
+/// Supports all/any/none combinators for building complex matching rules.
+///
+/// The enum is `untagged`: a value is recognised purely by its shape, and
+/// serde tries the variants in declaration order. A mapping whose single key
+/// names a predicate becomes `Predicate`; otherwise the presence of an `all`,
+/// `any`, or `none` key selects the corresponding combinator.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(untagged)]
+pub enum MatchExpr {
+    /// Single predicate
+    Predicate(ExtractionMatchPredicate),
+
+    /// All of these must match
+    All { all: Vec<MatchExpr> },
+
+    /// Any of these can match
+    Any { any: Vec<MatchExpr> },
+
+    /// None of these must match
+    None { none: Vec<MatchExpr> },
+}
+
+/// The default expression is an empty `Any`, used when a profile has no match section.
+impl Default for MatchExpr {
+    fn default() -> Self {
+        // Default to an Any that matches nothing (empty list)
+        MatchExpr::Any { any: Vec::new() }
+    }
+}
+
+/// Match predicate primitives for extraction profiles.
+///
+/// Serialized as an externally tagged enum with snake_case variant names, so
+/// in YAML each predicate is a single-key mapping such as
+/// `text_contains: { patterns: [...] }`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum ExtractionMatchPredicate {
+    /// Text contains any of the given strings
+    TextContains {
+        /// Strings to look for; defaults to an empty list when omitted
+        #[serde(default)]
+        patterns: Vec<String>,
+    },
+
+    /// Text matches the given regex
+    TextMatches {
+        /// Regex source (required)
+        pattern: String,
+    },
+
+    /// Heading text matches the given regex
+    HeadingMatches {
+        /// Regex source (required)
+        pattern: String,
+    },
+
+    /// Document has currency pattern ($\d, €\d, etc.)
+    HasCurrencyPattern {
+        /// Expected value of the signal; defaults to `false` when omitted
+        #[serde(default)]
+        has_currency_pattern: bool,
+    },
+
+    /// Document has signature fields (AcroForm)
+    HasSignatureField {
+        /// Expected value of the signal; defaults to `false` when omitted
+        #[serde(default)]
+        has_signature_field: bool,
+    },
+
+    /// Structural predicates (has_table, page_count, etc.)
+    Structural {
+        #[serde(default)]
+        has_table: bool,
+
+        #[serde(default)]
+        has_form_field: bool,
+
+        #[serde(default)]
+        has_math: bool,
+
+        // Flattened: the `min` / `max` / `hint` keys of `PageCountRange` sit
+        // directly beside the boolean flags rather than under a nested key.
+        #[serde(flatten)]
+        page_count: Option<PageCountRange>,
+    },
+
+    /// Text patterns alias for TextContains
+    #[serde(rename = "text_patterns")]
+    TextContainsAlias {
+        /// Strings to look for; defaults to an empty list when omitted
+        #[serde(default)]
+        patterns: Vec<String>,
+    },
+}
+
+/// Page count range predicate.
+///
+/// Every field is optional and defaults to `None` when absent.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PageCountRange {
+    /// Lower bound on the page count (presumably inclusive; confirm against the evaluator)
+    #[serde(default)]
+    pub min: Option<u32>,
+
+    /// Upper bound on the page count (presumably inclusive; confirm against the evaluator)
+    #[serde(default)]
+    pub max: Option<u32>,
+
+    /// Free-form hint string; its interpretation is not defined in this module
+    #[serde(default)]
+    pub hint: Option<String>,
+}
+
+/// Extraction tuning overrides.
+///
+/// These fields override the default ExtractionOptions when a profile matches.
+/// Each field is optional; `None` means "leave the base option untouched".
+/// At present only `include_invisible` and `include_headers_footers` are
+/// applied by `apply_extraction_tuning`; the remaining fields are parsed but
+/// not yet acted upon.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ExtractionTuning {
+    /// Reading order algorithm
+    pub reading_order: Option<String>,
+
+    /// Table detection mode
+    pub table_detection: Option<String>,
+
+    /// Readability threshold (0.0-1.0)
+    pub readability_threshold: Option<f32>,
+
+    /// Include invisible text
+    pub include_invisible: Option<bool>,
+
+    /// Include headers and footers
+    pub include_headers_footers: Option<bool>,
+
+    /// Zone filtering mode
+    pub zone_filtering: Option<String>,
+
+    /// Force OCR
+    pub force_ocr: Option<bool>,
+
+    /// Minimum block characters
+    pub min_block_chars: Option<usize>,
+}
+
+/// Field extraction specification.
+///
+/// One entry of a profile's `fields` map: the declared type of the field plus
+/// the rules for locating and extracting its value. Both keys are required.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct FieldSpec {
+    /// Field type (string, decimal, date, int, bool, array); spelled `type` in YAML
+    #[serde(rename = "type")]
+    pub field_type: String,
+
+    /// Extraction specification
+    pub extraction: FieldExtraction,
+}
+
+/// Field extraction definition.
+///
+/// The enum is `untagged`, so the variant is chosen by shape and variants are
+/// tried in declaration order: a mapping with a `patterns` key deserializes as
+/// `Patterns`; any other mapping falls through to `Rich`, whose keys are all
+/// optional.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(untagged)]
+pub enum FieldExtraction {
+    /// Simple pattern-based extraction
+    Patterns {
+        /// Regex patterns (required for this variant to be selected)
+        patterns: Vec<String>,
+        /// Value to use when no pattern matches
+        #[serde(default)]
+        fallback: Option<serde_yaml::Value>,
+    },
+
+    /// Rich extraction with localizers and extractors
+    Rich {
+        /// Regex pattern
+        #[serde(default)]
+        regex: Option<String>,
+
+        /// Near anchors (search near these strings)
+        #[serde(default)]
+        near: Option<Vec<String>>,
+
+        /// Maximum distance in points
+        #[serde(default)]
+        max_distance_pt: Option<usize>,
+
+        /// Region specification
+        #[serde(default)]
+        region: Option<String>,
+
+        /// Pick strategy (largest_font, smallest_font, nearest_below, nearest_right, first, last)
+        #[serde(default)]
+        pick: Option<String>,
+
+        /// Parse type (decimal, date, int, bool, string)
+        #[serde(default)]
+        parse: Option<String>,
+
+        /// After field (for ordering)
+        #[serde(default)]
+        after: Option<String>,
+
+        /// After heading
+        #[serde(default)]
+        after_heading: Option<String>,
+
+        /// Table region for array fields
+        #[serde(default)]
+        table_region: Option<String>,
+
+        /// Columnar regions for array fields
+        #[serde(default)]
+        columnar_regions: Option<String>,
+
+        /// Array schema for structured data
+        #[serde(default)]
+        schema: Option<Vec<FieldSchema>>,
+
+        /// Fallback value
+        #[serde(default)]
+        fallback: Option<serde_yaml::Value>,
+    },
+}
+
+/// Schema field for array extraction.
+///
+/// Describes one named column/member of the items in an array-typed field.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct FieldSchema {
+    /// Name of the schema member
+    pub name: String,
+    /// Declared type of the member; spelled `type` in YAML
+    #[serde(rename = "type")]
+    pub field_type: String,
+    /// Whether the member is required; defaults to `false` when omitted
+    #[serde(default)]
+    pub required: bool,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Each fixture below is exactly the YAML for the type being parsed. A
+    // wrapping key (e.g. `match:` around a `MatchExpr`) would make the document
+    // a different shape from the target type and the parse would fail or
+    // silently yield defaults.
+
+    /// A profile with only the scalar header fields parses.
+    #[test]
+    fn test_extraction_profile_basic() {
+        let yaml = r#"
+name: test
+description: Test profile
+priority: 50
+"#;
+
+        let profile: ExtractionProfile = serde_yaml::from_str(yaml).unwrap();
+        assert_eq!(profile.name, "test");
+        assert_eq!(profile.description, "Test profile");
+        assert_eq!(profile.priority, 50);
+    }
+
+    /// An `all` combinator with two predicates parses as `MatchExpr::All`.
+    #[test]
+    fn test_match_expr_all() {
+        let yaml = r#"
+all:
+  - text_contains:
+      patterns: ["invoice", "bill"]
+  - structural:
+      has_table: true
+"#;
+
+        let expr: MatchExpr = serde_yaml::from_str(yaml).unwrap();
+        match expr {
+            MatchExpr::All { all } => {
+                assert_eq!(all.len(), 2);
+            }
+            _ => panic!("Expected All"),
+        }
+    }
+
+    /// An `any` combinator with two predicates parses as `MatchExpr::Any`.
+    #[test]
+    fn test_match_expr_any() {
+        let yaml = r#"
+any:
+  - text_contains:
+      patterns: ["receipt"]
+  - text_matches:
+      pattern: "\\d+\\.\\d{2}"
+"#;
+
+        let expr: MatchExpr = serde_yaml::from_str(yaml).unwrap();
+        match expr {
+            MatchExpr::Any { any } => {
+                assert_eq!(any.len(), 2);
+            }
+            _ => panic!("Expected Any"),
+        }
+    }
+
+    /// A `none` combinator with one predicate parses as `MatchExpr::None`.
+    #[test]
+    fn test_match_expr_none() {
+        let yaml = r#"
+none:
+  - text_contains:
+      patterns: ["abstract", "bibliography"]
+"#;
+
+        let expr: MatchExpr = serde_yaml::from_str(yaml).unwrap();
+        match expr {
+            MatchExpr::None { none } => {
+                assert_eq!(none.len(), 1);
+            }
+            _ => panic!("Expected None"),
+        }
+    }
+
+    /// Tuning keys map onto the matching `ExtractionTuning` fields.
+    #[test]
+    fn test_extraction_tuning() {
+        let yaml = r#"
+reading_order: xy_cut
+table_detection: strict_borders
+readability_threshold: 0.4
+include_invisible: false
+"#;
+
+        let tuning: ExtractionTuning = serde_yaml::from_str(yaml).unwrap();
+        assert_eq!(tuning.reading_order, Some("xy_cut".to_string()));
+        assert_eq!(tuning.table_detection, Some("strict_borders".to_string()));
+        assert_eq!(tuning.readability_threshold, Some(0.4));
+        assert_eq!(tuning.include_invisible, Some(false));
+        // Keys that were not given stay unset.
+        assert_eq!(tuning.force_ocr, None);
+    }
+
+    /// A spec with a `patterns` list selects the simple `Patterns` variant.
+    #[test]
+    fn test_field_spec_simple() {
+        let yaml = r#"
+type: decimal
+extraction:
+  patterns:
+    - "\\$\\s*(\\d+\\.\\d{2})"
+  fallback: null
+"#;
+
+        let field: FieldSpec = serde_yaml::from_str(yaml).unwrap();
+        assert_eq!(field.field_type, "decimal");
+        match field.extraction {
+            FieldExtraction::Patterns { patterns, .. } => {
+                assert_eq!(patterns.len(), 1);
+            }
+            _ => panic!("Expected Patterns"),
+        }
+    }
+
+    /// A spec without `patterns` falls through to the `Rich` variant.
+    #[test]
+    fn test_field_spec_rich() {
+        let yaml = r#"
+type: string
+extraction:
+  regex: "Invoice\\s*#\\s*([\\w-]+)"
+  near: ["Invoice", "Invoice Number"]
+  max_distance_pt: 200
+"#;
+
+        let field: FieldSpec = serde_yaml::from_str(yaml).unwrap();
+        assert_eq!(field.field_type, "string");
+        match field.extraction {
+            FieldExtraction::Rich { regex, near, max_distance_pt, .. } => {
+                assert!(regex.is_some());
+                assert!(near.is_some());
+                assert_eq!(max_distance_pt, Some(200));
+            }
+            _ => panic!("Expected Rich"),
+        }
+    }
+
+    /// A complete profile parses and survives a serialize/deserialize cycle.
+    #[test]
+    fn test_full_profile_roundtrip() {
+        let yaml = r#"
+name: invoice
+description: Commercial invoice with line items
+priority: 50
+
+match:
+  all:
+    - any:
+        - text_contains:
+            patterns: ["invoice", "bill to"]
+        - heading_matches:
+            pattern: "^Invoice\\b"
+    - structural:
+        has_table: true
+
+extraction:
+  reading_order: line_dominant
+  table_detection: strict_borders
+  readability_threshold: 0.4
+
+fields:
+  invoice_number:
+    type: string
+    extraction:
+      regex: "Invoice\\s*#\\s*([\\w-]+)"
+      near: ["Invoice"]
+  total:
+    type: decimal
+    extraction:
+      patterns:
+        - "total.*([\\d,]+\\.\\d{2})"
+      fallback: null
+"#;
+
+        let profile: ExtractionProfile = serde_yaml::from_str(yaml).unwrap();
+        assert_eq!(profile.name, "invoice");
+        assert_eq!(profile.priority, 50);
+        assert!(profile.extraction.is_some());
+        assert_eq!(profile.fields.len(), 2);
+
+        // Round-trip
+        let yaml_out = serde_yaml::to_string(&profile).unwrap();
+        let profile2: ExtractionProfile = serde_yaml::from_str(&yaml_out).unwrap();
+        assert_eq!(profile2.name, profile.name);
+        assert_eq!(profile2.priority, profile.priority);
+        assert_eq!(profile2.fields.len(), profile.fields.len());
+    }
+}
--- a/crates/pdftract-core/src/profiles/extraction_loader.rs
+++ b/crates/pdftract-core/src/profiles/extraction_loader.rs
@ -0,0 +1,374 @@
+//! Extraction profile loader (Phase 7.10).
+//!
+//! Loads extraction profiles from built-in sources, system directories,
+//! XDG config paths, and custom --profile-dir flags.
+
+use super::extraction::ExtractionProfile;
+use super::loader::{check_forbidden_keys, ForbiddenKeyError, ProfileLoadError};
+use std::collections::HashMap;
+use std::fs;
+use std::path::{Path, PathBuf};
+
+/// Profile source with priority metadata.
+///
+/// Pairs a loaded profile with a record of where it was loaded from.
+#[derive(Debug, Clone)]
+pub struct ProfileSource {
+    /// The loaded profile
+    pub profile: ExtractionProfile,
+
+    /// Where this profile came from
+    pub source: ProfileOrigin,
+
+    /// Whether this overrides a built-in profile.
+    ///
+    /// As set by the directory loader: true when a profile of the same name
+    /// was already loaded and this one comes from a user or custom directory.
+    /// Always false for built-in profiles.
+    pub overrides_builtin: bool,
+}
+
+/// Origin of a profile.
+///
+/// Variants are listed in the order the loader visits them, so a later
+/// variant's profiles replace same-named profiles from an earlier one.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum ProfileOrigin {
+    /// Built-in profile (compiled into binary)
+    BuiltIn,
+
+    /// System-wide profile (/etc/pdftract/profiles/)
+    System,
+
+    /// User profile (XDG config directory)
+    User,
+
+    /// Custom profile directory (--profile-dir); carries the directory path
+    Custom(PathBuf),
+}
+
+/// Collect extraction profiles from every location on the search path.
+///
+/// Locations are visited from lowest to highest precedence:
+/// 1. Built-in profiles (compiled in)
+/// 2. System directory (/etc/pdftract/profiles/)
+/// 3. User directory (platform config dir, e.g. ~/.config/pdftract/profiles/)
+/// 4. Custom directories (--profile-dir, repeatable), in the order given
+///
+/// A profile loaded later replaces an earlier profile of the same name.
+/// Directories that do not exist are skipped.
+///
+/// # Returns
+///
+/// The surviving profiles ordered by descending priority, ties broken by
+/// ascending name.
+///
+/// # Errors
+///
+/// Propagates any error returned while loading built-ins or reading a directory.
+pub fn load_extraction_profiles(
+    custom_dirs: &[PathBuf],
+) -> Result<Vec<ProfileSource>, ProfileLoadError> {
+    let mut by_name: HashMap<String, ProfileSource> = HashMap::new();
+
+    load_builtin_profiles(&mut by_name)?;
+
+    // Directory-backed locations, lowest precedence first.
+    let system = Some((PathBuf::from("/etc/pdftract/profiles"), ProfileOrigin::System));
+    let user = get_xdg_profile_dir().map(|dir| (dir, ProfileOrigin::User));
+    let custom = custom_dirs
+        .iter()
+        .map(|dir| (dir.clone(), ProfileOrigin::Custom(dir.clone())));
+
+    for (dir, origin) in system.into_iter().chain(user).chain(custom) {
+        if dir.exists() {
+            load_profiles_from_dir(&dir, origin, &mut by_name)?;
+        }
+    }
+
+    let mut ordered: Vec<ProfileSource> = by_name.into_values().collect();
+    ordered.sort_by(|lhs, rhs| {
+        let by_priority_desc = rhs.profile.priority.cmp(&lhs.profile.priority);
+        by_priority_desc.then_with(|| lhs.profile.name.cmp(&rhs.profile.name))
+    });
+
+    Ok(ordered)
+}
+
+/// Locate the per-user directory that holds pdftract profiles.
+///
+/// Resolves to `<config dir>/pdftract/profiles`, where the config dir is
+/// whatever `dirs::config_dir()` reports (e.g. `~/.config` on Linux).
+/// Returns `None` when no config dir can be determined. The directory is not
+/// checked for existence.
+pub fn get_xdg_profile_dir() -> Option<PathBuf> {
+    let mut profile_dir = dirs::config_dir()?;
+    profile_dir.push("pdftract");
+    profile_dir.push("profiles");
+    Some(profile_dir)
+}
+
+/// Load built-in extraction profiles.
+///
+/// These are compiled into the binary via include_str!.
+///
+/// Each successfully parsed profile is inserted into `profiles` under its own
+/// `name` field (not the label in the table below) with origin `BuiltIn`.
+/// A built-in that fails to parse is reported on stderr and skipped, so this
+/// function currently always returns `Ok(())`. Without the `profiles` feature
+/// it does nothing.
+fn load_builtin_profiles(
+    profiles: &mut HashMap<String, ProfileSource>,
+) -> Result<(), ProfileLoadError> {
+    #[cfg(feature = "profiles")]
+    {
+        // Load each built-in profile individually.
+        // Tuple layout: (label used in error messages, parse result).
+        let profile_results: Vec<(&str, Result<ExtractionProfile, ProfileLoadError>)> = vec![
+            ("invoice", load_profile_yaml(
+                include_str!("../../../../profiles/builtin/invoice/profile.yaml"),
+                "profiles/builtin/invoice/profile.yaml"
+            )),
+            ("receipt", load_profile_yaml(
+                include_str!("../../../../profiles/builtin/receipt/profile.yaml"),
+                "profiles/builtin/receipt/profile.yaml"
+            )),
+            ("contract", load_profile_yaml(
+                include_str!("../../../../profiles/builtin/contract/profile.yaml"),
+                "profiles/builtin/contract/profile.yaml"
+            )),
+            ("scientific_paper", load_profile_yaml(
+                include_str!("../../../../profiles/builtin/scientific_paper/profile.yaml"),
+                "profiles/builtin/scientific_paper/profile.yaml"
+            )),
+            ("slide_deck", load_profile_yaml(
+                include_str!("../../../../profiles/builtin/slide_deck/profile.yaml"),
+                "profiles/builtin/slide_deck/profile.yaml"
+            )),
+            ("form", load_profile_yaml(
+                include_str!("../../../../profiles/builtin/form/profile.yaml"),
+                "profiles/builtin/form/profile.yaml"
+            )),
+            ("bank_statement", load_profile_yaml(
+                include_str!("../../../../profiles/builtin/bank_statement/profile.yaml"),
+                "profiles/builtin/bank_statement/profile.yaml"
+            )),
+            ("legal_filing", load_profile_yaml(
+                include_str!("../../../../profiles/builtin/legal_filing/profile.yaml"),
+                "profiles/builtin/legal_filing/profile.yaml"
+            )),
+            ("book_chapter", load_profile_yaml(
+                include_str!("../../../../profiles/builtin/book_chapter/profile.yaml"),
+                "profiles/builtin/book_chapter/profile.yaml"
+            )),
+        ];
+
+        for (name, result) in profile_results {
+            match result {
+                Ok(profile) => {
+                    profiles.insert(
+                        profile.name.clone(),
+                        ProfileSource {
+                            profile,
+                            source: ProfileOrigin::BuiltIn,
+                            overrides_builtin: false,
+                        },
+                    );
+                }
+                Err(e) => {
+                    // Best effort: one broken built-in must not block the others.
+                    eprintln!("Failed to parse built-in profile '{}': {}", name, e);
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Parse one extraction profile from YAML text.
+///
+/// The text is first parsed generically and screened for forbidden keys;
+/// only then is it deserialized into an `ExtractionProfile`.
+///
+/// * `content` - The YAML document
+/// * `source_path` - Where the document came from; prefixed to the key path
+///   in a forbidden-key error
+///
+/// # Errors
+///
+/// Fails when the text is not valid YAML, contains a forbidden key, or does
+/// not fit the `ExtractionProfile` shape.
+fn load_profile_yaml(content: &str, source_path: &str) -> Result<ExtractionProfile, ProfileLoadError> {
+    let raw: serde_yaml::Value = serde_yaml::from_str(content)?;
+
+    // The raw text is handed over as well so the checker can report a line number.
+    check_forbidden_keys(&raw, "", content).map_err(|violation| {
+        ProfileLoadError::ForbiddenKey {
+            key: violation.key,
+            path: format!("{}: {}", source_path, violation.path),
+            line: violation.line,
+        }
+    })?;
+
+    serde_yaml::from_str::<ExtractionProfile>(content).map_err(ProfileLoadError::YamlError)
+}
+
+/// Load profiles from a directory.
+///
+/// Two layouts are recognised for each directory entry:
+///
+/// * a sub-directory containing `profile.yaml` (e.g. `invoice/profile.yaml`)
+/// * a plain file with the `.yaml` extension
+///
+/// Anything else is ignored. Each loaded profile is inserted into `profiles`
+/// under its `name`, replacing any earlier profile of the same name.
+///
+/// A profile file that cannot be read or parsed (including one rejected for a
+/// forbidden key) is reported on stderr and skipped, so one bad file does not
+/// prevent the rest of the directory from loading.
+///
+/// # Errors
+///
+/// Returns an I/O error when the directory itself, or one of its entries,
+/// cannot be read.
+fn load_profiles_from_dir(
+    dir: &Path,
+    origin: ProfileOrigin,
+    profiles: &mut HashMap<String, ProfileSource>,
+) -> Result<(), ProfileLoadError> {
+    let entries = fs::read_dir(dir).map_err(ProfileLoadError::IoError)?;
+
+    for entry in entries {
+        let path = entry.map_err(ProfileLoadError::IoError)?.path();
+
+        // Work out which file, if any, this entry contributes.
+        let profile_path = if path.is_dir() {
+            let nested = path.join("profile.yaml");
+            if !nested.exists() {
+                continue;
+            }
+            nested
+        } else if path.extension().and_then(|s| s.to_str()) == Some("yaml") {
+            path
+        } else {
+            continue;
+        };
+
+        match load_profile_file(&profile_path) {
+            Ok(profile) => {
+                let overrides_builtin = profiles.contains_key(&profile.name)
+                    && matches!(origin, ProfileOrigin::User | ProfileOrigin::Custom(_));
+
+                profiles.insert(
+                    profile.name.clone(),
+                    ProfileSource {
+                        profile,
+                        source: origin.clone(),
+                        overrides_builtin,
+                    },
+                );
+            }
+            Err(e) => {
+                // Surface the failure instead of dropping the profile silently;
+                // the user otherwise has no way to learn why it is missing.
+                eprintln!("Skipping profile '{}': {}", profile_path.display(), e);
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Load a single profile from a file.
+///
+/// Reads the whole file as UTF-8 text and parses it with the same forbidden-key
+/// screening as every other profile source. The file path is used as the
+/// source label in forbidden-key errors.
+///
+/// # Errors
+///
+/// Returns an I/O error when the file cannot be read, or the parse error from
+/// `load_profile_yaml`.
+pub fn load_profile_file(path: &Path) -> Result<ExtractionProfile, ProfileLoadError> {
+    let content = fs::read_to_string(path).map_err(ProfileLoadError::IoError)?;
+    load_profile_yaml(&content, &path.to_string_lossy())
+}
+
+/// Find a profile by name or path.
+///
+/// - If `name_or_path` is an existing regular file, load it directly
+/// - Otherwise, search for a profile with that name in the loaded profiles
+///
+/// Only a file counts as a path match. A directory that happens to share a
+/// profile's name (for example `./invoice/` in the working directory) must not
+/// shadow the name lookup, and could never be loaded as a profile anyway.
+///
+/// # Errors
+///
+/// Returns the load error when the file exists but is not a valid profile, or
+/// a `NotFound` I/O error when no loaded profile has the given name.
+pub fn find_profile(
+    name_or_path: &str,
+    profiles: &[ProfileSource],
+) -> Result<ExtractionProfile, ProfileLoadError> {
+    // First, check if it's a file path
+    let path = Path::new(name_or_path);
+    if path.is_file() {
+        return load_profile_file(path);
+    }
+
+    // Search by name
+    profiles
+        .iter()
+        .find(|source| source.profile.name == name_or_path)
+        .map(|source| source.profile.clone())
+        .ok_or_else(|| {
+            ProfileLoadError::IoError(std::io::Error::new(
+                std::io::ErrorKind::NotFound,
+                format!("Profile '{}' not found", name_or_path),
+            ))
+        })
+}
+
+/// Validate a profile file without loading it into the profile set.
+///
+/// Returns Ok(()) if the profile is valid, Err with details if invalid.
+///
+/// Performs the same three checks as a real load: the file is readable, it
+/// contains no forbidden keys, and it deserializes as an `ExtractionProfile`.
+/// Unlike `load_profile_yaml`, the key path in a forbidden-key error is
+/// reported as-is, without the file path prefixed.
+pub fn validate_profile_file(path: &Path) -> Result<(), ProfileLoadError> {
+    let content = fs::read_to_string(path).map_err(ProfileLoadError::IoError)?;
+
+    // Check for forbidden keys
+    let yaml_value = serde_yaml::from_str::<serde_yaml::Value>(&content)
+        .map_err(ProfileLoadError::YamlError)?;
+
+    check_forbidden_keys(&yaml_value, "", &content)
+        .map_err(|e| ProfileLoadError::ForbiddenKey {
+            key: e.key,
+            path: e.path,
+            line: e.line,
+        })?;
+
+    // Try to parse as ExtractionProfile; the parsed value itself is discarded
+    let _: ExtractionProfile = serde_yaml::from_str(&content).map_err(ProfileLoadError::YamlError)?;
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// The user profile directory ends in `pdftract/profiles`.
+    // NOTE(review): asserts that a config dir can be resolved at all; this
+    // presumably fails in environments where `dirs::config_dir()` returns
+    // `None` (e.g. no home directory) — confirm this is acceptable for CI.
+    #[test]
+    fn test_get_xdg_profile_dir() {
+        let dir = get_xdg_profile_dir();
+        assert!(dir.is_some());
+        let path = dir.unwrap();
+        assert!(path.ends_with("pdftract/profiles"));
+    }
+
+    /// With the `profiles` feature, at least one built-in profile loads.
+    #[test]
+    fn test_load_builtin_profiles() {
+        let mut profiles = HashMap::new();
+        let result = load_builtin_profiles(&mut profiles);
+
+        #[cfg(feature = "profiles")]
+        {
+            assert!(result.is_ok());
+            // Should have loaded some profiles
+            assert!(!profiles.is_empty());
+        }
+    }
+
+    /// A minimal, well-formed profile file passes validation.
+    #[test]
+    fn test_validate_simple_profile() {
+        let yaml = r#"
+name: test
+description: Test profile
+priority: 10
+match:
+  text_contains:
+    patterns: ["test"]
+"#;
+
+        let temp_dir = tempfile::tempdir().unwrap();
+        let profile_path = temp_dir.path().join("test.yaml");
+        fs::write(&profile_path, yaml).unwrap();
+
+        let result = validate_profile_file(&profile_path);
+        assert!(result.is_ok());
+    }
+
+    /// A profile carrying an `api_key` entry is rejected.
+    // Only asserts that validation fails, not which error variant is returned.
+    #[test]
+    fn test_validate_profile_with_forbidden_key() {
+        let yaml = r#"
+name: test
+description: Test profile
+priority: 10
+match:
+  text_contains:
+    patterns: ["test"]
+api_key: "secret"
+"#;
+
+        let temp_dir = tempfile::tempdir().unwrap();
+        let profile_path = temp_dir.path().join("test.yaml");
+        fs::write(&profile_path, yaml).unwrap();
+
+        let result = validate_profile_file(&profile_path);
+        assert!(result.is_err());
+    }
+
+    /// Loading with no custom directories still succeeds.
+    // The result also includes whatever system and user profile directories
+    // exist on the machine running the test.
+    #[test]
+    fn test_load_extraction_profiles_empty() {
+        let profiles = load_extraction_profiles(&[]).unwrap();
+        #[cfg(feature = "profiles")]
+        assert!(!profiles.is_empty()); // At least built-ins
+    }
+}
--- a/crates/pdftract-core/src/profiles/field_extractor.rs
+++ b/crates/pdftract-core/src/profiles/field_extractor.rs
@ -0,0 +1,353 @@
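+//! Field extraction DSL evaluator (Phase 7.10).
+//!
+//! Evaluates field extraction specifications from profiles and extracts
+//! structured fields from document text. The DSL describes:
+//! - Localizers: near, region, pick
+//! - Extractors: regex, parse
+//! - Strategies for disambiguating multiple candidates
+//!
+//! Current implementation status: only `near`, `regex`, and `parse` are
+//! evaluated, and matching runs against the full document text. `region`,
+//! `pick`, `max_distance_pt`, and block geometry are accepted but not yet used.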
+
+use super::extraction::{FieldExtraction, FieldSchema, FieldSpec};
+use crate::schema::BlockJson;
+use regex::Regex;
+use serde_json::Value;
+use std::collections::HashMap;
+
+/// Convert a `serde_yaml::Value` into the equivalent `serde_json::Value`.
+///
+/// Conversion rules:
+/// - Numbers are tried as `i64`, then `u64`, then `f64`, so unsigned values
+///   above `i64::MAX` keep their exact integer value instead of being rounded
+///   through a float. Floats that JSON cannot represent (NaN, infinities)
+///   become `null`.
+/// - Mapping entries whose key is not a plain YAML string are skipped, since
+///   JSON object keys must be strings.
+/// - Tagged values are converted from their inner value; the tag is discarded.
+fn convert_yaml_to_json(yaml_value: &serde_yaml::Value) -> Value {
+    match yaml_value {
+        serde_yaml::Value::Null => Value::Null,
+        serde_yaml::Value::Bool(b) => Value::Bool(*b),
+        serde_yaml::Value::Number(n) => {
+            if let Some(i) = n.as_i64() {
+                Value::Number(i.into())
+            } else if let Some(u) = n.as_u64() {
+                // Only reached for values above i64::MAX; keep them exact.
+                Value::Number(u.into())
+            } else {
+                n.as_f64()
+                    .and_then(serde_json::Number::from_f64)
+                    .map(Value::Number)
+                    .unwrap_or(Value::Null)
+            }
+        }
+        serde_yaml::Value::String(s) => Value::String(s.clone()),
+        serde_yaml::Value::Sequence(seq) => {
+            Value::Array(seq.iter().map(convert_yaml_to_json).collect())
+        }
+        serde_yaml::Value::Mapping(map) => {
+            let mut obj = serde_json::Map::new();
+            for (k, v) in map {
+                // Non-string keys have no JSON equivalent and are dropped.
+                if let serde_yaml::Value::String(key_str) = k {
+                    obj.insert(key_str.clone(), convert_yaml_to_json(v));
+                }
+            }
+            Value::Object(obj)
+        }
+        serde_yaml::Value::Tagged(tagged) => convert_yaml_to_json(&tagged.value),
+    }
+}
+
+/// Outcome of extracting one profile field from a document.
+#[derive(Debug, Clone)]
+pub struct FieldExtractionResult {
+    /// Extracted field value as JSON (null if not found)
+    pub value: Value,
+    /// Human-readable description of how the value was obtained (for debugging)
+    pub details: String,
+}
+
+/// Run every field specification of a profile against one document.
+///
+/// # Arguments
+///
+/// * `fields` - Field specifications keyed by field name
+/// * `blocks` - Extracted blocks from the document
+/// * `full_text` - Full document text
+///
+/// # Returns
+///
+/// One [`FieldExtractionResult`] per entry in `fields`, keyed by the same
+/// field name.
+pub fn extract_profile_fields(
+    fields: &HashMap<String, FieldSpec>,
+    blocks: &[BlockJson],
+    full_text: &str,
+) -> HashMap<String, FieldExtractionResult> {
+    fields
+        .iter()
+        .map(|(name, spec)| (name.clone(), extract_single_field(spec, blocks, full_text)))
+        .collect()
+}
+
+/// Dispatch one field specification to the extractor matching its variant.
+///
+/// The YAML `fallback` value, when present, is converted to JSON before being
+/// handed to the extractor.
+fn extract_single_field(
+    field_spec: &FieldSpec,
+    blocks: &[BlockJson],
+    full_text: &str,
+) -> FieldExtractionResult {
+    match &field_spec.extraction {
+        FieldExtraction::Patterns { patterns, fallback } => extract_by_patterns(
+            patterns,
+            full_text,
+            &fallback.as_ref().map(convert_yaml_to_json),
+        ),
+        FieldExtraction::Rich {
+            regex,
+            near,
+            max_distance_pt,
+            region,
+            pick,
+            parse,
+            fallback,
+            // Declared by the DSL but not forwarded to the extractor.
+            after: _,
+            after_heading: _,
+            table_region: _,
+            columnar_regions: _,
+            schema: _,
+        } => extract_rich(
+            regex,
+            near,
+            max_distance_pt,
+            region,
+            pick,
+            parse,
+            blocks,
+            full_text,
+            &fallback.as_ref().map(convert_yaml_to_json),
+        ),
+    }
+}
+
+/// Try each regex in order against the full text and return the first hit.
+///
+/// The value is capture group 1 when it participated in the match, otherwise
+/// the whole match. Patterns that fail to compile are skipped. When nothing
+/// matches, the result carries `fallback` (or JSON null when there is none).
+fn extract_by_patterns(
+    patterns: &[String],
+    full_text: &str,
+    fallback: &Option<Value>,
+) -> FieldExtractionResult {
+    for pattern in patterns {
+        let Ok(compiled) = Regex::new(pattern) else {
+            continue;
+        };
+        let Some(caps) = compiled.captures(full_text) else {
+            continue;
+        };
+
+        // Prefer the first capture group, fall back to the full match.
+        let hit = match caps.get(1).or(caps.get(0)) {
+            Some(m) => m.as_str(),
+            None => "",
+        };
+
+        return FieldExtractionResult {
+            value: Value::String(hit.to_string()),
+            details: format!("Matched pattern '{}': '{}'", pattern, hit),
+        };
+    }
+
+    // Nothing matched: hand back the fallback, or null when none was given.
+    let value = match fallback {
+        Some(default) => default.clone(),
+        None => Value::Null,
+    };
+    FieldExtractionResult {
+        value,
+        details: "No patterns matched, using fallback".to_string(),
+    }
+}
+
+/// Extract using rich field extraction with localizers and extractors.
+///
+/// This is a simplified, text-only implementation:
+/// - `near`: the search starts at the first occurrence of the first anchor
+///   that appears in `full_text`; if no anchor is found the whole text is
+///   searched.
+/// - `regex`: capture group 1 (or the whole match) is taken from the search
+///   text. Without a regex the first whitespace-delimited word is used.
+/// - `parse`: the raw text is converted with `parse_value`.
+/// - `_max_distance_pt`, `_region`, `_pick` and `_blocks` are accepted for
+///   interface stability but are not used yet.
+///
+/// When nothing is extracted (no match, or only whitespace), the result is
+/// `fallback`, or JSON null when no fallback is configured, rather than an
+/// empty string.
+fn extract_rich(
+    regex: &Option<String>,
+    near: &Option<Vec<String>>,
+    _max_distance_pt: &Option<usize>,
+    _region: &Option<String>,
+    _pick: &Option<String>,
+    parse: &Option<String>,
+    _blocks: &[BlockJson],
+    full_text: &str,
+    fallback: &Option<Value>,
+) -> FieldExtractionResult {
+    // Narrow the search to the text starting at the first anchor found, if any.
+    let search_text = near
+        .as_ref()
+        .and_then(|anchors| {
+            anchors
+                .iter()
+                .find_map(|anchor| full_text.find(anchor.as_str()))
+        })
+        .and_then(|pos| full_text.get(pos..))
+        .unwrap_or(full_text);
+
+    // Extract the raw value using the regex, or the first word without one.
+    let raw_value = if let Some(pattern) = regex {
+        extract_with_regex(pattern, search_text)
+    } else {
+        search_text
+            .split_whitespace()
+            .next()
+            .unwrap_or("")
+            .to_string()
+    };
+
+    // An empty extraction means "not found": honor the configured fallback.
+    if raw_value.trim().is_empty() {
+        return FieldExtractionResult {
+            value: fallback.clone().unwrap_or(Value::Null),
+            details: "No value extracted, using fallback".to_string(),
+        };
+    }
+
+    // Parse value according to type
+    let parsed_value = parse_value(&raw_value, parse.as_deref());
+
+    FieldExtractionResult {
+        value: parsed_value,
+        details: format!("Extracted value: '{}'", raw_value),
+    }
+}
+
+/// Return the first regex hit in `text` as an owned string.
+///
+/// Capture group 1 is preferred; the whole match is used when group 1 did not
+/// participate. An invalid pattern or a miss both yield an empty string.
+fn extract_with_regex(pattern: &str, text: &str) -> String {
+    let Ok(compiled) = Regex::new(pattern) else {
+        return String::new();
+    };
+
+    compiled
+        .captures(text)
+        .and_then(|caps| caps.get(1).or(caps.get(0)))
+        .map(|hit| hit.as_str().to_string())
+        .unwrap_or_default()
+}
+
+/// Parse a raw extracted string into a JSON value of the requested type.
+///
+/// `raw` is trimmed first. Supported `parse_type` values:
+/// - `"decimal"`: strips `$`, `€`, `£`, `¥` and `,`, then parses as `f64`;
+///   unparseable or non-finite input yields null.
+/// - `"int"`: parses as `i64`; unparseable input yields null.
+/// - `"bool"`: `true`, `yes` and `1` (case-insensitive) are true; anything
+///   else is false.
+/// - `"date"`, `"string"`, `None`, or any unrecognized type: the trimmed text
+///   is returned as a string. Dates are not validated or normalized.
+fn parse_value(raw: &str, parse_type: Option<&str>) -> Value {
+    let raw = raw.trim();
+
+    match parse_type {
+        Some("decimal") => {
+            // Drop currency symbols and thousands separators in one pass.
+            let cleaned: String = raw
+                .chars()
+                .filter(|c| !matches!(c, '$' | '€' | '£' | '¥' | ','))
+                .collect();
+
+            cleaned
+                .parse::<f64>()
+                .ok()
+                .and_then(serde_json::Number::from_f64)
+                .map(Value::Number)
+                .unwrap_or(Value::Null)
+        }
+        Some("int") => raw
+            .parse::<i64>()
+            // `Value::Number` wraps `serde_json::Number`, so convert the i64 first.
+            .map(|n| Value::Number(n.into()))
+            .unwrap_or(Value::Null),
+        Some("bool") => Value::Bool(
+            ["true", "yes", "1"]
+                .iter()
+                .any(|truthy| raw.eq_ignore_ascii_case(truthy)),
+        ),
+        // "date", "string", no type, and unknown types all pass the text through.
+        _ => Value::String(raw.to_string()),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// A matching pattern yields capture group 1 as the value.
+    #[test]
+    fn test_extract_by_patterns_simple() {
+        let full_text = "Invoice #12345\nTotal: $100.00";
+        let patterns = vec![r"Invoice #(\w+)".to_string()];
+
+        let result = extract_by_patterns(&patterns, full_text, &None);
+
+        assert_eq!(result.value, "12345");
+        assert!(result.details.contains("Matched pattern"));
+    }
+
+    /// When no pattern matches, the supplied fallback is returned.
+    #[test]
+    fn test_extract_by_patterns_no_match() {
+        let full_text = "Receipt #ABC";
+        let patterns = vec![r"Invoice #(\w+)".to_string()];
+        let fallback = Some(Value::String("UNKNOWN".to_string()));
+
+        let result = extract_by_patterns(&patterns, full_text, &fallback);
+
+        assert_eq!(result.value, "UNKNOWN");
+        assert!(result.details.contains("No patterns matched"));
+    }
+
+    /// Decimal parsing handles plain numbers, currency symbols with thousands
+    /// separators, and returns null for non-numeric text.
+    #[test]
+    fn test_parse_value_decimal() {
+        assert_eq!(
+            parse_value("100.50", Some("decimal")),
+            Value::Number(serde_json::Number::from_f64(100.50).unwrap())
+        );
+        assert_eq!(
+            parse_value("$1,234.56", Some("decimal")),
+            Value::Number(serde_json::Number::from_f64(1234.56).unwrap())
+        );
+        assert_eq!(parse_value("invalid", Some("decimal")), Value::Null);
+    }
+
+    /// Integer parsing returns a JSON number, or null for non-numeric text.
+    #[test]
+    fn test_parse_value_int() {
+        assert_eq!(parse_value("42", Some("int")), Value::Number(42.into()));
+        assert_eq!(parse_value("invalid", Some("int")), Value::Null);
+    }
+
+    /// "true"/"yes" parse as true; "false"/"no" parse as false.
+    #[test]
+    fn test_parse_value_bool() {
+        assert_eq!(parse_value("true", Some("bool")), Value::Bool(true));
+        assert_eq!(parse_value("yes", Some("bool")), Value::Bool(true));
+        assert_eq!(parse_value("false", Some("bool")), Value::Bool(false));
+        assert_eq!(parse_value("no", Some("bool")), Value::Bool(false));
+    }
+
+    /// An ISO-formatted date is passed through unchanged as a string.
+    #[test]
+    fn test_parse_value_date() {
+        let result = parse_value("2025-01-15", Some("date"));
+        assert_eq!(result, Value::String("2025-01-15".to_string()));
+    }
+
+    /// Both an explicit "string" type and no type return the text as a string.
+    #[test]
+    fn test_parse_value_string() {
+        assert_eq!(
+            parse_value("hello", Some("string")),
+            Value::String("hello".to_string())
+        );
+        assert_eq!(parse_value("world", None), Value::String("world".to_string()));
+    }
+
+    /// The first capture group is returned when the pattern matches.
+    #[test]
+    fn test_extract_with_regex() {
+        let text = "Invoice: INV-2025-00123";
+        let pattern = r"Invoice:\s*([\w-]+)";
+
+        let result = extract_with_regex(pattern, text);
+        assert_eq!(result, "INV-2025-00123");
+    }
+
+    /// A pattern that does not match yields an empty string.
+    #[test]
+    fn test_extract_with_regex_no_match() {
+        let text = "Receipt: R-123";
+        let pattern = r"Invoice:\s*([\w-]+)";
+
+        let result = extract_with_regex(pattern, text);
+        assert!(result.is_empty());
+    }
+}
--- a/crates/pdftract-core/src/profiles/match_eval.rs
+++ b/crates/pdftract-core/src/profiles/match_eval.rs
@ -0,0 +1,528 @@
+//! Match DSL evaluator for extraction profiles.
+//!
+//! Evaluates boolean match expressions (all/any/none combinators) against
+//! document signals to determine if a profile matches a document.
+
+use super::engine::FeatureSignals;
+use super::extraction::{ExtractionMatchPredicate, MatchExpr, PageCountRange};
+use regex::Regex;
+use std::collections::HashMap;
+use std::sync::Mutex;
+
+/// Outcome of evaluating a match expression or predicate against a document.
+///
+/// The derived `Default` is an unmatched result with no reasons and a
+/// confidence of `0.0`.
+#[derive(Debug, Clone, Default)]
+pub struct MatchResult {
+    /// Whether the match succeeded
+    pub matched: bool,
+
+    /// Human-readable reasons for the decision (for debugging/metadata)
+    pub reasons: Vec<String>,
+
+    /// Confidence score (0.0-1.0)
+    pub confidence: f32,
+}
+
+/// Recursively evaluate a boolean match expression against document signals.
+///
+/// - A bare predicate is delegated to `evaluate_predicate`.
+/// - `all` matches when every child matches; its confidence is the minimum of
+///   the children's confidences (starting from 1.0). Every child is evaluated
+///   so that all reasons are collected.
+/// - `any` matches when at least one child matches; its confidence is the
+///   highest confidence among the matching children.
+/// - `none` matches when no child matches (confidence 1.0); otherwise the
+///   confidence is 0.0 and each offending child's reasons are recorded.
+///
+/// A summary line for the combinator is appended to `reasons`.
+pub fn evaluate_match(expr: &MatchExpr, signals: &FeatureSignals) -> MatchResult {
+    match expr {
+        MatchExpr::Predicate(pred) => evaluate_predicate(pred, signals),
+        MatchExpr::All { all } => {
+            let mut matched = true;
+            let mut confidence = 1.0_f32;
+            let mut reasons = Vec::new();
+
+            // No short-circuit: keep going so every child's reasons are reported.
+            for child in all {
+                let outcome = evaluate_match(child, signals);
+                matched &= outcome.matched;
+                confidence = confidence.min(outcome.confidence);
+                reasons.extend(outcome.reasons);
+            }
+
+            let summary = if matched {
+                "all: all sub-expressions matched"
+            } else {
+                "all: some sub-expressions did not match"
+            };
+            reasons.push(summary.to_string());
+
+            MatchResult {
+                matched,
+                reasons,
+                confidence,
+            }
+        }
+        MatchExpr::Any { any } => {
+            let mut matched = false;
+            let mut confidence = 0.0_f32;
+            let mut reasons = Vec::new();
+
+            for child in any {
+                let outcome = evaluate_match(child, signals);
+                if outcome.matched {
+                    matched = true;
+                    confidence = confidence.max(outcome.confidence);
+                }
+                reasons.extend(outcome.reasons);
+            }
+
+            let summary = if matched {
+                "any: at least one sub-expression matched"
+            } else {
+                "any: no sub-expressions matched"
+            };
+            reasons.push(summary.to_string());
+
+            MatchResult {
+                matched,
+                reasons,
+                confidence,
+            }
+        }
+        MatchExpr::None { none } => {
+            let mut excluded_hit = false;
+            let mut reasons = Vec::new();
+
+            for child in none {
+                let outcome = evaluate_match(child, signals);
+                if outcome.matched {
+                    excluded_hit = true;
+                    reasons.push(format!(
+                        "none: excluded sub-expression matched: {:?}",
+                        outcome.reasons
+                    ));
+                }
+            }
+
+            if !excluded_hit {
+                reasons.push("none: no excluded sub-expressions matched".to_string());
+            }
+
+            MatchResult {
+                matched: !excluded_hit,
+                reasons,
+                confidence: if excluded_hit { 0.0 } else { 1.0 },
+            }
+        }
+    }
+}
+
+/// Evaluate a single predicate against document signals.
+///
+/// Each predicate kind reports a fixed confidence when it matches
+/// (text_contains 0.8, heading_matches 0.75, text_matches 0.7,
+/// has_currency_pattern 0.6, has_signature_field 0.5, structural 1.0) and 0.0
+/// when it does not. An invalid regex is reported as a non-match with the
+/// compile error in `reasons`.
+///
+/// Boolean predicates set to `false`, and structural flags that are `None` or
+/// `Some(false)`, impose no requirement.
+fn evaluate_predicate(pred: &ExtractionMatchPredicate, signals: &FeatureSignals) -> MatchResult {
+    match pred {
+        ExtractionMatchPredicate::TextContains { patterns } => {
+            text_contains_result(patterns, &signals.text)
+        }
+        ExtractionMatchPredicate::TextMatches { pattern } => {
+            let regex = match compile_regex(pattern) {
+                Ok(re) => re,
+                Err(e) => {
+                    return MatchResult {
+                        matched: false,
+                        reasons: vec![format!("text_matches: invalid regex '{}': {}", pattern, e)],
+                        confidence: 0.0,
+                    }
+                }
+            };
+
+            if regex.is_match(&signals.text) {
+                MatchResult {
+                    matched: true,
+                    reasons: vec![format!("text_matches: pattern '{}' matched", pattern)],
+                    confidence: 0.7,
+                }
+            } else {
+                MatchResult {
+                    matched: false,
+                    reasons: vec![format!("text_matches: pattern '{}' did not match", pattern)],
+                    confidence: 0.0,
+                }
+            }
+        }
+        ExtractionMatchPredicate::HeadingMatches { pattern } => {
+            let regex = match compile_regex(pattern) {
+                Ok(re) => re,
+                Err(e) => {
+                    return MatchResult {
+                        matched: false,
+                        reasons: vec![format!("heading_matches: invalid regex '{}': {}", pattern, e)],
+                        confidence: 0.0,
+                    }
+                }
+            };
+
+            // The first heading that matches decides the result.
+            for heading in &signals.headings {
+                if regex.is_match(heading) {
+                    return MatchResult {
+                        matched: true,
+                        reasons: vec![format!(
+                            "heading_matches: heading '{}' matched pattern '{}'",
+                            heading, pattern
+                        )],
+                        confidence: 0.75,
+                    };
+                }
+            }
+
+            MatchResult {
+                matched: false,
+                reasons: vec![format!("heading_matches: no headings matched '{}'", pattern)],
+                confidence: 0.0,
+            }
+        }
+        ExtractionMatchPredicate::HasCurrencyPattern {
+            has_currency_pattern: true,
+        } => {
+            let has_currency = has_currency_pattern_impl(&signals.text);
+            MatchResult {
+                matched: has_currency,
+                reasons: vec![if has_currency {
+                    "has_currency_pattern: currency pattern found".to_string()
+                } else {
+                    "has_currency_pattern: no currency pattern".to_string()
+                }],
+                confidence: if has_currency { 0.6 } else { 0.0 },
+            }
+        }
+        // NOTE(review): `false` is treated as "predicate disabled" (always
+        // matches, confidence 0.0), not as "must not contain a currency
+        // pattern" — confirm this is the intended DSL semantics.
+        ExtractionMatchPredicate::HasCurrencyPattern {
+            has_currency_pattern: false,
+        } => MatchResult {
+            matched: true,
+            reasons: vec!["has_currency_pattern: predicate disabled".to_string()],
+            confidence: 0.0,
+        },
+        ExtractionMatchPredicate::HasSignatureField {
+            has_signature_field: true,
+        } => {
+            let has_sig = signals.has_signature_field;
+            MatchResult {
+                matched: has_sig,
+                reasons: vec![if has_sig {
+                    "has_signature_field: signature fields found".to_string()
+                } else {
+                    "has_signature_field: no signature fields".to_string()
+                }],
+                confidence: if has_sig { 0.5 } else { 0.0 },
+            }
+        }
+        // Same "disabled" treatment as `has_currency_pattern: false` above.
+        ExtractionMatchPredicate::HasSignatureField {
+            has_signature_field: false,
+        } => MatchResult {
+            matched: true,
+            reasons: vec!["has_signature_field: predicate disabled".to_string()],
+            confidence: 0.0,
+        },
+        // Alias for TextContains: identical evaluation and reason strings.
+        ExtractionMatchPredicate::TextContainsAlias { patterns } => {
+            text_contains_result(patterns, &signals.text)
+        }
+        ExtractionMatchPredicate::Structural {
+            has_table,
+            has_form_field,
+            has_math,
+            page_count,
+        } => {
+            let mut matched = true;
+            let mut reasons = Vec::new();
+            let min_confidence = 1.0;
+
+            // Only `Some(true)` imposes a requirement on each structural flag.
+            if matches!(has_table, Some(true)) {
+                if signals.table_block_count > 0 {
+                    reasons.push(format!("structural.has_table: {} tables found", signals.table_block_count));
+                } else {
+                    reasons.push("structural.has_table: no tables found".to_string());
+                    matched = false;
+                }
+            }
+
+            if matches!(has_form_field, Some(true)) {
+                if signals.has_form_field {
+                    reasons.push("structural.has_form_field: form fields found".to_string());
+                } else {
+                    reasons.push("structural.has_form_field: no form fields found".to_string());
+                    matched = false;
+                }
+            }
+
+            if matches!(has_math, Some(true)) {
+                if signals.has_math_operators {
+                    reasons.push("structural.has_math: math operators found".to_string());
+                } else {
+                    reasons.push("structural.has_math: no math operators".to_string());
+                    matched = false;
+                }
+            }
+
+            if let Some(range) = page_count {
+                // NOTE(review): `as u32` truncates if the signal's page count
+                // type is wider than 32 bits — confirm the field type.
+                let page_count = signals.page_count as u32;
+                // Both bounds are inclusive; a missing bound is unbounded.
+                let in_range = match (&range.min, &range.max) {
+                    (Some(min), Some(max)) => page_count >= *min && page_count <= *max,
+                    (Some(min), None) => page_count >= *min,
+                    (None, Some(max)) => page_count <= *max,
+                    (None, None) => true,
+                };
+
+                if in_range {
+                    reasons.push(format!("structural.page_count: {} is in range", page_count));
+                } else {
+                    reasons.push(format!(
+                        "structural.page_count: {} is out of range {:?}",
+                        page_count, range
+                    ));
+                    matched = false;
+                }
+            }
+
+            MatchResult {
+                matched,
+                reasons,
+                confidence: if matched { min_confidence } else { 0.0 },
+            }
+        }
+    }
+}
+
+/// Case-insensitive substring test shared by the two text-contains predicates.
+///
+/// Matches on the first entry of `patterns` found in `text`.
+fn text_contains_result(patterns: &[String], text: &str) -> MatchResult {
+    let text_lower = text.to_lowercase();
+
+    match patterns
+        .iter()
+        .find(|pattern| text_lower.contains(&pattern.to_lowercase()))
+    {
+        Some(pattern) => MatchResult {
+            matched: true,
+            reasons: vec![format!("text_contains: found '{}'", pattern)],
+            confidence: 0.8,
+        },
+        None => MatchResult {
+            matched: false,
+            reasons: vec!["text_contains: no patterns found".to_string()],
+            confidence: 0.0,
+        },
+    }
+}
+
+/// Report whether `text` contains a currency symbol (`$`, `€`, `£`, or `¥`).
+///
+/// This is a presence check only: the symbol does not have to be followed by
+/// a digit. The symbols have no letter case, so the text is scanned as-is
+/// without allocating a lowercased copy.
+fn has_currency_pattern_impl(text: &str) -> bool {
+    text.chars().any(|c| matches!(c, '$' | '€' | '£' | '¥'))
+}
+
+/// Process-wide cache of compiled regexes, keyed by pattern source.
+///
+/// The map is created empty on first use and guarded by a mutex. This
+/// accessor applies no size limit of its own.
+fn get_regex_cache() -> &'static Mutex<HashMap<String, Regex>> {
+    static CACHE: std::sync::OnceLock<Mutex<HashMap<String, Regex>>> =
+        std::sync::OnceLock::new();
+    CACHE.get_or_init(Mutex::default)
+}
+
+/// Compile a regex pattern, reusing a previously compiled instance if cached.
+///
+/// The cache is bounded by wholesale eviction: once it holds
+/// `MAX_CACHED_PATTERNS` entries it is cleared before the new pattern is
+/// inserted (this is not an LRU policy).
+///
+/// A poisoned cache mutex is recovered rather than propagated as a panic; the
+/// map only ever holds fully compiled regexes, so its contents stay valid
+/// after another thread panicked while holding the lock.
+///
+/// # Errors
+///
+/// Returns the `regex::Error` from compilation when `pattern` is invalid.
+/// Invalid patterns are never cached.
+fn compile_regex(pattern: &str) -> Result<Regex, regex::Error> {
+    const MAX_CACHED_PATTERNS: usize = 100;
+
+    // Check cache first; the guard is dropped before compiling so the lock is
+    // not held during potentially slow regex compilation.
+    {
+        let cache = get_regex_cache()
+            .lock()
+            .unwrap_or_else(std::sync::PoisonError::into_inner);
+        if let Some(regex) = cache.get(pattern) {
+            return Ok(regex.clone());
+        }
+    }
+
+    // Compile and cache
+    let regex = Regex::new(pattern)?;
+    let mut cache = get_regex_cache()
+        .lock()
+        .unwrap_or_else(std::sync::PoisonError::into_inner);
+
+    // Crude bound: drop everything once the cache is full.
+    if cache.len() >= MAX_CACHED_PATTERNS {
+        cache.clear();
+    }
+
+    cache.insert(pattern.to_string(), regex.clone());
+    Ok(regex)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    // The parent module only imports `HashMap`; `HashSet` is needed by the
+    // `test_signals` fixture and must be imported here.
+    use std::collections::HashSet;
+
+    /// Build signals for a small two-page invoice with one table, two
+    /// headings, and no form, signature, or math features.
+    fn test_signals() -> FeatureSignals {
+        let mut signals = FeatureSignals {
+            text: "Invoice #12345\nTotal: $100.00\nDue date: 2025-01-15".to_string(),
+            text_pattern_hits: HashMap::new(),
+            headings: HashSet::from(["Invoice".to_string(), "Total".to_string()]),
+            page_count: 2,
+            table_block_count: 1,
+            has_signature_field: false,
+            has_form_field: false,
+            has_math_operators: false,
+            has_bullet_lists: false,
+            font_diversity: 3,
+            heading_depth: 2,
+            glyph_density: 0.9,
+            has_footer_page_numbers: false,
+        };
+        signals.build_pattern_hits();
+        signals
+    }
+
+    /// A lowercase pattern matches mixed-case text with confidence 0.8.
+    #[test]
+    fn test_text_contains_match() {
+        let signals = test_signals();
+        let pred = ExtractionMatchPredicate::TextContains {
+            patterns: vec!["invoice".to_string()],
+        };
+
+        let result = evaluate_predicate(&pred, &signals);
+        assert!(result.matched);
+        assert_eq!(result.confidence, 0.8);
+    }
+
+    /// A pattern absent from the text does not match.
+    #[test]
+    fn test_text_contains_no_match() {
+        let signals = test_signals();
+        let pred = ExtractionMatchPredicate::TextContains {
+            patterns: vec!["receipt".to_string()],
+        };
+
+        let result = evaluate_predicate(&pred, &signals);
+        assert!(!result.matched);
+    }
+
+    /// An anchored regex matches one of the document headings.
+    #[test]
+    fn test_heading_matches() {
+        let signals = test_signals();
+        let pred = ExtractionMatchPredicate::HeadingMatches {
+            pattern: "^Invoice$".to_string(),
+        };
+
+        let result = evaluate_predicate(&pred, &signals);
+        assert!(result.matched);
+    }
+
+    /// The `$` in the fixture text satisfies the currency predicate.
+    #[test]
+    fn test_has_currency_pattern() {
+        let signals = test_signals();
+        let pred = ExtractionMatchPredicate::HasCurrencyPattern {
+            has_currency_pattern: true,
+        };
+
+        let result = evaluate_predicate(&pred, &signals);
+        assert!(result.matched);
+    }
+
+    /// A required table plus an inclusive page range both hold for the fixture.
+    #[test]
+    fn test_structural_has_table() {
+        let signals = test_signals();
+        let pred = ExtractionMatchPredicate::Structural {
+            has_table: Some(true),
+            has_form_field: Some(false),
+            has_math: Some(false),
+            page_count: Some(PageCountRange {
+                min: Some(1),
+                max: Some(5),
+                hint: None,
+            }),
+        };
+
+        let result = evaluate_predicate(&pred, &signals);
+        assert!(result.matched);
+    }
+
+    /// `all` matches when every child matches and records its summary reason.
+    #[test]
+    fn test_match_expr_all() {
+        let signals = test_signals();
+        let expr = MatchExpr::All {
+            all: vec![
+                MatchExpr::Predicate(ExtractionMatchPredicate::TextContains {
+                    patterns: vec!["invoice".to_string()],
+                }),
+                MatchExpr::Predicate(ExtractionMatchPredicate::Structural {
+                    has_table: Some(true),
+                    has_form_field: Some(false),
+                    has_math: Some(false),
+                    page_count: None,
+                }),
+            ],
+        };
+
+        let result = evaluate_match(&expr, &signals);
+        assert!(result.matched);
+        assert!(result.reasons.iter().any(|r| r.contains("all: all sub-expressions matched")));
+    }
+
+    /// `any` matches when only the second child matches.
+    #[test]
+    fn test_match_expr_any() {
+        let signals = test_signals();
+        let expr = MatchExpr::Any {
+            any: vec![
+                MatchExpr::Predicate(ExtractionMatchPredicate::TextContains {
+                    patterns: vec!["receipt".to_string()],
+                }),
+                MatchExpr::Predicate(ExtractionMatchPredicate::TextContains {
+                    patterns: vec!["invoice".to_string()],
+                }),
+            ],
+        };
+
+        let result = evaluate_match(&expr, &signals);
+        assert!(result.matched);
+    }
+
+    /// `none` matches when the excluded pattern is absent from the text.
+    #[test]
+    fn test_match_expr_none() {
+        let signals = test_signals();
+        let expr = MatchExpr::None {
+            none: vec![MatchExpr::Predicate(ExtractionMatchPredicate::TextContains {
+                patterns: vec!["abstract".to_string()],
+            })],
+        };
+
+        let result = evaluate_match(&expr, &signals);
+        assert!(result.matched);
+    }
+
+    /// Nested combinators: (invoice OR receipt) AND has_table.
+    #[test]
+    fn test_match_expr_complex() {
+        let signals = test_signals();
+        // (invoice OR receipt) AND has_table
+        let expr = MatchExpr::All {
+            all: vec![
+                MatchExpr::Any {
+                    any: vec![
+                        MatchExpr::Predicate(ExtractionMatchPredicate::TextContains {
+                            patterns: vec!["invoice".to_string()],
+                        }),
+                        MatchExpr::Predicate(ExtractionMatchPredicate::TextContains {
+                            patterns: vec!["receipt".to_string()],
+                        }),
+                    ],
+                },
+                MatchExpr::Predicate(ExtractionMatchPredicate::Structural {
+                    has_table: Some(true),
+                    has_form_field: Some(false),
+                    has_math: Some(false),
+                    page_count: None,
+                }),
+            ],
+        };
+
+        let result = evaluate_match(&expr, &signals);
+        assert!(result.matched);
+    }
+}
--- a/crates/pdftract-core/src/profiles/mod.rs
+++ b/crates/pdftract-core/src/profiles/mod.rs
@ -18,19 +18,35 @@
 //! vocabulary between the rule engine, built-in profile definitions, and
 //! user-authored YAML profiles.

+mod apply_profile;
 mod engine;
+mod extraction;
+mod extraction_loader;
+mod field_extractor;
 mod loader;
+mod match_eval;
 mod signals;
 mod types;

+pub use apply_profile::{apply_extraction_tuning, apply_profile_to_metadata, classify_and_select_profile};
 pub use engine::{
    classify, has_currency_pattern, ClassificationResult, ClassifierEngine, FeatureSignals,
 };
+pub use extraction::{
+    ExtractionProfile, ExtractionTuning, FieldExtraction, FieldSchema, FieldSpec, MatchExpr,
+    ExtractionMatchPredicate,
+};
+pub use extraction_loader::{
+    find_profile, get_xdg_profile_dir, load_extraction_profiles, load_profile_file, ProfileOrigin,
+    ProfileSource, validate_profile_file,
+};
+pub use field_extractor::{extract_profile_fields, FieldExtractionResult};
 pub use loader::{
    check_forbidden_keys, load_profiles_from_dir, ForbiddenKeyError, ProfileLoadError,
 };
+pub use match_eval::{evaluate_match, MatchResult};
 pub use signals::{extract_feature_signals, extract_signals_from_results, PageSignalAccumulator};
-pub use types::{MatchPredicate, Profile, ProfileType};
+pub use types::{MatchPredicate as ClassificationMatchPredicate, Profile, ProfileType};

 use crate::diagnostics::DiagCode;

--- a/profiles/builtin/bank_statement/profile.yaml
+++ b/profiles/builtin/bank_statement/profile.yaml
@ -1,55 +1,64 @@
+# Bank Statement extraction profile
+# Matches bank statements with account info, period, balances, transactions
+name: bank_statement
 description: Bank statement with account info, period, balances, transactions
 priority: 42
+
 match:
-  any:
-    - text_patterns:
-        - "(?i)statement\\s+of\\s+account"
-        - "(?i)bank\\s+statement"
-        - "(?i)account\\s+statement"
-        - "(?i)transaction\\s+history"
-    - text_patterns:
-        - "(?i)opening\\s+balance"
-        - "(?i)closing\\s+balance"
-        - "(?i)statement\\s+period"
-        - "(?i)account\\s*#?\\s*:?\\s*\\*{4,}"
+  all:
+    - any:
+        - text_contains:
+            patterns: ["statement of account", "bank statement", "account statement", "transaction history"]
+        - text_contains:
+            patterns: ["opening balance", "closing balance", "statement period"]
    - structural:
-        - has_monetary_columnar_layout: true
-        - has_date_column: true
-  page_count_hint: 1-10
-profile_fields:
+        has_table: true
+        has_form_field: false
+        has_math: false
+        page_count:
+          min: 1
+          max: 10
+
+extraction:
+  reading_order: line_dominant
+  table_detection: default
+  readability_threshold: 0.5
+  include_invisible: false
+  include_headers_footers: false
+  force_ocr: false
+  min_block_chars: 0
+
+fields:
  account_number:
    type: string
    extraction:
-      patterns:
-        - "(?i)account\\s*(?:number|#|no)?\\s*:?,?\\s*(\\*?\\d[\\d\\*]{3,})"
-        - "(?i)acct\\s*(?:#|:)?\\s*(\\*?\\d[\\d\\*]{3,})"
-      fallback: null
+      regex: "(?i)account\\s*(?:number|#|no)?\\s*:?,?\\s*(\\*?\\d[\\d\\*]{3,})"  # (?i) restored: labels are usually printed as "Account Number"/"ACCOUNT NO"
+      parse: string
+
  statement_period:
    type: string
    extraction:
-      patterns:
-        - "(?i)statement\\s+period\\s*:?.*?([A-Za-z]+\\s+[0-9]{1,2}.*?through.*?[A-Za-z]+\\s+[0-9]{1,2},?\\s+[0-9]{4})"
-        - "(?i)period\\s*:?.*?([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})\\s+(?:to|through|-)\\s+([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
-      fallback: null
+      near: ["Statement Period", "Period"]
+      parse: string
+
  opening_balance:
    type: decimal
    extraction:
-      patterns:
-        - "(?i)opening\\s+balance\\s*:?.*?[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
-        - "(?i)beginning\\s+balance\\s*:?.*?[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
-      fallback: null
+      near: ["Opening Balance", "Beginning Balance"]
+      regex: "([\\d,]+\\.\\d{2})"
+      parse: decimal
+
  closing_balance:
    type: decimal
    extraction:
-      patterns:
-        - "(?i)closing\\s+balance\\s*:?.*?[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
-        - "(?i)ending\\s+balance\\s*:?.*?[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
-        - "(?i)current\\s+balance\\s*:?.*?[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
-      fallback: null
+      near: ["Closing Balance", "Ending Balance", "Current Balance"]
+      regex: "([\\d,]+\\.\\d{2})"
+      parse: decimal
+
  transactions:
    type: array
    extraction:
-      table_region: "largest_table_or_central_body"
+      table_region: largest_table
      schema:
        - name: date
          type: date
@ -64,5 +73,3 @@ profile_fields:
          type: decimal
          required: false
      fallback: []
-reading_order: line_dominant
-zone_filtering: exclude_headers_footers
--- a/profiles/builtin/book_chapter/profile.yaml
+++ b/profiles/builtin/book_chapter/profile.yaml
@ -1,68 +1,63 @@
-# Book Chapter Profile
-#
-# Book chapters, monographs, and long-form narrative documents.
-# Extracts title, chapter_number, author, sections.
-
+# Book Chapter extraction profile
+# Matches book chapters, monographs, and long-form narrative documents
 name: book_chapter
 description: Book chapters, monographs, long-form narrative documents
 priority: 5

-# Matching predicates for book chapter classification
 match:
  all:
-    # Page count in typical chapter range (not a whole book, not a single page)
    - structural:
-        page_count: {min: 5, max: 1000}
-    # Heading depth indicates structured content
-    - structural:
-        heading_depth: {min: 1, max: 5}
-    # AND EITHER: has chapter/section headings
-    # OR: has limited font diversity (not a dense academic paper)
-    # OR: matches chapter/section text patterns
+        has_table: false
+        has_form_field: false
+        has_math: false
+        page_count:
+          min: 5
+          max: 1000
    - any:
-        - text_matches: '^Chapter \d+'
-        - heading_matches: '^(Chapter|Part|Section) \d+'
-        - text_matches: '^\d+\.\s+[A-Z]'
-        - structural:
-            font_diversity: {min: 1, max: 4}
+        - text_matches:
+            pattern: "^Chapter \\d+"
+        - heading_matches:
+            pattern: "^(Chapter|Part|Section) \\d+"
+        - text_matches:
+            pattern: "^\\d+\\.\\s+[A-Z]"
  none:
-    # Exclude more specific document types
-    - text_contains: ['Abstract', 'WHEREAS', 'Invoice', 'Account Statement', 'References']
+    - text_contains:
+        patterns: ["Abstract", "WHEREAS", "Invoice", "Account Statement", "References"]

-# Extraction tuning for book chapters
 extraction:
-  # Use line_dominant reading order for narrative text flow
  reading_order: line_dominant
-  # Default table detection
  table_detection: default
-  # Higher readability threshold for narrative text quality
  readability_threshold: 0.6
-  # Don't include invisible text
  include_invisible: false
-  # Exclude headers, footers, and page numbers from body content
  include_headers_footers: false
+  force_ocr: false
+  min_block_chars: 0

-# Field extraction specifications
 fields:
  title:
    type: string
-    region: top_third
-    pick: largest_font
-    page: first
+    extraction:
+      region: top_third
+      pick: largest_font
+      parse: string

  chapter_number:
    type: string
-    near: ['Chapter', 'Part']
-    regex: '\d+'
-    max_distance_pt: 100
+    extraction:
+      near: ["Chapter", "Part"]
+      regex: "\\d+"
+      max_distance_pt: 100
+      parse: string

  author:
    type: string
-    region: top_quarter
-    pick: smallest_font
-    page: first
+    extraction:
+      region: top_quarter
+      pick: smallest_font
+      parse: string

  sections:
    type: array
-    pick: largest_font
-    per_page: true
+    extraction:
+      pick: largest_font
+      fallback: []
--- a/profiles/builtin/contract/profile.yaml
+++ b/profiles/builtin/contract/profile.yaml
@ -1,38 +1,66 @@
-# Contract profile for legal agreements
-# Extracts parties, effective date, term, governing law, and signatures from contracts
+# Contract extraction profile
+# Matches legal contracts and agreements with parties, effective date, term, governing law, and signatures
 name: contract
 description: Legal contracts and agreements with parties, effective date, term, governing law, and signatures
 priority: 20

-# Matching predicates: identify documents as contracts
 match:
  all:
    - any:
-        - text_contains: ["AGREEMENT", "CONTRACT", "WHEREAS", "NOW THEREFORE", "In witness whereof"]
-        - heading_matches: '^(Agreement|Contract|Memorandum of Understanding)'
-    - structural: {page_count: {min: 2, max: 200}}
+        - text_contains:
+            patterns: ["AGREEMENT", "CONTRACT", "WHEREAS", "NOW THEREFORE", "In witness whereof"]
+        - heading_matches:
+            pattern: "^(Agreement|Contract|Memorandum of Understanding)"
+    - structural:
+        has_table: false
+        has_form_field: false
+        has_math: false
+        page_count:
+          min: 2
+          max: 200
  none:
-    - text_contains: ["Invoice #", "Receipt"]
+    - text_contains:
+        patterns: ["Invoice #", "Receipt"]

-# Extraction tuning for contracts
 extraction:
  reading_order: xy_cut
+  table_detection: off
  readability_threshold: 0.5
+  include_invisible: false
  include_headers_footers: false
+  force_ocr: false
+  min_block_chars: 0

-# Field extractors for contract-specific metadata
 fields:
  parties:
-    near: ["between", "party of the first part", "BY AND BETWEEN"]
-    pick: nearest_below
+    type: string
+    extraction:
+      near: ["between", "party of the first part", "BY AND BETWEEN"]
+      pick: nearest_below
+      parse: string
+
  effective_date:
-    near: ["Effective Date", "Date of Agreement", "as of"]
-    parse: date
+    type: date
+    extraction:
+      near: ["Effective Date", "Date of Agreement", "as of"]
+      parse: date
+
  term:
-    near: ["Term", "Initial Term", "expires on", "shall remain in effect"]
-    regex: '\d+\s+(years?|months?)|expires?\s+\d{4}'
+    type: string
+    extraction:
+      near: ["Term", "Initial Term", "expires on", "shall remain in effect"]
+      regex: "\\d+\\s+(years?|months?)|expires?\\s+\\d{4}"
+      parse: string
+
  governing_law:
-    near: ["Governing Law", "governed by the laws of"]
-    pick: nearest_right
+    type: string
+    extraction:
+      near: ["Governing Law", "governed by the laws of"]
+      pick: nearest_right
+      parse: string
+
  signatures:
-    region: bottom_quarter
+    type: array
+    extraction:
+      region: bottom_quarter
+      fallback: []
--- a/profiles/builtin/form/profile.yaml
+++ b/profiles/builtin/form/profile.yaml
@ -1,18 +1,34 @@
+# Form extraction profile
+# Matches fillable forms with fields; uses line_dominant reading order
+name: form
 description: Fillable form with fields; uses line_dominant reading order and form_fields from Phase 7.4
 priority: 30
+
 match:
-  any:
-    - text_patterns:
-        - "(?i)form\\s*[0-9A-Z-]+"
-        - "(?i)application\\s+form"
-        - "(?i)questionnaire"
-        - "(?i)please\\s+fill\\s+out"
-        - "(?i)required\\s+fields?"
+  all:
+    - any:
+        - text_contains:
+            patterns: ["form", "application form", "questionnaire", "please fill out", "required fields"]
+        - structural:
+            has_table: false
+            has_form_field: true
+            has_math: false
+            page_count: null
    - structural:
-        - has_form_field_layout: true
-        - has_blank_lines_with_colons: true
-  page_count_hint: 1-10
-profile_fields: {}
-reading_order: line_dominant
-zone_filtering: none
-form_fields_integration: true
+        has_table: false
+        has_form_field: null  # unconstrained here; requiring false would reject PDFs with real form fields (assumes false is enforced, not ignored)
+        has_math: false
+        page_count:
+          min: 1
+          max: 10
+
+extraction:
+  reading_order: line_dominant
+  table_detection: off
+  readability_threshold: 0.5
+  include_invisible: false
+  include_headers_footers: true
+  force_ocr: false
+  min_block_chars: 0
+
+fields: {}
--- a/profiles/builtin/invoice/profile.yaml
+++ b/profiles/builtin/invoice/profile.yaml
@ -1,81 +1,104 @@
+# Invoice extraction profile
+# Matches commercial invoices with line items, vendor/customer, and totals
+name: invoice
 description: Commercial invoice with line items, vendor/customer, and totals
 priority: 50
+
 match:
-  any:
-    - text_patterns:
-        - "(?i)invoice"
-        - "(?i)bill to"
-        - "(?i)invoice #"
-        - "(?i)invoice number"
-        - "(?i)tax invoice"
-    - text_patterns:
-        - "(?i)due date"
-        - "(?i)payment terms"
-        - "(?i)purchase order"
-        - "(?i)po #"
-    - structural:
-        - has_line_item_table: true
-  page_count_hint: 1-5
-profile_fields:
+  all:
+    - any:
+        - text_contains:
+            patterns: ["invoice", "bill to", "invoice #", "invoice number", "tax invoice"]
+        - heading_matches:
+            pattern: "^Invoice\\b"
+    - any:
+        - has_currency_pattern:
+            has_currency_pattern: true
+        - structural:
+            has_table: true
+            has_form_field: false
+            has_math: false
+            page_count:
+              min: 1
+              max: 5
+  none:
+    - text_contains:
+        patterns: ["abstract", "bibliography", "scientific paper"]
+
+extraction:
+  reading_order: line_dominant
+  table_detection: strict_borders
+  readability_threshold: 0.4
+  include_invisible: false
+  include_headers_footers: false
+  force_ocr: false
+  min_block_chars: 0
+
+fields:
  invoice_number:
    type: string
    extraction:
-      patterns:
-        - "(?i)invoice\\s*[#:]?\\s*([A-Z0-9-]+)"
-        - "(?i)bill\\s*invoice\\s*[#:]?\\s*([A-Z0-9-]+)"
-      fallback: null
+      regex: "Invoice\\s*#\\s*([\\w-]+)"
+      near: ["Invoice", "Invoice Number", "Invoice #"]
+      max_distance_pt: 200
+      parse: string
+
  vendor:
    type: string
    extraction:
-      patterns:
-        - "(?i)(?:from|vendor|supplier|company)\\s*:?\\s*([A-Z][A-Za-z0-9\\s&]+?)(?=\\n|\\r|$)"
-        - "(?i)^([A-Z][A-Za-z0-9\\s&]+)\\s+(?:Inc|LLC|Ltd|Corp|GmbH)"
-      fallback: null
+      region: top_quarter
+      pick: largest_font
+
  customer:
    type: string
    extraction:
-      patterns:
-        - "(?i)(?:bill\\s*to|customer|client)\\s*:?\\s*([A-Z][A-Za-z0-9\\s&]+?)(?=\\n|\\r|$)"
-      fallback: null
+      near: ["Bill To", "Customer", "Sold To"]
+      max_distance_pt: 150
+      pick: nearest_below
+      parse: string
+
  invoice_date:
    type: date
    extraction:
-      patterns:
-        - "(?i)invoice\\s*date\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
-        - "(?i)date\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
-      fallback: null
+      near: ["Date", "Invoice Date"]
+      max_distance_pt: 100
+      parse: date
+
  due_date:
    type: date
    extraction:
-      patterns:
-        - "(?i)due\\s*date\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
-        - "(?i)payment\\s*due\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
-      fallback: null
+      near: ["Due Date", "Payment Due", "Due"]
+      max_distance_pt: 100
+      parse: date
+
  total:
    type: decimal
    extraction:
-      patterns:
-        - "(?i)total\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
-        - "(?i)amount\\s*due\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
-      fallback: null
+      regex: "([\\d,]+\\.\\d{2})"
+      near: ["Total", "Amount Due", "Balance Due", "Grand Total"]
+      max_distance_pt: 80
+      parse: decimal
+
  subtotal:
    type: decimal
    extraction:
-      patterns:
-        - "(?i)sub\\s*total\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
-      fallback: null
+      regex: "([\\d,]+\\.\\d{2})"
+      near: ["Subtotal", "Sub-Total"]
+      max_distance_pt: 80
+      parse: decimal
+
  tax:
    type: decimal
    extraction:
-      patterns:
-        - "(?i)tax\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
-        - "(?i)vat\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
-        - "(?i)gst\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
-      fallback: null
+      regex: "([\\d,]+\\.\\d{2})"
+      near: ["Tax", "VAT", "GST", "Sales Tax"]
+      max_distance_pt: 80
+      parse: decimal
+
  line_items:
    type: array
    extraction:
-      table_region: "largest_table_or_bottom_half"
+      table_region: largest_table
      schema:
        - name: description
          type: string
@ -90,5 +113,3 @@ profile_fields:
          type: decimal
          required: false
      fallback: []
-reading_order: line_dominant
-zone_filtering: exclude_headers_footers
--- a/profiles/builtin/legal_filing/profile.yaml
+++ b/profiles/builtin/legal_filing/profile.yaml
@ -1,55 +1,62 @@
-# Legal Filing Profile
-#
-# Court filings: motions, briefs, orders, docket entries.
-# Extracts case_number, court, parties, filing_date, docket_entries.
-
+# Legal Filing extraction profile
+# Matches court filings: motions, briefs, orders, docket entries
 name: legal_filing
-description: "Court filings: motions, briefs, orders, docket entries"
+description: "Court filings: motions, briefs, orders, docket entries"
 priority: 40

-# Matching predicates for legal filing classification
 match:
  all:
-    # Must have at least one legal filing marker
    - any:
        - text_contains:
-            ["UNITED STATES DISTRICT COURT", "IN THE COURT OF", "IN THE MATTER OF",
-             "Case No.", "Civil Action No.", "Plaintiff", "Defendant", "Petitioner",
-             "Respondent", "COMPLAINT", "MOTION TO", "ORDER GRANTING", "OPINION"]
-        - heading_matches: '^(COMPLAINT|MOTION|ORDER|OPINION|BRIEF)'
-    # And appropriate page count
-    - structural: {page_count: {min: 1, max: 500}}
+            patterns: ["UNITED STATES DISTRICT COURT", "IN THE COURT OF", "IN THE MATTER OF", "Case No.", "Civil Action No.", "Plaintiff", "Defendant", "Petitioner", "Respondent", "COMPLAINT", "MOTION TO", "ORDER GRANTING", "OPINION"]
+        - heading_matches:
+            pattern: "^(COMPLAINT|MOTION|ORDER|OPINION|BRIEF)"
+    - structural:
+        has_table: false
+        has_form_field: false
+        has_math: false
+        page_count:
+          min: 1
+          max: 500

-# Extraction tuning for legal filings
 extraction:
-  # Use xy_cut reading order for complex layouts
  reading_order: xy_cut
-  # Default table detection
  table_detection: default
-  # Standard readability threshold
  readability_threshold: 0.5
-  # Include headers and footers (page numbers and citations are load-bearing in legal docs)
-  include_headers_footers: true
-  # Don't include invisible text
  include_invisible: false
+  include_headers_footers: true
+  force_ocr: false
+  min_block_chars: 0

-# Field extraction specifications
 fields:
  case_number:
-    near: ["Case No.", "Civil Action No.", "Docket No.", "Cause No."]
-    regex: '[\w-]+:?\s*\d+[\w-]*'
-    parse: string
+    type: string
+    extraction:
+      near: ["Case No.", "Civil Action No.", "Docket No.", "Cause No."]
+      regex: "[\\w-]+:?\\s*\\d+[\\w-]*"
+      parse: string

  court:
-    region: top_quarter
-    pick: largest_font
+    type: string
+    extraction:
+      region: top_quarter
+      pick: largest_font
+      parse: string

  parties:
-    near: ["Plaintiff", "Defendant", "Petitioner", "Respondent", "v."]
+    type: array
+    extraction:
+      near: ["Plaintiff", "Defendant", "Petitioner", "Respondent", "v."]
+      fallback: []

  filing_date:
-    near: ["Filed", "Date Filed", "Dated"]
-    parse: date
+    type: date
+    extraction:
+      near: ["Filed", "Date Filed", "Dated"]
+      parse: date

  docket_entries:
-    region: full
+    type: array
+    extraction:
+      region: bottom_half
+      fallback: []
--- a/profiles/builtin/receipt/profile.yaml
+++ b/profiles/builtin/receipt/profile.yaml
@ -1,52 +1,67 @@
+# Receipt extraction profile
+# Matches point-of-sale or purchase receipts with items and payment method
+name: receipt
 description: Point-of-sale or purchase receipt with items, payment method
 priority: 45
+
 match:
-  any:
-    - text_patterns:
-        - "(?i)receipt"
-        - "(?i)store receipt"
-        - "(?i)register receipt"
-        - "(?i)transaction receipt"
-    - text_patterns:
-        - "(?i)total.*sold"
-        - "(?i)change.*due"
-        - "(?i)cash.*credit"
-        - "(?i)card.*payment"
+  all:
+    - any:
+        - text_contains:
+            patterns: ["receipt", "store receipt", "register receipt", "transaction receipt"]
+        - text_contains:
+            patterns: ["total sold", "change due", "cash credit", "card payment"]
    - structural:
-        - has_monetary_columnar_layout: true
-        - page_aspect_ratio: "narrow_or_square"
-  page_count_hint: 1
-profile_fields:
+        has_table: true
+        has_form_field: false
+        has_math: false
+        page_count:
+          min: 1
+          max: 2
+
+extraction:
+  reading_order: line_dominant
+  table_detection: default
+  readability_threshold: 0.5
+  include_invisible: false
+  include_headers_footers: false
+  force_ocr: false
+  min_block_chars: 0
+
+fields:
  merchant:
    type: string
    extraction:
-      patterns:
-        - "(?i)^([A-Z][A-Za-z0-9\\s&']+)$"
-        - "(?i)(?:store|merchant|retailer)\\s*:?\\s*([A-Z][A-Za-z0-9\\s&']+)"
-      fallback: null
+      region: top_quarter
+      pick: largest_font
+      parse: string
+
  date:
    type: date
    extraction:
-      patterns:
-        - "(?i)date\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
-        - "([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})\\s+([0-9]{1,2}:[0-9]{2})"
-      fallback: null
+      regex: "\\d{1,2}[/-]\\d{1,2}[/-]\\d{2,4}"
+      parse: date
+
  total:
    type: decimal
    extraction:
-      patterns:
-        - "(?i)total\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
-      fallback: null
+      regex: "([\\d,]+\\.\\d{2})"
+      near: ["Total", "Amount Due", "Balance"]
+      max_distance_pt: 80
+      parse: decimal
+
  tax:
    type: decimal
    extraction:
-      patterns:
-        - "(?i)tax\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
-      fallback: null
+      regex: "([\\d,]+\\.\\d{2})"
+      near: ["Tax", "VAT"]
+      max_distance_pt: 80
+      parse: decimal
+
  items:
    type: array
    extraction:
-      columnar_regions: "monetary_columns"
+      table_region: largest_table
      schema:
        - name: name
          type: string
@ -58,11 +73,9 @@ profile_fields:
          type: decimal
          required: false
      fallback: []
+
  payment_method:
    type: string
    extraction:
-      patterns:
-        - "(?i)(cash|credit|debit|visa|mastercard|amex|discover|check|cheque)"
-      fallback: null
-reading_order: line_dominant
-zone_filtering: exclude_headers_footers
+      regex: "(?i)(cash|credit|debit|visa|mastercard|amex|discover|check|cheque)"  # (?i) restored: receipts print "VISA", "Cash", "CREDIT", etc.
+      parse: string
--- a/profiles/builtin/scientific_paper/profile.yaml
+++ b/profiles/builtin/scientific_paper/profile.yaml
@ -1,66 +1,87 @@
-# Scientific Paper Profile
-#
-# Academic papers from arXiv, journals, conference proceedings.
-# Extracts title, authors, abstract, DOI, journal, publication_date, references.
-
+# Scientific Paper extraction profile
+# Matches academic papers from arXiv, journals, conference proceedings
 name: scientific_paper
 description: Academic papers from arXiv, journals, conference proceedings
 priority: 30

-# Matching predicates for scientific paper classification
 match:
  all:
-    # Must have at least one scientific paper marker
    - any:
-        - text_contains: ["Abstract", "References", "doi:", "arXiv:", "Bibliography"]
-        - heading_matches: '^(Abstract|Introduction|References|Bibliography)'
-    # And either has math OR structured headings OR appropriate page count
+        - text_contains:
+            patterns: ["Abstract", "References", "doi:", "arXiv:", "Bibliography"]
+        - heading_matches:
+            pattern: "^(Abstract|Introduction|References|Bibliography)"
    - any:
        - structural:
+            has_table: false
+            has_form_field: false
            has_math: true
+            page_count: null
        - structural:
-            heading_depth: {min: 2}
-        - structural:
-            page_count: {min: 4, max: 50}
+            has_table: false
+            has_form_field: false
+            has_math: false
+            page_count:
+              min: 4
+              max: 50
+  none:
+    - text_contains:
+        patterns: ["Invoice", "Receipt", "WHEREAS", "NOW THEREFORE"]

-# Extraction tuning for scientific papers
 extraction:
-  # Use xy_cut reading order for 2-column layout handling
  reading_order: xy_cut
-  # Default table detection
  table_detection: default
-  # Standard readability threshold
  readability_threshold: 0.5
-  # Don't include invisible text
  include_invisible: false
+  include_headers_footers: false
+  force_ocr: false
+  min_block_chars: 0

-# Field extraction specifications
 fields:
  title:
-    region: top_quarter
-    pick: largest_font
+    type: string
+    extraction:
+      region: top_quarter
+      pick: largest_font
+      parse: string

  authors:
-    region: top_quarter
-    pick: nearest_below
-    after: title
+    type: array
+    extraction:
+      region: top_quarter
+      pick: nearest_below
+      after_heading: title
+      fallback: []

  abstract:
-    near: ["Abstract"]
-    region: top_half
+    type: string
+    extraction:
+      near: ["Abstract"]
+      region: top_half
+      parse: string

  doi:
-    regex: 'doi[:\.]\s*(10\.\d{4,9}/[\w\-\._;()/:]+)'
-    parse: string
+    type: string
+    extraction:
+      regex: "doi[:\\.]\\s*(10\\.\\d{4,9}/[\\w\\-\\._;()/:]+)"
+      parse: string

  journal:
-    region: top_eighth
-    pick: first
+    type: string
+    extraction:
+      region: top_eighth
+      pick: first
+      parse: string

  publication_date:
-    near: ["Published", "Received", "Accepted"]
-    parse: date
+    type: date
+    extraction:
+      near: ["Published", "Received", "Accepted"]
+      parse: date

  references:
-    region: bottom_half
-    after_heading: References
+    type: array
+    extraction:
+      region: bottom_half
+      after_heading: References
+      fallback: []
--- a/profiles/builtin/slide_deck/profile.yaml
+++ b/profiles/builtin/slide_deck/profile.yaml
@ -1,64 +1,59 @@
-# Slide Deck Profile
-#
-# PowerPoint / Keynote / Google Slides exports as PDF.
-# Extracts title, presenter, date, slide_titles.
-
+# Slide Deck extraction profile
+# Matches PowerPoint / Keynote / Google Slides exports as PDF
 name: slide_deck
 description: PowerPoint / Keynote / Google Slides exports as PDF
 priority: 15

-# Matching predicates for slide deck classification
 match:
  all:
-    # Page count in typical slide deck range
    - structural:
-        page_count: {min: 3, max: 200}
-    # And EITHER: has limited font diversity (not a dense academic paper)
-    # OR: contains "Slide N" patterns
-    # OR: contains slide deck keywords
+        has_table: false
+        has_form_field: false
+        has_math: false
+        page_count:
+          min: 3
+          max: 200
    - any:
-        - structural:
-            has_form_field: false
-            font_diversity: {min: 2, max: 10}
-        - text_matches: '^Slide \d+$'
-        - text_contains: ["slides", "presentation"]
+        - text_matches:
+            pattern: "^Slide \\d+$"
+        - text_contains:
+            patterns: ["slides", "presentation"]
  none:
-    # Exclude academic papers (these have their own profile)
-    - text_contains: ["Abstract", "References", "WHEREAS", "Invoice"]
+    - text_contains:
+        patterns: ["Abstract", "References", "WHEREAS", "Invoice"]

-# Extraction tuning for slide decks
 extraction:
-  # Use xy_cut reading order for proper layout handling
  reading_order: xy_cut
-  # Default table detection
  table_detection: default
-  # Lower readability threshold for slides (less text density)
  readability_threshold: 0.6
-  # Don't include invisible text
  include_invisible: false
-  # Minimum block characters
+  include_headers_footers: false
+  force_ocr: false
  min_block_chars: 5

-# Field extraction specifications
 fields:
  title:
    type: string
-    region: middle_half
-    pick: largest_font
-    page: first
+    extraction:
+      region: top_half
+      pick: largest_font
+      parse: string

  presenter:
    type: string
-    region: bottom_half
-    pick: largest_font
-    page: first
+    extraction:
+      region: bottom_half  # presenter sits below the title; top_half + largest_font would just duplicate the title field
+      pick: largest_font
+      parse: string

  date:
    type: date
-    near: ["Date"]
-    parse: date
+    extraction:
+      near: ["Date"]
+      parse: date

  slide_titles:
    type: array
-    pick: largest_font
-    per_page: true
+    extraction:
+      pick: largest_font
+      fallback: []
--- a/tests/fixtures/profiles/invoice/01.pdf
+++ b/tests/fixtures/profiles/invoice/01.pdf
@ -0,0 +1 @@
+../../classifier/invoice/01.pdf
--- a/tests/fixtures/profiles/invoice/02.pdf
+++ b/tests/fixtures/profiles/invoice/02.pdf
@ -0,0 +1 @@
+../../classifier/invoice/02.pdf
--- a/tests/fixtures/profiles/invoice/03.pdf
+++ b/tests/fixtures/profiles/invoice/03.pdf
@ -0,0 +1 @@
+../../classifier/invoice/03.pdf
--- a/tests/fixtures/profiles/invoice/04.pdf
+++ b/tests/fixtures/profiles/invoice/04.pdf
@ -0,0 +1 @@
+../../classifier/invoice/04.pdf
--- a/tests/fixtures/profiles/invoice/05.pdf
+++ b/tests/fixtures/profiles/invoice/05.pdf
@ -0,0 +1 @@
+../../classifier/invoice/05.pdf
--- a/tests/fixtures/profiles/invoice/06.pdf
+++ b/tests/fixtures/profiles/invoice/06.pdf
@ -0,0 +1 @@
+../../classifier/invoice/06.pdf
--- a/tests/fixtures/profiles/invoice/07.pdf
+++ b/tests/fixtures/profiles/invoice/07.pdf
@ -0,0 +1 @@
+../../classifier/invoice/07.pdf
--- a/tests/fixtures/profiles/invoice/08.pdf
+++ b/tests/fixtures/profiles/invoice/08.pdf
@ -0,0 +1 @@
+../../classifier/invoice/08.pdf
--- a/tests/fixtures/profiles/invoice/09.pdf
+++ b/tests/fixtures/profiles/invoice/09.pdf
@ -0,0 +1 @@
+../../classifier/invoice/09.pdf
--- a/tests/fixtures/profiles/invoice/10.pdf
+++ b/tests/fixtures/profiles/invoice/10.pdf
@ -0,0 +1 @@
+../../classifier/invoice/10.pdf
--- a/tests/fixtures/profiles/invoice/11.pdf
+++ b/tests/fixtures/profiles/invoice/11.pdf
@ -0,0 +1 @@
+../../classifier/invoice/11.pdf
--- a/tests/fixtures/profiles/invoice/12.pdf
+++ b/tests/fixtures/profiles/invoice/12.pdf
@ -0,0 +1 @@
+../../classifier/invoice/12.pdf
--- a/tests/fixtures/profiles/invoice/13.pdf
+++ b/tests/fixtures/profiles/invoice/13.pdf
@ -0,0 +1 @@
+../../classifier/invoice/13.pdf
--- a/tests/fixtures/profiles/invoice/14.pdf
+++ b/tests/fixtures/profiles/invoice/14.pdf
@ -0,0 +1 @@
+../../classifier/invoice/14.pdf
--- a/tests/fixtures/profiles/invoice/15.pdf
+++ b/tests/fixtures/profiles/invoice/15.pdf
@ -0,0 +1 @@
+../../classifier/invoice/15.pdf
--- a/tests/fixtures/profiles/invoice/16.pdf
+++ b/tests/fixtures/profiles/invoice/16.pdf
@ -0,0 +1 @@
+../../classifier/invoice/16.pdf
--- a/tests/fixtures/profiles/invoice/17.pdf
+++ b/tests/fixtures/profiles/invoice/17.pdf
@ -0,0 +1 @@
+../../classifier/invoice/17.pdf
--- a/tests/fixtures/profiles/invoice/18.pdf
+++ b/tests/fixtures/profiles/invoice/18.pdf
@ -0,0 +1 @@
+../../classifier/invoice/18.pdf
--- a/tests/fixtures/profiles/invoice/19.pdf
+++ b/tests/fixtures/profiles/invoice/19.pdf
@ -0,0 +1 @@
+../../classifier/invoice/19.pdf
--- a/tests/fixtures/profiles/invoice/20.pdf
+++ b/tests/fixtures/profiles/invoice/20.pdf
@ -0,0 +1 @@
+../../classifier/invoice/20.pdf
--- a/tests/fixtures/profiles/invoice/21.pdf
+++ b/tests/fixtures/profiles/invoice/21.pdf
@ -0,0 +1 @@
+../../classifier/invoice/21.pdf
--- a/tests/fixtures/profiles/invoice/22.pdf
+++ b/tests/fixtures/profiles/invoice/22.pdf
@ -0,0 +1 @@
+../../classifier/invoice/22.pdf
--- a/tests/fixtures/profiles/invoice/23.pdf
+++ b/tests/fixtures/profiles/invoice/23.pdf
@ -0,0 +1 @@
+../../classifier/invoice/23.pdf
--- a/tests/fixtures/profiles/invoice/24.pdf
+++ b/tests/fixtures/profiles/invoice/24.pdf
@ -0,0 +1 @@
+../../classifier/invoice/24.pdf
--- a/tests/fixtures/profiles/invoice/25.pdf
+++ b/tests/fixtures/profiles/invoice/25.pdf
@ -0,0 +1 @@
+../../classifier/invoice/25.pdf
--- a/tests/fixtures/profiles/invoice/26.pdf
+++ b/tests/fixtures/profiles/invoice/26.pdf
@ -0,0 +1 @@
+../../classifier/invoice/26.pdf
--- a/tests/fixtures/profiles/invoice/27.pdf
+++ b/tests/fixtures/profiles/invoice/27.pdf
@ -0,0 +1 @@
+../../classifier/invoice/27.pdf
--- a/tests/fixtures/profiles/invoice/28.pdf
+++ b/tests/fixtures/profiles/invoice/28.pdf
@ -0,0 +1 @@
+../../classifier/invoice/28.pdf
--- a/tests/fixtures/profiles/invoice/29.pdf
+++ b/tests/fixtures/profiles/invoice/29.pdf
@ -0,0 +1 @@
+../../classifier/invoice/29.pdf
--- a/tests/fixtures/profiles/invoice/30.pdf
+++ b/tests/fixtures/profiles/invoice/30.pdf
@ -0,0 +1 @@
+../../classifier/invoice/30.pdf
--- a/tests/fixtures/profiles/invoice/31.pdf
+++ b/tests/fixtures/profiles/invoice/31.pdf
@ -0,0 +1 @@
+../../classifier/invoice/31.pdf
--- a/tests/fixtures/profiles/invoice/32.pdf
+++ b/tests/fixtures/profiles/invoice/32.pdf
@ -0,0 +1 @@
+../../classifier/invoice/32.pdf
--- a/tests/fixtures/profiles/invoice/33.pdf
+++ b/tests/fixtures/profiles/invoice/33.pdf
@ -0,0 +1 @@
+../../classifier/invoice/33.pdf
--- a/tests/fixtures/profiles/invoice/34.pdf
+++ b/tests/fixtures/profiles/invoice/34.pdf
@ -0,0 +1 @@
+../../classifier/invoice/34.pdf
--- a/tests/fixtures/profiles/invoice/35.pdf
+++ b/tests/fixtures/profiles/invoice/35.pdf
@ -0,0 +1 @@
+../../classifier/invoice/35.pdf
--- a/tests/fixtures/profiles/invoice/36.pdf
+++ b/tests/fixtures/profiles/invoice/36.pdf
@ -0,0 +1 @@
+../../classifier/invoice/36.pdf
--- a/tests/fixtures/profiles/invoice/37.pdf
+++ b/tests/fixtures/profiles/invoice/37.pdf
@ -0,0 +1 @@
+../../classifier/invoice/37.pdf
--- a/tests/fixtures/profiles/invoice/38.pdf
+++ b/tests/fixtures/profiles/invoice/38.pdf
@ -0,0 +1 @@
+../../classifier/invoice/38.pdf
--- a/tests/fixtures/profiles/invoice/39.pdf
+++ b/tests/fixtures/profiles/invoice/39.pdf
@ -0,0 +1 @@
+../../classifier/invoice/39.pdf
--- a/tests/fixtures/profiles/invoice/40.pdf
+++ b/tests/fixtures/profiles/invoice/40.pdf
@ -0,0 +1 @@
+../../classifier/invoice/40.pdf
--- a/tests/fixtures/profiles/invoice/41.pdf
+++ b/tests/fixtures/profiles/invoice/41.pdf
@ -0,0 +1 @@
+../../classifier/invoice/41.pdf
--- a/tests/fixtures/profiles/invoice/42.pdf
+++ b/tests/fixtures/profiles/invoice/42.pdf
@ -0,0 +1 @@
+../../classifier/invoice/42.pdf
--- a/tests/fixtures/profiles/invoice/43.pdf
+++ b/tests/fixtures/profiles/invoice/43.pdf
@ -0,0 +1 @@
+../../classifier/invoice/43.pdf
--- a/tests/fixtures/profiles/invoice/44.pdf
+++ b/tests/fixtures/profiles/invoice/44.pdf
@ -0,0 +1 @@
+../../classifier/invoice/44.pdf
--- a/tests/fixtures/profiles/invoice/45.pdf
+++ b/tests/fixtures/profiles/invoice/45.pdf
@ -0,0 +1 @@
+../../classifier/invoice/45.pdf
--- a/tests/fixtures/profiles/invoice/46.pdf
+++ b/tests/fixtures/profiles/invoice/46.pdf
@ -0,0 +1 @@
+../../classifier/invoice/46.pdf
--- a/tests/fixtures/profiles/invoice/47.pdf
+++ b/tests/fixtures/profiles/invoice/47.pdf
@ -0,0 +1 @@
+../../classifier/invoice/47.pdf
--- a/tests/fixtures/profiles/invoice/48.pdf
+++ b/tests/fixtures/profiles/invoice/48.pdf
@ -0,0 +1 @@
+../../classifier/invoice/48.pdf
--- a/tests/fixtures/profiles/invoice/49.pdf
+++ b/tests/fixtures/profiles/invoice/49.pdf
@ -0,0 +1 @@
+../../classifier/invoice/49.pdf
--- a/tests/fixtures/profiles/invoice/50.pdf
+++ b/tests/fixtures/profiles/invoice/50.pdf
@ -0,0 +1 @@
+../../classifier/invoice/50.pdf
--- a/tests/fixtures/profiles/receipt/tampered-receipt.pdf
+++ b/tests/fixtures/profiles/receipt/tampered-receipt.pdf
@ -0,0 +1 @@
+../../../sdk-conformance/fixtures/receipts/tampered-receipt.pdf
--- a/tests/fixtures/profiles/receipt/valid-receipt.pdf
+++ b/tests/fixtures/profiles/receipt/valid-receipt.pdf
@ -0,0 +1 @@
+../../../sdk-conformance/fixtures/receipts/valid-receipt.pdf
				`@ -0,0 +1 @@`
				`../../../sdk-conformance/fixtures/receipts/tampered-receipt.pdf`