pdftract/crates/pdftract-core/src/profiles/mod.rs
jedarden 80dbf0f703 feat(profiles): add profile infrastructure and initial fixtures
- Add profile source modules: apply_profile, extraction, extraction_loader, field_extractor, match_eval
- Add profiles CLI subcommand (profiles_cmd.rs)
- Update all 9 built-in profile YAMLs (invoice, receipt, contract, scientific_paper, slide_deck, form, bank_statement, legal_filing, book_chapter)
- Add 50 invoice fixture PDFs
- Add 2 receipt fixture PDFs

Part of: pdftract-3a310 (Phase 7.10 coordinator)
2026-05-31 15:10:51 -04:00

206 lines
7.1 KiB
Rust

//! Profile loading and validation.
//!
//! This module provides functionality for loading and validating extraction
//! profiles from YAML files. Profiles define extraction options, field mappings,
//! and output formatting rules.
//!
//! # Security
//!
//! Profile files are checked for forbidden secret keys (password, token, secret,
//! api_key, etc.) to prevent accidental publication of credentials in profiles
//! that are checked into source control. See [`check_forbidden_keys`] and
//! [`ForbiddenKeyError`] for details.
//!
//! # Document Type Profiles
//!
//! The core types for document type classification (Phase 5.6) are
//! [`ProfileType`], [`Profile`], and [`MatchPredicate`]. These are the shared
//! vocabulary between the rule engine, built-in profile definitions, and
//! user-authored YAML profiles.
mod apply_profile;
mod engine;
mod extraction;
mod extraction_loader;
mod field_extractor;
mod loader;
mod match_eval;
mod signals;
mod types;
pub use apply_profile::{apply_extraction_tuning, apply_profile_to_metadata, classify_and_select_profile};
pub use engine::{
classify, has_currency_pattern, ClassificationResult, ClassifierEngine, FeatureSignals,
};
pub use extraction::{
ExtractionProfile, ExtractionTuning, FieldExtraction, FieldSchema, FieldSpec, MatchExpr,
ExtractionMatchPredicate,
};
pub use extraction_loader::{
find_profile, get_xdg_profile_dir, load_extraction_profiles, load_profile_file, ProfileOrigin,
ProfileSource, validate_profile_file,
};
pub use field_extractor::{extract_profile_fields, FieldExtractionResult};
pub use loader::{
check_forbidden_keys, load_profiles_from_dir, ForbiddenKeyError, ProfileLoadError,
};
pub use match_eval::{evaluate_match, MatchResult};
pub use signals::{extract_feature_signals, extract_signals_from_results, PageSignalAccumulator};
pub use types::{MatchPredicate as ClassificationMatchPredicate, Profile, ProfileType};
use crate::diagnostics::DiagCode;
/// Diagnostic code for forbidden secret keys in profiles.
///
/// This constant is an alias for [`DiagCode::ProfileSecretsForbidden`], exposed
/// from the profiles module under a profile-specific name.
///
/// Emitted when a profile YAML contains keys that suggest credentials or secrets.
/// This is a security measure to prevent accidental publication of secrets in
/// profile files checked into source control.
///
/// NOTE(review): the code that emits this diagnostic is not in this file;
/// presumably it is the forbidden-key check in the `loader` module — confirm.
pub const PROFILE_SECRETS_FORBIDDEN: DiagCode = DiagCode::ProfileSecretsForbidden;
/// Load the built-in classification profiles.
///
/// The profile YAML files are embedded at compile time via `include_str!`
/// and parsed into [`Profile`] structs with `serde_yaml`, so the built-ins
/// are bundled into the binary and need no external files at runtime.
///
/// # Feature Gate
///
/// This variant is compiled only when the `profiles` feature is enabled.
/// When the feature is disabled, a fallback `load_builtins` with the same
/// signature is compiled instead and returns an empty vector.
///
/// # Returns
///
/// The successfully parsed built-in profiles, in embedding order: invoice,
/// receipt, contract, scientific_paper, slide_deck, form, bank_statement,
/// legal_filing, and book_chapter.
///
/// # Parse failures
///
/// Loading is best-effort. A built-in whose YAML fails to parse is reported
/// on stderr together with its name and then skipped, so the returned vector
/// can hold fewer than nine entries.
#[cfg(feature = "profiles")]
pub fn load_builtins() -> Vec<Profile> {
    // Each embedded document is paired with its name so that a parse failure
    // can say which built-in is broken.
    let builtins = [
        (
            "invoice",
            include_str!("../../../../profiles/builtin/classification/invoice.yaml"),
        ),
        (
            "receipt",
            include_str!("../../../../profiles/builtin/classification/receipt.yaml"),
        ),
        (
            "contract",
            include_str!("../../../../profiles/builtin/classification/contract.yaml"),
        ),
        (
            "scientific_paper",
            include_str!("../../../../profiles/builtin/classification/scientific_paper.yaml"),
        ),
        (
            "slide_deck",
            include_str!("../../../../profiles/builtin/classification/slide_deck.yaml"),
        ),
        (
            "form",
            include_str!("../../../../profiles/builtin/classification/form.yaml"),
        ),
        (
            "bank_statement",
            include_str!("../../../../profiles/builtin/classification/bank_statement.yaml"),
        ),
        (
            "legal_filing",
            include_str!("../../../../profiles/builtin/classification/legal_filing.yaml"),
        ),
        (
            "book_chapter",
            include_str!("../../../../profiles/builtin/classification/book_chapter.yaml"),
        ),
    ];

    let mut loaded = Vec::with_capacity(builtins.len());
    for (name, yaml) in builtins {
        match serde_yaml::from_str::<Profile>(yaml) {
            Ok(profile) => loaded.push(profile),
            Err(err) => {
                // One malformed built-in must not prevent the others from
                // loading, so report it and carry on.
                // In production, this might use tracing::error!
                eprintln!("Failed to parse built-in profile '{}': {}", name, err);
            }
        }
    }
    loaded
}
/// Fallback `load_builtins` for builds without the `profiles` feature.
///
/// This variant embeds no profile YAML, so there are no built-in profiles to
/// offer and the result is always an empty vector.
#[cfg(not(feature = "profiles"))]
pub fn load_builtins() -> Vec<Profile> {
    vec![]
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Every embedded YAML document must parse; a skipped one shows up here
    /// as a count below nine.
    #[cfg(feature = "profiles")]
    #[test]
    fn test_load_builtins_returns_all_nine_profiles() {
        assert_eq!(load_builtins().len(), 9, "Expected 9 built-in profiles");
    }

    /// Each `ProfileType` variant listed below must be represented among the
    /// loaded built-ins.
    #[cfg(feature = "profiles")]
    #[test]
    fn test_load_builtins_contains_all_profile_types() {
        let loaded = load_builtins();
        let present: Vec<_> = loaded.iter().map(|profile| profile.profile_type).collect();

        // Checked in this order, so the first missing type is the one reported.
        let required = [
            (ProfileType::Invoice, "Missing invoice profile"),
            (ProfileType::Receipt, "Missing receipt profile"),
            (ProfileType::Contract, "Missing contract profile"),
            (
                ProfileType::ScientificPaper,
                "Missing scientific_paper profile",
            ),
            (ProfileType::SlideDeck, "Missing slide_deck profile"),
            (ProfileType::Form, "Missing form profile"),
            (ProfileType::BankStatement, "Missing bank_statement profile"),
            (ProfileType::LegalFiling, "Missing legal_filing profile"),
            (ProfileType::BookChapter, "Missing book_chapter profile"),
        ];
        for (wanted, complaint) in required {
            assert!(present.contains(&wanted), "{}", complaint);
        }
    }

    /// A built-in threshold must lie in the half-open interval (0, 1].
    #[cfg(feature = "profiles")]
    #[test]
    fn test_load_builtins_profiles_have_valid_thresholds() {
        let loaded = load_builtins();
        for builtin in loaded.iter() {
            let in_range = builtin.threshold > 0.0 && builtin.threshold <= 1.0;
            assert!(
                in_range,
                "Profile '{}' has invalid threshold {}",
                builtin.name, builtin.threshold
            );
        }
    }

    /// Every built-in must declare at least one predicate.
    #[cfg(feature = "profiles")]
    #[test]
    fn test_load_builtins_profiles_have_predicates() {
        let loaded = load_builtins();
        loaded.iter().for_each(|builtin| {
            let has_predicates = !builtin.predicates.is_empty();
            assert!(
                has_predicates,
                "Profile '{}' has no predicates",
                builtin.name
            );
        });
    }

    /// Without the `profiles` feature the fallback must yield nothing.
    #[cfg(not(feature = "profiles"))]
    #[test]
    fn test_load_builtins_returns_empty_when_disabled() {
        let loaded_count = load_builtins().len();
        assert_eq!(
            loaded_count, 0,
            "Expected no profiles when feature is disabled"
        );
    }
}