feat(pdftract-51bk): implement ProfileType, Profile, MatchPredicate types
- Add ProfileType enum with 10 variants (invoice, receipt, contract, etc.) - Add Profile struct with name, type, predicates, threshold (default 0.6) - Add MatchPredicate enum with 12 predicate kinds (text_contains, text_matches_regex, structural_has_table, etc.) - All types support serde YAML serialization/deserialization - ProfileType uses snake_case for YAML compatibility - MatchPredicate uses tagged enum representation (kind field) - Comprehensive unit tests for all variants and roundtrip serialization Closes: pdftract-51bk
This commit is contained in:
parent
b96c3bfd37
commit
7df83c64dd
2 changed files with 597 additions and 0 deletions
|
|
@ -9,10 +9,19 @@
|
||||||
//! Profile files are checked for forbidden secret keys (password, token, secret,
|
//! Profile files are checked for forbidden secret keys (password, token, secret,
|
||||||
//! api_key, etc.) to prevent accidental publication of credentials in profiles
|
//! api_key, etc.) to prevent accidental publication of credentials in profiles
|
||||||
//! that are checked into source control. See [`ProfileSecretsForbidden`] for details.
|
//! that are checked into source control. See [`ProfileSecretsForbidden`] for details.
|
||||||
|
//!
|
||||||
|
//! # Document Type Profiles
|
||||||
|
//!
|
||||||
|
//! The [`types`] module defines the core types for document type classification
|
||||||
|
//! (Phase 5.6): [`ProfileType`], [`Profile`], and [`MatchPredicate`]. These
|
||||||
|
//! are the shared vocabulary between the rule engine, built-in profile definitions,
|
||||||
|
//! and user-authored YAML profiles.
|
||||||
|
|
||||||
mod loader;
|
mod loader;
|
||||||
|
mod types;
|
||||||
|
|
||||||
pub use loader::{check_forbidden_keys, ForbiddenKeyError, ProfileLoadError};
|
pub use loader::{check_forbidden_keys, ForbiddenKeyError, ProfileLoadError};
|
||||||
|
pub use types::{MatchPredicate, Profile, ProfileType};
|
||||||
|
|
||||||
use crate::diagnostics::DiagCode;
|
use crate::diagnostics::DiagCode;
|
||||||
|
|
||||||
|
|
|
||||||
588
crates/pdftract-core/src/profiles/types.rs
Normal file
588
crates/pdftract-core/src/profiles/types.rs
Normal file
|
|
@ -0,0 +1,588 @@
|
||||||
|
//! Document type profile types.
|
||||||
|
//!
|
||||||
|
//! This module defines the core types for document type classification (Phase 5.6).
|
||||||
|
//! These types are shared between the rule engine, built-in profile definitions,
|
||||||
|
//! and user-authored YAML profiles.
|
||||||
|
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
/// Document type profile.
|
||||||
|
///
|
||||||
|
/// Represents a document type (invoice, receipt, contract, etc.) with matching
|
||||||
|
/// predicates that determine whether a document matches this type.
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct Profile {
|
||||||
|
/// Profile name (e.g., "Standard Invoice", "Simple Receipt").
|
||||||
|
pub name: String,
|
||||||
|
|
||||||
|
/// Document type category.
|
||||||
|
#[serde(rename = "type")]
|
||||||
|
pub profile_type: ProfileType,
|
||||||
|
|
||||||
|
/// Matching predicates that determine if a document matches this profile.
|
||||||
|
pub predicates: Vec<MatchPredicate>,
|
||||||
|
|
||||||
|
/// Confidence threshold [0.0, 1.0] for this profile to match.
|
||||||
|
/// Default is 0.6. A profile only matches if the sum of predicate
|
||||||
|
/// weights that fire exceeds this threshold.
|
||||||
|
#[serde(default = "default_threshold")]
|
||||||
|
pub threshold: f32,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Serde fallback for [`Profile::threshold`] when the key is omitted.
fn default_threshold() -> f32 {
    const DEFAULT_THRESHOLD: f32 = 0.6;
    DEFAULT_THRESHOLD
}
|
||||||
|
|
||||||
|
/// Document type category.
|
||||||
|
///
|
||||||
|
/// Represents the high-level classification of a document. These are the
|
||||||
|
/// built-in types that pdftract supports. User-defined profiles can extend
|
||||||
|
/// this set in Phase 7.10.
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||||
|
#[serde(rename_all = "snake_case")]
|
||||||
|
pub enum ProfileType {
|
||||||
|
/// Invoice document (commercial transaction request for payment).
|
||||||
|
Invoice,
|
||||||
|
|
||||||
|
/// Receipt document (proof of payment).
|
||||||
|
Receipt,
|
||||||
|
|
||||||
|
/// Contract document (legal agreement between parties).
|
||||||
|
Contract,
|
||||||
|
|
||||||
|
/// Scientific paper (academic research article with abstract, references).
|
||||||
|
ScientificPaper,
|
||||||
|
|
||||||
|
/// Slide deck (presentation slides, typically PowerPoint/PDF export).
|
||||||
|
SlideDeck,
|
||||||
|
|
||||||
|
/// Form document (fillable fields, structured data entry).
|
||||||
|
Form,
|
||||||
|
|
||||||
|
/// Bank statement (financial account statement).
|
||||||
|
BankStatement,
|
||||||
|
|
||||||
|
/// Legal filing (court document, legal filing).
|
||||||
|
LegalFiling,
|
||||||
|
|
||||||
|
/// Book chapter (excerpt from a book, with chapter structure).
|
||||||
|
BookChapter,
|
||||||
|
|
||||||
|
/// Unknown document type (fallback when no profile matches).
|
||||||
|
Unknown,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ProfileType {
|
||||||
|
/// Get the string representation of this profile type.
|
||||||
|
///
|
||||||
|
/// Returns the same string that would be serialized to YAML.
|
||||||
|
pub fn as_str(&self) -> &'static str {
|
||||||
|
match self {
|
||||||
|
ProfileType::Invoice => "invoice",
|
||||||
|
ProfileType::Receipt => "receipt",
|
||||||
|
ProfileType::Contract => "contract",
|
||||||
|
ProfileType::ScientificPaper => "scientific_paper",
|
||||||
|
ProfileType::SlideDeck => "slide_deck",
|
||||||
|
ProfileType::Form => "form",
|
||||||
|
ProfileType::BankStatement => "bank_statement",
|
||||||
|
ProfileType::LegalFiling => "legal_filing",
|
||||||
|
ProfileType::BookChapter => "book_chapter",
|
||||||
|
ProfileType::Unknown => "unknown",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Matching predicate for document type classification.
|
||||||
|
///
|
||||||
|
/// Each predicate represents a signal that the classifier evaluates against
|
||||||
|
/// the extracted document. Predicates have weights that contribute to the
|
||||||
|
/// overall score for a profile.
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
#[serde(tag = "kind", rename_all = "snake_case")]
|
||||||
|
pub enum MatchPredicate {
|
||||||
|
/// Text contains a pattern (substring match).
|
||||||
|
///
|
||||||
|
/// Searches for the pattern in the extracted text, counting occurrences.
|
||||||
|
/// The predicate fires if min_hits or more occurrences are found.
|
||||||
|
TextContains {
|
||||||
|
/// Pattern string to search for.
|
||||||
|
pattern: String,
|
||||||
|
|
||||||
|
/// Weight contribution to profile score when this predicate fires.
|
||||||
|
weight: f32,
|
||||||
|
|
||||||
|
/// Whether the search is case-sensitive.
|
||||||
|
#[serde(default)]
|
||||||
|
case_sensitive: bool,
|
||||||
|
|
||||||
|
/// Minimum number of hits required for this predicate to fire.
|
||||||
|
#[serde(default)]
|
||||||
|
min_hits: u32,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Text matches a regular expression.
|
||||||
|
///
|
||||||
|
/// The regex pattern is compiled lazily during evaluation (Phase 5.6.2).
|
||||||
|
/// The predicate fires if min_hits or more matches are found.
|
||||||
|
TextMatchesRegex {
|
||||||
|
/// Regular expression pattern string.
|
||||||
|
pattern: String,
|
||||||
|
|
||||||
|
/// Weight contribution to profile score when this predicate fires.
|
||||||
|
weight: f32,
|
||||||
|
|
||||||
|
/// Minimum number of matches required for this predicate to fire.
|
||||||
|
#[serde(default)]
|
||||||
|
min_hits: u32,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Document contains tables.
|
||||||
|
///
|
||||||
|
/// Fires if the document has at least min_count tables.
|
||||||
|
StructuralHasTable {
|
||||||
|
/// Weight contribution to profile score when this predicate fires.
|
||||||
|
weight: f32,
|
||||||
|
|
||||||
|
/// Minimum number of tables required.
|
||||||
|
#[serde(default)]
|
||||||
|
min_count: u32,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Document contains signature fields.
|
||||||
|
///
|
||||||
|
/// Fires if any AcroForm signature fields are detected.
|
||||||
|
StructuralHasSignatureField {
|
||||||
|
/// Weight contribution to profile score when this predicate fires.
|
||||||
|
weight: f32,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Document contains form fields.
|
||||||
|
///
|
||||||
|
/// Fires if any AcroForm fields (text, checkbox, etc.) are detected.
|
||||||
|
StructuralHasFormField {
|
||||||
|
/// Weight contribution to profile score when this predicate fires.
|
||||||
|
weight: f32,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Document contains mathematical operators.
|
||||||
|
///
|
||||||
|
/// Fires if mathematical symbols (integral, summation, fraction, etc.)
|
||||||
|
/// are detected in the text content.
|
||||||
|
StructuralHasMathOperators {
|
||||||
|
/// Weight contribution to profile score when this predicate fires.
|
||||||
|
weight: f32,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Document contains bullet lists.
|
||||||
|
///
|
||||||
|
/// Fires if bullet list structures are detected in the layout.
|
||||||
|
StructuralHasBulletLists {
|
||||||
|
/// Weight contribution to profile score when this predicate fires.
|
||||||
|
weight: f32,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Page count is within a range.
|
||||||
|
///
|
||||||
|
/// Fires if the document's page count is between min and max (inclusive).
|
||||||
|
PageCountInRange {
|
||||||
|
/// Minimum page count (inclusive).
|
||||||
|
min: u32,
|
||||||
|
|
||||||
|
/// Maximum page count (inclusive).
|
||||||
|
max: u32,
|
||||||
|
|
||||||
|
/// Weight contribution to profile score when this predicate fires.
|
||||||
|
weight: f32,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Font diversity is within a range.
|
||||||
|
///
|
||||||
|
/// Font diversity is the count of distinct font names used in the document.
|
||||||
|
/// Fires if the count is between min and max (inclusive).
|
||||||
|
FontDiversityInRange {
|
||||||
|
/// Minimum distinct font count (inclusive).
|
||||||
|
min: u32,
|
||||||
|
|
||||||
|
/// Maximum distinct font count (inclusive).
|
||||||
|
max: u32,
|
||||||
|
|
||||||
|
/// Weight contribution to profile score when this predicate fires.
|
||||||
|
weight: f32,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Heading depth is at least a certain level.
|
||||||
|
///
|
||||||
|
/// Heading depth refers to the nesting level of section headers (H1, H2, etc.).
|
||||||
|
/// Fires if the document has headings at least this deep.
|
||||||
|
HeadingDepthAtLeast {
|
||||||
|
/// Minimum heading depth (1 = H1, 2 = H2, etc.).
|
||||||
|
depth: u32,
|
||||||
|
|
||||||
|
/// Weight contribution to profile score when this predicate fires.
|
||||||
|
weight: f32,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Glyph density is within a range.
|
||||||
|
///
|
||||||
|
/// Glyph density is the ratio of extracted characters to expected characters
|
||||||
|
/// based on font metrics. Low density can indicate scanned or broken documents.
|
||||||
|
/// Fires if the density is between min and max (inclusive).
|
||||||
|
GlyphDensityInRange {
|
||||||
|
/// Minimum density (inclusive).
|
||||||
|
min: f32,
|
||||||
|
|
||||||
|
/// Maximum density (inclusive).
|
||||||
|
max: f32,
|
||||||
|
|
||||||
|
/// Weight contribution to profile score when this predicate fires.
|
||||||
|
weight: f32,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// Document has footer page numbers.
|
||||||
|
///
|
||||||
|
/// Fires if page numbers are detected in footer positions.
|
||||||
|
HasFooterPageNumbers {
|
||||||
|
/// Weight contribution to profile score when this predicate fires.
|
||||||
|
weight: f32,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
fn test_profile_type_serialization() {
    // Each sampled variant must serialize to its exact snake_case YAML scalar.
    let expectations = [
        (ProfileType::Invoice, "invoice"),
        (ProfileType::ScientificPaper, "scientific_paper"),
        (ProfileType::SlideDeck, "slide_deck"),
        (ProfileType::Unknown, "unknown"),
    ];

    for (variant, expected) in expectations.iter() {
        let rendered = serde_yaml::to_string(variant).unwrap();
        assert_eq!(rendered.trim(), *expected);
    }
}
|
||||||
|
|
||||||
|
#[test]
fn test_profile_type_deserialization() {
    // snake_case scalars must parse back into the matching variant.
    let cases = [
        ("invoice", ProfileType::Invoice),
        ("scientific_paper", ProfileType::ScientificPaper),
        ("slide_deck", ProfileType::SlideDeck),
    ];

    for (yaml, expected) in cases.iter() {
        let parsed: ProfileType = serde_yaml::from_str(yaml).unwrap();
        assert_eq!(parsed, *expected);
    }
}
|
||||||
|
|
||||||
|
#[test]
fn test_profile_type_as_str() {
    // Covers every variant so a renamed string is caught immediately.
    let table = [
        (ProfileType::Invoice, "invoice"),
        (ProfileType::Receipt, "receipt"),
        (ProfileType::Contract, "contract"),
        (ProfileType::ScientificPaper, "scientific_paper"),
        (ProfileType::SlideDeck, "slide_deck"),
        (ProfileType::Form, "form"),
        (ProfileType::BankStatement, "bank_statement"),
        (ProfileType::LegalFiling, "legal_filing"),
        (ProfileType::BookChapter, "book_chapter"),
        (ProfileType::Unknown, "unknown"),
    ];

    for (variant, expected) in table.iter() {
        assert_eq!(variant.as_str(), *expected);
    }
}
|
||||||
|
|
||||||
|
#[test]
fn test_match_predicate_text_contains_serialization() {
    let yaml = serde_yaml::to_string(&MatchPredicate::TextContains {
        pattern: "INVOICE".to_string(),
        weight: 0.8,
        case_sensitive: true,
        min_hits: 1,
    })
    .unwrap();

    // The tag and every field must appear in the emitted YAML.
    let fragments = [
        "kind: text_contains",
        "pattern: INVOICE",
        "weight: 0.8",
        "case_sensitive: true",
        "min_hits: 1",
    ];
    for fragment in fragments.iter() {
        assert!(yaml.contains(fragment));
    }
}
|
||||||
|
|
||||||
|
#[test]
fn test_match_predicate_text_matches_regex_serialization() {
    let yaml = serde_yaml::to_string(&MatchPredicate::TextMatchesRegex {
        pattern: r"\d{4}-\d{2}-\d{2}".to_string(),
        weight: 0.5,
        min_hits: 3,
    })
    .unwrap();

    // The tag and every field must appear in the emitted YAML.
    let fragments = [
        "kind: text_matches_regex",
        r"pattern: \d{4}-\d{2}-\d{2}",
        "weight: 0.5",
        "min_hits: 3",
    ];
    for fragment in fragments.iter() {
        assert!(yaml.contains(fragment));
    }
}
|
||||||
|
|
||||||
|
#[test]
fn test_match_predicate_structural_serialization() {
    let yaml = serde_yaml::to_string(&MatchPredicate::StructuralHasTable {
        weight: 0.6,
        min_count: 2,
    })
    .unwrap();

    // The tag and both fields must appear in the emitted YAML.
    let fragments = ["kind: structural_has_table", "weight: 0.6", "min_count: 2"];
    for fragment in fragments.iter() {
        assert!(yaml.contains(fragment));
    }
}
|
||||||
|
|
||||||
|
#[test]
fn test_match_predicate_page_count_range_serialization() {
    let yaml = serde_yaml::to_string(&MatchPredicate::PageCountInRange {
        min: 1,
        max: 5,
        weight: 0.3,
    })
    .unwrap();

    // The tag and all three fields must appear in the emitted YAML.
    let fragments = ["kind: page_count_in_range", "min: 1", "max: 5", "weight: 0.3"];
    for fragment in fragments.iter() {
        assert!(yaml.contains(fragment));
    }
}
|
||||||
|
|
||||||
|
#[test]
fn test_profile_roundtrip() {
    // Serializing a profile to YAML and parsing it back must preserve every
    // top-level field and the contents of each predicate.
    let profile = Profile {
        name: "Test Invoice".to_string(),
        profile_type: ProfileType::Invoice,
        predicates: vec![
            MatchPredicate::TextContains {
                pattern: "INVOICE".to_string(),
                weight: 0.8,
                case_sensitive: true,
                min_hits: 1,
            },
            MatchPredicate::PageCountInRange {
                min: 1,
                max: 3,
                weight: 0.2,
            },
        ],
        threshold: 0.6,
    };

    // Serialize to YAML, then deserialize back.
    let yaml = serde_yaml::to_string(&profile).unwrap();
    let parsed: Profile = serde_yaml::from_str(&yaml).unwrap();

    // Top-level fields survive the roundtrip.
    assert_eq!(parsed.name, profile.name);
    assert_eq!(parsed.profile_type, profile.profile_type);
    assert_eq!(parsed.predicates.len(), profile.predicates.len());
    assert_eq!(parsed.threshold, profile.threshold);

    // First predicate: variant and all fields are preserved.
    match &parsed.predicates[0] {
        MatchPredicate::TextContains {
            pattern,
            weight,
            case_sensitive,
            min_hits,
        } => {
            assert_eq!(pattern, "INVOICE");
            assert_eq!(*weight, 0.8);
            assert!(*case_sensitive);
            assert_eq!(*min_hits, 1);
        }
        other => panic!("Wrong predicate type: {:?}", other),
    }

    // Second predicate: previously unchecked, so order/content regressions
    // in the list would have gone unnoticed.
    match &parsed.predicates[1] {
        MatchPredicate::PageCountInRange { min, max, weight } => {
            assert_eq!(*min, 1);
            assert_eq!(*max, 3);
            assert_eq!(*weight, 0.2);
        }
        other => panic!("Wrong predicate type: {:?}", other),
    }
}
|
||||||
|
|
||||||
|
#[test]
fn test_profile_default_threshold() {
    // `threshold` is deliberately absent so the serde default is exercised.
    let yaml = r#"
name: "Test"
type: invoice
predicates: []
"#;

    let threshold = serde_yaml::from_str::<Profile>(yaml).unwrap().threshold;
    assert_eq!(threshold, 0.6);
}
|
||||||
|
|
||||||
|
#[test]
fn test_profile_custom_threshold() {
    // An explicit `threshold` must override the serde default.
    let yaml = r#"
name: "Test"
type: invoice
predicates: []
threshold: 0.8
"#;

    let threshold = serde_yaml::from_str::<Profile>(yaml).unwrap().threshold;
    assert_eq!(threshold, 0.8);
}
|
||||||
|
|
||||||
|
#[test]
fn test_load_profile_from_yaml_with_all_predicate_kinds() {
    // Deserializes a YAML profile that contains exactly one predicate of
    // every MatchPredicate kind.
    let yaml = r#"
name: "Comprehensive Test Profile"
type: scientific_paper
threshold: 0.7
predicates:
  - kind: text_contains
    pattern: "Abstract"
    weight: 0.5
    case_sensitive: false
    min_hits: 1

  - kind: text_matches_regex
    pattern: "\\b\\d{4}\\b"
    weight: 0.3
    min_hits: 5

  - kind: structural_has_table
    weight: 0.4
    min_count: 2

  - kind: structural_has_signature_field
    weight: 0.1

  - kind: structural_has_form_field
    weight: 0.1

  - kind: structural_has_math_operators
    weight: 0.6

  - kind: structural_has_bullet_lists
    weight: 0.3

  - kind: page_count_in_range
    min: 5
    max: 20
    weight: 0.2

  - kind: font_diversity_in_range
    min: 1
    max: 5
    weight: 0.2

  - kind: heading_depth_at_least
    depth: 3
    weight: 0.4

  - kind: glyph_density_in_range
    min: 0.7
    max: 1.0
    weight: 0.3

  - kind: has_footer_page_numbers
    weight: 0.2
"#;

    let profile: Profile = serde_yaml::from_str(yaml).unwrap();

    assert_eq!(profile.name, "Comprehensive Test Profile");
    assert_eq!(profile.profile_type, ProfileType::ScientificPaper);
    assert_eq!(profile.threshold, 0.7);
    assert_eq!(profile.predicates.len(), 12);

    // Map each parsed predicate to its tag name. The match has no wildcard
    // arm, so adding a variant forces this test to be updated.
    let mut kinds: Vec<&'static str> = Vec::with_capacity(profile.predicates.len());
    for predicate in &profile.predicates {
        let label = match *predicate {
            MatchPredicate::TextContains { .. } => "text_contains",
            MatchPredicate::TextMatchesRegex { .. } => "text_matches_regex",
            MatchPredicate::StructuralHasTable { .. } => "structural_has_table",
            MatchPredicate::StructuralHasSignatureField { .. } => "structural_has_signature_field",
            MatchPredicate::StructuralHasFormField { .. } => "structural_has_form_field",
            MatchPredicate::StructuralHasMathOperators { .. } => "structural_has_math_operators",
            MatchPredicate::StructuralHasBulletLists { .. } => "structural_has_bullet_lists",
            MatchPredicate::PageCountInRange { .. } => "page_count_in_range",
            MatchPredicate::FontDiversityInRange { .. } => "font_diversity_in_range",
            MatchPredicate::HeadingDepthAtLeast { .. } => "heading_depth_at_least",
            MatchPredicate::GlyphDensityInRange { .. } => "glyph_density_in_range",
            MatchPredicate::HasFooterPageNumbers { .. } => "has_footer_page_numbers",
        };
        kinds.push(label);
    }

    // Every predicate kind must be present in the parsed profile.
    let expected_kinds = [
        "text_contains",
        "text_matches_regex",
        "structural_has_table",
        "structural_has_signature_field",
        "structural_has_form_field",
        "structural_has_math_operators",
        "structural_has_bullet_lists",
        "page_count_in_range",
        "font_diversity_in_range",
        "heading_depth_at_least",
        "glyph_density_in_range",
        "has_footer_page_numbers",
    ];
    for expected in expected_kinds.iter() {
        assert!(kinds.contains(expected));
    }
}
|
||||||
|
|
||||||
|
#[test]
fn test_match_predicate_exhaustive_match() {
    // Compile-time completeness check: the match below has no wildcard arm,
    // so it stops compiling as soon as a MatchPredicate variant is added or
    // renamed without updating it.
    fn kind_label(candidate: &MatchPredicate) -> &'static str {
        match *candidate {
            MatchPredicate::TextContains { .. } => "text_contains",
            MatchPredicate::TextMatchesRegex { .. } => "text_matches_regex",
            MatchPredicate::StructuralHasTable { .. } => "structural_has_table",
            MatchPredicate::StructuralHasSignatureField { .. } => "structural_has_signature_field",
            MatchPredicate::StructuralHasFormField { .. } => "structural_has_form_field",
            MatchPredicate::StructuralHasMathOperators { .. } => "structural_has_math_operators",
            MatchPredicate::StructuralHasBulletLists { .. } => "structural_has_bullet_lists",
            MatchPredicate::PageCountInRange { .. } => "page_count_in_range",
            MatchPredicate::FontDiversityInRange { .. } => "font_diversity_in_range",
            MatchPredicate::HeadingDepthAtLeast { .. } => "heading_depth_at_least",
            MatchPredicate::GlyphDensityInRange { .. } => "glyph_density_in_range",
            MatchPredicate::HasFooterPageNumbers { .. } => "has_footer_page_numbers",
        }
    }

    let sample = MatchPredicate::TextContains {
        pattern: "test".to_string(),
        weight: 0.5,
        case_sensitive: false,
        min_hits: 1,
    };
    assert_eq!(kind_label(&sample), "text_contains");
}
|
||||||
|
|
||||||
|
#[test]
fn test_compile_fails_for_invalid_variant() {
    // A misspelled variant in Rust code is rejected by the compiler, which a
    // unit test cannot observe. The runtime counterpart can be checked: a
    // YAML predicate whose `kind` tag names no MatchPredicate variant must be
    // rejected instead of being silently accepted.
    let unknown_kind = "kind: not_a_real_predicate\nweight: 0.5\n";
    assert!(serde_yaml::from_str::<MatchPredicate>(unknown_kind).is_err());

    // A valid tag with the same payload still parses, so the failure above
    // is attributable to the tag and not to the surrounding document.
    let known_kind = "kind: has_footer_page_numbers\nweight: 0.5\n";
    assert!(serde_yaml::from_str::<MatchPredicate>(known_kind).is_ok());
}
|
||||||
|
}
|
||||||
Loading…
Add table
Reference in a new issue