pdftract/crates/pdftract-cli/tests/test_form.rs
jedarden 9abc386cce feat(pdftract-3h9xo): implement threads JSON output + schema integration
Phase 7.7.3: Add threads field to ExtractionResult with ThreadJson schema integration.

Changes:
- Added ThreadJson and BeadJson structs to schema/mod.rs
- Added thread_to_json() function to threads/mod.rs
- Added build_page_ref_to_index() helper to parser/pages.rs
- Added threads field to ExtractionResult in extract.rs
- Implemented Phase 7.7 extraction logic with discover_threads/walk_beads
- Added threads_to_markdown() and collapse_page_ranges() to markdown.rs
- Updated JSON schema with ThreadJson and BeadJson definitions
- Added thread_to_py() and bead_to_py() conversions in pdftract-py
- Exported ThreadJson, BeadJson from lib.rs

All 32 threads module tests pass. All 35 markdown tests pass.

Verification: notes/pdftract-3h9xo.md

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 13:40:15 -04:00

347 lines
11 KiB
Rust

//! Form profile regression tests
//!
//! This module tests the form document profile against fixtures
//! at `tests/fixtures/profiles/form/`.
//!
//! The form profile is DEGENERATE - it has NO field extractors.
//! Per plan line 3045: "form has no field extractor; the form_fields
//! output from Phase 7.4 is surfaced separately in extraction output".
//!
//! Acceptance criteria (from bead pdftract-596dz):
//! - profiles/builtin/form.yaml validates
//! - 5+ fixtures with expected outputs
//! - metadata.profile_fields is empty (degenerate profile)
//! - output.form_fields is populated (when Phase 7.4 is integrated)
use std::fs;
use std::path::{Path, PathBuf};
/// Resolve the workspace root from this crate's manifest directory.
///
/// Reads `CARGO_MANIFEST_DIR` at runtime and returns its grandparent, since
/// this crate sits two levels below the workspace root (`crates/pdftract-cli`).
///
/// # Panics
///
/// Panics if `CARGO_MANIFEST_DIR` cannot be read or the path has no
/// grandparent.
fn workspace_root() -> PathBuf {
    let crate_dir = PathBuf::from(std::env::var("CARGO_MANIFEST_DIR").unwrap());
    // `ancestors()` yields the path itself first, so index 2 is the grandparent.
    crate_dir.ancestors().nth(2).unwrap().to_path_buf()
}
/// Directory that holds the form profile fixtures, relative to the workspace root.
fn fixture_dir() -> PathBuf {
    let mut dir = workspace_root();
    dir.push("tests/fixtures/profiles/form");
    dir
}
/// Location of the built-in form profile YAML, relative to the workspace root.
fn profile_path() -> PathBuf {
    let mut yaml = workspace_root();
    yaml.push("profiles/builtin/form/profile.yaml");
    yaml
}
/// Base names of the form fixtures.
///
/// Each name is combined with `EXPECTED_SUFFIX` to locate the fixture's
/// expected-output file inside the fixture directory.
const FORM_FIXTURES: &[&str] = &[
    "irs_1040",
    "w2",
    "i9",
    "expense_report",
    "intake_form",
];
/// Suffix appended to a fixture name to form its expected-output file name
/// (e.g. `w2` becomes `w2-expected.json`).
const EXPECTED_SUFFIX: &str = "-expected.json";
/// Checks that the form profile YAML is on disk, is non-empty, and mentions
/// every required top-level key.
///
/// These are plain substring checks on the raw file text; the YAML is not
/// parsed here.
#[test]
fn test_form_profile_exists() {
    let yaml_path = profile_path();
    assert!(
        yaml_path.exists(),
        "Form profile not found at {}",
        yaml_path.display()
    );

    let text = fs::read_to_string(yaml_path).expect("Failed to read form profile");
    assert!(!text.trim().is_empty(), "Form profile is empty");

    // (needle, failure message) pairs, checked in order so the first missing
    // item is the one that gets reported. The last two entries together stand
    // in for "type: form".
    let required: [(&str, &str); 7] = [
        ("name:", "Profile missing 'name' key"),
        ("description:", "Profile missing 'description' key"),
        ("priority:", "Profile missing 'priority' key"),
        ("threshold:", "Profile missing 'threshold' key"),
        ("predicates:", "Profile missing 'predicates' key"),
        ("type:", "Profile missing 'type' key"),
        ("form", "Profile type should be 'form'"),
    ];
    for &(needle, message) in required.iter() {
        assert!(text.contains(needle), "{}", message);
    }
}
/// Verify all fixture directories exist with expected outputs
#[test]
fn test_form_fixture_structure() {
let fixture_dir = fixture_dir();
assert!(
fixture_dir.exists(),
"Form fixture directory not found at {}",
fixture_dir.display()
);
// Verify README.md exists
let readme_path = fixture_dir.join("README.md");
assert!(readme_path.exists(), "Missing README.md in form fixtures");
// Verify PROVENANCE.md exists
let provenance_path = fixture_dir.join("PROVENANCE.md");
assert!(
provenance_path.exists(),
"Missing PROVENANCE.md in form fixtures"
);
// Verify all expected output files exist
for fixture_name in FORM_FIXTURES {
let expected_path = fixture_dir.join(format!("{}{}", fixture_name, EXPECTED_SUFFIX));
assert!(
expected_path.exists(),
"Missing expected output for fixture '{}': {}",
fixture_name,
expected_path.display()
);
// Verify expected output is valid JSON
let content = fs::read_to_string(&expected_path).expect("Failed to read expected output");
let _: serde_json::Value = serde_json::from_str(&content).expect(&format!(
"Expected output is not valid JSON: {}",
expected_path.display()
));
// Verify expected output has required structure
let json: serde_json::Value = serde_json::from_str(&content).unwrap();
// Check metadata.document_type is "form"
let document_type = json.pointer("/metadata/document_type").expect(&format!(
"Missing /metadata/document_type in {}",
expected_path.display()
));
assert_eq!(
document_type.as_str(),
Some("form"),
"Document type should be 'form' in {}",
expected_path.display()
);
// Check metadata.profile_name is "form"
let profile_name = json.pointer("/metadata/profile_name").expect(&format!(
"Missing /metadata/profile_name in {}",
expected_path.display()
));
assert_eq!(
profile_name.as_str(),
Some("form"),
"Profile name should be 'form' in {}",
expected_path.display()
);
// CRITICAL: Check metadata.profile_fields is empty (degenerate profile)
let profile_fields = json.pointer("/metadata/profile_fields").expect(&format!(
"Missing /metadata/profile_fields in {}",
expected_path.display()
));
let obj = profile_fields
.as_object()
.expect("profile_fields is not an object");
assert!(
obj.is_empty(),
"Form profile should have empty profile_fields (degenerate profile) in {}",
expected_path.display()
);
// Verify document_type_confidence is present and valid
let confidence = json
.pointer("/metadata/document_type_confidence")
.expect(&format!(
"Missing /metadata/document_type_confidence in {}",
expected_path.display()
));
assert!(
confidence.as_f64().is_some(),
"document_type_confidence should be a number in {}",
expected_path.display()
);
let conf_value = confidence.as_f64().unwrap();
assert!(
conf_value >= 0.0 && conf_value <= 1.0,
"document_type_confidence should be between 0 and 1 in {}",
expected_path.display()
);
}
}
/// Verify the parsed form profile has the structure the Phase 7.10
/// specification calls for.
///
/// Parses the profile as YAML and checks `name`, `description`, `threshold`,
/// `type`, and that `predicates` is a non-empty sequence containing the
/// required predicate kinds.
#[test]
fn test_form_profile_schema() {
    let yaml_path = profile_path();
    let raw = fs::read_to_string(yaml_path).expect("Failed to read form profile");
    let profile: serde_yaml::Value =
        serde_yaml::from_str(&raw).expect("Form profile is not valid YAML");

    // Top-level scalars.
    assert_eq!(
        profile["name"].as_str(),
        Some("Form Document"),
        "Profile name should be 'Form Document'"
    );
    assert!(
        profile["description"].is_string(),
        "Profile should have a description"
    );
    assert!(
        profile["threshold"].is_number(),
        "Profile should have a numeric threshold"
    );
    assert_eq!(
        profile["type"].as_str(),
        Some("form"),
        "Profile type should be 'form'"
    );

    // Predicates must be a non-empty sequence.
    let predicates = profile["predicates"]
        .as_sequence()
        .expect("Profile should have predicates array");
    assert!(
        !predicates.is_empty(),
        "Profile should have at least one predicate"
    );

    // Expected form-specific predicates (only the kinds are checked here;
    // weights and arguments are not inspected):
    // - structural_has_form_field (weight 0.4)
    // - text_contains "form" (weight 0.2)
    // - page_count_in_range 1-10 (weight 0.15)
    // - text_contains "application" (weight 0.15)
    // - text_contains "please complete" (weight 0.1)
    let kinds: Vec<&str> = predicates
        .iter()
        .filter_map(|predicate| predicate.get("kind")?.as_str())
        .collect();
    let required_kinds = [
        "structural_has_form_field",
        "text_contains",
        "page_count_in_range",
    ];
    for &kind in required_kinds.iter() {
        assert!(
            kinds.contains(&kind),
            "Form profile should have {} predicate",
            kind
        );
    }
}
/// Verify form profile degenerate behavior (no field extractors).
///
/// The classification profile (`profile.yaml`) carries no fields; it is the
/// extraction profile (`classification/form.yaml`) that must declare
/// `profile_fields: {}`. This test therefore only reads the extraction
/// profile and checks that:
/// - `profile_fields` is a mapping and is empty,
/// - the raw text contains `form_fields_integration: true`,
/// - the raw text contains `reading_order: line_dominant`.
///
/// Existence of `profile.yaml` itself is covered by `test_form_profile_exists`.
#[test]
fn test_form_profile_is_degenerate() {
    let extraction_profile_path =
        workspace_root().join("profiles/builtin/classification/form.yaml");
    assert!(
        extraction_profile_path.exists(),
        "Extraction profile not found at {}",
        extraction_profile_path.display()
    );

    let extraction_content =
        fs::read_to_string(&extraction_profile_path).expect("Failed to read extraction profile");
    let yaml_value: serde_yaml::Value =
        serde_yaml::from_str(&extraction_content).expect("Extraction profile is not valid YAML");

    // A missing key or a non-mapping value both fail here with the same message.
    let profile_fields = yaml_value["profile_fields"]
        .as_mapping()
        .expect("profile_fields should be a mapping/object");
    assert!(
        profile_fields.is_empty(),
        "Form profile should have empty profile_fields (degenerate profile)"
    );

    // These two settings are checked against the raw text, not the parsed tree.
    assert!(
        extraction_content.contains("form_fields_integration: true"),
        "Form profile should have form_fields_integration: true"
    );
    assert!(
        extraction_content.contains("reading_order: line_dominant"),
        "Form profile should have reading_order: line_dominant"
    );
}
/// Verify the fixture README documents the degenerate-profile behavior.
///
/// The README must mention "degenerate", show an empty `profile_fields`
/// mapping, and state that there are "NO field extractors".
#[test]
fn test_form_readme_mentions_degenerate() {
    let readme_path = fixture_dir().join("README.md");
    let content = fs::read_to_string(&readme_path).expect("Failed to read README.md");

    assert!(
        content.contains("degenerate"),
        "README should mention that the form profile is degenerate"
    );

    // `str::contains` takes a plain pattern, not a format string, so `{{}}`
    // is NOT an escape for `{}` here. Match the intended text; the doubled
    // form is still accepted so a README that satisfied the old check keeps
    // passing.
    let shows_empty_profile_fields =
        content.contains("profile_fields: {}") || content.contains("profile_fields: {{}}");
    assert!(
        shows_empty_profile_fields,
        "README should show empty profile_fields"
    );

    assert!(
        content.contains("NO field extractors"),
        "README should explain that there are no field extractors"
    );
}