pdftract/crates/pdftract-core/tests/schema_validate_fixtures.rs
jedarden d0f52751ce fix(pdftract-39gey): fix indent trigger to not split drop-cap paragraphs
The indent trigger was using .abs() which fired on both increased indent
(non-indented → indented) AND decreased indent (indented → non-indented).
This caused drop-cap style paragraphs (indented first line, flush-left
continuation) to incorrectly split into two blocks.

Per plan Phase 4.4 heuristic #2, indent change should only trigger when the
current line is MORE indented (to the right, larger x0) than the block
average - i.e., a new paragraph starting after non-indented text. It should
NOT trigger for decreased indent (first line indented, rest flush-left).

Fix: Remove .abs() and only check if line_x0 - block_avg_x0 > threshold.

Tests:
- test_indented_first_line_new_block: PASS (non-indented → indented splits)
- test_indented_first_line_of_paragraph_not_split: PASS (drop cap stays together)
- All 179 line module tests: PASS
2026-06-07 13:43:19 -04:00

235 lines
8.2 KiB
Rust

//! JSON Schema validation integration tests.
//!
//! These tests verify that pdftract extraction outputs conform to the
//! published JSON Schema at docs/schema/v1.0/pdftract.schema.json.
//!
//! Per bead pdftract-2rc4 (Phase 6.1.4), this is a regression guard:
//! any code change that emits a field not in the schema, or omits a
//! required one, fails CI.
//!
//! Test workflow:
//! 1. Walk tests/fixtures/json_schema/ for *.pdf inputs
//! 2. Extract each PDF to JSON using pdftract_core
//! 3. Validate the JSON against the bundled schema
//! 4. Fail on any validation errors
//!
//! Fixtures with expected JSON files (.expected.json) are verified for
//! exact match. Fixtures without expected files generate them for
//! manual review on first run.
use std::fs;
use std::path::{PathBuf};
use pdftract_core::extract::extract_pdf;
use pdftract_core::options::ExtractionOptions;
/// Fixture directory for JSON schema validation tests.
///
/// This is a relative path, so it is resolved against the current working
/// directory of the test process.
// NOTE(review): presumably relies on Cargo running integration tests with the
// package root as the working directory — confirm if tests are run another way.
const FIXTURES_DIR: &str = "tests/fixtures/json_schema";
/// A single test fixture for JSON schema validation.
///
/// Pairs an input PDF with an optional expected-output JSON file.
struct Fixture {
/// Fixture name; used in log output and panic messages.
name: String,
/// Path of the PDF that is passed to the extractor.
pdf_path: PathBuf,
/// Path of the expected-output JSON to compare against, or `None` when no
/// expected file is available for this fixture.
expected_path: Option<PathBuf>,
}
impl Fixture {
    /// Load all fixtures from the fixtures directory.
    ///
    /// Every directory entry whose extension is exactly `pdf` becomes a
    /// fixture. The fixture name is the file stem (falling back to
    /// `"unknown"` when the stem is missing or not valid UTF-8), and
    /// `expected_path` is set only when a sibling `<stem>.expected.json`
    /// exists at load time. The result is sorted by name.
    ///
    /// # Panics
    ///
    /// Panics if the fixtures directory cannot be read or if iterating one of
    /// its entries fails.
    fn load_all() -> Vec<Self> {
        let fixtures_dir = PathBuf::from(FIXTURES_DIR);
        let entries = fs::read_dir(&fixtures_dir)
            .unwrap_or_else(|e| panic!("Failed to read fixtures directory '{}': {}", FIXTURES_DIR, e));

        let mut fixtures: Vec<Self> = entries
            .map(|entry| {
                // Report which directory was being walked instead of a bare unwrap panic.
                entry
                    .unwrap_or_else(|e| {
                        panic!("Failed to read an entry of fixtures directory '{}': {}", FIXTURES_DIR, e)
                    })
                    .path()
            })
            // Only process PDF files
            .filter(|path| path.extension().and_then(|s| s.to_str()) == Some("pdf"))
            .map(|pdf_path| {
                let name = pdf_path
                    .file_stem()
                    .and_then(|s| s.to_str())
                    .unwrap_or("unknown")
                    .to_string();
                // `foo.pdf` -> `foo.expected.json`
                let expected_path = pdf_path.with_extension("expected.json");
                Fixture {
                    name,
                    expected_path: expected_path.exists().then_some(expected_path),
                    pdf_path,
                }
            })
            .collect();

        // Sort for deterministic test order (read_dir order is platform-dependent)
        fixtures.sort_by(|a, b| a.name.cmp(&b.name));
        fixtures
    }
}
/// Build a validator from the JSON Schema bundled into this test binary.
///
/// The schema text is embedded at compile time via `include_str!`, whose path
/// is resolved relative to this source file.
///
/// # Panics
///
/// Panics if the embedded text is not valid JSON, or if it cannot be compiled
/// into a JSON Schema validator.
fn load_schema() -> jsonschema::Validator {
    const SCHEMA_SOURCE: &str = include_str!("../../../docs/schema/v1.0/pdftract.schema.json");

    let parsed = serde_json::from_str::<serde_json::Value>(SCHEMA_SOURCE)
        .expect("Bundled schema is not valid JSON");

    jsonschema::validator_for(&parsed).expect("Bundled schema is not valid JSON Schema")
}
/// Validate a JSON value against the schema.
///
/// Returns `Ok(())` if validation passes. Otherwise returns `Err` with one
/// message per violation, each formatted as `"<instance path> <error>"`.
fn validate_json(schema: &jsonschema::Validator, value: &serde_json::Value) -> Result<(), Vec<String>> {
    // One `iter_errors` pass both detects failure and gathers every violation,
    // so the schema is not evaluated a second time on the failure path.
    let error_details: Vec<String> = schema
        .iter_errors(value)
        .map(|e| {
            let path = e.instance_path.to_string();
            format!("{} {}", path, e)
        })
        .collect();

    if error_details.is_empty() {
        Ok(())
    } else {
        Err(error_details)
    }
}
/// Test a single fixture for schema compliance.
fn test_fixture(fixture: &Fixture) {
println!("Testing fixture: {}", fixture.name);
// Load the schema
let schema = load_schema();
// Extract PDF to JSON
let extraction_result = extract_pdf(&fixture.pdf_path, &ExtractionOptions::default())
.unwrap_or_else(|e| panic!("Failed to extract fixture '{}': {}", fixture.name, e));
// Convert to JSON using the same serialization as the CLI
let json_value = pdftract_core::extract::result_to_json(&extraction_result);
// Validate against schema
if let Err(validation_errors) = validate_json(&schema, &json_value) {
panic!(
"Fixture '{}' failed schema validation with {} error(s):\n{}",
fixture.name,
validation_errors.len(),
validation_errors.join("\n")
);
}
// If expected JSON exists, verify exact match (for regression detection)
if let Some(ref expected_path) = fixture.expected_path {
let expected_json = fs::read_to_string(expected_path)
.unwrap_or_else(|e| panic!("Failed to read expected JSON for '{}': {}", fixture.name, e));
let expected_value: serde_json::Value = serde_json::from_str(&expected_json)
.unwrap_or_else(|e| panic!("Failed to parse expected JSON for '{}': {}", fixture.name, e));
if json_value != expected_value {
// For helpful debugging, show a diff-like comparison
let json_str = serde_json::to_string_pretty(&json_value).unwrap();
eprintln!("=== JSON MISMATCH ===");
eprintln!("Fixture: {}", fixture.name);
eprintln!("Expected: {}", expected_path.display());
eprintln!("\nActual output:\n{}", json_str);
eprintln!("====================");
// Write actual output to a .actual.json file for comparison
let actual_path = expected_path.with_extension("actual.json");
fs::write(&actual_path, json_str)
.unwrap_or_else(|e| eprintln!("Warning: Failed to write actual JSON: {}", e));
panic!("Fixture '{}' output does not match expected JSON", fixture.name);
}
} else {
// No expected file exists - generate it for manual review
let expected_path = fixture.pdf_path.with_extension("expected.json");
let json_str = serde_json::to_string_pretty(&json_value).unwrap();
println!("No expected.json found - creating it:");
println!(" File: {}", expected_path.display());
fs::write(&expected_path, json_str)
.unwrap_or_else(|e| eprintln!("Warning: Failed to write expected.json: {}", e));
}
}
// Test functions for each fixture

/// Runs the schema-compliance check over every fixture discovered in the
/// fixtures directory, failing if the directory yields no fixtures at all.
#[test]
fn test_all_fixtures_schema_compliance() {
    let discovered = Fixture::load_all();
    assert!(!discovered.is_empty(), "No fixtures found in '{}'", FIXTURES_DIR);

    discovered.iter().for_each(test_fixture);
}
// Individual test functions for common fixtures (useful for targeted runs)

/// Targeted run of the `simple_invoice` fixture.
///
/// Does nothing when the PDF is not present. The expected JSON is optional:
/// it is only passed on when the file exists, matching `Fixture::load_all`,
/// so a missing expected file is generated rather than causing a read panic.
#[test]
fn test_simple_invoice() {
    let pdf_path = PathBuf::from(format!("{}/simple_invoice.pdf", FIXTURES_DIR));
    if !pdf_path.exists() {
        return;
    }

    // `simple_invoice.pdf` -> `simple_invoice.expected.json`
    let expected_path = pdf_path.with_extension("expected.json");
    let fixture = Fixture {
        name: "simple_invoice".to_string(),
        expected_path: expected_path.exists().then_some(expected_path),
        pdf_path,
    };
    test_fixture(&fixture);
}
/// Targeted run of the `sample` fixture.
///
/// Does nothing when the PDF is not present. The expected JSON is optional:
/// it is only passed on when the file exists, matching `Fixture::load_all`,
/// so a missing expected file is generated rather than causing a read panic.
#[test]
fn test_sample() {
    let pdf_path = PathBuf::from(format!("{}/sample.pdf", FIXTURES_DIR));
    if !pdf_path.exists() {
        return;
    }

    // `sample.pdf` -> `sample.expected.json`
    let expected_path = pdf_path.with_extension("expected.json");
    let fixture = Fixture {
        name: "sample".to_string(),
        expected_path: expected_path.exists().then_some(expected_path),
        pdf_path,
    };
    test_fixture(&fixture);
}
/// Targeted run of the `EC-04-rc4-encrypted` fixture.
///
/// Does nothing when the PDF is not present. The expected JSON is optional:
/// it is only passed on when the file exists, matching `Fixture::load_all`,
/// so a missing expected file is generated rather than causing a read panic.
#[test]
fn test_encrypted_rc4() {
    let pdf_path = PathBuf::from(format!("{}/EC-04-rc4-encrypted.pdf", FIXTURES_DIR));
    if !pdf_path.exists() {
        return;
    }

    // `EC-04-rc4-encrypted.pdf` -> `EC-04-rc4-encrypted.expected.json`
    let expected_path = pdf_path.with_extension("expected.json");
    let fixture = Fixture {
        name: "EC-04-rc4-encrypted".to_string(),
        expected_path: expected_path.exists().then_some(expected_path),
        pdf_path,
    };
    test_fixture(&fixture);
}
/// Targeted run of the `EC-05-aes128-encrypted` fixture.
///
/// Does nothing when the PDF is not present. The expected JSON is optional:
/// it is only passed on when the file exists, matching `Fixture::load_all`,
/// so a missing expected file is generated rather than causing a read panic.
#[test]
fn test_encrypted_aes128() {
    let pdf_path = PathBuf::from(format!("{}/EC-05-aes128-encrypted.pdf", FIXTURES_DIR));
    if !pdf_path.exists() {
        return;
    }

    // `EC-05-aes128-encrypted.pdf` -> `EC-05-aes128-encrypted.expected.json`
    let expected_path = pdf_path.with_extension("expected.json");
    let fixture = Fixture {
        name: "EC-05-aes128-encrypted".to_string(),
        expected_path: expected_path.exists().then_some(expected_path),
        pdf_path,
    };
    test_fixture(&fixture);
}
/// Targeted run of the `valid-minimal` fixture.
///
/// Does nothing when the PDF is not present. The expected JSON is optional:
/// it is only passed on when the file exists, matching `Fixture::load_all`,
/// so a missing expected file is generated rather than causing a read panic.
#[test]
fn test_valid_minimal() {
    let pdf_path = PathBuf::from(format!("{}/valid-minimal.pdf", FIXTURES_DIR));
    if !pdf_path.exists() {
        return;
    }

    // `valid-minimal.pdf` -> `valid-minimal.expected.json`
    let expected_path = pdf_path.with_extension("expected.json");
    let fixture = Fixture {
        name: "valid-minimal".to_string(),
        expected_path: expected_path.exists().then_some(expected_path),
        pdf_path,
    };
    test_fixture(&fixture);
}