- Add worked example to Glyph struct showing all 11 fields - Add worked example to Span struct showing all 10 fields - Examples use rust,no_run for internal dependencies - cargo doc passes with docs.rs feature set - Verification note added at notes/pdftract-3eohy.md Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
167 lines
5.2 KiB
Rust
167 lines
5.2 KiB
Rust
//! JSON validation subcommand.
|
|
//!
|
|
//! Implements the `pdftract validate` command that validates JSON files
|
|
//! against the pdftract schema. Useful for validating cached results,
|
|
//! MCP-tool responses captured to disk, and profile-extracted outputs.
|
|
|
|
use anyhow::{Context, Result};
|
|
use serde_json::Value;
|
|
use std::fs;
|
|
use std::io::{self, Read};
|
|
use std::path::Path;
|
|
|
|
/// The bundled JSON Schema for pdftract extraction output v1.0.
|
|
///
|
|
/// Loaded from the committed schema file at build time.
|
|
const BUNDLED_SCHEMA_JSON: &str = include_str!("../../../docs/schema/v1.0/pdftract.schema.json");
|
|
|
|
/// Arguments for the validate subcommand.
///
/// Derives `Debug` and `Clone` so callers can log the parsed arguments and
/// reuse them without rebuilding the struct by hand.
#[derive(Debug, Clone)]
pub struct ValidateArgs {
    /// Path to the JSON file to validate, or `"-"` to read the document
    /// from standard input.
    pub file: String,
    /// Optional path to a custom schema file. When `None`, the schema
    /// bundled with the binary is used.
    pub schema_path: Option<String>,
    /// Quiet mode: when `true`, individual validation errors are not
    /// printed.
    pub quiet: bool,
}
|
|
|
|
/// Load the schema from a path or use the bundled schema.
|
|
fn load_schema(schema_path: Option<&str>) -> Result<jsonschema::JSONSchema> {
|
|
let schema_json = if let Some(path) = schema_path {
|
|
// Load custom schema from file
|
|
fs::read_to_string(path)
|
|
.with_context(|| format!("Failed to read schema from '{}'", path))?
|
|
} else {
|
|
// Use bundled schema
|
|
BUNDLED_SCHEMA_JSON.to_string()
|
|
};
|
|
|
|
let schema: Value = serde_json::from_str(&schema_json)
|
|
.context("Schema is not valid JSON")?;
|
|
|
|
jsonschema::JSONSchema::compile(&schema)
|
|
.context("Schema is not valid JSON Schema Draft 2020-12")
|
|
}
|
|
|
|
/// Read JSON from a file path or stdin.
|
|
fn read_json(file: &str) -> Result<Value> {
|
|
let json_str = if file == "-" {
|
|
// Read from stdin
|
|
let mut buffer = String::new();
|
|
io::stdin().read_to_string(&mut buffer)
|
|
.context("Failed to read JSON from stdin")?;
|
|
buffer
|
|
} else {
|
|
// Read from file
|
|
fs::read_to_string(file)
|
|
.with_context(|| format!("Failed to read JSON from '{}'", file))?
|
|
};
|
|
|
|
serde_json::from_str(&json_str)
|
|
.with_context(|| format!("Failed to parse JSON from '{}'", file))
|
|
}
|
|
|
|
/// Normalize an instance path for display so that it begins with `/`.
///
/// The jsonschema crate reports locations as JSON Pointers such as
/// `/pages/0/spans/3/text`, which are already readable. This function only
/// handles the edges:
///
/// * an empty path (the document root) becomes `/`;
/// * a path that already starts with `/` is returned unchanged;
/// * any other path gets a `/` prepended.
fn format_path(instance_path: &str) -> String {
    match instance_path {
        "" => "/".to_string(),
        rooted if rooted.starts_with('/') => rooted.to_string(),
        relative => format!("/{}", relative),
    }
}
|
|
|
|
/// Run the validate subcommand.
|
|
///
|
|
/// Returns Ok(()) if validation passes, Err otherwise.
|
|
pub fn run_validate(args: ValidateArgs) -> Result<()> {
|
|
let schema = load_schema(args.schema_path.as_deref())?;
|
|
|
|
let json_value = read_json(&args.file)?;
|
|
|
|
let result = schema.validate(&json_value);
|
|
|
|
if let Err(errors) = result {
|
|
// Collect all validation errors
|
|
let error_details: Vec<String> = errors.map(|e| {
|
|
let path = format_path(&e.instance_path.to_string());
|
|
format!("{} {}", path, e)
|
|
}).collect();
|
|
|
|
if !args.quiet {
|
|
for error in &error_details {
|
|
println!("{}", error);
|
|
}
|
|
}
|
|
|
|
// Return error to trigger exit code 1
|
|
anyhow::bail!("JSON validation failed with {} error(s)", error_details.len());
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty, rooted and unrooted paths all come back starting with `/`.
    #[test]
    fn test_format_path() {
        let cases = [
            ("", "/"),
            ("/pages/0/spans/3/text", "/pages/0/spans/3/text"),
            ("pages/0/spans/3/text", "/pages/0/spans/3/text"),
        ];

        for &(input, expected) in &cases {
            assert_eq!(format_path(input), expected);
        }
    }

    /// The schema embedded in the binary must parse and compile.
    #[test]
    fn test_bundled_schema_is_valid() {
        let _compiled = load_schema(None).unwrap();
    }

    /// A document carrying only the fields listed below is accepted by the
    /// bundled schema.
    #[test]
    fn test_minimal_valid_json_passes() {
        let schema = load_schema(None).unwrap();

        let document = serde_json::json!({
            "schema_version": "1.0",
            "metadata": {
                "page_count": 1,
                "is_tagged": false,
                "is_encrypted": false,
                "contains_javascript": false,
                "contains_xfa": false,
                "ocg_present": false,
                "conformance": "none",
                "javascript_actions": []
            },
            "outline": [],
            "threads": [],
            "attachments": [],
            "signatures": [],
            "form_fields": [],
            "links": [],
            "pages": [{
                "page_index": 0,
                "page_number": 1,
                "width": 612.0,
                "height": 792.0,
                "rotation": 0,
                "type": "text",
                "spans": [],
                "blocks": [],
                "tables": [],
                "annotations": []
            }],
            "extraction_quality": {
                "overall_quality": "none"
            },
            "errors": []
        });

        let verdict = schema.validate(&document);
        assert!(verdict.is_ok(), "Minimal valid JSON should pass validation");
    }
}
|