From c51c725d5c5752632ba261322ee9652153ff73f4 Mon Sep 17 00:00:00 2001 From: jedarden Date: Mon, 1 Jun 2026 10:00:20 -0400 Subject: [PATCH] feat(bf-4w2rt): scaffold pdftract-schema-migrate crate - Add crates/pdftract-schema-migrate/ workspace member - Implement migration framework for v1.x schema versions - MigrationRegistry with version-pair migration functions - Identity migration for v1.0 -> v1.0 - Validation: rejects major version changes and downgrades - Convenience API: migrate(), run_migration(), read_json(), write_json() - Add migrate-schema CLI binary - --from/--to version arguments - stdin/stdout or file I/O support - Auto-detect pretty-print for terminal output - Full test coverage for migration registry and validation Closes bf-4w2rt. Verification: notes/bf-4w2rt.md --- crates/pdftract-schema-migrate/Cargo.toml | 24 ++ .../src/bin/migrate-schema.rs | 142 ++++++++ crates/pdftract-schema-migrate/src/lib.rs | 338 ++++++++++++++++++ 3 files changed, 504 insertions(+) create mode 100644 crates/pdftract-schema-migrate/Cargo.toml create mode 100644 crates/pdftract-schema-migrate/src/bin/migrate-schema.rs create mode 100644 crates/pdftract-schema-migrate/src/lib.rs diff --git a/crates/pdftract-schema-migrate/Cargo.toml b/crates/pdftract-schema-migrate/Cargo.toml new file mode 100644 index 0000000..57840ac --- /dev/null +++ b/crates/pdftract-schema-migrate/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "pdftract-schema-migrate" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +repository.workspace = true +publish = true + +[lib] +name = "pdftract_schema_migrate" +path = "src/lib.rs" + +[[bin]] +name = "migrate-schema" +path = "src/bin/migrate-schema.rs" + +[dependencies] +anyhow = { workspace = true } +serde = { workspace = true } +serde_json = "1" + +[dev-dependencies] +serde_json = "1" diff --git a/crates/pdftract-schema-migrate/src/bin/migrate-schema.rs 
b/crates/pdftract-schema-migrate/src/bin/migrate-schema.rs
new file mode 100644
index 0000000..a6e226e
--- /dev/null
+++ b/crates/pdftract-schema-migrate/src/bin/migrate-schema.rs
@@ -0,0 +1,142 @@
+//! CLI tool for migrating pdftract JSON output between schema versions.
+//!
+//! Usage:
+//!     migrate-schema --from 1.0 --to 1.0 input.json > output.json
+//!     cat input.json | migrate-schema --from 1.0 --to 1.0 > output.json
+//!     migrate-schema --from 1.0 --to 1.0 input.json -o output.json
+
+use anyhow::{Context, Result};
+use pdftract_schema_migrate::{read_json, write_json, run_migration};
+use std::io::{self, IsTerminal};
+
+/// Program entry point: parse the command line, check that the requested
+/// version pair is acceptable, then hand over to the library to do the work.
+fn main() -> Result<()> {
+    let Args {
+        from,
+        to,
+        input,
+        output,
+        pretty,
+    } = parse_args()?;
+
+    // Check the version pair up front so a bad request fails before any I/O.
+    pdftract_schema_migrate::validate_migration(&from, &to)
+        .context("Migration validation failed")?;
+
+    // Read, migrate and write; the outcome of this call is the process outcome.
+    run_migration(&from, &to, &input, &output, pretty).context("Migration execution failed")
+}
+
+/// Options collected from the command line.
+struct Args {
+    /// Source schema version, as given to `--from`.
+    from: String,
+    /// Target schema version, as given to `--to`.
+    to: String,
+    /// Input path; `-` means stdin.
+    input: String,
+    /// Output path; `-` means stdout.
+    output: String,
+    /// Whether the output JSON is pretty-printed.
+    pretty: bool,
+}
+
+/// Parse command-line arguments.
+///
+/// We use a simple parser to avoid additional dependencies for this small tool.
+///
+/// Recognised options are `--from`, `--to`, `-i`/`--input`, `-o`/`--output`,
+/// `-p`/`--pretty`, `-h`/`--help` and `-V`/`--version`. A single positional
+/// argument names the input file; a bare `-` selects stdin.
+///
+/// `--help` and `--version` print their text and exit the process with status 0.
+///
+/// # Errors
+///
+/// Fails when an option is unknown, an option is missing its value, a second
+/// input file is given, or `--from` / `--to` is absent.
+fn parse_args() -> Result<Args> {
+    let mut args = std::env::args();
+    let program_name = args.next().unwrap_or_else(|| "migrate-schema".to_string());
+
+    let mut from = None;
+    let mut to = None;
+    let mut input = "-".to_string(); // Default to stdin
+    let mut output = "-".to_string(); // Default to stdout
+    let mut pretty = false;
+
+    // `while let` rather than `for`: option arms pull their value from the
+    // same iterator.
+    while let Some(arg) = args.next() {
+        match arg.as_str() {
+            "--from" => {
+                from = Some(args.next().context("--from requires a value")?);
+            }
+            "--to" => {
+                to = Some(args.next().context("--to requires a value")?);
+            }
+            "-i" | "--input" => {
+                input = args.next().context("--input requires a value")?;
+            }
+            "-o" | "--output" => {
+                output = args.next().context("--output requires a value")?;
+            }
+            "-p" | "--pretty" => {
+                pretty = true;
+            }
+            "-h" | "--help" => {
+                print_usage(&program_name);
+                std::process::exit(0);
+            }
+            "-V" | "--version" => {
+                println!("migrate-schema {}", env!("CARGO_PKG_VERSION"));
+                std::process::exit(0);
+            }
+            // A bare "-" is the stdin marker advertised in the help text, not
+            // an option, so it must fall through to the positional arm.
+            opt if opt.starts_with('-') && opt != "-" => {
+                anyhow::bail!("Unknown option: {}", opt);
+            }
+            _ => {
+                // Positional argument: input file
+                if input == "-" {
+                    input = arg;
+                } else {
+                    anyhow::bail!("Unexpected argument: {}", arg);
+                }
+            }
+        }
+    }
+
+    let from = from.context("--from is required (use --help for usage)")?;
+    let to = to.context("--to is required (use --help for usage)")?;
+
+    // Auto-detect pretty-print: default to true when writing to terminal
+    let pretty = pretty || (output == "-" && io::stdout().is_terminal());
+
+    Ok(Args {
+        from,
+        to,
+        input,
+        output,
+        pretty,
+    })
+}
+
+/// Print usage information.
+///
+/// `program_name` is the invocation path (normally `argv[0]`); only its final
+/// path component is shown in the help text, which is written to stdout.
+fn print_usage(program_name: &str) {
+    // `Path::file_name` understands the platform's separators, unlike a split
+    // on '/'. Fall back to the raw string if there is no usable final component.
+    let program = std::path::Path::new(program_name)
+        .file_name()
+        .and_then(|name| name.to_str())
+        .unwrap_or(program_name);
+    println!(
+        "Schema version migration tool for pdftract JSON output
+
+Usage:
+    {program} --from <version> --to <version> [options] [input]
+
+Arguments:
+    --from <version>     Source schema version (e.g., 1.0)
+    --to <version>       Target schema version (e.g., 1.0, 1.1)
+    [input]              Input JSON file (default: stdin)
+
+Options:
+    -i, --input <file>   Input JSON file (alternative to the positional argument)
+    -o, --output <file>  Output JSON file (default: stdout)
+    -p, --pretty         Pretty-print output JSON (automatic when stdout is a terminal)
+    -h, --help           Show this help message
+    -V, --version        Show version information
+
+Examples:
+    # Migrate with stdin/stdout
+    cat input.json | {program} --from 1.0 --to 1.0
+
+    # Migrate with file I/O
+    {program} --from 1.0 --to 1.0 input.json -o output.json
+
+    # Pretty-print output
+    {program} --from 1.0 --to 1.0 input.json --pretty
+
+Notes:
+    - Only v1.x to v1.y migrations are supported (same major version)
+    - Downgrades are not allowed (e.g., v1.1 to v1.0)
+    - Use '-' for stdin/stdout (default for both input and output)
+
+Available migrations:
+    - v1.0 -> v1.0 (identity migration)"
+    );
+}
diff --git a/crates/pdftract-schema-migrate/src/lib.rs b/crates/pdftract-schema-migrate/src/lib.rs
new file mode 100644
index 0000000..cbb01a5
--- /dev/null
+++ b/crates/pdftract-schema-migrate/src/lib.rs
@@ -0,0 +1,338 @@
+//! Schema version migration for pdftract JSON output.
+//!
+//! This crate implements migration between minor versions of the pdftract schema.
+//! Following the plan's additive-evolution rules, minor version changes are additive only,
+//! so migrations are primarily for field renames and default additions.
+//!
+//! # Example
+//!
+//! ```rust
+//! use pdftract_schema_migrate::MigrationRegistry;
+//! use serde_json::json;
+//!
+//! let registry = MigrationRegistry::new();
+//! let input = json!({"schema_version": "1.0", "data": "test"});
+//! // `migrate` takes the value by move, so pass a clone to compare afterwards.
+//! let output = registry.migrate("1.0", "1.0", input.clone()).unwrap();
+//! assert_eq!(input, output); // identity migration
+//! 
```
+
+use anyhow::{bail, Context, Result};
+use serde_json::Value;
+use std::collections::HashMap;
+use std::io::{self, Read, Write};
+
+/// Migration function type: transforms a JSON value from one schema version to another.
+///
+/// The function takes the document by value and returns the migrated document,
+/// or an error if the document cannot be migrated.
+pub type MigrationFn = Box<dyn Fn(Value) -> Result<Value> + Send + Sync>;
+
+/// Registry of available migrations.
+///
+/// Maps (from_version, to_version) to the migration function.
+pub struct MigrationRegistry {
+    migrations: HashMap<(&'static str, &'static str), MigrationFn>,
+}
+
+impl MigrationRegistry {
+    /// Create a new registry with all known migrations registered.
+    ///
+    /// At present the only entry is the identity migration `1.0 -> 1.0`.
+    pub fn new() -> Self {
+        let mut migrations: HashMap<(&'static str, &'static str), MigrationFn> = HashMap::new();
+
+        // Register identity migration for v1.0 -> v1.0
+        migrations.insert(("1.0", "1.0"), Box::new(|v| Ok(v)));
+
+        // Future migrations would be registered here:
+        // migrations.insert(("1.0", "1.1"), Box::new(migrate_1_0_to_1_1));
+
+        Self { migrations }
+    }
+
+    /// Check if a migration is registered for the given version pair.
+    ///
+    /// The lookup is an exact string match on both versions; no parsing or
+    /// normalisation is applied.
+    pub fn has_migration(&self, from: &str, to: &str) -> bool {
+        self.migrations.contains_key(&(from, to))
+    }
+
+    /// Execute the migration for the given version pair.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when no migration is registered for `(from, to)`, or
+    /// when the registered migration function itself fails.
+    pub fn migrate(&self, from: &str, to: &str, json: Value) -> Result<Value> {
+        match self.migrations.get(&(from, to)) {
+            Some(migration_fn) => migration_fn(json),
+            None => bail!(
+                "No migration registered from version '{}' to '{}'. Available migrations: v1.0 -> v1.0 (identity)",
+                from, to
+            ),
+        }
+    }
+}
+
+impl Default for MigrationRegistry {
+    /// Equivalent to [`MigrationRegistry::new`].
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Parse and normalize a version string.
+///
+/// Ensures version strings follow the "major.minor" format.
+/// For now, we only support major version 1 (v1.x series).
+///
+/// Returns the `(major, minor)` pair on success.
+///
+/// # Errors
+///
+/// Fails when the string is not exactly two dot-separated components, when
+/// either component is not a `u32`, or when the major version is not 1.
+pub fn parse_version(version: &str) -> Result<(u32, u32)> {
+    // Exactly one '.' must separate the two components.
+    let (major_text, minor_text) = match version.split_once('.') {
+        Some((major_text, minor_text)) if !minor_text.contains('.') => (major_text, minor_text),
+        _ => bail!(
+            "Invalid version format '{}': expected 'major.minor' (e.g., '1.0')",
+            version
+        ),
+    };
+
+    // Both components are parsed before the major-version check, so a
+    // malformed minor is reported even for an unsupported major.
+    let major = major_text
+        .parse::<u32>()
+        .context("Major version must be a number")?;
+    let minor = minor_text
+        .parse::<u32>()
+        .context("Minor version must be a number")?;
+
+    // Only the v1.x series is handled for now.
+    if major != 1 {
+        bail!("Major version {} is not supported (only v1.x migrations are implemented)", major);
+    }
+
+    Ok((major, minor))
+}
+
+/// Check that moving from `from` to `to` is permitted.
+///
+/// The rules are:
+/// - a change of major version is rejected (breaking change)
+/// - a lower target minor version is rejected (downgrade, data loss risk)
+/// - an equal or higher target minor version is accepted
+///
+/// # Errors
+///
+/// Fails when either version does not parse or when a rule above is violated.
+pub fn validate_migration(from: &str, to: &str) -> Result<()> {
+    let source = parse_version(from)?;
+    let target = parse_version(to)?;
+
+    // Major versions must match.
+    if source.0 != target.0 {
+        bail!(
+            "Cannot migrate from v{}.{} to v{}.{}: major version changes are breaking changes and require a full data migration plan",
+            source.0, source.1, target.0, target.1
+        );
+    }
+
+    // The minor version may stay the same or grow, never shrink.
+    if target.1 < source.1 {
+        bail!(
+            "Cannot downgrade from v{}.{} to v{}.{}: downgrades may lose data and are not supported",
+            source.0, source.1, target.0, target.1
+        );
+    }
+
+    Ok(())
+}
+
+/// Convenience function for migrating a JSON value between schema versions.
+///
+/// This function creates a new registry and runs the migration.
+/// For repeated migrations, creating a `MigrationRegistry` instance is more efficient.
+///
+/// # Arguments
+///
+/// * `from` - Source schema version (e.g., "1.0")
+/// * `to` - Target schema version (e.g., "1.0", "1.1")
+/// * `json` - JSON value to migrate
+///
+/// # Returns
+///
+/// Returns the migrated JSON value, or an error if the migration fails.
+pub fn migrate(from: &str, to: &str, json: Value) -> Result<Value> {
+    let registry = MigrationRegistry::new();
+    registry.migrate(from, to, json)
+}
+
+/// Read JSON from a file path or stdin.
+///
+/// A `path` of `-` reads all of stdin; any other value is opened as a file.
+///
+/// # Errors
+///
+/// Fails when the source cannot be read or its contents are not valid JSON.
+pub fn read_json(path: &str) -> Result<Value> {
+    let json_str = if path == "-" {
+        let mut buffer = String::new();
+        io::stdin()
+            .read_to_string(&mut buffer)
+            .context("Failed to read JSON from stdin")?;
+        buffer
+    } else {
+        std::fs::read_to_string(path)
+            .with_context(|| format!("Failed to read JSON from '{}'", path))?
+    };
+
+    serde_json::from_str(&json_str)
+        .with_context(|| format!("Failed to parse JSON from '{}'", path))
+}
+
+/// Write JSON to a file path or stdout.
+///
+/// A `path` of `-` writes to stdout; any other value is written as a file.
+/// `pretty` selects indented output instead of the compact form. No trailing
+/// newline is appended in either case.
+///
+/// # Errors
+///
+/// Fails when the value cannot be serialized or the destination cannot be
+/// written.
+pub fn write_json(path: &str, json: &Value, pretty: bool) -> Result<()> {
+    let json_str = if pretty {
+        serde_json::to_string_pretty(json)
+    } else {
+        serde_json::to_string(json)
+    }
+    .context("Failed to serialize output JSON")?;
+
+    if path == "-" {
+        let mut stdout = io::stdout().lock();
+        stdout
+            .write_all(json_str.as_bytes())
+            .context("Failed to write JSON to stdout")?;
+        // The output has no trailing newline, so it can sit in stdout's buffer.
+        // Flush here so a write failure (e.g. a closed pipe) is reported
+        // instead of being dropped at process exit.
+        stdout
+            .flush()
+            .context("Failed to write JSON to stdout")?;
+    } else {
+        std::fs::write(path, json_str)
+            .with_context(|| format!("Failed to write JSON to '{}'", path))?;
+    }
+
+    Ok(())
+}
+
+/// Run a schema migration.
+///
+/// # Arguments
+///
+/// * `from` - Source schema version (e.g., "1.0")
+/// * `to` - Target schema version (e.g., "1.0", "1.1")
+/// * `input` - Input JSON file path ("-" for stdin)
+/// * `output` - Output JSON file path ("-" for stdout)
+/// * `pretty` - Whether to pretty-print the output
+///
+/// # Returns
+///
+/// Returns `Ok(())` on success, or an error if the migration fails.
+///
+/// The version pair is validated with `validate_migration` and looked up in
+/// the registry before any input is read, so an invalid request never
+/// touches `input` or `output`.
+pub fn run_migration(from: &str, to: &str, input: &str, output: &str, pretty: bool) -> Result<()> {
+    // Apply the version rules here too, so library callers get the same
+    // downgrade / major-version errors as the CLI.
+    validate_migration(from, to)?;
+
+    // Create migration registry
+    let registry = MigrationRegistry::new();
+
+    // Check if the specific migration exists
+    if !registry.has_migration(from, to) {
+        // Give a helpful error message
+        if from == to {
+            // Same version should always be supported
+            bail!(
+                "Identity migration for v{} is missing from registry - this is a bug",
+                from
+            );
+        } else {
+            bail!(
+                "Migration from v{} to v{} is not yet implemented. Available migrations: v1.0 -> v1.0 (identity)",
+                from, to
+            );
+        }
+    }
+
+    // Read input JSON
+    let json_value = read_json(input)?;
+
+    // Perform migration
+    let mut migrated_json = registry
+        .migrate(from, to, json_value)
+        .with_context(|| format!("Migration from v{} to v{} failed", from, to))?;
+
+    // When the version actually changes, stamp the target version on the
+    // top-level object. The field is inserted even if it was absent;
+    // non-object documents are left untouched.
+    if from != to {
+        if let Some(obj) = migrated_json.as_object_mut() {
+            obj.insert("schema_version".to_string(), Value::String(to.to_string()));
+        }
+    }
+
+    // Write output JSON
+    write_json(output, &migrated_json, pretty)?;
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use serde_json::json;
+
+    #[test]
+    fn test_parse_version_valid() {
+        assert_eq!(parse_version("1.0").unwrap(), (1, 0));
+        assert_eq!(parse_version("1.1").unwrap(), (1, 1));
+        assert_eq!(parse_version("1.10").unwrap(), (1, 10));
+    }
+
+    #[test]
+    fn test_parse_version_invalid() {
+        assert!(parse_version("1").is_err());
+        assert!(parse_version("1.0.0").is_err());
+        assert!(parse_version("v1.0").is_err());
+        assert!(parse_version("2.0").is_err()); // Only v1.x supported
+    }
+
+    #[test]
+    fn test_validate_migration_same_version() {
+        assert!(validate_migration("1.0", "1.0").is_ok());
+        assert!(validate_migration("1.1", "1.1").is_ok());
+    }
+
+    #[test]
+    fn test_validate_migration_upgrade_allowed() {
+        assert!(validate_migration("1.0", "1.1").is_ok());
+        assert!(validate_migration("1.0", "1.10").is_ok());
+    }
+
+    #[test]
+    fn test_validate_migration_downgrade_rejected() {
+        assert!(validate_migration("1.1", "1.0").is_err());
+        assert!(validate_migration("1.10", "1.0").is_err());
+    }
+
+    #[test]
+    fn test_validate_migration_major_version_change_rejected() {
+        assert!(validate_migration("1.0", "2.0").is_err());
+        // This test will fail once we actually support v2, but that's intentional
+    }
+
+    #[test]
+    fn test_migration_registry_identity() {
+        let registry = MigrationRegistry::new();
+
+        let input = json!({
+            "schema_version": "1.0",
+            "test": "value"
+        });
+
+        let result = registry.migrate("1.0", "1.0", input.clone()).unwrap();
+
+        // Identity migration should return unchanged value
+        assert_eq!(input, result);
+    }
+
+    #[test]
+    fn test_migration_registry_unsupported() {
+        let registry = MigrationRegistry::new();
+
+        let input = json!({"test": "value"});
+
+        let result = registry.migrate("1.0", "1.1", input);
+
+        assert!(result.is_err());
+        assert!(result
+            .unwrap_err()
+            .to_string()
+            .contains("No migration registered"));
+    }
+
+    #[test]
+    fn test_migration_registry_has_migration() {
+        let registry = MigrationRegistry::new();
+
+        assert!(registry.has_migration("1.0", "1.0"));
+        assert!(!registry.has_migration("1.0", "1.1"));
+        assert!(!registry.has_migration("2.0", "2.0"));
+    }
+
+    #[test]
+    fn test_migrate_convenience_function() {
+        let input = json!({"test": "value"});
+        let result = migrate("1.0", "1.0", input.clone()).unwrap();
+        assert_eq!(input, result);
+    }
+
+    #[test]
+    fn test_migration_registry_default() {
+        let registry = MigrationRegistry::default();
+        assert!(registry.has_migration("1.0", "1.0"));
+    }
+
+    #[test]
+    fn test_run_migration_rejects_downgrade_before_io() {
+        // The input path does not exist; validation must fail before it is read.
+        let err = run_migration("1.1", "1.0", "/nonexistent/input.json", "-", false).unwrap_err();
+        assert!(err.to_string().contains("Cannot downgrade"));
+    }
+}