feat(pdftract-5nv9h): implement xtask gen-schema with stable ordering and proper metadata

Implement the xtask gen-schema binary at xtask/src/bin/gen_schema.rs that
derives JSON Schema Draft 2020-12 from the Rust ExtractionResult type via
the schemars crate.

Changes:
- Add stable key sorting (sort_keys_recursive) for byte-identical output
- Set $id to stable URL: https://pdftract.com/schema/v1.0/pdftract.schema.json
- Set title to "pdftract Output v1.0"
- Add cargo alias `gen-schema` for convenient invocation
- Emit schema to docs/schema/v1.0/pdftract.schema.json

The schema is generated from the Rust types via their schemars derives, so
regenerating it keeps the checked-in JSON Schema in sync with the source types.

Acceptance criteria:
- cargo gen-schema regenerates docs/schema/v1.0/pdftract.schema.json
- Generated schema validates against JSON Schema Draft 2020-12
- Schema $id is the stable URL
- Title is "pdftract Output v1.0"
- Stable ordering: regenerating twice produces byte-identical output
- All expected types appear in $defs (BlockJson, SpanJson, PageResult, etc.)

Note: page_type and confidence_source enums are not yet implemented in the
Rust types (marked as TODO in schema/mod.rs). These will be added by sibling
beads pdftract-1ob and pdftract-1f8we respectively.

Closes: pdftract-5nv9h
This commit is contained in:
jedarden 2026-05-24 17:31:16 -04:00
parent aebe37ca84
commit 016c738188
4 changed files with 639 additions and 411 deletions

View file

@ -14,6 +14,9 @@ cr = "check --release"
t = "test"
tr = "test --release"
# xtask aliases (invoke via --manifest-path to avoid workspace issues)
gen-schema = "run --manifest-path=xtask/Cargo.toml --bin gen_schema"
# Profile for CI property tests (nextest with proptest)
[profile.ci-proptest]
inherits = "release"

File diff suppressed because it is too large Load diff

133
xtask/Cargo.lock generated
View file

@ -213,6 +213,22 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
[[package]]
name = "errno"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
dependencies = [
"libc",
"windows-sys",
]
[[package]]
name = "fastrand"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
[[package]]
name = "find-msvc-tools"
version = "0.1.9"
@ -279,6 +295,17 @@ dependencies = [
"version_check",
]
[[package]]
name = "getrandom"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
dependencies = [
"cfg-if",
"libc",
"wasi",
]
[[package]]
name = "getrandom"
version = "0.3.4"
@ -378,7 +405,7 @@ version = "0.1.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
dependencies = [
"getrandom",
"getrandom 0.3.4",
"libc",
]
@ -400,6 +427,12 @@ version = "0.2.186"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
[[package]]
name = "linux-raw-sys"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
[[package]]
name = "lock_api"
version = "0.4.14"
@ -457,6 +490,15 @@ version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
[[package]]
name = "memmap2"
version = "0.9.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3"
dependencies = [
"libc",
]
[[package]]
name = "minimal-lexical"
version = "0.2.1"
@ -532,14 +574,17 @@ version = "0.1.0"
dependencies = [
"anyhow",
"dashmap",
"encoding_rs",
"flate2",
"hex",
"indexmap",
"lzw",
"memchr",
"memmap2",
"owned_ttf_parser",
"phf",
"phf_codegen",
"rand",
"rayon",
"regex",
"schemars",
@ -548,6 +593,7 @@ dependencies = [
"serde_json",
"sha2",
"smallvec",
"tempfile",
"thiserror",
"tracing",
"ttf-parser 0.24.1",
@ -611,6 +657,15 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
[[package]]
name = "ppv-lite86"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
dependencies = [
"zerocopy",
]
[[package]]
name = "proc-macro2"
version = "1.0.106"
@ -641,6 +696,18 @@ version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a"
dependencies = [
"libc",
"rand_chacha",
"rand_core",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core",
]
@ -649,6 +716,9 @@ name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom 0.2.17",
]
[[package]]
name = "rangemap"
@ -734,6 +804,19 @@ version = "0.8.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
[[package]]
name = "rustix"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
dependencies = [
"bitflags",
"errno",
"libc",
"linux-raw-sys",
"windows-sys",
]
[[package]]
name = "rustversion"
version = "1.0.22"
@ -905,6 +988,19 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "tempfile"
version = "3.27.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd"
dependencies = [
"fastrand",
"getrandom 0.3.4",
"once_cell",
"rustix",
"windows-sys",
]
[[package]]
name = "thiserror"
version = "1.0.69"
@ -1047,6 +1143,12 @@ version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]]
name = "wasi"
version = "0.11.1+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
[[package]]
name = "wasip2"
version = "1.0.3+wasi-0.2.9"
@ -1166,6 +1268,15 @@ dependencies = [
"windows-link",
]
[[package]]
name = "windows-sys"
version = "0.61.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
dependencies = [
"windows-link",
]
[[package]]
name = "wit-bindgen"
version = "0.57.1"
@ -1187,6 +1298,26 @@ dependencies = [
"serde_yaml",
]
[[package]]
name = "zerocopy"
version = "0.8.48"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.8.48"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "zeroize"
version = "1.8.2"

View file

@ -4,10 +4,12 @@
//! extraction output, which is checked into the repository at
//! docs/schema/v1.0/pdftract.schema.json.
//!
//! Usage: cargo run --bin gen_schema
//! Usage: cargo run --manifest-path=xtask/Cargo.toml --bin gen_schema
use std::collections::BTreeMap;
use std::fs;
use std::path::PathBuf;
use serde_json::Value;
fn main() -> Result<(), Box<dyn std::error::Error>> {
// Find the workspace root
@ -67,7 +69,55 @@ fn generate_schema() -> String {
let schema = schema_for!(ExtractionResult);
// Convert to JSON string
// The schema_for! macro already includes the $schema field
serde_json::to_string_pretty(&schema).expect("Failed to serialize schema")
// Convert to JSON value
let mut value = serde_json::to_value(&schema).expect("Failed to serialize schema");
// Set $id, title, and description on the root schema object
if let Some(obj) = value.as_object_mut() {
// Set $id to stable URL
obj.insert("$id".to_string(), Value::String(
"https://pdftract.com/schema/v1.0/pdftract.schema.json".to_string()
));
// Update title
obj.insert("title".to_string(), Value::String(
"pdftract Output v1.0".to_string()
));
// Update description
obj.insert("description".to_string(), Value::String(
"JSON Schema for pdftract PDF extraction output v1.0. \
This schema defines the structure of extraction results including pages, \
spans, blocks, tables, form fields, signatures, and metadata."
.to_string()
));
}
// Sort keys recursively for stable ordering
let sorted = sort_keys_recursive(value);
// Serialize with pretty printing
serde_json::to_string_pretty(&sorted).expect("Failed to serialize sorted schema")
}
/// Rebuild a JSON value so that every object is populated in ascending key order.
///
/// The tree is consumed and walked depth-first:
/// - objects have their entries gathered into a `BTreeMap` (which iterates in
///   sorted key order) and are then re-created from that sorted sequence, with
///   each child value normalized first;
/// - arrays keep their element order, but each element is normalized;
/// - every other value (null, bool, number, string) is returned untouched.
///
/// The intent is that regenerating the schema yields byte-identical output.
// NOTE(review): the emitted key order ultimately depends on how serde_json's
// map type iterates; this function only guarantees sorted insertion order.
fn sort_keys_recursive(value: Value) -> Value {
    match value {
        Value::Array(items) => {
            // Element order is significant in JSON arrays, so only recurse.
            Value::Array(items.into_iter().map(sort_keys_recursive).collect())
        }
        Value::Object(entries) => {
            // Collect into a BTreeMap first so the entries come back out sorted by key.
            let ordered: BTreeMap<_, _> = entries
                .into_iter()
                .map(|(key, child)| (key, sort_keys_recursive(child)))
                .collect();
            Value::Object(ordered.into_iter().collect())
        }
        scalar => scalar,
    }
}