feat(pdftract-5nv9h): implement xtask gen-schema with stable ordering and proper metadata
Implement the xtask gen-schema binary at xtask/src/bin/gen_schema.rs that derives JSON Schema Draft 2020-12 from the Rust ExtractionResult type via the schemars crate. Changes: - Add stable key sorting (sort_keys_recursive) for byte-identical output - Set $id to stable URL: https://pdftract.com/schema/v1.0/pdftract.schema.json - Set title to "pdftract Output v1.0" - Add cargo alias `gen-schema` for convenient invocation - Emit schema to docs/schema/v1.0/pdftract.schema.json The schema is generated from the Rust types with schemars derives, ensuring the JSON schema is always in sync with the source types. Acceptance criteria: - cargo gen-schema regenerates docs/schema/v1.0/pdftract.schema.json - Generated schema validates against JSON Schema Draft 2020-12 - Schema $id is the stable URL - Title is "pdftract Output v1.0" - Stable ordering: regenerating twice produces byte-identical output - All expected types appear in $defs (BlockJson, SpanJson, PageResult, etc.) Note: page_type and confidence_source enums are not yet implemented in the Rust types (marked as TODO in schema/mod.rs). These will be added by sibling beads pdftract-1ob and pdftract-1f8we respectively. Closes: pdftract-5nv9h
This commit is contained in:
parent
aebe37ca84
commit
016c738188
4 changed files with 639 additions and 411 deletions
|
|
@ -14,6 +14,9 @@ cr = "check --release"
|
||||||
t = "test"
|
t = "test"
|
||||||
tr = "test --release"
|
tr = "test --release"
|
||||||
|
|
||||||
|
# xtask aliases (invoke via --manifest-path to avoid workspace issues)
|
||||||
|
gen-schema = "run --manifest-path=xtask/Cargo.toml --bin gen_schema"
|
||||||
|
|
||||||
# Profile for CI property tests (nextest with proptest)
|
# Profile for CI property tests (nextest with proptest)
|
||||||
[profile.ci-proptest]
|
[profile.ci-proptest]
|
||||||
inherits = "release"
|
inherits = "release"
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load diff
133
xtask/Cargo.lock
generated
133
xtask/Cargo.lock
generated
|
|
@ -213,6 +213,22 @@ version = "1.0.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
|
checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "errno"
|
||||||
|
version = "0.3.14"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"windows-sys",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "fastrand"
|
||||||
|
version = "2.4.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "find-msvc-tools"
|
name = "find-msvc-tools"
|
||||||
version = "0.1.9"
|
version = "0.1.9"
|
||||||
|
|
@ -279,6 +295,17 @@ dependencies = [
|
||||||
"version_check",
|
"version_check",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "getrandom"
|
||||||
|
version = "0.2.17"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"libc",
|
||||||
|
"wasi",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "getrandom"
|
name = "getrandom"
|
||||||
version = "0.3.4"
|
version = "0.3.4"
|
||||||
|
|
@ -378,7 +405,7 @@ version = "0.1.34"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
|
checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"getrandom",
|
"getrandom 0.3.4",
|
||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -400,6 +427,12 @@ version = "0.2.186"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
|
checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "linux-raw-sys"
|
||||||
|
version = "0.12.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lock_api"
|
name = "lock_api"
|
||||||
version = "0.4.14"
|
version = "0.4.14"
|
||||||
|
|
@ -457,6 +490,15 @@ version = "2.8.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
|
checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "memmap2"
|
||||||
|
version = "0.9.10"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "minimal-lexical"
|
name = "minimal-lexical"
|
||||||
version = "0.2.1"
|
version = "0.2.1"
|
||||||
|
|
@ -532,14 +574,17 @@ version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"dashmap",
|
"dashmap",
|
||||||
|
"encoding_rs",
|
||||||
"flate2",
|
"flate2",
|
||||||
"hex",
|
"hex",
|
||||||
"indexmap",
|
"indexmap",
|
||||||
"lzw",
|
"lzw",
|
||||||
"memchr",
|
"memchr",
|
||||||
|
"memmap2",
|
||||||
"owned_ttf_parser",
|
"owned_ttf_parser",
|
||||||
"phf",
|
"phf",
|
||||||
"phf_codegen",
|
"phf_codegen",
|
||||||
|
"rand",
|
||||||
"rayon",
|
"rayon",
|
||||||
"regex",
|
"regex",
|
||||||
"schemars",
|
"schemars",
|
||||||
|
|
@ -548,6 +593,7 @@ dependencies = [
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"sha2",
|
"sha2",
|
||||||
"smallvec",
|
"smallvec",
|
||||||
|
"tempfile",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"tracing",
|
"tracing",
|
||||||
"ttf-parser 0.24.1",
|
"ttf-parser 0.24.1",
|
||||||
|
|
@ -611,6 +657,15 @@ version = "0.2.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
|
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ppv-lite86"
|
||||||
|
version = "0.2.21"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
|
||||||
|
dependencies = [
|
||||||
|
"zerocopy",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "proc-macro2"
|
name = "proc-macro2"
|
||||||
version = "1.0.106"
|
version = "1.0.106"
|
||||||
|
|
@ -641,6 +696,18 @@ version = "0.8.6"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a"
|
checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"rand_chacha",
|
||||||
|
"rand_core",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rand_chacha"
|
||||||
|
version = "0.3.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
|
||||||
|
dependencies = [
|
||||||
|
"ppv-lite86",
|
||||||
"rand_core",
|
"rand_core",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -649,6 +716,9 @@ name = "rand_core"
|
||||||
version = "0.6.4"
|
version = "0.6.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
||||||
|
dependencies = [
|
||||||
|
"getrandom 0.2.17",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rangemap"
|
name = "rangemap"
|
||||||
|
|
@ -734,6 +804,19 @@ version = "0.8.10"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
|
checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rustix"
|
||||||
|
version = "1.1.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
|
||||||
|
dependencies = [
|
||||||
|
"bitflags",
|
||||||
|
"errno",
|
||||||
|
"libc",
|
||||||
|
"linux-raw-sys",
|
||||||
|
"windows-sys",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustversion"
|
name = "rustversion"
|
||||||
version = "1.0.22"
|
version = "1.0.22"
|
||||||
|
|
@ -905,6 +988,19 @@ dependencies = [
|
||||||
"unicode-ident",
|
"unicode-ident",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tempfile"
|
||||||
|
version = "3.27.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd"
|
||||||
|
dependencies = [
|
||||||
|
"fastrand",
|
||||||
|
"getrandom 0.3.4",
|
||||||
|
"once_cell",
|
||||||
|
"rustix",
|
||||||
|
"windows-sys",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "thiserror"
|
name = "thiserror"
|
||||||
version = "1.0.69"
|
version = "1.0.69"
|
||||||
|
|
@ -1047,6 +1143,12 @@ version = "0.9.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "wasi"
|
||||||
|
version = "0.11.1+wasi-snapshot-preview1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "wasip2"
|
name = "wasip2"
|
||||||
version = "1.0.3+wasi-0.2.9"
|
version = "1.0.3+wasi-0.2.9"
|
||||||
|
|
@ -1166,6 +1268,15 @@ dependencies = [
|
||||||
"windows-link",
|
"windows-link",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-sys"
|
||||||
|
version = "0.61.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
|
||||||
|
dependencies = [
|
||||||
|
"windows-link",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "wit-bindgen"
|
name = "wit-bindgen"
|
||||||
version = "0.57.1"
|
version = "0.57.1"
|
||||||
|
|
@ -1187,6 +1298,26 @@ dependencies = [
|
||||||
"serde_yaml",
|
"serde_yaml",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zerocopy"
|
||||||
|
version = "0.8.48"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9"
|
||||||
|
dependencies = [
|
||||||
|
"zerocopy-derive",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zerocopy-derive"
|
||||||
|
version = "0.8.48"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "zeroize"
|
name = "zeroize"
|
||||||
version = "1.8.2"
|
version = "1.8.2"
|
||||||
|
|
|
||||||
|
|
@ -4,10 +4,12 @@
|
||||||
//! extraction output, which is checked into the repository at
|
//! extraction output, which is checked into the repository at
|
||||||
//! docs/schema/v1.0/pdftract.schema.json.
|
//! docs/schema/v1.0/pdftract.schema.json.
|
||||||
//!
|
//!
|
||||||
//! Usage: cargo run --bin gen_schema
|
//! Usage: cargo run --manifest-path=xtask/Cargo.toml --bin gen_schema
|
||||||
|
|
||||||
|
use std::collections::BTreeMap;
|
||||||
use std::fs;
|
use std::fs;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
use serde_json::Value;
|
||||||
|
|
||||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||||
// Find the workspace root
|
// Find the workspace root
|
||||||
|
|
@ -67,7 +69,55 @@ fn generate_schema() -> String {
|
||||||
|
|
||||||
let schema = schema_for!(ExtractionResult);
|
let schema = schema_for!(ExtractionResult);
|
||||||
|
|
||||||
// Convert to JSON string
|
// Convert to JSON value
|
||||||
// The schema_for! macro already includes the $schema field
|
let mut value = serde_json::to_value(&schema).expect("Failed to serialize schema");
|
||||||
serde_json::to_string_pretty(&schema).expect("Failed to serialize schema")
|
|
||||||
|
// Set $id, title, and description on the root schema object
|
||||||
|
if let Some(obj) = value.as_object_mut() {
|
||||||
|
// Set $id to stable URL
|
||||||
|
obj.insert("$id".to_string(), Value::String(
|
||||||
|
"https://pdftract.com/schema/v1.0/pdftract.schema.json".to_string()
|
||||||
|
));
|
||||||
|
|
||||||
|
// Update title
|
||||||
|
obj.insert("title".to_string(), Value::String(
|
||||||
|
"pdftract Output v1.0".to_string()
|
||||||
|
));
|
||||||
|
|
||||||
|
// Update description
|
||||||
|
obj.insert("description".to_string(), Value::String(
|
||||||
|
"JSON Schema for pdftract PDF extraction output v1.0. \
|
||||||
|
This schema defines the structure of extraction results including pages, \
|
||||||
|
spans, blocks, tables, form fields, signatures, and metadata."
|
||||||
|
.to_string()
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort keys recursively for stable ordering
|
||||||
|
let sorted = sort_keys_recursive(value);
|
||||||
|
|
||||||
|
// Serialize with pretty printing
|
||||||
|
serde_json::to_string_pretty(&sorted).expect("Failed to serialize sorted schema")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Recursively sort all object keys alphabetically for stable diff output.
|
||||||
|
///
|
||||||
|
/// This function walks the entire JSON value tree and sorts all object keys
|
||||||
|
/// in BTreeMap order, ensuring that regenerating the schema produces
|
||||||
|
/// byte-identical output.
|
||||||
|
fn sort_keys_recursive(value: Value) -> Value {
|
||||||
|
match value {
|
||||||
|
Value::Object(map) => {
|
||||||
|
let mut sorted = BTreeMap::new();
|
||||||
|
for (k, v) in map {
|
||||||
|
sorted.insert(k, sort_keys_recursive(v));
|
||||||
|
}
|
||||||
|
Value::Object(sorted.into_iter().collect())
|
||||||
|
}
|
||||||
|
Value::Array(arr) => {
|
||||||
|
let sorted: Vec<Value> = arr.into_iter().map(sort_keys_recursive).collect();
|
||||||
|
Value::Array(sorted)
|
||||||
|
}
|
||||||
|
_ => value,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue