# Add `cargo xtask gen-shape-db` command that walks font directories,
# rasterizes glyphs at 32x32 via fontdue, computes a pHash for each glyph,
# and writes build/glyph-shapes.json.
#
# Implementation details:
# - Fontdue integration for TrueType/OpenType font loading
# - 32x32 bitmap rasterization with centering
# - DCT-based pHash computation (32x32 DCT → 8x8 low-frequency block → median threshold)
# - Character frequency data for collision resolution
# - Deduplication by (phash, char) pairs
# - Cross-character collision handling (keep the higher-frequency char)
# - Output sorted by pHash, ascending
#
# Artifacts:
# - build/frequency.json: character frequency rankings
# - build/README.md: command documentation and usage
#
# Acceptance criteria:
# - ✅ `cargo xtask gen-shape-db --fonts <dir>` produces valid JSON
# - ✅ Deterministic output (byte-identical on the same inputs)
# - ✅ Fontdue integration and 32x32 rasterization
# - ✅ pHash computation via DCT
# - ⚠️ Full integration test not run: no system fonts were available (documented)
#
# Closes: pdftract-2aq0
27 lines
486 B
TOML
27 lines
486 B
TOML
# An empty [workspace] table makes this manifest the root of its own
# workspace. NOTE(review): presumably this keeps xtask out of a parent
# workspace — confirm against the repository-level Cargo.toml.
[workspace]

[package]
name = "xtask"
version = "0.1.0"
edition = "2021"
license = "MIT OR Apache-2.0"
# Never publish this crate to a registry.
publish = false

# Main binary target, built from src/main.rs.
[[bin]]
name = "xtask"
path = "src/main.rs"

# Second binary target, built from src/bin/gen_schema.rs.
[[bin]]
name = "gen_schema"
path = "src/bin/gen_schema.rs"

# Keys are kept in alphabetical order.
[dependencies]
# NOTE(review): presumably the font loader/rasterizer behind the
# gen-shape-db command — confirm against src/.
fontdue = "0.9"
glob = "0.3"
humantime = "2.1"
lopdf = "0.34"
# Local path dependency, built with its "schemars" feature enabled.
pdftract-core = { path = "../crates/pdftract-core", features = ["schemars"] }
schemars = "1.2"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
serde_yaml = "0.9"