pdftract/crates/pdftract-core/build.rs
jedarden 54fe6c1964 feat(pdftract-1xf4d): implement TH-06 supply-chain gate
- Add minimum version requirements to deny.toml (ring >= 0.17.5, rustls >= 0.23)
- Create build/CHECKSUMS.sha256 for build-time data file integrity
- Update build.rs to verify checksums on every build
- Add tampering detection tests (th06_checksum_test.rs)
- Create nightly supply-chain scan workflow (pdftract-nightly-supply-chain.yaml)
- Update audit.toml with advisory exceptions

Closes: pdftract-1xf4d
Refs: plan lines 877, 883-896, 906-913
2026-05-26 17:31:13 -04:00

995 lines
32 KiB
Rust

use std::env;
use std::fs;
use std::path::Path;
fn main() {
println!("cargo:rerun-if-changed=build/std14-metrics.json");
println!("cargo:rerun-if-changed=build/named-encodings.json");
println!("cargo:rerun-if-changed=build/agl.json");
println!("cargo:rerun-if-changed=build/font-fingerprints.json");
println!("cargo:rerun-if-changed=build/predefined-cmaps/");
println!("cargo:rerun-if-changed=build/glyph-shapes.json");
println!("cargo:rerun-if-changed=build/wordlist-en-20k.txt");
println!("cargo:rerun-if-changed=build/CHECKSUMS.sha256");
// Verify build-time data file checksums (TH-06 supply-chain gate)
if let Err(e) = verify_checksums() {
eprintln!("cargo:warning=Checksum verification failed: {}", e);
eprintln!("cargo:warning=Build-time data files may have been tampered with or need regeneration.");
eprintln!("cargo:warning=To regenerate CHECKSUMS.sha256, run: cd crates/pdftract-core/build && sha256sum std14-metrics.json named-encodings.json agl.json font-fingerprints.json wordlist-en-20k.txt predefined-cmaps/*.json > CHECKSUMS.sha256 && sha256sum ../../../build/glyph-shapes.json >> CHECKSUMS.sha256");
panic!("Checksum verification failed - aborting build");
}
let out_dir = env::var("OUT_DIR").unwrap();
let out_path = Path::new(&out_dir);
let metrics_path = Path::new("build/std14-metrics.json");
// Generate std14 metrics
generate_std14_metrics(out_path, metrics_path);
// Generate named encoding tables
let encodings_path = Path::new("build/named-encodings.json");
generate_named_encodings(out_path, encodings_path);
// Generate AGL phf maps
let agl_path = Path::new("build/agl.json");
generate_agl_maps(out_path, agl_path);
// Generate font fingerprint phf map
let fingerprints_path = Path::new("build/font-fingerprints.json");
generate_font_fingerprints(out_path, fingerprints_path);
// Generate predefined CMap registry
generate_predefined_cmaps(out_path);
// Generate glyph shape database
let shapes_path = Path::new("build/glyph-shapes.json");
generate_shape_db(out_path, shapes_path);
// Generate English wordlist
let wordlist_path = Path::new("build/wordlist-en-20k.txt");
generate_wordlist(out_path, wordlist_path);
}
/// Emit `std14_registry.rs` into `out_dir` from the Standard 14 metrics JSON.
///
/// For every entry of the top-level `"fonts"` object this writes an
/// `<IDENT>_WIDTHS` table and an `<IDENT>_METRICS` static, where `<IDENT>` is
/// the font name with `-` replaced by `_` and upper-cased. A
/// `get_std14_metrics` lookup function backed by a phf map keyed on the
/// original font name is appended at the end.
///
/// Non-numeric elements of the `weights` and `font_bbox` arrays are emitted
/// as `0`; any encoding name other than `SymbolEncoding` or
/// `ZapfDingbatsEncoding` maps to `NamedEncoding::Standard`.
///
/// # Panics
///
/// Panics if the input cannot be read or parsed, if a required field is
/// missing or has the wrong JSON type, or if the output cannot be written.
fn generate_std14_metrics(out_dir: &Path, metrics_path: &Path) {
    let raw = fs::read_to_string(metrics_path).expect("Failed to read std14-metrics.json");
    let root: serde_json::Value =
        serde_json::from_str(&raw).expect("Failed to parse std14-metrics.json");
    let fonts = root["fonts"].as_object().expect("fonts object missing");

    let mut statics = String::new();
    let mut registry = phf_codegen::Map::new();

    for (font_name, font) in fonts {
        let ident = font_name.replace("-", "_").to_uppercase();

        // Required integer fields all fail with the message "<key> missing".
        let required_int = |key: &str| -> i64 {
            font[key]
                .as_i64()
                .unwrap_or_else(|| panic!("{} missing", key))
        };

        let widths = font["weights"]
            .as_array()
            .expect("weights array missing")
            .iter()
            .map(|w| w.as_u64().unwrap_or(0).to_string())
            .collect::<Vec<_>>()
            .join(", ");
        let bbox = font["font_bbox"]
            .as_array()
            .expect("font_bbox array missing")
            .iter()
            .map(|c| c.as_i64().unwrap_or(0).to_string())
            .collect::<Vec<_>>()
            .join(", ");
        let ascent = required_int("ascent");
        let descent = required_int("descent");
        let italic_angle = font["italic_angle"]
            .as_f64()
            .expect("italic_angle missing");
        let cap_height = required_int("cap_height");
        let stem_v = required_int("stem_v");
        let encoding = match font["encoding"].as_str().expect("encoding missing") {
            "SymbolEncoding" => "NamedEncoding::Symbol",
            "ZapfDingbatsEncoding" => "NamedEncoding::ZapfDingbats",
            // "StandardEncoding" and every unrecognised name use Standard.
            _ => "NamedEncoding::Standard",
        };

        statics.push_str(&format!(
            r#"
static {ident}_WIDTHS: &[u16; 256] = &[{widths}];
static {ident}_METRICS: Std14Metrics = Std14Metrics {{
widths: &{ident}_WIDTHS,
ascent: {ascent},
descent: {descent},
italic_angle: {italic_angle}f32,
font_bbox: [{bbox}],
cap_height: {cap_height},
stem_v: {stem_v},
encoding: {encoding},
}};
"#,
            ident = ident,
            widths = widths,
            ascent = ascent,
            descent = descent,
            italic_angle = italic_angle,
            bbox = bbox,
            cap_height = cap_height,
            stem_v = stem_v,
            encoding = encoding
        ));

        // The map is keyed by the original (un-mangled) font name.
        registry.entry(font_name.as_str(), &format!("&{}_METRICS", ident));
    }

    let rust_code = format!(
        r#"
// Auto-generated Standard 14 font metrics.
// Do not edit manually.
{}
pub fn get_std14_metrics(name: &str) -> Option<&'static Std14Metrics> {{
static METRICS: phf::Map<&'static str, &'static Std14Metrics> = {};
METRICS.get(name).copied()
}}
"#,
        statics,
        registry.build()
    );
    fs::write(out_dir.join("std14_registry.rs"), rust_code)
        .expect("Failed to write std14_registry.rs");
}
/// Emit `named_encodings.rs` into `out_dir` from the named-encodings JSON.
///
/// The JSON root is an object keyed by encoding name. For each of the six
/// recognised names a 256-slot `[Option<&'static str>; 256]` table is
/// written, where slot `i` holds the glyph name stored under the key
/// `0xNN` (two upper-case hex digits) or `None` when that key is absent or
/// not a string. Unrecognised encoding names are skipped. A
/// `get_named_encoding_table` accessor is appended after the tables.
///
/// # Panics
///
/// Panics if the input cannot be read or parsed, if the root or a recognised
/// encoding is not a JSON object, or if the output cannot be written.
fn generate_named_encodings(out_dir: &Path, encodings_path: &Path) {
    // JSON encoding name -> identifier of the generated static.
    const KNOWN_ENCODINGS: [(&str, &str); 6] = [
        ("WinAnsiEncoding", "WIN_ANSI"),
        ("MacRomanEncoding", "MAC_ROMAN"),
        ("MacExpertEncoding", "MAC_EXPERT"),
        ("StandardEncoding", "STANDARD"),
        ("SymbolEncoding", "SYMBOL"),
        ("ZapfDingbatsEncoding", "ZAPF_DINGBATS"),
    ];

    let raw = fs::read_to_string(encodings_path).expect("Failed to read named-encodings.json");
    let root: serde_json::Value =
        serde_json::from_str(&raw).expect("Failed to parse named-encodings.json");
    let encodings = root.as_object().expect("encodings object missing");

    let mut tables = String::new();
    for (encoding_name, encoding_data) in encodings {
        let ident = match KNOWN_ENCODINGS
            .iter()
            .find(|(json_name, _)| *json_name == encoding_name.as_str())
        {
            Some((_, static_name)) => *static_name,
            None => continue,
        };

        let glyphs = encoding_data
            .as_object()
            .expect("encoding data is not an object");

        // One slot per character code 0x00..=0xFF.
        let slots: Vec<String> = (0..256)
            .map(|code| {
                let key = format!("0x{:02X}", code);
                match glyphs.get(&key).and_then(|v| v.as_str()) {
                    Some(glyph_name) => format!("Some(\"{}\")", glyph_name),
                    None => "None".to_string(),
                }
            })
            .collect();

        tables.push_str(&format!(
            r#"
pub static {}: [Option<&'static str>; 256] = [
{}];
"#,
            ident,
            slots.join(", ")
        ));
    }

    let rust_code = format!(
        r#"
// Auto-generated named encoding tables.
// Do not edit manually.
// Source: ISO 32000-1 Annex D
{}
pub fn get_named_encoding_table(encoding: NamedEncoding) -> &'static [Option<&'static str>; 256] {{
match encoding {{
NamedEncoding::WinAnsi => &WIN_ANSI,
NamedEncoding::MacRoman => &MAC_ROMAN,
NamedEncoding::MacExpert => &MAC_EXPERT,
NamedEncoding::Standard => &STANDARD,
NamedEncoding::Symbol => &SYMBOL,
NamedEncoding::ZapfDingbats => &ZAPF_DINGBATS,
}}
}}
"#,
        tables
    );
    fs::write(out_dir.join("named_encodings.rs"), rust_code)
        .expect("Failed to write named_encodings.rs");
}
/// Emit `agl.rs` into `out_dir` from the Adobe Glyph List JSON.
///
/// Two objects are read from the JSON root:
/// - `merged_single`: glyph name -> one code point string, emitted as the
///   `AGL` phf map of `char` values.
/// - `merged_multi`: glyph name -> array of code point strings, emitted as
///   one `&[char]` static per glyph plus the `AGL_MULTI` phf map.
///
/// Every code point string is passed through `decode_json_unicode` and the
/// result is placed inside a `'\u{...}'` char literal.
///
/// # Panics
///
/// Panics if the input cannot be read or parsed, if either object is
/// missing, if a value has the wrong JSON type, or if the output cannot be
/// written.
fn generate_agl_maps(out_dir: &Path, agl_path: &Path) {
    let raw = fs::read_to_string(agl_path).expect("Failed to read agl.json");
    let root: serde_json::Value = serde_json::from_str(&raw).expect("Failed to parse agl.json");

    // Render one JSON code point value as a Rust char literal.
    let char_literal = |value: &serde_json::Value| -> String {
        let escaped = value.as_str().expect("unicode value is not a string");
        format!("'\\u{{{}}}'", decode_json_unicode(escaped))
    };

    // Single-codepoint map
    let single = root["merged_single"]
        .as_object()
        .expect("merged_single object missing");
    let mut single_builder = phf_codegen::Map::new();
    for (glyph_name, code_point) in single {
        single_builder.entry(glyph_name.as_str(), &char_literal(code_point));
    }

    // Multi-codepoint map: each glyph gets its own backing array.
    let multi = root["merged_multi"]
        .as_object()
        .expect("merged_multi object missing");
    let mut sequences = String::new();
    let mut multi_builder = phf_codegen::Map::new();
    for (glyph_name, code_points) in multi {
        let sequence = code_points.as_array().expect("multi value is not an array");
        // Glyph names may contain '-' and '.', which are not valid in identifiers.
        let ident = glyph_name
            .to_uppercase()
            .replace("-", "_")
            .replace(".", "_");
        let literals = sequence
            .iter()
            .map(|v| char_literal(v))
            .collect::<Vec<String>>()
            .join(", ");
        sequences.push_str(&format!(
            r#"
static {ident}: &[char] = &[{literals}];
"#,
            ident = ident,
            literals = literals
        ));
        multi_builder.entry(glyph_name.as_str(), &format!("&{}", ident));
    }

    let rust_code = format!(
        r#"
// Auto-generated Adobe Glyph List (AGL) phf maps.
// Do not edit manually.
// Source: Adobe Glyph List 1.4 + AGLFN 1.7
// https://github.com/adobe-type-tools/agl-aglfn
{}
/// AGL phf map for single-codepoint glyph names.
/// Maps glyph names like "A", "quoteright", "Euro" to their Unicode codepoints.
pub static AGL: phf::Map<&'static str, char> = {};
/// AGL phf map for multi-codepoint (ligature) glyph names.
/// Maps glyph names like "dalethatafpatah" to sequences of Unicode codepoints.
pub static AGL_MULTI: phf::Map<&'static str, &[char]> = {};
"#,
        sequences,
        single_builder.build(),
        multi_builder.build()
    );
    fs::write(out_dir.join("agl.rs"), rust_code).expect("Failed to write agl.rs");
}
/// Extract the hex digits from a textual `\uXXXX` escape.
///
/// When `s` begins with a literal backslash followed by `u`, everything
/// after that two-character marker is returned (e.g. `\u0041` -> `0041`).
/// Any other input is returned unchanged.
fn decode_json_unicode(s: &str) -> String {
    s.strip_prefix("\\u").unwrap_or(s).to_owned()
}
/// Generate font fingerprint phf map from font-fingerprints.json.
///
/// The JSON format is:
/// ```json
/// [
///   {
///     "sha256_hex": "abc123...",
///     "font_name": "Font Name (informational)",
///     "entries": [[gid1, codepoint1], [gid2, codepoint2], ...]
///   }
/// ]
/// ```
///
/// Each entry maps a glyph ID to a Unicode codepoint for a specific font
/// identified by its SHA-256 hash. Entries whose `sha256_hex` is the empty
/// string are treated as placeholders and skipped.
///
/// The phf builder is keyed on the decoded 32-byte digest so that the
/// generated initializer matches the declared
/// `phf::Map<[u8; 32], &'static [(u16, u32)]>` type.
///
/// # Panics
///
/// Panics if the input cannot be read or parsed, if a hash is not exactly 64
/// hexadecimal characters, if a glyph ID does not fit in `u16`, if a
/// codepoint does not fit in `u32` or is not a Unicode scalar value, or if
/// the output cannot be written.
fn generate_font_fingerprints(out_dir: &Path, fingerprints_path: &Path) {
    let json_content =
        fs::read_to_string(fingerprints_path).expect("Failed to read font-fingerprints.json");
    let data: serde_json::Value =
        serde_json::from_str(&json_content).expect("Failed to parse font-fingerprints.json");
    let fonts = data.as_array().expect("font-fingerprints must be an array");

    let mut entries_arrays = String::new();
    // The key type is inferred as [u8; 32] from the `entry` call below.
    let mut map_builder = phf_codegen::Map::new();

    for font_entry in fonts {
        let sha256_hex = font_entry
            .get("sha256_hex")
            .and_then(|v| v.as_str())
            .expect("sha256_hex must be a string");

        // Skip empty hashes (placeholder entries)
        if sha256_hex.is_empty() {
            continue;
        }

        // Validate SHA-256 hex (64 hex chars = 32 bytes)
        if sha256_hex.len() != 64 {
            panic!(
                "SHA-256 hex must be 64 characters, got {}",
                sha256_hex.len()
            );
        }
        // Reject anything that is not a plain hex digit up front; integer
        // parsing alone would also accept a leading '+' sign.
        if !sha256_hex.bytes().all(|b| b.is_ascii_hexdigit()) {
            panic!(
                "SHA-256 hex contains non-hexadecimal characters: {}",
                sha256_hex
            );
        }

        // Convert hex string to [u8; 32] bytes
        let hash_bytes: [u8; 32] = hex_decode_to_array(sha256_hex);

        let entries = font_entry
            .get("entries")
            .and_then(|v| v.as_array())
            .expect("entries must be an array");

        // Upper-case the digest so the generated static follows Rust naming
        // conventions and differently-cased spellings of one hash collide.
        let ident = format!("HASH_{}", sha256_hex.to_ascii_uppercase());

        // Build the entries array
        let mut entry_values = Vec::with_capacity(entries.len());
        for entry in entries {
            let arr = entry.as_array().expect("entry must be an array");

            let raw_gid = arr
                .first()
                .and_then(|v| v.as_u64())
                .expect("gid must be a number");
            if raw_gid > u64::from(u16::MAX) {
                panic!("gid {} does not fit in u16", raw_gid);
            }
            let gid = raw_gid as u16;

            let raw_codepoint = arr
                .get(1)
                .and_then(|v| v.as_u64())
                .expect("codepoint must be a number");
            // Range-check before narrowing so an oversized value cannot wrap
            // around into a valid-looking scalar.
            if raw_codepoint > u64::from(u32::MAX) {
                panic!("Invalid Unicode scalar: 0x{:X}", raw_codepoint);
            }
            let codepoint = raw_codepoint as u32;

            // Validate codepoint is a valid Unicode scalar value
            if !is_valid_unicode_scalar(codepoint) {
                panic!("Invalid Unicode scalar: 0x{:X}", codepoint);
            }
            entry_values.push(format!("({}, {})", gid, codepoint));
        }

        entries_arrays.push_str(&format!(
            r#"
static {}: &[(u16, u32)] = &[{}];
"#,
            ident,
            entry_values.join(", ")
        ));

        // Key on the raw digest bytes: the builder hashes and prints the key
        // as a byte array, which is what the runtime map type expects.
        map_builder.entry(hash_bytes, &format!("&{}", ident));
    }

    let rust_code = format!(
        r#"
// Auto-generated font fingerprint phf map.
// Do not edit manually.
// Source: build/font-fingerprints.json
{}
/// Font fingerprint database.
///
/// Maps SHA-256 hashes of embedded font programs to their glyph ID to
/// Unicode codepoint mappings. This is Level 3 of the encoding fallback
/// chain, used when:
/// - /ToUnicode is missing or empty
/// - The embedded font subset has stripped glyph names
/// - The font binary matches a known fingerprint
///
/// The hash is computed over the DECODED font program bytes (post stream
/// decoding, pre-interpretation).
pub static FONT_FINGERPRINTS: phf::Map<[u8; 32], &'static [(u16, u32)]> = {};
"#,
        entries_arrays,
        map_builder.build()
    );
    fs::write(out_dir.join("font_fingerprints.rs"), rust_code)
        .expect("Failed to write font_fingerprints.rs");
}
/// Decode a 64-character hexadecimal string into a `[u8; 32]` array.
///
/// Both upper- and lower-case digits are accepted. Each digit is decoded
/// individually, so sign characters (which integer parsing would tolerate)
/// and any other non-hex byte are rejected.
///
/// # Panics
///
/// Panics with a message starting with `Invalid hex string` if `hex` is not
/// exactly 64 bytes long or contains a byte that is not an ASCII hex digit.
fn hex_decode_to_array(hex: &str) -> [u8; 32] {
    /// Value of a single ASCII hex digit.
    fn nibble(digit: u8) -> u8 {
        match digit {
            b'0'..=b'9' => digit - b'0',
            b'a'..=b'f' => digit - b'a' + 10,
            b'A'..=b'F' => digit - b'A' + 10,
            _ => panic!("Invalid hex string: unexpected byte 0x{:02x}", digit),
        }
    }

    // Work on bytes: slicing the &str by index could split a multi-byte
    // character and panic with an unrelated message.
    let digits = hex.as_bytes();
    assert!(
        digits.len() == 64,
        "Invalid hex string: expected 64 characters, got {}",
        digits.len()
    );

    let mut bytes = [0u8; 32];
    for (byte, pair) in bytes.iter_mut().zip(digits.chunks_exact(2)) {
        *byte = (nibble(pair[0]) << 4) | nibble(pair[1]);
    }
    bytes
}
/// Report whether `cp` is a Unicode scalar value.
///
/// Scalar values are the code points `0x0..=0xD7FF` and `0xE000..=0x10FFFF`,
/// i.e. every code point except the surrogate range. That is exactly the set
/// of values `char::from_u32` accepts.
fn is_valid_unicode_scalar(cp: u32) -> bool {
    char::from_u32(cp).is_some()
}
/// Generate predefined CMap CID->Unicode mappings.
///
/// Reads JSON files from build/predefined-cmaps/ and generates phf maps
/// for CID->Unicode lookups. The JSON files contain mappings from CIDs
/// to their Unicode codepoint(s).
fn generate_predefined_cmaps(out_dir: &Path) {
let predefined_cmaps_dir = Path::new("build/predefined-cmaps");
// Generate each character collection
generate_collection_cmap(out_dir, predefined_cmaps_dir, "adobe-japan1", "japan1");
generate_collection_cmap(out_dir, predefined_cmaps_dir, "adobe-gb1", "gb1");
generate_collection_cmap(out_dir, predefined_cmaps_dir, "adobe-cns1", "cns1");
generate_collection_cmap(out_dir, predefined_cmaps_dir, "adobe-korea1", "korea1");
}
/// Generate the CID->Unicode module for one character collection.
///
/// Reads `<base_dir>/<json_name>.json`, a JSON object mapping decimal CID
/// strings to Unicode value strings, and writes
/// `predefined_cmap_<module_name>.rs` into `out_dir`. Each string-valued
/// mapping becomes a `&[char]` static plus an entry in a `u32`-keyed phf map
/// exposed through a generated `cid_to_unicode` function; values that are
/// not JSON strings are skipped.
///
/// When the JSON file does not exist, a stub `cid_to_unicode` that always
/// returns `None` is written instead.
///
/// # Panics
///
/// Panics if the JSON file exists but cannot be read or parsed, if a key is
/// not a valid `u32`, or if the output file cannot be written.
fn generate_collection_cmap(out_dir: &Path, base_dir: &Path, json_name: &str, module_name: &str) {
    let json_path = base_dir.join(format!("{}.json", json_name));
    let out_path = out_dir.join(format!("predefined_cmap_{}.rs", module_name));
    let collection = module_name.to_uppercase();

    // Shared writer so both the stub and the real module fail identically.
    let emit = |code: String| {
        fs::write(&out_path, code)
            .unwrap_or_else(|_| panic!("Failed to write {}", out_path.display()))
    };

    // Missing data file: emit a stub so the crate still compiles.
    if !json_path.exists() {
        emit(format!(
            r#"
// Auto-generated {collection} CID to Unicode mapping.
//
// Source: {json_name}.json (not found - stub implementation)
// Do not edit manually.
/// Look up a CID in the {collection} character collection.
///
/// Returns None if the CID is not assigned in {collection} or if the
/// predefined CMap data file is missing.
pub fn cid_to_unicode(cid: u32) -> Option<&'static [char]> {{
let _ = cid;
None
}}
"#,
            collection = collection,
            json_name = json_name,
        ));
        return;
    }

    let raw = fs::read_to_string(&json_path)
        .unwrap_or_else(|_| panic!("Failed to read {}", json_path.display()));
    let root: serde_json::Value = serde_json::from_str(&raw)
        .unwrap_or_else(|_| panic!("Failed to parse {}", json_path.display()));

    let mut lookup = phf_codegen::Map::new();
    let mut arrays = String::new();

    // A non-object root simply yields an empty map.
    for (cid_str, unicode_value) in root.as_object().into_iter().flatten() {
        let cid: u32 = cid_str
            .parse()
            .unwrap_or_else(|_| panic!("Invalid CID key: {}", cid_str));

        let unicode_str = match unicode_value.as_str() {
            Some(text) => text,
            None => continue,
        };

        let array_ident = format!("CID_{}_{}", collection, cid);
        let literals = parse_unicode_value(unicode_str)
            .iter()
            .map(|c| format!("'\\u{{{:04X}}}'", *c as u32))
            .collect::<Vec<String>>()
            .join(", ");

        arrays.push_str(&format!(
            r#"
static {}: &[char] = &[{}];
"#,
            array_ident, literals
        ));

        // The u32 key is emitted as a decimal literal.
        lookup.entry(cid, &format!("&{}", array_ident));
    }

    emit(format!(
        r#"
// Auto-generated {collection} CID to Unicode mapping.
//
// Source: {json_name}.json
// Do not edit manually.
{arrays}
/// Look up a CID in the {collection} character collection.
///
/// Returns None if the CID is not assigned in {collection}.
pub fn cid_to_unicode(cid: u32) -> Option<&'static [char]> {{
static MAP: phf::Map<u32, &'static [char]> = {map};
// CIDs are 16-bit in these collections, but we use u32 for the API
if cid <= u16::MAX as u32 {{
MAP.get(&cid).copied()
}} else {{
None
}}
}}
"#,
        collection = collection,
        json_name = json_name,
        arrays = arrays,
        map = lookup.build(),
    ));
}
/// Parse a Unicode value string into the characters it denotes.
///
/// The input may contain:
/// - textual escapes of the form `\uXXXX` (a backslash, `u`, and exactly four
///   hexadecimal digits), each denoting one UTF-16 code unit;
/// - literal characters, which are kept as they are.
///
/// Consecutive escapes that form a UTF-16 surrogate pair are combined into
/// the single supplementary-plane character they encode, so
/// `\uD83D\uDE00` yields U+1F600. Unpaired surrogates are dropped. A
/// backslash that does not start a well-formed escape is kept literally.
///
/// If nothing could be decoded from a non-empty input, the raw characters of
/// the input are returned, so a non-empty input never produces an empty
/// result.
fn parse_unicode_value(s: &str) -> Vec<char> {
    /// Decode a `\uXXXX` escape at the very start of `text`, if present.
    fn escape_unit(text: &str) -> Option<u16> {
        let hex = text.strip_prefix("\\u")?.get(..4)?;
        // Check the digits explicitly: integer parsing would accept "+041".
        if hex.bytes().all(|b| b.is_ascii_hexdigit()) {
            u16::from_str_radix(hex, 16).ok()
        } else {
            None
        }
    }

    // Collect everything as UTF-16 code units so that surrogate pairs, which
    // arrive as two separate escapes, can be recombined in one pass.
    let mut units: Vec<u16> = Vec::with_capacity(s.len());
    let mut rest = s;
    while let Some(c) = rest.chars().next() {
        if let Some(unit) = escape_unit(rest) {
            units.push(unit);
            // A well-formed escape is exactly six ASCII bytes long.
            rest = &rest[6..];
        } else {
            let mut buf = [0u16; 2];
            units.extend_from_slice(c.encode_utf16(&mut buf));
            rest = &rest[c.len_utf8()..];
        }
    }

    let mut chars: Vec<char> = char::decode_utf16(units)
        .filter_map(Result::ok)
        .collect();

    if chars.is_empty() && !s.is_empty() {
        // Fallback: nothing decodable, pass the raw characters through.
        chars.extend(s.chars());
    }
    chars
}
/// Generate glyph shape database from glyph-shapes.json.
///
/// Reads `build/glyph-shapes.json` from the directory two levels above
/// `CARGO_MANIFEST_DIR` and emits two parallel static arrays into
/// `shape_db.rs`:
/// - SHAPE_TABLE: &'static [(u64, char)] sorted by pHash
/// - FREQ_TABLE: &'static [(u64, u32)] for frequency ranks (same order as SHAPE_TABLE)
///
/// The `_shapes_path` argument is not used; the location is always derived
/// from `CARGO_MANIFEST_DIR`. When the file does not exist, a Cargo warning
/// is emitted and empty tables are generated.
///
/// Duplicate pHash values are reported as Cargo warnings and both entries
/// are kept. A missing `frequency_rank` is emitted as `0`, and only the first
/// character of the `char` field is used.
///
/// # JSON format
///
/// Array of entries:
/// ```json
/// {
///   "phash_hex": "0123456789abcdef",
///   "char": "A",
///   "source_font": "font.ttf",
///   "frequency_rank": 1
/// }
/// ```
///
/// # Panics
///
/// Panics if the file exists but cannot be read or parsed, if `phash_hex` is
/// missing or not valid hexadecimal, if `char` is missing or empty, if
/// `frequency_rank` does not fit in `u32`, or if the output cannot be
/// written.
fn generate_shape_db(out_dir: &Path, _shapes_path: &Path) {
    // Resolve the data file relative to the workspace root: build.rs runs
    // from the crate directory, but this build/ directory sits two levels up.
    let crate_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
    let workspace_root = crate_dir.ancestors().nth(2).unwrap_or(crate_dir);
    let actual_shapes_path = workspace_root.join("build").join("glyph-shapes.json");

    // Check if the JSON file exists
    if !actual_shapes_path.exists() {
        // Emit a build warning and empty tables
        println!(
            "cargo:warning=glyph-shapes.json not found at {}, generating empty shape database",
            actual_shapes_path.display()
        );
        let rust_code = r#"
// Auto-generated glyph shape database.
// Source: build/glyph-shapes.json (not found - empty database)
// Do not edit manually.
/// Shape database: empty (run `cargo xtask gen-shape-db` to generate).
pub static SHAPE_TABLE: &[(u64, char)] = &[];
/// Frequency table: empty (run `cargo xtask gen-shape-db` to generate).
pub static FREQ_TABLE: &[(u64, u32)] = &[];
/// Compile-time assertion that tables are parallel.
const _: () = assert!(SHAPE_TABLE.len() == FREQ_TABLE.len());
"#;
        fs::write(out_dir.join("shape_db.rs"), rust_code).expect("Failed to write shape_db.rs");
        return;
    }

    let json_content =
        fs::read_to_string(&actual_shapes_path).expect("Failed to read glyph-shapes.json");
    let data: serde_json::Value =
        serde_json::from_str(&json_content).expect("Failed to parse glyph-shapes.json");
    let entries = data.as_array().expect("glyph-shapes.json must be an array");

    // Parse entries into (pHash, char, frequency rank) triples.
    let mut sorted_entries: Vec<(u64, char, u32)> = Vec::with_capacity(entries.len());
    for (idx, entry) in entries.iter().enumerate() {
        let phash_hex = entry
            .get("phash_hex")
            .and_then(|v| v.as_str())
            .unwrap_or("");
        let phash = u64::from_str_radix(phash_hex, 16)
            .unwrap_or_else(|e| panic!("Invalid phash_hex at index {}: {}", idx, e));

        let char_str = entry.get("char").and_then(|v| v.as_str()).unwrap_or("");
        let ch = char_str
            .chars()
            .next()
            .unwrap_or_else(|| panic!("Empty char field at index {}", idx));

        let raw_rank = entry
            .get("frequency_rank")
            .and_then(|v| v.as_u64())
            .unwrap_or(0);
        // Fail loudly rather than wrapping an oversized rank into a small one.
        if raw_rank > u64::from(u32::MAX) {
            panic!(
                "frequency_rank {} at index {} does not fit in u32",
                raw_rank, idx
            );
        }
        sorted_entries.push((phash, ch, raw_rank as u32));
    }

    // Sort by pHash ascending
    sorted_entries.sort_by_key(|a| a.0);

    // Report duplicate pHash entries. Cargo only surfaces `cargo:warning=`
    // lines printed to stdout; plain stderr output is hidden on success.
    for (pos, pair) in sorted_entries.windows(2).enumerate() {
        if pair[0].0 == pair[1].0 {
            println!(
                "cargo:warning=duplicate pHash {:016x} at sorted positions {} and {}",
                pair[1].0,
                pos,
                pos + 1
            );
        }
    }

    // SHAPE_TABLE entries. The Debug formatter produces valid char literals,
    // e.g. 'a', '\n', '\u{1f600}'.
    let shape_entries: Vec<String> = sorted_entries
        .iter()
        .map(|&(phash, ch, _)| format!("(0x{:016x}, {:?})", phash, ch))
        .collect();

    // FREQ_TABLE entries, in the same order.
    let freq_entries: Vec<String> = sorted_entries
        .iter()
        .map(|&(phash, _, freq)| format!("(0x{:016x}, {})", phash, freq))
        .collect();

    let rust_code = format!(
        r#"
// Auto-generated glyph shape database.
// Source: build/glyph-shapes.json
// Do not edit manually.
/// Shape database: pHash -> character mapping sorted by pHash.
pub static SHAPE_TABLE: &[(u64, char)] = &[
{}
];
/// Frequency table: pHash -> frequency rank (same order as SHAPE_TABLE).
/// Higher rank = more common character.
pub static FREQ_TABLE: &[(u64, u32)] = &[
{}
];
/// Compile-time assertion that tables have the same length.
const _: () = assert!(SHAPE_TABLE.len() == FREQ_TABLE.len());
"#,
        shape_entries.join(",\n "),
        freq_entries.join(",\n ")
    );
    fs::write(out_dir.join("shape_db.rs"), rust_code).expect("Failed to write shape_db.rs");
}
/// Generate the English wordlist `phf::Set` from wordlist-en-20k.txt.
///
/// Writes `wordlist.rs` into `out_dir`, defining `EN_WORDLIST_20K`. When
/// `wordlist_path` does not exist, a Cargo warning is emitted and an empty
/// set is generated instead.
///
/// # Format
///
/// One word per line. Lines are trimmed and blank lines are ignored. Every
/// remaining word must be ASCII only, contain no upper-case letters, and be
/// 1-30 characters long.
///
/// # Source
///
/// google-10000-english 20k.txt (frequency-sorted English word list)
///
/// # Panics
///
/// Panics if the file exists but cannot be read, if a word violates the
/// format rules above (the message names the offending line), or if the
/// output cannot be written.
fn generate_wordlist(out_dir: &Path, wordlist_path: &Path) {
    let target = out_dir.join("wordlist.rs");

    // No data file: warn and fall back to an empty set.
    if !wordlist_path.exists() {
        println!(
            "cargo:warning=wordlist-en-20k.txt not found at {}, generating empty wordlist",
            wordlist_path.display()
        );
        let rust_code = r#"
// Auto-generated English wordlist.
// Source: build/wordlist-en-20k.txt (not found - empty wordlist)
// Do not edit manually.
/// English wordlist: empty (wordlist-en-20k.txt not found).
pub static EN_WORDLIST_20K: phf::Set<&'static str> = phf::Set::empty();
"#;
        fs::write(target, rust_code).expect("Failed to write wordlist.rs");
        return;
    }

    let contents = fs::read_to_string(wordlist_path)
        .unwrap_or_else(|_| panic!("Failed to read {}", wordlist_path.display()));

    // Validate and collect words; line numbers in messages are 1-based.
    let mut words = Vec::new();
    for (index, raw_line) in contents.lines().enumerate() {
        let line_num = index + 1;
        let word = raw_line.trim();
        if word.is_empty() {
            continue;
        }

        if !word.is_ascii() {
            panic!("wordlist-en-20k.txt:{}: non-ASCII word: {}", line_num, word);
        }
        // The word is ASCII at this point, so a byte-level case check suffices.
        if word.bytes().any(|b| b.is_ascii_uppercase()) {
            panic!(
                "wordlist-en-20k.txt:{}: non-lowercase word: {}",
                line_num, word
            );
        }
        let len = word.len();
        if len == 0 || len > 30 {
            panic!(
                "wordlist-en-20k.txt:{}: word length {} outside range [1, 30]: {}",
                line_num, len, word
            );
        }
        words.push(word);
    }

    // Build phf::Set
    let mut set_builder = phf_codegen::Set::new();
    words.iter().for_each(|word| {
        set_builder.entry(word);
    });

    let rust_code = format!(
        r#"
// Auto-generated English wordlist.
// Source: build/wordlist-en-20k.txt
// Do not edit manually.
//
// A compile-time phf::Set of ~20,000 common English words, sorted by
// frequency. Used for dictionary coverage scoring in readability analysis.
//
// Word count: {}
/// English wordlist: 20,000 most common English words.
///
/// Lookup is O(1) via phf's perfect hash function. Words are lowercase
/// ASCII only, length 1-30 characters.
///
/// # Example
///
/// ```
/// use pdftract_core::layout::wordlist::EN_WORDLIST_20K;
///
/// assert!(EN_WORDLIST_20K.contains("the"));
/// assert!(EN_WORDLIST_20K.contains("computer"));
/// assert!(!EN_WORDLIST_20K.contains("xyzqwerty"));
/// ```
pub static EN_WORDLIST_20K: phf::Set<&'static str> = {};
"#,
        words.len(),
        set_builder.build()
    );
    fs::write(target, rust_code).expect("Failed to write wordlist.rs");
}
/// Verify SHA-256 checksums of build-time data files.
///
/// This is the TH-06 supply-chain gate implementation. It reads
/// `build/CHECKSUMS.sha256` and verifies that each listed file matches its
/// expected checksum.
///
/// # Manifest format
///
/// One entry per line: a hex digest, whitespace, then the path. This covers
/// the `sha256sum` output forms (`<digest>  <path>` and `<digest> *<path>`)
/// as well as a single-space separator. Blank lines and lines starting with
/// `#` are ignored. If a path is listed more than once, the last entry wins.
///
/// # Path resolution
///
/// A listed path is first tried as given (relative to the working
/// directory). If that does not exist it is tried relative to the directory
/// containing the manifest, which is where paths point when the manifest was
/// produced by running `sha256sum` inside `build/`. Files found at neither
/// location are treated as optional: a Cargo warning is emitted and the
/// entry is skipped.
///
/// # Returns
///
/// `Ok(())` if all checksums match, `Err(String)` with a descriptive message
/// otherwise. Mismatches are reported in sorted path order.
fn verify_checksums() -> Result<(), String> {
    use std::collections::BTreeMap;
    use std::io::BufRead;

    /// Split a manifest line into `(digest, path)`.
    fn parse_checksum_line(line: &str) -> Option<(&str, &str)> {
        let mut fields = line.splitn(2, char::is_whitespace);
        let checksum = fields.next()?;
        let rest = fields.next()?.trim_start();
        // sha256sum marks binary-mode entries with a '*' before the path.
        let path = rest.strip_prefix('*').unwrap_or(rest);
        if checksum.is_empty() || path.is_empty() {
            None
        } else {
            Some((checksum, path))
        }
    }

    let checksums_path = Path::new("build/CHECKSUMS.sha256");
    if !checksums_path.exists() {
        return Err(format!("CHECKSUMS.sha256 not found at {}", checksums_path.display()));
    }
    let manifest_dir = checksums_path.parent().unwrap_or_else(|| Path::new(""));
    let checksums_file = fs::File::open(checksums_path)
        .map_err(|e| format!("Failed to open CHECKSUMS.sha256: {}", e))?;

    // Parse CHECKSUMS.sha256 into a map of path -> expected checksum. A
    // sorted map keeps the verification and report order deterministic.
    let mut expected_checksums: BTreeMap<String, String> = BTreeMap::new();
    for line in std::io::BufReader::new(checksums_file).lines() {
        let line = line.map_err(|e| format!("Failed to read CHECKSUMS.sha256: {}", e))?;
        let line = line.trim();
        // Skip empty lines and comments
        if line.is_empty() || line.starts_with('#') {
            continue;
        }
        let (checksum, path) = parse_checksum_line(line)
            .ok_or_else(|| format!("Invalid checksum line: {}", line))?;
        expected_checksums.insert(path.to_string(), checksum.to_string());
    }

    // Verify each file's checksum
    let mut failures = Vec::new();
    for (path, expected_checksum) in &expected_checksums {
        let listed = Path::new(path);
        let file_path = if listed.exists() {
            listed.to_path_buf()
        } else {
            manifest_dir.join(listed)
        };

        // Skip files that don't exist (they may be optional, like
        // glyph-shapes.json). The notice must go to stdout for Cargo to
        // display it as a warning.
        if !file_path.exists() {
            println!("cargo:warning=Checksum file not found (optional): {}", path);
            continue;
        }

        let actual_checksum = compute_sha256(&file_path)
            .map_err(|e| format!("Failed to compute checksum for {}: {}", path, e))?;
        // Hex digests are case-insensitive.
        if !actual_checksum.eq_ignore_ascii_case(expected_checksum) {
            failures.push(format!(
                "{}: expected {}, got {}",
                path, expected_checksum, actual_checksum
            ));
        }
    }

    if failures.is_empty() {
        Ok(())
    } else {
        Err(format!(
            "Checksum verification failed for {} file(s):\n {}",
            failures.len(),
            failures.join("\n ")
        ))
    }
}
/// Compute the SHA-256 digest of the file at `path`.
///
/// The file is read in 8 KiB chunks and fed to the hasher incrementally.
///
/// # Returns
///
/// The digest formatted with `{:x}` (lower-case hex), or an error message
/// naming the path if the file cannot be opened or read.
fn compute_sha256(path: &Path) -> Result<String, String> {
    use sha2::{Digest, Sha256};
    use std::io::Read;

    let mut source = fs::File::open(path)
        .map_err(|e| format!("Failed to open {}: {}", path.display(), e))?;

    let mut digest = Sha256::new();
    let mut chunk = [0u8; 8192];
    loop {
        match source.read(&mut chunk) {
            // A zero-length read signals end of file.
            Ok(0) => break,
            Ok(len) => digest.update(&chunk[..len]),
            Err(e) => return Err(format!("Failed to read {}: {}", path.display(), e)),
        }
    }
    Ok(format!("{:x}", digest.finalize()))
}