- Add minimum version requirements to deny.toml (ring >= 0.17.5, rustls >= 0.23)
- Create build/CHECKSUMS.sha256 for build-time data file integrity
- Update build.rs to verify checksums on every build
- Add tampering detection tests (th06_checksum_test.rs)
- Create nightly supply-chain scan workflow (pdftract-nightly-supply-chain.yaml)
- Update audit.toml with advisory exceptions

Closes: pdftract-1xf4d
Refs: plan lines 877, 883-896, 906-913
995 lines
32 KiB
Rust
995 lines
32 KiB
Rust
use std::env;
|
|
use std::fs;
|
|
use std::path::Path;
|
|
|
|
fn main() {
|
|
println!("cargo:rerun-if-changed=build/std14-metrics.json");
|
|
println!("cargo:rerun-if-changed=build/named-encodings.json");
|
|
println!("cargo:rerun-if-changed=build/agl.json");
|
|
println!("cargo:rerun-if-changed=build/font-fingerprints.json");
|
|
println!("cargo:rerun-if-changed=build/predefined-cmaps/");
|
|
println!("cargo:rerun-if-changed=build/glyph-shapes.json");
|
|
println!("cargo:rerun-if-changed=build/wordlist-en-20k.txt");
|
|
println!("cargo:rerun-if-changed=build/CHECKSUMS.sha256");
|
|
|
|
// Verify build-time data file checksums (TH-06 supply-chain gate)
|
|
if let Err(e) = verify_checksums() {
|
|
eprintln!("cargo:warning=Checksum verification failed: {}", e);
|
|
eprintln!("cargo:warning=Build-time data files may have been tampered with or need regeneration.");
|
|
eprintln!("cargo:warning=To regenerate CHECKSUMS.sha256, run: cd crates/pdftract-core/build && sha256sum std14-metrics.json named-encodings.json agl.json font-fingerprints.json wordlist-en-20k.txt predefined-cmaps/*.json > CHECKSUMS.sha256 && sha256sum ../../../build/glyph-shapes.json >> CHECKSUMS.sha256");
|
|
panic!("Checksum verification failed - aborting build");
|
|
}
|
|
|
|
let out_dir = env::var("OUT_DIR").unwrap();
|
|
let out_path = Path::new(&out_dir);
|
|
let metrics_path = Path::new("build/std14-metrics.json");
|
|
|
|
// Generate std14 metrics
|
|
generate_std14_metrics(out_path, metrics_path);
|
|
|
|
// Generate named encoding tables
|
|
let encodings_path = Path::new("build/named-encodings.json");
|
|
generate_named_encodings(out_path, encodings_path);
|
|
|
|
// Generate AGL phf maps
|
|
let agl_path = Path::new("build/agl.json");
|
|
generate_agl_maps(out_path, agl_path);
|
|
|
|
// Generate font fingerprint phf map
|
|
let fingerprints_path = Path::new("build/font-fingerprints.json");
|
|
generate_font_fingerprints(out_path, fingerprints_path);
|
|
|
|
// Generate predefined CMap registry
|
|
generate_predefined_cmaps(out_path);
|
|
|
|
// Generate glyph shape database
|
|
let shapes_path = Path::new("build/glyph-shapes.json");
|
|
generate_shape_db(out_path, shapes_path);
|
|
|
|
// Generate English wordlist
|
|
let wordlist_path = Path::new("build/wordlist-en-20k.txt");
|
|
generate_wordlist(out_path, wordlist_path);
|
|
}
|
|
|
|
fn generate_std14_metrics(out_dir: &Path, metrics_path: &Path) {
|
|
let json_content = fs::read_to_string(metrics_path).expect("Failed to read std14-metrics.json");
|
|
|
|
let data: serde_json::Value =
|
|
serde_json::from_str(&json_content).expect("Failed to parse std14-metrics.json");
|
|
|
|
let fonts = data["fonts"].as_object().expect("fonts object missing");
|
|
|
|
let mut metrics_structs = String::new();
|
|
|
|
for (font_name, font_data) in fonts {
|
|
let font_ident = font_name.replace("-", "_");
|
|
let weights = font_data["weights"]
|
|
.as_array()
|
|
.expect("weights array missing");
|
|
|
|
let weights_array: Vec<String> = weights
|
|
.iter()
|
|
.map(|v| v.as_u64().unwrap_or(0).to_string())
|
|
.collect();
|
|
|
|
let font_bbox = font_data["font_bbox"]
|
|
.as_array()
|
|
.expect("font_bbox array missing");
|
|
let font_bbox: Vec<String> = font_bbox
|
|
.iter()
|
|
.map(|v| v.as_i64().unwrap_or(0).to_string())
|
|
.collect();
|
|
|
|
let ascent = font_data["ascent"].as_i64().expect("ascent missing");
|
|
let descent = font_data["descent"].as_i64().expect("descent missing");
|
|
let italic_angle = font_data["italic_angle"]
|
|
.as_f64()
|
|
.expect("italic_angle missing");
|
|
let cap_height = font_data["cap_height"]
|
|
.as_i64()
|
|
.expect("cap_height missing");
|
|
let stem_v = font_data["stem_v"].as_i64().expect("stem_v missing");
|
|
|
|
let encoding_str = font_data["encoding"].as_str().expect("encoding missing");
|
|
let encoding = match encoding_str {
|
|
"StandardEncoding" => "NamedEncoding::Standard",
|
|
"SymbolEncoding" => "NamedEncoding::Symbol",
|
|
"ZapfDingbatsEncoding" => "NamedEncoding::ZapfDingbats",
|
|
_ => "NamedEncoding::Standard",
|
|
};
|
|
|
|
metrics_structs.push_str(&format!(
|
|
r#"
|
|
static {}_WIDTHS: &[u16; 256] = &[{}];
|
|
static {}_METRICS: Std14Metrics = Std14Metrics {{
|
|
widths: &{}_WIDTHS,
|
|
ascent: {},
|
|
descent: {},
|
|
italic_angle: {}f32,
|
|
font_bbox: [{}],
|
|
cap_height: {},
|
|
stem_v: {},
|
|
encoding: {},
|
|
}};
|
|
"#,
|
|
font_ident.to_uppercase(),
|
|
weights_array.join(", "),
|
|
font_ident.to_uppercase(),
|
|
font_ident.to_uppercase(),
|
|
ascent,
|
|
descent,
|
|
italic_angle,
|
|
font_bbox.join(", "),
|
|
cap_height,
|
|
stem_v,
|
|
encoding
|
|
));
|
|
}
|
|
|
|
// Build the phf map using phf_codegen
|
|
let mut map_builder = phf_codegen::Map::new();
|
|
|
|
for font_name in fonts.keys() {
|
|
let ident = font_name.replace("-", "_");
|
|
map_builder.entry(
|
|
font_name.as_str(),
|
|
&format!("&{}_METRICS", ident.to_uppercase()),
|
|
);
|
|
}
|
|
|
|
let rust_code = format!(
|
|
r#"
|
|
// Auto-generated Standard 14 font metrics.
|
|
// Do not edit manually.
|
|
|
|
{}
|
|
|
|
pub fn get_std14_metrics(name: &str) -> Option<&'static Std14Metrics> {{
|
|
static METRICS: phf::Map<&'static str, &'static Std14Metrics> = {};
|
|
METRICS.get(name).copied()
|
|
}}
|
|
"#,
|
|
metrics_structs,
|
|
map_builder.build()
|
|
);
|
|
|
|
fs::write(Path::new(out_dir).join("std14_registry.rs"), rust_code)
|
|
.expect("Failed to write std14_registry.rs");
|
|
}
|
|
|
|
fn generate_named_encodings(out_dir: &Path, encodings_path: &Path) {
|
|
let json_content =
|
|
fs::read_to_string(encodings_path).expect("Failed to read named-encodings.json");
|
|
|
|
let data: serde_json::Value =
|
|
serde_json::from_str(&json_content).expect("Failed to parse named-encodings.json");
|
|
|
|
let encodings = data.as_object().expect("encodings object missing");
|
|
|
|
let mut encoding_arrays = String::new();
|
|
|
|
for (encoding_name, encoding_data) in encodings {
|
|
let ident = match encoding_name.as_str() {
|
|
"WinAnsiEncoding" => "WIN_ANSI",
|
|
"MacRomanEncoding" => "MAC_ROMAN",
|
|
"MacExpertEncoding" => "MAC_EXPERT",
|
|
"StandardEncoding" => "STANDARD",
|
|
"SymbolEncoding" => "SYMBOL",
|
|
"ZapfDingbatsEncoding" => "ZAPF_DINGBATS",
|
|
_ => continue,
|
|
};
|
|
|
|
let entries = encoding_data
|
|
.as_object()
|
|
.expect("encoding data is not an object");
|
|
|
|
let mut array_values = Vec::new();
|
|
for i in 0..256 {
|
|
let key = format!("0x{:02X}", i);
|
|
let value = entries.get(&key).and_then(|v| v.as_str());
|
|
let rust_value = match value {
|
|
Some(glyph_name) => format!("Some(\"{}\")", glyph_name),
|
|
None => "None".to_string(),
|
|
};
|
|
array_values.push(rust_value);
|
|
}
|
|
|
|
encoding_arrays.push_str(&format!(
|
|
r#"
|
|
pub static {}: [Option<&'static str>; 256] = [
|
|
{}];
|
|
"#,
|
|
ident,
|
|
array_values.join(", ")
|
|
));
|
|
}
|
|
|
|
let rust_code = format!(
|
|
r#"
|
|
// Auto-generated named encoding tables.
|
|
// Do not edit manually.
|
|
// Source: ISO 32000-1 Annex D
|
|
|
|
{}
|
|
|
|
pub fn get_named_encoding_table(encoding: NamedEncoding) -> &'static [Option<&'static str>; 256] {{
|
|
match encoding {{
|
|
NamedEncoding::WinAnsi => &WIN_ANSI,
|
|
NamedEncoding::MacRoman => &MAC_ROMAN,
|
|
NamedEncoding::MacExpert => &MAC_EXPERT,
|
|
NamedEncoding::Standard => &STANDARD,
|
|
NamedEncoding::Symbol => &SYMBOL,
|
|
NamedEncoding::ZapfDingbats => &ZAPF_DINGBATS,
|
|
}}
|
|
}}
|
|
"#,
|
|
encoding_arrays
|
|
);
|
|
|
|
fs::write(Path::new(out_dir).join("named_encodings.rs"), rust_code)
|
|
.expect("Failed to write named_encodings.rs");
|
|
}
|
|
|
|
fn generate_agl_maps(out_dir: &Path, agl_path: &Path) {
|
|
let json_content = fs::read_to_string(agl_path).expect("Failed to read agl.json");
|
|
|
|
let data: serde_json::Value =
|
|
serde_json::from_str(&json_content).expect("Failed to parse agl.json");
|
|
|
|
// Single-codepoint map
|
|
let single = data["merged_single"]
|
|
.as_object()
|
|
.expect("merged_single object missing");
|
|
|
|
let mut single_map_builder = phf_codegen::Map::new();
|
|
|
|
for (name, uvalue) in single {
|
|
let uvalue_str = uvalue.as_str().expect("unicode value is not a string");
|
|
// Parse the JSON unicode escape like "A" into a Rust char literal
|
|
let unicode_char = decode_json_unicode(uvalue_str);
|
|
single_map_builder.entry(name.as_str(), &format!("'\\u{{{}}}'", unicode_char));
|
|
}
|
|
|
|
// Multi-codepoint map
|
|
let multi = data["merged_multi"]
|
|
.as_object()
|
|
.expect("merged_multi object missing");
|
|
|
|
let mut multi_arrays = String::new();
|
|
let mut multi_map_builder = phf_codegen::Map::new();
|
|
|
|
for (name, uvalues) in multi {
|
|
let uvalues_arr = uvalues.as_array().expect("multi value is not an array");
|
|
let ident = name.to_uppercase().replace("-", "_").replace(".", "_");
|
|
|
|
let chars: Vec<String> = uvalues_arr
|
|
.iter()
|
|
.map(|v| {
|
|
let uvalue_str = v.as_str().expect("unicode value is not a string");
|
|
let unicode_char = decode_json_unicode(uvalue_str);
|
|
format!("'\\u{{{}}}'", unicode_char)
|
|
})
|
|
.collect();
|
|
|
|
multi_arrays.push_str(&format!(
|
|
r#"
|
|
static {}: &[char] = &[{}];
|
|
"#,
|
|
ident,
|
|
chars.join(", ")
|
|
));
|
|
|
|
multi_map_builder.entry(name.as_str(), &format!("&{}", ident));
|
|
}
|
|
|
|
let rust_code = format!(
|
|
r#"
|
|
// Auto-generated Adobe Glyph List (AGL) phf maps.
|
|
// Do not edit manually.
|
|
// Source: Adobe Glyph List 1.4 + AGLFN 1.7
|
|
// https://github.com/adobe-type-tools/agl-aglfn
|
|
|
|
{}
|
|
|
|
/// AGL phf map for single-codepoint glyph names.
|
|
/// Maps glyph names like "A", "quoteright", "Euro" to their Unicode codepoints.
|
|
pub static AGL: phf::Map<&'static str, char> = {};
|
|
|
|
/// AGL phf map for multi-codepoint (ligature) glyph names.
|
|
/// Maps glyph names like "dalethatafpatah" to sequences of Unicode codepoints.
|
|
pub static AGL_MULTI: phf::Map<&'static str, &[char]> = {};
|
|
"#,
|
|
multi_arrays,
|
|
single_map_builder.build(),
|
|
multi_map_builder.build()
|
|
);
|
|
|
|
fs::write(Path::new(out_dir).join("agl.rs"), rust_code).expect("Failed to write agl.rs");
|
|
}
|
|
|
|
/// Strip a leading `\u` from a code point string, e.g. `\u0041` -> `0041`.
///
/// The input is the already-parsed JSON string value, so the prefix being
/// removed is a literal backslash followed by `u`. Input without that prefix
/// is returned unchanged.
fn decode_json_unicode(s: &str) -> String {
    s.strip_prefix("\\u").unwrap_or(s).to_string()
}
|
|
|
|
/// Generate font fingerprint phf map from font-fingerprints.json.
|
|
///
|
|
/// The JSON format is:
|
|
/// ```json
|
|
/// [
|
|
/// {
|
|
/// "sha256_hex": "abc123...",
|
|
/// "font_name": "Font Name (informational)",
|
|
/// "entries": [[gid1, codepoint1], [gid2, codepoint2], ...]
|
|
/// }
|
|
/// ]
|
|
/// ```
|
|
///
|
|
/// Each entry maps a glyph ID to a Unicode codepoint for a specific font
|
|
/// identified by its SHA-256 hash.
|
|
fn generate_font_fingerprints(out_dir: &Path, fingerprints_path: &Path) {
|
|
let json_content =
|
|
fs::read_to_string(fingerprints_path).expect("Failed to read font-fingerprints.json");
|
|
|
|
let data: serde_json::Value =
|
|
serde_json::from_str(&json_content).expect("Failed to parse font-fingerprints.json");
|
|
|
|
let fonts = data.as_array().expect("font-fingerprints must be an array");
|
|
|
|
let mut entries_arrays = String::new();
|
|
let mut map_builder = phf_codegen::Map::new();
|
|
|
|
// Store keys and values to ensure they live long enough
|
|
let mut keys = Vec::new();
|
|
let mut values = Vec::new();
|
|
|
|
for font_entry in fonts {
|
|
let sha256_hex = font_entry
|
|
.get("sha256_hex")
|
|
.and_then(|v| v.as_str())
|
|
.expect("sha256_hex must be a string");
|
|
|
|
// Skip empty hashes (placeholder entries)
|
|
if sha256_hex.is_empty() {
|
|
continue;
|
|
}
|
|
|
|
// Validate SHA-256 hex (64 hex chars = 32 bytes)
|
|
if sha256_hex.len() != 64 {
|
|
panic!(
|
|
"SHA-256 hex must be 64 characters, got {}",
|
|
sha256_hex.len()
|
|
);
|
|
}
|
|
|
|
// Convert hex string to [u8; 32] bytes
|
|
let hash_bytes: [u8; 32] = hex_decode_to_array(sha256_hex);
|
|
|
|
// Get entries
|
|
let entries = font_entry
|
|
.get("entries")
|
|
.and_then(|v| v.as_array())
|
|
.expect("entries must be an array");
|
|
|
|
let ident = format!("HASH_{}", sha256_hex.replace('-', "_"));
|
|
|
|
// Build the entries array
|
|
let mut entry_values = Vec::new();
|
|
for entry in entries {
|
|
let arr = entry.as_array().expect("entry must be an array");
|
|
let gid = arr
|
|
.first()
|
|
.and_then(|v| v.as_u64())
|
|
.expect("gid must be a number") as u16;
|
|
let codepoint = arr
|
|
.get(1)
|
|
.and_then(|v| v.as_u64())
|
|
.expect("codepoint must be a number") as u32;
|
|
|
|
// Validate codepoint is a valid Unicode scalar value
|
|
if !is_valid_unicode_scalar(codepoint) {
|
|
panic!("Invalid Unicode scalar: 0x{:X}", codepoint);
|
|
}
|
|
|
|
entry_values.push(format!("({}, {})", gid, codepoint));
|
|
}
|
|
|
|
entries_arrays.push_str(&format!(
|
|
r#"
|
|
static {}: &[(u16, u32)] = &[{}];
|
|
"#,
|
|
ident,
|
|
entry_values.join(", ")
|
|
));
|
|
|
|
// Build the phf map key as a byte array literal
|
|
let key_bytes: Vec<String> = hash_bytes.iter().map(|b| format!("0x{:02x}", b)).collect();
|
|
|
|
let key = format!("[{}]", key_bytes.join(", "));
|
|
let value = format!("&{}", ident);
|
|
|
|
keys.push(key);
|
|
values.push(value);
|
|
}
|
|
|
|
// Add entries to the map builder
|
|
for (key, value) in keys.iter().zip(values.iter()) {
|
|
map_builder.entry(key.as_str(), value.as_str());
|
|
}
|
|
|
|
let rust_code = format!(
|
|
r#"
|
|
// Auto-generated font fingerprint phf map.
|
|
// Do not edit manually.
|
|
// Source: build/font-fingerprints.json
|
|
|
|
{}
|
|
|
|
/// Font fingerprint database.
|
|
///
|
|
/// Maps SHA-256 hashes of embedded font programs to their glyph ID to
|
|
/// Unicode codepoint mappings. This is Level 3 of the encoding fallback
|
|
/// chain, used when:
|
|
/// - /ToUnicode is missing or empty
|
|
/// - The embedded font subset has stripped glyph names
|
|
/// - The font binary matches a known fingerprint
|
|
///
|
|
/// The hash is computed over the DECODED font program bytes (post stream
|
|
/// decoding, pre-interpretation).
|
|
pub static FONT_FINGERPRINTS: phf::Map<[u8; 32], &'static [(u16, u32)]> = {};
|
|
"#,
|
|
entries_arrays,
|
|
map_builder.build()
|
|
);
|
|
|
|
fs::write(Path::new(out_dir).join("font_fingerprints.rs"), rust_code)
|
|
.expect("Failed to write font_fingerprints.rs");
|
|
}
|
|
|
|
/// Decode a 64-character hex string to a [u8; 32] array.
///
/// Both upper- and lower-case hex digits are accepted. Every character is
/// checked individually, so sign prefixes such as `+f` (which
/// `u8::from_str_radix` would accept) and non-ASCII text are rejected rather
/// than mis-decoded.
///
/// # Panics
///
/// Panics with an "Invalid hex string" message when `hex` is not exactly 64
/// bytes long or contains a character that is not a hex digit.
fn hex_decode_to_array(hex: &str) -> [u8; 32] {
    // Value of one ASCII hex digit; any other byte is a hard error.
    fn nibble(byte: u8) -> u8 {
        match (byte as char).to_digit(16) {
            Some(value) => value as u8, // always < 16
            None => panic!("Invalid hex string: unexpected byte 0x{:02x}", byte),
        }
    }

    assert!(
        hex.len() == 64,
        "Invalid hex string: expected 64 hex characters, got {}",
        hex.len()
    );

    let mut bytes = [0u8; 32];
    // Work on raw bytes so multi-byte UTF-8 input cannot cause a slicing
    // panic on a character boundary.
    for (slot, pair) in bytes.iter_mut().zip(hex.as_bytes().chunks_exact(2)) {
        *slot = (nibble(pair[0]) << 4) | nibble(pair[1]);
    }
    bytes
}
|
|
|
|
/// Report whether `cp` is a Unicode scalar value.
///
/// Scalar values are the code points `0x0..=0x10FFFF` excluding the surrogate
/// range `0xD800..=0xDFFF`, which is exactly the set of values that
/// `char::from_u32` accepts.
fn is_valid_unicode_scalar(cp: u32) -> bool {
    char::from_u32(cp).is_some()
}
|
|
|
|
/// Generate predefined CMap CID->Unicode mappings.
|
|
///
|
|
/// Reads JSON files from build/predefined-cmaps/ and generates phf maps
|
|
/// for CID->Unicode lookups. The JSON files contain mappings from CIDs
|
|
/// to their Unicode codepoint(s).
|
|
fn generate_predefined_cmaps(out_dir: &Path) {
|
|
let predefined_cmaps_dir = Path::new("build/predefined-cmaps");
|
|
|
|
// Generate each character collection
|
|
generate_collection_cmap(out_dir, predefined_cmaps_dir, "adobe-japan1", "japan1");
|
|
generate_collection_cmap(out_dir, predefined_cmaps_dir, "adobe-gb1", "gb1");
|
|
generate_collection_cmap(out_dir, predefined_cmaps_dir, "adobe-cns1", "cns1");
|
|
generate_collection_cmap(out_dir, predefined_cmaps_dir, "adobe-korea1", "korea1");
|
|
}
|
|
|
|
/// Generate a single character collection's CMap module.
|
|
fn generate_collection_cmap(out_dir: &Path, base_dir: &Path, json_name: &str, module_name: &str) {
|
|
let json_path = base_dir.join(format!("{}.json", json_name));
|
|
let out_path = out_dir.join(format!("predefined_cmap_{}.rs", module_name));
|
|
|
|
// Check if the JSON file exists
|
|
if !json_path.exists() {
|
|
// Generate a stub implementation
|
|
let rust_code = format!(
|
|
r#"
|
|
// Auto-generated {collection} CID to Unicode mapping.
|
|
//
|
|
// Source: {json_name}.json (not found - stub implementation)
|
|
// Do not edit manually.
|
|
|
|
/// Look up a CID in the {collection} character collection.
|
|
///
|
|
/// Returns None if the CID is not assigned in {collection} or if the
|
|
/// predefined CMap data file is missing.
|
|
pub fn cid_to_unicode(cid: u32) -> Option<&'static [char]> {{
|
|
let _ = cid;
|
|
None
|
|
}}
|
|
"#,
|
|
collection = module_name.to_uppercase(),
|
|
json_name = json_name,
|
|
);
|
|
fs::write(&out_path, rust_code)
|
|
.unwrap_or_else(|_| panic!("Failed to write {}", out_path.display()));
|
|
return;
|
|
}
|
|
|
|
let json_content = fs::read_to_string(&json_path)
|
|
.unwrap_or_else(|_| panic!("Failed to read {}", json_path.display()));
|
|
|
|
let data: serde_json::Value = serde_json::from_str(&json_content)
|
|
.unwrap_or_else(|_| panic!("Failed to parse {}", json_path.display()));
|
|
|
|
// Build phf map
|
|
let mut map_builder = phf_codegen::Map::new();
|
|
let mut arrays = String::new();
|
|
|
|
if let Some(mappings) = data.as_object() {
|
|
for (cid_str, unicode_value) in mappings {
|
|
let cid: u32 = cid_str
|
|
.parse()
|
|
.unwrap_or_else(|_| panic!("Invalid CID key: {}", cid_str));
|
|
|
|
// Parse the Unicode value
|
|
if let Some(unicode_str) = unicode_value.as_str() {
|
|
let chars = parse_unicode_value(unicode_str);
|
|
|
|
// Generate array name
|
|
let array_ident = format!("CID_{}_{}", module_name.to_uppercase(), cid);
|
|
|
|
// Build the array
|
|
let char_literals: Vec<String> = chars
|
|
.iter()
|
|
.map(|c| format!("'\\u{{{:04X}}}'", *c as u32))
|
|
.collect();
|
|
|
|
arrays.push_str(&format!(
|
|
r#"
|
|
static {}: &[char] = &[{}];
|
|
"#,
|
|
array_ident,
|
|
char_literals.join(", ")
|
|
));
|
|
|
|
// Use u32 key as decimal literal
|
|
map_builder.entry(cid, &format!("&{}", array_ident));
|
|
}
|
|
}
|
|
}
|
|
|
|
let rust_code = format!(
|
|
r#"
|
|
// Auto-generated {collection} CID to Unicode mapping.
|
|
//
|
|
// Source: {json_name}.json
|
|
// Do not edit manually.
|
|
|
|
{arrays}
|
|
|
|
/// Look up a CID in the {collection} character collection.
|
|
///
|
|
/// Returns None if the CID is not assigned in {collection}.
|
|
pub fn cid_to_unicode(cid: u32) -> Option<&'static [char]> {{
|
|
static MAP: phf::Map<u32, &'static [char]> = {map};
|
|
|
|
// CIDs are 16-bit in these collections, but we use u32 for the API
|
|
if cid <= u16::MAX as u32 {{
|
|
MAP.get(&cid).copied()
|
|
}} else {{
|
|
None
|
|
}}
|
|
}}
|
|
"#,
|
|
collection = module_name.to_uppercase(),
|
|
json_name = json_name,
|
|
arrays = arrays,
|
|
map = map_builder.build(),
|
|
);
|
|
|
|
fs::write(&out_path, rust_code)
|
|
.unwrap_or_else(|_| panic!("Failed to write {}", out_path.display()));
|
|
}
|
|
|
|
/// Parse a Unicode value from JSON to a Vec<char>.
///
/// The value can be:
/// - A single textual escape like `\u0041` (`A`)
/// - Several textual escapes in a row, e.g. `\u0066\u0069` (`f`, `i`)
/// - A UTF-16 surrogate pair such as `\uD83D\uDE00`, which is combined into
///   the single supplementary-plane character it encodes (U+1F600)
///
/// Each escape is a backslash, `u`, and up to four following characters read
/// as hexadecimal. Escapes that do not parse, and surrogates that are not
/// part of an adjacent high/low pair, are dropped. Characters outside escapes
/// are ignored, unless no escape produced a character at all, in which case
/// the characters of `s` itself are returned.
fn parse_unicode_value(s: &str) -> Vec<char> {
    let mut chars = Vec::new();
    let mut chars_iter = s.chars();
    // High surrogate waiting for the low surrogate that must follow it
    // immediately; anything else in between discards it.
    let mut pending_high: Option<u32> = None;

    while let Some(c) = chars_iter.next() {
        // Expect \uXXXX; the character after a backslash is consumed even
        // when it is not `u`.
        if c != '\\' || chars_iter.next() != Some('u') {
            pending_high = None;
            continue;
        }

        // Read (up to) 4 hex digits
        let hex_str: String = chars_iter.by_ref().take(4).collect();
        let unit = match u32::from_str_radix(&hex_str, 16) {
            Ok(unit) => unit,
            Err(_) => {
                pending_high = None;
                continue;
            }
        };

        match (pending_high.take(), unit) {
            // Low surrogate completing a pair: rebuild the code point.
            (Some(high), 0xDC00..=0xDFFF) => {
                let code_point = 0x10000 + ((high - 0xD800) << 10) + (unit - 0xDC00);
                if let Some(unicode_char) = char::from_u32(code_point) {
                    chars.push(unicode_char);
                }
            }
            // High surrogate: wait for its partner.
            (_, 0xD800..=0xDBFF) => pending_high = Some(unit),
            // Ordinary unit; a lone low surrogate fails `from_u32` and is dropped.
            _ => {
                if let Some(unicode_char) = char::from_u32(unit) {
                    chars.push(unicode_char);
                }
            }
        }
    }

    if chars.is_empty() && !s.is_empty() {
        // Fallback: try to parse as direct character
        chars.extend(s.chars());
    }

    chars
}
|
|
|
|
/// Generate glyph shape database from glyph-shapes.json.
|
|
///
|
|
/// Reads build/glyph-shapes.json and emits two parallel static arrays:
|
|
/// - SHAPE_TABLE: &'static [(u64, char)] sorted by pHash
|
|
/// - FREQ_TABLE: &'static [(u64, u32)] for frequency ranks (same order as SHAPE_TABLE)
|
|
///
|
|
/// # JSON format
|
|
///
|
|
/// Array of entries:
|
|
/// ```json
|
|
/// {
|
|
/// "phash_hex": "0123456789abcdef",
|
|
/// "char": "A",
|
|
/// "source_font": "font.ttf",
|
|
/// "frequency_rank": 1
|
|
/// }
|
|
/// ```
|
|
fn generate_shape_db(out_dir: &Path, _shapes_path: &Path) {
|
|
// Resolve shapes_path relative to the workspace root
|
|
// build.rs runs from the crate directory, but the build/ dir is at workspace root
|
|
// We can find the workspace root by going up from the crate directory
|
|
let crate_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
|
|
let workspace_root = crate_dir.ancestors().nth(2).unwrap_or(crate_dir); // workspace is usually 2 levels up
|
|
let actual_shapes_path = workspace_root.join("build").join("glyph-shapes.json");
|
|
|
|
// Check if the JSON file exists
|
|
if !actual_shapes_path.exists() {
|
|
// Emit a build warning and empty tables
|
|
println!(
|
|
"cargo:warning=glyph-shapes.json not found at {}, generating empty shape database",
|
|
actual_shapes_path.display()
|
|
);
|
|
let rust_code = r#"
|
|
// Auto-generated glyph shape database.
|
|
// Source: build/glyph-shapes.json (not found - empty database)
|
|
// Do not edit manually.
|
|
|
|
/// Shape database: empty (run `cargo xtask gen-shape-db` to generate).
|
|
pub static SHAPE_TABLE: &[(u64, char)] = &[];
|
|
|
|
/// Frequency table: empty (run `cargo xtask gen-shape-db` to generate).
|
|
pub static FREQ_TABLE: &[(u64, u32)] = &[];
|
|
|
|
/// Compile-time assertion that tables are parallel.
|
|
const _: () = assert!(SHAPE_TABLE.len() == FREQ_TABLE.len());
|
|
"#;
|
|
fs::write(Path::new(out_dir).join("shape_db.rs"), rust_code)
|
|
.expect("Failed to write shape_db.rs");
|
|
return;
|
|
}
|
|
|
|
let json_content =
|
|
fs::read_to_string(&actual_shapes_path).expect("Failed to read glyph-shapes.json");
|
|
|
|
let data: serde_json::Value =
|
|
serde_json::from_str(&json_content).expect("Failed to parse glyph-shapes.json");
|
|
|
|
let entries = data.as_array().expect("glyph-shapes.json must be an array");
|
|
|
|
// Parse and sort entries by pHash
|
|
let mut sorted_entries: Vec<(u64, char, u32)> = Vec::new();
|
|
|
|
for (idx, entry) in entries.iter().enumerate() {
|
|
let phash_hex = entry
|
|
.get("phash_hex")
|
|
.and_then(|v| v.as_str())
|
|
.unwrap_or("");
|
|
|
|
let phash = u64::from_str_radix(phash_hex, 16)
|
|
.unwrap_or_else(|e| panic!("Invalid phash_hex at index {}: {}", idx, e));
|
|
|
|
let char_str = entry.get("char").and_then(|v| v.as_str()).unwrap_or("");
|
|
|
|
let ch = char_str
|
|
.chars()
|
|
.next()
|
|
.unwrap_or_else(|| panic!("Empty char field at index {}", idx));
|
|
|
|
let freq_rank = entry
|
|
.get("frequency_rank")
|
|
.and_then(|v| v.as_u64())
|
|
.unwrap_or(0) as u32;
|
|
|
|
sorted_entries.push((phash, ch, freq_rank));
|
|
}
|
|
|
|
// Sort by pHash ascending
|
|
sorted_entries.sort_by_key(|a| a.0);
|
|
|
|
// Check for duplicate pHash entries
|
|
for i in 1..sorted_entries.len() {
|
|
if sorted_entries[i].0 == sorted_entries[i - 1].0 {
|
|
eprintln!(
|
|
"Warning: duplicate pHash {:016x} at indices {} and {}",
|
|
sorted_entries[i].0,
|
|
i - 1,
|
|
i
|
|
);
|
|
}
|
|
}
|
|
|
|
// Generate SHAPE_TABLE entries
|
|
let mut shape_entries = Vec::new();
|
|
for &(phash, ch, _) in &sorted_entries {
|
|
// Use Rust's Debug formatter which produces valid char literals
|
|
// e.g. 'a', '\n', '\u{1f600}'
|
|
let char_literal = format!("{:?}", ch);
|
|
shape_entries.push(format!("(0x{:016x}, {})", phash, char_literal));
|
|
}
|
|
|
|
// Generate FREQ_TABLE entries
|
|
let mut freq_entries = Vec::new();
|
|
for &(phash, _, freq) in &sorted_entries {
|
|
freq_entries.push(format!("(0x{:016x}, {})", phash, freq));
|
|
}
|
|
|
|
let rust_code = format!(
|
|
r#"
|
|
// Auto-generated glyph shape database.
|
|
// Source: build/glyph-shapes.json
|
|
// Do not edit manually.
|
|
|
|
/// Shape database: pHash -> character mapping sorted by pHash.
|
|
pub static SHAPE_TABLE: &[(u64, char)] = &[
|
|
{}
|
|
];
|
|
|
|
/// Frequency table: pHash -> frequency rank (same order as SHAPE_TABLE).
|
|
/// Higher rank = more common character.
|
|
pub static FREQ_TABLE: &[(u64, u32)] = &[
|
|
{}
|
|
];
|
|
|
|
/// Compile-time assertion that tables have the same length.
|
|
const _: () = assert!(SHAPE_TABLE.len() == FREQ_TABLE.len());
|
|
"#,
|
|
shape_entries.join(",\n "),
|
|
freq_entries.join(",\n ")
|
|
);
|
|
|
|
fs::write(Path::new(out_dir).join("shape_db.rs"), rust_code)
|
|
.expect("Failed to write shape_db.rs");
|
|
}
|
|
|
|
/// Generate English wordlist phf::Set from wordlist-en-20k.txt.
|
|
///
|
|
/// Reads build/wordlist-en-20k.txt and emits a compile-time phf::Set
|
|
/// containing ~20,000 common English words for dictionary coverage
|
|
/// scoring in readability analysis.
|
|
///
|
|
/// # Format
|
|
///
|
|
/// One lowercase word per line, sorted by frequency (most common first).
|
|
/// Words must be ASCII only, 1-30 characters.
|
|
///
|
|
/// # Source
|
|
///
|
|
/// google-10000-english 20k.txt (frequency-sorted English word list)
|
|
fn generate_wordlist(out_dir: &Path, wordlist_path: &Path) {
|
|
// Check if the wordlist file exists
|
|
if !wordlist_path.exists() {
|
|
// Emit a build warning and empty set
|
|
println!(
|
|
"cargo:warning=wordlist-en-20k.txt not found at {}, generating empty wordlist",
|
|
wordlist_path.display()
|
|
);
|
|
let rust_code = r#"
|
|
// Auto-generated English wordlist.
|
|
// Source: build/wordlist-en-20k.txt (not found - empty wordlist)
|
|
// Do not edit manually.
|
|
|
|
/// English wordlist: empty (wordlist-en-20k.txt not found).
|
|
pub static EN_WORDLIST_20K: phf::Set<&'static str> = phf::Set::empty();
|
|
"#;
|
|
fs::write(Path::new(out_dir).join("wordlist.rs"), rust_code)
|
|
.expect("Failed to write wordlist.rs");
|
|
return;
|
|
}
|
|
|
|
let wordlist_content = fs::read_to_string(wordlist_path)
|
|
.unwrap_or_else(|_| panic!("Failed to read {}", wordlist_path.display()));
|
|
|
|
// Validate and collect words
|
|
let mut words = Vec::new();
|
|
let mut line_num = 0;
|
|
|
|
for line in wordlist_content.lines() {
|
|
line_num += 1;
|
|
let word = line.trim();
|
|
|
|
// Skip empty lines
|
|
if word.is_empty() {
|
|
continue;
|
|
}
|
|
|
|
// Validate: ASCII only, lowercase, length 1-30
|
|
if !word.is_ascii() {
|
|
panic!("wordlist-en-20k.txt:{}: non-ASCII word: {}", line_num, word);
|
|
}
|
|
if word != word.to_lowercase() {
|
|
panic!(
|
|
"wordlist-en-20k.txt:{}: non-lowercase word: {}",
|
|
line_num, word
|
|
);
|
|
}
|
|
if !(1..=30).contains(&word.len()) {
|
|
panic!(
|
|
"wordlist-en-20k.txt:{}: word length {} outside range [1, 30]: {}",
|
|
line_num,
|
|
word.len(),
|
|
word
|
|
);
|
|
}
|
|
|
|
words.push(word);
|
|
}
|
|
|
|
// Build phf::Set
|
|
let mut set_builder = phf_codegen::Set::new();
|
|
|
|
for word in &words {
|
|
set_builder.entry(word);
|
|
}
|
|
|
|
let rust_code = format!(
|
|
r#"
|
|
// Auto-generated English wordlist.
|
|
// Source: build/wordlist-en-20k.txt
|
|
// Do not edit manually.
|
|
//
|
|
// A compile-time phf::Set of ~20,000 common English words, sorted by
|
|
// frequency. Used for dictionary coverage scoring in readability analysis.
|
|
//
|
|
// Word count: {}
|
|
|
|
/// English wordlist: 20,000 most common English words.
|
|
///
|
|
/// Lookup is O(1) via phf's perfect hash function. Words are lowercase
|
|
/// ASCII only, length 1-30 characters.
|
|
///
|
|
/// # Example
|
|
///
|
|
/// ```
|
|
/// use pdftract_core::layout::wordlist::EN_WORDLIST_20K;
|
|
///
|
|
/// assert!(EN_WORDLIST_20K.contains("the"));
|
|
/// assert!(EN_WORDLIST_20K.contains("computer"));
|
|
/// assert!(!EN_WORDLIST_20K.contains("xyzqwerty"));
|
|
/// ```
|
|
pub static EN_WORDLIST_20K: phf::Set<&'static str> = {};
|
|
"#,
|
|
words.len(),
|
|
set_builder.build()
|
|
);
|
|
|
|
fs::write(Path::new(out_dir).join("wordlist.rs"), rust_code)
|
|
.expect("Failed to write wordlist.rs");
|
|
}
|
|
|
|
/// Verify SHA-256 checksums of build-time data files.
|
|
///
|
|
/// This is the TH-06 supply-chain gate implementation. It reads CHECKSUMS.sha256
|
|
/// and verifies that each build-time data file matches its expected checksum.
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// `Ok(())` if all checksums match, `Err(String)` with a descriptive message otherwise.
|
|
fn verify_checksums() -> Result<(), String> {
|
|
use std::collections::HashMap;
|
|
use std::io::BufRead;
|
|
|
|
let checksums_path = Path::new("build/CHECKSUMS.sha256");
|
|
if !checksums_path.exists() {
|
|
return Err(format!("CHECKSUMS.sha256 not found at {}", checksums_path.display()));
|
|
}
|
|
|
|
let checksums_file = fs::File::open(checksums_path)
|
|
.map_err(|e| format!("Failed to open CHECKSUMS.sha256: {}", e))?;
|
|
|
|
// Parse CHECKSUMS.sha256 into a map of path -> expected checksum
|
|
let mut expected_checksums: HashMap<String, String> = HashMap::new();
|
|
let reader = std::io::BufReader::new(checksums_file);
|
|
|
|
for line in reader.lines() {
|
|
let line = line.map_err(|e| format!("Failed to read CHECKSUMS.sha256: {}", e))?;
|
|
let line = line.trim();
|
|
|
|
// Skip empty lines and comments
|
|
if line.is_empty() || line.starts_with('#') {
|
|
continue;
|
|
}
|
|
|
|
// Parse: "checksum path"
|
|
let parts: Vec<&str> = line.splitn(2, " ").collect();
|
|
if parts.len() != 2 {
|
|
return Err(format!("Invalid checksum line: {}", line));
|
|
}
|
|
|
|
let checksum = parts[0].to_string();
|
|
let path = parts[1].to_string();
|
|
expected_checksums.insert(path, checksum);
|
|
}
|
|
|
|
// Verify each file's checksum
|
|
let mut failures = Vec::new();
|
|
|
|
for (path, expected_checksum) in &expected_checksums {
|
|
let file_path = Path::new(path);
|
|
|
|
// Skip files that don't exist (they may be optional, like glyph-shapes.json)
|
|
if !file_path.exists() {
|
|
eprintln!("cargo:warning=Checksum file not found (optional): {}", path);
|
|
continue;
|
|
}
|
|
|
|
// Compute SHA-256 of the file
|
|
let actual_checksum = compute_sha256(file_path)
|
|
.map_err(|e| format!("Failed to compute checksum for {}: {}", path, e))?;
|
|
|
|
if actual_checksum != *expected_checksum {
|
|
failures.push(format!(
|
|
"{}: expected {}, got {}",
|
|
path, expected_checksum, actual_checksum
|
|
));
|
|
}
|
|
}
|
|
|
|
if !failures.is_empty() {
|
|
Err(format!(
|
|
"Checksum verification failed for {} file(s):\n {}",
|
|
failures.len(),
|
|
failures.join("\n ")
|
|
))
|
|
} else {
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
/// Compute SHA-256 checksum of a file.
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// Hex-encoded checksum string (64 hex characters).
|
|
fn compute_sha256(path: &Path) -> Result<String, String> {
|
|
use std::io::Read;
|
|
use sha2::{Digest, Sha256};
|
|
|
|
let mut file = fs::File::open(path)
|
|
.map_err(|e| format!("Failed to open {}: {}", path.display(), e))?;
|
|
|
|
let mut hasher = Sha256::new();
|
|
let mut buffer = [0u8; 8192];
|
|
|
|
loop {
|
|
let n = file.read(&mut buffer)
|
|
.map_err(|e| format!("Failed to read {}: {}", path.display(), e))?;
|
|
if n == 0 {
|
|
break;
|
|
}
|
|
hasher.update(&buffer[..n]);
|
|
}
|
|
|
|
Ok(format!("{:x}", hasher.finalize()))
|
|
}
|