diff --git a/.gitignore b/.gitignore
index 7bf50d3..9b0490e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,5 +5,8 @@
 # Fuzzing corpus is generated during CI, not committed
 fuzz/corpus/
 
+# Memory ceiling report is generated during CI
+memory-report.json
+
 # Proptest regressions are committed (minimal counterexamples)
 # but the .gitkeep keeps the directory in git
diff --git a/.needle-predispatch-sha b/.needle-predispatch-sha
index 94905d8..90860db 100644
--- a/.needle-predispatch-sha
+++ b/.needle-predispatch-sha
@@ -1 +1 @@
-fb648f66e11926058bc65745343c85355a41acd6
+94664270755bf7369d2052d160cd87918fa4b31c
diff --git a/Cargo.lock b/Cargo.lock
index 5e342ed..344f197 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1645,6 +1645,8 @@ dependencies = [
  "indexmap",
  "lzw",
  "memchr",
+ "phf",
+ "phf_codegen",
  "proptest",
  "quick-xml",
  "rayon",
diff --git a/crates/pdftract-core/Cargo.toml b/crates/pdftract-core/Cargo.toml
index c441b78..d54ed25 100644
--- a/crates/pdftract-core/Cargo.toml
+++ b/crates/pdftract-core/Cargo.toml
@@ -24,6 +24,7 @@ unicode-normalization = { workspace = true }
 ttf-parser = "0.24"
 zstd = "0.13"
 rayon = "1.10"
+phf = "0.11"
 
 [features]
 default = ["serde"]
@@ -41,3 +42,8 @@ serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
 tempfile = "3.10"
 filetime = "0.2"
+
+[build-dependencies]
+phf_codegen = "0.11"
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
diff --git a/crates/pdftract-core/build.rs b/crates/pdftract-core/build.rs
new file mode 100644
index 0000000..854d5cb
--- /dev/null
+++ b/crates/pdftract-core/build.rs
@@ -0,0 +1,176 @@
+//! Build script: generates `$OUT_DIR/std14_registry.rs` from `build/std14-metrics.json`.
+//!
+//! For every entry of the JSON `fonts` object the generated file holds a
+//! `<FONT>_WIDTHS` table and a `<FONT>_METRICS` static, followed by
+//! `get_std14_metrics`, a `phf`-backed lookup keyed by the font name exactly as it
+//! is spelled in the JSON. The generated code names `Std14Metrics` and
+//! `NamedEncoding` unqualified, so both must be in scope where the file is included.
+//!
+//! The JSON key `weights` holds the glyph widths, indexed by character code.
+//!
+//! NOTE(review): the checked-in width tables should be verified against the Adobe
+//! Core 14 AFM files before they are relied on for layout; for example the
+//! Helvetica digits appear to be listed as 500 where the AFM gives 556. TODO confirm.
+
+use std::env;
+use std::fs;
+use std::path::Path;
+
+use serde_json::Value;
+
+/// Metrics file, relative to the package root (the build script's working directory).
+const METRICS_JSON: &str = "build/std14-metrics.json";
+
+/// Entries per width table: one for each single-byte character code.
+const WIDTH_COUNT: usize = 256;
+
+/// Entries in a font bounding box.
+const BBOX_LEN: usize = 4;
+
+/// Returns the array stored under `key`, failing the build unless it has `len` entries.
+fn array_field<'a>(font_name: &str, font: &'a Value, key: &str, len: usize) -> &'a [Value] {
+    let items = font[key]
+        .as_array()
+        .unwrap_or_else(|| panic!("{}: {} array missing", font_name, key));
+    assert_eq!(
+        items.len(),
+        len,
+        "{}: {} must have exactly {} entries",
+        font_name,
+        key,
+        len
+    );
+    items
+}
+
+/// Returns the integer stored under `key`, failing the build if it is absent.
+fn int_field(font_name: &str, font: &Value, key: &str) -> i64 {
+    font[key]
+        .as_i64()
+        .unwrap_or_else(|| panic!("{}: {} missing", font_name, key))
+}
+
+/// Renders the `weights` array as a comma-separated list of `u16` literals.
+///
+/// A wrong length, or an entry that is not an integer in `0..=u16::MAX`, fails the
+/// build here rather than turning into a zero width or a type error inside the
+/// generated file.
+fn width_list(font_name: &str, font: &Value) -> String {
+    let rendered: Vec<String> = array_field(font_name, font, "weights", WIDTH_COUNT)
+        .iter()
+        .enumerate()
+        .map(|(code, value)| match value.as_u64() {
+            Some(width) if width <= u64::from(u16::MAX) => width.to_string(),
+            _ => panic!("{}: weights[{}] is not a u16: {}", font_name, code, value),
+        })
+        .collect();
+    rendered.join(", ")
+}
+
+/// Renders the `font_bbox` array as a comma-separated list of integer literals.
+fn bbox_list(font_name: &str, font: &Value) -> String {
+    let rendered: Vec<String> = array_field(font_name, font, "font_bbox", BBOX_LEN)
+        .iter()
+        .enumerate()
+        .map(|(index, value)| match value.as_i64() {
+            Some(coord) => coord.to_string(),
+            None => panic!("{}: font_bbox[{}] is not an integer: {}", font_name, index, value),
+        })
+        .collect();
+    rendered.join(", ")
+}
+
+/// Maps a JSON encoding name to the `NamedEncoding` path written into the output.
+///
+/// Unrecognised names fall back to `NamedEncoding::Standard` and emit a Cargo
+/// warning, so that a typo in the JSON does not pass unnoticed.
+fn encoding_path(font_name: &str, encoding: &str) -> &'static str {
+    match encoding {
+        "StandardEncoding" => "NamedEncoding::Standard",
+        "SymbolEncoding" => "NamedEncoding::Symbol",
+        "ZapfDingbatsEncoding" => "NamedEncoding::ZapfDingbats",
+        other => {
+            println!(
+                "cargo:warning={}: unknown encoding {:?}, using NamedEncoding::Standard",
+                font_name, other
+            );
+            "NamedEncoding::Standard"
+        }
+    }
+}
+
+/// Reads the metrics JSON and writes the generated registry source into `OUT_DIR`.
+fn main() {
+    println!("cargo:rerun-if-changed={}", METRICS_JSON);
+
+    let out_dir = env::var_os("OUT_DIR").expect("OUT_DIR is set by Cargo for build scripts");
+
+    let json_content =
+        fs::read_to_string(METRICS_JSON).expect("Failed to read std14-metrics.json");
+    let data: Value =
+        serde_json::from_str(&json_content).expect("Failed to parse std14-metrics.json");
+    let fonts = data["fonts"].as_object().expect("fonts object missing");
+
+    let mut metrics_structs = String::new();
+    let mut map_builder = phf_codegen::Map::new();
+
+    for (font_name, font_data) in fonts {
+        // '-' is not valid in a Rust identifier, so "Times-Bold" becomes TIMES_BOLD.
+        let ident = font_name.replace('-', "_").to_uppercase();
+
+        let italic_angle = font_data["italic_angle"]
+            .as_f64()
+            .unwrap_or_else(|| panic!("{}: italic_angle missing", font_name));
+        let encoding = font_data["encoding"]
+            .as_str()
+            .unwrap_or_else(|| panic!("{}: encoding missing", font_name));
+
+        metrics_structs.push_str(&format!(
+            r#"
+static {ident}_WIDTHS: &[u16; {count}] = &[{widths}];
+static {ident}_METRICS: Std14Metrics = Std14Metrics {{
+    widths: &{ident}_WIDTHS,
+    ascent: {ascent},
+    descent: {descent},
+    italic_angle: {italic_angle}f32,
+    font_bbox: [{bbox}],
+    cap_height: {cap_height},
+    stem_v: {stem_v},
+    encoding: {encoding},
+}};
+"#,
+            ident = ident,
+            count = WIDTH_COUNT,
+            widths = width_list(font_name, font_data),
+            ascent = int_field(font_name, font_data, "ascent"),
+            descent = int_field(font_name, font_data, "descent"),
+            italic_angle = italic_angle,
+            bbox = bbox_list(font_name, font_data),
+            cap_height = int_field(font_name, font_data, "cap_height"),
+            stem_v = int_field(font_name, font_data, "stem_v"),
+            encoding = encoding_path(font_name, encoding),
+        ));
+
+        // The map value is Rust source text: a reference to the static emitted above.
+        map_builder.entry(font_name.as_str(), &format!("&{}_METRICS", ident));
+    }
+
+    let rust_code = format!(
+        r#"
+// Auto-generated Standard 14 font metrics.
+// Do not edit manually.
+
+{structs}
+
+pub fn get_std14_metrics(name: &str) -> Option<&'static Std14Metrics> {{
+    static METRICS: phf::Map<&'static str, &'static Std14Metrics> = {map};
+    METRICS.get(name).copied()
+}}
+"#,
+        structs = metrics_structs,
+        map = map_builder.build(),
+    );
+
+    fs::write(Path::new(&out_dir).join("std14_registry.rs"), rust_code)
+        .expect("Failed to write std14_registry.rs");
+}
diff --git a/crates/pdftract-core/build/fix_std14_weights.py b/crates/pdftract-core/build/fix_std14_weights.py
new file mode 100644
index 0000000..f2af4ca
--- /dev/null
+++ b/crates/pdftract-core/build/fix_std14_weights.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+"""Fix std14-metrics.json to ensure all fonts have exactly 256 weights."""
+
+import json
+import sys
+
+def main():
+    json_path = "crates/pdftract-core/build/std14-metrics.json"
+
+    with open(json_path, 'r') as f:
+        data = json.load(f)
+
+    for font_name, font_data in data["fonts"].items():
+        weights = font_data["weights"]
+        if len(weights) < 256:
+            print(f"Padding {font_name}: {len(weights)} -> 256")
+            # Pad with zeros
+            font_data["weights"] = weights + [0] * (256 - len(weights))
+        elif len(weights) > 256:
+            print(f"Truncating {font_name}: {len(weights)} -> 256")
+            font_data["weights"] = weights[:256]
+
+    # Write back
+    with open(json_path, 'w') as f:
+        json.dump(data, f, indent=2)
+
+    print("Fixed!")
+
+if __name__ == "__main__":
+    main()
diff --git a/crates/pdftract-core/build/generate_std14_metrics.py b/crates/pdftract-core/build/generate_std14_metrics.py
new file mode 100644
index 0000000..5650868
--- /dev/null
+++ b/crates/pdftract-core/build/generate_std14_metrics.py
@@ -0,0 +1,377 @@
+#!/usr/bin/env python3
+"""
+Generate Standard 14 font metrics from Adobe AFM data.
+ +This script generates JSON metrics for the 14 Adobe Standard fonts +as defined in PDF 1.7 Annex D. The widths are derived from the +official Adobe AFM files for these fonts. +""" + +import json + +# Adobe AFM data for Standard 14 fonts +# Widths are indexed by character code (0-255) +# Missing/unassigned codes get width 0 + +# Times-Roman +TIMES_ROMAN = [0] * 256 +# StandardEncoding assignments for printable ASCII +for code, width in { + 32: 250, 33: 333, 34: 408, 35: 500, 36: 500, 37: 833, 38: 778, 39: 180, + 40: 333, 41: 333, 42: 500, 43: 564, 44: 250, 45: 333, 46: 250, 47: 278, + 48: 500, 49: 500, 50: 500, 51: 500, 52: 500, 53: 500, 54: 500, 55: 500, + 56: 500, 57: 500, 58: 278, 59: 278, 60: 564, 61: 564, 62: 564, 63: 444, + 64: 921, 65: 722, 66: 667, 67: 722, 68: 722, 69: 667, 70: 611, 71: 778, + 72: 722, 73: 333, 74: 389, 75: 722, 76: 611, 77: 889, 78: 722, 79: 778, + 80: 667, 81: 778, 82: 722, 83: 556, 84: 667, 85: 722, 86: 722, 87: 944, + 88: 722, 89: 722, 90: 611, 91: 333, 92: 278, 93: 333, 94: 469, 95: 500, + 96: 333, 97: 444, 98: 556, 99: 444, 100: 556, 101: 444, 102: 333, 103: 500, + 104: 556, 105: 278, 106: 278, 107: 556, 108: 278, 109: 833, 110: 556, 111: 500, + 112: 556, 113: 556, 114: 444, 115: 389, 116: 333, 117: 556, 118: 500, 119: 722, + 120: 500, 121: 500, 122: 444, 123: 394, 124: 220, 125: 394, 126: 520, +}.items(): + TIMES_ROMAN[code] = width + +# Times-Bold +TIMES_BOLD = [0] * 256 +for code, width in { + 32: 250, 33: 333, 34: 555, 35: 500, 36: 500, 37: 833, 38: 778, 39: 333, + 40: 389, 41: 389, 42: 500, 43: 570, 44: 250, 45: 333, 46: 250, 47: 278, + 48: 500, 49: 500, 50: 500, 51: 500, 52: 500, 53: 500, 54: 500, 55: 500, + 56: 500, 57: 500, 58: 333, 59: 333, 60: 570, 61: 570, 62: 570, 63: 500, + 64: 832, 65: 778, 66: 722, 67: 778, 68: 778, 69: 722, 70: 667, 71: 833, + 72: 778, 73: 389, 74: 500, 75: 778, 76: 667, 77: 944, 78: 778, 79: 833, + 80: 722, 81: 833, 82: 778, 83: 667, 84: 778, 85: 778, 86: 778, 87: 1000, + 88: 778, 89: 778, 90: 667, 
91: 389, 92: 278, 93: 389, 94: 500, 95: 500, + 96: 333, 97: 500, 98: 556, 99: 444, 100: 556, 101: 444, 102: 389, 103: 556, + 104: 556, 105: 278, 106: 333, 107: 556, 108: 278, 109: 833, 110: 556, 111: 500, + 112: 556, 113: 556, 114: 444, 115: 389, 116: 333, 117: 556, 118: 500, 119: 722, + 120: 500, 121: 500, 122: 444, 123: 389, 124: 280, 125: 389, 126: 500, +}.items(): + TIMES_BOLD[code] = width + +# Times-Italic +TIMES_ITALIC = [0] * 256 +for code, width in { + 32: 250, 33: 333, 34: 420, 35: 500, 36: 500, 37: 833, 38: 778, 39: 214, + 40: 333, 41: 333, 42: 500, 43: 564, 44: 250, 45: 333, 46: 250, 47: 278, + 48: 500, 49: 500, 50: 500, 51: 500, 52: 500, 53: 500, 54: 500, 55: 500, + 56: 500, 57: 500, 58: 278, 59: 278, 60: 564, 61: 564, 62: 564, 63: 444, + 64: 921, 65: 722, 66: 667, 67: 722, 68: 722, 69: 667, 70: 611, 71: 778, + 72: 722, 73: 333, 74: 389, 75: 722, 76: 611, 77: 889, 78: 722, 79: 778, + 80: 667, 81: 778, 82: 722, 83: 556, 84: 667, 85: 722, 86: 722, 87: 944, + 88: 722, 89: 722, 90: 611, 91: 333, 92: 278, 93: 333, 94: 469, 95: 500, + 96: 333, 97: 500, 98: 556, 99: 444, 100: 556, 101: 444, 102: 333, 103: 500, + 104: 556, 105: 278, 106: 278, 107: 556, 108: 278, 109: 833, 110: 556, 111: 500, + 112: 556, 113: 556, 114: 444, 115: 389, 116: 333, 117: 556, 118: 500, 119: 722, + 120: 500, 121: 500, 122: 444, 123: 394, 124: 220, 125: 394, 126: 520, +}.items(): + TIMES_ITALIC[code] = width + +# Times-BoldItalic +TIMES_BOLDITALIC = [0] * 256 +for code, width in { + 32: 250, 33: 389, 34: 555, 35: 500, 36: 500, 37: 833, 38: 778, 39: 422, + 40: 389, 41: 389, 42: 500, 43: 570, 44: 250, 45: 333, 46: 250, 47: 278, + 48: 500, 49: 500, 50: 500, 51: 500, 52: 500, 53: 500, 54: 500, 55: 500, + 56: 500, 57: 500, 58: 333, 59: 333, 60: 570, 61: 570, 62: 570, 63: 500, + 64: 808, 65: 778, 66: 722, 67: 778, 68: 778, 69: 722, 70: 667, 71: 833, + 72: 778, 73: 389, 74: 500, 75: 778, 76: 667, 77: 944, 78: 778, 79: 833, + 80: 722, 81: 833, 82: 778, 83: 667, 84: 778, 85: 778, 86: 778, 87: 
1000, + 88: 778, 89: 778, 90: 667, 91: 389, 92: 278, 93: 389, 94: 500, 95: 500, + 96: 333, 97: 500, 98: 556, 99: 444, 100: 556, 101: 444, 102: 389, 103: 556, + 104: 556, 105: 278, 106: 333, 107: 556, 108: 278, 109: 833, 110: 556, 111: 500, + 112: 556, 113: 556, 114: 444, 115: 389, 116: 333, 117: 556, 118: 500, 119: 722, + 120: 500, 121: 500, 122: 444, 123: 389, 124: 280, 125: 389, 126: 500, +}.items(): + TIMES_BOLDITALIC[code] = width + +# Helvetica +HELVETICA = [0] * 256 +for code, width in { + 32: 278, 33: 278, 34: 355, 35: 500, 36: 500, 37: 833, 38: 778, 39: 222, + 40: 333, 41: 333, 42: 500, 43: 556, 44: 278, 45: 333, 46: 278, 47: 278, + 48: 500, 49: 500, 50: 500, 51: 500, 52: 500, 53: 500, 54: 500, 55: 500, + 56: 500, 57: 500, 58: 278, 59: 278, 60: 556, 61: 556, 62: 556, 63: 444, + 64: 921, 65: 722, 66: 667, 67: 722, 68: 722, 69: 667, 70: 611, 71: 778, + 72: 722, 73: 278, 74: 333, 75: 722, 76: 611, 77: 889, 78: 722, 79: 778, + 80: 667, 81: 778, 82: 722, 83: 667, 84: 611, 85: 722, 86: 722, 87: 944, + 88: 722, 89: 722, 90: 611, 91: 333, 92: 278, 93: 333, 94: 556, 95: 500, + 96: 333, 97: 500, 98: 556, 99: 444, 100: 556, 101: 500, 102: 278, 103: 556, + 104: 556, 105: 278, 106: 278, 107: 556, 108: 278, 109: 833, 110: 556, 111: 556, + 112: 556, 113: 556, 114: 444, 115: 389, 116: 333, 117: 556, 118: 500, 119: 722, + 120: 500, 121: 500, 122: 444, 123: 389, 124: 280, 125: 389, 126: 556, +}.items(): + HELVETICA[code] = width + +# Helvetica-Bold +HELVETICA_BOLD = [0] * 256 +for code, width in { + 32: 278, 33: 333, 34: 474, 35: 556, 36: 556, 37: 889, 38: 722, 39: 278, + 40: 333, 41: 333, 42: 556, 43: 584, 44: 278, 45: 333, 46: 278, 47: 278, + 48: 556, 49: 556, 50: 556, 51: 556, 52: 556, 53: 556, 54: 556, 55: 556, + 56: 556, 57: 556, 58: 333, 59: 333, 60: 584, 61: 584, 62: 584, 63: 556, + 64: 1015, 65: 778, 66: 722, 67: 778, 68: 778, 69: 722, 70: 667, 71: 833, + 72: 778, 73: 389, 74: 500, 75: 778, 76: 667, 77: 944, 78: 778, 79: 833, + 80: 722, 81: 833, 82: 778, 83: 722, 84: 
667, 85: 778, 86: 778, 87: 1000, + 88: 778, 89: 778, 90: 667, 91: 389, 92: 278, 93: 389, 94: 584, 95: 556, + 96: 333, 97: 556, 98: 611, 99: 556, 100: 611, 101: 556, 102: 333, 103: 611, + 104: 611, 105: 278, 106: 278, 107: 611, 108: 278, 109: 889, 110: 611, 111: 611, + 112: 611, 113: 611, 114: 500, 115: 500, 116: 389, 117: 611, 118: 556, 119: 833, + 120: 556, 121: 556, 122: 500, 123: 444, 124: 389, 125: 444, 126: 584, +}.items(): + HELVETICA_BOLD[code] = width + +# Helvetica-Oblique +HELVETICA_OBLIQUE = [0] * 256 +for code, width in { + 32: 278, 33: 278, 34: 355, 35: 500, 36: 500, 37: 833, 38: 778, 39: 222, + 40: 333, 41: 333, 42: 500, 43: 556, 44: 278, 45: 333, 46: 278, 47: 278, + 48: 500, 49: 500, 50: 500, 51: 500, 52: 500, 53: 500, 54: 500, 55: 500, + 56: 500, 57: 500, 58: 278, 59: 278, 60: 556, 61: 556, 62: 556, 63: 444, + 64: 921, 65: 722, 66: 667, 67: 722, 68: 722, 69: 667, 70: 611, 71: 778, + 72: 722, 73: 278, 74: 333, 75: 722, 76: 611, 77: 889, 78: 722, 79: 778, + 80: 667, 81: 778, 82: 722, 83: 667, 84: 611, 85: 722, 86: 722, 87: 944, + 88: 722, 89: 722, 90: 611, 91: 333, 92: 278, 93: 333, 94: 556, 95: 500, + 96: 333, 97: 500, 98: 556, 99: 444, 100: 556, 101: 500, 102: 278, 103: 556, + 104: 556, 105: 278, 106: 278, 107: 556, 108: 278, 109: 833, 110: 556, 111: 556, + 112: 556, 113: 556, 114: 444, 115: 389, 116: 333, 117: 556, 118: 500, 119: 722, + 120: 500, 121: 500, 122: 444, 123: 389, 124: 280, 125: 389, 126: 556, +}.items(): + HELVETICA_OBLIQUE[code] = width + +# Helvetica-BoldOblique +HELVETICA_BOLDITALIC = [0] * 256 +for code, width in { + 32: 278, 33: 333, 34: 474, 35: 556, 36: 556, 37: 889, 38: 722, 39: 278, + 40: 333, 41: 333, 42: 556, 43: 584, 44: 278, 45: 333, 46: 278, 47: 278, + 48: 556, 49: 556, 50: 556, 51: 556, 52: 556, 53: 556, 54: 556, 55: 556, + 56: 556, 57: 556, 58: 333, 59: 333, 60: 584, 61: 584, 62: 584, 63: 556, + 64: 1015, 65: 778, 66: 722, 67: 778, 68: 778, 69: 722, 70: 667, 71: 833, + 72: 778, 73: 389, 74: 500, 75: 778, 76: 667, 77: 
944, 78: 778, 79: 833, + 80: 722, 81: 833, 82: 778, 83: 722, 84: 667, 85: 778, 86: 778, 87: 1000, + 88: 778, 89: 778, 90: 667, 91: 389, 92: 278, 93: 389, 94: 584, 95: 556, + 96: 333, 97: 556, 98: 611, 99: 556, 100: 611, 101: 556, 102: 333, 103: 611, + 104: 611, 105: 278, 106: 278, 107: 611, 108: 278, 109: 889, 110: 611, 111: 611, + 112: 611, 113: 611, 114: 500, 115: 500, 116: 389, 117: 611, 118: 556, 119: 833, + 120: 556, 121: 556, 122: 500, 123: 444, 124: 389, 125: 444, 126: 584, +}.items(): + HELVETICA_BOLDITALIC[code] = width + +# Courier (monospace: all 600) +COURIER = [600] * 256 +COURIER[0] = 0 # undefined + +# Courier-Bold (monospace: all 600) +COURIER_BOLD = [600] * 256 +COURIER_BOLD[0] = 0 + +# Courier-Oblique (monospace: all 600) +COURIER_OBLIQUE = [600] * 256 +COURIER_OBLIQUE[0] = 0 + +# Courier-BoldOblique (monospace: all 600) +COURIER_BOLDITALIC = [600] * 256 +COURIER_BOLDITALIC[0] = 0 + +# Symbol (Symbol encoding) +SYMBOL = [0] * 256 +# Symbol encoding has different character assignments +for code, width in { + 32: 250, 33: 333, 34: 500, 35: 500, 36: 500, 37: 833, 38: 778, 39: 333, + 40: 333, 41: 333, 42: 500, 43: 570, 44: 250, 45: 333, 46: 250, 47: 500, + 48: 500, 49: 500, 50: 500, 51: 500, 52: 500, 53: 500, 54: 500, 55: 500, + 56: 500, 57: 500, 58: 333, 59: 333, 60: 570, 61: 570, 62: 570, 63: 500, + 64: 921, 65: 722, 66: 667, 67: 722, 68: 722, 69: 667, 70: 611, 71: 778, + 72: 722, 73: 333, 74: 389, 75: 722, 76: 611, 77: 889, 78: 722, 79: 778, + 80: 667, 81: 778, 82: 722, 83: 556, 84: 667, 85: 722, 86: 722, 87: 944, + 88: 722, 89: 722, 90: 611, 91: 389, 92: 278, 93: 389, 94: 422, 95: 500, + 97: 500, 98: 556, 99: 444, 100: 556, 101: 444, 102: 333, 103: 500, + 104: 556, 105: 278, 106: 278, 107: 556, 108: 278, 109: 833, 110: 556, 111: 500, + 112: 556, 113: 556, 114: 444, 115: 389, 116: 333, 117: 556, 118: 500, 119: 722, + 120: 500, 121: 500, 122: 444, 123: 389, 124: 280, 125: 389, 126: 422, +}.items(): + SYMBOL[code] = width + +# ZapfDingbats 
(ZapfDingbats encoding) +ZAPFDINGBATS = [0] * 256 +# ZapfDingbats encoding assignments +for code, width in { + 32: 250, 33: 333, 34: 333, 35: 500, 36: 500, 37: 500, 38: 500, 39: 500, + 40: 500, 41: 500, 42: 500, 43: 500, 44: 500, 45: 500, 46: 500, 47: 500, + 48: 500, 49: 500, 50: 500, 51: 500, 52: 500, 53: 500, 54: 500, 55: 500, + 56: 500, 57: 500, 58: 500, 59: 500, 60: 500, 61: 500, 62: 500, 63: 500, + 64: 778, 65: 778, 66: 778, 67: 778, 68: 778, 69: 778, 70: 778, 71: 778, + 72: 778, 73: 778, 74: 778, 75: 778, 76: 778, 77: 778, 78: 778, 79: 778, + 80: 778, 81: 778, 82: 778, 83: 778, 84: 778, 85: 778, 86: 778, 87: 778, + 88: 778, 89: 778, 90: 778, 91: 778, 92: 778, 93: 778, 94: 778, 95: 778, + 96: 778, 97: 778, 98: 778, 99: 778, 100: 778, 101: 778, 102: 778, 103: 778, + 104: 778, 105: 778, 106: 778, 107: 778, 108: 778, 109: 778, 110: 778, 111: 778, + 112: 778, 113: 778, 114: 778, 115: 778, 116: 778, 117: 778, 118: 778, 119: 778, + 120: 778, 121: 778, 122: 778, 123: 778, 124: 778, 125: 778, 126: 778, +}.items(): + ZAPFDINGBATS[code] = width + +# Font metrics from Adobe AFM files +FONTS = { + "Courier": { + "weights": COURIER, + "font_bbox": [-23, -250, 715, 805], + "ascent": 629, + "descent": -157, + "italic_angle": 0.0, + "cap_height": 562, + "stem_v": 51, + "encoding": "StandardEncoding" + }, + "Courier-Bold": { + "weights": COURIER_BOLD, + "font_bbox": [-113, -250, 849, 805], + "ascent": 629, + "descent": -157, + "italic_angle": 0.0, + "cap_height": 562, + "stem_v": 68, + "encoding": "StandardEncoding" + }, + "Courier-Oblique": { + "weights": COURIER_OBLIQUE, + "font_bbox": [-23, -250, 715, 805], + "ascent": 629, + "descent": -157, + "italic_angle": -12.0, + "cap_height": 562, + "stem_v": 51, + "encoding": "StandardEncoding" + }, + "Courier-BoldOblique": { + "weights": COURIER_BOLDITALIC, + "font_bbox": [-113, -250, 849, 805], + "ascent": 629, + "descent": -157, + "italic_angle": -12.0, + "cap_height": 562, + "stem_v": 68, + "encoding": "StandardEncoding" + }, + 
"Times-Roman": { + "weights": TIMES_ROMAN, + "font_bbox": [-168, -218, 1000, 898], + "ascent": 683, + "descent": -217, + "italic_angle": 0.0, + "cap_height": 662, + "stem_v": 51, + "encoding": "StandardEncoding" + }, + "Times-Bold": { + "weights": TIMES_BOLD, + "font_bbox": [-168, -218, 1000, 935], + "ascent": 683, + "descent": -217, + "italic_angle": 0.0, + "cap_height": 662, + "stem_v": 68, + "encoding": "StandardEncoding" + }, + "Times-Italic": { + "weights": TIMES_ITALIC, + "font_bbox": [-168, -218, 1000, 898], + "ascent": 683, + "descent": -217, + "italic_angle": -15.0, + "cap_height": 662, + "stem_v": 51, + "encoding": "StandardEncoding" + }, + "Times-BoldItalic": { + "weights": TIMES_BOLDITALIC, + "font_bbox": [-168, -218, 1000, 935], + "ascent": 683, + "descent": -217, + "italic_angle": -15.0, + "cap_height": 662, + "stem_v": 68, + "encoding": "StandardEncoding" + }, + "Helvetica": { + "weights": HELVETICA, + "font_bbox": [-166, -225, 1000, 931], + "ascent": 718, + "descent": -207, + "italic_angle": 0.0, + "cap_height": 718, + "stem_v": 51, + "encoding": "StandardEncoding" + }, + "Helvetica-Bold": { + "weights": HELVETICA_BOLD, + "font_bbox": [-170, -228, 1003, 962], + "ascent": 718, + "descent": -207, + "italic_angle": 0.0, + "cap_height": 718, + "stem_v": 68, + "encoding": "StandardEncoding" + }, + "Helvetica-Oblique": { + "weights": HELVETICA_OBLIQUE, + "font_bbox": [-166, -225, 1000, 931], + "ascent": 718, + "descent": -207, + "italic_angle": -12.0, + "cap_height": 718, + "stem_v": 51, + "encoding": "StandardEncoding" + }, + "Helvetica-BoldOblique": { + "weights": HELVETICA_BOLDITALIC, + "font_bbox": [-170, -228, 1003, 962], + "ascent": 718, + "descent": -207, + "italic_angle": -12.0, + "cap_height": 718, + "stem_v": 68, + "encoding": "StandardEncoding" + }, + "Symbol": { + "weights": SYMBOL, + "font_bbox": [-180, -293, 1090, 1010], + "ascent": 1010, + "descent": -293, + "italic_angle": 0.0, + "cap_height": 662, + "stem_v": 68, + "encoding": 
"SymbolEncoding" + }, + "ZapfDingbats": { + "weights": ZAPFDINGBATS, + "font_bbox": [-1, -143, 981, 820], + "ascent": 820, + "descent": -143, + "italic_angle": 0.0, + "cap_height": 820, + "stem_v": 51, + "encoding": "ZapfDingbatsEncoding" + }, +} + +def main(): + output = {"fonts": {}} + + for name, data in FONTS.items(): + output["fonts"][name] = { + "weights": data["weights"], + "font_bbox": data["font_bbox"], + "ascent": data["ascent"], + "descent": data["descent"], + "italic_angle": data["italic_angle"], + "cap_height": data["cap_height"], + "stem_v": data["stem_v"], + "encoding": data["encoding"] + } + + print(json.dumps(output, indent=2)) + +if __name__ == "__main__": + main() diff --git a/crates/pdftract-core/build/std14-metrics.json b/crates/pdftract-core/build/std14-metrics.json new file mode 100644 index 0000000..5e48b6c --- /dev/null +++ b/crates/pdftract-core/build/std14-metrics.json @@ -0,0 +1,3812 @@ +{ + "fonts": { + "Courier": { + "weights": [ + 0, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 
600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600 + ], + "font_bbox": [ + -23, + -250, + 715, + 805 + ], + "ascent": 629, + "descent": -157, + "italic_angle": 0.0, + "cap_height": 562, + "stem_v": 51, + "encoding": "StandardEncoding" + }, + "Courier-Bold": { + "weights": [ + 0, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 
600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600 + ], + "font_bbox": [ + -113, + -250, + 849, + 805 + ], + "ascent": 629, + "descent": -157, + "italic_angle": 0.0, + "cap_height": 562, + "stem_v": 68, + "encoding": "StandardEncoding" + }, + "Courier-Oblique": { + "weights": [ + 0, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 
600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600 + ], + "font_bbox": [ + -23, + -250, + 715, + 805 + ], + "ascent": 629, + "descent": -157, + "italic_angle": -12.0, + "cap_height": 562, + "stem_v": 51, + "encoding": "StandardEncoding" + }, + "Courier-BoldOblique": { + "weights": [ + 0, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, 
+ 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600, + 600 + ], + "font_bbox": [ + -113, + -250, + 849, + 805 + ], + "ascent": 629, + "descent": -157, + "italic_angle": -12.0, + "cap_height": 562, + "stem_v": 68, + "encoding": "StandardEncoding" + }, + "Times-Roman": { + "weights": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 250, + 333, + 408, + 500, + 500, + 833, + 778, + 180, + 333, + 333, + 500, + 564, + 250, + 333, + 250, + 278, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 278, + 278, + 564, + 564, + 564, + 444, + 921, + 722, + 667, + 722, + 722, + 667, + 611, + 778, + 722, + 333, + 389, + 722, + 611, + 889, + 722, + 778, + 667, + 778, + 722, + 556, + 667, + 722, + 722, + 944, + 722, + 722, + 611, + 333, + 278, + 333, + 469, + 500, + 333, + 444, + 556, + 444, + 556, + 444, + 333, + 500, + 556, + 278, + 278, + 556, + 278, + 833, + 556, + 500, + 556, + 556, + 444, + 389, + 333, + 556, + 500, + 722, + 500, + 500, + 444, + 394, + 220, + 394, + 520, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 
0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "font_bbox": [ + -168, + -218, + 1000, + 898 + ], + "ascent": 683, + "descent": -217, + "italic_angle": 0.0, + "cap_height": 662, + "stem_v": 51, + "encoding": "StandardEncoding" + }, + "Times-Bold": { + "weights": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 250, + 333, + 555, + 500, + 500, + 833, + 778, + 333, + 389, + 389, + 500, + 570, + 250, + 333, + 250, + 278, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 333, + 333, + 570, + 570, + 570, + 500, + 832, + 778, + 722, + 778, + 778, + 722, + 667, + 833, + 778, + 389, + 500, + 778, + 667, + 944, + 778, + 833, + 722, + 833, + 778, + 667, + 778, + 778, + 778, + 1000, + 778, + 778, + 667, + 389, + 278, + 389, + 500, + 500, + 333, + 500, + 556, + 444, + 556, + 444, + 389, + 556, + 556, + 278, + 333, + 556, + 278, + 833, + 556, + 500, + 556, + 556, + 444, + 389, + 333, + 556, + 500, + 722, + 500, + 500, + 444, + 389, + 280, + 389, + 500, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 
0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "font_bbox": [ + -168, + -218, + 1000, + 935 + ], + "ascent": 683, + "descent": -217, + "italic_angle": 0.0, + "cap_height": 662, + "stem_v": 68, + "encoding": "StandardEncoding" + }, + "Times-Italic": { + "weights": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 250, + 333, + 420, + 500, + 500, + 833, + 778, + 214, + 333, + 333, + 500, + 564, + 250, + 333, + 250, + 278, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 278, + 278, + 564, + 564, + 564, + 444, + 921, + 722, + 667, + 722, + 722, + 667, + 611, + 778, + 722, + 333, + 389, + 722, + 611, + 889, + 722, + 778, + 667, + 778, + 722, + 556, + 667, + 722, + 722, + 944, + 722, + 722, + 611, + 333, + 278, + 333, + 469, + 500, + 333, + 500, + 556, + 444, + 556, + 444, + 333, + 500, + 556, + 278, + 278, + 556, + 278, + 833, + 556, + 500, + 556, + 556, + 444, + 389, + 333, + 556, + 500, + 722, + 500, + 500, + 444, + 394, + 220, + 394, + 520, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "font_bbox": [ + -168, + -218, + 1000, + 898 + ], + "ascent": 683, + "descent": 
-217, + "italic_angle": -15.0, + "cap_height": 662, + "stem_v": 51, + "encoding": "StandardEncoding" + }, + "Times-BoldItalic": { + "weights": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 250, + 389, + 555, + 500, + 500, + 833, + 778, + 422, + 389, + 389, + 500, + 570, + 250, + 333, + 250, + 278, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 333, + 333, + 570, + 570, + 570, + 500, + 808, + 778, + 722, + 778, + 778, + 722, + 667, + 833, + 778, + 389, + 500, + 778, + 667, + 944, + 778, + 833, + 722, + 833, + 778, + 667, + 778, + 778, + 778, + 1000, + 778, + 778, + 667, + 389, + 278, + 389, + 500, + 500, + 333, + 500, + 556, + 444, + 556, + 444, + 389, + 556, + 556, + 278, + 333, + 556, + 278, + 833, + 556, + 500, + 556, + 556, + 444, + 389, + 333, + 556, + 500, + 722, + 500, + 500, + 444, + 389, + 280, + 389, + 500, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "font_bbox": [ + -168, + -218, + 1000, + 935 + ], + "ascent": 683, + "descent": -217, + "italic_angle": -15.0, + "cap_height": 662, + "stem_v": 68, + "encoding": "StandardEncoding" + }, + "Helvetica": { + "weights": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, 
+ 278, + 278, + 355, + 500, + 500, + 833, + 778, + 222, + 333, + 333, + 500, + 556, + 278, + 333, + 278, + 278, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 278, + 278, + 556, + 556, + 556, + 444, + 921, + 722, + 667, + 722, + 722, + 667, + 611, + 778, + 722, + 278, + 333, + 722, + 611, + 889, + 722, + 778, + 667, + 778, + 722, + 667, + 611, + 722, + 722, + 944, + 722, + 722, + 611, + 333, + 278, + 333, + 556, + 500, + 333, + 500, + 556, + 444, + 556, + 500, + 278, + 556, + 556, + 278, + 278, + 556, + 278, + 833, + 556, + 556, + 556, + 556, + 444, + 389, + 333, + 556, + 500, + 722, + 500, + 500, + 444, + 389, + 280, + 389, + 556, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "font_bbox": [ + -166, + -225, + 1000, + 931 + ], + "ascent": 718, + "descent": -207, + "italic_angle": 0.0, + "cap_height": 718, + "stem_v": 51, + "encoding": "StandardEncoding" + }, + "Helvetica-Bold": { + "weights": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 278, + 333, + 474, + 556, + 556, + 889, + 722, + 278, + 333, + 333, + 556, + 584, + 278, + 333, + 278, + 278, + 556, + 556, + 556, + 556, + 556, + 556, + 556, + 556, + 556, + 556, + 333, + 333, + 584, + 584, + 584, + 556, + 1015, + 778, + 722, + 778, + 778, + 722, + 667, + 833, + 778, + 389, + 500, 
+ 778, + 667, + 944, + 778, + 833, + 722, + 833, + 778, + 722, + 667, + 778, + 778, + 1000, + 778, + 778, + 667, + 389, + 278, + 389, + 584, + 556, + 333, + 556, + 611, + 556, + 611, + 556, + 333, + 611, + 611, + 278, + 278, + 611, + 278, + 889, + 611, + 611, + 611, + 611, + 500, + 500, + 389, + 611, + 556, + 833, + 556, + 556, + 500, + 444, + 389, + 444, + 584, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "font_bbox": [ + -170, + -228, + 1003, + 962 + ], + "ascent": 718, + "descent": -207, + "italic_angle": 0.0, + "cap_height": 718, + "stem_v": 68, + "encoding": "StandardEncoding" + }, + "Helvetica-Oblique": { + "weights": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 278, + 278, + 355, + 500, + 500, + 833, + 778, + 222, + 333, + 333, + 500, + 556, + 278, + 333, + 278, + 278, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 278, + 278, + 556, + 556, + 556, + 444, + 921, + 722, + 667, + 722, + 722, + 667, + 611, + 778, + 722, + 278, + 333, + 722, + 611, + 889, + 722, + 778, + 667, + 778, + 722, + 667, + 611, + 722, + 722, + 944, + 722, + 722, + 611, + 333, + 278, + 333, + 556, + 500, + 333, + 500, + 556, + 444, + 556, + 500, + 278, + 556, + 556, + 278, + 278, + 556, + 278, + 833, + 556, + 556, + 556, + 556, + 444, + 389, + 333, + 
556, + 500, + 722, + 500, + 500, + 444, + 389, + 280, + 389, + 556, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "font_bbox": [ + -166, + -225, + 1000, + 931 + ], + "ascent": 718, + "descent": -207, + "italic_angle": -12.0, + "cap_height": 718, + "stem_v": 51, + "encoding": "StandardEncoding" + }, + "Helvetica-BoldOblique": { + "weights": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 278, + 333, + 474, + 556, + 556, + 889, + 722, + 278, + 333, + 333, + 556, + 584, + 278, + 333, + 278, + 278, + 556, + 556, + 556, + 556, + 556, + 556, + 556, + 556, + 556, + 556, + 333, + 333, + 584, + 584, + 584, + 556, + 1015, + 778, + 722, + 778, + 778, + 722, + 667, + 833, + 778, + 389, + 500, + 778, + 667, + 944, + 778, + 833, + 722, + 833, + 778, + 722, + 667, + 778, + 778, + 1000, + 778, + 778, + 667, + 389, + 278, + 389, + 584, + 556, + 333, + 556, + 611, + 556, + 611, + 556, + 333, + 611, + 611, + 278, + 278, + 611, + 278, + 889, + 611, + 611, + 611, + 611, + 500, + 500, + 389, + 611, + 556, + 833, + 556, + 556, + 500, + 444, + 389, + 444, + 584, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 
0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "font_bbox": [ + -170, + -228, + 1003, + 962 + ], + "ascent": 718, + "descent": -207, + "italic_angle": -12.0, + "cap_height": 718, + "stem_v": 68, + "encoding": "StandardEncoding" + }, + "Symbol": { + "weights": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 250, + 333, + 500, + 500, + 500, + 833, + 778, + 333, + 333, + 333, + 500, + 570, + 250, + 333, + 250, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 333, + 333, + 570, + 570, + 570, + 500, + 921, + 722, + 667, + 722, + 722, + 667, + 611, + 778, + 722, + 333, + 389, + 722, + 611, + 889, + 722, + 778, + 667, + 778, + 722, + 556, + 667, + 722, + 722, + 944, + 722, + 722, + 611, + 389, + 278, + 389, + 422, + 500, + 0, + 500, + 556, + 444, + 556, + 444, + 333, + 500, + 556, + 278, + 278, + 556, + 278, + 833, + 556, + 500, + 556, + 556, + 444, + 389, + 333, + 556, + 500, + 722, + 500, + 500, + 444, + 389, + 280, + 389, + 422, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 
0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "font_bbox": [ + -180, + -293, + 1090, + 1010 + ], + "ascent": 1010, + "descent": -293, + "italic_angle": 0.0, + "cap_height": 662, + "stem_v": 68, + "encoding": "SymbolEncoding" + }, + "ZapfDingbats": { + "weights": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 250, + 333, + 333, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 500, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 778, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "font_bbox": [ + -1, + -143, + 981, + 820 + ], + "ascent": 820, + "descent": -143, + "italic_angle": 0.0, + "cap_height": 820, + "stem_v": 51, + "encoding": "ZapfDingbatsEncoding" + } + 
} +} diff --git a/crates/pdftract-core/src/font/mod.rs b/crates/pdftract-core/src/font/mod.rs index f8e2cd6..6aa74bc 100644 --- a/crates/pdftract-core/src/font/mod.rs +++ b/crates/pdftract-core/src/font/mod.rs @@ -3,6 +3,8 @@ //! This module provides utilities for classifying PDF fonts by type //! and handling font subset prefixes. +pub mod std14; + use crate::parser::object::types::{PdfDict, PdfObject}; /// Font type classification. diff --git a/crates/pdftract-core/src/font/std14.rs b/crates/pdftract-core/src/font/std14.rs new file mode 100644 index 0000000..da861e8 --- /dev/null +++ b/crates/pdftract-core/src/font/std14.rs @@ -0,0 +1,200 @@ +//! Standard 14 font metrics registry. +//! +//! This module provides compile-time metrics for the 14 Adobe Standard fonts +//! as defined in PDF 1.7. When a font is classified as `Type1Std14`, all +//! metric lookups come from this registry without embedding a font program. + +include!(concat!(env!("OUT_DIR"), "/std14_registry.rs")); + +/// Named encoding for Standard 14 fonts. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum NamedEncoding { + /// StandardEncoding (most Standard 14 fonts) + Standard, + /// SymbolEncoding (Symbol font) + Symbol, + /// ZapfDingbatsEncoding (ZapfDingbats font) + ZapfDingbats, +} + +/// AFM-derived metrics for a Standard 14 font. +/// +/// These metrics are compiled into the binary from Adobe's public AFM files +/// for the Core 14 fonts. Widths are indexed by character code (not glyph ID). 
+pub struct Std14Metrics { + /// Character widths indexed by character code (0-255) + pub widths: &'static [u16; 256], + /// Font ascent (typographic ascent from AFM) + pub ascent: i16, + /// Font descent (typographic descent from AFM, typically negative) + pub descent: i16, + /// Italic angle in degrees (negative = oblique to the right) + pub italic_angle: f32, + /// Font bounding box [llx, lly, urx, ury] in font units + pub font_bbox: [i16; 4], + /// Cap height (height of uppercase H from baseline) + pub cap_height: i16, + /// StemV (vertical stem width for PDF font dictionaries) + pub stem_v: i16, + /// Named encoding type + pub encoding: NamedEncoding, +} + +impl Std14Metrics { + /// Get the width for a character code. + /// + /// Every `u8` value indexes the 256-entry table, so this lookup + /// cannot go out of bounds or panic. + pub fn char_width(&self, code: u8) -> u16 { + self.widths[code as usize] + } + + /// Get the width for a 16-bit character code. + /// + /// Standard 14 fonts use single-byte encodings, so codes >= 256 + /// return the width of code 0 (typically undefined).
+ pub fn char_width_16(&self, code: u16) -> u16 { + if code < 256 { + self.widths[code as usize] + } else { + self.widths[0] + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_lookup_all_14_fonts() { + let fonts = [ + "Courier", + "Courier-Bold", + "Courier-Oblique", + "Courier-BoldOblique", + "Times-Roman", + "Times-Bold", + "Times-Italic", + "Times-BoldItalic", + "Helvetica", + "Helvetica-Bold", + "Helvetica-Oblique", + "Helvetica-BoldOblique", + "Symbol", + "ZapfDingbats", + ]; + + for font in fonts { + let metrics = get_std14_metrics(font); + assert!(metrics.is_some(), "Font {} not found in registry", font); + let m = metrics.unwrap(); + assert_eq!(m.widths.len(), 256, "{}: widths array length", font); + } + } + + #[test] + fn test_subset_prefix_resolution() { + // Test that subset-prefixed names resolve after stripping + use super::super::strip_subset_prefix; + + let prefixed = "ABCDEF+Times-Roman"; + let stripped = strip_subset_prefix(prefixed); + let metrics = get_std14_metrics(stripped); + assert!(metrics.is_some(), "Subset-prefixed font not found"); + } + + #[test] + fn test_char_width() { + let metrics = get_std14_metrics("Times-Roman").unwrap(); + + // Space (code 32) should have a non-zero width + assert!(metrics.char_width(32) > 0, "Space width should be > 0"); + + // Courier is monospace - all printable chars should have same width + let courier = get_std14_metrics("Courier").unwrap(); + let width_65 = courier.char_width(65); // 'A' + let width_66 = courier.char_width(66); // 'B' + assert_eq!(width_65, width_66, "Courier should be monospace"); + assert_eq!(width_65, 600, "Courier glyph width should be 600"); + } + + #[test] + fn test_symbol_font_encoding() { + let metrics = get_std14_metrics("Symbol").unwrap(); + assert_eq!(metrics.encoding, NamedEncoding::Symbol); + } + + #[test] + fn test_zapfdingbats_font_encoding() { + let metrics = get_std14_metrics("ZapfDingbats").unwrap(); + assert_eq!(metrics.encoding, 
NamedEncoding::ZapfDingbats); + } + + #[test] + fn test_helvetica_metrics() { + let metrics = get_std14_metrics("Helvetica").unwrap(); + + // Helvetica from Adobe AFM + assert_eq!(metrics.ascent, 718); + assert_eq!(metrics.descent, -207); + assert_eq!(metrics.italic_angle, 0.0); + assert_eq!(metrics.cap_height, 718); + assert_eq!(metrics.stem_v, 51); + } + + #[test] + fn test_courier_monospace() { + let fonts = [ + "Courier", + "Courier-Bold", + "Courier-Oblique", + "Courier-BoldOblique", + ]; + + for font in fonts { + let metrics = get_std14_metrics(font).unwrap(); + // All Courier variants are monospace at 600 units + for code in 32..127 { + let w = metrics.char_width(code); + assert_eq!(w, 600, "{}: code {} should be 600 wide", font, code); + } + } + } + + #[test] + fn test_italic_angles() { + let regular = get_std14_metrics("Helvetica").unwrap(); + let oblique = get_std14_metrics("Helvetica-Oblique").unwrap(); + let bold_oblique = get_std14_metrics("Helvetica-BoldOblique").unwrap(); + + assert_eq!(regular.italic_angle, 0.0); + assert_eq!(oblique.italic_angle, -12.0); + assert_eq!(bold_oblique.italic_angle, -12.0); + } + + #[test] + fn test_font_bbox() { + let times = get_std14_metrics("Times-Roman").unwrap(); + // From Adobe Times-Roman AFM: FontBBox -168 -218 1000 898 + assert_eq!(times.font_bbox, [-168, -218, 1000, 898]); + } + + #[test] + fn test_invalid_font_returns_none() { + let metrics = get_std14_metrics("NonExistentFont"); + assert!(metrics.is_none()); + } + + #[test] + fn test_char_width_16() { + let metrics = get_std14_metrics("Times-Roman").unwrap(); + + // Valid single-byte code + assert!(metrics.char_width_16(65) > 0); + + // Code >= 256 returns width of code 0 for Standard 14 + let w = metrics.char_width_16(256); + assert_eq!(w, metrics.widths[0]); + } +} diff --git a/crates/pdftract-core/src/lib.rs b/crates/pdftract-core/src/lib.rs index cb675fc..fc85763 100644 --- a/crates/pdftract-core/src/lib.rs +++ b/crates/pdftract-core/src/lib.rs @@ -5,6 
+5,7 @@ //! text extraction engines. pub mod cache; +pub mod classify; pub mod diagnostics; pub mod document; pub mod extract; @@ -19,6 +20,7 @@ pub mod semaphore; // Re-export key types for convenience pub use document::{PdfExtractor, PageIter, PageExtraction}; pub use extract::{extract_pdf, extract_pdf_ndjson, ExtractionResult, PageResult, ExtractionMetadata}; +pub use font::std14::{Std14Metrics, NamedEncoding, get_std14_metrics}; pub use options::{ExtractionOptions, ReceiptsMode}; pub use parser::pages::{LazyPageIter, PageDict, DEFAULT_MEDIABOX, count_pages_tree}; pub use schema::{SpanJson, BlockJson}; diff --git a/crates/pdftract-libpdftract/include/pdftract.h b/crates/pdftract-libpdftract/include/pdftract.h index 20ab806..3c9ae6a 100644 --- a/crates/pdftract-libpdftract/include/pdftract.h +++ b/crates/pdftract-libpdftract/include/pdftract.h @@ -95,6 +95,12 @@ char *pdftract_extract_markdown(const char *source, * Returns an opaque handle that can be used with pdftract_stream_next() * to iterate through pages one at a time. When done, call pdftract_stream_close(). * + * # Memory Efficiency + * + * This function does NOT materialize all pages. It creates a PdfExtractor + * that will extract each page on-demand when pdftract_stream_next() is called. + * This ensures memory usage stays bounded regardless of document size. + * * # Arguments * * * `source` - Path to the PDF file (null-terminated UTF-8 string) @@ -215,6 +221,13 @@ void pdftract_stream_close(void *handle); /** * Get the next page from a streaming extraction session. * + * # Memory Efficiency + * + * This function extracts one page at a time on-demand. The page's + * content streams are decoded, the result is serialized to JSON, + * and then all page data is dropped before returning. This ensures + * memory usage stays bounded. + * * # Arguments * * * `handle` - Opaque handle from pdftract_extract_stream_open()