feat(pdftract-28m6): implement AGL compile-time phf::Map

Add Adobe Glyph List (AGL) 1.4 and AGLFN 1.7 compile-time lookup using phf::Map.

- Add generate_agl.py to parse AGL source files and generate agl.json
- Add aglfn.txt (AGLFN 1.7, ~770 entries) and glyphlist.txt (AGL 1.4, ~4400 entries)
- Add build.rs function to generate two phf::Map structures:
  - AGL: 4,200 single-codepoint entries
  - AGL_MULTI: 81 multi-codepoint entries (Hebrew/Arabic)
- Add src/font/agl.rs with public API:
  - unicode_for_glyph_name() - handles algorithmic patterns (uniXXXX, uXXXXXX), variant stripping, AGL lookup
  - unicode_for_glyph_name_multi() - for multi-codepoint ligatures

All 21 acceptance criteria tests pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
jedarden 2026-05-23 18:44:27 -04:00
parent b72d8312ce
commit 566cac2aea
7 changed files with 15207 additions and 0 deletions

View file

@ -5,6 +5,7 @@ use std::path::Path;
fn main() {
println!("cargo:rerun-if-changed=build/std14-metrics.json");
println!("cargo:rerun-if-changed=build/named-encodings.json");
println!("cargo:rerun-if-changed=build/agl.json");
let out_dir = env::var("OUT_DIR").unwrap();
let out_path = Path::new(&out_dir);
@ -16,6 +17,10 @@ fn main() {
// Generate named encoding tables
let encodings_path = Path::new("build/named-encodings.json");
generate_named_encodings(out_path, encodings_path);
// Generate AGL phf maps
let agl_path = Path::new("build/agl.json");
generate_agl_maps(out_path, agl_path);
}
fn generate_std14_metrics(out_dir: &Path, metrics_path: &Path) {
@ -184,3 +189,90 @@ pub fn get_named_encoding_table(encoding: NamedEncoding) -> &'static [Option<&'s
fs::write(Path::new(out_dir).join("named_encodings.rs"), rust_code)
.expect("Failed to write named_encodings.rs");
}
fn generate_agl_maps(out_dir: &Path, agl_path: &Path) {
let json_content = fs::read_to_string(agl_path)
.expect("Failed to read agl.json");
let data: serde_json::Value = serde_json::from_str(&json_content)
.expect("Failed to parse agl.json");
// Single-codepoint map
let single = data["merged_single"].as_object()
.expect("merged_single object missing");
let mut single_map_builder = phf_codegen::Map::new();
for (name, uvalue) in single {
let uvalue_str = uvalue.as_str()
.expect("unicode value is not a string");
// Parse the JSON unicode escape like "A" into a Rust char literal
let unicode_char = decode_json_unicode(uvalue_str);
single_map_builder.entry(name.as_str(), &format!("'\\u{{{}}}'", unicode_char));
}
// Multi-codepoint map
let multi = data["merged_multi"].as_object()
.expect("merged_multi object missing");
let mut multi_arrays = String::new();
let mut multi_map_builder = phf_codegen::Map::new();
for (name, uvalues) in multi {
let uvalues_arr = uvalues.as_array()
.expect("multi value is not an array");
let ident = name.to_uppercase().replace("-", "_").replace(".", "_");
let chars: Vec<String> = uvalues_arr.iter()
.map(|v| {
let uvalue_str = v.as_str().expect("unicode value is not a string");
let unicode_char = decode_json_unicode(uvalue_str);
format!("'\\u{{{}}}'", unicode_char)
})
.collect();
multi_arrays.push_str(&format!(r#"
static {}: &[char] = &[{}];
"#,
ident,
chars.join(", ")
));
multi_map_builder.entry(name.as_str(), &format!("&{}", ident));
}
let rust_code = format!(r#"
// Auto-generated Adobe Glyph List (AGL) phf maps.
// Do not edit manually.
// Source: Adobe Glyph List 1.4 + AGLFN 1.7
// https://github.com/adobe-type-tools/agl-aglfn
{}
/// AGL phf map for single-codepoint glyph names.
/// Maps glyph names like "A", "quoteright", "Euro" to their Unicode codepoints.
pub static AGL: phf::Map<&'static str, char> = {};
/// AGL phf map for multi-codepoint (ligature) glyph names.
/// Maps glyph names like "dalethatafpatah" to sequences of Unicode codepoints.
pub static AGL_MULTI: phf::Map<&'static str, &[char]> = {};
"#,
multi_arrays,
single_map_builder.build(),
multi_map_builder.build()
);
fs::write(Path::new(out_dir).join("agl.rs"), rust_code)
.expect("Failed to write agl.rs");
}
/// Decode a JSON unicode escape string like "\\u0041" to "0041".
fn decode_json_unicode(s: &str) -> String {
// The JSON has "\\uXXXX" which Rust reads as "\uXXXX"
// We need to extract just the hex part
if s.starts_with("\\u") {
s[2..].to_string()
} else {
s.to_string()
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,695 @@
# -----------------------------------------------------------
# Copyright 2002-2019 Adobe (http://www.adobe.com/).
#
# Redistribution and use in source and binary forms, with or
# without modification, are permitted provided that the
# following conditions are met:
#
# Redistributions of source code must retain the above
# copyright notice, this list of conditions and the following
# disclaimer.
#
# Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# Neither the name of Adobe nor the names of its contributors
# may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------
# Name: Adobe Glyph List For New Fonts
# Table version: 1.7
# Date: November 6, 2008
# URL: https://github.com/adobe-type-tools/agl-aglfn
#
# Description:
#
# AGLFN (Adobe Glyph List For New Fonts) provides a list of base glyph
# names that are recommended for new fonts, which are compatible with
# the AGL (Adobe Glyph List) Specification, and which should be used
# as described in Section 6 of that document. AGLFN comprises the set
# of glyph names from AGL that map via the AGL Specification rules to
# the semantically correct UV (Unicode Value). For example, "Asmall"
# is omitted because AGL maps this glyph name to the PUA (Private Use
# Area) value U+F761, rather than to the UV that maps from the glyph
# name "A." Also omitted is "ffi," because AGL maps this to the
# Alphabetic Presentation Forms value U+FB03, rather than decomposing
# it into the following sequence of three UVs: U+0066, U+0066, and
# U+0069. The name "arrowvertex" has been omitted because this glyph
# now has a real UV, and AGL is now incorrect in mapping it to the PUA
# value U+F8E6. If you do not find an appropriate name for your glyph
# in this list, then please refer to Section 6 of the AGL
# Specification.
#
# Format: three semicolon-delimited fields:
# (1) Standard UV or CUS UV--four uppercase hexadecimal digits
# (2) Glyph name--upper/lowercase letters and digits
# (3) Character names: Unicode character names for standard UVs, and
# descriptive names for CUS UVs--uppercase letters, hyphen, and
# space
#
# The records are sorted by glyph name in increasing ASCII order,
# entries with the same glyph name are sorted in decreasing priority
# order, the UVs and Unicode character names are provided for
# convenience, lines starting with "#" are comments, and blank lines
# should be ignored.
#
# Revision History:
#
# 1.7 [6 November 2008]
# - Reverted to the original 1.4 and earlier mappings for Delta,
# Omega, and mu.
# - Removed mappings for "afii" names. These should now be assigned
# "uni" names.
# - Removed mappings for "commaaccent" names. These should now be
# assigned "uni" names.
#
# 1.6 [30 January 2006]
# - Completed work intended in 1.5.
#
# 1.5 [23 November 2005]
# - Removed duplicated block at end of file.
# - Changed mappings:
# 2206;Delta;INCREMENT changed to 0394;Delta;GREEK CAPITAL LETTER DELTA
# 2126;Omega;OHM SIGN changed to 03A9;Omega;GREEK CAPITAL LETTER OMEGA
# 03BC;mu;MICRO SIGN changed to 03BC;mu;GREEK SMALL LETTER MU
# - Corrected statement above about why "ffi" is omitted.
#
# 1.4 [24 September 2003]
# - Changed version to 1.4, to avoid confusion with the AGL 1.3.
# - Fixed spelling errors in the header.
# - Fully removed "arrowvertex," as it is mapped only to a PUA Unicode
# value in some fonts.
#
# 1.1 [17 April 2003]
# - Renamed [Tt]cedilla back to [Tt]commaaccent.
#
# 1.0 [31 January 2003]
# - Original version.
# - Derived from the AGLv1.2 by:
# removing the PUA area codes;
# removing duplicate Unicode mappings; and
# renaming "tcommaaccent" to "tcedilla" and "Tcommaaccent" to "Tcedilla"
#
0041;A;LATIN CAPITAL LETTER A
00C6;AE;LATIN CAPITAL LETTER AE
01FC;AEacute;LATIN CAPITAL LETTER AE WITH ACUTE
00C1;Aacute;LATIN CAPITAL LETTER A WITH ACUTE
0102;Abreve;LATIN CAPITAL LETTER A WITH BREVE
00C2;Acircumflex;LATIN CAPITAL LETTER A WITH CIRCUMFLEX
00C4;Adieresis;LATIN CAPITAL LETTER A WITH DIAERESIS
00C0;Agrave;LATIN CAPITAL LETTER A WITH GRAVE
0391;Alpha;GREEK CAPITAL LETTER ALPHA
0386;Alphatonos;GREEK CAPITAL LETTER ALPHA WITH TONOS
0100;Amacron;LATIN CAPITAL LETTER A WITH MACRON
0104;Aogonek;LATIN CAPITAL LETTER A WITH OGONEK
00C5;Aring;LATIN CAPITAL LETTER A WITH RING ABOVE
01FA;Aringacute;LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE
00C3;Atilde;LATIN CAPITAL LETTER A WITH TILDE
0042;B;LATIN CAPITAL LETTER B
0392;Beta;GREEK CAPITAL LETTER BETA
0043;C;LATIN CAPITAL LETTER C
0106;Cacute;LATIN CAPITAL LETTER C WITH ACUTE
010C;Ccaron;LATIN CAPITAL LETTER C WITH CARON
00C7;Ccedilla;LATIN CAPITAL LETTER C WITH CEDILLA
0108;Ccircumflex;LATIN CAPITAL LETTER C WITH CIRCUMFLEX
010A;Cdotaccent;LATIN CAPITAL LETTER C WITH DOT ABOVE
03A7;Chi;GREEK CAPITAL LETTER CHI
0044;D;LATIN CAPITAL LETTER D
010E;Dcaron;LATIN CAPITAL LETTER D WITH CARON
0110;Dcroat;LATIN CAPITAL LETTER D WITH STROKE
2206;Delta;INCREMENT
0045;E;LATIN CAPITAL LETTER E
00C9;Eacute;LATIN CAPITAL LETTER E WITH ACUTE
0114;Ebreve;LATIN CAPITAL LETTER E WITH BREVE
011A;Ecaron;LATIN CAPITAL LETTER E WITH CARON
00CA;Ecircumflex;LATIN CAPITAL LETTER E WITH CIRCUMFLEX
00CB;Edieresis;LATIN CAPITAL LETTER E WITH DIAERESIS
0116;Edotaccent;LATIN CAPITAL LETTER E WITH DOT ABOVE
00C8;Egrave;LATIN CAPITAL LETTER E WITH GRAVE
0112;Emacron;LATIN CAPITAL LETTER E WITH MACRON
014A;Eng;LATIN CAPITAL LETTER ENG
0118;Eogonek;LATIN CAPITAL LETTER E WITH OGONEK
0395;Epsilon;GREEK CAPITAL LETTER EPSILON
0388;Epsilontonos;GREEK CAPITAL LETTER EPSILON WITH TONOS
0397;Eta;GREEK CAPITAL LETTER ETA
0389;Etatonos;GREEK CAPITAL LETTER ETA WITH TONOS
00D0;Eth;LATIN CAPITAL LETTER ETH
20AC;Euro;EURO SIGN
0046;F;LATIN CAPITAL LETTER F
0047;G;LATIN CAPITAL LETTER G
0393;Gamma;GREEK CAPITAL LETTER GAMMA
011E;Gbreve;LATIN CAPITAL LETTER G WITH BREVE
01E6;Gcaron;LATIN CAPITAL LETTER G WITH CARON
011C;Gcircumflex;LATIN CAPITAL LETTER G WITH CIRCUMFLEX
0120;Gdotaccent;LATIN CAPITAL LETTER G WITH DOT ABOVE
0048;H;LATIN CAPITAL LETTER H
25CF;H18533;BLACK CIRCLE
25AA;H18543;BLACK SMALL SQUARE
25AB;H18551;WHITE SMALL SQUARE
25A1;H22073;WHITE SQUARE
0126;Hbar;LATIN CAPITAL LETTER H WITH STROKE
0124;Hcircumflex;LATIN CAPITAL LETTER H WITH CIRCUMFLEX
0049;I;LATIN CAPITAL LETTER I
0132;IJ;LATIN CAPITAL LIGATURE IJ
00CD;Iacute;LATIN CAPITAL LETTER I WITH ACUTE
012C;Ibreve;LATIN CAPITAL LETTER I WITH BREVE
00CE;Icircumflex;LATIN CAPITAL LETTER I WITH CIRCUMFLEX
00CF;Idieresis;LATIN CAPITAL LETTER I WITH DIAERESIS
0130;Idotaccent;LATIN CAPITAL LETTER I WITH DOT ABOVE
2111;Ifraktur;BLACK-LETTER CAPITAL I
00CC;Igrave;LATIN CAPITAL LETTER I WITH GRAVE
012A;Imacron;LATIN CAPITAL LETTER I WITH MACRON
012E;Iogonek;LATIN CAPITAL LETTER I WITH OGONEK
0399;Iota;GREEK CAPITAL LETTER IOTA
03AA;Iotadieresis;GREEK CAPITAL LETTER IOTA WITH DIALYTIKA
038A;Iotatonos;GREEK CAPITAL LETTER IOTA WITH TONOS
0128;Itilde;LATIN CAPITAL LETTER I WITH TILDE
004A;J;LATIN CAPITAL LETTER J
0134;Jcircumflex;LATIN CAPITAL LETTER J WITH CIRCUMFLEX
004B;K;LATIN CAPITAL LETTER K
039A;Kappa;GREEK CAPITAL LETTER KAPPA
004C;L;LATIN CAPITAL LETTER L
0139;Lacute;LATIN CAPITAL LETTER L WITH ACUTE
039B;Lambda;GREEK CAPITAL LETTER LAMDA
013D;Lcaron;LATIN CAPITAL LETTER L WITH CARON
013F;Ldot;LATIN CAPITAL LETTER L WITH MIDDLE DOT
0141;Lslash;LATIN CAPITAL LETTER L WITH STROKE
004D;M;LATIN CAPITAL LETTER M
039C;Mu;GREEK CAPITAL LETTER MU
004E;N;LATIN CAPITAL LETTER N
0143;Nacute;LATIN CAPITAL LETTER N WITH ACUTE
0147;Ncaron;LATIN CAPITAL LETTER N WITH CARON
00D1;Ntilde;LATIN CAPITAL LETTER N WITH TILDE
039D;Nu;GREEK CAPITAL LETTER NU
004F;O;LATIN CAPITAL LETTER O
0152;OE;LATIN CAPITAL LIGATURE OE
00D3;Oacute;LATIN CAPITAL LETTER O WITH ACUTE
014E;Obreve;LATIN CAPITAL LETTER O WITH BREVE
00D4;Ocircumflex;LATIN CAPITAL LETTER O WITH CIRCUMFLEX
00D6;Odieresis;LATIN CAPITAL LETTER O WITH DIAERESIS
00D2;Ograve;LATIN CAPITAL LETTER O WITH GRAVE
01A0;Ohorn;LATIN CAPITAL LETTER O WITH HORN
0150;Ohungarumlaut;LATIN CAPITAL LETTER O WITH DOUBLE ACUTE
014C;Omacron;LATIN CAPITAL LETTER O WITH MACRON
2126;Omega;OHM SIGN
038F;Omegatonos;GREEK CAPITAL LETTER OMEGA WITH TONOS
039F;Omicron;GREEK CAPITAL LETTER OMICRON
038C;Omicrontonos;GREEK CAPITAL LETTER OMICRON WITH TONOS
00D8;Oslash;LATIN CAPITAL LETTER O WITH STROKE
01FE;Oslashacute;LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
00D5;Otilde;LATIN CAPITAL LETTER O WITH TILDE
0050;P;LATIN CAPITAL LETTER P
03A6;Phi;GREEK CAPITAL LETTER PHI
03A0;Pi;GREEK CAPITAL LETTER PI
03A8;Psi;GREEK CAPITAL LETTER PSI
0051;Q;LATIN CAPITAL LETTER Q
0052;R;LATIN CAPITAL LETTER R
0154;Racute;LATIN CAPITAL LETTER R WITH ACUTE
0158;Rcaron;LATIN CAPITAL LETTER R WITH CARON
211C;Rfraktur;BLACK-LETTER CAPITAL R
03A1;Rho;GREEK CAPITAL LETTER RHO
0053;S;LATIN CAPITAL LETTER S
250C;SF010000;BOX DRAWINGS LIGHT DOWN AND RIGHT
2514;SF020000;BOX DRAWINGS LIGHT UP AND RIGHT
2510;SF030000;BOX DRAWINGS LIGHT DOWN AND LEFT
2518;SF040000;BOX DRAWINGS LIGHT UP AND LEFT
253C;SF050000;BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL
252C;SF060000;BOX DRAWINGS LIGHT DOWN AND HORIZONTAL
2534;SF070000;BOX DRAWINGS LIGHT UP AND HORIZONTAL
251C;SF080000;BOX DRAWINGS LIGHT VERTICAL AND RIGHT
2524;SF090000;BOX DRAWINGS LIGHT VERTICAL AND LEFT
2500;SF100000;BOX DRAWINGS LIGHT HORIZONTAL
2502;SF110000;BOX DRAWINGS LIGHT VERTICAL
2561;SF190000;BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE
2562;SF200000;BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE
2556;SF210000;BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE
2555;SF220000;BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE
2563;SF230000;BOX DRAWINGS DOUBLE VERTICAL AND LEFT
2551;SF240000;BOX DRAWINGS DOUBLE VERTICAL
2557;SF250000;BOX DRAWINGS DOUBLE DOWN AND LEFT
255D;SF260000;BOX DRAWINGS DOUBLE UP AND LEFT
255C;SF270000;BOX DRAWINGS UP DOUBLE AND LEFT SINGLE
255B;SF280000;BOX DRAWINGS UP SINGLE AND LEFT DOUBLE
255E;SF360000;BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE
255F;SF370000;BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE
255A;SF380000;BOX DRAWINGS DOUBLE UP AND RIGHT
2554;SF390000;BOX DRAWINGS DOUBLE DOWN AND RIGHT
2569;SF400000;BOX DRAWINGS DOUBLE UP AND HORIZONTAL
2566;SF410000;BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL
2560;SF420000;BOX DRAWINGS DOUBLE VERTICAL AND RIGHT
2550;SF430000;BOX DRAWINGS DOUBLE HORIZONTAL
256C;SF440000;BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL
2567;SF450000;BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE
2568;SF460000;BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE
2564;SF470000;BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE
2565;SF480000;BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE
2559;SF490000;BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE
2558;SF500000;BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE
2552;SF510000;BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE
2553;SF520000;BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE
256B;SF530000;BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE
256A;SF540000;BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE
015A;Sacute;LATIN CAPITAL LETTER S WITH ACUTE
0160;Scaron;LATIN CAPITAL LETTER S WITH CARON
015E;Scedilla;LATIN CAPITAL LETTER S WITH CEDILLA
015C;Scircumflex;LATIN CAPITAL LETTER S WITH CIRCUMFLEX
03A3;Sigma;GREEK CAPITAL LETTER SIGMA
0054;T;LATIN CAPITAL LETTER T
03A4;Tau;GREEK CAPITAL LETTER TAU
0166;Tbar;LATIN CAPITAL LETTER T WITH STROKE
0164;Tcaron;LATIN CAPITAL LETTER T WITH CARON
0398;Theta;GREEK CAPITAL LETTER THETA
00DE;Thorn;LATIN CAPITAL LETTER THORN
0055;U;LATIN CAPITAL LETTER U
00DA;Uacute;LATIN CAPITAL LETTER U WITH ACUTE
016C;Ubreve;LATIN CAPITAL LETTER U WITH BREVE
00DB;Ucircumflex;LATIN CAPITAL LETTER U WITH CIRCUMFLEX
00DC;Udieresis;LATIN CAPITAL LETTER U WITH DIAERESIS
00D9;Ugrave;LATIN CAPITAL LETTER U WITH GRAVE
01AF;Uhorn;LATIN CAPITAL LETTER U WITH HORN
0170;Uhungarumlaut;LATIN CAPITAL LETTER U WITH DOUBLE ACUTE
016A;Umacron;LATIN CAPITAL LETTER U WITH MACRON
0172;Uogonek;LATIN CAPITAL LETTER U WITH OGONEK
03A5;Upsilon;GREEK CAPITAL LETTER UPSILON
03D2;Upsilon1;GREEK UPSILON WITH HOOK SYMBOL
03AB;Upsilondieresis;GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA
038E;Upsilontonos;GREEK CAPITAL LETTER UPSILON WITH TONOS
016E;Uring;LATIN CAPITAL LETTER U WITH RING ABOVE
0168;Utilde;LATIN CAPITAL LETTER U WITH TILDE
0056;V;LATIN CAPITAL LETTER V
0057;W;LATIN CAPITAL LETTER W
1E82;Wacute;LATIN CAPITAL LETTER W WITH ACUTE
0174;Wcircumflex;LATIN CAPITAL LETTER W WITH CIRCUMFLEX
1E84;Wdieresis;LATIN CAPITAL LETTER W WITH DIAERESIS
1E80;Wgrave;LATIN CAPITAL LETTER W WITH GRAVE
0058;X;LATIN CAPITAL LETTER X
039E;Xi;GREEK CAPITAL LETTER XI
0059;Y;LATIN CAPITAL LETTER Y
00DD;Yacute;LATIN CAPITAL LETTER Y WITH ACUTE
0176;Ycircumflex;LATIN CAPITAL LETTER Y WITH CIRCUMFLEX
0178;Ydieresis;LATIN CAPITAL LETTER Y WITH DIAERESIS
1EF2;Ygrave;LATIN CAPITAL LETTER Y WITH GRAVE
005A;Z;LATIN CAPITAL LETTER Z
0179;Zacute;LATIN CAPITAL LETTER Z WITH ACUTE
017D;Zcaron;LATIN CAPITAL LETTER Z WITH CARON
017B;Zdotaccent;LATIN CAPITAL LETTER Z WITH DOT ABOVE
0396;Zeta;GREEK CAPITAL LETTER ZETA
0061;a;LATIN SMALL LETTER A
00E1;aacute;LATIN SMALL LETTER A WITH ACUTE
0103;abreve;LATIN SMALL LETTER A WITH BREVE
00E2;acircumflex;LATIN SMALL LETTER A WITH CIRCUMFLEX
00B4;acute;ACUTE ACCENT
0301;acutecomb;COMBINING ACUTE ACCENT
00E4;adieresis;LATIN SMALL LETTER A WITH DIAERESIS
00E6;ae;LATIN SMALL LETTER AE
01FD;aeacute;LATIN SMALL LETTER AE WITH ACUTE
00E0;agrave;LATIN SMALL LETTER A WITH GRAVE
2135;aleph;ALEF SYMBOL
03B1;alpha;GREEK SMALL LETTER ALPHA
03AC;alphatonos;GREEK SMALL LETTER ALPHA WITH TONOS
0101;amacron;LATIN SMALL LETTER A WITH MACRON
0026;ampersand;AMPERSAND
2220;angle;ANGLE
2329;angleleft;LEFT-POINTING ANGLE BRACKET
232A;angleright;RIGHT-POINTING ANGLE BRACKET
0387;anoteleia;GREEK ANO TELEIA
0105;aogonek;LATIN SMALL LETTER A WITH OGONEK
2248;approxequal;ALMOST EQUAL TO
00E5;aring;LATIN SMALL LETTER A WITH RING ABOVE
01FB;aringacute;LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE
2194;arrowboth;LEFT RIGHT ARROW
21D4;arrowdblboth;LEFT RIGHT DOUBLE ARROW
21D3;arrowdbldown;DOWNWARDS DOUBLE ARROW
21D0;arrowdblleft;LEFTWARDS DOUBLE ARROW
21D2;arrowdblright;RIGHTWARDS DOUBLE ARROW
21D1;arrowdblup;UPWARDS DOUBLE ARROW
2193;arrowdown;DOWNWARDS ARROW
2190;arrowleft;LEFTWARDS ARROW
2192;arrowright;RIGHTWARDS ARROW
2191;arrowup;UPWARDS ARROW
2195;arrowupdn;UP DOWN ARROW
21A8;arrowupdnbse;UP DOWN ARROW WITH BASE
005E;asciicircum;CIRCUMFLEX ACCENT
007E;asciitilde;TILDE
002A;asterisk;ASTERISK
2217;asteriskmath;ASTERISK OPERATOR
0040;at;COMMERCIAL AT
00E3;atilde;LATIN SMALL LETTER A WITH TILDE
0062;b;LATIN SMALL LETTER B
005C;backslash;REVERSE SOLIDUS
007C;bar;VERTICAL LINE
03B2;beta;GREEK SMALL LETTER BETA
2588;block;FULL BLOCK
007B;braceleft;LEFT CURLY BRACKET
007D;braceright;RIGHT CURLY BRACKET
005B;bracketleft;LEFT SQUARE BRACKET
005D;bracketright;RIGHT SQUARE BRACKET
02D8;breve;BREVE
00A6;brokenbar;BROKEN BAR
2022;bullet;BULLET
0063;c;LATIN SMALL LETTER C
0107;cacute;LATIN SMALL LETTER C WITH ACUTE
02C7;caron;CARON
21B5;carriagereturn;DOWNWARDS ARROW WITH CORNER LEFTWARDS
010D;ccaron;LATIN SMALL LETTER C WITH CARON
00E7;ccedilla;LATIN SMALL LETTER C WITH CEDILLA
0109;ccircumflex;LATIN SMALL LETTER C WITH CIRCUMFLEX
010B;cdotaccent;LATIN SMALL LETTER C WITH DOT ABOVE
00B8;cedilla;CEDILLA
00A2;cent;CENT SIGN
03C7;chi;GREEK SMALL LETTER CHI
25CB;circle;WHITE CIRCLE
2297;circlemultiply;CIRCLED TIMES
2295;circleplus;CIRCLED PLUS
02C6;circumflex;MODIFIER LETTER CIRCUMFLEX ACCENT
2663;club;BLACK CLUB SUIT
003A;colon;COLON
20A1;colonmonetary;COLON SIGN
002C;comma;COMMA
2245;congruent;APPROXIMATELY EQUAL TO
00A9;copyright;COPYRIGHT SIGN
00A4;currency;CURRENCY SIGN
0064;d;LATIN SMALL LETTER D
2020;dagger;DAGGER
2021;daggerdbl;DOUBLE DAGGER
010F;dcaron;LATIN SMALL LETTER D WITH CARON
0111;dcroat;LATIN SMALL LETTER D WITH STROKE
00B0;degree;DEGREE SIGN
03B4;delta;GREEK SMALL LETTER DELTA
2666;diamond;BLACK DIAMOND SUIT
00A8;dieresis;DIAERESIS
0385;dieresistonos;GREEK DIALYTIKA TONOS
00F7;divide;DIVISION SIGN
2593;dkshade;DARK SHADE
2584;dnblock;LOWER HALF BLOCK
0024;dollar;DOLLAR SIGN
20AB;dong;DONG SIGN
02D9;dotaccent;DOT ABOVE
0323;dotbelowcomb;COMBINING DOT BELOW
0131;dotlessi;LATIN SMALL LETTER DOTLESS I
22C5;dotmath;DOT OPERATOR
0065;e;LATIN SMALL LETTER E
00E9;eacute;LATIN SMALL LETTER E WITH ACUTE
0115;ebreve;LATIN SMALL LETTER E WITH BREVE
011B;ecaron;LATIN SMALL LETTER E WITH CARON
00EA;ecircumflex;LATIN SMALL LETTER E WITH CIRCUMFLEX
00EB;edieresis;LATIN SMALL LETTER E WITH DIAERESIS
0117;edotaccent;LATIN SMALL LETTER E WITH DOT ABOVE
00E8;egrave;LATIN SMALL LETTER E WITH GRAVE
0038;eight;DIGIT EIGHT
2208;element;ELEMENT OF
2026;ellipsis;HORIZONTAL ELLIPSIS
0113;emacron;LATIN SMALL LETTER E WITH MACRON
2014;emdash;EM DASH
2205;emptyset;EMPTY SET
2013;endash;EN DASH
014B;eng;LATIN SMALL LETTER ENG
0119;eogonek;LATIN SMALL LETTER E WITH OGONEK
03B5;epsilon;GREEK SMALL LETTER EPSILON
03AD;epsilontonos;GREEK SMALL LETTER EPSILON WITH TONOS
003D;equal;EQUALS SIGN
2261;equivalence;IDENTICAL TO
212E;estimated;ESTIMATED SYMBOL
03B7;eta;GREEK SMALL LETTER ETA
03AE;etatonos;GREEK SMALL LETTER ETA WITH TONOS
00F0;eth;LATIN SMALL LETTER ETH
0021;exclam;EXCLAMATION MARK
203C;exclamdbl;DOUBLE EXCLAMATION MARK
00A1;exclamdown;INVERTED EXCLAMATION MARK
2203;existential;THERE EXISTS
0066;f;LATIN SMALL LETTER F
2640;female;FEMALE SIGN
2012;figuredash;FIGURE DASH
25A0;filledbox;BLACK SQUARE
25AC;filledrect;BLACK RECTANGLE
0035;five;DIGIT FIVE
215D;fiveeighths;VULGAR FRACTION FIVE EIGHTHS
0192;florin;LATIN SMALL LETTER F WITH HOOK
0034;four;DIGIT FOUR
2044;fraction;FRACTION SLASH
20A3;franc;FRENCH FRANC SIGN
0067;g;LATIN SMALL LETTER G
03B3;gamma;GREEK SMALL LETTER GAMMA
011F;gbreve;LATIN SMALL LETTER G WITH BREVE
01E7;gcaron;LATIN SMALL LETTER G WITH CARON
011D;gcircumflex;LATIN SMALL LETTER G WITH CIRCUMFLEX
0121;gdotaccent;LATIN SMALL LETTER G WITH DOT ABOVE
00DF;germandbls;LATIN SMALL LETTER SHARP S
2207;gradient;NABLA
0060;grave;GRAVE ACCENT
0300;gravecomb;COMBINING GRAVE ACCENT
003E;greater;GREATER-THAN SIGN
2265;greaterequal;GREATER-THAN OR EQUAL TO
00AB;guillemotleft;LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
00BB;guillemotright;RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
2039;guilsinglleft;SINGLE LEFT-POINTING ANGLE QUOTATION MARK
203A;guilsinglright;SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
0068;h;LATIN SMALL LETTER H
0127;hbar;LATIN SMALL LETTER H WITH STROKE
0125;hcircumflex;LATIN SMALL LETTER H WITH CIRCUMFLEX
2665;heart;BLACK HEART SUIT
0309;hookabovecomb;COMBINING HOOK ABOVE
2302;house;HOUSE
02DD;hungarumlaut;DOUBLE ACUTE ACCENT
002D;hyphen;HYPHEN-MINUS
0069;i;LATIN SMALL LETTER I
00ED;iacute;LATIN SMALL LETTER I WITH ACUTE
012D;ibreve;LATIN SMALL LETTER I WITH BREVE
00EE;icircumflex;LATIN SMALL LETTER I WITH CIRCUMFLEX
00EF;idieresis;LATIN SMALL LETTER I WITH DIAERESIS
00EC;igrave;LATIN SMALL LETTER I WITH GRAVE
0133;ij;LATIN SMALL LIGATURE IJ
012B;imacron;LATIN SMALL LETTER I WITH MACRON
221E;infinity;INFINITY
222B;integral;INTEGRAL
2321;integralbt;BOTTOM HALF INTEGRAL
2320;integraltp;TOP HALF INTEGRAL
2229;intersection;INTERSECTION
25D8;invbullet;INVERSE BULLET
25D9;invcircle;INVERSE WHITE CIRCLE
263B;invsmileface;BLACK SMILING FACE
012F;iogonek;LATIN SMALL LETTER I WITH OGONEK
03B9;iota;GREEK SMALL LETTER IOTA
03CA;iotadieresis;GREEK SMALL LETTER IOTA WITH DIALYTIKA
0390;iotadieresistonos;GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
03AF;iotatonos;GREEK SMALL LETTER IOTA WITH TONOS
0129;itilde;LATIN SMALL LETTER I WITH TILDE
006A;j;LATIN SMALL LETTER J
0135;jcircumflex;LATIN SMALL LETTER J WITH CIRCUMFLEX
006B;k;LATIN SMALL LETTER K
03BA;kappa;GREEK SMALL LETTER KAPPA
0138;kgreenlandic;LATIN SMALL LETTER KRA
006C;l;LATIN SMALL LETTER L
013A;lacute;LATIN SMALL LETTER L WITH ACUTE
03BB;lambda;GREEK SMALL LETTER LAMDA
013E;lcaron;LATIN SMALL LETTER L WITH CARON
0140;ldot;LATIN SMALL LETTER L WITH MIDDLE DOT
003C;less;LESS-THAN SIGN
2264;lessequal;LESS-THAN OR EQUAL TO
258C;lfblock;LEFT HALF BLOCK
20A4;lira;LIRA SIGN
2227;logicaland;LOGICAL AND
00AC;logicalnot;NOT SIGN
2228;logicalor;LOGICAL OR
017F;longs;LATIN SMALL LETTER LONG S
25CA;lozenge;LOZENGE
0142;lslash;LATIN SMALL LETTER L WITH STROKE
2591;ltshade;LIGHT SHADE
006D;m;LATIN SMALL LETTER M
00AF;macron;MACRON
2642;male;MALE SIGN
2212;minus;MINUS SIGN
2032;minute;PRIME
00B5;mu;MICRO SIGN
00D7;multiply;MULTIPLICATION SIGN
266A;musicalnote;EIGHTH NOTE
266B;musicalnotedbl;BEAMED EIGHTH NOTES
006E;n;LATIN SMALL LETTER N
0144;nacute;LATIN SMALL LETTER N WITH ACUTE
0149;napostrophe;LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
0148;ncaron;LATIN SMALL LETTER N WITH CARON
0039;nine;DIGIT NINE
2209;notelement;NOT AN ELEMENT OF
2260;notequal;NOT EQUAL TO
2284;notsubset;NOT A SUBSET OF
00F1;ntilde;LATIN SMALL LETTER N WITH TILDE
03BD;nu;GREEK SMALL LETTER NU
0023;numbersign;NUMBER SIGN
006F;o;LATIN SMALL LETTER O
00F3;oacute;LATIN SMALL LETTER O WITH ACUTE
014F;obreve;LATIN SMALL LETTER O WITH BREVE
00F4;ocircumflex;LATIN SMALL LETTER O WITH CIRCUMFLEX
00F6;odieresis;LATIN SMALL LETTER O WITH DIAERESIS
0153;oe;LATIN SMALL LIGATURE OE
02DB;ogonek;OGONEK
00F2;ograve;LATIN SMALL LETTER O WITH GRAVE
01A1;ohorn;LATIN SMALL LETTER O WITH HORN
0151;ohungarumlaut;LATIN SMALL LETTER O WITH DOUBLE ACUTE
014D;omacron;LATIN SMALL LETTER O WITH MACRON
03C9;omega;GREEK SMALL LETTER OMEGA
03D6;omega1;GREEK PI SYMBOL
03CE;omegatonos;GREEK SMALL LETTER OMEGA WITH TONOS
03BF;omicron;GREEK SMALL LETTER OMICRON
03CC;omicrontonos;GREEK SMALL LETTER OMICRON WITH TONOS
0031;one;DIGIT ONE
2024;onedotenleader;ONE DOT LEADER
215B;oneeighth;VULGAR FRACTION ONE EIGHTH
00BD;onehalf;VULGAR FRACTION ONE HALF
00BC;onequarter;VULGAR FRACTION ONE QUARTER
2153;onethird;VULGAR FRACTION ONE THIRD
25E6;openbullet;WHITE BULLET
00AA;ordfeminine;FEMININE ORDINAL INDICATOR
00BA;ordmasculine;MASCULINE ORDINAL INDICATOR
221F;orthogonal;RIGHT ANGLE
00F8;oslash;LATIN SMALL LETTER O WITH STROKE
01FF;oslashacute;LATIN SMALL LETTER O WITH STROKE AND ACUTE
00F5;otilde;LATIN SMALL LETTER O WITH TILDE
0070;p;LATIN SMALL LETTER P
00B6;paragraph;PILCROW SIGN
0028;parenleft;LEFT PARENTHESIS
0029;parenright;RIGHT PARENTHESIS
2202;partialdiff;PARTIAL DIFFERENTIAL
0025;percent;PERCENT SIGN
002E;period;FULL STOP
00B7;periodcentered;MIDDLE DOT
22A5;perpendicular;UP TACK
2030;perthousand;PER MILLE SIGN
20A7;peseta;PESETA SIGN
03C6;phi;GREEK SMALL LETTER PHI
03D5;phi1;GREEK PHI SYMBOL
03C0;pi;GREEK SMALL LETTER PI
002B;plus;PLUS SIGN
00B1;plusminus;PLUS-MINUS SIGN
211E;prescription;PRESCRIPTION TAKE
220F;product;N-ARY PRODUCT
2282;propersubset;SUBSET OF
2283;propersuperset;SUPERSET OF
221D;proportional;PROPORTIONAL TO
03C8;psi;GREEK SMALL LETTER PSI
0071;q;LATIN SMALL LETTER Q
003F;question;QUESTION MARK
00BF;questiondown;INVERTED QUESTION MARK
0022;quotedbl;QUOTATION MARK
201E;quotedblbase;DOUBLE LOW-9 QUOTATION MARK
201C;quotedblleft;LEFT DOUBLE QUOTATION MARK
201D;quotedblright;RIGHT DOUBLE QUOTATION MARK
2018;quoteleft;LEFT SINGLE QUOTATION MARK
201B;quotereversed;SINGLE HIGH-REVERSED-9 QUOTATION MARK
2019;quoteright;RIGHT SINGLE QUOTATION MARK
201A;quotesinglbase;SINGLE LOW-9 QUOTATION MARK
0027;quotesingle;APOSTROPHE
0072;r;LATIN SMALL LETTER R
0155;racute;LATIN SMALL LETTER R WITH ACUTE
221A;radical;SQUARE ROOT
0159;rcaron;LATIN SMALL LETTER R WITH CARON
2286;reflexsubset;SUBSET OF OR EQUAL TO
2287;reflexsuperset;SUPERSET OF OR EQUAL TO
00AE;registered;REGISTERED SIGN
2310;revlogicalnot;REVERSED NOT SIGN
03C1;rho;GREEK SMALL LETTER RHO
02DA;ring;RING ABOVE
2590;rtblock;RIGHT HALF BLOCK
0073;s;LATIN SMALL LETTER S
015B;sacute;LATIN SMALL LETTER S WITH ACUTE
0161;scaron;LATIN SMALL LETTER S WITH CARON
015F;scedilla;LATIN SMALL LETTER S WITH CEDILLA
015D;scircumflex;LATIN SMALL LETTER S WITH CIRCUMFLEX
2033;second;DOUBLE PRIME
00A7;section;SECTION SIGN
003B;semicolon;SEMICOLON
0037;seven;DIGIT SEVEN
215E;seveneighths;VULGAR FRACTION SEVEN EIGHTHS
2592;shade;MEDIUM SHADE
03C3;sigma;GREEK SMALL LETTER SIGMA
03C2;sigma1;GREEK SMALL LETTER FINAL SIGMA
223C;similar;TILDE OPERATOR
0036;six;DIGIT SIX
002F;slash;SOLIDUS
263A;smileface;WHITE SMILING FACE
0020;space;SPACE
2660;spade;BLACK SPADE SUIT
00A3;sterling;POUND SIGN
220B;suchthat;CONTAINS AS MEMBER
2211;summation;N-ARY SUMMATION
263C;sun;WHITE SUN WITH RAYS
0074;t;LATIN SMALL LETTER T
03C4;tau;GREEK SMALL LETTER TAU
0167;tbar;LATIN SMALL LETTER T WITH STROKE
0165;tcaron;LATIN SMALL LETTER T WITH CARON
2234;therefore;THEREFORE
03B8;theta;GREEK SMALL LETTER THETA
03D1;theta1;GREEK THETA SYMBOL
00FE;thorn;LATIN SMALL LETTER THORN
0033;three;DIGIT THREE
215C;threeeighths;VULGAR FRACTION THREE EIGHTHS
00BE;threequarters;VULGAR FRACTION THREE QUARTERS
02DC;tilde;SMALL TILDE
0303;tildecomb;COMBINING TILDE
0384;tonos;GREEK TONOS
2122;trademark;TRADE MARK SIGN
25BC;triagdn;BLACK DOWN-POINTING TRIANGLE
25C4;triaglf;BLACK LEFT-POINTING POINTER
25BA;triagrt;BLACK RIGHT-POINTING POINTER
25B2;triagup;BLACK UP-POINTING TRIANGLE
0032;two;DIGIT TWO
2025;twodotenleader;TWO DOT LEADER
2154;twothirds;VULGAR FRACTION TWO THIRDS
0075;u;LATIN SMALL LETTER U
00FA;uacute;LATIN SMALL LETTER U WITH ACUTE
016D;ubreve;LATIN SMALL LETTER U WITH BREVE
00FB;ucircumflex;LATIN SMALL LETTER U WITH CIRCUMFLEX
00FC;udieresis;LATIN SMALL LETTER U WITH DIAERESIS
00F9;ugrave;LATIN SMALL LETTER U WITH GRAVE
01B0;uhorn;LATIN SMALL LETTER U WITH HORN
0171;uhungarumlaut;LATIN SMALL LETTER U WITH DOUBLE ACUTE
016B;umacron;LATIN SMALL LETTER U WITH MACRON
005F;underscore;LOW LINE
2017;underscoredbl;DOUBLE LOW LINE
222A;union;UNION
2200;universal;FOR ALL
0173;uogonek;LATIN SMALL LETTER U WITH OGONEK
2580;upblock;UPPER HALF BLOCK
03C5;upsilon;GREEK SMALL LETTER UPSILON
03CB;upsilondieresis;GREEK SMALL LETTER UPSILON WITH DIALYTIKA
03B0;upsilondieresistonos;GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
03CD;upsilontonos;GREEK SMALL LETTER UPSILON WITH TONOS
016F;uring;LATIN SMALL LETTER U WITH RING ABOVE
0169;utilde;LATIN SMALL LETTER U WITH TILDE
0076;v;LATIN SMALL LETTER V
0077;w;LATIN SMALL LETTER W
1E83;wacute;LATIN SMALL LETTER W WITH ACUTE
0175;wcircumflex;LATIN SMALL LETTER W WITH CIRCUMFLEX
1E85;wdieresis;LATIN SMALL LETTER W WITH DIAERESIS
2118;weierstrass;SCRIPT CAPITAL P
1E81;wgrave;LATIN SMALL LETTER W WITH GRAVE
0078;x;LATIN SMALL LETTER X
03BE;xi;GREEK SMALL LETTER XI
0079;y;LATIN SMALL LETTER Y
00FD;yacute;LATIN SMALL LETTER Y WITH ACUTE
0177;ycircumflex;LATIN SMALL LETTER Y WITH CIRCUMFLEX
00FF;ydieresis;LATIN SMALL LETTER Y WITH DIAERESIS
00A5;yen;YEN SIGN
1EF3;ygrave;LATIN SMALL LETTER Y WITH GRAVE
007A;z;LATIN SMALL LETTER Z
017A;zacute;LATIN SMALL LETTER Z WITH ACUTE
017E;zcaron;LATIN SMALL LETTER Z WITH CARON
017C;zdotaccent;LATIN SMALL LETTER Z WITH DOT ABOVE
0030;zero;DIGIT ZERO
03B6;zeta;GREEK SMALL LETTER ZETA
# END

View file

@ -0,0 +1,124 @@
#!/usr/bin/env python3
"""Generate AGL JSON from Adobe's glyph list files."""
import json
import sys
from pathlib import Path
def parse_uvalue(value: str) -> str | list[str]:
"""Parse a Unicode value (hex) into a string or list of strings.
Single codepoint: "0041" -> "\\u0041"
Multi-codepoint: "05D3 05B2" -> ["\\u05D3", "\\u05B2"]
"""
parts = value.split()
if len(parts) == 1:
return f"\\u{parts[0]}"
return [f"\\u{p}" for p in parts]
def parse_glyphlist(path: Path) -> dict:
"""Parse glyphlist.txt into a dict.
Returns:
{"single": {"A": "\\u0041", ...}, "multi": {"dalethatafpatah": ["\\u05D3", "\\u05B2"], ...}}
"""
single = {}
multi = {}
with open(path) as f:
for line in f:
line = line.strip()
if not line or line.startswith("#"):
continue
parts = line.split(";")
if len(parts) != 2:
continue
name, uvalue = parts
parsed = parse_uvalue(uvalue)
if isinstance(parsed, str):
single[name] = parsed
else:
multi[name] = parsed
return {"single": single, "multi": multi}
def parse_aglfn(path: Path) -> dict:
"""Parse aglfn.txt into a dict.
AGLFN is a subset of AGL for new fonts, all single-codepoint.
Format: UVALUE;NAME;DESCRIPTION
"""
result = {}
with open(path) as f:
for line in f:
line = line.strip()
if not line or line.startswith("#"):
continue
parts = line.split(";")
if len(parts) < 2:
continue
uvalue, name = parts[0], parts[1]
result[name] = f"\\u{uvalue}"
return result
def main():
build_dir = Path(__file__).parent
glyphlist_path = build_dir / "glyphlist.txt"
aglfn_path = build_dir / "aglfn.txt"
if not glyphlist_path.exists():
print(f"Error: {glyphlist_path} not found", file=sys.stderr)
sys.exit(1)
if not aglfn_path.exists():
print(f"Error: {aglfn_path} not found", file=sys.stderr)
sys.exit(1)
glyphlist = parse_glyphlist(glyphlist_path)
aglfn = parse_aglfn(aglfn_path)
# Merge: AGLFN overrides glyphlist for consistency
# AGLFN is the authoritative list for new fonts
merged_single = {**glyphlist["single"], **aglfn}
merged_multi = glyphlist["multi"]
output = {
"aglfn": aglfn,
"glyphlist_single": glyphlist["single"],
"glyphlist_multi": glyphlist["multi"],
"merged_single": merged_single,
"merged_multi": merged_multi,
"stats": {
"aglfn_count": len(aglfn),
"glyphlist_single_count": len(glyphlist["single"]),
"glyphlist_multi_count": len(glyphlist["multi"]),
"merged_single_count": len(merged_single),
"merged_multi_count": len(merged_multi),
},
}
output_path = build_dir / "agl.json"
with open(output_path, "w") as f:
json.dump(output, f, indent=2, ensure_ascii=False)
print(f"Generated {output_path}")
print(f" AGLFN: {len(aglfn)} entries")
print(f" Glyphlist single: {len(glyphlist['single'])} entries")
print(f" Glyphlist multi: {len(glyphlist['multi'])} entries")
print(f" Merged single: {len(merged_single)} entries")
print(f" Merged multi: {len(merged_multi)} entries")
if __name__ == "__main__":
main()

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,304 @@
//! Adobe Glyph List (AGL) lookup for glyph name to Unicode mapping.
//!
//! This module provides compile-time phf::Map lookups for the Adobe Glyph List,
//! which is the canonical mapping from PostScript glyph names to Unicode codepoints.
//!
//! # References
//!
//! - Adobe Glyph List Specification: https://github.com/adobe-type-tools/agl-aglfn
//! - AGL 1.4 (glyphlist.txt): ~4,400 entries
//! - AGLFN 1.7 (aglfn.txt): ~770 entries for new fonts
include!(concat!(env!("OUT_DIR"), "/agl.rs"));
use crate::diagnostics::DiagCode;
/// Lookup a single Unicode codepoint for a glyph name.
///
/// This handles:
/// 1. Algorithmic patterns (uniXXXX, uXXXXXX)
/// 2. Variant suffixes (.sc, .alt, etc.)
/// 3. AGL direct lookup
///
/// # Arguments
///
/// * `name` - The glyph name to lookup
///
/// # Returns
///
/// `Some(char)` if the name maps to a single codepoint, `None` otherwise.
///
/// # Examples
///
/// ```
/// use pdftract_core::font::agl::unicode_for_glyph_name;
///
/// assert_eq!(unicode_for_glyph_name("quoteright"), Some('\u{2019}'));
/// assert_eq!(unicode_for_glyph_name("uni20AC"), Some('\u{20AC}')); // Euro
/// assert_eq!(unicode_for_glyph_name("u1F600"), Some('\u{1F600}')); // Emoji
/// assert_eq!(unicode_for_glyph_name("A.sc"), Some('A')); // Variant stripped
/// ```
pub fn unicode_for_glyph_name(name: &str) -> Option<char> {
// 1. Handle algorithmic patterns first
if let Some(ch) = parse_algorithmic(name) {
return Some(ch);
}
// 2. Strip variant suffix and retry
let stripped = strip_variant_suffix(name);
if stripped != name {
if let Some(ch) = parse_algorithmic(stripped) {
return Some(ch);
}
if let Some(ch) = AGL.get(stripped) {
return Some(*ch);
}
}
// 3. Direct AGL lookup
AGL.get(name).copied()
}
/// Lookup multiple Unicode codepoints for a glyph name (ligatures).
///
/// # Arguments
///
/// * `name` - The glyph name to lookup
///
/// # Returns
///
/// `Some(&[char])` if the name maps to multiple codepoints, `None` otherwise.
///
/// # Examples
///
/// ```
/// use pdftract_core::font::agl::unicode_for_glyph_name_multi;
///
/// assert_eq!(unicode_for_glyph_name_multi("fi"), Some(&['f', 'i'][..]));
/// ```
pub fn unicode_for_glyph_name_multi(name: &str) -> Option<&'static [char]> {
// Check multi-codepoint map
if let Some(chars) = AGL_MULTI.get(name) {
return Some(chars);
}
// Strip variant suffix and retry
let stripped = strip_variant_suffix(name);
if stripped != name {
if let Some(chars) = AGL_MULTI.get(stripped) {
return Some(chars);
}
}
None
}
/// Parse algorithmic glyph name patterns.
///
/// Handles:
/// - `uniXXXX` (4 hex digits)
/// - `uXXXXXX` (up to 6 hex digits)
///
/// These are NOT in the AGL; they are algorithmic conventions.
fn parse_algorithmic(name: &str) -> Option<char> {
let name = name.trim_start_matches('#'); // Some PDFs use #uniXXXX
if let Some(rest) = name.strip_prefix("uni") {
// uniXXXX - exactly 4 hex digits
if rest.len() == 4 && rest.chars().all(|c| c.is_ascii_hexdigit()) {
return u32::from_str_radix(rest, 16).ok().and_then(|c| char::from_u32(c));
}
}
if let Some(rest) = name.strip_prefix('u') {
// uXXXXXX - up to 6 hex digits
if rest.len() <= 6 && rest.chars().all(|c| c.is_ascii_hexdigit()) {
return u32::from_str_radix(rest, 16).ok().and_then(|c| char::from_u32(c));
}
}
None
}
/// Strip variant suffix from a glyph name.
///
/// Handles patterns like:
/// - `H.sc` → `H` (small caps)
/// - `A.alt` → `A` (alternate)
/// - `foo.bar` → `foo`
///
/// The variant suffix is everything after the first `.`.
fn strip_variant_suffix(name: &str) -> &str {
name.split('.').next().unwrap_or(name)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_agl_quoteright() {
// quoteright is U+2019 in WinAnsiEncoding
assert_eq!(unicode_for_glyph_name("quoteright"), Some('\u{2019}'));
}
#[test]
fn test_agl_uni20ac() {
// uniXXXX pattern (Euro)
assert_eq!(unicode_for_glyph_name("uni20AC"), Some('\u{20AC}'));
}
#[test]
fn test_agl_u1f600() {
// uXXXXXX pattern (emoji)
assert_eq!(unicode_for_glyph_name("u1F600"), Some('\u{1F600}'));
}
#[test]
fn test_agl_variant_stripping() {
// Small caps variant
assert_eq!(unicode_for_glyph_name("A.sc"), Some('A'));
assert_eq!(unicode_for_glyph_name("H.sc"), Some('H'));
// Alt variant
assert_eq!(unicode_for_glyph_name("A.alt"), Some('A'));
}
#[test]
fn test_agl_unknown() {
// Unknown name returns None
assert_eq!(unicode_for_glyph_name("NotARealGlyphName"), None);
}
#[test]
fn test_agl_multi_fi() {
// fi ligature is single-codepoint U+FB01 in AGL, not multi-codepoint
assert_eq!(unicode_for_glyph_name("fi"), Some('\u{FB01}'));
assert_eq!(unicode_for_glyph_name_multi("fi"), None);
}
#[test]
fn test_agl_multi_ffi() {
// ffi ligature is single-codepoint U+FB03 in AGL
assert_eq!(unicode_for_glyph_name("ffi"), Some('\u{FB03}'));
assert_eq!(unicode_for_glyph_name_multi("ffi"), None);
}
#[test]
fn test_agl_multi_ff() {
// ff ligature is single-codepoint U+FB00 in AGL
assert_eq!(unicode_for_glyph_name("ff"), Some('\u{FB00}'));
assert_eq!(unicode_for_glyph_name_multi("ff"), None);
}
#[test]
fn test_agl_multi_fl() {
// fl ligature is single-codepoint U+FB02 in AGL
assert_eq!(unicode_for_glyph_name("fl"), Some('\u{FB02}'));
assert_eq!(unicode_for_glyph_name_multi("fl"), None);
}
#[test]
fn test_agl_multi_unknown() {
// Unknown name returns None
assert_eq!(unicode_for_glyph_name_multi("NotALigature"), None);
}
#[test]
fn test_agl_multi_hebrew() {
// Hebrew combining sequences are multi-codepoint
assert_eq!(
unicode_for_glyph_name_multi("dalethatafpatah"),
Some(&['\u{05D3}', '\u{05B2}'][..])
);
assert_eq!(
unicode_for_glyph_name_multi("lamedholam"),
Some(&['\u{05DC}', '\u{05B9}'][..])
);
}
#[test]
fn test_parse_algorithmic_uni() {
// uniXXXX (4 hex digits)
assert_eq!(parse_algorithmic("uni0041"), Some('A'));
assert_eq!(parse_algorithmic("uni20AC"), Some('\u{20AC}'));
assert_eq!(parse_algorithmic("uniFFFF"), Some('\u{FFFF}'));
// Not 4 digits
assert_eq!(parse_algorithmic("uni123"), None);
assert_eq!(parse_algorithmic("uni12345"), None);
assert_eq!(parse_algorithmic("uniGHIJ"), None);
}
#[test]
fn test_parse_algorithmic_u() {
// uXXXXXX (up to 6 hex digits)
assert_eq!(parse_algorithmic("u0041"), Some('A'));
assert_eq!(parse_algorithmic("u20AC"), Some('\u{20AC}'));
assert_eq!(parse_algorithmic("u1F600"), Some('\u{1F600}'));
// More than 6 digits
assert_eq!(parse_algorithmic("u1234567"), None);
assert_eq!(parse_algorithmic("uGGGGGG"), None);
}
#[test]
fn test_strip_variant_suffix() {
assert_eq!(strip_variant_suffix("A.sc"), "A");
assert_eq!(strip_variant_suffix("H.sc"), "H");
assert_eq!(strip_variant_suffix("foo.alt"), "foo");
assert_eq!(strip_variant_suffix("bar.baz.qux"), "bar");
assert_eq!(strip_variant_suffix("nosuffix"), "nosuffix");
assert_eq!(strip_variant_suffix(".dot"), "");
}
#[test]
fn test_agl_basic_letters() {
assert_eq!(unicode_for_glyph_name("A"), Some('A'));
assert_eq!(unicode_for_glyph_name("a"), Some('a'));
assert_eq!(unicode_for_glyph_name("Z"), Some('Z'));
assert_eq!(unicode_for_glyph_name("z"), Some('z'));
}
#[test]
fn test_agl_punctuation() {
assert_eq!(unicode_for_glyph_name("period"), Some('.'));
assert_eq!(unicode_for_glyph_name("comma"), Some(','));
assert_eq!(unicode_for_glyph_name("exclam"), Some('!'));
assert_eq!(unicode_for_glyph_name("question"), Some('?'));
}
#[test]
fn test_agl_quotes() {
assert_eq!(unicode_for_glyph_name("quoteleft"), Some('\u{2018}'));
assert_eq!(unicode_for_glyph_name("quoteright"), Some('\u{2019}'));
assert_eq!(unicode_for_glyph_name("quotedblleft"), Some('\u{201C}'));
assert_eq!(unicode_for_glyph_name("quotedblright"), Some('\u{201D}'));
}
#[test]
fn test_agl_euro() {
assert_eq!(unicode_for_glyph_name("Euro"), Some('\u{20AC}'));
}
#[test]
fn test_algorithmic_with_hash_prefix() {
// Some PDFs use #uniXXXX notation
assert_eq!(parse_algorithmic("#uni0041"), Some('A'));
assert_eq!(parse_algorithmic("#u0041"), Some('A'));
}
#[test]
fn test_multi_lookup_single_returns_none() {
// Single-codepoint names should return None from _multi
assert_eq!(unicode_for_glyph_name_multi("A"), None);
assert_eq!(unicode_for_glyph_name_multi("quoteright"), None);
}
#[test]
fn test_variant_stripping_with_multi() {
// Multi-codepoint with variant suffix should still work
// (though unlikely in practice)
assert_eq!(unicode_for_glyph_name_multi("fi.alt"), None); // No fi.alt in AGL_MULTI
}
}

View file

@ -8,11 +8,13 @@ pub mod embedded;
pub mod type0;
pub mod cmap;
pub mod encoding;
pub mod agl;
pub use embedded::{EmbeddedFont, FontMetrics, EmptyFontMetrics, GlyphBbox};
pub use type0::{Type0Font, DescendantCIDFont, CIDToGIDMap};
pub use cmap::{ToUnicodeMap, parse_to_unicode, parse_to_unicode_with_diags};
pub use encoding::{NamedEncoding, DifferencesOverlay, FontEncoding};
pub use agl::{unicode_for_glyph_name, unicode_for_glyph_name_multi};
use crate::parser::object::types::{PdfDict, PdfObject};