feat(pdftract-qzjw): implement 4-level encoding resolver with per-font cache
Implements Phase 2.2 encoding fallback chain: - L1: ToUnicode CMap (1.0 confidence) - L2: Named encoding + AGL (0.9 confidence) - L3: Font fingerprint cache (0.85 confidence) - L4: Shape recognition stub (0.7 confidence, cfg-gated) Features: - DashMap-based per-font resolution cache - Single GLYPH_UNMAPPED diagnostic per (font, code) miss - FontId from Arc pointer for unique identification - ResolvedGlyph with chars, source, and confidence - Proper short-circuit on L1 empty/U+FFFD results Acceptance criteria: - ✅ Ligature expansion → multi-char slice, confidence 1.0 - ✅ AGL lookup → confidence 0.9 - ✅ Fingerprint lookup → confidence 0.85 - ✅ All-level miss → U+FFFD, confidence 0.0, single diagnostic - ✅ Cache hit returns identical result to miss Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
b0458499d8
commit
21d6514ca8
4 changed files with 799 additions and 0 deletions
22
Cargo.lock
generated
22
Cargo.lock
generated
|
|
@ -836,6 +836,20 @@ dependencies = [
|
|||
"typenum",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dashmap"
|
||||
version = "6.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6361d5c062261c78a176addb82d4c821ae42bed6089de0e12603cd25de2059c"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
"hashbrown 0.14.5",
|
||||
"lock_api",
|
||||
"once_cell",
|
||||
"parking_lot_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "deranged"
|
||||
version = "0.5.8"
|
||||
|
|
@ -1233,6 +1247,12 @@ dependencies = [
|
|||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.14.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.15.5"
|
||||
|
|
@ -2311,6 +2331,7 @@ dependencies = [
|
|||
"anyhow",
|
||||
"chrono",
|
||||
"criterion",
|
||||
"dashmap",
|
||||
"filetime",
|
||||
"flate2",
|
||||
"hex",
|
||||
|
|
@ -2331,6 +2352,7 @@ dependencies = [
|
|||
"serde",
|
||||
"serde_json",
|
||||
"sha2",
|
||||
"smallvec",
|
||||
"tempfile",
|
||||
"thiserror 1.0.69",
|
||||
"tracing",
|
||||
|
|
|
|||
|
|
@ -30,6 +30,8 @@ zstd = "0.13"
|
|||
rayon = "1.10"
|
||||
phf = "0.11"
|
||||
tracing = { workspace = true }
|
||||
dashmap = "6.1"
|
||||
smallvec = "1.13"
|
||||
|
||||
[features]
|
||||
default = ["serde"]
|
||||
|
|
@ -39,6 +41,7 @@ ocr = ["dep:image", "dep:leptonica-plumbing"] # Enable OCR path (image composit
|
|||
full-render = ["dep:pdfium-render", "ocr"] # Enable PDFium-based rendering (requires ocr)
|
||||
proptest = []
|
||||
fuzzing = [] # Enable cfg(fuzzing) for fuzz harnesses
|
||||
shape-db = [] # Enable glyph shape database (Level 4 encoding fallback)
|
||||
|
||||
[dev-dependencies]
|
||||
chrono = "0.4"
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ pub mod cmap;
|
|||
pub mod encoding;
|
||||
pub mod agl;
|
||||
pub mod fingerprint;
|
||||
pub mod resolver;
|
||||
|
||||
pub use embedded::{EmbeddedFont, FontMetrics, EmptyFontMetrics, GlyphBbox};
|
||||
pub use type0::{Type0Font, DescendantCIDFont, CIDToGIDMap};
|
||||
|
|
@ -17,6 +18,7 @@ pub use cmap::{ToUnicodeMap, parse_to_unicode, parse_to_unicode_with_diags};
|
|||
pub use encoding::{NamedEncoding, DifferencesOverlay, FontEncoding};
|
||||
pub use agl::{unicode_for_glyph_name, unicode_for_glyph_name_multi};
|
||||
pub use fingerprint::{FontFingerprint, CachedFingerprint, lookup_font_fingerprint};
|
||||
pub use resolver::{FontId, UnicodeSource, ResolvedGlyph, ResolverCache, Font, resolve_unicode};
|
||||
|
||||
use crate::parser::object::types::{PdfDict, PdfObject};
|
||||
|
||||
|
|
|
|||
772
crates/pdftract-core/src/font/resolver.rs
Normal file
772
crates/pdftract-core/src/font/resolver.rs
Normal file
|
|
@ -0,0 +1,772 @@
|
|||
//! 4-level encoding resolution state machine with per-font caching.
|
||||
//!
|
||||
//! This module implements the top-level resolver that drives all four levels
|
||||
//! of the encoding fallback chain:
|
||||
//! - Level 1: ToUnicode CMap (confidence 1.0)
|
||||
//! - Level 2: Named encoding + AGL (confidence 0.9)
|
||||
//! - Level 3: Font fingerprint cache (confidence 0.85)
|
||||
//! - Level 4: Glyph shape recognition (confidence 0.7, cfg-gated)
|
||||
//!
|
||||
//! The resolver maintains a per-font cache of resolved glyphs (unbounded, no eviction) and emits
|
||||
//! the GLYPH_UNMAPPED diagnostic exactly once per (font, code) miss.
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::hash::{Hash, Hasher};
|
||||
|
||||
use dashmap::DashMap;
|
||||
use smallvec::SmallVec;
|
||||
|
||||
use crate::diagnostics::{Diagnostic, DiagCode};
|
||||
use crate::font::agl::{unicode_for_glyph_name, unicode_for_glyph_name_multi};
|
||||
use crate::font::cmap::ToUnicodeMap;
|
||||
use crate::font::encoding::FontEncoding;
|
||||
use crate::font::fingerprint::CachedFingerprint;
|
||||
|
||||
/// A loaded PDF font with encoding resolution capabilities.
|
||||
///
|
||||
/// This struct encapsulates all the data needed for the 4-level encoding
|
||||
/// fallback chain. It owns the per-font resolution cache and tracks which
|
||||
/// (font, code) pairs have already emitted diagnostics.
|
||||
pub struct Font {
|
||||
/// Unique identifier for this font instance.
|
||||
id: FontId,
|
||||
/// ToUnicode CMap (Level 1).
|
||||
to_unicode: Option<ToUnicodeMap>,
|
||||
/// Font encoding (Level 2).
|
||||
encoding: Option<FontEncoding>,
|
||||
/// Cached font fingerprint (Level 3).
|
||||
fingerprint: Option<CachedFingerprint>,
|
||||
/// Whether this font has an embedded program (skip L3 if false).
|
||||
has_embedded_program: bool,
|
||||
/// Per-font resolution cache.
|
||||
cache: ResolverCache,
|
||||
}
|
||||
|
||||
impl Font {
|
||||
/// Create a new Font instance.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `id` - Unique font identifier
|
||||
/// * `to_unicode` - Optional ToUnicode CMap
|
||||
/// * `encoding` - Optional font encoding
|
||||
/// * `fingerprint` - Optional cached fingerprint
|
||||
/// * `has_embedded_program` - Whether font has embedded program
|
||||
pub fn new(
|
||||
id: FontId,
|
||||
to_unicode: Option<ToUnicodeMap>,
|
||||
encoding: Option<FontEncoding>,
|
||||
fingerprint: Option<CachedFingerprint>,
|
||||
has_embedded_program: bool,
|
||||
) -> Self {
|
||||
Self {
|
||||
id,
|
||||
to_unicode,
|
||||
encoding,
|
||||
fingerprint,
|
||||
has_embedded_program,
|
||||
cache: ResolverCache::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the font ID.
|
||||
pub fn id(&self) -> FontId {
|
||||
self.id
|
||||
}
|
||||
|
||||
/// Get the ToUnicode CMap.
|
||||
pub fn to_unicode(&self) -> Option<&ToUnicodeMap> {
|
||||
self.to_unicode.as_ref()
|
||||
}
|
||||
|
||||
/// Get the font encoding.
|
||||
pub fn encoding(&self) -> Option<&FontEncoding> {
|
||||
self.encoding.as_ref()
|
||||
}
|
||||
|
||||
/// Get the cached fingerprint.
|
||||
pub fn fingerprint(&self) -> Option<&CachedFingerprint> {
|
||||
self.fingerprint.as_ref()
|
||||
}
|
||||
|
||||
/// Check if this font has an embedded program.
|
||||
pub fn has_embedded_program(&self) -> bool {
|
||||
self.has_embedded_program
|
||||
}
|
||||
|
||||
/// Get the resolution cache.
|
||||
pub fn cache(&self) -> &ResolverCache {
|
||||
&self.cache
|
||||
}
|
||||
}
|
||||
|
||||
/// Identity handle for a font instance, derived from the address of its `Arc` allocation.
///
/// All clones of one `Arc` point at the same allocation and therefore produce
/// equal `FontId`s, while two `Arc`s that are alive at the same time produce
/// different ones.
///
/// The id is only an address: it does not keep the allocation alive. After
/// the originating `Arc` (and all its clones) are dropped, a later allocation
/// may receive the same address and thus the same `FontId`, so callers should
/// keep the `Arc` alive for as long as the id is in use.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct FontId(usize);

impl FontId {
    /// Create a `FontId` from the allocation address of `arc`.
    ///
    /// Accepts unsized pointees (`Arc<str>`, `Arc<dyn Trait>`, ...) as well as
    /// sized ones; for fat pointers only the data address is used.
    pub fn from_arc<T: ?Sized>(arc: &Arc<T>) -> Self {
        // Cast to a thin pointer first: a fat `*const T` cannot be converted
        // to `usize` directly, and the metadata is irrelevant for identity.
        Self(Arc::as_ptr(arc).cast::<()>() as usize)
    }
}
|
||||
|
||||
/// Which level of the fallback chain produced a Unicode mapping.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UnicodeSource {
    /// Level 1: the font's ToUnicode CMap.
    ToUnicode,
    /// Level 2: named encoding resolved through the Adobe Glyph List.
    Agl,
    /// Level 3: the font fingerprint cache.
    Fingerprint,
    /// Level 4: glyph shape recognition.
    ShapeMatch,
    /// No level produced a mapping (the result is U+FFFD).
    Unknown,
}

impl UnicodeSource {
    /// Confidence score attached to mappings from this source.
    ///
    /// Per INV-30 the value is always one of {1.0, 0.9, 0.85, 0.7, 0.0}.
    pub fn confidence(self) -> f32 {
        // Listed from least to most trusted.
        match self {
            Self::Unknown => 0.0,
            Self::ShapeMatch => 0.7,
            Self::Fingerprint => 0.85,
            Self::Agl => 0.9,
            Self::ToUnicode => 1.0,
        }
    }
}
|
||||
|
||||
/// Result of resolving a character code to Unicode.
|
||||
///
|
||||
/// Contains the resolved Unicode characters (1-4 chars for ligatures),
|
||||
/// the source of the mapping, and the confidence score.
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub struct ResolvedGlyph {
|
||||
/// Unicode characters (1-4 for ligature expansion)
|
||||
pub chars: SmallVec<[char; 4]>,
|
||||
/// Source of this mapping
|
||||
pub source: UnicodeSource,
|
||||
/// Confidence score (derived from source)
|
||||
pub confidence: f32,
|
||||
}
|
||||
|
||||
impl ResolvedGlyph {
|
||||
/// Create a new resolved glyph.
|
||||
fn new(chars: SmallVec<[char; 4]>, source: UnicodeSource) -> Self {
|
||||
let confidence = source.confidence();
|
||||
Self {
|
||||
chars,
|
||||
source,
|
||||
confidence,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a failure result (U+FFFD, unknown source).
|
||||
fn failure() -> Self {
|
||||
Self::new(SmallVec::from_slice(&['\u{FFFD}']), UnicodeSource::Unknown)
|
||||
}
|
||||
|
||||
/// Check if this is a failure result (U+FFFD with unknown source).
|
||||
pub fn is_failure(&self) -> bool {
|
||||
self.source == UnicodeSource::Unknown
|
||||
}
|
||||
}
|
||||
|
||||
/// Cache key for per-font glyph resolution.
|
||||
///
|
||||
/// Combines the font ID and the character code bytes into a single hashable key.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
struct CacheKey {
|
||||
font_id: FontId,
|
||||
char_code: SmallVec<[u8; 4]>,
|
||||
}
|
||||
|
||||
impl Hash for CacheKey {
|
||||
fn hash<H: Hasher>(&self, state: &mut H) {
|
||||
self.font_id.hash(state);
|
||||
// Hash the bytes directly
|
||||
for byte in &self.char_code {
|
||||
byte.hash(state);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-font resolution cache with miss tracking.
|
||||
///
|
||||
/// Maintains:
|
||||
/// - A DashMap for thread-safe cached resolutions
|
||||
/// - A HashSet of (font_id, char_code) keys that have already emitted diagnostics
|
||||
pub struct ResolverCache {
|
||||
/// Cached resolutions: (font_id, char_code) -> ResolvedGlyph
|
||||
cache: DashMap<CacheKey, ResolvedGlyph>,
|
||||
/// Set of (font_id, char_code) that have already emitted GLYPH_UNMAPPED
|
||||
emitted_misses: DashMap<(FontId, SmallVec<[u8; 4]>), ()>,
|
||||
}
|
||||
|
||||
impl ResolverCache {
|
||||
/// Create a new empty resolver cache.
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
cache: DashMap::new(),
|
||||
emitted_misses: DashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Look up a cached resolution.
|
||||
pub fn get(&self, font_id: FontId, char_code: &[u8]) -> Option<ResolvedGlyph> {
|
||||
let key = CacheKey {
|
||||
font_id,
|
||||
char_code: SmallVec::from_slice(char_code),
|
||||
};
|
||||
self.cache.get(&key).map(|entry| entry.clone())
|
||||
}
|
||||
|
||||
/// Insert a resolution into the cache.
|
||||
pub fn insert(&self, font_id: FontId, char_code: &[u8], result: &ResolvedGlyph) {
|
||||
let key = CacheKey {
|
||||
font_id,
|
||||
char_code: SmallVec::from_slice(char_code),
|
||||
};
|
||||
self.cache.insert(key, result.clone());
|
||||
}
|
||||
|
||||
/// Check if a miss diagnostic has already been emitted for this (font, code).
|
||||
pub fn has_emitted_miss(&self, font_id: FontId, char_code: &[u8]) -> bool {
|
||||
let key = (font_id, SmallVec::from_slice(char_code));
|
||||
self.emitted_misses.contains_key(&key)
|
||||
}
|
||||
|
||||
/// Mark this (font, code) as having emitted a miss diagnostic.
|
||||
pub fn mark_emitted_miss(&self, font_id: FontId, char_code: &[u8]) {
|
||||
let key = (font_id, SmallVec::from_slice(char_code));
|
||||
self.emitted_misses.insert(key, ());
|
||||
}
|
||||
|
||||
/// Get the number of cached resolutions.
|
||||
pub fn len(&self) -> usize {
|
||||
self.cache.len()
|
||||
}
|
||||
|
||||
/// Check if the cache is empty.
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.cache.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ResolverCache {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// Resolve a character code to Unicode using the 4-level fallback chain.
|
||||
///
|
||||
/// This is the main entry point for Phase 2 encoding resolution. Given a font
|
||||
/// and a character code (as raw bytes), it attempts to map to Unicode using
|
||||
/// all four levels of the fallback chain.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `font` - The font to resolve from
|
||||
/// * `char_code` - Character code bytes (1-4 bytes for multi-byte encodings)
|
||||
/// * `glyph_id` - Optional glyph ID for Level 3 fingerprint lookup
|
||||
/// * `diagnostics` - Diagnostics list for emitting GLYPH_UNMAPPED
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A `ResolvedGlyph` containing the mapped characters, source, and confidence.
|
||||
pub fn resolve_unicode(
|
||||
font: &Font,
|
||||
char_code: &[u8],
|
||||
glyph_id: Option<u16>,
|
||||
diagnostics: &mut Vec<Diagnostic>,
|
||||
) -> ResolvedGlyph {
|
||||
let font_id = font.id();
|
||||
let cache = &font.cache;
|
||||
|
||||
// Check cache first
|
||||
if let Some(cached) = cache.get(font_id, char_code) {
|
||||
return cached;
|
||||
}
|
||||
|
||||
// Level 1: ToUnicode CMap
|
||||
let result = resolve_level1(char_code, font.to_unicode());
|
||||
|
||||
let result = if !result.is_failure() {
|
||||
result
|
||||
} else {
|
||||
// Level 2: Named encoding + AGL
|
||||
let result = resolve_level2(char_code, font.encoding());
|
||||
if !result.is_failure() {
|
||||
result
|
||||
} else {
|
||||
// Level 3: Font fingerprint (skip for Standard 14 fonts)
|
||||
if font.has_embedded_program() {
|
||||
let result = resolve_level3(char_code, glyph_id, font.fingerprint());
|
||||
if !result.is_failure() {
|
||||
result
|
||||
} else {
|
||||
// Level 4: Shape recognition (cfg-gated)
|
||||
#[cfg(feature = "shape-db")]
|
||||
{
|
||||
let result = resolve_level4(char_code, glyph_id, font.fingerprint());
|
||||
if !result.is_failure() {
|
||||
result
|
||||
} else {
|
||||
// All levels failed
|
||||
emit_miss_diagnostic(font_id, char_code, cache, diagnostics);
|
||||
ResolvedGlyph::failure()
|
||||
}
|
||||
}
|
||||
#[cfg(not(feature = "shape-db"))]
|
||||
{
|
||||
// Level 4 not available, emit miss and return failure
|
||||
emit_miss_diagnostic(font_id, char_code, cache, diagnostics);
|
||||
ResolvedGlyph::failure()
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// No embedded program, skip to Level 4
|
||||
#[cfg(feature = "shape-db")]
|
||||
{
|
||||
let result = resolve_level4(char_code, glyph_id, font.fingerprint());
|
||||
if !result.is_failure() {
|
||||
result
|
||||
} else {
|
||||
emit_miss_diagnostic(font_id, char_code, cache, diagnostics);
|
||||
ResolvedGlyph::failure()
|
||||
}
|
||||
}
|
||||
#[cfg(not(feature = "shape-db"))]
|
||||
{
|
||||
emit_miss_diagnostic(font_id, char_code, cache, diagnostics);
|
||||
ResolvedGlyph::failure()
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Cache the result
|
||||
cache.insert(font_id, char_code, &result);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Level 1: ToUnicode CMap lookup.
|
||||
///
|
||||
/// Returns the mapped characters if found and non-empty/non-U+FFFD.
|
||||
/// Otherwise returns a failure result to fall through to Level 2.
|
||||
fn resolve_level1(char_code: &[u8], to_unicode: Option<&ToUnicodeMap>) -> ResolvedGlyph {
|
||||
let Some(cmap) = to_unicode else {
|
||||
return ResolvedGlyph::failure();
|
||||
};
|
||||
|
||||
let Some(chars) = cmap.lookup(char_code) else {
|
||||
return ResolvedGlyph::failure();
|
||||
};
|
||||
|
||||
// Empty result or U+FFFD only -> fall through
|
||||
if chars.is_empty() || (chars.len() == 1 && chars[0] == '\u{FFFD}') {
|
||||
return ResolvedGlyph::failure();
|
||||
}
|
||||
|
||||
// Multi-codepoint result from ligature expansion
|
||||
ResolvedGlyph::new(SmallVec::from_slice(chars), UnicodeSource::ToUnicode)
|
||||
}
|
||||
|
||||
/// Level 2: Named encoding + AGL lookup.
|
||||
///
|
||||
/// Maps character code to glyph name via encoding, then glyph name to Unicode via AGL.
|
||||
fn resolve_level2(char_code: &[u8], encoding: Option<&FontEncoding>) -> ResolvedGlyph {
|
||||
let Some(enc) = encoding else {
|
||||
return ResolvedGlyph::failure();
|
||||
};
|
||||
|
||||
// Single-byte codes only for named encodings
|
||||
if char_code.len() != 1 {
|
||||
return ResolvedGlyph::failure();
|
||||
}
|
||||
|
||||
let code = char_code[0];
|
||||
|
||||
// Get glyph name from encoding
|
||||
let Some(glyph_name) = enc.glyph_name_for(code) else {
|
||||
return ResolvedGlyph::failure();
|
||||
};
|
||||
|
||||
// Look up in AGL
|
||||
// Try multi-codepoint first (ligatures like "fi" as separate chars)
|
||||
if let Some(chars) = unicode_for_glyph_name_multi(&glyph_name) {
|
||||
return ResolvedGlyph::new(SmallVec::from_slice(chars), UnicodeSource::Agl);
|
||||
}
|
||||
|
||||
// Try single-codepoint
|
||||
if let Some(ch) = unicode_for_glyph_name(&glyph_name) {
|
||||
return ResolvedGlyph::new(SmallVec::from_slice(&[ch]), UnicodeSource::Agl);
|
||||
}
|
||||
|
||||
// Not in AGL
|
||||
ResolvedGlyph::failure()
|
||||
}
|
||||
|
||||
/// Level 3: Font fingerprint cache lookup.
|
||||
///
|
||||
/// Looks up a glyph ID in the cached fingerprint database. This requires
|
||||
/// the glyph ID (not the character code) because fingerprint mappings are
|
||||
/// per-glyph, not per-character-code.
|
||||
///
|
||||
/// When glyph_id is None (e.g., before char_code -> GID mapping in Phase 3),
|
||||
/// Level 3 falls through to Level 4.
|
||||
fn resolve_level3(
|
||||
_char_code: &[u8],
|
||||
glyph_id: Option<u16>,
|
||||
fingerprint: Option<&CachedFingerprint>,
|
||||
) -> ResolvedGlyph {
|
||||
let Some(gid) = glyph_id else {
|
||||
// No glyph ID available - fall through to Level 4
|
||||
return ResolvedGlyph::failure();
|
||||
};
|
||||
|
||||
let Some(fp) = fingerprint else {
|
||||
return ResolvedGlyph::failure();
|
||||
};
|
||||
|
||||
// Look up the glyph ID in the fingerprint cache
|
||||
let Some(ch) = fp.lookup(gid) else {
|
||||
return ResolvedGlyph::failure();
|
||||
};
|
||||
|
||||
ResolvedGlyph::new(SmallVec::from_slice(&[ch]), UnicodeSource::Fingerprint)
|
||||
}
|
||||
|
||||
/// Level 4: glyph shape recognition (placeholder).
///
/// Only compiled with the `shape-db` feature. The real implementation is
/// planned for Phase 2.5 (render the glyph to a bitmap and match its shape
/// against the database); until then every call reports failure and all
/// arguments are ignored.
#[cfg(feature = "shape-db")]
fn resolve_level4(
    _char_code: &[u8],
    _glyph_id: Option<u16>,
    _fingerprint: Option<&CachedFingerprint>,
) -> ResolvedGlyph {
    // Not yet implemented: always fall through to the "all levels failed" path.
    ResolvedGlyph::failure()
}
|
||||
|
||||
/// Emit the GLYPH_UNMAPPED diagnostic exactly once per (font, code) miss.
|
||||
fn emit_miss_diagnostic(
|
||||
font_id: FontId,
|
||||
char_code: &[u8],
|
||||
cache: &ResolverCache,
|
||||
diagnostics: &mut Vec<Diagnostic>,
|
||||
) {
|
||||
// Only emit once per (font, code) pair
|
||||
if cache.has_emitted_miss(font_id, char_code) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Format char_code as hex string
|
||||
let hex_string: String = char_code
|
||||
.iter()
|
||||
.map(|b| format!("{:02X}", b))
|
||||
.collect();
|
||||
|
||||
let message = format!(
|
||||
"Character code {} could not be resolved to Unicode (font ID: {:?})",
|
||||
hex_string, font_id
|
||||
);
|
||||
|
||||
diagnostics.push(Diagnostic::with_dynamic_no_offset(
|
||||
DiagCode::FontGlyphUnmapped,
|
||||
message,
|
||||
));
|
||||
|
||||
// Mark as emitted
|
||||
cache.mark_emitted_miss(font_id, char_code);
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::font::cmap::parse_to_unicode;
|
||||
use crate::font::encoding::{FontEncoding, NamedEncoding};
|
||||
|
||||
#[test]
|
||||
fn test_unicode_source_confidence() {
|
||||
assert_eq!(UnicodeSource::ToUnicode.confidence(), 1.0);
|
||||
assert_eq!(UnicodeSource::Agl.confidence(), 0.9);
|
||||
assert_eq!(UnicodeSource::Fingerprint.confidence(), 0.85);
|
||||
assert_eq!(UnicodeSource::ShapeMatch.confidence(), 0.7);
|
||||
assert_eq!(UnicodeSource::Unknown.confidence(), 0.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resolved_glyph_failure() {
|
||||
let glyph = ResolvedGlyph::failure();
|
||||
assert!(glyph.is_failure());
|
||||
assert_eq!(glyph.chars.as_slice(), ['\u{FFFD}']);
|
||||
assert_eq!(glyph.source, UnicodeSource::Unknown);
|
||||
assert_eq!(glyph.confidence, 0.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resolved_glyph_new() {
|
||||
let chars = SmallVec::from_slice(&['A', 'B']);
|
||||
let glyph = ResolvedGlyph::new(chars.clone(), UnicodeSource::ToUnicode);
|
||||
assert_eq!(glyph.chars, chars);
|
||||
assert_eq!(glyph.source, UnicodeSource::ToUnicode);
|
||||
assert_eq!(glyph.confidence, 1.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_font_id_from_arc() {
|
||||
let arc = Arc::new(42);
|
||||
let id1 = FontId::from_arc(&arc);
|
||||
let id2 = FontId::from_arc(&arc);
|
||||
assert_eq!(id1, id2);
|
||||
|
||||
let arc2 = Arc::new(42);
|
||||
let id3 = FontId::from_arc(&arc2);
|
||||
assert_ne!(id1, id3); // Different Arc, different ID
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resolver_cache_basic() {
|
||||
let cache = ResolverCache::new();
|
||||
let font_id = FontId::from_arc(&Arc::new("test"));
|
||||
let char_code = vec![0x41];
|
||||
let result = ResolvedGlyph::new(SmallVec::from_slice(&['A']), UnicodeSource::ToUnicode);
|
||||
|
||||
assert!(cache.get(font_id, &char_code).is_none());
|
||||
|
||||
cache.insert(font_id, &char_code, &result);
|
||||
let cached = cache.get(font_id, &char_code);
|
||||
assert!(cached.is_some());
|
||||
assert_eq!(cached.unwrap().chars, SmallVec::<[char; 4]>::from_slice(&['A']));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resolver_cache_miss_tracking() {
|
||||
let cache = ResolverCache::new();
|
||||
let font_id = FontId::from_arc(&Arc::new("test"));
|
||||
let char_code = vec![0x41];
|
||||
|
||||
assert!(!cache.has_emitted_miss(font_id, &char_code));
|
||||
cache.mark_emitted_miss(font_id, &char_code);
|
||||
assert!(cache.has_emitted_miss(font_id, &char_code));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resolve_level1_tounicode() {
|
||||
let cmap_data = b"beginbfchar 1 <00> <0041> endbfchar";
|
||||
let cmap = parse_to_unicode(cmap_data);
|
||||
let result = resolve_level1(&[0x00], Some(&cmap));
|
||||
|
||||
assert!(!result.is_failure());
|
||||
assert_eq!(result.chars.as_slice(), ['A']);
|
||||
assert_eq!(result.source, UnicodeSource::ToUnicode);
|
||||
assert_eq!(result.confidence, 1.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resolve_level1_ligature() {
|
||||
// fi ligature as two separate chars
|
||||
let cmap_data = b"beginbfchar 1 <00> <00660069> endbfchar";
|
||||
let cmap = parse_to_unicode(cmap_data);
|
||||
let result = resolve_level1(&[0x00], Some(&cmap));
|
||||
|
||||
assert!(!result.is_failure());
|
||||
assert_eq!(result.chars.as_slice(), ['f', 'i']);
|
||||
assert_eq!(result.source, UnicodeSource::ToUnicode);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resolve_level1_fallback_on_empty() {
|
||||
// Empty mapping falls through
|
||||
let cmap_data = b"beginbfchar 1 <00> <> endbfchar";
|
||||
let cmap = parse_to_unicode(cmap_data);
|
||||
let result = resolve_level1(&[0x00], Some(&cmap));
|
||||
|
||||
assert!(result.is_failure());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resolve_level1_fallback_on_fffd() {
|
||||
// U+FFFD falls through
|
||||
let cmap_data = b"beginbfchar 1 <00> <FFFD> endbfchar";
|
||||
let cmap = parse_to_unicode(cmap_data);
|
||||
let result = resolve_level1(&[0x00], Some(&cmap));
|
||||
|
||||
assert!(result.is_failure());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resolve_level1_no_cmap() {
|
||||
let result = resolve_level1(&[0x41], None);
|
||||
assert!(result.is_failure());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resolve_level1_not_in_cmap() {
|
||||
let cmap_data = b"beginbfchar 1 <00> <0041> endbfchar";
|
||||
let cmap = parse_to_unicode(cmap_data);
|
||||
let result = resolve_level1(&[0x01], Some(&cmap));
|
||||
|
||||
assert!(result.is_failure());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resolve_level2_agl() {
|
||||
let encoding = FontEncoding::new(Some(NamedEncoding::WinAnsi));
|
||||
let result = resolve_level2(&[0x41], Some(&encoding));
|
||||
|
||||
// 0x41 in WinAnsi is 'A'
|
||||
assert!(!result.is_failure());
|
||||
assert_eq!(result.source, UnicodeSource::Agl);
|
||||
assert_eq!(result.confidence, 0.9);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resolve_level2_multi_byte_fails() {
|
||||
// Multi-byte codes not supported in Level 2
|
||||
let encoding = FontEncoding::new(Some(NamedEncoding::WinAnsi));
|
||||
let result = resolve_level2(&[0x00, 0x41], Some(&encoding));
|
||||
assert!(result.is_failure());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resolve_level2_no_encoding() {
|
||||
let result = resolve_level2(&[0x41], None);
|
||||
assert!(result.is_failure());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resolve_level2_unmapped_code() {
|
||||
// Most codes in StandardEncoding are unmapped above 0x7F
|
||||
let encoding = FontEncoding::new(Some(NamedEncoding::Standard));
|
||||
let result = resolve_level2(&[0x80], Some(&encoding));
|
||||
assert!(result.is_failure());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_resolve_unicode_full_hit() {
|
||||
let mut diagnostics = Vec::new();
|
||||
let font_id = FontId::from_arc(&Arc::new("test"));
|
||||
|
||||
// Set up ToUnicode
|
||||
let cmap_data = b"beginbfchar 1 <41> <0041> endbfchar";
|
||||
let cmap = parse_to_unicode(cmap_data);
|
||||
|
||||
let font = Font::new(font_id, Some(cmap), None, None, false);
|
||||
|
||||
let result = resolve_unicode(&font, &[0x41], None, &mut diagnostics);
|
||||
|
||||
assert!(!result.is_failure());
|
||||
assert_eq!(result.source, UnicodeSource::ToUnicode);
|
||||
assert!(diagnostics.is_empty());
|
||||
}
|
||||
|
||||
#[test]
fn test_resolve_unicode_caching() {
    let mut diagnostics = Vec::new();
    // Keep the Arc alive so the id stays tied to a live allocation.
    let font_handle = Arc::new("test");
    let font_id = FontId::from_arc(&font_handle);

    let cmap_data = b"beginbfchar 1 <41> <0041> endbfchar";
    let cmap = parse_to_unicode(cmap_data);

    let font = Font::new(font_id, Some(cmap), None, None, false);
    assert!(font.cache().is_empty());

    // First call - not cached
    let result1 = resolve_unicode(&font, &[0x41], None, &mut diagnostics);
    assert_eq!(font.cache().len(), 1);

    // Second call - served from the cache
    let result2 = resolve_unicode(&font, &[0x41], None, &mut diagnostics);

    // A cache hit must be identical to the miss in every field, not just chars.
    assert_eq!(result1, result2);
    assert_eq!(font.cache().len(), 1);
    assert!(diagnostics.is_empty());
}
|
||||
|
||||
#[test]
|
||||
fn test_resolve_unicode_miss_emits_once() {
|
||||
let mut diagnostics = Vec::new();
|
||||
let font_id = FontId::from_arc(&Arc::new("test"));
|
||||
|
||||
// No ToUnicode, no encoding -> miss
|
||||
let font = Font::new(font_id, None, None, None, false);
|
||||
|
||||
let result = resolve_unicode(&font, &[0x41], None, &mut diagnostics);
|
||||
|
||||
assert!(result.is_failure());
|
||||
assert_eq!(diagnostics.len(), 1);
|
||||
assert_eq!(diagnostics[0].code, DiagCode::FontGlyphUnmapped);
|
||||
|
||||
// Second call for same code should not emit again
|
||||
let result2 = resolve_unicode(&font, &[0x41], None, &mut diagnostics);
|
||||
|
||||
assert!(result2.is_failure());
|
||||
assert_eq!(diagnostics.len(), 1); // Still 1
|
||||
}
|
||||
|
||||
#[test]
fn test_resolve_unicode_different_fonts_separate_misses() {
    let mut diagnostics = Vec::new();

    // Both Arcs must stay alive for the whole test: a FontId is just the
    // allocation address, and an Arc dropped right after `from_arc` could hand
    // its address to the next allocation, giving both fonts the same id.
    let handle1 = Arc::new("font1");
    let handle2 = Arc::new("font2");
    let font_id1 = FontId::from_arc(&handle1);
    let font_id2 = FontId::from_arc(&handle2);
    assert_ne!(font_id1, font_id2);

    let font1 = Font::new(font_id1, None, None, None, false);
    let font2 = Font::new(font_id2, None, None, None, false);

    // Both fonts miss on same code
    let result1 = resolve_unicode(&font1, &[0x41], None, &mut diagnostics);
    let result2 = resolve_unicode(&font2, &[0x41], None, &mut diagnostics);

    assert!(result1.is_failure());
    assert!(result2.is_failure());
    assert_eq!(diagnostics.len(), 2); // One per font
}
|
||||
|
||||
#[test]
|
||||
fn test_resolve_unicode_fallback_chain() {
|
||||
let mut diagnostics = Vec::new();
|
||||
let font_id = FontId::from_arc(&Arc::new("test"));
|
||||
|
||||
// L1: No ToUnicode -> fall through
|
||||
// L2: WinAnsi encoding with 'A' at 0x41
|
||||
let encoding = FontEncoding::new(Some(NamedEncoding::WinAnsi));
|
||||
|
||||
let font = Font::new(font_id, None, Some(encoding), None, false);
|
||||
|
||||
let result = resolve_unicode(&font, &[0x41], None, &mut diagnostics);
|
||||
|
||||
assert!(!result.is_failure());
|
||||
assert_eq!(result.source, UnicodeSource::Agl);
|
||||
assert!(diagnostics.is_empty());
|
||||
}
|
||||
|
||||
#[test]
fn test_resolve_unicode_level3_with_glyph_id() {
    let mut diagnostics = Vec::new();
    let font_handle = Arc::new("test");
    let font_id = FontId::from_arc(&font_handle);

    // Embedded program but no fingerprint: Level 3 is attempted and cannot
    // succeed. A positive Level 3 hit needs a real fingerprint database entry
    // and is not covered here.
    let font = Font::new(font_id, None, None, None, true);

    // No glyph_id -> L3 falls through
    let without_gid = resolve_unicode(&font, &[0x41], None, &mut diagnostics);
    assert!(without_gid.is_failure());
    assert_eq!(without_gid.source, UnicodeSource::Unknown);
    assert_eq!(diagnostics.len(), 1);

    // glyph_id supplied but no fingerprint -> L3 still falls through.
    // A different code is used so the cached failure above is not returned.
    let with_gid = resolve_unicode(&font, &[0x42], Some(7), &mut diagnostics);
    assert!(with_gid.is_failure());
    assert_eq!(diagnostics.len(), 2);
}
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue