diff --git a/.needle-predispatch-sha b/.needle-predispatch-sha
index 5734f87..732c07e 100644
--- a/.needle-predispatch-sha
+++ b/.needle-predispatch-sha
@@ -1 +1 @@
-e3a149fbf8f56a4e05881a92d45663b9c9bd3878
+6156381e783cb0e310cd3b7c3552b426a9ed0d28
diff --git a/crates/pdftract-core/src/diagnostics.rs b/crates/pdftract-core/src/diagnostics.rs
index a8a5022..29de9a2 100644
--- a/crates/pdftract-core/src/diagnostics.rs
+++ b/crates/pdftract-core/src/diagnostics.rs
@@ -543,6 +543,14 @@ pub enum DiagCode {
     /// Phase origin: 2.1
     FontCidtogidmapTruncated,
 
+    /// Character code in /Differences array exceeds valid range
+    ///
+    /// Emitted when a /Differences array contains an integer code outside the
+    /// valid range for single-byte encodings (0-255). The code is clamped to u8.
+    ///
+    /// Phase origin: 2.2
+    FontEncodingDifferenceOutOfRange,
+
     // === OCR_* codes ===
 
     /// JBIG2 decoder not available
@@ -798,7 +806,8 @@ impl DiagCode {
             | DiagCode::FontInvalidCmap
             | DiagCode::FontParseFailed
             | DiagCode::FontUnsupported
-            | DiagCode::FontCidtogidmapTruncated => "FONT",
+            | DiagCode::FontCidtogidmapTruncated
+            | DiagCode::FontEncodingDifferenceOutOfRange => "FONT",
 
             // OCR_*
             DiagCode::OcrJbig2Unsupported
@@ -889,6 +898,7 @@ impl DiagCode {
             DiagCode::FontParseFailed => "FONT_PARSE_FAILED",
             DiagCode::FontUnsupported => "FONT_UNSUPPORTED",
             DiagCode::FontCidtogidmapTruncated => "FONT_CIDTOGIDMAP_TRUNCATED",
+            DiagCode::FontEncodingDifferenceOutOfRange => "FONT_ENCODING_DIFFERENCE_OUT_OF_RANGE",
             DiagCode::OcrJbig2Unsupported => "OCR_JBIG2_UNSUPPORTED",
             DiagCode::OcrJpxUnsupported => "OCR_JPX_UNSUPPORTED",
             DiagCode::OcrCcittUnsupported => "OCR_CCITT_UNSUPPORTED",
@@ -965,6 +975,7 @@ impl DiagCode {
             | DiagCode::FontParseFailed
             | DiagCode::FontUnsupported
             | DiagCode::FontCidtogidmapTruncated
+            | DiagCode::FontEncodingDifferenceOutOfRange
             | DiagCode::OcrJbig2Unsupported
             | DiagCode::OcrJpxUnsupported
             | DiagCode::OcrCcittUnsupported
@@ -1409,6 +1420,14 @@ pub const DIAGNOSTIC_CATALOG: &[DiagInfo] = &[
         phase: "2.1",
         suggested_action: "The CIDToGIDMap stream has an odd byte count; the trailing byte was discarded",
     },
+    DiagInfo {
+        code: DiagCode::FontEncodingDifferenceOutOfRange,
+        category: "FONT",
+        severity: Severity::Warning,
+        recoverable: true,
+        phase: "2.2",
+        suggested_action: "A /Differences array contains a character code outside 0-255; the code was clamped",
+    },
     // === OCR_* codes ===
     DiagInfo {
         code: DiagCode::OcrJbig2Unsupported,
diff --git a/crates/pdftract-core/src/font/encoding.rs b/crates/pdftract-core/src/font/encoding.rs
index 74841ce..3326dc0 100644
--- a/crates/pdftract-core/src/font/encoding.rs
+++ b/crates/pdftract-core/src/font/encoding.rs
@@ -11,6 +11,11 @@
 //! These tables map character codes (0-255) to glyph names, which are then
 //! mapped to Unicode via the Adobe Glyph List (AGL).
 
+use std::sync::Arc;
+
+use crate::diagnostics::{DiagCode, Diagnostic};
+use crate::parser::object::types::{PdfObject, PdfDict};
+
 include!(concat!(env!("OUT_DIR"), "/named_encodings.rs"));
 
 /// Named encoding for Type1 fonts.
@@ -104,6 +109,260 @@ impl NamedEncoding {
     }
 }
 
+/// Sparse overlay of glyph name assignments from a /Differences array.
+///
+/// The /Differences array sparsely overrides specific character codes with custom
+/// glyph names on top of a base encoding. Format: `[n /Name1 /Name2 ... m /OtherName ...]`
+/// where each integer resets the position and subsequent names are assigned to consecutive codes.
+///
+/// # Example
+///
+/// A Differences array `[ 39 /quotesingle 96 /grave ]` creates:
+/// - code 39 → "quotesingle"
+/// - code 96 → "grave"
+///
+/// # Lookup behavior
+///
+/// The overlay is sparse; most codes are not present. Use `get()` to check for an override,
+/// which returns `None` when the code has no override (the base encoding then applies).
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct DifferencesOverlay {
+    /// Sparse list of (code, glyph_name) overrides.
+    /// Kept in /Differences array order (not sorted); `get()` scans from the end so the last assignment wins.
+    entries: Vec<(u8, Arc<str>)>,
+}
+
+impl DifferencesOverlay {
+    /// Create an empty overlay.
+    pub fn new() -> Self {
+        Self { entries: Vec::new() }
+    }
+
+    /// Parse a /Differences array into an overlay.
+    ///
+    /// The array alternates between integers (starting codes) and names (glyph names).
+    /// Each integer resets the cursor, and subsequent names are assigned to consecutive codes.
+    ///
+    /// # Arguments
+    ///
+    /// * `diff_array` - The /Differences array from the font's Encoding dictionary
+    /// * `diagnostics` - Diagnostic list for parsing errors
+    ///
+    /// # Returns
+    ///
+    /// A `DifferencesOverlay` with parsed entries. Out-of-range codes are clamped with a diagnostic; other objects are skipped.
+    ///
+    /// # Example
+    ///
+    /// ```text
+    /// // [ 39 /quotesingle 96 /grave ]
+    /// // → entries: [(39, "quotesingle"), (96, "grave")]
+    /// ```
+    pub fn parse(diff_array: &PdfObject, diagnostics: &mut Vec<Diagnostic>) -> Self {
+        let mut overlay = Self::new();
+
+        let PdfObject::Array(arr) = diff_array else {
+            return overlay;
+        };
+
+        let mut cursor: u32 = 0;
+
+        for (i, obj) in arr.iter().enumerate() {
+            match obj {
+                PdfObject::Integer(code) => {
+                    // Clamp to u8 range and emit diagnostic if out of range
+                    if *code < 0 {
+                        diagnostics.push(Diagnostic::with_dynamic_no_offset(
+                            DiagCode::FontEncodingDifferenceOutOfRange,
+                            format!("/Differences array at index {i} has negative integer {code}, clamping to 0"),
+                        ));
+                        cursor = 0;
+                    } else if *code > 255 {
+                        diagnostics.push(Diagnostic::with_dynamic_no_offset(
+                            DiagCode::FontEncodingDifferenceOutOfRange,
+                            format!("/Differences array at index {i} has code {code} > 255, clamping to 255"),
+                        ));
+                        cursor = 255;
+                    } else {
+                        cursor = *code as u32;
+                    }
+                }
+                PdfObject::Name(name) => {
+                    // Assign this name to the current cursor position
+                    if cursor <= 255 {
+                        overlay.entries.push((cursor as u8, Arc::clone(name)));
+                    }
+                    cursor = cursor.saturating_add(1);
+                }
+                _ => {
+                    // Skip non-integer, non-name objects
+                    // (this is technically a PDF spec violation, but we recover)
+                }
+            }
+        }
+
+        overlay
+    }
+
+    /// Get the glyph name override for a character code.
+    ///
+    /// Returns `Some(name)` if this code has an override, `None` otherwise.
+    /// The returned name may not be in the AGL; the resolver must handle that.
+    pub fn get(&self, code: u8) -> Option<Arc<str>> {
+        // Scan from the back: if the array assigned this code more than once, the later entry wins
+        self.entries
+            .iter()
+            .rfind(|(c, _)| *c == code)
+            .map(|(_, name)| Arc::clone(name))
+    }
+
+    /// Check if the overlay has any entries.
+    pub fn is_empty(&self) -> bool {
+        self.entries.is_empty()
+    }
+
+    /// Get the number of entries in the overlay.
+    pub fn len(&self) -> usize {
+        self.entries.len()
+    }
+}
+
+impl Default for DifferencesOverlay {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Combined font encoding with base encoding and /Differences overlay.
+///
+/// PDF font encodings are composed of:
+/// 1. A base named encoding (WinAnsi, Standard, etc.) - optional
+/// 2. A /Differences overlay that overrides specific codes - optional
+///
+/// When both are present, the overlay takes precedence. The lookup order is:
+/// 1. Check /Differences overlay for an override
+/// 2. Fall back to base encoding table
+/// 3. Return None if neither has the code
+///
+/// # Default base encoding
+///
+/// When neither `/Encoding/BaseEncoding` nor `/Encoding` is present:
+/// - Type1 fonts: StandardEncoding
+/// - TrueType fonts: The font's built-in encoding (often MacRoman or WinAnsi)
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct FontEncoding {
+    /// Base named encoding (WinAnsi, Standard, etc.) - None for identity/implicit
+    base: Option<NamedEncoding>,
+    /// Sparse overrides from /Differences array
+    differences: DifferencesOverlay,
+}
+
+impl FontEncoding {
+    /// Create a new font encoding with the given base and empty differences.
+    pub fn new(base: Option<NamedEncoding>) -> Self {
+        Self {
+            base,
+            differences: DifferencesOverlay::new(),
+        }
+    }
+
+    /// Create a font encoding by parsing the /Encoding dictionary from a font.
+    ///
+    /// This handles all the encoding indirection patterns:
+    /// - `/Encoding` is a name → use that named encoding directly
+    /// - `/Encoding` is a dict with `/BaseEncoding` → use base + /Differences
+    /// - `/Encoding` is a dict without `/BaseEncoding` → use implicit base + /Differences
+    /// - No `/Encoding` key → use default base (Standard for Type1, built-in for TrueType)
+    ///
+    /// # Arguments
+    ///
+    /// * `font_dict` - The font dictionary from the PDF resource dictionary
+    /// * `default_base` - Default base encoding when /Encoding is absent (Standard for Type1)
+    /// * `diagnostics` - Diagnostic list for parsing errors
+    ///
+    /// # Returns
+    ///
+    /// A `FontEncoding` with parsed base encoding and differences overlay.
+    pub fn parse_from_font(
+        font_dict: &PdfDict,
+        default_base: Option<NamedEncoding>,
+        diagnostics: &mut Vec<Diagnostic>,
+    ) -> Self {
+        // Get the /Encoding entry
+        let encoding_obj = match font_dict.get("/Encoding") {
+            Some(obj) => obj,
+            None => return Self::new(default_base),
+        };
+
+        match encoding_obj {
+            // Case 1: /Encoding is a name → use that named encoding directly
+            PdfObject::Name(name) => {
+                let base = NamedEncoding::from_name(name.as_ref());
+                Self::new(base.or(default_base))
+            }
+
+            // Case 2: /Encoding is a dict → read /BaseEncoding and /Differences
+            PdfObject::Dict(encoding_dict) => {
+                // Parse /BaseEncoding (if present)
+                let base = encoding_dict
+                    .get("/BaseEncoding")
+                    .and_then(|obj| obj.as_name())
+                    .and_then(|name| NamedEncoding::from_name(name.as_ref()))
+                    .or(default_base);
+
+                // Parse /Differences (if present)
+                let differences = encoding_dict
+                    .get("/Differences")
+                    .map(|diff| DifferencesOverlay::parse(diff, diagnostics))
+                    .unwrap_or_default();
+
+                Self { base, differences }
+            }
+
+            // Case 3: /Encoding is an indirect reference → would need resolution
+            // For now, treat as missing and use default
+            PdfObject::Ref(_) => Self::new(default_base),
+
+            // Invalid /Encoding type → use default
+            _ => Self::new(default_base),
+        }
+    }
+
+    /// Get the glyph name for a character code.
+    ///
+    /// Lookup order:
+    /// 1. Check /Differences overlay for an override
+    /// 2. Fall back to base encoding table
+    /// 3. Return None if neither has the code
+    ///
+    /// Returns `Some(name)` if found, `None` if not mapped.
+    /// The returned name may not be in the AGL; the resolver must handle that.
+    pub fn glyph_name_for(&self, code: u8) -> Option<Arc<str>> {
+        // Check differences overlay first
+        if let Some(name) = self.differences.get(code) {
+            return Some(name);
+        }
+
+        // Fall back to base encoding
+        self.base.and_then(|enc| enc.glyph_name(code).map(Arc::from))
+    }
+
+    /// Check if this encoding has a differences overlay.
+    pub fn has_differences(&self) -> bool {
+        !self.differences.is_empty()
+    }
+
+    /// Get the base encoding.
+    pub fn base_encoding(&self) -> Option<NamedEncoding> {
+        self.base
+    }
+
+    /// Get a reference to the differences overlay.
+    pub fn differences(&self) -> &DifferencesOverlay {
+        &self.differences
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -176,4 +435,198 @@ mod tests {
         assert_eq!(enc.glyph_name(0x80), None);
         assert_eq!(enc.glyph_name(0x92), None); // WinAnsi has this, Standard doesn't
     }
+
+    // === DifferencesOverlay tests ===
+
+    #[test]
+    fn test_differences_overlay_parse_simple() {
+        // [ 39 /quotesingle 96 /grave ]
+        let mut diagnostics = Vec::new();
+        let arr = PdfObject::Array(Box::new(vec![
+            PdfObject::Integer(39),
+            PdfObject::Name(Arc::from("quotesingle")),
+            PdfObject::Integer(96),
+            PdfObject::Name(Arc::from("grave")),
+        ]));
+
+        let overlay = DifferencesOverlay::parse(&arr, &mut diagnostics);
+
+        assert_eq!(overlay.get(39), Some(Arc::from("quotesingle")));
+        assert_eq!(overlay.get(96), Some(Arc::from("grave")));
+        assert_eq!(overlay.get(40), None);
+        assert_eq!(overlay.len(), 2);
+        assert!(diagnostics.is_empty());
+    }
+
+    #[test]
+    fn test_differences_overlay_parse_consecutive() {
+        // [ 39 /a /b /c ]
+        let mut diagnostics = Vec::new();
+        let arr = PdfObject::Array(Box::new(vec![
+            PdfObject::Integer(39),
+            PdfObject::Name(Arc::from("a")),
+            PdfObject::Name(Arc::from("b")),
+            PdfObject::Name(Arc::from("c")),
+        ]));
+
+        let overlay = DifferencesOverlay::parse(&arr, &mut diagnostics);
+
+        assert_eq!(overlay.get(39), Some(Arc::from("a")));
+        assert_eq!(overlay.get(40), Some(Arc::from("b")));
+        assert_eq!(overlay.get(41), Some(Arc::from("c")));
+        assert_eq!(overlay.get(42), None);
+        assert_eq!(overlay.len(), 3);
+    }
+
+    #[test]
+    fn test_differences_overlay_parse_multiple_blocks() {
+        // [ 39 /a /b 100 /x /y ]
+        let mut diagnostics = Vec::new();
+        let arr = PdfObject::Array(Box::new(vec![
+            PdfObject::Integer(39),
+            PdfObject::Name(Arc::from("a")),
+            PdfObject::Name(Arc::from("b")),
+            PdfObject::Integer(100),
+            PdfObject::Name(Arc::from("x")),
+            PdfObject::Name(Arc::from("y")),
+        ]));
+
+        let overlay = DifferencesOverlay::parse(&arr, &mut diagnostics);
+
+        assert_eq!(overlay.get(39), Some(Arc::from("a")));
+        assert_eq!(overlay.get(40), Some(Arc::from("b")));
+        assert_eq!(overlay.get(100), Some(Arc::from("x")));
+        assert_eq!(overlay.get(101), Some(Arc::from("y")));
+        assert_eq!(overlay.len(), 4);
+    }
+
+    #[test]
+    fn test_differences_overlay_out_of_range_positive() {
+        // Code > 255 should emit diagnostic and clamp
+        let mut diagnostics = Vec::new();
+        let arr = PdfObject::Array(Box::new(vec![
+            PdfObject::Integer(300),
+            PdfObject::Name(Arc::from("a")),
+        ]));
+
+        let overlay = DifferencesOverlay::parse(&arr, &mut diagnostics);
+
+        assert_eq!(overlay.get(255), Some(Arc::from("a")));
+        assert_eq!(diagnostics.len(), 1);
+        assert_eq!(diagnostics[0].code, DiagCode::FontEncodingDifferenceOutOfRange);
+    }
+
+    #[test]
+    fn test_differences_overlay_out_of_range_negative() {
+        // Negative code should emit diagnostic and clamp to 0
+        let mut diagnostics = Vec::new();
+        let arr = PdfObject::Array(Box::new(vec![
+            PdfObject::Integer(-5),
+            PdfObject::Name(Arc::from("a")),
+        ]));
+
+        let overlay = DifferencesOverlay::parse(&arr, &mut diagnostics);
+
+        assert_eq!(overlay.get(0), Some(Arc::from("a")));
+        assert_eq!(diagnostics.len(), 1);
+        assert_eq!(diagnostics[0].code, DiagCode::FontEncodingDifferenceOutOfRange);
+    }
+
+    #[test]
+    fn test_differences_overlay_empty() {
+        let mut diagnostics = Vec::new();
+        let arr = PdfObject::Array(Box::new(vec![]));
+
+        let overlay = DifferencesOverlay::parse(&arr, &mut diagnostics);
+
+        assert!(overlay.is_empty());
+        assert_eq!(overlay.len(), 0);
+        assert!(diagnostics.is_empty());
+    }
+
+    #[test]
+    fn test_differences_overlay_default() {
+        let overlay = DifferencesOverlay::default();
+        assert!(overlay.is_empty());
+        assert_eq!(overlay.get(0), None);
+    }
+
+    // === FontEncoding tests ===
+
+    #[test]
+    fn test_font_encoding_new() {
+        let enc = FontEncoding::new(Some(NamedEncoding::WinAnsi));
+        assert_eq!(enc.base_encoding(), Some(NamedEncoding::WinAnsi));
+        assert!(!enc.has_differences());
+    }
+
+    #[test]
+    fn test_font_encoding_glyph_name_base_only() {
+        let enc = FontEncoding::new(Some(NamedEncoding::WinAnsi));
+        assert_eq!(enc.glyph_name_for(0x92), Some(Arc::from("quoteright")));
+        assert_eq!(enc.glyph_name_for(0x80), Some(Arc::from("Euro")));
+    }
+
+    #[test]
+    fn test_font_encoding_glyph_name_with_differences() {
+        // Base encoding has 0x92 = quoteright, but difference overrides it
+        let mut differences = DifferencesOverlay::new();
+        differences.entries.push((0x92, Arc::from("customquote")));
+
+        let enc = FontEncoding {
+            base: Some(NamedEncoding::WinAnsi),
+            differences,
+        };
+
+        assert_eq!(enc.glyph_name_for(0x92), Some(Arc::from("customquote")));
+        // Non-overlaid codes still use base
+        assert_eq!(enc.glyph_name_for(0x80), Some(Arc::from("Euro")));
+    }
+
+    #[test]
+    fn test_font_encoding_glyph_name_no_base() {
+        // No base encoding, only differences
+        let mut differences = DifferencesOverlay::new();
+        differences.entries.push((0x20, Arc::from("space")));
+
+        let enc = FontEncoding {
+            base: None,
+            differences,
+        };
+
+        assert_eq!(enc.glyph_name_for(0x20), Some(Arc::from("space")));
+        assert_eq!(enc.glyph_name_for(0x21), None); // Not in differences, no base
+    }
+
+    #[test]
+    fn test_font_encoding_unknown_glyph_name() {
+        // Differences can contain arbitrary glyph names not in AGL
+        let mut differences = DifferencesOverlay::new();
+        differences.entries.push((0x20, Arc::from("ArbitraryCustomGlyph")));
+
+        let enc = FontEncoding {
+            base: None,
+            differences,
+        };
+
+        // Should return the custom name, not None
+        assert_eq!(enc.glyph_name_for(0x20), Some(Arc::from("ArbitraryCustomGlyph")));
+    }
+
+    #[test]
+    fn test_font_encoding_lookup_order() {
+        // Differences should take precedence over base encoding
+        let mut differences = DifferencesOverlay::new();
+        // WinAnsi has 0x92 = quoteright, override it
+        differences.entries.push((0x92, Arc::from("override")));
+
+        let enc = FontEncoding {
+            base: Some(NamedEncoding::WinAnsi),
+            differences,
+        };
+
+        assert_eq!(enc.glyph_name_for(0x92), Some(Arc::from("override")));
+        // Base encoding still works for non-overlaid codes
+        assert_eq!(enc.glyph_name_for(0x80), Some(Arc::from("Euro")));
+    }
 }
diff --git a/crates/pdftract-core/src/font/mod.rs b/crates/pdftract-core/src/font/mod.rs
index c107f53..2c99a8d 100644
--- a/crates/pdftract-core/src/font/mod.rs
+++ b/crates/pdftract-core/src/font/mod.rs
@@ -12,7 +12,7 @@ pub mod encoding;
 pub use embedded::{EmbeddedFont, FontMetrics, EmptyFontMetrics, GlyphBbox};
 pub use type0::{Type0Font, DescendantCIDFont, CIDToGIDMap};
 pub use cmap::{ToUnicodeMap, parse_to_unicode, parse_to_unicode_with_diags};
-pub use encoding::{NamedEncoding};
+pub use encoding::{NamedEncoding, DifferencesOverlay, FontEncoding};
 
 use crate::parser::object::types::{PdfDict, PdfObject};
 