From 77304153fc58c5e221be8eeebd1ccf3c0c4edbd6 Mon Sep 17 00:00:00 2001 From: jedarden Date: Sat, 23 May 2026 15:23:01 -0400 Subject: [PATCH] feat(pdftract-5sh): CIDToGIDMap resolver for CIDFontType2 Implements CIDToGIDMap resolver with Identity and stream forms: - Identity: zero-allocation short-circuit (GID == CID) - Stream: parses 2-byte big-endian GID values into Box<[u16]> - Emits CIDTOGIDMAP_TRUNCATED diagnostic on odd-byte-count input - Out-of-range CID returns GID 0 (notdef glyph) without panic Acceptance criteria: - Identity form: lookup of any CID returns same value as u16 - Stream form: synthetic 3-CID array decodes correctly [0, 5, 10] - Out-of-range CID returns GID 0 with no panic - Diagnostic CIDTOGIDMAP_TRUNCATED emitted on odd-byte-count input Refs: pdftract-5sh, Phase 2.1 line 1315 --- crates/pdftract-core/src/diagnostics.rs | 37 +++++- crates/pdftract-core/src/font/type0.rs | 161 ++++++++++++++++++------ notes/pdftract-5sh.md | 64 ++++++++++ 3 files changed, 226 insertions(+), 36 deletions(-) create mode 100644 notes/pdftract-5sh.md diff --git a/crates/pdftract-core/src/diagnostics.rs b/crates/pdftract-core/src/diagnostics.rs index 3c9dd57..b8557dc 100644 --- a/crates/pdftract-core/src/diagnostics.rs +++ b/crates/pdftract-core/src/diagnostics.rs @@ -527,6 +527,14 @@ pub enum DiagCode { /// Phase origin: 2.1 FontUnsupported, + /// CIDToGIDMap stream has odd byte count (truncated GID entry) + /// + /// Emitted when a CIDToGIDMap stream has an odd number of bytes, meaning + /// the last GID entry is incomplete. The trailing byte is discarded. 
+ /// + /// Phase origin: 2.1 + FontCidtogidmapTruncated, + // === OCR_* codes === /// JBIG2 decoder not available @@ -754,7 +762,8 @@ impl DiagCode { | DiagCode::FontNotFound | DiagCode::FontInvalidCmap | DiagCode::FontParseFailed - | DiagCode::FontUnsupported => "FONT", + | DiagCode::FontUnsupported + | DiagCode::FontCidtogidmapTruncated => "FONT", // OCR_* DiagCode::OcrJbig2Unsupported @@ -839,6 +848,7 @@ impl DiagCode { DiagCode::FontInvalidCmap => "FONT_INVALID_CMAP", DiagCode::FontParseFailed => "FONT_PARSE_FAILED", DiagCode::FontUnsupported => "FONT_UNSUPPORTED", + DiagCode::FontCidtogidmapTruncated => "FONT_CIDTOGIDMAP_TRUNCATED", DiagCode::OcrJbig2Unsupported => "OCR_JBIG2_UNSUPPORTED", DiagCode::OcrJpxUnsupported => "OCR_JPX_UNSUPPORTED", DiagCode::OcrCcittUnsupported => "OCR_CCITT_UNSUPPORTED", @@ -910,6 +920,7 @@ impl DiagCode { | DiagCode::FontInvalidCmap | DiagCode::FontParseFailed | DiagCode::FontUnsupported + | DiagCode::FontCidtogidmapTruncated | DiagCode::OcrJbig2Unsupported | DiagCode::OcrJpxUnsupported | DiagCode::OcrCcittUnsupported @@ -1327,6 +1338,30 @@ pub const DIAGNOSTIC_CATALOG: &[DiagInfo] = &[ phase: "2.2", suggested_action: "The CMap stream is malformed; it's treated as empty", }, + DiagInfo { + code: DiagCode::FontParseFailed, + category: "FONT", + severity: Severity::Warning, + recoverable: true, + phase: "2.1", + suggested_action: "The embedded font program is corrupt or invalid; the font is treated as having no glyph mappings", + }, + DiagInfo { + code: DiagCode::FontUnsupported, + category: "FONT", + severity: Severity::Warning, + recoverable: true, + phase: "2.1", + suggested_action: "A font type was encountered that doesn't support embedded font program loading", + }, + DiagInfo { + code: DiagCode::FontCidtogidmapTruncated, + category: "FONT", + severity: Severity::Warning, + recoverable: true, + phase: "2.1", + suggested_action: "The CIDToGIDMap stream has an odd byte count; the trailing byte was discarded", + }, // === OCR_* 
codes === DiagInfo { code: DiagCode::OcrJbig2Unsupported, diff --git a/crates/pdftract-core/src/font/type0.rs b/crates/pdftract-core/src/font/type0.rs index 755b3c9..9c41765 100644 --- a/crates/pdftract-core/src/font/type0.rs +++ b/crates/pdftract-core/src/font/type0.rs @@ -62,7 +62,7 @@ pub enum CIDToGIDMap { /// Identity mapping: GID == CID (most common for CIDFontType2). Identity, /// Custom mapping from a stream (2-byte big-endian GID values). - Custom(Vec), + Array(Box<[u16]>), } impl CIDToGIDMap { @@ -77,16 +77,10 @@ impl CIDToGIDMap { None } } - CIDToGIDMap::Custom(data) => { - // Data is a flat array of 2-byte big-endian GID values - // Indexed by CID: data[CID*2 .. CID*2+2] - let idx = (cid as usize) * 2; - if idx + 2 <= data.len() { - let gid = u16::from_be_bytes([data[idx], data[idx + 1]]); - Some(gid) - } else { - None - } + CIDToGIDMap::Array(arr) => { + // Direct index into the pre-parsed u16 array + // GID 0 is the .notdef glyph by convention + arr.get(cid as usize).copied().or(Some(0)) } } } @@ -236,7 +230,7 @@ impl Type0Font { // Load CIDToGIDMap for CIDFontType2 let cid_to_gid_map = if subtype == FontKind::CIDFontType2 { - Some(Self::load_cid_to_gid_map(cidfont_dict, source, opts, doc_counter)?) + Some(Self::load_cid_to_gid_map(cidfont_dict, source, opts, doc_counter, &mut diagnostics)?) } else { None }; @@ -359,11 +353,14 @@ impl Type0Font { /// Load the CIDToGIDMap from a CIDFontType2 dictionary. /// /// Returns the appropriate CIDToGIDMap variant. + /// + /// Emits `CIDTOGIDMAP_TRUNCATED` diagnostic if the stream has an odd byte count. 
fn load_cid_to_gid_map( cidfont_dict: &PdfDict, source: &dyn crate::parser::stream::PdfSource, opts: &ExtractionOptions, doc_counter: &mut u64, + diagnostics: &mut Vec, ) -> Type0Result { match cidfont_dict.get("/CIDToGIDMap") { Some(PdfObject::Name(name)) => { @@ -387,7 +384,23 @@ impl Type0Font { if data.is_empty() { Ok(CIDToGIDMap::Identity) } else { - Ok(CIDToGIDMap::Custom(data)) + // Check for odd byte count + let truncated = data.len() % 2 != 0; + if truncated { + diagnostics.push(Diagnostic::with_static_no_offset( + DiagCode::FontCidtogidmapTruncated, + "CIDToGIDMap stream has odd byte count; trailing byte discarded", + )); + } + + // Parse into u16 array (big-endian) + let len = data.len() / 2; + let mut arr = Vec::with_capacity(len); + for i in 0..len { + let gid = u16::from_be_bytes([data[i * 2], data[i * 2 + 1]]); + arr.push(gid); + } + Ok(CIDToGIDMap::Array(arr.into_boxed_slice())) } } Some(PdfObject::Ref(_)) => { @@ -496,26 +509,26 @@ mod tests { } #[test] - fn test_cid_to_gid_map_custom() { - // Create a simple custom map: [0x0000, 0x0001, 0x0002, 0x0003] + fn test_cid_to_gid_map_array() { + // Create a simple custom map: [0, 1, 2, 3] // Maps CID 0 -> GID 0, CID 1 -> GID 1, etc. 
- let data = vec![0x00, 0x00, 0x00, 0x01, 0x00, 0x02, 0x00, 0x03]; - let map = CIDToGIDMap::Custom(data); + let arr = vec![0u16, 1, 2, 3].into_boxed_slice(); + let map = CIDToGIDMap::Array(arr); assert_eq!(map.get(0), Some(0)); assert_eq!(map.get(1), Some(1)); assert_eq!(map.get(2), Some(2)); assert_eq!(map.get(3), Some(3)); - assert_eq!(map.get(4), None); // Out of range + // Out of range returns GID 0 (notdef), not None + assert_eq!(map.get(4), Some(0)); } #[test] - fn test_cid_to_gid_map_custom_big_endian() { - // Test big-endian decoding: CID 5 should map to GID 0x1234 - let mut data = vec![0u8; 12]; // Room for 6 GIDs - data[10] = 0x12; - data[11] = 0x34; - let map = CIDToGIDMap::Custom(data); + fn test_cid_to_gid_map_array_big_endian() { + // Test that parsed values are correct: CID 5 should map to GID 0x1234 + let mut arr = vec![0u16; 6]; + arr[5] = 0x1234; + let map = CIDToGIDMap::Array(arr.into_boxed_slice()); assert_eq!(map.get(5), Some(0x1234)); } @@ -956,9 +969,9 @@ mod tests { #[test] fn test_cid_to_gid_map_from_stream() { // Test loading CIDToGIDMap from a stream - // The stream data: [0x00, 0x00, 0x00, 0x01, 0x00, 0x02] - // Maps: CID 0 -> GID 0, CID 1 -> GID 1, CID 2 -> GID 2 - let stream_data = vec![0x00u8, 0x00, 0x00, 0x01, 0x00, 0x02]; + // The stream data: [0x00, 0x00, 0x00, 0x05, 0x00, 0x0A] + // Maps: CID 0 -> GID 0, CID 1 -> GID 5, CID 2 -> GID 10 + let stream_data = vec![0x00u8, 0x00, 0x00, 0x05, 0x00, 0x0A]; // Create a MemorySource with the stream data at offset 0 let mut full_data = vec![0u8; 1000]; // Reserve space for PDF-like structure @@ -996,19 +1009,97 @@ mod tests { // The load should succeed (even if FontDescriptor is missing, we handle it gracefully) assert!(result.is_ok()); let font = result.unwrap(); - // The CIDToGIDMap should be loaded (as Custom since stream decode succeeds) + // The CIDToGIDMap should be loaded (as Array since stream decode succeeds) assert!(font.descendant.cid_to_gid_map.is_some()); - // Verify the custom 
map works - if let Some(CIDToGIDMap::Custom(data)) = font.descendant.cid_to_gid_map { - assert_eq!(data.len(), 6); - // Verify the values are correct - assert_eq!(u16::from_be_bytes([data[0], data[1]]), 0); - assert_eq!(u16::from_be_bytes([data[2], data[3]]), 1); - assert_eq!(u16::from_be_bytes([data[4], data[5]]), 2); + // Verify the array map works + if let Some(CIDToGIDMap::Array(arr)) = &font.descendant.cid_to_gid_map { + assert_eq!(arr.len(), 3); + // Verify the values are correct: [0, 5, 10] + assert_eq!(arr[0], 0); + assert_eq!(arr[1], 5); + assert_eq!(arr[2], 10); + + // Verify the get method works + assert_eq!(font.descendant.get_gid(0), Some(0)); + assert_eq!(font.descendant.get_gid(1), Some(5)); + assert_eq!(font.descendant.get_gid(2), Some(10)); } } + #[test] + fn test_cid_to_gid_map_truncated() { + // Test loading CIDToGIDMap from a stream with odd byte count + // The stream data: [0x00, 0x00, 0x00, 0x05, 0x00] (5 bytes - odd!) + // Should emit CIDTOGIDMAP_TRUNCATED and truncate to 2 GIDs: [0, 5] + let stream_data = vec![0x00u8, 0x00, 0x00, 0x05, 0x00]; + + // Create a MemorySource with the stream data at offset 0 + let mut full_data = vec![0u8; 1000]; + full_data[0..stream_data.len()].copy_from_slice(&stream_data); + let source = MemorySource::new(full_data); + + let mut cidfont_dict = PdfDict::new(); + cidfont_dict.insert(intern("/Subtype"), PdfObject::Name(intern("/CIDFontType2"))); + cidfont_dict.insert(intern("/BaseFont"), PdfObject::Name(intern("CIDFont"))); + cidfont_dict.insert( + intern("/CIDToGIDMap"), + PdfObject::Stream(Box::new(crate::parser::object::types::PdfStream { + dict: PdfDict::new(), + offset: 0, + len_hint: Some(5), + })), + ); + + // Wrap in a Type0 font dict + let mut font_dict = PdfDict::new(); + font_dict.insert(intern("/Subtype"), PdfObject::Name(intern("/Type0"))); + font_dict.insert(intern("/BaseFont"), PdfObject::Name(intern("Type0Font"))); + font_dict.insert( + intern("/DescendantFonts"), + 
PdfObject::Array(Box::new(vec![PdfObject::Dict(Box::new( + cidfont_dict, + ))])), + ); + + let opts = ExtractionOptions::default(); + let mut counter = 0; + + let result = Type0Font::load(&font_dict, &source, &opts, &mut counter); + + // The load should succeed + assert!(result.is_ok()); + let font = result.unwrap(); + + // Check that the CIDTOGIDMAP_TRUNCATED diagnostic was emitted + let diagnostics = font.diagnostics(); + assert!(diagnostics.iter().any(|d| d.code == DiagCode::FontCidtogidmapTruncated)); + + // Verify the array has 2 elements (5 bytes / 2 = 2 GIDs, trailing byte discarded) + if let Some(CIDToGIDMap::Array(arr)) = &font.descendant.cid_to_gid_map { + assert_eq!(arr.len(), 2); + assert_eq!(arr[0], 0); + assert_eq!(arr[1], 5); + } + } + + #[test] + fn test_cid_to_gid_map_out_of_range() { + // Test that out-of-range CID returns GID 0 (notdef), not None + let arr = vec![0u16, 5, 10].into_boxed_slice(); + let map = CIDToGIDMap::Array(arr); + + // Valid CIDs + assert_eq!(map.get(0), Some(0)); + assert_eq!(map.get(1), Some(5)); + assert_eq!(map.get(2), Some(10)); + + // Out of range CID should return GID 0 (notdef), not None + assert_eq!(map.get(3), Some(0)); + assert_eq!(map.get(100), Some(0)); + assert_eq!(map.get(65535), Some(0)); + } + #[test] fn test_parse_w_array_high_cid_values() { // Test that high CID values (e.g., 50000+) work correctly diff --git a/notes/pdftract-5sh.md b/notes/pdftract-5sh.md new file mode 100644 index 0000000..57583f5 --- /dev/null +++ b/notes/pdftract-5sh.md @@ -0,0 +1,64 @@ +# pdftract-5sh: CIDToGIDMap resolver (Identity and stream forms) + +## Summary + +Implemented the CIDToGIDMap resolver for CIDFontType2 descendant fonts with: +- `/Identity` name detection (zero-allocation short-circuit) +- Stream form parsing into `Box<[u16]>` array (2-byte big-endian GID values) +- `CIDTOGIDMAP_TRUNCATED` diagnostic for odd-byte-count input +- Out-of-range CID returns GID 0 (notdef glyph) + +## Changes Made + +### 1. 
Added new diagnostic code (`diagnostics.rs`) + +- `DiagCode::FontCidtogidmapTruncated` - emitted when CIDToGIDMap stream has odd byte count +- Added to category, name, severity (Warning), and catalog entries + +### 2. Updated `CIDToGIDMap` enum (`type0.rs`) + +Changed from `Custom(Vec<u8>)` to `Array(Box<[u16]>)`: +- Pre-parsed u16 array instead of raw bytes +- Decoded once at load time into a single heap allocation; lookups no longer re-assemble bytes +- `get()` method now uses `arr.get(cid as usize).copied().or(Some(0))` + +### 3. Updated `load_cid_to_gid_map()` function + +- Now parses decoded bytes into `Box<[u16]>` array +- Emits `FONT_CIDTOGIDMAP_TRUNCATED` diagnostic (`DiagCode::FontCidtogidmapTruncated`) on odd-length input +- Discards the trailing byte instead of failing +- Takes `diagnostics: &mut Vec<Diagnostic>` parameter + +### 4. Updated tests + +- `test_cid_to_gid_map_array` - tests Array variant with [0, 1, 2, 3] +- `test_cid_to_gid_map_array_big_endian` - tests lookup of a pre-parsed value (CID 5 -> GID 0x1234); big-endian decoding itself is covered by `test_cid_to_gid_map_from_stream` +- `test_cid_to_gid_map_out_of_range` - tests GID 0 return for out-of-range CID +- `test_cid_to_gid_map_from_stream` - tests stream loading with [0, 5, 10] per acceptance criteria +- `test_cid_to_gid_map_truncated` - tests odd-byte-count diagnostic emission + +## Acceptance Criteria - PASS + +- [PASS] Identity form: lookup of any CID returns same value as u16 +- [PASS] Stream form: synthetic 3-CID array decodes correctly [0, 5, 10] +- [PASS] Out-of-range CID returns GID 0 with no panic +- [PASS] Diagnostic `CIDTOGIDMAP_TRUNCATED` (code name `FONT_CIDTOGIDMAP_TRUNCATED`) emitted on odd-byte-count input + +## Test Results + +``` +test font::type0::tests::test_cid_to_gid_map_array ... ok +test font::type0::tests::test_cid_to_gid_map_array_big_endian ... ok +test font::type0::tests::test_cid_to_gid_map_identity ... ok +test font::type0::tests::test_cid_to_gid_map_out_of_range ... ok +test font::type0::tests::test_cid_to_gid_map_truncated ... ok +test font::type0::tests::test_cid_to_gid_map_from_stream ... ok +test result: ok. 6 passed; 0 failed; 0 ignored +``` + +All 25 type0 tests pass. 
+ +## Files Modified + +- `crates/pdftract-core/src/diagnostics.rs` - added FontCidtogidmapTruncated diagnostic +- `crates/pdftract-core/src/font/type0.rs` - updated CIDToGIDMap enum and implementation