feat(pdftract-5sh): CIDToGIDMap resolver for CIDFontType2

Implements CIDToGIDMap resolver with Identity and stream forms:
- Identity: zero-allocation short-circuit (GID == CID)
- Stream: parses 2-byte big-endian GID values into Box<[u16]>
- Emits FONT_CIDTOGIDMAP_TRUNCATED diagnostic (DiagCode::FontCidtogidmapTruncated) on odd-byte-count input; the trailing byte is discarded
- Out-of-range CID returns GID 0 (notdef glyph) without panic

Acceptance criteria:
- Identity form: lookup of any CID returns same value as u16
- Stream form: synthetic 3-CID array decodes correctly [0, 5, 10]
- Out-of-range CID returns GID 0 with no panic
- Diagnostic FONT_CIDTOGIDMAP_TRUNCATED emitted on odd-byte-count input

Refs: pdftract-5sh, Phase 2.1 line 1315
This commit is contained in:
jedarden 2026-05-23 15:23:01 -04:00
parent 075de55846
commit 77304153fc
3 changed files with 226 additions and 36 deletions

View file

@ -527,6 +527,14 @@ pub enum DiagCode {
/// Phase origin: 2.1
FontUnsupported,
/// CIDToGIDMap stream has odd byte count (truncated GID entry)
///
/// Emitted when a CIDToGIDMap stream has an odd number of bytes, meaning
/// the last GID entry is incomplete. The trailing byte is discarded.
///
/// Phase origin: 2.1
FontCidtogidmapTruncated,
// === OCR_* codes ===
/// JBIG2 decoder not available
@ -754,7 +762,8 @@ impl DiagCode {
| DiagCode::FontNotFound
| DiagCode::FontInvalidCmap
| DiagCode::FontParseFailed
| DiagCode::FontUnsupported => "FONT",
| DiagCode::FontUnsupported
| DiagCode::FontCidtogidmapTruncated => "FONT",
// OCR_*
DiagCode::OcrJbig2Unsupported
@ -839,6 +848,7 @@ impl DiagCode {
DiagCode::FontInvalidCmap => "FONT_INVALID_CMAP",
DiagCode::FontParseFailed => "FONT_PARSE_FAILED",
DiagCode::FontUnsupported => "FONT_UNSUPPORTED",
DiagCode::FontCidtogidmapTruncated => "FONT_CIDTOGIDMAP_TRUNCATED",
DiagCode::OcrJbig2Unsupported => "OCR_JBIG2_UNSUPPORTED",
DiagCode::OcrJpxUnsupported => "OCR_JPX_UNSUPPORTED",
DiagCode::OcrCcittUnsupported => "OCR_CCITT_UNSUPPORTED",
@ -910,6 +920,7 @@ impl DiagCode {
| DiagCode::FontInvalidCmap
| DiagCode::FontParseFailed
| DiagCode::FontUnsupported
| DiagCode::FontCidtogidmapTruncated
| DiagCode::OcrJbig2Unsupported
| DiagCode::OcrJpxUnsupported
| DiagCode::OcrCcittUnsupported
@ -1327,6 +1338,30 @@ pub const DIAGNOSTIC_CATALOG: &[DiagInfo] = &[
phase: "2.2",
suggested_action: "The CMap stream is malformed; it's treated as empty",
},
DiagInfo {
code: DiagCode::FontParseFailed,
category: "FONT",
severity: Severity::Warning,
recoverable: true,
phase: "2.1",
suggested_action: "The embedded font program is corrupt or invalid; the font is treated as having no glyph mappings",
},
DiagInfo {
code: DiagCode::FontUnsupported,
category: "FONT",
severity: Severity::Warning,
recoverable: true,
phase: "2.1",
suggested_action: "A font type was encountered that doesn't support embedded font program loading",
},
DiagInfo {
code: DiagCode::FontCidtogidmapTruncated,
category: "FONT",
severity: Severity::Warning,
recoverable: true,
phase: "2.1",
suggested_action: "The CIDToGIDMap stream has an odd byte count; the trailing byte was discarded",
},
// === OCR_* codes ===
DiagInfo {
code: DiagCode::OcrJbig2Unsupported,

View file

@ -62,7 +62,7 @@ pub enum CIDToGIDMap {
/// Identity mapping: GID == CID (most common for CIDFontType2).
Identity,
/// Custom mapping from a stream (2-byte big-endian GID values).
Custom(Vec<u8>),
Array(Box<[u16]>),
}
impl CIDToGIDMap {
@ -77,16 +77,10 @@ impl CIDToGIDMap {
None
}
}
CIDToGIDMap::Custom(data) => {
// Data is a flat array of 2-byte big-endian GID values
// Indexed by CID: data[CID*2 .. CID*2+2]
let idx = (cid as usize) * 2;
if idx + 2 <= data.len() {
let gid = u16::from_be_bytes([data[idx], data[idx + 1]]);
Some(gid)
} else {
None
}
CIDToGIDMap::Array(arr) => {
// Direct index into the pre-parsed u16 array
// GID 0 is the .notdef glyph by convention
arr.get(cid as usize).copied().or(Some(0))
}
}
}
@ -236,7 +230,7 @@ impl Type0Font {
// Load CIDToGIDMap for CIDFontType2
let cid_to_gid_map = if subtype == FontKind::CIDFontType2 {
Some(Self::load_cid_to_gid_map(cidfont_dict, source, opts, doc_counter)?)
Some(Self::load_cid_to_gid_map(cidfont_dict, source, opts, doc_counter, &mut diagnostics)?)
} else {
None
};
@ -359,11 +353,14 @@ impl Type0Font {
/// Load the CIDToGIDMap from a CIDFontType2 dictionary.
///
/// Returns the appropriate CIDToGIDMap variant.
///
/// Emits a `FONT_CIDTOGIDMAP_TRUNCATED` diagnostic and discards the trailing byte if the stream has an odd byte count.
fn load_cid_to_gid_map(
cidfont_dict: &PdfDict,
source: &dyn crate::parser::stream::PdfSource,
opts: &ExtractionOptions,
doc_counter: &mut u64,
diagnostics: &mut Vec<Diagnostic>,
) -> Type0Result<CIDToGIDMap> {
match cidfont_dict.get("/CIDToGIDMap") {
Some(PdfObject::Name(name)) => {
@ -387,7 +384,23 @@ impl Type0Font {
if data.is_empty() {
Ok(CIDToGIDMap::Identity)
} else {
Ok(CIDToGIDMap::Custom(data))
// Check for odd byte count
let truncated = data.len() % 2 != 0;
if truncated {
diagnostics.push(Diagnostic::with_static_no_offset(
DiagCode::FontCidtogidmapTruncated,
"CIDToGIDMap stream has odd byte count; trailing byte discarded",
));
}
// Parse into u16 array (big-endian)
let len = data.len() / 2;
let mut arr = Vec::with_capacity(len);
for i in 0..len {
let gid = u16::from_be_bytes([data[i * 2], data[i * 2 + 1]]);
arr.push(gid);
}
Ok(CIDToGIDMap::Array(arr.into_boxed_slice()))
}
}
Some(PdfObject::Ref(_)) => {
@ -496,26 +509,26 @@ mod tests {
}
#[test]
fn test_cid_to_gid_map_custom() {
// Create a simple custom map: [0x0000, 0x0001, 0x0002, 0x0003]
fn test_cid_to_gid_map_array() {
// Create a simple custom map: [0, 1, 2, 3]
// Maps CID 0 -> GID 0, CID 1 -> GID 1, etc.
let data = vec![0x00, 0x00, 0x00, 0x01, 0x00, 0x02, 0x00, 0x03];
let map = CIDToGIDMap::Custom(data);
let arr = vec![0u16, 1, 2, 3].into_boxed_slice();
let map = CIDToGIDMap::Array(arr);
assert_eq!(map.get(0), Some(0));
assert_eq!(map.get(1), Some(1));
assert_eq!(map.get(2), Some(2));
assert_eq!(map.get(3), Some(3));
assert_eq!(map.get(4), None); // Out of range
// Out of range returns GID 0 (notdef), not None
assert_eq!(map.get(4), Some(0));
}
#[test]
fn test_cid_to_gid_map_custom_big_endian() {
// Test big-endian decoding: CID 5 should map to GID 0x1234
let mut data = vec![0u8; 12]; // Room for 6 GIDs
data[10] = 0x12;
data[11] = 0x34;
let map = CIDToGIDMap::Custom(data);
fn test_cid_to_gid_map_array_big_endian() {
// Test that parsed values are correct: CID 5 should map to GID 0x1234
let mut arr = vec![0u16; 6];
arr[5] = 0x1234;
let map = CIDToGIDMap::Array(arr.into_boxed_slice());
assert_eq!(map.get(5), Some(0x1234));
}
@ -956,9 +969,9 @@ mod tests {
#[test]
fn test_cid_to_gid_map_from_stream() {
// Test loading CIDToGIDMap from a stream
// The stream data: [0x00, 0x00, 0x00, 0x01, 0x00, 0x02]
// Maps: CID 0 -> GID 0, CID 1 -> GID 1, CID 2 -> GID 2
let stream_data = vec![0x00u8, 0x00, 0x00, 0x01, 0x00, 0x02];
// The stream data: [0x00, 0x00, 0x00, 0x05, 0x00, 0x0A]
// Maps: CID 0 -> GID 0, CID 1 -> GID 5, CID 2 -> GID 10
let stream_data = vec![0x00u8, 0x00, 0x00, 0x05, 0x00, 0x0A];
// Create a MemorySource with the stream data at offset 0
let mut full_data = vec![0u8; 1000]; // Reserve space for PDF-like structure
@ -996,19 +1009,97 @@ mod tests {
// The load should succeed (even if FontDescriptor is missing, we handle it gracefully)
assert!(result.is_ok());
let font = result.unwrap();
// The CIDToGIDMap should be loaded (as Custom since stream decode succeeds)
// The CIDToGIDMap should be loaded (as Array since stream decode succeeds)
assert!(font.descendant.cid_to_gid_map.is_some());
// Verify the custom map works
if let Some(CIDToGIDMap::Custom(data)) = font.descendant.cid_to_gid_map {
assert_eq!(data.len(), 6);
// Verify the values are correct
assert_eq!(u16::from_be_bytes([data[0], data[1]]), 0);
assert_eq!(u16::from_be_bytes([data[2], data[3]]), 1);
assert_eq!(u16::from_be_bytes([data[4], data[5]]), 2);
// Verify the array map works
if let Some(CIDToGIDMap::Array(arr)) = &font.descendant.cid_to_gid_map {
assert_eq!(arr.len(), 3);
// Verify the values are correct: [0, 5, 10]
assert_eq!(arr[0], 0);
assert_eq!(arr[1], 5);
assert_eq!(arr[2], 10);
// Verify the get method works
assert_eq!(font.descendant.get_gid(0), Some(0));
assert_eq!(font.descendant.get_gid(1), Some(5));
assert_eq!(font.descendant.get_gid(2), Some(10));
}
}
#[test]
fn test_cid_to_gid_map_truncated() {
    // A CIDToGIDMap stream with an odd byte count: two complete big-endian
    // GID entries (0x0000, 0x0005) followed by one dangling byte.
    // Loading must emit FONT_CIDTOGIDMAP_TRUNCATED and keep only [0, 5].
    let stream_data = vec![0x00u8, 0x00, 0x00, 0x05, 0x00];

    // Place the stream bytes at offset 0 of a larger in-memory source.
    let mut full_data = vec![0u8; 1000];
    full_data[0..stream_data.len()].copy_from_slice(&stream_data);
    let source = MemorySource::new(full_data);

    // Descendant CIDFontType2 dictionary whose /CIDToGIDMap is the 5-byte stream.
    let mut cidfont_dict = PdfDict::new();
    cidfont_dict.insert(intern("/Subtype"), PdfObject::Name(intern("/CIDFontType2")));
    cidfont_dict.insert(intern("/BaseFont"), PdfObject::Name(intern("CIDFont")));
    cidfont_dict.insert(
        intern("/CIDToGIDMap"),
        PdfObject::Stream(Box::new(crate::parser::object::types::PdfStream {
            dict: PdfDict::new(),
            offset: 0,
            len_hint: Some(5),
        })),
    );

    // Wrap the descendant in a Type0 font dictionary.
    let mut font_dict = PdfDict::new();
    font_dict.insert(intern("/Subtype"), PdfObject::Name(intern("/Type0")));
    font_dict.insert(intern("/BaseFont"), PdfObject::Name(intern("Type0Font")));
    font_dict.insert(
        intern("/DescendantFonts"),
        PdfObject::Array(Box::new(vec![PdfObject::Dict(Box::new(
            cidfont_dict,
        ))])),
    );

    let opts = ExtractionOptions::default();
    let mut counter = 0;

    // A truncated map is recoverable, so the load itself must succeed.
    let font = Type0Font::load(&font_dict, &source, &opts, &mut counter)
        .expect("loading a font with a truncated CIDToGIDMap should succeed");

    // The truncation must be reported through the font's diagnostics.
    let diagnostics = font.diagnostics();
    assert!(diagnostics.iter().any(|d| d.code == DiagCode::FontCidtogidmapTruncated));

    // 5 bytes / 2 = 2 complete GIDs; the trailing byte is discarded.
    // Fail loudly on any other variant (or a missing map) so the array
    // assertions can never be skipped silently.
    match &font.descendant.cid_to_gid_map {
        Some(CIDToGIDMap::Array(arr)) => {
            assert_eq!(arr.len(), 2);
            assert_eq!(arr[0], 0);
            assert_eq!(arr[1], 5);
        }
        _ => panic!("expected CIDToGIDMap::Array for a non-empty CIDToGIDMap stream"),
    }
}
#[test]
fn test_cid_to_gid_map_out_of_range() {
    // Three-entry table: CID 0 -> GID 0, CID 1 -> GID 5, CID 2 -> GID 10.
    let map = CIDToGIDMap::Array(vec![0u16, 5, 10].into_boxed_slice());

    // (cid, expected gid) pairs. The first three are in range; the rest lie
    // past the end of the table and must resolve to GID 0 (notdef), not None.
    let cases = [(0, 0), (1, 5), (2, 10), (3, 0), (100, 0), (65535, 0)];
    for &(cid, expected) in cases.iter() {
        assert_eq!(map.get(cid), Some(expected));
    }
}
#[test]
fn test_parse_w_array_high_cid_values() {
// Test that high CID values (e.g., 50000+) work correctly

64
notes/pdftract-5sh.md Normal file
View file

@ -0,0 +1,64 @@
# pdftract-5sh: CIDToGIDMap resolver (Identity and stream forms)
## Summary
Implemented the CIDToGIDMap resolver for CIDFontType2 descendant fonts with:
- `/Identity` name detection (zero-allocation short-circuit)
- Stream form parsing into `Box<[u16]>` array (2-byte big-endian GID values)
- `FONT_CIDTOGIDMAP_TRUNCATED` diagnostic (`DiagCode::FontCidtogidmapTruncated`) for odd-byte-count input
- Out-of-range CID returns GID 0 (notdef glyph)
## Changes Made
### 1. Added new diagnostic code (`diagnostics.rs`)
- `DiagCode::FontCidtogidmapTruncated` - emitted when CIDToGIDMap stream has odd byte count
- Added to category, name, severity (Warning), and catalog entries
### 2. Updated `CIDToGIDMap` enum (`type0.rs`)
Changed from `Custom(Vec<u8>)` to `Array(Box<[u16]>)`:
- Pre-parsed u16 array instead of raw bytes
- Single heap allocation made once at load time; each lookup is a plain slice index with no per-lookup byte decoding
- `get()` method now uses `arr.get(cid as usize).copied().or(Some(0))`
### 3. Updated `load_cid_to_gid_map()` function
- Now parses decoded bytes into `Box<[u16]>` array
- Emits `FONT_CIDTOGIDMAP_TRUNCATED` diagnostic on odd-length input
- Truncates trailing byte instead of failing
- Takes `diagnostics: &mut Vec<Diagnostic>` parameter
### 4. Updated tests
- `test_cid_to_gid_map_array` - tests Array variant with [0, 1, 2, 3]
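- `test_cid_to_gid_map_array_big_endian` - tests lookup of a two-byte GID value (0x1234) in a pre-parsed array; big-endian decoding of raw stream bytes is covered by `test_cid_to_gid_map_from_stream`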
- `test_cid_to_gid_map_out_of_range` - tests GID 0 return for out-of-range CID
- `test_cid_to_gid_map_from_stream` - tests stream loading with [0, 5, 10] per acceptance criteria
- `test_cid_to_gid_map_truncated` - tests odd-byte-count diagnostic emission
## Acceptance Criteria - PASS
- [PASS] Identity form: lookup of any CID returns same value as u16
- [PASS] Stream form: synthetic 3-CID array decodes correctly [0, 5, 10]
- [PASS] Out-of-range CID returns GID 0 with no panic
- [PASS] Diagnostic `FONT_CIDTOGIDMAP_TRUNCATED` emitted on odd-byte-count input
## Test Results
```
test font::type0::tests::test_cid_to_gid_map_array ... ok
test font::type0::tests::test_cid_to_gid_map_array_big_endian ... ok
test font::type0::tests::test_cid_to_gid_map_identity ... ok
test font::type0::tests::test_cid_to_gid_map_out_of_range ... ok
test font::type0::tests::test_cid_to_gid_map_truncated ... ok
test font::type0::tests::test_cid_to_gid_map_from_stream ... ok
test result: ok. 6 passed; 0 failed; 0 ignored
```
All 25 type0 tests pass.
## Files Modified
- `crates/pdftract-core/src/diagnostics.rs` - added FontCidtogidmapTruncated diagnostic
- `crates/pdftract-core/src/font/type0.rs` - updated CIDToGIDMap enum and implementation