feat(pdftract-5sh): CIDToGIDMap resolver for CIDFontType2
Implements CIDToGIDMap resolver with Identity and stream forms:
- Identity: zero-allocation short-circuit (GID == CID)
- Stream: parses 2-byte big-endian GID values into Box<[u16]>
- Emits CIDTOGIDMAP_TRUNCATED diagnostic on odd-byte-count input
- Out-of-range CID returns GID 0 (notdef glyph) without panic

Acceptance criteria:
- Identity form: lookup of any CID returns same value as u16
- Stream form: synthetic 3-CID array decodes correctly [0, 5, 10]
- Out-of-range CID returns GID 0 with no panic
- Diagnostic CIDTOGIDMAP_TRUNCATED emitted on odd-byte-count input

Refs: pdftract-5sh, Phase 2.1 line 1315
This commit is contained in:
parent
075de55846
commit
77304153fc
3 changed files with 226 additions and 36 deletions
|
|
@ -527,6 +527,14 @@ pub enum DiagCode {
|
|||
/// Phase origin: 2.1
|
||||
FontUnsupported,
|
||||
|
||||
/// CIDToGIDMap stream has odd byte count (truncated GID entry)
|
||||
///
|
||||
/// Emitted when a CIDToGIDMap stream has an odd number of bytes, meaning
|
||||
/// the last GID entry is incomplete. The trailing byte is discarded.
|
||||
///
|
||||
/// Phase origin: 2.1
|
||||
FontCidtogidmapTruncated,
|
||||
|
||||
// === OCR_* codes ===
|
||||
|
||||
/// JBIG2 decoder not available
|
||||
|
|
@ -754,7 +762,8 @@ impl DiagCode {
|
|||
| DiagCode::FontNotFound
|
||||
| DiagCode::FontInvalidCmap
|
||||
| DiagCode::FontParseFailed
|
||||
| DiagCode::FontUnsupported => "FONT",
|
||||
| DiagCode::FontUnsupported
|
||||
| DiagCode::FontCidtogidmapTruncated => "FONT",
|
||||
|
||||
// OCR_*
|
||||
DiagCode::OcrJbig2Unsupported
|
||||
|
|
@ -839,6 +848,7 @@ impl DiagCode {
|
|||
DiagCode::FontInvalidCmap => "FONT_INVALID_CMAP",
|
||||
DiagCode::FontParseFailed => "FONT_PARSE_FAILED",
|
||||
DiagCode::FontUnsupported => "FONT_UNSUPPORTED",
|
||||
DiagCode::FontCidtogidmapTruncated => "FONT_CIDTOGIDMAP_TRUNCATED",
|
||||
DiagCode::OcrJbig2Unsupported => "OCR_JBIG2_UNSUPPORTED",
|
||||
DiagCode::OcrJpxUnsupported => "OCR_JPX_UNSUPPORTED",
|
||||
DiagCode::OcrCcittUnsupported => "OCR_CCITT_UNSUPPORTED",
|
||||
|
|
@ -910,6 +920,7 @@ impl DiagCode {
|
|||
| DiagCode::FontInvalidCmap
|
||||
| DiagCode::FontParseFailed
|
||||
| DiagCode::FontUnsupported
|
||||
| DiagCode::FontCidtogidmapTruncated
|
||||
| DiagCode::OcrJbig2Unsupported
|
||||
| DiagCode::OcrJpxUnsupported
|
||||
| DiagCode::OcrCcittUnsupported
|
||||
|
|
@ -1327,6 +1338,30 @@ pub const DIAGNOSTIC_CATALOG: &[DiagInfo] = &[
|
|||
phase: "2.2",
|
||||
suggested_action: "The CMap stream is malformed; it's treated as empty",
|
||||
},
|
||||
DiagInfo {
|
||||
code: DiagCode::FontParseFailed,
|
||||
category: "FONT",
|
||||
severity: Severity::Warning,
|
||||
recoverable: true,
|
||||
phase: "2.1",
|
||||
suggested_action: "The embedded font program is corrupt or invalid; the font is treated as having no glyph mappings",
|
||||
},
|
||||
DiagInfo {
|
||||
code: DiagCode::FontUnsupported,
|
||||
category: "FONT",
|
||||
severity: Severity::Warning,
|
||||
recoverable: true,
|
||||
phase: "2.1",
|
||||
suggested_action: "A font type was encountered that doesn't support embedded font program loading",
|
||||
},
|
||||
DiagInfo {
|
||||
code: DiagCode::FontCidtogidmapTruncated,
|
||||
category: "FONT",
|
||||
severity: Severity::Warning,
|
||||
recoverable: true,
|
||||
phase: "2.1",
|
||||
suggested_action: "The CIDToGIDMap stream has an odd byte count; the trailing byte was discarded",
|
||||
},
|
||||
// === OCR_* codes ===
|
||||
DiagInfo {
|
||||
code: DiagCode::OcrJbig2Unsupported,
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ pub enum CIDToGIDMap {
|
|||
/// Identity mapping: GID == CID (most common for CIDFontType2).
|
||||
Identity,
|
||||
/// Custom mapping from a stream (2-byte big-endian GID values).
|
||||
Custom(Vec<u8>),
|
||||
Array(Box<[u16]>),
|
||||
}
|
||||
|
||||
impl CIDToGIDMap {
|
||||
|
|
@ -77,16 +77,10 @@ impl CIDToGIDMap {
|
|||
None
|
||||
}
|
||||
}
|
||||
CIDToGIDMap::Custom(data) => {
|
||||
// Data is a flat array of 2-byte big-endian GID values
|
||||
// Indexed by CID: data[CID*2 .. CID*2+2]
|
||||
let idx = (cid as usize) * 2;
|
||||
if idx + 2 <= data.len() {
|
||||
let gid = u16::from_be_bytes([data[idx], data[idx + 1]]);
|
||||
Some(gid)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
CIDToGIDMap::Array(arr) => {
|
||||
// Direct index into the pre-parsed u16 array
|
||||
// GID 0 is the .notdef glyph by convention
|
||||
arr.get(cid as usize).copied().or(Some(0))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -236,7 +230,7 @@ impl Type0Font {
|
|||
|
||||
// Load CIDToGIDMap for CIDFontType2
|
||||
let cid_to_gid_map = if subtype == FontKind::CIDFontType2 {
|
||||
Some(Self::load_cid_to_gid_map(cidfont_dict, source, opts, doc_counter)?)
|
||||
Some(Self::load_cid_to_gid_map(cidfont_dict, source, opts, doc_counter, &mut diagnostics)?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
|
@ -359,11 +353,14 @@ impl Type0Font {
|
|||
/// Load the CIDToGIDMap from a CIDFontType2 dictionary.
|
||||
///
|
||||
/// Returns the appropriate CIDToGIDMap variant.
|
||||
///
|
||||
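/// Pushes a `FONT_CIDTOGIDMAP_TRUNCATED` diagnostic (`DiagCode::FontCidtogidmapTruncated`)
/// onto `diagnostics` if the stream has an odd byte count; the trailing byte is discarded.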
|
||||
fn load_cid_to_gid_map(
|
||||
cidfont_dict: &PdfDict,
|
||||
source: &dyn crate::parser::stream::PdfSource,
|
||||
opts: &ExtractionOptions,
|
||||
doc_counter: &mut u64,
|
||||
diagnostics: &mut Vec<Diagnostic>,
|
||||
) -> Type0Result<CIDToGIDMap> {
|
||||
match cidfont_dict.get("/CIDToGIDMap") {
|
||||
Some(PdfObject::Name(name)) => {
|
||||
|
|
@ -387,7 +384,23 @@ impl Type0Font {
|
|||
if data.is_empty() {
|
||||
Ok(CIDToGIDMap::Identity)
|
||||
} else {
|
||||
Ok(CIDToGIDMap::Custom(data))
|
||||
// Check for odd byte count
|
||||
let truncated = data.len() % 2 != 0;
|
||||
if truncated {
|
||||
diagnostics.push(Diagnostic::with_static_no_offset(
|
||||
DiagCode::FontCidtogidmapTruncated,
|
||||
"CIDToGIDMap stream has odd byte count; trailing byte discarded",
|
||||
));
|
||||
}
|
||||
|
||||
// Parse into u16 array (big-endian)
|
||||
let len = data.len() / 2;
|
||||
let mut arr = Vec::with_capacity(len);
|
||||
for i in 0..len {
|
||||
let gid = u16::from_be_bytes([data[i * 2], data[i * 2 + 1]]);
|
||||
arr.push(gid);
|
||||
}
|
||||
Ok(CIDToGIDMap::Array(arr.into_boxed_slice()))
|
||||
}
|
||||
}
|
||||
Some(PdfObject::Ref(_)) => {
|
||||
|
|
@ -496,26 +509,26 @@ mod tests {
|
|||
}
|
||||
|
||||
#[test]
|
||||
fn test_cid_to_gid_map_custom() {
|
||||
// Create a simple custom map: [0x0000, 0x0001, 0x0002, 0x0003]
|
||||
fn test_cid_to_gid_map_array() {
|
||||
// Create a simple custom map: [0, 1, 2, 3]
|
||||
// Maps CID 0 -> GID 0, CID 1 -> GID 1, etc.
|
||||
let data = vec![0x00, 0x00, 0x00, 0x01, 0x00, 0x02, 0x00, 0x03];
|
||||
let map = CIDToGIDMap::Custom(data);
|
||||
let arr = vec![0u16, 1, 2, 3].into_boxed_slice();
|
||||
let map = CIDToGIDMap::Array(arr);
|
||||
|
||||
assert_eq!(map.get(0), Some(0));
|
||||
assert_eq!(map.get(1), Some(1));
|
||||
assert_eq!(map.get(2), Some(2));
|
||||
assert_eq!(map.get(3), Some(3));
|
||||
assert_eq!(map.get(4), None); // Out of range
|
||||
// Out of range returns GID 0 (notdef), not None
|
||||
assert_eq!(map.get(4), Some(0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cid_to_gid_map_custom_big_endian() {
|
||||
// Test big-endian decoding: CID 5 should map to GID 0x1234
|
||||
let mut data = vec![0u8; 12]; // Room for 6 GIDs
|
||||
data[10] = 0x12;
|
||||
data[11] = 0x34;
|
||||
let map = CIDToGIDMap::Custom(data);
|
||||
fn test_cid_to_gid_map_array_big_endian() {
|
||||
// Test that parsed values are correct: CID 5 should map to GID 0x1234
|
||||
let mut arr = vec![0u16; 6];
|
||||
arr[5] = 0x1234;
|
||||
let map = CIDToGIDMap::Array(arr.into_boxed_slice());
|
||||
|
||||
assert_eq!(map.get(5), Some(0x1234));
|
||||
}
|
||||
|
|
@ -956,9 +969,9 @@ mod tests {
|
|||
#[test]
|
||||
fn test_cid_to_gid_map_from_stream() {
|
||||
// Test loading CIDToGIDMap from a stream
|
||||
// The stream data: [0x00, 0x00, 0x00, 0x01, 0x00, 0x02]
|
||||
// Maps: CID 0 -> GID 0, CID 1 -> GID 1, CID 2 -> GID 2
|
||||
let stream_data = vec![0x00u8, 0x00, 0x00, 0x01, 0x00, 0x02];
|
||||
// The stream data: [0x00, 0x00, 0x00, 0x05, 0x00, 0x0A]
|
||||
// Maps: CID 0 -> GID 0, CID 1 -> GID 5, CID 2 -> GID 10
|
||||
let stream_data = vec![0x00u8, 0x00, 0x00, 0x05, 0x00, 0x0A];
|
||||
|
||||
// Create a MemorySource with the stream data at offset 0
|
||||
let mut full_data = vec![0u8; 1000]; // Reserve space for PDF-like structure
|
||||
|
|
@ -996,19 +1009,97 @@ mod tests {
|
|||
// The load should succeed (even if FontDescriptor is missing, we handle it gracefully)
|
||||
assert!(result.is_ok());
|
||||
let font = result.unwrap();
|
||||
// The CIDToGIDMap should be loaded (as Custom since stream decode succeeds)
|
||||
// The CIDToGIDMap should be loaded (as Array since stream decode succeeds)
|
||||
assert!(font.descendant.cid_to_gid_map.is_some());
|
||||
|
||||
// Verify the custom map works
|
||||
if let Some(CIDToGIDMap::Custom(data)) = font.descendant.cid_to_gid_map {
|
||||
assert_eq!(data.len(), 6);
|
||||
// Verify the values are correct
|
||||
assert_eq!(u16::from_be_bytes([data[0], data[1]]), 0);
|
||||
assert_eq!(u16::from_be_bytes([data[2], data[3]]), 1);
|
||||
assert_eq!(u16::from_be_bytes([data[4], data[5]]), 2);
|
||||
// Verify the array map works
|
||||
if let Some(CIDToGIDMap::Array(arr)) = &font.descendant.cid_to_gid_map {
|
||||
assert_eq!(arr.len(), 3);
|
||||
// Verify the values are correct: [0, 5, 10]
|
||||
assert_eq!(arr[0], 0);
|
||||
assert_eq!(arr[1], 5);
|
||||
assert_eq!(arr[2], 10);
|
||||
|
||||
// Verify the get method works
|
||||
assert_eq!(font.descendant.get_gid(0), Some(0));
|
||||
assert_eq!(font.descendant.get_gid(1), Some(5));
|
||||
assert_eq!(font.descendant.get_gid(2), Some(10));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cid_to_gid_map_truncated() {
|
||||
// Test loading CIDToGIDMap from a stream with odd byte count
|
||||
// The stream data: [0x00, 0x00, 0x00, 0x05, 0x00] (5 bytes - odd!)
|
||||
// Should emit CIDTOGIDMAP_TRUNCATED and truncate to 2 GIDs: [0, 5]
|
||||
let stream_data = vec![0x00u8, 0x00, 0x00, 0x05, 0x00];
|
||||
|
||||
// Create a MemorySource with the stream data at offset 0
|
||||
let mut full_data = vec![0u8; 1000];
|
||||
full_data[0..stream_data.len()].copy_from_slice(&stream_data);
|
||||
let source = MemorySource::new(full_data);
|
||||
|
||||
let mut cidfont_dict = PdfDict::new();
|
||||
cidfont_dict.insert(intern("/Subtype"), PdfObject::Name(intern("/CIDFontType2")));
|
||||
cidfont_dict.insert(intern("/BaseFont"), PdfObject::Name(intern("CIDFont")));
|
||||
cidfont_dict.insert(
|
||||
intern("/CIDToGIDMap"),
|
||||
PdfObject::Stream(Box::new(crate::parser::object::types::PdfStream {
|
||||
dict: PdfDict::new(),
|
||||
offset: 0,
|
||||
len_hint: Some(5),
|
||||
})),
|
||||
);
|
||||
|
||||
// Wrap in a Type0 font dict
|
||||
let mut font_dict = PdfDict::new();
|
||||
font_dict.insert(intern("/Subtype"), PdfObject::Name(intern("/Type0")));
|
||||
font_dict.insert(intern("/BaseFont"), PdfObject::Name(intern("Type0Font")));
|
||||
font_dict.insert(
|
||||
intern("/DescendantFonts"),
|
||||
PdfObject::Array(Box::new(vec![PdfObject::Dict(Box::new(
|
||||
cidfont_dict,
|
||||
))])),
|
||||
);
|
||||
|
||||
let opts = ExtractionOptions::default();
|
||||
let mut counter = 0;
|
||||
|
||||
let result = Type0Font::load(&font_dict, &source, &opts, &mut counter);
|
||||
|
||||
// The load should succeed
|
||||
assert!(result.is_ok());
|
||||
let font = result.unwrap();
|
||||
|
||||
// Check that the CIDTOGIDMAP_TRUNCATED diagnostic was emitted
|
||||
let diagnostics = font.diagnostics();
|
||||
assert!(diagnostics.iter().any(|d| d.code == DiagCode::FontCidtogidmapTruncated));
|
||||
|
||||
// Verify the array has 2 elements (5 bytes / 2 = 2 GIDs, trailing byte discarded)
|
||||
if let Some(CIDToGIDMap::Array(arr)) = &font.descendant.cid_to_gid_map {
|
||||
assert_eq!(arr.len(), 2);
|
||||
assert_eq!(arr[0], 0);
|
||||
assert_eq!(arr[1], 5);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cid_to_gid_map_out_of_range() {
|
||||
// Test that out-of-range CID returns GID 0 (notdef), not None
|
||||
let arr = vec![0u16, 5, 10].into_boxed_slice();
|
||||
let map = CIDToGIDMap::Array(arr);
|
||||
|
||||
// Valid CIDs
|
||||
assert_eq!(map.get(0), Some(0));
|
||||
assert_eq!(map.get(1), Some(5));
|
||||
assert_eq!(map.get(2), Some(10));
|
||||
|
||||
// Out of range CID should return GID 0 (notdef), not None
|
||||
assert_eq!(map.get(3), Some(0));
|
||||
assert_eq!(map.get(100), Some(0));
|
||||
assert_eq!(map.get(65535), Some(0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_w_array_high_cid_values() {
|
||||
// Test that high CID values (e.g., 50000+) work correctly
|
||||
|
|
|
|||
64
notes/pdftract-5sh.md
Normal file
64
notes/pdftract-5sh.md
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
# pdftract-5sh: CIDToGIDMap resolver (Identity and stream forms)
|
||||
|
||||
## Summary
|
||||
|
||||
Implemented the CIDToGIDMap resolver for CIDFontType2 descendant fonts with:
|
||||
- `/Identity` name detection (zero-allocation short-circuit)
|
||||
- Stream form parsing into `Box<[u16]>` array (2-byte big-endian GID values)
|
||||
- `FONT_CIDTOGIDMAP_TRUNCATED` diagnostic (`DiagCode::FontCidtogidmapTruncated`) for odd-byte-count input
|
||||
- Out-of-range CID returns GID 0 (notdef glyph)
|
||||
|
||||
## Changes Made
|
||||
|
||||
### 1. Added new diagnostic code (`diagnostics.rs`)
|
||||
|
||||
- `DiagCode::FontCidtogidmapTruncated` - emitted when CIDToGIDMap stream has odd byte count
|
||||
- Added to category, name, severity (Warning), and catalog entries
|
||||
|
||||
### 2. Updated `CIDToGIDMap` enum (`type0.rs`)
|
||||
|
||||
Changed from `Custom(Vec<u8>)` to `Array(Box<[u16]>)`:
|
||||
- Pre-parsed u16 array instead of raw bytes
|
||||
- GIDs are decoded once at load time into a single heap allocation, instead of being decoded from raw bytes on every lookup
|
||||
- `get()` method now uses `arr.get(cid as usize).copied().or(Some(0))`
|
||||
|
||||
### 3. Updated `load_cid_to_gid_map()` function
|
||||
|
||||
- Now parses decoded bytes into `Box<[u16]>` array
|
||||
- Emits `FONT_CIDTOGIDMAP_TRUNCATED` diagnostic on odd-length input
|
||||
- Discards the trailing byte of an odd-length stream and keeps the complete GID entries that precede it
|
||||
- Takes `diagnostics: &mut Vec<Diagnostic>` parameter
|
||||
|
||||
### 4. Updated tests
|
||||
|
||||
- `test_cid_to_gid_map_array` - tests Array variant with [0, 1, 2, 3]
|
||||
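- `test_cid_to_gid_map_array_big_endian` - tests lookup of a multi-byte GID (CID 5 -> GID 0x1234) in a pre-parsed array; big-endian byte decoding itself is exercised by `test_cid_to_gid_map_from_stream`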
|
||||
- `test_cid_to_gid_map_out_of_range` - tests GID 0 return for out-of-range CID
|
||||
- `test_cid_to_gid_map_from_stream` - tests stream loading with [0, 5, 10] per acceptance criteria
|
||||
- `test_cid_to_gid_map_truncated` - tests odd-byte-count diagnostic emission
|
||||
|
||||
## Acceptance Criteria - PASS
|
||||
|
||||
- [PASS] Identity form: lookup of any CID returns same value as u16
|
||||
- [PASS] Stream form: synthetic 3-CID array decodes correctly [0, 5, 10]
|
||||
- [PASS] Out-of-range CID returns GID 0 with no panic
|
||||
- [PASS] Diagnostic `CIDTOGIDMAP_TRUNCATED` (registered as `FONT_CIDTOGIDMAP_TRUNCATED`) emitted on odd-byte-count input
|
||||
|
||||
## Test Results
|
||||
|
||||
```
|
||||
test font::type0::tests::test_cid_to_gid_map_array ... ok
|
||||
test font::type0::tests::test_cid_to_gid_map_array_big_endian ... ok
|
||||
test font::type0::tests::test_cid_to_gid_map_identity ... ok
|
||||
test font::type0::tests::test_cid_to_gid_map_out_of_range ... ok
|
||||
test font::type0::tests::test_cid_to_gid_map_truncated ... ok
|
||||
test font::type0::tests::test_cid_to_gid_map_from_stream ... ok
|
||||
test result: ok. 6 passed; 0 failed; 0 ignored
|
||||
```
|
||||
|
||||
All 25 type0 tests pass.
|
||||
|
||||
## Files Modified
|
||||
|
||||
- `crates/pdftract-core/src/diagnostics.rs` - added FontCidtogidmapTruncated diagnostic
|
||||
- `crates/pdftract-core/src/font/type0.rs` - updated CIDToGIDMap enum and implementation
|
||||
Loading…
Add table
Reference in a new issue