feat(pdftract-19oy): codespace range parser + multi-byte tokenizer

Implemented codespace range parsing from begincodespacerange/endcodespacerange
blocks and multi-byte CJK tokenizer with widest-first matching per ISO 32000-1
9.10.3.1.

Changes:
- codespace.rs: Added pending_count handling for count-before-keyword syntax
- codespace.rs: Improved error recovery (skip invalid ranges, continue parsing)
- tokenize.rs: Added cfg guards for cjk feature diagnostic emission
- mod.rs: Added tokenize module exports

All acceptance criteria PASS:
- Input bytes 41 82 A0 42 with codespace [<00>-<7F>, <8140>-<FEFE>] tokenize to [0x41, 0x82A0, 0x42]
- Input bytes 41 82 A0 42 with codespace [<00>-<7F>, <8000>-<FFFF>] tokenize to [0x41, 0x82A0, 0x42]
- Widest-first matching for overlapping ranges
- Unrecognized bytes emit U+FFFD + diagnostic
- 1-byte-only codespace handles ASCII correctly

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
jedarden 2026-05-28 12:26:01 -04:00
parent 96b548ea18
commit 19c6328542
3 changed files with 76 additions and 20 deletions

View file

@ -108,8 +108,8 @@ impl fmt::Display for CodespaceRange {
/// Collection of codespace ranges from a CMap.
///
/// Most CMaps define 1-8 ranges. Predefined CMaps typically define:
/// - 1-byte ASCII range: <00> <7F>
/// - 2-byte CJK range: <8000> <FFFF> (or similar)
/// - 1-byte ASCII range: \`<00> <7F>\`
/// - 2-byte CJK range: \`<8000> <FFFF>\` (or similar)
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CodespaceRanges {
/// The ranges in this CMap.
@ -180,7 +180,12 @@ pub enum CodespaceError {
/// Invalid hex string format.
InvalidHexString(String),
/// Width mismatch between lo and hi bounds.
WidthMismatch { lo_width: usize, hi_width: usize },
WidthMismatch {
/// Width of the lo bound.
lo_width: usize,
/// Width of the hi bound.
hi_width: usize,
},
/// Invalid width (not 1, 2, 3, or 4).
InvalidWidth(usize),
/// Unexpected token in codespace block.
@ -209,6 +214,7 @@ impl std::error::Error for CodespaceError {}
pub struct CodespaceParser<'a> {
/// Byte slice handed to the parser at construction.
input: &'a [u8],
/// Starts at 0; its current value is passed along with diagnostics raised
/// during parsing (presumably the read cursor into `input` — confirm
/// against `next_token`).
position: usize,
/// Most recent integer token, held in case it is the range count written
/// before `begincodespacerange`; cleared when a keyword or unexpected
/// token is processed.
pending_count: Option<i64>,
/// Diagnostics collected while parsing.
diagnostics: Vec<crate::diagnostics::Diagnostic>,
}
@ -218,6 +224,7 @@ impl<'a> CodespaceParser<'a> {
Self {
input,
position: 0,
pending_count: None,
diagnostics: Vec::new(),
}
}
@ -231,6 +238,10 @@ impl<'a> CodespaceParser<'a> {
while let Some(token) = self.next_token() {
match token {
Token::Eof => break,
Token::Integer(n) => {
// Store integer - may be a count before begincodespacerange
self.pending_count = Some(n);
}
Token::Keyword(ref kw) => {
match kw.as_slice() {
b"begincodespacerange" => {
@ -239,6 +250,8 @@ impl<'a> CodespaceParser<'a> {
// Recovery: skip to endcodespacerange
self.skip_to_keyword(b"endcodespacerange");
}
// Clear pending count in case it wasn't used
self.pending_count = None;
}
b"endcodespacerange" => {
// Unexpected - should have been consumed by parse_codespace_block
@ -247,14 +260,17 @@ impl<'a> CodespaceParser<'a> {
self.position as u64,
"Unbalanced codespace block: endcodespacerange without begincodespacerange".to_string(),
));
self.pending_count = None;
}
_ => {
// Unknown keyword - skip (may be other CMap blocks)
// Unknown keyword - clear pending count (not for us)
self.pending_count = None;
}
}
}
_ => {
// Unexpected token - skip
// Unexpected token - clear pending count
self.pending_count = None;
}
}
}
@ -264,33 +280,61 @@ impl<'a> CodespaceParser<'a> {
/// Parse a begincodespacerange...endcodespacerange block.
fn parse_codespace_block(&mut self, ranges: &mut CodespaceRanges) -> Result<(), CodespaceError> {
// Read count
let count = self.expect_integer()?;
if count < 0 {
return Err(CodespaceError::UnexpectedToken(
"negative codespace range count".to_string(),
));
}
let count = count as usize;
// Read count - may be pending (from before keyword) or after keyword
let count = match self.pending_count.take() {
Some(n) => {
if n < 0 {
return Err(CodespaceError::UnexpectedToken(
"negative codespace range count".to_string(),
));
}
n as usize
}
None => {
let n = self.expect_integer()?;
if n < 0 {
return Err(CodespaceError::UnexpectedToken(
"negative codespace range count".to_string(),
));
}
n as usize
}
};
// Read count pairs of <lo> <hi>
for _ in 0..count {
let lo = self.expect_hex_string()?;
let hi = self.expect_hex_string()?;
let lo = match self.expect_hex_string() {
Ok(s) => s,
Err(_) => {
// Failed to read lo - skip to endcodespacerange
emit!(self.diagnostics, CmapInvalidCodespace);
self.skip_to_keyword(b"endcodespacerange");
break;
}
};
let hi = match self.expect_hex_string() {
Ok(s) => s,
Err(_) => {
// Failed to read hi - skip to endcodespacerange
emit!(self.diagnostics, CmapInvalidCodespace);
self.skip_to_keyword(b"endcodespacerange");
break;
}
};
// Validate width
if lo.len() != hi.len() {
emit!(self.diagnostics, CmapInvalidCodespace);
return Err(CodespaceError::WidthMismatch {
lo_width: lo.len(),
hi_width: hi.len(),
});
// Skip this invalid range and continue to the next
continue;
}
let width = lo.len();
if width < 1 || width > 4 {
emit!(self.diagnostics, CmapInvalidCodespace);
return Err(CodespaceError::InvalidWidth(width));
// Skip this invalid range and continue to the next
continue;
}
// Create range with 4-byte arrays

View file

@ -5,4 +5,10 @@
pub mod codespace;
#[cfg(feature = "cjk")]
pub mod tokenize;
pub use codespace::{CodespaceRange, CodespaceRanges, parse_codespace_ranges, parse_codespace_ranges_with_diags};
#[cfg(feature = "cjk")]
pub use tokenize::tokenize_cjk_bytes;

View file

@ -154,9 +154,12 @@ pub fn tokenize_cjk_bytes(
} else {
// Emit U+FFFD and diagnostic once per unique byte value
codes.push(0xFFFD);
#[cfg(feature = "cjk")]
if emitted_unknown.insert(b) {
emit!(diagnostics, CjkTokenizeUnknownByte, offset = cursor as u64);
}
#[cfg(not(feature = "cjk"))]
let _ = emitted_unknown.insert(b);
}
cursor += 1;
@ -214,6 +217,7 @@ mod tests {
}
#[test]
#[cfg(feature = "cjk")]
fn test_unrecognized_byte_emits_replacement_and_diagnostic() {
// Acceptance criterion: Unrecognized byte (no matching range): emit U+FFFD code + CJK_TOKENIZE_UNKNOWN_BYTE diagnostic once
let mut codespace = CodespaceRanges::new();
@ -230,6 +234,7 @@ mod tests {
}
#[test]
#[cfg(feature = "cjk")]
fn test_unrecognized_byte_diagnostic_emitted_once_per_unique_byte() {
// Multiple occurrences of the same unrecognized byte should emit only one diagnostic
let mut codespace = CodespaceRanges::new();
@ -324,6 +329,7 @@ mod tests {
}
#[test]
#[cfg(feature = "cjk")]
fn test_partial_match_at_end_of_input() {
// If we're at the end of input and don't have enough bytes for a multi-byte sequence,
// we should fall through to unrecognized byte handling