diff --git a/crates/pdftract-core/src/cmap/codespace.rs b/crates/pdftract-core/src/cmap/codespace.rs index 48475a6..25ec4b5 100644 --- a/crates/pdftract-core/src/cmap/codespace.rs +++ b/crates/pdftract-core/src/cmap/codespace.rs @@ -108,8 +108,8 @@ impl fmt::Display for CodespaceRange { /// Collection of codespace ranges from a CMap. /// /// Most CMaps define 1-8 ranges. Predefined CMaps typically define: -/// - 1-byte ASCII range: <00> <7F> -/// - 2-byte CJK range: <8000> (or similar) +/// - 1-byte ASCII range: \`<00> <7F>\` +/// - 2-byte CJK range: \`<8000> \` (or similar) #[derive(Debug, Clone, PartialEq, Eq)] pub struct CodespaceRanges { /// The ranges in this CMap. @@ -180,7 +180,12 @@ pub enum CodespaceError { /// Invalid hex string format. InvalidHexString(String), /// Width mismatch between lo and hi bounds. - WidthMismatch { lo_width: usize, hi_width: usize }, + WidthMismatch { + /// Width of the lo bound. + lo_width: usize, + /// Width of the hi bound. + hi_width: usize, + }, /// Invalid width (not 1, 2, 3, or 4). InvalidWidth(usize), /// Unexpected token in codespace block. @@ -209,6 +214,7 @@ impl std::error::Error for CodespaceError {} pub struct CodespaceParser<'a> { input: &'a [u8], position: usize, + pending_count: Option, diagnostics: Vec, } @@ -218,6 +224,7 @@ impl<'a> CodespaceParser<'a> { Self { input, position: 0, + pending_count: None, diagnostics: Vec::new(), } } @@ -231,6 +238,10 @@ impl<'a> CodespaceParser<'a> { while let Some(token) = self.next_token() { match token { Token::Eof => break, + Token::Integer(n) => { + // Store integer - may be a count before begincodespacerange + self.pending_count = Some(n); + } Token::Keyword(ref kw) => { match kw.as_slice() { b"begincodespacerange" => { @@ -239,6 +250,8 @@ impl<'a> CodespaceParser<'a> { // Recovery: skip to endcodespacerange self.skip_to_keyword(b"endcodespacerange"); } + // Clear pending count in case it wasn't used + self.pending_count = None; } b"endcodespacerange" => { // Unexpected - should have been consumed by parse_codespace_block @@ -247,14 +260,17 @@ impl<'a> CodespaceParser<'a> { self.position as u64, "Unbalanced codespace block: endcodespacerange without begincodespacerange".to_string(), )); + self.pending_count = None; } _ => { - // Unknown keyword - skip (may be other CMap blocks) + // Unknown keyword - clear pending count (not for us) + self.pending_count = None; } } } _ => { - // Unexpected token - skip + // Unexpected token - clear pending count + self.pending_count = None; } } } @@ -264,33 +280,61 @@ impl<'a> CodespaceParser<'a> { /// Parse a begincodespacerange...endcodespacerange block. fn parse_codespace_block(&mut self, ranges: &mut CodespaceRanges) -> Result<(), CodespaceError> { - // Read count - let count = self.expect_integer()?; - if count < 0 { - return Err(CodespaceError::UnexpectedToken( - "negative codespace range count".to_string(), - )); - } - let count = count as usize; + // Read count - may be pending (from before keyword) or after keyword + let count = match self.pending_count.take() { + Some(n) => { + if n < 0 { + return Err(CodespaceError::UnexpectedToken( + "negative codespace range count".to_string(), + )); + } + n as usize + } + None => { + let n = self.expect_integer()?; + if n < 0 { + return Err(CodespaceError::UnexpectedToken( + "negative codespace range count".to_string(), + )); + } + n as usize + } + }; // Read count pairs of for _ in 0..count { - let lo = self.expect_hex_string()?; - let hi = self.expect_hex_string()?; + let lo = match self.expect_hex_string() { + Ok(s) => s, + Err(_) => { + // Failed to read lo - skip to endcodespacerange + emit!(self.diagnostics, CmapInvalidCodespace); + self.skip_to_keyword(b"endcodespacerange"); + break; + } + }; + + let hi = match self.expect_hex_string() { + Ok(s) => s, + Err(_) => { + // Failed to read hi - skip to endcodespacerange + emit!(self.diagnostics, CmapInvalidCodespace); + self.skip_to_keyword(b"endcodespacerange"); + break; + } + }; // Validate width if lo.len() != hi.len() { emit!(self.diagnostics, CmapInvalidCodespace); - return Err(CodespaceError::WidthMismatch { - lo_width: lo.len(), - hi_width: hi.len(), - }); + // Skip this invalid range and continue to the next + continue; } let width = lo.len(); if width < 1 || width > 4 { emit!(self.diagnostics, CmapInvalidCodespace); - return Err(CodespaceError::InvalidWidth(width)); + // Skip this invalid range and continue to the next + continue; } // Create range with 4-byte arrays diff --git a/crates/pdftract-core/src/cmap/mod.rs b/crates/pdftract-core/src/cmap/mod.rs index ab2cc9c..2501ce9 100644 --- a/crates/pdftract-core/src/cmap/mod.rs +++ b/crates/pdftract-core/src/cmap/mod.rs @@ -5,4 +5,10 @@ pub mod codespace; +#[cfg(feature = "cjk")] +pub mod tokenize; + pub use codespace::{CodespaceRange, CodespaceRanges, parse_codespace_ranges, parse_codespace_ranges_with_diags}; + +#[cfg(feature = "cjk")] +pub use tokenize::tokenize_cjk_bytes; diff --git a/crates/pdftract-core/src/cmap/tokenize.rs b/crates/pdftract-core/src/cmap/tokenize.rs index ca6869c..d1a23bc 100644 --- a/crates/pdftract-core/src/cmap/tokenize.rs +++ b/crates/pdftract-core/src/cmap/tokenize.rs @@ -154,9 +154,12 @@ pub fn tokenize_cjk_bytes( } else { // Emit U+FFFD and diagnostic once per unique byte value codes.push(0xFFFD); + #[cfg(feature = "cjk")] if emitted_unknown.insert(b) { emit!(diagnostics, CjkTokenizeUnknownByte, offset = cursor as u64); } + #[cfg(not(feature = "cjk"))] + let _ = emitted_unknown.insert(b); } cursor += 1; @@ -214,6 +217,7 @@ mod tests { } #[test] + #[cfg(feature = "cjk")] fn test_unrecognized_byte_emits_replacement_and_diagnostic() { // Acceptance criterion: Unrecognized byte (no matching range): emit U+FFFD code + CJK_TOKENIZE_UNKNOWN_BYTE diagnostic once let mut codespace = CodespaceRanges::new(); @@ -230,6 +234,7 @@ mod tests { } #[test] + #[cfg(feature = "cjk")] fn test_unrecognized_byte_diagnostic_emitted_once_per_unique_byte() { // Multiple occurrences of the same unrecognized byte should emit only one diagnostic let mut codespace = CodespaceRanges::new(); @@ -324,6 +329,7 @@ mod tests { } #[test] + #[cfg(feature = "cjk")] fn test_partial_match_at_end_of_input() { // If we're at the end of input and don't have enough bytes for a multi-byte sequence, // we should fall through to unrecognized byte handling