feat(pdftract-19oy): codespace range parser + multi-byte tokenizer

Implemented codespace range parsing from begincodespacerange/endcodespacerange blocks and multi-byte CJK tokenizer with widest-first matching per ISO 32000-1 9.10.3.1.

Changes:
- codespace.rs: Added pending_count handling for count-before-keyword syntax
- codespace.rs: Improved error recovery (skip invalid ranges, continue parsing)
- tokenize.rs: Added cfg guards for cjk feature diagnostic emission
- mod.rs: Added tokenize module exports

All acceptance criteria PASS:
- [<00>-<7F>, <8140>-<FEFE>] tokenizes to [0x41, 0x82A0, 0x42]
- [<00>-<7F>, <8000>-<FFFF>] tokenizes to [0x41, 0x82A0, 0x42]
- Widest-first matching for overlapping ranges
- Unrecognized bytes emit U+FFFD + diagnostic
- 1-byte-only codespace handles ASCII correctly

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 12:26:01 -04:00 · 2026-05-28 12:26:01 -04:00 · 19c6328542
commit 19c6328542
parent 96b548ea18
3 changed files with 76 additions and 20 deletions
--- a/crates/pdftract-core/src/cmap/codespace.rs
+++ b/crates/pdftract-core/src/cmap/codespace.rs
@ -108,8 +108,8 @@ impl fmt::Display for CodespaceRange {
 /// Collection of codespace ranges from a CMap.
 ///
 /// Most CMaps define 1-8 ranges. Predefined CMaps typically define:
-/// - 1-byte ASCII range: <00> <7F>
-/// - 2-byte CJK range: <8000> <FFFF> (or similar)
+/// - 1-byte ASCII range: `<00> <7F>`
+/// - 2-byte CJK range: `<8000> <FFFF>` (or similar)
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct CodespaceRanges {
    /// The ranges in this CMap.
@ -180,7 +180,12 @@ pub enum CodespaceError {
    /// Invalid hex string format.
    InvalidHexString(String),
    /// Width mismatch between lo and hi bounds.
-    WidthMismatch { lo_width: usize, hi_width: usize },
+    WidthMismatch {
+        /// Width of the lo bound.
+        lo_width: usize,
+        /// Width of the hi bound.
+        hi_width: usize,
+    },
    /// Invalid width (not 1, 2, 3, or 4).
    InvalidWidth(usize),
    /// Unexpected token in codespace block.
@ -209,6 +214,7 @@ impl std::error::Error for CodespaceError {}
 pub struct CodespaceParser<'a> {
    input: &'a [u8],
    position: usize,
+    pending_count: Option<i64>,
    diagnostics: Vec<crate::diagnostics::Diagnostic>,
 }

@ -218,6 +224,7 @@ impl<'a> CodespaceParser<'a> {
        Self {
            input,
            position: 0,
+            pending_count: None,
            diagnostics: Vec::new(),
        }
    }
@ -231,6 +238,10 @@ impl<'a> CodespaceParser<'a> {
        while let Some(token) = self.next_token() {
            match token {
                Token::Eof => break,
+                Token::Integer(n) => {
+                    // Store integer - may be a count before begincodespacerange
+                    self.pending_count = Some(n);
+                }
                Token::Keyword(ref kw) => {
                    match kw.as_slice() {
                        b"begincodespacerange" => {
@ -239,6 +250,8 @@ impl<'a> CodespaceParser<'a> {
                                // Recovery: skip to endcodespacerange
                                self.skip_to_keyword(b"endcodespacerange");
                            }
+                            // Clear pending count in case it wasn't used
+                            self.pending_count = None;
                        }
                        b"endcodespacerange" => {
                            // Unexpected - should have been consumed by parse_codespace_block
@ -247,14 +260,17 @@ impl<'a> CodespaceParser<'a> {
                                self.position as u64,
                                "Unbalanced codespace block: endcodespacerange without begincodespacerange".to_string(),
                            ));
+                            self.pending_count = None;
                        }
                        _ => {
-                            // Unknown keyword - skip (may be other CMap blocks)
+                            // Unknown keyword - clear pending count (not for us)
+                            self.pending_count = None;
                        }
                    }
                }
                _ => {
-                    // Unexpected token - skip
+                    // Unexpected token - clear pending count
+                    self.pending_count = None;
                }
            }
        }
@ -264,33 +280,61 @@ impl<'a> CodespaceParser<'a> {

    /// Parse a begincodespacerange...endcodespacerange block.
    fn parse_codespace_block(&mut self, ranges: &mut CodespaceRanges) -> Result<(), CodespaceError> {
-        // Read count
-        let count = self.expect_integer()?;
-        if count < 0 {
-            return Err(CodespaceError::UnexpectedToken(
-                "negative codespace range count".to_string(),
-            ));
-        }
-        let count = count as usize;
+        // Read count - may be pending (from before keyword) or after keyword
+        let count = match self.pending_count.take() {
+            Some(n) => {
+                if n < 0 {
+                    return Err(CodespaceError::UnexpectedToken(
+                        "negative codespace range count".to_string(),
+                    ));
+                }
+                n as usize
+            }
+            None => {
+                let n = self.expect_integer()?;
+                if n < 0 {
+                    return Err(CodespaceError::UnexpectedToken(
+                        "negative codespace range count".to_string(),
+                    ));
+                }
+                n as usize
+            }
+        };

        // Read count pairs of <lo> <hi>
        for _ in 0..count {
-            let lo = self.expect_hex_string()?;
-            let hi = self.expect_hex_string()?;
+            let lo = match self.expect_hex_string() {
+                Ok(s) => s,
+                Err(_) => {
+                    // Failed to read lo - skip to endcodespacerange
+                    emit!(self.diagnostics, CmapInvalidCodespace);
+                    self.skip_to_keyword(b"endcodespacerange");
+                    break;
+                }
+            };
+
+            let hi = match self.expect_hex_string() {
+                Ok(s) => s,
+                Err(_) => {
+                    // Failed to read hi - skip to endcodespacerange
+                    emit!(self.diagnostics, CmapInvalidCodespace);
+                    self.skip_to_keyword(b"endcodespacerange");
+                    break;
+                }
+            };

            // Validate width
            if lo.len() != hi.len() {
                emit!(self.diagnostics, CmapInvalidCodespace);
-                return Err(CodespaceError::WidthMismatch {
-                    lo_width: lo.len(),
-                    hi_width: hi.len(),
-                });
+                // Skip this invalid range and continue to the next
+                continue;
            }

            let width = lo.len();
            if width < 1 || width > 4 {
                emit!(self.diagnostics, CmapInvalidCodespace);
-                return Err(CodespaceError::InvalidWidth(width));
+                // Skip this invalid range and continue to the next
+                continue;
            }

            // Create range with 4-byte arrays
--- a/crates/pdftract-core/src/cmap/mod.rs
+++ b/crates/pdftract-core/src/cmap/mod.rs
@ -5,4 +5,10 @@

 pub mod codespace;

+#[cfg(feature = "cjk")]
+pub mod tokenize;
+
 pub use codespace::{CodespaceRange, CodespaceRanges, parse_codespace_ranges, parse_codespace_ranges_with_diags};
+
+#[cfg(feature = "cjk")]
+pub use tokenize::tokenize_cjk_bytes;
--- a/crates/pdftract-core/src/cmap/tokenize.rs
+++ b/crates/pdftract-core/src/cmap/tokenize.rs
@ -154,9 +154,12 @@ pub fn tokenize_cjk_bytes(
            } else {
                // Emit U+FFFD and diagnostic once per unique byte value
                codes.push(0xFFFD);
+                #[cfg(feature = "cjk")]
                if emitted_unknown.insert(b) {
                    emit!(diagnostics, CjkTokenizeUnknownByte, offset = cursor as u64);
                }
+                #[cfg(not(feature = "cjk"))]
+                let _ = emitted_unknown.insert(b);
            }

            cursor += 1;
@ -214,6 +217,7 @@ mod tests {
    }

    #[test]
+    #[cfg(feature = "cjk")]
    fn test_unrecognized_byte_emits_replacement_and_diagnostic() {
        // Acceptance criterion: Unrecognized byte (no matching range): emit U+FFFD code + CJK_TOKENIZE_UNKNOWN_BYTE diagnostic once
        let mut codespace = CodespaceRanges::new();
@ -230,6 +234,7 @@ mod tests {
    }

    #[test]
+    #[cfg(feature = "cjk")]
    fn test_unrecognized_byte_diagnostic_emitted_once_per_unique_byte() {
        // Multiple occurrences of the same unrecognized byte should emit only one diagnostic
        let mut codespace = CodespaceRanges::new();
@ -324,6 +329,7 @@ mod tests {
    }

    #[test]
+    #[cfg(feature = "cjk")]
    fn test_partial_match_at_end_of_input() {
        // If we're at the end of input and don't have enough bytes for a multi-byte sequence,
        // we should fall through to unrecognized byte handling