pdftract/crates/pdftract-core/src/cmap/codespace.rs
jedarden 84981f7c9b
Some checks are pending
Schema Generation Validation / Validate JSON Schema (push) Waiting to run
Schema Generation Validation / Validate JSON Syntax (push) Waiting to run
fix(pdftract-25igv): fix emit! macro usage in codespace parser
The emit! macro expects diagnostic codes without the DiagCode:: prefix.
Changed three occurrences in codespace.rs:
- Line 281: DiagCode::CmapInvalidCodespace → CmapInvalidCodespace
- Line 290: DiagCode::CmapInvalidCodespace → CmapInvalidCodespace
- Line 412: DiagCode::CmapInvalidCodespace → CmapInvalidCodespace

This fixes compilation errors that prevented the codebase from building.

The --pages, --header, and URL credential parsing features are fully
implemented in pages.rs, header.rs, and url.rs modules with comprehensive
tests and integration in main.rs, grep/mod.rs, and hash.rs.

References: pdftract-25igv, notes/pdftract-25igv.md
2026-05-28 07:29:33 -04:00

854 lines
28 KiB
Rust

//! Codespace range parser for CMap streams.
//!
//! This module implements parsing of the `begincodespacerange` / `endcodespacerange`
//! PostScript blocks in CMap streams. Codespace ranges define the valid byte-width
//! boundaries for character codes in multi-byte encodings.
//!
//! # Syntax
//!
//! PostScript CMap codespace range syntax:
//! ```text
//! N begincodespacerange
//! <lo1> <hi1>
//! <lo2> <hi2>
//! ...
//! endcodespacerange
//! ```
//!
//! Each entry consists of two hex strings of equal byte width (1-4 bytes).
//!
//! # Example
//!
//! ```text
//! 2 begincodespacerange
//! <00> <7F>
//! <8000> <FFFF>
//! endcodespacerange
//! ```
//!
//! Defines two ranges:
//! - 1-byte range: 0x00..=0x7F
//! - 2-byte range: 0x8000..=0xFFFF
use std::fmt;
use crate::{emit, diagnostics::DiagCode};
/// A single codespace range.
///
/// Describes the valid character codes of one fixed byte width. Only the first
/// `width` bytes of `lo` and `hi` are meaningful; the remaining bytes are padding.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CodespaceRange {
    /// Low bound of the range (inclusive), stored in big-endian byte order.
    pub lo: [u8; 4],
    /// High bound of the range (inclusive), stored in big-endian byte order.
    pub hi: [u8; 4],
    /// Byte width of this range (1, 2, 3, or 4).
    pub width: u8,
}

impl CodespaceRange {
    /// Create a new codespace range.
    ///
    /// `lo` and `hi` are always four bytes long; `width` says how many leading
    /// bytes of each are significant.
    ///
    /// # Panics
    ///
    /// Panics if `width` is not 1, 2, 3, or 4.
    pub fn new(lo: [u8; 4], hi: [u8; 4], width: u8) -> Self {
        assert!((1..=4).contains(&width), "width must be 1-4");
        Self { lo, hi, width }
    }

    /// Check if a byte sequence falls within this codespace range.
    ///
    /// Returns `true` when `bytes` has exactly `width` bytes and every byte lies
    /// within the bounds given by the bytes of `lo` and `hi` at the same position.
    /// The comparison is made byte by byte, not on the combined numeric value:
    /// for `<8140> <FEFE>` the code `<823F>` is rejected because its second byte
    /// is below `0x40`.
    pub fn contains(&self, bytes: &[u8]) -> bool {
        bytes.len() == usize::from(self.width)
            && bytes
                .iter()
                .zip(self.lo_slice().iter().zip(self.hi_slice()))
                .all(|(byte, (lo, hi))| (*lo..=*hi).contains(byte))
    }

    /// Get the low bound as a slice (only valid bytes up to width).
    pub fn lo_slice(&self) -> &[u8] {
        &self.lo[..usize::from(self.width)]
    }

    /// Get the high bound as a slice (only valid bytes up to width).
    pub fn hi_slice(&self) -> &[u8] {
        &self.hi[..usize::from(self.width)]
    }
}

impl fmt::Display for CodespaceRange {
    /// Formats the range as `<LO> <HI> (N byte[s])` using uppercase hex digits.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Write the digits straight into the formatter instead of building
        // a temporary `String` for every byte.
        f.write_str("<")?;
        for byte in self.lo_slice() {
            write!(f, "{:02X}", byte)?;
        }
        f.write_str("> <")?;
        for byte in self.hi_slice() {
            write!(f, "{:02X}", byte)?;
        }
        let plural = if self.width == 1 { "" } else { "s" };
        write!(f, "> ({} byte{})", self.width, plural)
    }
}
/// Collection of codespace ranges from a CMap.
///
/// Most CMaps define 1-8 ranges. Predefined CMaps typically define:
/// - 1-byte ASCII range: <00> <7F>
/// - 2-byte CJK range: <8000> <FFFF> (or similar)
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CodespaceRanges {
/// The ranges in this CMap.
// The inline capacity of 8 matches the "1-8 ranges" expectation above, so the
// common case is held inside the `SmallVec` itself.
pub ranges: smallvec::SmallVec<[CodespaceRange; 8]>,
}
impl CodespaceRanges {
    /// Build a collection that holds no ranges yet.
    pub fn new() -> Self {
        let ranges = smallvec::SmallVec::new();
        Self { ranges }
    }

    /// Append `range` after the ranges already stored.
    pub fn push(&mut self, range: CodespaceRange) {
        self.ranges.push(range);
    }

    /// Report whether no range has been stored.
    pub fn is_empty(&self) -> bool {
        self.ranges.is_empty()
    }

    /// Report how many ranges are stored.
    pub fn len(&self) -> usize {
        self.ranges.len()
    }

    /// Locate the first stored range that accepts `bytes`.
    ///
    /// Ranges are tried in insertion order. Returns the index of the first one
    /// whose `contains` check succeeds, or `None` when none does.
    pub fn find_range(&self, bytes: &[u8]) -> Option<usize> {
        for (index, candidate) in self.ranges.iter().enumerate() {
            if candidate.contains(bytes) {
                return Some(index);
            }
        }
        None
    }

    /// Borrow the stored ranges as a plain slice.
    pub fn as_slice(&self) -> &[CodespaceRange] {
        &self.ranges[..]
    }
}
impl Default for CodespaceRanges {
fn default() -> Self {
Self::new()
}
}
impl fmt::Display for CodespaceRanges {
    /// Writes a header line with the range count, then one line per range.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let count = self.len();
        let plural = match count {
            1 => "",
            _ => "s",
        };
        writeln!(f, "CodespaceRanges ({} range{}):", count, plural)?;
        self.ranges
            .iter()
            .try_for_each(|range| writeln!(f, " {}", range))
    }
}
/// Shorthand for results whose failure case is a [`CodespaceError`].
pub type CodespaceResult<T> = Result<T, CodespaceError>;

/// Reasons a codespace range entry or block can be rejected.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CodespaceError {
    /// A hex string was malformed; the payload describes the problem.
    InvalidHexString(String),
    /// The low and high bounds have different byte lengths.
    WidthMismatch { lo_width: usize, hi_width: usize },
    /// The bounds have a byte length outside 1-4; the payload is that length.
    InvalidWidth(usize),
    /// A token appeared where a different one was required; the payload describes it.
    UnexpectedToken(String),
}

impl fmt::Display for CodespaceError {
    /// Renders a short lowercase description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match *self {
            Self::InvalidHexString(ref detail) => {
                write!(f, "invalid hex string: {}", detail)
            }
            Self::WidthMismatch { lo_width, hi_width } => write!(
                f,
                "width mismatch: lo has {} bytes, hi has {} bytes",
                lo_width, hi_width
            ),
            Self::InvalidWidth(found) => {
                write!(f, "invalid width: {} (must be 1-4)", found)
            }
            Self::UnexpectedToken(ref detail) => {
                write!(f, "unexpected token: {}", detail)
            }
        }
    }
}

impl std::error::Error for CodespaceError {}
/// Codespace range parser for CMap streams.
///
/// Parses PostScript-style `begincodespacerange` / `endcodespacerange` blocks
/// and extracts the byte-width boundaries used for multi-byte tokenization.
pub struct CodespaceParser<'a> {
/// Raw bytes being scanned; borrowed for the lifetime of the parser.
input: &'a [u8],
/// Byte offset into `input` of the next unread byte.
position: usize,
/// Diagnostics collected so far; returned to the caller by `parse`.
diagnostics: Vec<crate::diagnostics::Diagnostic>,
}
impl<'a> CodespaceParser<'a> {
/// Create a new codespace parser for the given input bytes.
pub fn new(input: &'a [u8]) -> Self {
Self {
input,
position: 0,
diagnostics: Vec::new(),
}
}
/// Parse the codespace ranges from the input.
///
/// Returns the parsed ranges along with any diagnostics generated during parsing.
pub fn parse(mut self) -> (CodespaceRanges, Vec<crate::diagnostics::Diagnostic>) {
let mut ranges = CodespaceRanges::new();
while let Some(token) = self.next_token() {
match token {
Token::Eof => break,
Token::Keyword(ref kw) => {
match kw.as_slice() {
b"begincodespacerange" => {
if let Err(e) = self.parse_codespace_block(&mut ranges) {
self.emit_error(&e);
// Recovery: skip to endcodespacerange
self.skip_to_keyword(b"endcodespacerange");
}
}
b"endcodespacerange" => {
// Unexpected - should have been consumed by parse_codespace_block
self.diagnostics.push(crate::diagnostics::Diagnostic::with_dynamic(
DiagCode::CmapInvalidCodespace,
self.position as u64,
"Unbalanced codespace block: endcodespacerange without begincodespacerange".to_string(),
));
}
_ => {
// Unknown keyword - skip (may be other CMap blocks)
}
}
}
_ => {
// Unexpected token - skip
}
}
}
(ranges, self.diagnostics)
}
/// Parse a begincodespacerange...endcodespacerange block.
fn parse_codespace_block(&mut self, ranges: &mut CodespaceRanges) -> Result<(), CodespaceError> {
// Read count
let count = self.expect_integer()?;
if count < 0 {
return Err(CodespaceError::UnexpectedToken(
"negative codespace range count".to_string(),
));
}
let count = count as usize;
// Read count pairs of <lo> <hi>
for _ in 0..count {
let lo = self.expect_hex_string()?;
let hi = self.expect_hex_string()?;
// Validate width
if lo.len() != hi.len() {
emit!(self.diagnostics, CmapInvalidCodespace);
return Err(CodespaceError::WidthMismatch {
lo_width: lo.len(),
hi_width: hi.len(),
});
}
let width = lo.len();
if width < 1 || width > 4 {
emit!(self.diagnostics, CmapInvalidCodespace);
return Err(CodespaceError::InvalidWidth(width));
}
// Create range with 4-byte arrays
let mut lo_arr = [0u8; 4];
let mut hi_arr = [0u8; 4];
for (i, &b) in lo.iter().enumerate() {
lo_arr[i] = b;
}
for (i, &b) in hi.iter().enumerate() {
hi_arr[i] = b;
}
ranges.push(CodespaceRange::new(lo_arr, hi_arr, width as u8));
}
// Expect endcodespacerange
self.expect_keyword(b"endcodespacerange")?;
Ok(())
}
/// Get the next token from the input.
fn next_token(&mut self) -> Option<Token> {
self.skip_whitespace();
if self.position >= self.input.len() {
return Some(Token::Eof);
}
let byte = self.input[self.position];
match byte {
b'<' => {
// Hex string or dictionary marker
if self.position + 1 < self.input.len() && self.input[self.position + 1] == b'<' {
self.position += 2;
Some(Token::DictStart)
} else {
self.parse_hex_string().map(Token::String)
}
}
b'>' => {
// Dictionary end
if self.position + 1 < self.input.len() && self.input[self.position + 1] == b'>' {
self.position += 2;
Some(Token::DictEnd)
} else {
// Lone > - treat as unexpected
self.position += 1;
Some(Token::Unexpected(byte))
}
}
b'/' => {
// Name (skip for codespace parsing)
self.parse_name();
self.next_token()
}
b'0'..=b'9' | b'-' => {
// Integer
self.parse_integer().map(Token::Integer)
}
b'%' => {
// Comment - skip to end of line
while self.position < self.input.len() && self.input[self.position] != b'\n' {
self.position += 1;
}
self.next_token()
}
b'a'..=b'z' | b'A'..=b'Z' => {
// Keyword
self.parse_keyword().map(Token::Keyword)
}
_ => {
// Unexpected byte
self.position += 1;
Some(Token::Unexpected(byte))
}
}
}
/// Parse a hex string <...>.
fn parse_hex_string(&mut self) -> Option<Vec<u8>> {
if self.position >= self.input.len() || self.input[self.position] != b'<' {
return None;
}
self.position += 1; // skip <
// Check for empty string <>
if self.position < self.input.len() && self.input[self.position] == b'>' {
self.position += 1;
return Some(Vec::new());
}
let mut bytes = Vec::new();
let mut current = 0u8;
let mut nibble = 0;
while self.position < self.input.len() {
let byte = self.input[self.position];
self.position += 1;
if byte == b'>' {
if nibble == 1 {
bytes.push(current);
}
break;
}
// Skip whitespace in hex string
if byte.is_ascii_whitespace() {
continue;
}
// Parse hex nibble
let nibble_value = match byte {
b'0'..=b'9' => byte - b'0',
b'a'..=b'f' => byte - b'a' + 10,
b'A'..=b'F' => byte - b'A' + 10,
_ => {
// Invalid hex - emit diagnostic and skip
emit!(self.diagnostics, CmapInvalidCodespace);
continue;
}
};
if nibble == 0 {
current = nibble_value << 4;
nibble = 1;
} else {
current |= nibble_value;
bytes.push(current);
current = 0;
nibble = 0;
}
}
Some(bytes)
}
/// Parse an integer.
fn parse_integer(&mut self) -> Option<i64> {
let start = self.position;
// Handle optional negative sign
if self.position < self.input.len() && self.input[self.position] == b'-' {
self.position += 1;
}
// Parse digits
while self.position < self.input.len() && self.input[self.position].is_ascii_digit() {
self.position += 1;
}
if self.position == start {
return None;
}
let s = std::str::from_utf8(&self.input[start..self.position]).ok()?;
s.parse().ok()
}
/// Parse a keyword (sequence of letters).
fn parse_keyword(&mut self) -> Option<Vec<u8>> {
let start = self.position;
while self.position < self.input.len() {
let byte = self.input[self.position];
if byte.is_ascii_alphabetic() {
self.position += 1;
} else {
break;
}
}
if self.position > start {
Some(self.input[start..self.position].to_vec())
} else {
None
}
}
/// Parse and skip a name (/Name).
fn parse_name(&mut self) {
if self.position < self.input.len() && self.input[self.position] == b'/' {
self.position += 1;
// Skip to next whitespace or delimiter
while self.position < self.input.len() && !self.input[self.position].is_ascii_whitespace() && self.input[self.position] != b'/' && self.input[self.position] != b'<' && self.input[self.position] != b'>' {
self.position += 1;
}
}
}
/// Skip whitespace.
fn skip_whitespace(&mut self) {
while self.position < self.input.len() && self.input[self.position].is_ascii_whitespace() {
self.position += 1;
}
}
/// Expect an integer token.
fn expect_integer(&mut self) -> Result<i64, CodespaceError> {
match self.next_token() {
Some(Token::Integer(n)) => Ok(n),
Some(other) => Err(CodespaceError::UnexpectedToken(format!(
"expected integer, got {:?}",
other
))),
None => Err(CodespaceError::UnexpectedToken("expected integer".to_string())),
}
}
/// Expect a hex string token.
fn expect_hex_string(&mut self) -> Result<Vec<u8>, CodespaceError> {
match self.next_token() {
Some(Token::String(bytes)) => Ok(bytes),
Some(other) => Err(CodespaceError::UnexpectedToken(format!(
"expected hex string, got {:?}",
other
))),
None => Err(CodespaceError::UnexpectedToken("expected hex string".to_string())),
}
}
/// Expect a specific keyword.
fn expect_keyword(&mut self, expected: &[u8]) -> Result<(), CodespaceError> {
match self.next_token() {
Some(Token::Keyword(ref kw)) if kw == expected => Ok(()),
Some(_other) => Err(CodespaceError::UnexpectedToken(format!(
"expected keyword {}",
String::from_utf8_lossy(expected)
))),
None => Err(CodespaceError::UnexpectedToken(format!(
"expected keyword {}",
String::from_utf8_lossy(expected)
))),
}
}
/// Skip tokens until we find the expected keyword.
fn skip_to_keyword(&mut self, keyword: &[u8]) {
while let Some(token) = self.next_token() {
if let Token::Keyword(ref kw) = token {
if kw == keyword {
break;
}
}
}
}
/// Emit an error as a diagnostic.
fn emit_error(&mut self, error: &CodespaceError) {
self.diagnostics.push(crate::diagnostics::Diagnostic::with_dynamic(
DiagCode::CmapInvalidCodespace,
self.position as u64,
error.to_string(),
));
}
}
/// Token produced by the codespace lexer.
///
/// String and keyword tokens own their bytes (`Vec<u8>`) rather than borrowing
/// from the input.
#[derive(Debug)]
enum Token {
/// End of input
Eof,
/// Hex string contents (without < > delimiters), already decoded from hex
/// digits into raw bytes
String(Vec<u8>),
/// Integer value
Integer(i64),
/// Keyword (e.g., begincodespacerange), stored as the raw ASCII letters
Keyword(Vec<u8>),
/// Dictionary start (<<)
DictStart,
/// Dictionary end (>>)
DictEnd,
/// Unexpected byte; the payload is the byte the lexer could not turn into
/// any other token
Unexpected(u8),
}
/// Extract the codespace ranges from raw CMap bytes.
///
/// Convenience wrapper that runs a [`CodespaceParser`] over `input` and keeps
/// only the ranges; any diagnostics produced along the way are dropped.
pub fn parse_codespace_ranges(input: &[u8]) -> CodespaceRanges {
    CodespaceParser::new(input).parse().0
}
/// Extract the codespace ranges from raw CMap bytes, keeping the diagnostics.
///
/// Runs a [`CodespaceParser`] over `input` and returns the ranges together with
/// every diagnostic generated while parsing.
pub fn parse_codespace_ranges_with_diags(input: &[u8]) -> (CodespaceRanges, Vec<crate::diagnostics::Diagnostic>) {
    let (ranges, diagnostics) = CodespaceParser::new(input).parse();
    (ranges, diagnostics)
}
#[cfg(test)]
mod tests {
    // Unit tests for the codespace range types and the block parser.
    use super::*;

    #[test]
    fn test_parse_single_range_1_byte() {
        let input = b"1 begincodespacerange\n<00> <7F>\nendcodespacerange";
        let parser = CodespaceParser::new(input);
        let (ranges, diags) = parser.parse();
        assert_eq!(ranges.len(), 1);
        assert!(diags.is_empty());
        let range = &ranges.ranges[0];
        assert_eq!(range.width, 1);
        assert_eq!(range.lo_slice(), &[0x00]);
        assert_eq!(range.hi_slice(), &[0x7F]);
    }

    #[test]
    fn test_parse_two_ranges_mixed_width() {
        // Acceptance criterion: <00> <7F> <8000> <FFFF> in one block → 2 ranges
        let input = b"2 begincodespacerange\n<00> <7F>\n<8000> <FFFF>\nendcodespacerange";
        let parser = CodespaceParser::new(input);
        let (ranges, diags) = parser.parse();
        assert_eq!(ranges.len(), 2);
        assert!(diags.is_empty());
        // First range: 1-byte
        assert_eq!(ranges.ranges[0].width, 1);
        assert_eq!(ranges.ranges[0].lo_slice(), &[0x00]);
        assert_eq!(ranges.ranges[0].hi_slice(), &[0x7F]);
        // Second range: 2-byte
        assert_eq!(ranges.ranges[1].width, 2);
        assert_eq!(ranges.ranges[1].lo_slice(), &[0x80, 0x00]);
        assert_eq!(ranges.ranges[1].hi_slice(), &[0xFF, 0xFF]);
    }

    #[test]
    fn test_width_inference() {
        // Acceptance criterion: 2-char hex → width=1; 4-char hex → width=2
        let input = b"2 begincodespacerange\n<C0> <FF>\n<8140> <FEFE>\nendcodespacerange";
        let ranges = parse_codespace_ranges(input);
        assert_eq!(ranges.len(), 2);
        assert_eq!(ranges.ranges[0].width, 1);
        assert_eq!(ranges.ranges[1].width, 2);
    }

    #[test]
    fn test_case_insensitive_hex() {
        // Acceptance criterion: <C0> and <c0> equivalent
        let input = b"2 begincodespacerange\n<C0> <FF>\n<c0> <ff>\nendcodespacerange";
        let ranges = parse_codespace_ranges(input);
        assert_eq!(ranges.len(), 2);
        // Both ranges should parse identically
        assert_eq!(ranges.ranges[0].lo_slice(), ranges.ranges[1].lo_slice());
        assert_eq!(ranges.ranges[0].hi_slice(), ranges.ranges[1].hi_slice());
    }

    #[test]
    fn test_width_mismatch_emits_diagnostic() {
        // Acceptance criterion: mismatched lo/hi width → diagnostic + skipped
        let input = b"1 begincodespacerange\n<00> <FFFF>\nendcodespacerange";
        let parser = CodespaceParser::new(input);
        let (ranges, diags) = parser.parse();
        // Should have diagnostic and empty ranges (recovery)
        assert!(!diags.is_empty());
        assert!(diags.iter().any(|d| d.code == DiagCode::CmapInvalidCodespace));
        // The malformed range should be skipped
        assert_eq!(ranges.len(), 0);
    }

    #[test]
    fn test_empty_cmap() {
        // Acceptance criterion: empty CMap → empty ranges
        let input = b"";
        let ranges = parse_codespace_ranges(input);
        assert!(ranges.is_empty());
    }

    #[test]
    fn test_jis_lead_trail_pattern() {
        // JIS 2-byte pattern example
        let input = b"1 begincodespacerange\n<8140> <FEFE>\nendcodespacerange";
        let ranges = parse_codespace_ranges(input);
        assert_eq!(ranges.len(), 1);
        assert_eq!(ranges.ranges[0].width, 2);
        assert_eq!(ranges.ranges[0].lo_slice(), &[0x81, 0x40]);
        assert_eq!(ranges.ranges[0].hi_slice(), &[0xFE, 0xFE]);
    }

    #[test]
    fn test_codespace_range_contains() {
        let range = CodespaceRange::new([0x00, 0, 0, 0], [0x7F, 0, 0, 0], 1);
        // Valid bytes in range
        assert!(range.contains(&[0x00]));
        assert!(range.contains(&[0x40]));
        assert!(range.contains(&[0x7F]));
        // Outside range
        assert!(!range.contains(&[0x80]));
        assert!(!range.contains(&[0xFF]));
        // Wrong width
        assert!(!range.contains(&[]));
        assert!(!range.contains(&[0x00, 0x00]));
    }

    #[test]
    fn test_codespace_range_contains_2_byte() {
        let range = CodespaceRange::new([0x80, 0x00, 0, 0], [0xFF, 0xFF, 0, 0], 2);
        // Valid bytes in range
        assert!(range.contains(&[0x80, 0x00]));
        assert!(range.contains(&[0xA0, 0xA0]));
        assert!(range.contains(&[0xFF, 0xFF]));
        // Outside range
        assert!(!range.contains(&[0x00, 0x00]));
        assert!(!range.contains(&[0x7F, 0xFF]));
        // Wrong width
        assert!(!range.contains(&[0x80]));
        assert!(!range.contains(&[0x80, 0x00, 0x00]));
    }

    #[test]
    fn test_find_range() {
        let mut ranges = CodespaceRanges::new();
        ranges.push(CodespaceRange::new([0x00, 0, 0, 0], [0x7F, 0, 0, 0], 1));
        ranges.push(CodespaceRange::new([0x80, 0x00, 0, 0], [0xFF, 0xFF, 0, 0], 2));
        // 1-byte sequence
        assert_eq!(ranges.find_range(&[0x40]), Some(0));
        assert_eq!(ranges.find_range(&[0x80]), None);
        // 2-byte sequence
        assert_eq!(ranges.find_range(&[0x80, 0x00]), Some(1));
        assert_eq!(ranges.find_range(&[0x00, 0x00]), None);
    }

    #[test]
    fn test_invalid_hex_emits_diagnostic() {
        // Invalid hex characters in string
        let input = b"1 begincodespacerange\n<XG> <FF>\nendcodespacerange";
        let parser = CodespaceParser::new(input);
        let (ranges, diags) = parser.parse();
        // Should have diagnostic
        assert!(!diags.is_empty());
        assert!(diags.iter().any(|d| d.code == DiagCode::CmapInvalidCodespace));
        // <XG> holds no usable digits, so the entry cannot form a range
        assert!(ranges.is_empty());
    }

    #[test]
    fn test_empty_hex_string() {
        // Empty hex string <>
        let input = b"1 begincodespacerange\n<> <>\nendcodespacerange";
        let (ranges, diags) = parse_codespace_ranges_with_diags(input);
        // Empty strings parse as 0 bytes, and width 0 is invalid
        assert!(ranges.is_empty());
        assert!(diags.iter().any(|d| d.code == DiagCode::CmapInvalidCodespace));
    }

    #[test]
    fn test_3_byte_range() {
        // 3-byte range (valid per spec)
        let input = b"1 begincodespacerange\n<800000> <FFFFFF>\nendcodespacerange";
        let ranges = parse_codespace_ranges(input);
        assert_eq!(ranges.len(), 1);
        assert_eq!(ranges.ranges[0].width, 3);
        assert_eq!(ranges.ranges[0].lo_slice(), &[0x80, 0x00, 0x00]);
        assert_eq!(ranges.ranges[0].hi_slice(), &[0xFF, 0xFF, 0xFF]);
    }

    #[test]
    fn test_4_byte_range() {
        // 4-byte range (max valid width)
        let input = b"1 begincodespacerange\n<80000000> <FFFFFFFF>\nendcodespacerange";
        let ranges = parse_codespace_ranges(input);
        assert_eq!(ranges.len(), 1);
        assert_eq!(ranges.ranges[0].width, 4);
        assert_eq!(ranges.ranges[0].lo_slice(), &[0x80, 0x00, 0x00, 0x00]);
        assert_eq!(ranges.ranges[0].hi_slice(), &[0xFF, 0xFF, 0xFF, 0xFF]);
    }

    #[test]
    fn test_comments_ignored() {
        // Comments should be ignored
        let input = b"% This is a comment\n1 begincodespacerange\n% Another comment\n<00> <7F>\nendcodespacerange";
        let ranges = parse_codespace_ranges(input);
        assert_eq!(ranges.len(), 1);
        assert_eq!(ranges.ranges[0].width, 1);
    }

    #[test]
    fn test_whitespace_variations() {
        // Tabs and newlines are interchangeable as token separators
        let input = b"1 begincodespacerange\t<00>\t<7F>\nendcodespacerange";
        let ranges = parse_codespace_ranges(input);
        assert_eq!(ranges.len(), 1);
    }

    #[test]
    fn test_recovery_after_invalid_range() {
        // First range is invalid, second is valid
        let input = b"2 begincodespacerange\n<00> <FFFF>\n<00> <7F>\nendcodespacerange";
        let parser = CodespaceParser::new(input);
        let (ranges, diags) = parser.parse();
        // Should have diagnostic for first range
        assert!(!diags.is_empty());
        // Should skip first range but continue to parse second
        assert_eq!(ranges.len(), 1);
        assert_eq!(ranges.ranges[0].width, 1);
    }

    #[test]
    fn test_display() {
        let ranges = CodespaceRanges {
            ranges: smallvec::smallvec![
                CodespaceRange::new([0x00, 0, 0, 0], [0x7F, 0, 0, 0], 1),
                CodespaceRange::new([0x80, 0x00, 0, 0], [0xFF, 0xFF, 0, 0], 2),
            ],
        };
        let display = format!("{}", ranges);
        assert!(display.contains("CodespaceRanges"));
        assert!(display.contains("2 ranges"));
        // Each range is listed with its bounds and width
        assert!(display.contains("<00> <7F> (1 byte)"));
        assert!(display.contains("<8000> <FFFF> (2 bytes)"));
    }

    #[test]
    fn test_identity_h_cmap() {
        // Identity-H CMap has specific codespace ranges
        // Most commonly: <00> <FF> for 1-byte and <0100> <FFFF> for 2-byte
        let input = b"2 begincodespacerange\n<00> <FF>\n<0100> <FFFF>\nendcodespacerange";
        let ranges = parse_codespace_ranges(input);
        assert_eq!(ranges.len(), 2);
        // 1-byte range covers all single bytes
        assert_eq!(ranges.ranges[0].width, 1);
        assert!(ranges.ranges[0].contains(&[0x00]));
        assert!(ranges.ranges[0].contains(&[0xFF]));
        // 2-byte range covers 0x0100-0xFFFF
        assert_eq!(ranges.ranges[1].width, 2);
        assert!(ranges.ranges[1].contains(&[0x01, 0x00]));
        assert!(ranges.ranges[1].contains(&[0xFF, 0xFF]));
    }
}