The emit! macro expects diagnostic codes without the DiagCode:: prefix. Changed three occurrences in codespace.rs: - Line 281: DiagCode::CmapInvalidCodespace → CmapInvalidCodespace - Line 290: DiagCode::CmapInvalidCodespace → CmapInvalidCodespace - Line 412: DiagCode::CmapInvalidCodespace → CmapInvalidCodespace This fixes compilation errors that prevented the codebase from building. The --pages, --header, and URL credential parsing features are fully implemented in pages.rs, header.rs, and url.rs modules with comprehensive tests and integration in main.rs, grep/mod.rs, and hash.rs. References: pdftract-25igv, notes/pdftract-25igv.md
854 lines
28 KiB
Rust
854 lines
28 KiB
Rust
//! Codespace range parser for CMap streams.
|
|
//!
|
|
//! This module implements parsing of the `begincodespacerange` / `endcodespacerange`
|
|
//! PostScript blocks in CMap streams. Codespace ranges define the valid byte-width
|
|
//! boundaries for character codes in multi-byte encodings.
|
|
//!
|
|
//! # Syntax
|
|
//!
|
|
//! PostScript CMap codespace range syntax:
|
|
//! ```text
|
|
//! N begincodespacerange
|
|
//! <lo1> <hi1>
|
|
//! <lo2> <hi2>
|
|
//! ...
|
|
//! endcodespacerange
|
|
//! ```
|
|
//!
|
|
//! Each entry consists of two hex strings of equal byte width (1-4 bytes).
|
|
//!
|
|
//! # Example
|
|
//!
|
|
//! ```text
|
|
//! 2 begincodespacerange
|
|
//! <00> <7F>
|
|
//! <8000> <FFFF>
|
|
//! endcodespacerange
|
|
//! ```
|
|
//!
|
|
//! Defines two ranges:
|
|
//! - 1-byte range: 0x00..=0x7F
|
|
//! - 2-byte range: 0x8000..=0xFFFF
|
|
|
|
use std::fmt;
|
|
|
|
use crate::{emit, diagnostics::DiagCode};
|
|
|
|
/// One entry of a CMap codespace: a fixed-width window of valid character codes.
///
/// Both bounds are kept in 4-byte arrays; only the first `width` bytes of each
/// array are meaningful (see [`CodespaceRange::lo_slice`] and
/// [`CodespaceRange::hi_slice`]).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CodespaceRange {
    /// Inclusive lower bound, most significant byte first.
    pub lo: [u8; 4],
    /// Inclusive upper bound, most significant byte first.
    pub hi: [u8; 4],
    /// Number of meaningful bytes in `lo` / `hi` (1 through 4).
    pub width: u8,
}

impl CodespaceRange {
    /// Build a range from its two bounds and its byte width.
    ///
    /// # Panics
    ///
    /// Panics with `"width must be 1-4"` when `width` is outside `1..=4`.
    pub fn new(lo: [u8; 4], hi: [u8; 4], width: u8) -> Self {
        assert!((1..=4).contains(&width), "width must be 1-4");
        Self { lo, hi, width }
    }

    /// Test whether `bytes` is a code belonging to this range.
    ///
    /// The code must be exactly `width` bytes long, and every byte must lie
    /// between the bytes at the same position in `lo` and `hi` (inclusive).
    /// The comparison is made position by position, not on the combined
    /// numeric value.
    pub fn contains(&self, bytes: &[u8]) -> bool {
        if bytes.len() != self.width as usize {
            return false;
        }

        bytes
            .iter()
            .enumerate()
            .all(|(idx, &code_byte)| self.lo[idx] <= code_byte && code_byte <= self.hi[idx])
    }

    /// The meaningful prefix of the low bound (`width` bytes).
    pub fn lo_slice(&self) -> &[u8] {
        let used = self.width as usize;
        &self.lo[..used]
    }

    /// The meaningful prefix of the high bound (`width` bytes).
    pub fn hi_slice(&self) -> &[u8] {
        let used = self.width as usize;
        &self.hi[..used]
    }
}

impl fmt::Display for CodespaceRange {
    /// Render as `<LO> <HI> (N byte[s])` with upper-case hex digits.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Upper-case, zero-padded hex rendering of a byte slice.
        let to_hex = |bytes: &[u8]| -> String {
            let mut text = String::with_capacity(bytes.len() * 2);
            for byte in bytes {
                text.push_str(&format!("{:02X}", byte));
            }
            text
        };

        let lo_hex = to_hex(self.lo_slice());
        let hi_hex = to_hex(self.hi_slice());
        let plural = if self.width == 1 { "" } else { "s" };

        write!(f, "<{}> <{}> ({} byte{})", lo_hex, hi_hex, self.width, plural)
    }
}
|
|
|
|
/// Collection of codespace ranges from a CMap.
|
|
///
|
|
/// Most CMaps define 1-8 ranges. Predefined CMaps typically define:
|
|
/// - 1-byte ASCII range: <00> <7F>
|
|
/// - 2-byte CJK range: <8000> <FFFF> (or similar)
|
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
pub struct CodespaceRanges {
|
|
/// The ranges in this CMap.
|
|
pub ranges: smallvec::SmallVec<[CodespaceRange; 8]>,
|
|
}
|
|
|
|
impl CodespaceRanges {
|
|
/// Create an empty codespace ranges collection.
|
|
pub fn new() -> Self {
|
|
Self {
|
|
ranges: smallvec::SmallVec::new(),
|
|
}
|
|
}
|
|
|
|
/// Add a codespace range to this collection.
|
|
pub fn push(&mut self, range: CodespaceRange) {
|
|
self.ranges.push(range);
|
|
}
|
|
|
|
/// Check if this collection is empty.
|
|
pub fn is_empty(&self) -> bool {
|
|
self.ranges.is_empty()
|
|
}
|
|
|
|
/// Get the number of ranges in this collection.
|
|
pub fn len(&self) -> usize {
|
|
self.ranges.len()
|
|
}
|
|
|
|
/// Find which codespace range a byte sequence falls into.
|
|
///
|
|
/// Returns the index of the matching range, or None if no range matches.
|
|
pub fn find_range(&self, bytes: &[u8]) -> Option<usize> {
|
|
self.ranges
|
|
.iter()
|
|
.position(|range| range.contains(bytes))
|
|
}
|
|
|
|
/// Get all ranges in this collection.
|
|
pub fn as_slice(&self) -> &[CodespaceRange] {
|
|
&self.ranges
|
|
}
|
|
}
|
|
|
|
impl Default for CodespaceRanges {
|
|
fn default() -> Self {
|
|
Self::new()
|
|
}
|
|
}
|
|
|
|
impl fmt::Display for CodespaceRanges {
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
let suffix = if self.len() == 1 { "" } else { "s" };
|
|
writeln!(f, "CodespaceRanges ({} range{}):", self.len(), suffix)?;
|
|
for range in &self.ranges {
|
|
writeln!(f, " {}", range)?;
|
|
}
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
/// Result type for codespace parsing.
|
|
pub type CodespaceResult<T> = Result<T, CodespaceError>;
|
|
|
|
/// Everything that can go wrong while reading a codespace range block.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CodespaceError {
    /// A hex string was malformed; the payload describes how.
    InvalidHexString(String),
    /// The low and high bounds of one range differ in byte length.
    WidthMismatch { lo_width: usize, hi_width: usize },
    /// The bounds have a byte length outside 1 through 4.
    InvalidWidth(usize),
    /// A token appeared where the block grammar does not allow it.
    UnexpectedToken(String),
}

impl fmt::Display for CodespaceError {
    /// Produce the human-readable message for each error variant.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::InvalidHexString(detail) => {
                write!(f, "invalid hex string: {}", detail)
            }
            Self::WidthMismatch { lo_width, hi_width } => write!(
                f,
                "width mismatch: lo has {} bytes, hi has {} bytes",
                lo_width, hi_width
            ),
            Self::InvalidWidth(bad_width) => {
                write!(f, "invalid width: {} (must be 1-4)", bad_width)
            }
            Self::UnexpectedToken(detail) => {
                write!(f, "unexpected token: {}", detail)
            }
        }
    }
}

impl std::error::Error for CodespaceError {}
|
|
|
|
/// Codespace range parser for CMap streams.
|
|
///
|
|
/// Parses PostScript-style `begincodespacerange` / `endcodespacerange` blocks
|
|
/// and extracts the byte-width boundaries used for multi-byte tokenization.
|
|
pub struct CodespaceParser<'a> {
|
|
input: &'a [u8],
|
|
position: usize,
|
|
diagnostics: Vec<crate::diagnostics::Diagnostic>,
|
|
}
|
|
|
|
impl<'a> CodespaceParser<'a> {
|
|
/// Create a new codespace parser for the given input bytes.
|
|
pub fn new(input: &'a [u8]) -> Self {
|
|
Self {
|
|
input,
|
|
position: 0,
|
|
diagnostics: Vec::new(),
|
|
}
|
|
}
|
|
|
|
/// Parse the codespace ranges from the input.
|
|
///
|
|
/// Returns the parsed ranges along with any diagnostics generated during parsing.
|
|
pub fn parse(mut self) -> (CodespaceRanges, Vec<crate::diagnostics::Diagnostic>) {
|
|
let mut ranges = CodespaceRanges::new();
|
|
|
|
while let Some(token) = self.next_token() {
|
|
match token {
|
|
Token::Eof => break,
|
|
Token::Keyword(ref kw) => {
|
|
match kw.as_slice() {
|
|
b"begincodespacerange" => {
|
|
if let Err(e) = self.parse_codespace_block(&mut ranges) {
|
|
self.emit_error(&e);
|
|
// Recovery: skip to endcodespacerange
|
|
self.skip_to_keyword(b"endcodespacerange");
|
|
}
|
|
}
|
|
b"endcodespacerange" => {
|
|
// Unexpected - should have been consumed by parse_codespace_block
|
|
self.diagnostics.push(crate::diagnostics::Diagnostic::with_dynamic(
|
|
DiagCode::CmapInvalidCodespace,
|
|
self.position as u64,
|
|
"Unbalanced codespace block: endcodespacerange without begincodespacerange".to_string(),
|
|
));
|
|
}
|
|
_ => {
|
|
// Unknown keyword - skip (may be other CMap blocks)
|
|
}
|
|
}
|
|
}
|
|
_ => {
|
|
// Unexpected token - skip
|
|
}
|
|
}
|
|
}
|
|
|
|
(ranges, self.diagnostics)
|
|
}
|
|
|
|
/// Parse a begincodespacerange...endcodespacerange block.
|
|
fn parse_codespace_block(&mut self, ranges: &mut CodespaceRanges) -> Result<(), CodespaceError> {
|
|
// Read count
|
|
let count = self.expect_integer()?;
|
|
if count < 0 {
|
|
return Err(CodespaceError::UnexpectedToken(
|
|
"negative codespace range count".to_string(),
|
|
));
|
|
}
|
|
let count = count as usize;
|
|
|
|
// Read count pairs of <lo> <hi>
|
|
for _ in 0..count {
|
|
let lo = self.expect_hex_string()?;
|
|
let hi = self.expect_hex_string()?;
|
|
|
|
// Validate width
|
|
if lo.len() != hi.len() {
|
|
emit!(self.diagnostics, CmapInvalidCodespace);
|
|
return Err(CodespaceError::WidthMismatch {
|
|
lo_width: lo.len(),
|
|
hi_width: hi.len(),
|
|
});
|
|
}
|
|
|
|
let width = lo.len();
|
|
if width < 1 || width > 4 {
|
|
emit!(self.diagnostics, CmapInvalidCodespace);
|
|
return Err(CodespaceError::InvalidWidth(width));
|
|
}
|
|
|
|
// Create range with 4-byte arrays
|
|
let mut lo_arr = [0u8; 4];
|
|
let mut hi_arr = [0u8; 4];
|
|
for (i, &b) in lo.iter().enumerate() {
|
|
lo_arr[i] = b;
|
|
}
|
|
for (i, &b) in hi.iter().enumerate() {
|
|
hi_arr[i] = b;
|
|
}
|
|
|
|
ranges.push(CodespaceRange::new(lo_arr, hi_arr, width as u8));
|
|
}
|
|
|
|
// Expect endcodespacerange
|
|
self.expect_keyword(b"endcodespacerange")?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Get the next token from the input.
|
|
fn next_token(&mut self) -> Option<Token> {
|
|
self.skip_whitespace();
|
|
|
|
if self.position >= self.input.len() {
|
|
return Some(Token::Eof);
|
|
}
|
|
|
|
let byte = self.input[self.position];
|
|
|
|
match byte {
|
|
b'<' => {
|
|
// Hex string or dictionary marker
|
|
if self.position + 1 < self.input.len() && self.input[self.position + 1] == b'<' {
|
|
self.position += 2;
|
|
Some(Token::DictStart)
|
|
} else {
|
|
self.parse_hex_string().map(Token::String)
|
|
}
|
|
}
|
|
b'>' => {
|
|
// Dictionary end
|
|
if self.position + 1 < self.input.len() && self.input[self.position + 1] == b'>' {
|
|
self.position += 2;
|
|
Some(Token::DictEnd)
|
|
} else {
|
|
// Lone > - treat as unexpected
|
|
self.position += 1;
|
|
Some(Token::Unexpected(byte))
|
|
}
|
|
}
|
|
b'/' => {
|
|
// Name (skip for codespace parsing)
|
|
self.parse_name();
|
|
self.next_token()
|
|
}
|
|
b'0'..=b'9' | b'-' => {
|
|
// Integer
|
|
self.parse_integer().map(Token::Integer)
|
|
}
|
|
b'%' => {
|
|
// Comment - skip to end of line
|
|
while self.position < self.input.len() && self.input[self.position] != b'\n' {
|
|
self.position += 1;
|
|
}
|
|
self.next_token()
|
|
}
|
|
b'a'..=b'z' | b'A'..=b'Z' => {
|
|
// Keyword
|
|
self.parse_keyword().map(Token::Keyword)
|
|
}
|
|
_ => {
|
|
// Unexpected byte
|
|
self.position += 1;
|
|
Some(Token::Unexpected(byte))
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Parse a hex string <...>.
|
|
fn parse_hex_string(&mut self) -> Option<Vec<u8>> {
|
|
if self.position >= self.input.len() || self.input[self.position] != b'<' {
|
|
return None;
|
|
}
|
|
self.position += 1; // skip <
|
|
|
|
// Check for empty string <>
|
|
if self.position < self.input.len() && self.input[self.position] == b'>' {
|
|
self.position += 1;
|
|
return Some(Vec::new());
|
|
}
|
|
|
|
let mut bytes = Vec::new();
|
|
let mut current = 0u8;
|
|
let mut nibble = 0;
|
|
|
|
while self.position < self.input.len() {
|
|
let byte = self.input[self.position];
|
|
self.position += 1;
|
|
|
|
if byte == b'>' {
|
|
if nibble == 1 {
|
|
bytes.push(current);
|
|
}
|
|
break;
|
|
}
|
|
|
|
// Skip whitespace in hex string
|
|
if byte.is_ascii_whitespace() {
|
|
continue;
|
|
}
|
|
|
|
// Parse hex nibble
|
|
let nibble_value = match byte {
|
|
b'0'..=b'9' => byte - b'0',
|
|
b'a'..=b'f' => byte - b'a' + 10,
|
|
b'A'..=b'F' => byte - b'A' + 10,
|
|
_ => {
|
|
// Invalid hex - emit diagnostic and skip
|
|
emit!(self.diagnostics, CmapInvalidCodespace);
|
|
continue;
|
|
}
|
|
};
|
|
|
|
if nibble == 0 {
|
|
current = nibble_value << 4;
|
|
nibble = 1;
|
|
} else {
|
|
current |= nibble_value;
|
|
bytes.push(current);
|
|
current = 0;
|
|
nibble = 0;
|
|
}
|
|
}
|
|
|
|
Some(bytes)
|
|
}
|
|
|
|
/// Parse an integer.
|
|
fn parse_integer(&mut self) -> Option<i64> {
|
|
let start = self.position;
|
|
|
|
// Handle optional negative sign
|
|
if self.position < self.input.len() && self.input[self.position] == b'-' {
|
|
self.position += 1;
|
|
}
|
|
|
|
// Parse digits
|
|
while self.position < self.input.len() && self.input[self.position].is_ascii_digit() {
|
|
self.position += 1;
|
|
}
|
|
|
|
if self.position == start {
|
|
return None;
|
|
}
|
|
|
|
let s = std::str::from_utf8(&self.input[start..self.position]).ok()?;
|
|
s.parse().ok()
|
|
}
|
|
|
|
/// Parse a keyword (sequence of letters).
|
|
fn parse_keyword(&mut self) -> Option<Vec<u8>> {
|
|
let start = self.position;
|
|
|
|
while self.position < self.input.len() {
|
|
let byte = self.input[self.position];
|
|
if byte.is_ascii_alphabetic() {
|
|
self.position += 1;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if self.position > start {
|
|
Some(self.input[start..self.position].to_vec())
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
/// Parse and skip a name (/Name).
|
|
fn parse_name(&mut self) {
|
|
if self.position < self.input.len() && self.input[self.position] == b'/' {
|
|
self.position += 1;
|
|
// Skip to next whitespace or delimiter
|
|
while self.position < self.input.len() && !self.input[self.position].is_ascii_whitespace() && self.input[self.position] != b'/' && self.input[self.position] != b'<' && self.input[self.position] != b'>' {
|
|
self.position += 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Skip whitespace.
|
|
fn skip_whitespace(&mut self) {
|
|
while self.position < self.input.len() && self.input[self.position].is_ascii_whitespace() {
|
|
self.position += 1;
|
|
}
|
|
}
|
|
|
|
/// Expect an integer token.
|
|
fn expect_integer(&mut self) -> Result<i64, CodespaceError> {
|
|
match self.next_token() {
|
|
Some(Token::Integer(n)) => Ok(n),
|
|
Some(other) => Err(CodespaceError::UnexpectedToken(format!(
|
|
"expected integer, got {:?}",
|
|
other
|
|
))),
|
|
None => Err(CodespaceError::UnexpectedToken("expected integer".to_string())),
|
|
}
|
|
}
|
|
|
|
/// Expect a hex string token.
|
|
fn expect_hex_string(&mut self) -> Result<Vec<u8>, CodespaceError> {
|
|
match self.next_token() {
|
|
Some(Token::String(bytes)) => Ok(bytes),
|
|
Some(other) => Err(CodespaceError::UnexpectedToken(format!(
|
|
"expected hex string, got {:?}",
|
|
other
|
|
))),
|
|
None => Err(CodespaceError::UnexpectedToken("expected hex string".to_string())),
|
|
}
|
|
}
|
|
|
|
/// Expect a specific keyword.
|
|
fn expect_keyword(&mut self, expected: &[u8]) -> Result<(), CodespaceError> {
|
|
match self.next_token() {
|
|
Some(Token::Keyword(ref kw)) if kw == expected => Ok(()),
|
|
Some(_other) => Err(CodespaceError::UnexpectedToken(format!(
|
|
"expected keyword {}",
|
|
String::from_utf8_lossy(expected)
|
|
))),
|
|
None => Err(CodespaceError::UnexpectedToken(format!(
|
|
"expected keyword {}",
|
|
String::from_utf8_lossy(expected)
|
|
))),
|
|
}
|
|
}
|
|
|
|
/// Skip tokens until we find the expected keyword.
|
|
fn skip_to_keyword(&mut self, keyword: &[u8]) {
|
|
while let Some(token) = self.next_token() {
|
|
if let Token::Keyword(ref kw) = token {
|
|
if kw == keyword {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Emit an error as a diagnostic.
|
|
fn emit_error(&mut self, error: &CodespaceError) {
|
|
self.diagnostics.push(crate::diagnostics::Diagnostic::with_dynamic(
|
|
DiagCode::CmapInvalidCodespace,
|
|
self.position as u64,
|
|
error.to_string(),
|
|
));
|
|
}
|
|
}
|
|
|
|
/// Lexical unit handed from the codespace lexer to the block parser.
#[derive(Debug)]
enum Token {
    /// The input is exhausted.
    Eof,
    /// Decoded bytes of a hex string; the `<` `>` delimiters are not included.
    String(Vec<u8>),
    /// A decimal integer.
    Integer(i64),
    /// A bare word made of letters, such as `begincodespacerange`.
    Keyword(Vec<u8>),
    /// The `<<` dictionary opener.
    DictStart,
    /// The `>>` dictionary closer.
    DictEnd,
    /// A byte that does not start any token the lexer understands.
    Unexpected(u8),
}
|
|
|
|
/// Parse codespace ranges from raw CMap bytes.
|
|
///
|
|
/// This is a convenience function that creates a parser and returns
|
|
/// just the ranges, discarding diagnostics.
|
|
pub fn parse_codespace_ranges(input: &[u8]) -> CodespaceRanges {
|
|
let parser = CodespaceParser::new(input);
|
|
let (ranges, _diagnostics) = parser.parse();
|
|
ranges
|
|
}
|
|
|
|
/// Parse codespace ranges from raw CMap bytes with diagnostics.
|
|
///
|
|
/// Returns both the ranges and any diagnostics generated during parsing.
|
|
pub fn parse_codespace_ranges_with_diags(input: &[u8]) -> (CodespaceRanges, Vec<crate::diagnostics::Diagnostic>) {
|
|
let parser = CodespaceParser::new(input);
|
|
parser.parse()
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_single_range_1_byte() {
        let input = b"1 begincodespacerange\n<00> <7F>\nendcodespacerange";
        let (ranges, diags) = CodespaceParser::new(input).parse();

        assert_eq!(ranges.len(), 1);
        assert!(diags.is_empty());

        let range = &ranges.ranges[0];
        assert_eq!(range.width, 1);
        assert_eq!(range.lo_slice(), &[0x00]);
        assert_eq!(range.hi_slice(), &[0x7F]);
    }

    #[test]
    fn test_parse_two_ranges_mixed_width() {
        // Acceptance criterion: <00> <7F> <8000> <FFFF> in one block -> 2 ranges
        let input = b"2 begincodespacerange\n<00> <7F>\n<8000> <FFFF>\nendcodespacerange";
        let (ranges, diags) = CodespaceParser::new(input).parse();

        assert_eq!(ranges.len(), 2);
        assert!(diags.is_empty());

        // First range: 1-byte
        assert_eq!(ranges.ranges[0].width, 1);
        assert_eq!(ranges.ranges[0].lo_slice(), &[0x00]);
        assert_eq!(ranges.ranges[0].hi_slice(), &[0x7F]);

        // Second range: 2-byte
        assert_eq!(ranges.ranges[1].width, 2);
        assert_eq!(ranges.ranges[1].lo_slice(), &[0x80, 0x00]);
        assert_eq!(ranges.ranges[1].hi_slice(), &[0xFF, 0xFF]);
    }

    #[test]
    fn test_width_inference() {
        // Acceptance criterion: 2-char hex -> width=1; 4-char hex -> width=2
        let input = b"2 begincodespacerange\n<C0> <FF>\n<8140> <FEFE>\nendcodespacerange";
        let ranges = parse_codespace_ranges(input);

        assert_eq!(ranges.len(), 2);
        assert_eq!(ranges.ranges[0].width, 1);
        assert_eq!(ranges.ranges[1].width, 2);
    }

    #[test]
    fn test_case_insensitive_hex() {
        // Acceptance criterion: <C0> and <c0> equivalent
        let input = b"2 begincodespacerange\n<C0> <FF>\n<c0> <ff>\nendcodespacerange";
        let ranges = parse_codespace_ranges(input);

        assert_eq!(ranges.len(), 2);
        // Both ranges should parse identically
        assert_eq!(ranges.ranges[0].lo_slice(), ranges.ranges[1].lo_slice());
        assert_eq!(ranges.ranges[0].hi_slice(), ranges.ranges[1].hi_slice());
    }

    #[test]
    fn test_width_mismatch_emits_diagnostic() {
        // Acceptance criterion: mismatched lo/hi width -> diagnostic + skipped
        let input = b"1 begincodespacerange\n<00> <FFFF>\nendcodespacerange";
        let (ranges, diags) = CodespaceParser::new(input).parse();

        assert!(!diags.is_empty());
        assert!(diags.iter().any(|d| d.code == DiagCode::CmapInvalidCodespace));
        // The malformed range should be skipped
        assert_eq!(ranges.len(), 0);
    }

    #[test]
    fn test_empty_cmap() {
        // Acceptance criterion: empty CMap -> empty ranges
        let ranges = parse_codespace_ranges(b"");

        assert!(ranges.is_empty());
    }

    #[test]
    fn test_jis_lead_trail_pattern() {
        // JIS 2-byte pattern example
        let input = b"1 begincodespacerange\n<8140> <FEFE>\nendcodespacerange";
        let ranges = parse_codespace_ranges(input);

        assert_eq!(ranges.len(), 1);
        assert_eq!(ranges.ranges[0].width, 2);
        assert_eq!(ranges.ranges[0].lo_slice(), &[0x81, 0x40]);
        assert_eq!(ranges.ranges[0].hi_slice(), &[0xFE, 0xFE]);
    }

    #[test]
    fn test_codespace_range_contains() {
        let range = CodespaceRange::new([0x00, 0, 0, 0], [0x7F, 0, 0, 0], 1);

        // Valid bytes in range
        assert!(range.contains(&[0x00]));
        assert!(range.contains(&[0x40]));
        assert!(range.contains(&[0x7F]));

        // Outside range
        assert!(!range.contains(&[0x80]));
        assert!(!range.contains(&[0xFF]));

        // Wrong width
        assert!(!range.contains(&[]));
        assert!(!range.contains(&[0x00, 0x00]));
    }

    #[test]
    fn test_codespace_range_contains_2_byte() {
        let range = CodespaceRange::new([0x80, 0x00, 0, 0], [0xFF, 0xFF, 0, 0], 2);

        // Valid bytes in range
        assert!(range.contains(&[0x80, 0x00]));
        assert!(range.contains(&[0xA0, 0xA0]));
        assert!(range.contains(&[0xFF, 0xFF]));

        // Outside range
        assert!(!range.contains(&[0x00, 0x00]));
        assert!(!range.contains(&[0x7F, 0xFF]));

        // Wrong width
        assert!(!range.contains(&[0x80]));
        assert!(!range.contains(&[0x80, 0x00, 0x00]));
    }

    #[test]
    fn test_find_range() {
        let mut ranges = CodespaceRanges::new();
        ranges.push(CodespaceRange::new([0x00, 0, 0, 0], [0x7F, 0, 0, 0], 1));
        ranges.push(CodespaceRange::new([0x80, 0x00, 0, 0], [0xFF, 0xFF, 0, 0], 2));

        // 1-byte sequence
        assert_eq!(ranges.find_range(&[0x40]), Some(0));
        assert_eq!(ranges.find_range(&[0x80]), None);

        // 2-byte sequence
        assert_eq!(ranges.find_range(&[0x80, 0x00]), Some(1));
        assert_eq!(ranges.find_range(&[0x00, 0x00]), None);
    }

    #[test]
    fn test_invalid_hex_emits_diagnostic() {
        // Invalid hex characters in string
        let input = b"1 begincodespacerange\n<XG> <FF>\nendcodespacerange";
        let (ranges, diags) = CodespaceParser::new(input).parse();

        assert!(!diags.is_empty());
        assert!(diags.iter().any(|d| d.code == DiagCode::CmapInvalidCodespace));
        // <XG> decodes to zero bytes, so the pair cannot form a valid range.
        assert!(ranges.is_empty());
    }

    #[test]
    fn test_empty_hex_string() {
        // Empty hex string <> decodes to zero bytes; width 0 is invalid.
        let input = b"1 begincodespacerange\n<> <>\nendcodespacerange";
        let (ranges, diags) = parse_codespace_ranges_with_diags(input);

        assert!(ranges.is_empty());
        assert!(!diags.is_empty());
    }

    #[test]
    fn test_3_byte_range() {
        // 3-byte range (valid per spec)
        let input = b"1 begincodespacerange\n<800000> <FFFFFF>\nendcodespacerange";
        let ranges = parse_codespace_ranges(input);

        assert_eq!(ranges.len(), 1);
        assert_eq!(ranges.ranges[0].width, 3);
        assert_eq!(ranges.ranges[0].lo_slice(), &[0x80, 0x00, 0x00]);
        assert_eq!(ranges.ranges[0].hi_slice(), &[0xFF, 0xFF, 0xFF]);
    }

    #[test]
    fn test_4_byte_range() {
        // 4-byte range (max valid width)
        let input = b"1 begincodespacerange\n<80000000> <FFFFFFFF>\nendcodespacerange";
        let ranges = parse_codespace_ranges(input);

        assert_eq!(ranges.len(), 1);
        assert_eq!(ranges.ranges[0].width, 4);
        assert_eq!(ranges.ranges[0].lo_slice(), &[0x80, 0x00, 0x00, 0x00]);
        assert_eq!(ranges.ranges[0].hi_slice(), &[0xFF, 0xFF, 0xFF, 0xFF]);
    }

    #[test]
    fn test_comments_ignored() {
        // Comments should be ignored
        let input = b"% This is a comment\n1 begincodespacerange\n% Another comment\n<00> <7F>\nendcodespacerange";
        let ranges = parse_codespace_ranges(input);

        assert_eq!(ranges.len(), 1);
        assert_eq!(ranges.ranges[0].width, 1);
    }

    #[test]
    fn test_whitespace_variations() {
        // Tabs and newlines are interchangeable token separators.
        let input = b"1 begincodespacerange\t<00>\t<7F>\nendcodespacerange";
        let ranges = parse_codespace_ranges(input);

        assert_eq!(ranges.len(), 1);
    }

    #[test]
    fn test_recovery_after_invalid_range() {
        // First range is invalid, second is valid
        let input = b"2 begincodespacerange\n<00> <FFFF>\n<00> <7F>\nendcodespacerange";
        let (ranges, diags) = CodespaceParser::new(input).parse();

        // Should have diagnostic for first range
        assert!(!diags.is_empty());
        // Should skip first range but continue to parse second
        assert_eq!(ranges.len(), 1);
        assert_eq!(ranges.ranges[0].width, 1);
    }

    #[test]
    fn test_display() {
        let ranges = CodespaceRanges {
            ranges: smallvec::smallvec![
                CodespaceRange::new([0x00, 0, 0, 0], [0x7F, 0, 0, 0], 1),
                CodespaceRange::new([0x80, 0x00, 0, 0], [0xFF, 0xFF, 0, 0], 2),
            ],
        };

        let display = format!("{}", ranges);
        assert!(display.contains("CodespaceRanges"));
        assert!(display.contains("2 ranges"));
    }

    #[test]
    fn test_identity_h_cmap() {
        // Identity-H CMap has specific codespace ranges
        // Most commonly: <00> <FF> for 1-byte and <0100> <FFFF> for 2-byte
        let input = b"2 begincodespacerange\n<00> <FF>\n<0100> <FFFF>\nendcodespacerange";
        let ranges = parse_codespace_ranges(input);

        assert_eq!(ranges.len(), 2);

        // 1-byte range covers all single bytes
        assert_eq!(ranges.ranges[0].width, 1);
        assert!(ranges.ranges[0].contains(&[0x00]));
        assert!(ranges.ranges[0].contains(&[0xFF]));

        // 2-byte range covers 0x0100-0xFFFF
        assert_eq!(ranges.ranges[1].width, 2);
        assert!(ranges.ranges[1].contains(&[0x01, 0x00]));
        assert!(ranges.ranges[1].contains(&[0xFF, 0xFF]));
    }
}
|