pdftract/tests/proptest/cmap_parser.rs
jedarden 9aa26a449e docs(pdftract-49f8): establish Cargo.lock policy and documentation
This commit implements the Cargo.lock policy for reproducible builds
across all workspace members (pdftract-core, pdftract-cli, pdftract-py).

Changes:
- Add CONTRIBUTING.md with lockfile-update workflow documentation
- Add .renovaterc.json for weekly lockfile-only PRs (human-gated)
- Add crates/pdftract-core/README.md with rationale for checked-in lockfiles
- Add notes/pdftract-49f8.md with verification note

The Argo workflow updates (pdftract-ci.yaml) are committed separately
in the declarative-config repo.

Acceptance criteria:
- PASS: Cargo.lock tracked by git, not in .gitignore
- PASS: Argo workflow templates document --locked/--frozen requirements
- WARN: Enforcement to be completed when placeholder templates are implemented
- WARN: Binary reproducibility verification deferred to pdftract-build-binaries implementation

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-20 18:13:14 -04:00

286 lines
8.4 KiB
Rust

//! Property-based tests for the PDF CMap parser.
//!
//! These tests verify that CMap parsing foundations (name and string handling)
//! maintain their core invariants across all possible inputs, following INV-8
//! (no panic at public boundary).
//!
//! Note: Full CMap parser is not yet implemented. These tests focus on the
//! lexer's name and string handling which are foundational to CMap parsing.
use pdftract_core::parser::lexer::{Lexer, Token};
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: lexing arbitrary bytes never panics.
    ///
    /// CMap files are dense with name tokens (e.g., /CIDInit, /CMapName), so the
    /// lexer is fed fewer than 10,000 unconstrained bytes and drained until it
    /// yields `Token::Eof` or `None`. The test passes as long as nothing panics.
    #[test]
    fn prop_name_tokens_never_panic(
        bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..10_000)
    ) {
        let mut lexer = Lexer::new(&bytes);
        // Token contents are irrelevant here; only panic-freedom is checked.
        while let Some(token) = lexer.next_token() {
            if matches!(token, Token::Eof) {
                break;
            }
        }
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: hex string parsing never panics.
    ///
    /// CMap uses hex strings extensively for character mappings. The generated
    /// bytes are wrapped in `<` ... `>` so that every case starts with the
    /// hex-string opener instead of relying on random bytes to contain one.
    /// The payload is unconstrained, so invalid digits, embedded `>` bytes and
    /// (when the payload itself starts with `<`) a `<<` opener are all covered.
    #[test]
    fn prop_hex_string_never_panics(
        bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..10_000)
    ) {
        let mut input = Vec::with_capacity(bytes.len() + 2);
        input.push(b'<');
        input.extend_from_slice(&bytes);
        input.push(b'>');

        let mut lexer = Lexer::new(&input);
        loop {
            match lexer.next_token() {
                Some(Token::Eof) | None => break,
                Some(Token::HexString(_)) => {
                    // Hex string parsed successfully
                }
                Some(_) => {
                    // Any other token is acceptable; only panics fail the test
                }
            }
        }
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: literal string parsing never panics.
    ///
    /// CMap also uses literal strings for certain mappings. The generated bytes
    /// are wrapped in `(` ... `)` so that every case starts with the
    /// literal-string opener instead of relying on random bytes to contain one.
    /// The payload is unconstrained, so unbalanced parentheses, backslashes and
    /// early terminators are all covered.
    #[test]
    fn prop_literal_string_never_panics(
        bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..10_000)
    ) {
        let mut input = Vec::with_capacity(bytes.len() + 2);
        input.push(b'(');
        input.extend_from_slice(&bytes);
        input.push(b')');

        let mut lexer = Lexer::new(&input);
        loop {
            match lexer.next_token() {
                Some(Token::Eof) | None => break,
                Some(Token::String(_)) => {
                    // String parsed successfully
                }
                Some(_) => {
                    // Any other token is acceptable; only panics fail the test
                }
            }
        }
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: CMap-specific keywords don't cause panics.
    ///
    /// CMap files have specific keywords like /CMapType, /WMode, etc. Each case
    /// places one such keyword between a random prefix and a random suffix
    /// (each shorter than 100 bytes) and lexes the whole input.
    #[test]
    fn prop_cmap_keywords_no_panic(
        prefix in proptest::collection::vec(proptest::num::u8::ANY, 0..100),
        // Every arm is sliced to `&[u8]`: byte-string literals of different
        // lengths are different array types, and `prop_oneof!` requires all
        // arms to produce the same value type.
        keyword in proptest::prop_oneof![
            proptest::strategy::Just(&b"/CMapName"[..]),
            proptest::strategy::Just(&b"/CMapType"[..]),
            proptest::strategy::Just(&b"/WMode"[..]),
            proptest::strategy::Just(&b"/CIDInit"[..]),
            proptest::strategy::Just(&b"/CIDSystemInfo"[..]),
        ],
        suffix in proptest::collection::vec(proptest::num::u8::ANY, 0..100)
    ) {
        let mut input = prefix;
        input.extend_from_slice(keyword);
        input.extend_from_slice(&suffix);

        let mut lexer = Lexer::new(&input);
        // Drain the whole stream: with a non-empty prefix the first token
        // usually comes from the prefix, so a single `next_token()` call would
        // rarely reach the keyword at all.
        loop {
            match lexer.next_token() {
                Some(Token::Eof) | None => break,
                Some(_) => {}
            }
        }
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: mixed token types in CMap-like input don't panic.
    ///
    /// CMap files mix dictionaries, arrays, integers, and names. Each case is a
    /// space-separated sequence of fewer than 100 fragments, where a fragment is
    /// one of: `/` + lossy-UTF-8 text, `(` + lossy-UTF-8 text + `)`, a decimal
    /// `i32`, or one of the delimiters `<<`, `>>`, `[`, `]`.
    #[test]
    fn prop_mixed_cmap_tokens_no_panic(
        // Paths are fully qualified (including `Strategy::prop_map` in
        // function-call form) because this file does not import the proptest
        // prelude, so neither `Just` nor the `Strategy` trait is in scope.
        tokens in proptest::collection::vec(
            proptest::prop_oneof![
                proptest::strategy::Strategy::prop_map(
                    proptest::collection::vec(proptest::num::u8::ANY, 0..20),
                    |b: Vec<u8>| format!("/{}", String::from_utf8_lossy(&b)),
                ),
                proptest::strategy::Strategy::prop_map(
                    proptest::collection::vec(proptest::num::u8::ANY, 0..20),
                    |b: Vec<u8>| format!("({})", String::from_utf8_lossy(&b)),
                ),
                proptest::strategy::Strategy::prop_map(
                    proptest::num::i32::ANY,
                    |n: i32| n.to_string(),
                ),
                proptest::strategy::Just("<<".to_string()),
                proptest::strategy::Just(">>".to_string()),
                proptest::strategy::Just("[".to_string()),
                proptest::strategy::Just("]".to_string()),
            ],
            0..100
        )
    ) {
        // Every fragment is followed by a single space, including the last one.
        let mut input = String::new();
        for token in tokens {
            input.push_str(&token);
            input.push(' ');
        }

        let mut lexer = Lexer::new(input.as_bytes());
        loop {
            match lexer.next_token() {
                Some(Token::Eof) | None => break,
                Some(_) => {}
            }
        }
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: very long name tokens don't cause panics.
    ///
    /// CMap can have long registry names, but names are limited to 127 bytes.
    /// The input is a `/` followed by fewer than 500 arbitrary bytes, and only
    /// the first token is requested.
    #[test]
    fn prop_long_name_tokens_no_panic(
        name_bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..500)
    ) {
        let input: Vec<u8> = std::iter::once(b'/')
            .chain(name_bytes.iter().copied())
            .collect();
        let mut lexer = Lexer::new(&input);

        // Every outcome is acceptable: a name (possibly truncated to 127
        // bytes), some other token (diagnostic emitted), or `None` (EOF or
        // error). The only failure mode is a panic inside `next_token`.
        let _is_name = matches!(lexer.next_token(), Some(Token::Name(_)));
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: bracket nesting in arrays doesn't cause infinite loops.
    ///
    /// CMap uses arrays for code ranges; ensure we handle nesting correctly.
    /// The input is fewer than 100 unclosed `[` characters followed by the
    /// lossy-UTF-8 rendering of fewer than 50 arbitrary bytes. The test panics
    /// if the lexer produces more than 10,000 non-EOF tokens.
    #[test]
    fn prop_array_bracket_nesting_no_infinite_loop(
        open_brackets in 0usize..100,
        content in proptest::collection::vec(proptest::num::u8::ANY, 0..50)
    ) {
        // Upper bound on non-EOF tokens before the lexer is declared stuck.
        const MAX_TOKENS: usize = 10_000;

        let mut input = "[".repeat(open_brackets);
        input.push_str(&String::from_utf8_lossy(&content));

        let mut lexer = Lexer::new(input.as_bytes());
        let mut seen = 0usize;
        while let Some(token) = lexer.next_token() {
            if matches!(token, Token::Eof) {
                break;
            }
            seen += 1;
            if seen > MAX_TOKENS {
                panic!("Lexer appears to be in an infinite loop");
            }
        }
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: dictionary nesting in CMap doesn't cause panics.
    ///
    /// CMap has nested dictionaries for CIDSystemInfo, etc. The input is
    /// `depth` copies of `<< /A `, then `1`, then `depth` copies of ` >>`,
    /// i.e. a balanced chain of single-entry dictionaries.
    #[test]
    fn prop_dict_nesting_no_panic(
        depth in 0usize..50
    ) {
        let input = format!(
            "{}1{}",
            "<< /A ".repeat(depth),
            " >>".repeat(depth),
        );

        let mut lexer = Lexer::new(input.as_bytes());
        while let Some(token) = lexer.next_token() {
            if matches!(token, Token::Eof) {
                break;
            }
        }
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: special CMap characters in names are handled.
    ///
    /// CMap names can contain # escapes for special characters. The input is
    /// `/`, a random prefix, a run of `#` markers each followed by up to two
    /// arbitrary bytes (not necessarily valid hex digits), and a random suffix.
    /// Only the first token is requested.
    #[test]
    fn prop_name_hex_escapes_no_panic(
        prefix in proptest::collection::vec(proptest::num::u8::ANY, 0..20),
        hex_bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..100),
        suffix in proptest::collection::vec(proptest::num::u8::ANY, 0..20)
    ) {
        // One `#` per pair of payload bytes; a trailing odd byte gets its own `#`.
        let escapes = hex_bytes
            .chunks(2)
            .flat_map(|pair| std::iter::once(b'#').chain(pair.iter().copied()));

        let input: Vec<u8> = std::iter::once(b'/')
            .chain(prefix.iter().copied())
            .chain(escapes)
            .chain(suffix.iter().copied())
            .collect();

        let mut lexer = Lexer::new(&input);
        let _ = lexer.next_token();
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: take_diagnostics is idempotent for CMap-like inputs.
    ///
    /// After lexing fewer than 1,000 arbitrary bytes to the end, the first
    /// `take_diagnostics()` call may return anything; a second call made
    /// immediately afterwards must return an empty collection.
    #[test]
    fn prop_take_diagnostics_idempotent(
        bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..1000)
    ) {
        let mut lexer = Lexer::new(&bytes);
        // Stop on `Token::Eof` as well as `None`, matching the other tests in
        // this file. Looping on `is_some()` alone would never terminate if the
        // lexer keeps answering `Some(Token::Eof)` at end of input.
        loop {
            match lexer.next_token() {
                Some(Token::Eof) | None => break,
                Some(_) => {}
            }
        }

        let _first = lexer.take_diagnostics();
        let second = lexer.take_diagnostics();
        // Fully qualified: the proptest prelude is not imported in this file.
        proptest::prop_assert!(
            second.is_empty(),
            "Second take_diagnostics() should return empty, got {} diagnostics",
            second.len()
        );
    }
}