This commit implements the Cargo.lock policy for reproducible builds across all workspace members (pdftract-core, pdftract-cli, pdftract-py). Changes: - Add CONTRIBUTING.md with lockfile-update workflow documentation - Add .renovaterc.json for weekly lockfile-only PRs (human-gated) - Add crates/pdftract-core/README.md with rationale for checked-in lockfiles - Add notes/pdftract-49f8.md with verification note The Argo workflow updates (pdftract-ci.yaml) are committed separately in the declarative-config repo. Acceptance criteria: - PASS: Cargo.lock tracked by git, not in .gitignore - PASS: Argo workflow templates document --locked/--frozen requirements - WARN: Enforcement to be completed when placeholder templates are implemented - WARN: Binary reproducibility verification deferred to pdftract-build-binaries implementation Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
286 lines
8.4 KiB
Rust
286 lines
8.4 KiB
Rust
//! Property-based tests for the PDF CMap parser.
//!
//! These tests verify that the foundations of CMap parsing (name and string
//! handling) maintain their core invariants on arbitrary generated inputs,
//! following INV-8 (no panic at public boundary).
//!
//! Note: The full CMap parser is not yet implemented. These tests focus on the
//! lexer's name and string handling, which are foundational to CMap parsing.
|
|
|
|
use pdftract_core::parser::lexer::{Lexer, Token};
|
|
|
|
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: lexing arbitrary bytes never panics.
    ///
    /// CMap files contain many name tokens (e.g., /CIDInit, /CMapName), so the
    /// lexer must survive whatever bytes end up where a name is expected.
    /// The input here is fully random (0 to 9,999 bytes); the test only drains
    /// the token stream and makes no assertion about which tokens come out.
    #[test]
    fn prop_name_tokens_never_panic(
        bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..10_000)
    ) {
        let mut lexer = Lexer::new(&bytes);

        // Pull tokens until the lexer reports end of input, either as an
        // explicit `Token::Eof` or as `None`. Every other token is ignored:
        // reaching the end without a panic is the whole property.
        while let Some(token) = lexer.next_token() {
            if matches!(token, Token::Eof) {
                break;
            }
        }
    }
}
|
|
|
|
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: hex string parsing never panics.
    ///
    /// CMap uses hex strings extensively for character mappings. The input is
    /// 0 to 9,999 random bytes; any `Token::HexString` that happens to be
    /// produced is accepted like every other token, since only the absence of
    /// a panic is being checked.
    #[test]
    fn prop_hex_string_never_panics(
        bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..10_000)
    ) {
        let mut lexer = Lexer::new(&bytes);

        // Keep lexing until end of input is signalled (`Token::Eof` or `None`).
        // Hex strings and all other token kinds are deliberately discarded.
        while !matches!(lexer.next_token(), Some(Token::Eof) | None) {}
    }
}
|
|
|
|
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: literal string parsing never panics.
    ///
    /// CMap also uses literal strings for certain mappings. The input is 0 to
    /// 9,999 random bytes; a `Token::String` is treated the same as any other
    /// token because the test asserts nothing beyond "no panic".
    #[test]
    fn prop_literal_string_never_panics(
        bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..10_000)
    ) {
        let mut lexer = Lexer::new(&bytes);

        loop {
            // End of input is reported either as `Token::Eof` or as `None`.
            let finished = matches!(lexer.next_token(), Some(Token::Eof) | None);
            if finished {
                break;
            }
            // Anything else (literal strings included) is fine; keep going.
        }
    }
}
|
|
|
|
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: CMap-specific keywords don't cause panics.
    ///
    /// CMap files have specific keywords like /CMapType, /WMode, etc. Each case
    /// embeds one such keyword between a random prefix and a random suffix
    /// (0 to 99 bytes each) and lexes the whole buffer.
    #[test]
    fn prop_cmap_keywords_no_panic(
        prefix in proptest::collection::vec(proptest::num::u8::ANY, 0..100),
        // Every arm is coerced to `&'static [u8]`: the byte-string literals
        // have different array lengths, and `prop_oneof!` requires all arms to
        // produce the same value type. Paths are fully qualified because this
        // file does not import the proptest prelude.
        keyword in proptest::prop_oneof![
            proptest::strategy::Just(&b"/CMapName"[..]),
            proptest::strategy::Just(&b"/CMapType"[..]),
            proptest::strategy::Just(&b"/WMode"[..]),
            proptest::strategy::Just(&b"/CIDInit"[..]),
            proptest::strategy::Just(&b"/CIDSystemInfo"[..]),
        ],
        suffix in proptest::collection::vec(proptest::num::u8::ANY, 0..100)
    ) {
        let mut input = prefix;
        input.extend_from_slice(keyword);
        input.extend_from_slice(&suffix);

        let mut lexer = Lexer::new(&input);

        // Drain the whole stream rather than a single token: with a random
        // prefix the first token usually ends before the keyword, so stopping
        // there would leave the keyword untested. Termination matches the
        // other tests in this file (`Token::Eof` or `None`).
        loop {
            match lexer.next_token() {
                Some(Token::Eof) | None => break,
                Some(_) => {}
            }
        }
    }
}
|
|
|
|
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: mixed token types in CMap-like input don't panic.
    ///
    /// CMap files mix dictionaries, arrays, integers, and names. Each case is
    /// a sequence of 0 to 99 fragments, space-separated, drawn from:
    /// a `/`-prefixed name, a parenthesised literal string, a decimal `i32`,
    /// and the delimiters `<<`, `>>`, `[`, `]`. Name and string payloads are
    /// random bytes passed through `String::from_utf8_lossy`, so the lexer
    /// always receives valid UTF-8 here.
    #[test]
    fn prop_mixed_cmap_tokens_no_panic(
        // `Strategy::prop_map` is called in fully-qualified form and `Just` by
        // full path because this file does not import the proptest prelude, so
        // neither the trait method nor the bare name would resolve.
        tokens in proptest::collection::vec(
            proptest::prop_oneof![
                proptest::strategy::Strategy::prop_map(
                    proptest::collection::vec(proptest::num::u8::ANY, 0..20),
                    |b: Vec<u8>| format!("/{}", String::from_utf8_lossy(&b)),
                ),
                proptest::strategy::Strategy::prop_map(
                    proptest::collection::vec(proptest::num::u8::ANY, 0..20),
                    |b: Vec<u8>| format!("({})", String::from_utf8_lossy(&b)),
                ),
                proptest::strategy::Strategy::prop_map(
                    proptest::num::i32::ANY,
                    |n: i32| n.to_string(),
                ),
                proptest::strategy::Just("<<".to_string()),
                proptest::strategy::Just(">>".to_string()),
                proptest::strategy::Just("[".to_string()),
                proptest::strategy::Just("]".to_string()),
            ],
            0..100
        )
    ) {
        // Join the fragments with a trailing space after each one.
        let mut input = String::new();
        for token in tokens {
            input.push_str(&token);
            input.push(' ');
        }

        let mut lexer = Lexer::new(input.as_bytes());

        // Drain until end of input (`Token::Eof` or `None`); tokens are ignored.
        loop {
            match lexer.next_token() {
                Some(Token::Eof) | None => break,
                Some(_) => {}
            }
        }
    }
}
|
|
|
|
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: very long name tokens don't cause panics.
    ///
    /// The input is a `/` followed by 0 to 499 random bytes, and only the first
    /// token is requested. The original note says CMap registry names can be
    /// long while names are limited to 127 bytes; that limit is not visible in
    /// this file, so confirm it against the lexer before relying on it.
    #[test]
    fn prop_long_name_tokens_no_panic(
        name_bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..500)
    ) {
        // Leading solidus, then the generated payload.
        let input: Vec<u8> = std::iter::once(b'/')
            .chain(name_bytes.iter().copied())
            .collect();

        let mut lexer = Lexer::new(&input);

        // Every outcome is acceptable: a `Token::Name` (possibly truncated),
        // some other token, or `None`. The result is bound only so that the
        // call is made; not panicking is the property.
        let _first_token = lexer.next_token();
    }
}
|
|
|
|
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: bracket nesting in arrays doesn't cause infinite loops.
    ///
    /// CMap uses arrays for code ranges. The input is 0 to 99 unmatched `[`
    /// characters followed by up to 49 random bytes (lossily converted to
    /// UTF-8). The test fails if the lexer yields more than 10,000 tokens
    /// before signalling end of input.
    #[test]
    fn prop_array_bracket_nesting_no_infinite_loop(
        open_brackets in 0usize..100,
        content in proptest::collection::vec(proptest::num::u8::ANY, 0..50)
    ) {
        let mut input = "[".repeat(open_brackets);
        input.push_str(&String::from_utf8_lossy(&content));

        // Far more tokens than the short input could legitimately produce;
        // exceeding this is taken as evidence the lexer is not advancing.
        let token_budget = 10000;
        let mut produced = 0;

        let mut lexer = Lexer::new(input.as_bytes());
        while let Some(token) = lexer.next_token() {
            if matches!(token, Token::Eof) {
                break;
            }
            produced += 1;
            if produced > token_budget {
                panic!("Lexer appears to be in an infinite loop");
            }
        }
    }
}
|
|
|
|
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: dictionary nesting in CMap doesn't cause panics.
    ///
    /// CMap has nested dictionaries for CIDSystemInfo, etc. For a depth of
    /// 0 to 49 the input is `depth` copies of `<< /A `, the integer `1`, and
    /// then `depth` copies of ` >>`, i.e. a balanced chain of nested dicts.
    #[test]
    fn prop_dict_nesting_no_panic(
        depth in 0usize..50
    ) {
        let input = format!(
            "{}1{}",
            "<< /A ".repeat(depth),
            " >>".repeat(depth),
        );

        let mut lexer = Lexer::new(input.as_bytes());

        // Consume tokens until end of input (`Token::Eof` or `None`).
        while let Some(token) = lexer.next_token() {
            if matches!(token, Token::Eof) {
                break;
            }
        }
    }
}
|
|
|
|
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: special CMap characters in names are handled.
    ///
    /// CMap names can contain `#` escapes for special characters. The input is
    /// `/`, a random prefix (0 to 19 bytes), then a run of `#` markers each
    /// followed by up to two random bytes, then a random suffix (0 to 19
    /// bytes). The bytes after `#` are arbitrary, not necessarily hex digits,
    /// so malformed escapes are exercised too. Only the first token is lexed.
    #[test]
    fn prop_name_hex_escapes_no_panic(
        prefix in proptest::collection::vec(proptest::num::u8::ANY, 0..20),
        hex_bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..100),
        suffix in proptest::collection::vec(proptest::num::u8::ANY, 0..20)
    ) {
        let mut input = vec![b'/'];
        input.extend(prefix.iter().copied());

        // One `#` per pair of generated bytes; the last pair may hold a single
        // byte when `hex_bytes` has odd length.
        for pair in hex_bytes.chunks(2) {
            input.push(b'#');
            input.extend_from_slice(pair);
        }

        input.extend(suffix.iter().copied());

        let mut lexer = Lexer::new(&input);
        // The token itself is irrelevant; the call must simply not panic.
        let _ = lexer.next_token();
    }
}
|
|
|
|
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: `take_diagnostics` is idempotent for CMap-like inputs.
    ///
    /// After lexing 0 to 999 random bytes to end of input, the first call to
    /// `take_diagnostics()` may return anything, but a second call made
    /// immediately afterwards must return an empty collection.
    #[test]
    fn prop_take_diagnostics_idempotent(
        bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..1000)
    ) {
        let mut lexer = Lexer::new(&bytes);

        // Stop on either end-of-input signal, exactly like the other tests in
        // this file. Looping on `next_token().is_some()` alone would never
        // finish if the lexer keeps answering `Some(Token::Eof)` at the end.
        loop {
            match lexer.next_token() {
                Some(Token::Eof) | None => break,
                Some(_) => {}
            }
        }

        // The first take drains whatever was recorded; its contents are not
        // inspected.
        let _first = lexer.take_diagnostics();
        let second = lexer.take_diagnostics();

        // Fully qualified: the file does not import the proptest prelude, so
        // a bare `prop_assert!` would not resolve.
        proptest::prop_assert!(
            second.is_empty(),
            "Second take_diagnostics() should return empty, got {} diagnostics",
            second.len()
        );
    }
}
|