pdftract/tests/proptest/lexer.rs
jedarden 585d861efc test(pdftract-sy8x): implement lexer proptest harness and curated corpus
Add property-based testing infrastructure for the lexer module with 6+
property tests covering INV-8 (no panic), string/hex roundtrips, name
length bounds, and position monotonicity. Create 8 curated fixture files
with golden token outputs for critical edge cases including EC-01 empty
file test and whitespace-only inputs.

Changes:
- Add prop_string_roundtrip to tests/proptest/lexer.rs
- Create tests/lexer/fixtures/ with 8 fixtures + .tokens.txt golden files
- Add gen_lexer_golden.rs binary for regenerating golden outputs
- Fix missing ObjRef import in marked_content_operators.rs

Acceptance criteria:
- cargo test --features proptest -p pdftract-core: 105 lexer tests pass
- tests/lexer/fixtures/ contains 8 fixtures with .tokens.txt outputs
- EC-01 empty file test: 0-byte input -> Token::Eof, no panic
- Whitespace-only file test passes
- INV-8 verified by prop_lexer_never_panics

Closes: pdftract-sy8x
2026-05-24 02:36:37 -04:00

504 lines
16 KiB
Rust

//! Property-based tests for the PDF lexer.
//!
//! These tests verify that the lexer maintains its core invariants
//! across all possible inputs, following INV-8 (no panic at public boundary).
use pdftract_core::parser::lexer::{Lexer, Token};
#[cfg(feature = "proptest")]
use proptest::prelude::*;
/// Drains a fresh [`Lexer`] over `bytes` and returns everything it produced.
///
/// Tokens are collected in order until the lexer yields `Token::Eof` (kept as
/// the final element) or stops returning tokens altogether. The diagnostics
/// accumulated along the way are then taken from the lexer and returned next
/// to the tokens.
///
/// The property tests rely on this call returning normally for arbitrary
/// input (INV-8: no panic at the public boundary).
fn lex_all(bytes: &[u8]) -> (Vec<Token>, Vec<pdftract_core::parser::lexer::Diagnostic>) {
    let mut lexer = Lexer::new(bytes);
    let mut collected = Vec::new();
    while let Some(token) = lexer.next_token() {
        // Remember whether this was the terminator before moving the token.
        let reached_eof = matches!(token, Token::Eof);
        collected.push(token);
        if reached_eof {
            break;
        }
    }
    (collected, lexer.take_diagnostics())
}
/// Runs the lexer over `bytes` to completion and reports success.
///
/// The return value is always `true`; what matters is that the call returns
/// at all, because a panic inside the lexer would unwind through here and
/// violate INV-8 (no panic at the public boundary).
#[cfg(feature = "proptest")]
fn lexer_never_panics(bytes: &[u8]) -> bool {
    // Tokens and diagnostics are irrelevant to this check.
    drop(lex_all(bytes));
    true
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property (INV-8): lexing is total over its input domain.
    ///
    /// Up to 10 000 arbitrary bytes are pushed through the lexer. Any panic
    /// raised while doing so fails the test case; the produced tokens and
    /// diagnostics are not inspected.
    #[test]
    fn prop_never_panics_on_random_bytes(
        bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..10_000)
    ) {
        // The helper always returns `true`; reaching the assert is the proof.
        assert!(lexer_never_panics(&bytes));
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: the reported position never moves backwards.
    ///
    /// After every non-`Eof` token the lexer position is compared with the
    /// position seen after the previous token (initially the position of a
    /// freshly constructed lexer). Equal positions are accepted.
    #[test]
    fn prop_position_monotonically_increases(
        bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..1000)
    ) {
        let mut lexer = Lexer::new(&bytes);
        let mut previous = lexer.position();
        while let Some(token) = lexer.next_token() {
            if matches!(token, Token::Eof) {
                break;
            }
            let now = lexer.position();
            prop_assert!(
                previous <= now,
                "Position decreased from {} to {}", previous, now
            );
            previous = now;
        }
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: the reported position stays within the input.
    ///
    /// After every non-`Eof` token the lexer position must be less than or
    /// equal to the number of input bytes.
    #[test]
    fn prop_position_never_exceeds_input_length(
        bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..1000)
    ) {
        let limit = bytes.len() as u64;
        let mut lexer = Lexer::new(&bytes);
        loop {
            let token = lexer.next_token();
            if matches!(token, None | Some(Token::Eof)) {
                break;
            }
            let pos = lexer.position();
            prop_assert!(
                pos <= limit,
                "Position {} exceeds input length {}", pos, limit
            );
        }
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: `take_diagnostics()` hands diagnostics out exactly once.
    ///
    /// After the lexer has been drained, the first `take_diagnostics()` call
    /// may return anything; a second call with no lexing in between must
    /// return an empty collection.
    #[test]
    fn prop_take_diagnostics_is_idempotent(
        bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..1000)
    ) {
        let mut lexer = Lexer::new(&bytes);
        // Drain every token so all diagnostics have been recorded.
        loop {
            if lexer.next_token().is_none() {
                break;
            }
        }
        drop(lexer.take_diagnostics());
        let leftover = lexer.take_diagnostics();
        prop_assert!(
            leftover.is_empty(),
            "Second take_diagnostics() should return empty, got {} diagnostics",
            leftover.len()
        );
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: `peek_token()` is non-consuming with respect to position.
    ///
    /// The position reported before a single peek on a fresh lexer must equal
    /// the position reported after it, whatever the peek returned.
    #[test]
    fn prop_peek_token_does_not_advance_position(
        bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..1000)
    ) {
        let mut lexer = Lexer::new(&bytes);
        let start = lexer.position();
        // Result is irrelevant here (it may be `None` at end of input).
        let _ = lexer.peek_token();
        let end = lexer.position();
        prop_assert_eq!(start, end,
            "peek_token() should not advance position");
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: peeking is stable.
    ///
    /// Two back-to-back `peek_token()` calls on a fresh lexer, with no
    /// `next_token()` in between, must yield equal results.
    #[test]
    fn prop_consecutive_peeks_return_same_token(
        bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..1000)
    ) {
        let mut lexer = Lexer::new(&bytes);
        // Clone each result so the borrow of the lexer ends before the next peek.
        let first_look = lexer.peek_token().cloned();
        let second_look = lexer.peek_token().cloned();
        prop_assert_eq!(first_look, second_look,
            "Consecutive peeks should return the same token");
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: `next_token()` delivers what `peek_token()` announced.
    ///
    /// When the first peek on a fresh lexer yields a token other than `Eof`,
    /// the following `next_token()` must return that same token. Inputs whose
    /// first peek is `None` or `Eof` are not checked.
    #[test]
    fn prop_peek_then_next_consistent(
        bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..1000)
    ) {
        let mut lexer = Lexer::new(&bytes);
        match lexer.peek_token().cloned() {
            Some(expected) if expected != Token::Eof => {
                let consumed = lexer.next_token();
                prop_assert_eq!(consumed, Some(expected),
                    "peek_token() then next_token() should return the same token");
            }
            // Nothing to compare for an empty peek or an immediate Eof.
            _ => {}
        }
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: the token stream stays finished once it has finished.
    ///
    /// The lexer is advanced until it returns `Eof` (or `None`); the next ten
    /// `next_token()` calls must then all return `None`.
    #[test]
    fn prop_eof_returns_none_subsequently(
        bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..1000)
    ) {
        let mut lexer = Lexer::new(&bytes);
        // Skip ahead to the end of the token stream.
        loop {
            if matches!(lexer.next_token(), Some(Token::Eof) | None) {
                break;
            }
        }
        // Repeated polling must not resurrect the stream.
        for _attempt in 0..10 {
            prop_assert_eq!(lexer.next_token(), None,
                "next_token() after Eof should return None");
        }
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: every `Token::Integer` payload is an `i64`, and producing it
    /// never panics.
    ///
    /// An `i64` cannot fall outside `i64::MIN..=i64::MAX`, so a runtime
    /// comparison against those bounds can never fail and would only give a
    /// false sense of coverage. The payload type is pinned at compile time
    /// instead. The loop still drives the lexer across the whole input, so a
    /// panic anywhere in the integer path fails the test case.
    #[test]
    fn prop_integer_tokens_valid(
        bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..1000)
    ) {
        let mut lexer = Lexer::new(&bytes);
        while let Some(token) = lexer.next_token() {
            if let Token::Integer(value) = token {
                // Compile-time check: stops building if the payload type changes.
                let _: i64 = value;
            }
        }
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: no `Token::Name` payload is longer than 127 bytes.
    ///
    /// The check is made on the payload carried by the token, for every name
    /// the lexer emits from arbitrary input.
    #[test]
    fn prop_name_tokens_within_length_limit(
        bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..1000)
    ) {
        let mut lexer = Lexer::new(&bytes);
        while let Some(token) = lexer.next_token() {
            // Only names are of interest; skip every other token kind.
            let name_len = match token {
                Token::Name(name) => name.len(),
                _ => continue,
            };
            prop_assert!(
                name_len <= 127,
                "Name length {} exceeds 127-byte limit", name_len
            );
        }
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: `Token::Name` payloads never contain a NUL (0x00) byte.
    ///
    /// Despite the test's name, only *name* tokens are inspected. String
    /// tokens are deliberately left out: a PDF string may carry any byte
    /// value, and the hex-string roundtrip property in this file expects
    /// 0x00 to survive inside a `Token::String`.
    #[test]
    fn prop_string_tokens_no_nul_bytes(
        bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..1000)
    ) {
        let mut lexer = Lexer::new(&bytes);
        while let Some(token) = lexer.next_token() {
            if let Token::Name(payload) = token {
                let has_nul = payload.contains(&0x00);
                prop_assert!(
                    !has_nul,
                    "Name token contains NUL byte (should be rejected)"
                );
            }
        }
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: hex strings decode back to the bytes they were built from.
    ///
    /// Arbitrary bytes are written as `<hh…>` using two lowercase hex digits
    /// per byte. The first token lexed from that text must be a
    /// `Token::String` whose payload equals the original bytes.
    #[test]
    fn prop_hex_string_roundtrip(
        input_bytes in proptest::collection::vec(proptest::num::u8::ANY, 0..100)
    ) {
        // Encode: '<' + two hex digits per byte + '>'.
        let mut encoded = Vec::with_capacity(2 * input_bytes.len() + 2);
        encoded.push(b'<');
        for &byte in &input_bytes {
            encoded.extend_from_slice(&[
                hex_nibble_to_char(byte >> 4),
                hex_nibble_to_char(byte & 0x0F),
            ]);
        }
        encoded.push(b'>');

        // Decode through the lexer.
        let mut lexer = Lexer::new(&encoded);
        let decoded = match lexer.next_token() {
            Some(Token::String(payload)) => payload,
            other => {
                // The test body runs inside a closure returning
                // `Result<(), TestCaseError>`, so a failure has to be
                // reported as an `Err`; a bare `return;` does not type-check.
                return Err(proptest::test_runner::TestCaseError::fail(format!(
                    "Expected String token, got {:?}", other
                )));
            }
        };

        // Compare by reference so both values stay usable in the message.
        prop_assert_eq!(&decoded, &input_bytes,
            "Hex string roundtrip failed: expected {:?}, got {:?}",
            input_bytes, decoded);
    }
}
#[cfg(feature = "proptest")]
/// Maps a 4-bit value to its lowercase ASCII hex digit (`0`-`9`, `a`-`f`).
///
/// Inputs above 15 are not valid nibbles; they fall back to `b'0'`.
fn hex_nibble_to_char(nibble: u8) -> u8 {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    DIGITS.get(usize::from(nibble)).copied().unwrap_or(b'0')
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: whitespace-only input lexes to a lone `Eof`.
    ///
    /// The input is built exclusively from the six bytes space, tab, line
    /// feed, carriage return, form feed and NUL (comments are not
    /// generated). The first `next_token()` must return `Eof` and the second
    /// must return `None`.
    #[test]
    fn prop_whitespace_only_returns_eof(
        whitespace in proptest::collection::vec(
            proptest::prop_oneof![
                Just(b' '), Just(b'\t'), Just(b'\n'),
                Just(b'\r'), Just(b'\x0c'), Just(0x00u8)
            ],
            0..1000
        )
    ) {
        let mut lexer = Lexer::new(&whitespace);

        // Operands are compared by reference so the observed value is still
        // available for the failure message afterwards.
        let eof = Some(Token::Eof);
        let first = lexer.next_token();
        prop_assert_eq!(&first, &eof,
            "Whitespace-only input should return Eof, got {:?}", first);

        let nothing: Option<Token> = None;
        let second = lexer.next_token();
        prop_assert_eq!(&second, &nothing,
            "After Eof, should return None, got {:?}", second);
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: arbitrary bytes around the `stream` keyword never cause a panic.
    ///
    /// Per PDF spec 7.3.8.1 the keyword must be followed by `\n` or `\r\n`.
    /// The generated suffix is arbitrary, so most inputs are malformed
    /// stream headers. Only the absence of a panic is checked here; the
    /// emitted tokens and diagnostics are not inspected.
    #[test]
    fn prop_stream_keyword_never_panics(
        prefix in proptest::collection::vec(proptest::num::u8::ANY, 0..100),
        suffix in proptest::collection::vec(proptest::num::u8::ANY, 0..10)
    ) {
        // prefix + "stream" + suffix
        let input = [&prefix[..], &b"stream"[..], &suffix[..]].concat();
        // `lex_all` builds its own lexer; no separate instance is needed.
        let _ = lex_all(&input);
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: a PDF delimiter embedded in arbitrary bytes never causes a panic.
    ///
    /// One of `( ) < > [ ] { } / %` is placed between two arbitrary byte runs
    /// and the result is lexed to completion. Only the absence of a panic is
    /// checked; the test does not assert which tokens the delimiter produces.
    #[test]
    fn prop_delimiters_recognized(
        before in proptest::collection::vec(proptest::num::u8::ANY, 0..50),
        after in proptest::collection::vec(proptest::num::u8::ANY, 0..50),
        delimiter in prop_oneof![
            Just(b'('), Just(b')'), Just(b'<'), Just(b'>'),
            Just(b'['), Just(b']'), Just(b'{'), Just(b'}'),
            Just(b'/'), Just(b'%')
        ]
    ) {
        let mut input = before;
        input.push(delimiter);
        input.extend_from_slice(&after);
        // `lex_all` builds its own lexer; no separate instance is needed.
        let _ = lex_all(&input);
    }
}
#[cfg(feature = "proptest")]
proptest::proptest! {
    /// Property: literal strings decode back to their content.
    ///
    /// Printable ASCII plus tab, LF and CR is wrapped in `(...)`, with `\`,
    /// `(` and `)` backslash-escaped, and the first token lexed from that
    /// text must be a `Token::String` carrying the original bytes after
    /// end-of-line normalisation.
    ///
    /// Normalisation follows PDF spec section 7.3.4.2: an unescaped
    /// end-of-line marker inside a literal string reads as a single LF,
    /// whether it was written as CR, LF, or CR LF. A CR LF pair therefore
    /// contributes one `\n` to the expected value, not two.
    #[test]
    fn prop_string_roundtrip(
        original in proptest::collection::vec(
            prop_oneof![
                // Printable ASCII range (space through tilde)
                0x20u8..=0x7E,
                // Tab and both end-of-line bytes (valid inside strings)
                Just(b'\t'),
                Just(b'\n'),
                Just(b'\r'),
            ],
            0..500
        )
    ) {
        // Build the literal, escaping the three bytes with special meaning.
        let mut wrapped = Vec::with_capacity(original.len() * 2 + 2);
        wrapped.push(b'(');
        for &byte in &original {
            if matches!(byte, b'\\' | b'(' | b')') {
                wrapped.push(b'\\');
            }
            wrapped.push(byte);
        }
        wrapped.push(b')');

        // Expected payload: each end-of-line marker collapses to one LF.
        let mut expected = Vec::with_capacity(original.len());
        let mut rest = original.iter().copied().peekable();
        while let Some(byte) = rest.next() {
            if byte == b'\r' {
                // CR LF is a single marker: swallow the LF that follows.
                if rest.peek() == Some(&b'\n') {
                    rest.next();
                }
                expected.push(b'\n');
            } else {
                expected.push(byte);
            }
        }

        let mut lexer = Lexer::new(&wrapped);
        match lexer.next_token() {
            Some(Token::String(decoded)) => {
                // Compare by reference so both values stay usable in the message.
                prop_assert_eq!(&decoded, &expected,
                    "String roundtrip failed: expected {:?}, got {:?}",
                    expected, decoded);
            }
            other => {
                prop_assert!(false, "Expected String token, got {:?}", other);
            }
        }
    }
}
// No re-export of `lexer_never_panics` here: the function is defined in this
// very module, so a `pub use` of it would clash with that definition, cannot
// widen a private item, and would not resolve at all when the `proptest`
// feature is disabled. If another module needs the helper, raise the
// visibility on the function definition itself.
// Placeholder compiled only when the `proptest` feature is disabled; it
// documents how to confirm that the property suite really catches panics.
#[cfg(not(feature = "proptest"))]
#[test]
fn test_panic_injection_for_prop_test_verification() {
    // Intentionally empty.
    //
    // Manual verification procedure:
    // 1. Temporarily add a `panic!` inside the lexer itself (for example in
    //    the code path behind `Lexer::next_token`). It must live in the
    //    lexer, not in this function: this function is compiled out as soon
    //    as the `proptest` feature is enabled.
    // 2. Run: PROPTEST_CASES=100 cargo test --features proptest -- proptest
    // 3. Verify the property tests fail with that panic.
    // 4. Remove the panic again.
}