test(pdftract-sy8x): implement lexer proptest harness and curated corpus

Add property-based testing infrastructure for the lexer module with 6+
property tests covering INV-8 (no panic), string/hex roundtrips, name
length bounds, and position monotonicity. Create 8 curated fixture files
with golden token outputs for critical edge cases including EC-01 empty
file test and whitespace-only inputs.

Changes:
- Add prop_string_roundtrip to tests/proptest/lexer.rs
- Create tests/lexer/fixtures/ with 8 fixtures + .tokens.txt golden files
- Add gen_lexer_golden.rs binary for regenerating golden outputs
- Fix missing ObjRef import in marked_content_operators.rs

Acceptance criteria:
- cargo test --features proptest -p pdftract-core: 105 lexer tests pass
- tests/lexer/fixtures/ contains 8 fixtures with .tokens.txt outputs
- EC-01 empty file test: 0-byte input -> Token::Eof, no panic
- Whitespace-only file test passes
- INV-8 verified by prop_lexer_never_panics

Closes: pdftract-sy8x
This commit is contained in:
jedarden 2026-05-24 02:36:37 -04:00
parent ee30a7033e
commit 585d861efc
20 changed files with 261 additions and 1 deletions

View file

@ -20,6 +20,10 @@ path = "../../tests/fixtures/generate_lzw_fixtures_main.rs"
name = "generate_preprocess_fixtures"
path = "../../tests/fixtures/preprocess/generate_fixtures_main.rs"
[[bin]]
name = "gen_lexer_golden"
path = "../../tests/gen_lexer_golden.rs"
[lib]
name = "pdftract_cli"
path = "src/lib.rs"

View file

@ -8,7 +8,7 @@
//! - BDC /Tag <<props>> or BDC /Tag /PropName: begin marked content with properties
//! - EMC: end marked content (pop top frame)
use crate::parser::object::PdfObject;
use crate::parser::object::{PdfObject, ObjRef};
use crate::parser::resources::ResourceDict;
use crate::parser::marked_content_stack::{MarkedContentStack, MarkedContentFrame};
use crate::diagnostics::{Diagnostic, DiagCode};

47
tests/gen_lexer_golden.rs Normal file
View file

@ -0,0 +1,47 @@
//! Generate golden token files for lexer fixtures.
//!
//! Run with: cargo run --bin gen_lexer_golden
use pdftract_core::parser::lexer::Lexer;
use std::fs;
use std::path::Path;
/// Regenerates the golden token listing for every curated lexer fixture.
///
/// Each fixture is read as raw bytes and lexed until `next_token` yields
/// `None`. The `Debug` rendering of every token is written one per line
/// (with a trailing newline) to a sibling file whose final extension is
/// replaced by `tokens.txt`, e.g. `empty.bin` -> `empty.tokens.txt`.
///
/// Fixture paths are relative, so they resolve against the current working
/// directory of the process.
///
/// # Panics
///
/// Panics if a fixture cannot be read or a golden file cannot be written.
fn main() {
    const FIXTURES: [&str; 8] = [
        "tests/lexer/fixtures/empty.bin",
        "tests/lexer/fixtures/whitespace_only.bin",
        "tests/lexer/fixtures/every_token.pdf.in",
        "tests/lexer/fixtures/string_escapes.pdf.in",
        "tests/lexer/fixtures/name_edge_cases.pdf.in",
        "tests/lexer/fixtures/hex_string_edge_cases.pdf.in",
        "tests/lexer/fixtures/numeric_edge_cases.pdf.in",
        "tests/lexer/fixtures/bom_utf16_string.pdf.in",
    ];

    for fixture in FIXTURES {
        println!("Processing {}...", fixture);

        let bytes = match fs::read(fixture) {
            Ok(bytes) => bytes,
            Err(e) => panic!("Failed to read fixture {}: {}", fixture, e),
        };

        // Drain the lexer: pull tokens until it reports `None`, rendering
        // each one with its `Debug` representation.
        let mut lexer = Lexer::new(&bytes);
        let lines: Vec<String> = std::iter::from_fn(|| lexer.next_token())
            .map(|token| format!("{:?}", token))
            .collect();

        // One token per line, always terminated by a final newline.
        let mut contents = lines.join("\n");
        contents.push('\n');

        let golden_path = Path::new(fixture).with_extension("tokens.txt");
        if let Err(e) = fs::write(&golden_path, contents) {
            panic!("Failed to write golden file {:?}: {}", golden_path, e);
        }
        println!(" -> {}", golden_path.display());
    }
}

View file

@ -0,0 +1 @@
(text with ţ˙ UTF-16 BOM prefix)

View file

@ -0,0 +1,2 @@
String([116, 101, 120, 116, 32, 119, 105, 116, 104, 32, 254, 255, 32, 85, 84, 70, 45, 49, 54, 32, 66, 79, 77, 32, 112, 114, 101, 102, 105, 120])
Eof

View file

View file

@ -0,0 +1 @@
Eof

View file

@ -0,0 +1,14 @@
true false null
123 -42 3.14 -.5
(Hello World) (nested (parens))
<48656C6C6F>
/Type /Font#20File /#20space
[ ]
<< >>
stream
endstream
obj endobj R
xref trailer startxref
%%EOF
% comment

View file

@ -0,0 +1,26 @@
Bool(true)
Bool(false)
Null
Integer(123)
Integer(-42)
Real(3.14)
Real(-0.5)
String([72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100])
String([110, 101, 115, 116, 101, 100, 32, 40, 112, 97, 114, 101, 110, 115, 41])
String([72, 101, 108, 108, 111])
Name([84, 121, 112, 101])
Name([70, 111, 110, 116, 32, 70, 105, 108, 101])
Name([32, 115, 112, 97, 99, 101])
ArrayStart
ArrayEnd
DictStart
DictEnd
Stream
EndStream
Obj
Keyword([101, 110, 100, 111, 98, 106])
IndirectRef
Keyword([120, 114, 101, 102])
Keyword([116, 114, 97, 105, 108, 101, 114])
Keyword([115, 116, 97, 114, 116, 120, 114, 101, 102])
Eof

View file

@ -0,0 +1,12 @@
<4>
<41>
<48 65 6C 6C 6F>
<48656C6C6F>
<oddlength>
<with whitespace>
<mixedCase>
<MixedCase>
<
> (unterminated)
<4 5 6>
<4A6F6B65>

View file

@ -0,0 +1,13 @@
String([64])
String([65])
String([72, 101, 108, 108, 111])
String([72, 101, 108, 108, 111])
String([221, 224])
String([224, 172, 224])
String([237, 202, 224])
String([237, 202, 224])
String([])
String([117, 110, 116, 101, 114, 109, 105, 110, 97, 116, 101, 100])
String([69, 96])
String([74, 111, 107, 101])
Eof

View file

@ -0,0 +1,7 @@
/name#20with#20space
/name#00
/12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
/123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
/Mixed#20Case#20And#20lower
/#empty
/a#3fb

View file

@ -0,0 +1,8 @@
Name([110, 97, 109, 101, 32, 119, 105, 116, 104, 32, 115, 112, 97, 99, 101])
Name([110, 97, 109, 101])
Name([49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57])
Name([49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48])
Name([77, 105, 120, 101, 100, 32, 67, 97, 115, 101, 32, 65, 110, 100, 32, 108, 111, 119, 101, 114])
Name([35, 101, 109, 112, 116, 121])
Name([97, 63, 98])
Eof

View file

@ -0,0 +1,12 @@
-.5
42.
123
-456
+789
0
0.0
1e5 (not scientific per PDF spec)
1.5e2 (not scientific per PDF spec)
999999999999999999999
.
-. (no digits)

View file

@ -0,0 +1,18 @@
Real(-0.5)
Real(42.0)
Integer(123)
Integer(-456)
Integer(789)
Integer(0)
Real(0.0)
Integer(1)
Keyword([101, 53])
String([110, 111, 116, 32, 115, 99, 105, 101, 110, 116, 105, 102, 105, 99, 32, 112, 101, 114, 32, 80, 68, 70, 32, 115, 112, 101, 99])
Real(1.5)
Keyword([101, 50])
String([110, 111, 116, 32, 115, 99, 105, 101, 110, 116, 105, 102, 105, 99, 32, 112, 101, 114, 32, 80, 68, 70, 32, 115, 112, 101, 99])
Integer(9223372036854775807)
Integer(0)
Integer(0)
String([110, 111, 32, 100, 105, 103, 105, 116, 115])
Eof

View file

@ -0,0 +1,11 @@
(n\\n) (newline escape)
(n\\r) (carriage return escape)
(n\\t) (tab escape)
(n\\\\) (backslash escape)
(n\\() (open paren escape)
(n\\)) (close paren escape)
(n\\101) (octal A)
(n\\101\\102\\103) (multiple octal)
(line1\
line2) (line continuation)
(bare\\rcarriage) (line ending normalization test)

View file

@ -0,0 +1,19 @@
String([110, 92, 110])
String([110, 101, 119, 108, 105, 110, 101, 32, 101, 115, 99, 97, 112, 101])
String([110, 92, 114])
String([99, 97, 114, 114, 105, 97, 103, 101, 32, 114, 101, 116, 117, 114, 110, 32, 101, 115, 99, 97, 112, 101])
String([110, 92, 116])
String([116, 97, 98, 32, 101, 115, 99, 97, 112, 101])
String([110, 92, 92])
String([98, 97, 99, 107, 115, 108, 97, 115, 104, 32, 101, 115, 99, 97, 112, 101])
String([110, 92, 40, 41, 32, 40, 111, 112, 101, 110, 32, 112, 97, 114, 101, 110, 32, 101, 115, 99, 97, 112, 101, 41, 10, 40, 110, 92, 41])
String([99, 108, 111, 115, 101, 32, 112, 97, 114, 101, 110, 32, 101, 115, 99, 97, 112, 101])
String([110, 92, 49, 48, 49])
String([111, 99, 116, 97, 108, 32, 65])
String([110, 92, 49, 48, 49, 92, 49, 48, 50, 92, 49, 48, 51])
String([109, 117, 108, 116, 105, 112, 108, 101, 32, 111, 99, 116, 97, 108])
String([108, 105, 110, 101, 49, 108, 105, 110, 101, 50])
String([108, 105, 110, 101, 32, 99, 111, 110, 116, 105, 110, 117, 97, 116, 105, 111, 110])
String([98, 97, 114, 101, 92, 114, 99, 97, 114, 114, 105, 97, 103, 101])
String([108, 105, 110, 101, 32, 101, 110, 100, 105, 110, 103, 32, 110, 111, 114, 109, 97, 108, 105, 122, 97, 116, 105, 111, 110, 32, 116, 101, 115, 116])
Eof

Binary file not shown.

View file

@ -0,0 +1 @@
Eof

View file

@ -411,6 +411,70 @@ proptest::proptest! {
}
}
/// Property: a literal string survives a trip through the lexer.
///
/// The generated bytes are wrapped in `(` ... `)`, with every backslash and
/// parenthesis prefixed by a backslash so the delimiters stay balanced. The
/// first token produced by the lexer must be a `Token::String` whose payload
/// equals the generated bytes after line-ending normalization.
///
/// Normalization here maps every `\r` byte to `\n`.
///
/// NOTE(review): each `\r` is mapped independently, so a generated `\r\n`
/// pair is expected to decode as two `\n` bytes. The PDF spec (section
/// 7.3.4.2) treats CR LF as a single end-of-line marker — confirm this
/// expectation matches the lexer's intended behavior.
#[cfg(feature = "proptest")]
proptest::proptest! {
    #[test]
    fn prop_string_roundtrip(
        // Byte strings drawn from printable ASCII plus a few whitespace controls.
        original in proptest::collection::vec(
            prop_oneof![
                // Space (0x20) through tilde (0x7E)
                0x20u8..=0x7E,
                // Horizontal tab, line feed and carriage return
                Just(b'\t'),
                Just(b'\n'),
                Just(b'\r'),
            ],
            0..500
        )
    ) {
        // Build the literal-string source text: `(` + escaped payload + `)`.
        let mut source = Vec::with_capacity(original.len() * 2 + 2);
        source.push(b'(');
        for &byte in &original {
            // Backslashes and parentheses need a leading backslash.
            if matches!(byte, b'\\' | b'(' | b')') {
                source.push(b'\\');
            }
            source.push(byte);
        }
        source.push(b')');

        let mut lexer = Lexer::new(&source);
        match lexer.next_token() {
            Some(Token::Eof) => {
                prop_assert!(false, "Expected String token, got Eof");
            }
            Some(Token::String(decoded)) => {
                // Expected payload: the input with every `\r` rewritten to `\n`.
                let mut normalized = original.clone();
                for byte in &mut normalized {
                    if *byte == b'\r' {
                        *byte = b'\n';
                    }
                }
                prop_assert_eq!(decoded, normalized,
                    "String roundtrip failed: expected {:?}, got {:?}",
                    normalized, decoded);
            }
            other => {
                prop_assert!(false, "Expected String token, got {:?}", other);
            }
        }
    }
}
// Re-export for use in other modules
pub use lexer_never_panics;