test(pdftract-sy8x): implement lexer proptest harness and curated corpus
Add property-based testing infrastructure for the lexer module with 6+ property tests covering INV-8 (no panic), string/hex roundtrips, name length bounds, and position monotonicity. Create 8 curated fixture files with golden token outputs for critical edge cases including EC-01 empty file test and whitespace-only inputs.

Changes:
- Add prop_string_roundtrip to tests/proptest/lexer.rs
- Create tests/lexer/fixtures/ with 8 fixtures + .tokens.txt golden files
- Add gen_lexer_golden.rs binary for regenerating golden outputs
- Fix missing ObjRef import in marked_content_operators.rs

Acceptance criteria:
- cargo test --features proptest -p pdftract-core: 105 lexer tests pass
- tests/lexer/fixtures/ contains 8 fixtures with .tokens.txt outputs
- EC-01 empty file test: 0-byte input -> Token::Eof, no panic
- Whitespace-only file test passes
- INV-8 verified by prop_lexer_never_panics

Closes: pdftract-sy8x
This commit is contained in:
parent
ee30a7033e
commit
585d861efc
20 changed files with 261 additions and 1 deletions
|
|
@ -20,6 +20,10 @@ path = "../../tests/fixtures/generate_lzw_fixtures_main.rs"
|
|||
name = "generate_preprocess_fixtures"
|
||||
path = "../../tests/fixtures/preprocess/generate_fixtures_main.rs"
|
||||
|
||||
[[bin]]
|
||||
name = "gen_lexer_golden"
|
||||
path = "../../tests/gen_lexer_golden.rs"
|
||||
|
||||
[lib]
|
||||
name = "pdftract_cli"
|
||||
path = "src/lib.rs"
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@
|
|||
//! - BDC /Tag <<props>> or BDC /Tag /PropName: begin marked content with properties
|
||||
//! - EMC: end marked content (pop top frame)
|
||||
|
||||
use crate::parser::object::PdfObject;
|
||||
use crate::parser::object::{PdfObject, ObjRef};
|
||||
use crate::parser::resources::ResourceDict;
|
||||
use crate::parser::marked_content_stack::{MarkedContentStack, MarkedContentFrame};
|
||||
use crate::diagnostics::{Diagnostic, DiagCode};
|
||||
|
|
|
|||
47
tests/gen_lexer_golden.rs
Normal file
47
tests/gen_lexer_golden.rs
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
//! Generate golden token files for lexer fixtures.
|
||||
//!
|
||||
//! Run with: cargo run --bin gen_lexer_golden
|
||||
|
||||
use pdftract_core::parser::lexer::Lexer;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
fn main() {
|
||||
let fixtures = [
|
||||
"tests/lexer/fixtures/empty.bin",
|
||||
"tests/lexer/fixtures/whitespace_only.bin",
|
||||
"tests/lexer/fixtures/every_token.pdf.in",
|
||||
"tests/lexer/fixtures/string_escapes.pdf.in",
|
||||
"tests/lexer/fixtures/name_edge_cases.pdf.in",
|
||||
"tests/lexer/fixtures/hex_string_edge_cases.pdf.in",
|
||||
"tests/lexer/fixtures/numeric_edge_cases.pdf.in",
|
||||
"tests/lexer/fixtures/bom_utf16_string.pdf.in",
|
||||
];
|
||||
|
||||
for fixture in fixtures {
|
||||
println!("Processing {}...", fixture);
|
||||
|
||||
let input = fs::read(fixture)
|
||||
.unwrap_or_else(|e| panic!("Failed to read fixture {}: {}", fixture, e));
|
||||
|
||||
let mut lexer = Lexer::new(&input);
|
||||
let mut tokens = Vec::new();
|
||||
|
||||
loop {
|
||||
match lexer.next_token() {
|
||||
Some(token) => {
|
||||
tokens.push(token);
|
||||
}
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
|
||||
let formatted: Vec<String> = tokens.iter().map(|t| format!("{:?}", t)).collect();
|
||||
let golden_path = Path::new(fixture).with_extension("tokens.txt");
|
||||
|
||||
fs::write(&golden_path, formatted.join("\n") + "\n")
|
||||
.unwrap_or_else(|e| panic!("Failed to write golden file {:?}: {}", golden_path, e));
|
||||
|
||||
println!(" -> {}", golden_path.display());
|
||||
}
|
||||
}
|
||||
1
tests/lexer/fixtures/bom_utf16_string.pdf.in
Normal file
1
tests/lexer/fixtures/bom_utf16_string.pdf.in
Normal file
|
|
@ -0,0 +1 @@
|
|||
(text with ţ˙ UTF-16 BOM prefix)
|
||||
2
tests/lexer/fixtures/bom_utf16_string.pdf.tokens.txt
Normal file
2
tests/lexer/fixtures/bom_utf16_string.pdf.tokens.txt
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
String([116, 101, 120, 116, 32, 119, 105, 116, 104, 32, 254, 255, 32, 85, 84, 70, 45, 49, 54, 32, 66, 79, 77, 32, 112, 114, 101, 102, 105, 120])
|
||||
Eof
|
||||
0
tests/lexer/fixtures/empty.bin
Normal file
0
tests/lexer/fixtures/empty.bin
Normal file
1
tests/lexer/fixtures/empty.tokens.txt
Normal file
1
tests/lexer/fixtures/empty.tokens.txt
Normal file
|
|
@ -0,0 +1 @@
|
|||
Eof
|
||||
14
tests/lexer/fixtures/every_token.pdf.in
Normal file
14
tests/lexer/fixtures/every_token.pdf.in
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
true false null
|
||||
123 -42 3.14 -.5
|
||||
(Hello World) (nested (parens))
|
||||
<48656C6C6F>
|
||||
/Type /Font#20File /#20space
|
||||
[ ]
|
||||
<< >>
|
||||
stream
|
||||
|
||||
endstream
|
||||
obj endobj R
|
||||
xref trailer startxref
|
||||
%%EOF
|
||||
% comment
|
||||
26
tests/lexer/fixtures/every_token.pdf.tokens.txt
Normal file
26
tests/lexer/fixtures/every_token.pdf.tokens.txt
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
Bool(true)
|
||||
Bool(false)
|
||||
Null
|
||||
Integer(123)
|
||||
Integer(-42)
|
||||
Real(3.14)
|
||||
Real(-0.5)
|
||||
String([72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100])
|
||||
String([110, 101, 115, 116, 101, 100, 32, 40, 112, 97, 114, 101, 110, 115, 41])
|
||||
String([72, 101, 108, 108, 111])
|
||||
Name([84, 121, 112, 101])
|
||||
Name([70, 111, 110, 116, 32, 70, 105, 108, 101])
|
||||
Name([32, 115, 112, 97, 99, 101])
|
||||
ArrayStart
|
||||
ArrayEnd
|
||||
DictStart
|
||||
DictEnd
|
||||
Stream
|
||||
EndStream
|
||||
Obj
|
||||
Keyword([101, 110, 100, 111, 98, 106])
|
||||
IndirectRef
|
||||
Keyword([120, 114, 101, 102])
|
||||
Keyword([116, 114, 97, 105, 108, 101, 114])
|
||||
Keyword([115, 116, 97, 114, 116, 120, 114, 101, 102])
|
||||
Eof
|
||||
12
tests/lexer/fixtures/hex_string_edge_cases.pdf.in
Normal file
12
tests/lexer/fixtures/hex_string_edge_cases.pdf.in
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
<4>
|
||||
<41>
|
||||
<48 65 6C 6C 6F>
|
||||
<48656C6C6F>
|
||||
<oddlength>
|
||||
<with whitespace>
|
||||
<mixedCase>
|
||||
<MixedCase>
|
||||
<
|
||||
> (unterminated)
|
||||
<4 5 6>
|
||||
<4A6F6B65>
|
||||
13
tests/lexer/fixtures/hex_string_edge_cases.pdf.tokens.txt
Normal file
13
tests/lexer/fixtures/hex_string_edge_cases.pdf.tokens.txt
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
String([64])
|
||||
String([65])
|
||||
String([72, 101, 108, 108, 111])
|
||||
String([72, 101, 108, 108, 111])
|
||||
String([221, 224])
|
||||
String([224, 172, 224])
|
||||
String([237, 202, 224])
|
||||
String([237, 202, 224])
|
||||
String([])
|
||||
String([117, 110, 116, 101, 114, 109, 105, 110, 97, 116, 101, 100])
|
||||
String([69, 96])
|
||||
String([74, 111, 107, 101])
|
||||
Eof
|
||||
7
tests/lexer/fixtures/name_edge_cases.pdf.in
Normal file
7
tests/lexer/fixtures/name_edge_cases.pdf.in
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
/name#20with#20space
|
||||
/name#00
|
||||
/12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
|
||||
/123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
|
||||
/Mixed#20Case#20And#20lower
|
||||
/#empty
|
||||
/a#3fb
|
||||
8
tests/lexer/fixtures/name_edge_cases.pdf.tokens.txt
Normal file
8
tests/lexer/fixtures/name_edge_cases.pdf.tokens.txt
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
Name([110, 97, 109, 101, 32, 119, 105, 116, 104, 32, 115, 112, 97, 99, 101])
|
||||
Name([110, 97, 109, 101])
|
||||
Name([49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57])
|
||||
Name([49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48])
|
||||
Name([77, 105, 120, 101, 100, 32, 67, 97, 115, 101, 32, 65, 110, 100, 32, 108, 111, 119, 101, 114])
|
||||
Name([35, 101, 109, 112, 116, 121])
|
||||
Name([97, 63, 98])
|
||||
Eof
|
||||
12
tests/lexer/fixtures/numeric_edge_cases.pdf.in
Normal file
12
tests/lexer/fixtures/numeric_edge_cases.pdf.in
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
-.5
|
||||
42.
|
||||
123
|
||||
-456
|
||||
+789
|
||||
0
|
||||
0.0
|
||||
1e5 (not scientific per PDF spec)
|
||||
1.5e2 (not scientific per PDF spec)
|
||||
999999999999999999999
|
||||
.
|
||||
-. (no digits)
|
||||
18
tests/lexer/fixtures/numeric_edge_cases.pdf.tokens.txt
Normal file
18
tests/lexer/fixtures/numeric_edge_cases.pdf.tokens.txt
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
Real(-0.5)
|
||||
Real(42.0)
|
||||
Integer(123)
|
||||
Integer(-456)
|
||||
Integer(789)
|
||||
Integer(0)
|
||||
Real(0.0)
|
||||
Integer(1)
|
||||
Keyword([101, 53])
|
||||
String([110, 111, 116, 32, 115, 99, 105, 101, 110, 116, 105, 102, 105, 99, 32, 112, 101, 114, 32, 80, 68, 70, 32, 115, 112, 101, 99])
|
||||
Real(1.5)
|
||||
Keyword([101, 50])
|
||||
String([110, 111, 116, 32, 115, 99, 105, 101, 110, 116, 105, 102, 105, 99, 32, 112, 101, 114, 32, 80, 68, 70, 32, 115, 112, 101, 99])
|
||||
Integer(9223372036854775807)
|
||||
Integer(0)
|
||||
Integer(0)
|
||||
String([110, 111, 32, 100, 105, 103, 105, 116, 115])
|
||||
Eof
|
||||
11
tests/lexer/fixtures/string_escapes.pdf.in
Normal file
11
tests/lexer/fixtures/string_escapes.pdf.in
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
(n\\n) (newline escape)
|
||||
(n\\r) (carriage return escape)
|
||||
(n\\t) (tab escape)
|
||||
(n\\\\) (backslash escape)
|
||||
(n\\() (open paren escape)
|
||||
(n\\)) (close paren escape)
|
||||
(n\\101) (octal A)
|
||||
(n\\101\\102\\103) (multiple octal)
|
||||
(line1\
|
||||
line2) (line continuation)
|
||||
(bare\\rcarriage) (line ending normalization test)
|
||||
19
tests/lexer/fixtures/string_escapes.pdf.tokens.txt
Normal file
19
tests/lexer/fixtures/string_escapes.pdf.tokens.txt
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
String([110, 92, 110])
|
||||
String([110, 101, 119, 108, 105, 110, 101, 32, 101, 115, 99, 97, 112, 101])
|
||||
String([110, 92, 114])
|
||||
String([99, 97, 114, 114, 105, 97, 103, 101, 32, 114, 101, 116, 117, 114, 110, 32, 101, 115, 99, 97, 112, 101])
|
||||
String([110, 92, 116])
|
||||
String([116, 97, 98, 32, 101, 115, 99, 97, 112, 101])
|
||||
String([110, 92, 92])
|
||||
String([98, 97, 99, 107, 115, 108, 97, 115, 104, 32, 101, 115, 99, 97, 112, 101])
|
||||
String([110, 92, 40, 41, 32, 40, 111, 112, 101, 110, 32, 112, 97, 114, 101, 110, 32, 101, 115, 99, 97, 112, 101, 41, 10, 40, 110, 92, 41])
|
||||
String([99, 108, 111, 115, 101, 32, 112, 97, 114, 101, 110, 32, 101, 115, 99, 97, 112, 101])
|
||||
String([110, 92, 49, 48, 49])
|
||||
String([111, 99, 116, 97, 108, 32, 65])
|
||||
String([110, 92, 49, 48, 49, 92, 49, 48, 50, 92, 49, 48, 51])
|
||||
String([109, 117, 108, 116, 105, 112, 108, 101, 32, 111, 99, 116, 97, 108])
|
||||
String([108, 105, 110, 101, 49, 108, 105, 110, 101, 50])
|
||||
String([108, 105, 110, 101, 32, 99, 111, 110, 116, 105, 110, 117, 97, 116, 105, 111, 110])
|
||||
String([98, 97, 114, 101, 92, 114, 99, 97, 114, 114, 105, 97, 103, 101])
|
||||
String([108, 105, 110, 101, 32, 101, 110, 100, 105, 110, 103, 32, 110, 111, 114, 109, 97, 108, 105, 122, 97, 116, 105, 111, 110, 32, 116, 101, 115, 116])
|
||||
Eof
|
||||
BIN
tests/lexer/fixtures/whitespace_only.bin
Normal file
BIN
tests/lexer/fixtures/whitespace_only.bin
Normal file
Binary file not shown.
1
tests/lexer/fixtures/whitespace_only.tokens.txt
Normal file
1
tests/lexer/fixtures/whitespace_only.tokens.txt
Normal file
|
|
@ -0,0 +1 @@
|
|||
Eof
|
||||
|
|
@ -411,6 +411,70 @@ proptest::proptest! {
|
|||
}
|
||||
}
|
||||
|
||||
/// Property: a literal string survives an escape/lex roundtrip.
///
/// Builds a PDF literal string `( ... )` from arbitrary bytes drawn from
/// printable ASCII plus tab, LF and CR, backslash-escaping `\`, `(` and `)`,
/// and checks that the first token the lexer yields is a `Token::String`
/// holding the original bytes.
///
/// Every `\r` in the input is expected to come back as `\n` (end-of-line
/// normalization, PDF spec section 7.3.4.2).
///
/// NOTE(review): the expectation maps each `\r` independently, so an adjacent
/// `\r\n` pair is expected to decode as two `\n` bytes. The spec describes a
/// CRLF pair as a single end-of-line marker; confirm that this matches the
/// intended lexer behavior.
#[cfg(feature = "proptest")]
proptest::proptest! {
    #[test]
    fn prop_string_roundtrip(
        // Bytes that may appear inside a literal string, raw or backslash-escaped
        original in proptest::collection::vec(
            prop_oneof![
                // Printable ASCII range (space through tilde)
                0x20u8..=0x7E,
                // Tab, line feed and carriage return
                Just(b'\t'),
                Just(b'\n'),
                Just(b'\r'),
            ],
            0..500
        )
    ) {
        // Worst case every byte needs a backslash, plus the two delimiters.
        let mut literal = Vec::with_capacity(original.len() * 2 + 2);
        literal.push(b'(');
        for &byte in &original {
            // Backslash and parentheses must be escaped to stay literal.
            if matches!(byte, b'\\' | b'(' | b')') {
                literal.push(b'\\');
            }
            literal.push(byte);
        }
        literal.push(b')');

        let mut lexer = Lexer::new(&literal);

        match lexer.next_token() {
            Some(Token::String(decoded)) => {
                // Expected bytes: the input with every \r replaced by \n.
                let mut normalized: Vec<u8> = original.clone();
                for byte in normalized.iter_mut() {
                    if *byte == b'\r' {
                        *byte = b'\n';
                    }
                }
                prop_assert_eq!(decoded, normalized,
                    "String roundtrip failed: expected {:?}, got {:?}",
                    normalized, decoded);
            }
            Some(Token::Eof) => {
                prop_assert!(false, "Expected String token, got Eof");
            }
            other => {
                prop_assert!(false, "Expected String token, got {:?}", other);
            }
        }
    }
}
|
||||
|
||||
// Re-export for use in other modules
|
||||
pub use lexer_never_panics;
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue