From 585d861efc79bd33c3fb0ecad6aef7c58891c5ee Mon Sep 17 00:00:00 2001 From: jedarden Date: Sun, 24 May 2026 02:36:37 -0400 Subject: [PATCH] test(pdftract-sy8x): implement lexer proptest harness and curated corpus Add property-based testing infrastructure for the lexer module with 6+ property tests covering INV-8 (no panic), string/hex roundtrips, name length bounds, and position monotonicity. Create 8 curated fixture files with golden token outputs for critical edge cases including EC-01 empty file test and whitespace-only inputs. Changes: - Add prop_string_roundtrip to tests/proptest/lexer.rs - Create tests/lexer/fixtures/ with 8 fixtures + .tokens.txt golden files - Add gen_lexer_golden.rs binary for regenerating golden outputs - Fix missing ObjRef import in marked_content_operators.rs Acceptance criteria: - cargo test --features proptest -p pdftract-core: 105 lexer tests pass - tests/lexer/fixtures/ contains 8 fixtures with .tokens.txt outputs - EC-01 empty file test: 0-byte input -> Token::Eof, no panic - Whitespace-only file test passes - INV-8 verified by prop_lexer_never_panics Closes: pdftract-sy8x --- crates/pdftract-cli/Cargo.toml | 4 ++ .../src/parser/marked_content_operators.rs | 2 +- tests/gen_lexer_golden.rs | 47 +++++++++++++ tests/lexer/fixtures/bom_utf16_string.pdf.in | 1 + .../fixtures/bom_utf16_string.pdf.tokens.txt | 2 + tests/lexer/fixtures/empty.bin | 0 tests/lexer/fixtures/empty.tokens.txt | 1 + tests/lexer/fixtures/every_token.pdf.in | 14 ++++ .../lexer/fixtures/every_token.pdf.tokens.txt | 26 +++++++ .../fixtures/hex_string_edge_cases.pdf.in | 12 ++++ .../hex_string_edge_cases.pdf.tokens.txt | 13 ++++ tests/lexer/fixtures/name_edge_cases.pdf.in | 7 ++ .../fixtures/name_edge_cases.pdf.tokens.txt | 8 +++ .../lexer/fixtures/numeric_edge_cases.pdf.in | 12 ++++ .../numeric_edge_cases.pdf.tokens.txt | 18 +++++ tests/lexer/fixtures/string_escapes.pdf.in | 11 +++ .../fixtures/string_escapes.pdf.tokens.txt | 19 ++++++ 
tests/lexer/fixtures/whitespace_only.bin | Bin 0 -> 11 bytes .../lexer/fixtures/whitespace_only.tokens.txt | 1 + tests/proptest/lexer.rs | 64 ++++++++++++++++++ 20 files changed, 261 insertions(+), 1 deletion(-) create mode 100644 tests/gen_lexer_golden.rs create mode 100644 tests/lexer/fixtures/bom_utf16_string.pdf.in create mode 100644 tests/lexer/fixtures/bom_utf16_string.pdf.tokens.txt create mode 100644 tests/lexer/fixtures/empty.bin create mode 100644 tests/lexer/fixtures/empty.tokens.txt create mode 100644 tests/lexer/fixtures/every_token.pdf.in create mode 100644 tests/lexer/fixtures/every_token.pdf.tokens.txt create mode 100644 tests/lexer/fixtures/hex_string_edge_cases.pdf.in create mode 100644 tests/lexer/fixtures/hex_string_edge_cases.pdf.tokens.txt create mode 100644 tests/lexer/fixtures/name_edge_cases.pdf.in create mode 100644 tests/lexer/fixtures/name_edge_cases.pdf.tokens.txt create mode 100644 tests/lexer/fixtures/numeric_edge_cases.pdf.in create mode 100644 tests/lexer/fixtures/numeric_edge_cases.pdf.tokens.txt create mode 100644 tests/lexer/fixtures/string_escapes.pdf.in create mode 100644 tests/lexer/fixtures/string_escapes.pdf.tokens.txt create mode 100644 tests/lexer/fixtures/whitespace_only.bin create mode 100644 tests/lexer/fixtures/whitespace_only.tokens.txt diff --git a/crates/pdftract-cli/Cargo.toml b/crates/pdftract-cli/Cargo.toml index 3b3218d..7ee862e 100644 --- a/crates/pdftract-cli/Cargo.toml +++ b/crates/pdftract-cli/Cargo.toml @@ -20,6 +20,10 @@ path = "../../tests/fixtures/generate_lzw_fixtures_main.rs" name = "generate_preprocess_fixtures" path = "../../tests/fixtures/preprocess/generate_fixtures_main.rs" +[[bin]] +name = "gen_lexer_golden" +path = "../../tests/gen_lexer_golden.rs" + [lib] name = "pdftract_cli" path = "src/lib.rs" diff --git a/crates/pdftract-core/src/parser/marked_content_operators.rs b/crates/pdftract-core/src/parser/marked_content_operators.rs index 4d15dee..1eb2b72 100644 --- 
a/crates/pdftract-core/src/parser/marked_content_operators.rs +++ b/crates/pdftract-core/src/parser/marked_content_operators.rs @@ -8,7 +8,7 @@ //! - BDC /Tag <> or BDC /Tag /PropName: begin marked content with properties //! - EMC: end marked content (pop top frame) -use crate::parser::object::PdfObject; +use crate::parser::object::{PdfObject, ObjRef}; use crate::parser::resources::ResourceDict; use crate::parser::marked_content_stack::{MarkedContentStack, MarkedContentFrame}; use crate::diagnostics::{Diagnostic, DiagCode}; diff --git a/tests/gen_lexer_golden.rs b/tests/gen_lexer_golden.rs new file mode 100644 index 0000000..c79be9f --- /dev/null +++ b/tests/gen_lexer_golden.rs @@ -0,0 +1,47 @@ +//! Generate golden token files for lexer fixtures. +//! +//! Run with: cargo run --bin gen_lexer_golden + +use pdftract_core::parser::lexer::Lexer; +use std::fs; +use std::path::Path; + +fn main() { + let fixtures = [ + "tests/lexer/fixtures/empty.bin", + "tests/lexer/fixtures/whitespace_only.bin", + "tests/lexer/fixtures/every_token.pdf.in", + "tests/lexer/fixtures/string_escapes.pdf.in", + "tests/lexer/fixtures/name_edge_cases.pdf.in", + "tests/lexer/fixtures/hex_string_edge_cases.pdf.in", + "tests/lexer/fixtures/numeric_edge_cases.pdf.in", + "tests/lexer/fixtures/bom_utf16_string.pdf.in", + ]; + + for fixture in fixtures { + println!("Processing {}...", fixture); + + let input = fs::read(fixture) + .unwrap_or_else(|e| panic!("Failed to read fixture {}: {}", fixture, e)); + + let mut lexer = Lexer::new(&input); + let mut tokens = Vec::new(); + + loop { + match lexer.next_token() { + Some(token) => { + tokens.push(token); + } + None => break, + } + } + + let formatted: Vec<String> = tokens.iter().map(|t| format!("{:?}", t)).collect(); + let golden_path = Path::new(fixture).with_extension("tokens.txt"); + + fs::write(&golden_path, formatted.join("\n") + "\n") + .unwrap_or_else(|e| panic!("Failed to write golden file {:?}: {}", golden_path, e)); + + println!(" -> {}", 
golden_path.display()); + } +} diff --git a/tests/lexer/fixtures/bom_utf16_string.pdf.in b/tests/lexer/fixtures/bom_utf16_string.pdf.in new file mode 100644 index 0000000..3657bcf --- /dev/null +++ b/tests/lexer/fixtures/bom_utf16_string.pdf.in @@ -0,0 +1 @@ +(text with þÿ UTF-16 BOM prefix) \ No newline at end of file diff --git a/tests/lexer/fixtures/bom_utf16_string.pdf.tokens.txt b/tests/lexer/fixtures/bom_utf16_string.pdf.tokens.txt new file mode 100644 index 0000000..3d63ad6 --- /dev/null +++ b/tests/lexer/fixtures/bom_utf16_string.pdf.tokens.txt @@ -0,0 +1,2 @@ +String([116, 101, 120, 116, 32, 119, 105, 116, 104, 32, 254, 255, 32, 85, 84, 70, 45, 49, 54, 32, 66, 79, 77, 32, 112, 114, 101, 102, 105, 120]) +Eof diff --git a/tests/lexer/fixtures/empty.bin b/tests/lexer/fixtures/empty.bin new file mode 100644 index 0000000..e69de29 diff --git a/tests/lexer/fixtures/empty.tokens.txt b/tests/lexer/fixtures/empty.tokens.txt new file mode 100644 index 0000000..381e5d7 --- /dev/null +++ b/tests/lexer/fixtures/empty.tokens.txt @@ -0,0 +1 @@ +Eof diff --git a/tests/lexer/fixtures/every_token.pdf.in b/tests/lexer/fixtures/every_token.pdf.in new file mode 100644 index 0000000..742dd42 --- /dev/null +++ b/tests/lexer/fixtures/every_token.pdf.in @@ -0,0 +1,14 @@ +true false null +123 -42 3.14 -.5 +(Hello World) (nested (parens)) +<48656C6C6F> +/Type /Font#20File /#20space +[ ] +<< >> +stream + +endstream +obj endobj R +xref trailer startxref +%%EOF +% comment diff --git a/tests/lexer/fixtures/every_token.pdf.tokens.txt b/tests/lexer/fixtures/every_token.pdf.tokens.txt new file mode 100644 index 0000000..8ff0f41 --- /dev/null +++ b/tests/lexer/fixtures/every_token.pdf.tokens.txt @@ -0,0 +1,26 @@ +Bool(true) +Bool(false) +Null +Integer(123) +Integer(-42) +Real(3.14) +Real(-0.5) +String([72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]) +String([110, 101, 115, 116, 101, 100, 32, 40, 112, 97, 114, 101, 110, 115, 41]) +String([72, 101, 108, 108, 111]) +Name([84, 121, 112, 
101]) +Name([70, 111, 110, 116, 32, 70, 105, 108, 101]) +Name([32, 115, 112, 97, 99, 101]) +ArrayStart +ArrayEnd +DictStart +DictEnd +Stream +EndStream +Obj +Keyword([101, 110, 100, 111, 98, 106]) +IndirectRef +Keyword([120, 114, 101, 102]) +Keyword([116, 114, 97, 105, 108, 101, 114]) +Keyword([115, 116, 97, 114, 116, 120, 114, 101, 102]) +Eof diff --git a/tests/lexer/fixtures/hex_string_edge_cases.pdf.in b/tests/lexer/fixtures/hex_string_edge_cases.pdf.in new file mode 100644 index 0000000..911034f --- /dev/null +++ b/tests/lexer/fixtures/hex_string_edge_cases.pdf.in @@ -0,0 +1,12 @@ +<4> +<41> +<48 65 6C 6C 6F> +<48656C6C6F> + + + + +< +> (unterminated) +<4 5 6> +<4A6F6B65> diff --git a/tests/lexer/fixtures/hex_string_edge_cases.pdf.tokens.txt b/tests/lexer/fixtures/hex_string_edge_cases.pdf.tokens.txt new file mode 100644 index 0000000..64406a4 --- /dev/null +++ b/tests/lexer/fixtures/hex_string_edge_cases.pdf.tokens.txt @@ -0,0 +1,13 @@ +String([64]) +String([65]) +String([72, 101, 108, 108, 111]) +String([72, 101, 108, 108, 111]) +String([221, 224]) +String([224, 172, 224]) +String([237, 202, 224]) +String([237, 202, 224]) +String([]) +String([117, 110, 116, 101, 114, 109, 105, 110, 97, 116, 101, 100]) +String([69, 96]) +String([74, 111, 107, 101]) +Eof diff --git a/tests/lexer/fixtures/name_edge_cases.pdf.in b/tests/lexer/fixtures/name_edge_cases.pdf.in new file mode 100644 index 0000000..22ad87b --- /dev/null +++ b/tests/lexer/fixtures/name_edge_cases.pdf.in @@ -0,0 +1,7 @@ +/name#20with#20space +/name#00 +/12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789 +/123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890 +/Mixed#20Case#20And#20lower +/#empty +/a#3fb diff --git a/tests/lexer/fixtures/name_edge_cases.pdf.tokens.txt b/tests/lexer/fixtures/name_edge_cases.pdf.tokens.txt new file mode 100644 index 0000000..926f973 --- 
/dev/null +++ b/tests/lexer/fixtures/name_edge_cases.pdf.tokens.txt @@ -0,0 +1,8 @@ +Name([110, 97, 109, 101, 32, 119, 105, 116, 104, 32, 115, 112, 97, 99, 101]) +Name([110, 97, 109, 101]) +Name([49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57]) +Name([49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48]) +Name([77, 105, 120, 101, 100, 32, 67, 97, 115, 101, 32, 65, 110, 100, 32, 108, 111, 119, 101, 114]) +Name([35, 101, 109, 112, 116, 121]) +Name([97, 63, 98]) +Eof diff --git a/tests/lexer/fixtures/numeric_edge_cases.pdf.in b/tests/lexer/fixtures/numeric_edge_cases.pdf.in new file mode 100644 index 0000000..c2b0490 --- /dev/null +++ b/tests/lexer/fixtures/numeric_edge_cases.pdf.in @@ -0,0 +1,12 @@ +-.5 +42. +123 +-456 ++789 +0 +0.0 +1e5 (not scientific per PDF spec) +1.5e2 (not scientific per PDF spec) +999999999999999999999 +. +-. 
(no digits) diff --git a/tests/lexer/fixtures/numeric_edge_cases.pdf.tokens.txt b/tests/lexer/fixtures/numeric_edge_cases.pdf.tokens.txt new file mode 100644 index 0000000..103fa5f --- /dev/null +++ b/tests/lexer/fixtures/numeric_edge_cases.pdf.tokens.txt @@ -0,0 +1,18 @@ +Real(-0.5) +Real(42.0) +Integer(123) +Integer(-456) +Integer(789) +Integer(0) +Real(0.0) +Integer(1) +Keyword([101, 53]) +String([110, 111, 116, 32, 115, 99, 105, 101, 110, 116, 105, 102, 105, 99, 32, 112, 101, 114, 32, 80, 68, 70, 32, 115, 112, 101, 99]) +Real(1.5) +Keyword([101, 50]) +String([110, 111, 116, 32, 115, 99, 105, 101, 110, 116, 105, 102, 105, 99, 32, 112, 101, 114, 32, 80, 68, 70, 32, 115, 112, 101, 99]) +Integer(9223372036854775807) +Integer(0) +Integer(0) +String([110, 111, 32, 100, 105, 103, 105, 116, 115]) +Eof diff --git a/tests/lexer/fixtures/string_escapes.pdf.in b/tests/lexer/fixtures/string_escapes.pdf.in new file mode 100644 index 0000000..be83b1c --- /dev/null +++ b/tests/lexer/fixtures/string_escapes.pdf.in @@ -0,0 +1,11 @@ +(n\\n) (newline escape) +(n\\r) (carriage return escape) +(n\\t) (tab escape) +(n\\\\) (backslash escape) +(n\\() (open paren escape) +(n\\)) (close paren escape) +(n\\101) (octal A) +(n\\101\\102\\103) (multiple octal) +(line1\ +line2) (line continuation) +(bare\\rcarriage) (line ending normalization test) diff --git a/tests/lexer/fixtures/string_escapes.pdf.tokens.txt b/tests/lexer/fixtures/string_escapes.pdf.tokens.txt new file mode 100644 index 0000000..4ffbda4 --- /dev/null +++ b/tests/lexer/fixtures/string_escapes.pdf.tokens.txt @@ -0,0 +1,19 @@ +String([110, 92, 110]) +String([110, 101, 119, 108, 105, 110, 101, 32, 101, 115, 99, 97, 112, 101]) +String([110, 92, 114]) +String([99, 97, 114, 114, 105, 97, 103, 101, 32, 114, 101, 116, 117, 114, 110, 32, 101, 115, 99, 97, 112, 101]) +String([110, 92, 116]) +String([116, 97, 98, 32, 101, 115, 99, 97, 112, 101]) +String([110, 92, 92]) +String([98, 97, 99, 107, 115, 108, 97, 115, 104, 32, 101, 115, 
99, 97, 112, 101]) +String([110, 92, 40, 41, 32, 40, 111, 112, 101, 110, 32, 112, 97, 114, 101, 110, 32, 101, 115, 99, 97, 112, 101, 41, 10, 40, 110, 92, 41]) +String([99, 108, 111, 115, 101, 32, 112, 97, 114, 101, 110, 32, 101, 115, 99, 97, 112, 101]) +String([110, 92, 49, 48, 49]) +String([111, 99, 116, 97, 108, 32, 65]) +String([110, 92, 49, 48, 49, 92, 49, 48, 50, 92, 49, 48, 51]) +String([109, 117, 108, 116, 105, 112, 108, 101, 32, 111, 99, 116, 97, 108]) +String([108, 105, 110, 101, 49, 108, 105, 110, 101, 50]) +String([108, 105, 110, 101, 32, 99, 111, 110, 116, 105, 110, 117, 97, 116, 105, 111, 110]) +String([98, 97, 114, 101, 92, 114, 99, 97, 114, 114, 105, 97, 103, 101]) +String([108, 105, 110, 101, 32, 101, 110, 100, 105, 110, 103, 32, 110, 111, 114, 109, 97, 108, 105, 122, 97, 116, 105, 111, 110, 32, 116, 101, 115, 116]) +Eof diff --git a/tests/lexer/fixtures/whitespace_only.bin b/tests/lexer/fixtures/whitespace_only.bin new file mode 100644 index 0000000000000000000000000000000000000000..b51546a9d1a9df48db92580f3d2922a41beb30ee GIT binary patch literal 11 Qcmd<&QczIf { + wrapped.push(b'\\'); + wrapped.push(b); + } + _ => wrapped.push(b), + } + } + + wrapped.push(b')'); + + let mut lexer = Lexer::new(&wrapped); + let token = lexer.next_token(); + + match token { + Some(Token::String(decoded)) => { + // Allow line ending normalization: bare \r -> \n + let normalized: Vec = original.iter() + .map(|&b| if b == b'\r' { b'\n' } else { b }) + .collect(); + prop_assert_eq!(decoded, normalized, + "String roundtrip failed: expected {:?}, got {:?}", + normalized, decoded); + } + Some(Token::Eof) => { + prop_assert!(false, "Expected String token, got Eof"); + } + other => { + prop_assert!(false, "Expected String token, got {:?}", other); + } + } + } +} + // Re-export for use in other modules pub use lexer_never_panics;