diff --git a/crates/pdftract-cli/Cargo.toml b/crates/pdftract-cli/Cargo.toml index 3b3218d..7ee862e 100644 --- a/crates/pdftract-cli/Cargo.toml +++ b/crates/pdftract-cli/Cargo.toml @@ -20,6 +20,10 @@ path = "../../tests/fixtures/generate_lzw_fixtures_main.rs" name = "generate_preprocess_fixtures" path = "../../tests/fixtures/preprocess/generate_fixtures_main.rs" +[[bin]] +name = "gen_lexer_golden" +path = "../../tests/gen_lexer_golden.rs" + [lib] name = "pdftract_cli" path = "src/lib.rs" diff --git a/crates/pdftract-core/src/parser/marked_content_operators.rs b/crates/pdftract-core/src/parser/marked_content_operators.rs index 4d15dee..1eb2b72 100644 --- a/crates/pdftract-core/src/parser/marked_content_operators.rs +++ b/crates/pdftract-core/src/parser/marked_content_operators.rs @@ -8,7 +8,7 @@ //! - BDC /Tag <> or BDC /Tag /PropName: begin marked content with properties //! - EMC: end marked content (pop top frame) -use crate::parser::object::PdfObject; +use crate::parser::object::{PdfObject, ObjRef}; use crate::parser::resources::ResourceDict; use crate::parser::marked_content_stack::{MarkedContentStack, MarkedContentFrame}; use crate::diagnostics::{Diagnostic, DiagCode}; diff --git a/tests/gen_lexer_golden.rs b/tests/gen_lexer_golden.rs new file mode 100644 index 0000000..c79be9f --- /dev/null +++ b/tests/gen_lexer_golden.rs @@ -0,0 +1,47 @@ +//! Generate golden token files for lexer fixtures. +//! +//! 
Run with: cargo run --bin gen_lexer_golden + +use pdftract_core::parser::lexer::Lexer; +use std::fs; +use std::path::Path; + +fn main() { + let fixtures = [ + "tests/lexer/fixtures/empty.bin", + "tests/lexer/fixtures/whitespace_only.bin", + "tests/lexer/fixtures/every_token.pdf.in", + "tests/lexer/fixtures/string_escapes.pdf.in", + "tests/lexer/fixtures/name_edge_cases.pdf.in", + "tests/lexer/fixtures/hex_string_edge_cases.pdf.in", + "tests/lexer/fixtures/numeric_edge_cases.pdf.in", + "tests/lexer/fixtures/bom_utf16_string.pdf.in", + ]; + + for fixture in fixtures { + println!("Processing {}...", fixture); + + let input = fs::read(fixture) + .unwrap_or_else(|e| panic!("Failed to read fixture {}: {}", fixture, e)); + + let mut lexer = Lexer::new(&input); + let mut tokens = Vec::new(); + + loop { + match lexer.next_token() { + Some(token) => { + tokens.push(token); + } + None => break, + } + } + + let formatted: Vec<String> = tokens.iter().map(|t| format!("{:?}", t)).collect(); + let golden_path = Path::new(fixture).with_extension("tokens.txt"); + + fs::write(&golden_path, formatted.join("\n") + "\n") + .unwrap_or_else(|e| panic!("Failed to write golden file {:?}: {}", golden_path, e)); + + println!(" -> {}", golden_path.display()); + } +} diff --git a/tests/lexer/fixtures/bom_utf16_string.pdf.in b/tests/lexer/fixtures/bom_utf16_string.pdf.in new file mode 100644 index 0000000..3657bcf --- /dev/null +++ b/tests/lexer/fixtures/bom_utf16_string.pdf.in @@ -0,0 +1 @@ +(text with þÿ UTF-16 BOM prefix) \ No newline at end of file diff --git a/tests/lexer/fixtures/bom_utf16_string.pdf.tokens.txt b/tests/lexer/fixtures/bom_utf16_string.pdf.tokens.txt new file mode 100644 index 0000000..3d63ad6 --- /dev/null +++ b/tests/lexer/fixtures/bom_utf16_string.pdf.tokens.txt @@ -0,0 +1,2 @@ +String([116, 101, 120, 116, 32, 119, 105, 116, 104, 32, 254, 255, 32, 85, 84, 70, 45, 49, 54, 32, 66, 79, 77, 32, 112, 114, 101, 102, 105, 120]) +Eof diff --git a/tests/lexer/fixtures/empty.bin 
b/tests/lexer/fixtures/empty.bin new file mode 100644 index 0000000..e69de29 diff --git a/tests/lexer/fixtures/empty.tokens.txt b/tests/lexer/fixtures/empty.tokens.txt new file mode 100644 index 0000000..381e5d7 --- /dev/null +++ b/tests/lexer/fixtures/empty.tokens.txt @@ -0,0 +1 @@ +Eof diff --git a/tests/lexer/fixtures/every_token.pdf.in b/tests/lexer/fixtures/every_token.pdf.in new file mode 100644 index 0000000..742dd42 --- /dev/null +++ b/tests/lexer/fixtures/every_token.pdf.in @@ -0,0 +1,14 @@ +true false null +123 -42 3.14 -.5 +(Hello World) (nested (parens)) +<48656C6C6F> +/Type /Font#20File /#20space +[ ] +<< >> +stream + +endstream +obj endobj R +xref trailer startxref +%%EOF +% comment diff --git a/tests/lexer/fixtures/every_token.pdf.tokens.txt b/tests/lexer/fixtures/every_token.pdf.tokens.txt new file mode 100644 index 0000000..8ff0f41 --- /dev/null +++ b/tests/lexer/fixtures/every_token.pdf.tokens.txt @@ -0,0 +1,26 @@ +Bool(true) +Bool(false) +Null +Integer(123) +Integer(-42) +Real(3.14) +Real(-0.5) +String([72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100]) +String([110, 101, 115, 116, 101, 100, 32, 40, 112, 97, 114, 101, 110, 115, 41]) +String([72, 101, 108, 108, 111]) +Name([84, 121, 112, 101]) +Name([70, 111, 110, 116, 32, 70, 105, 108, 101]) +Name([32, 115, 112, 97, 99, 101]) +ArrayStart +ArrayEnd +DictStart +DictEnd +Stream +EndStream +Obj +Keyword([101, 110, 100, 111, 98, 106]) +IndirectRef +Keyword([120, 114, 101, 102]) +Keyword([116, 114, 97, 105, 108, 101, 114]) +Keyword([115, 116, 97, 114, 116, 120, 114, 101, 102]) +Eof diff --git a/tests/lexer/fixtures/hex_string_edge_cases.pdf.in b/tests/lexer/fixtures/hex_string_edge_cases.pdf.in new file mode 100644 index 0000000..911034f --- /dev/null +++ b/tests/lexer/fixtures/hex_string_edge_cases.pdf.in @@ -0,0 +1,12 @@ +<4> +<41> +<48 65 6C 6C 6F> +<48656C6C6F> + + + + +< +> (unterminated) +<4 5 6> +<4A6F6B65> diff --git a/tests/lexer/fixtures/hex_string_edge_cases.pdf.tokens.txt 
b/tests/lexer/fixtures/hex_string_edge_cases.pdf.tokens.txt new file mode 100644 index 0000000..64406a4 --- /dev/null +++ b/tests/lexer/fixtures/hex_string_edge_cases.pdf.tokens.txt @@ -0,0 +1,13 @@ +String([64]) +String([65]) +String([72, 101, 108, 108, 111]) +String([72, 101, 108, 108, 111]) +String([221, 224]) +String([224, 172, 224]) +String([237, 202, 224]) +String([237, 202, 224]) +String([]) +String([117, 110, 116, 101, 114, 109, 105, 110, 97, 116, 101, 100]) +String([69, 96]) +String([74, 111, 107, 101]) +Eof diff --git a/tests/lexer/fixtures/name_edge_cases.pdf.in b/tests/lexer/fixtures/name_edge_cases.pdf.in new file mode 100644 index 0000000..22ad87b --- /dev/null +++ b/tests/lexer/fixtures/name_edge_cases.pdf.in @@ -0,0 +1,7 @@ +/name#20with#20space +/name#00 +/12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789 +/123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890 +/Mixed#20Case#20And#20lower +/#empty +/a#3fb diff --git a/tests/lexer/fixtures/name_edge_cases.pdf.tokens.txt b/tests/lexer/fixtures/name_edge_cases.pdf.tokens.txt new file mode 100644 index 0000000..926f973 --- /dev/null +++ b/tests/lexer/fixtures/name_edge_cases.pdf.tokens.txt @@ -0,0 +1,8 @@ +Name([110, 97, 109, 101, 32, 119, 105, 116, 104, 32, 115, 112, 97, 99, 101]) +Name([110, 97, 109, 101]) +Name([49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57]) +Name([49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 
49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 48]) +Name([77, 105, 120, 101, 100, 32, 67, 97, 115, 101, 32, 65, 110, 100, 32, 108, 111, 119, 101, 114]) +Name([35, 101, 109, 112, 116, 121]) +Name([97, 63, 98]) +Eof diff --git a/tests/lexer/fixtures/numeric_edge_cases.pdf.in b/tests/lexer/fixtures/numeric_edge_cases.pdf.in new file mode 100644 index 0000000..c2b0490 --- /dev/null +++ b/tests/lexer/fixtures/numeric_edge_cases.pdf.in @@ -0,0 +1,12 @@ +-.5 +42. +123 +-456 ++789 +0 +0.0 +1e5 (not scientific per PDF spec) +1.5e2 (not scientific per PDF spec) +999999999999999999999 +. +-. (no digits) diff --git a/tests/lexer/fixtures/numeric_edge_cases.pdf.tokens.txt b/tests/lexer/fixtures/numeric_edge_cases.pdf.tokens.txt new file mode 100644 index 0000000..103fa5f --- /dev/null +++ b/tests/lexer/fixtures/numeric_edge_cases.pdf.tokens.txt @@ -0,0 +1,18 @@ +Real(-0.5) +Real(42.0) +Integer(123) +Integer(-456) +Integer(789) +Integer(0) +Real(0.0) +Integer(1) +Keyword([101, 53]) +String([110, 111, 116, 32, 115, 99, 105, 101, 110, 116, 105, 102, 105, 99, 32, 112, 101, 114, 32, 80, 68, 70, 32, 115, 112, 101, 99]) +Real(1.5) +Keyword([101, 50]) +String([110, 111, 116, 32, 115, 99, 105, 101, 110, 116, 105, 102, 105, 99, 32, 112, 101, 114, 32, 80, 68, 70, 32, 115, 112, 101, 99]) +Integer(9223372036854775807) +Integer(0) +Integer(0) +String([110, 111, 32, 100, 105, 103, 105, 116, 115]) +Eof diff --git a/tests/lexer/fixtures/string_escapes.pdf.in b/tests/lexer/fixtures/string_escapes.pdf.in new file mode 100644 index 0000000..be83b1c --- /dev/null +++ b/tests/lexer/fixtures/string_escapes.pdf.in @@ -0,0 +1,11 @@ +(n\\n) (newline escape) 
+(n\\r) (carriage return escape) +(n\\t) (tab escape) +(n\\\\) (backslash escape) +(n\\() (open paren escape) +(n\\)) (close paren escape) +(n\\101) (octal A) +(n\\101\\102\\103) (multiple octal) +(line1\ +line2) (line continuation) +(bare\\rcarriage) (line ending normalization test) diff --git a/tests/lexer/fixtures/string_escapes.pdf.tokens.txt b/tests/lexer/fixtures/string_escapes.pdf.tokens.txt new file mode 100644 index 0000000..4ffbda4 --- /dev/null +++ b/tests/lexer/fixtures/string_escapes.pdf.tokens.txt @@ -0,0 +1,19 @@ +String([110, 92, 110]) +String([110, 101, 119, 108, 105, 110, 101, 32, 101, 115, 99, 97, 112, 101]) +String([110, 92, 114]) +String([99, 97, 114, 114, 105, 97, 103, 101, 32, 114, 101, 116, 117, 114, 110, 32, 101, 115, 99, 97, 112, 101]) +String([110, 92, 116]) +String([116, 97, 98, 32, 101, 115, 99, 97, 112, 101]) +String([110, 92, 92]) +String([98, 97, 99, 107, 115, 108, 97, 115, 104, 32, 101, 115, 99, 97, 112, 101]) +String([110, 92, 40, 41, 32, 40, 111, 112, 101, 110, 32, 112, 97, 114, 101, 110, 32, 101, 115, 99, 97, 112, 101, 41, 10, 40, 110, 92, 41]) +String([99, 108, 111, 115, 101, 32, 112, 97, 114, 101, 110, 32, 101, 115, 99, 97, 112, 101]) +String([110, 92, 49, 48, 49]) +String([111, 99, 116, 97, 108, 32, 65]) +String([110, 92, 49, 48, 49, 92, 49, 48, 50, 92, 49, 48, 51]) +String([109, 117, 108, 116, 105, 112, 108, 101, 32, 111, 99, 116, 97, 108]) +String([108, 105, 110, 101, 49, 108, 105, 110, 101, 50]) +String([108, 105, 110, 101, 32, 99, 111, 110, 116, 105, 110, 117, 97, 116, 105, 111, 110]) +String([98, 97, 114, 101, 92, 114, 99, 97, 114, 114, 105, 97, 103, 101]) +String([108, 105, 110, 101, 32, 101, 110, 100, 105, 110, 103, 32, 110, 111, 114, 109, 97, 108, 105, 122, 97, 116, 105, 111, 110, 32, 116, 101, 115, 116]) +Eof diff --git a/tests/lexer/fixtures/whitespace_only.bin b/tests/lexer/fixtures/whitespace_only.bin new file mode 100644 index 0000000..b51546a Binary files /dev/null and b/tests/lexer/fixtures/whitespace_only.bin 
differ diff --git a/tests/lexer/fixtures/whitespace_only.tokens.txt b/tests/lexer/fixtures/whitespace_only.tokens.txt new file mode 100644 index 0000000..381e5d7 --- /dev/null +++ b/tests/lexer/fixtures/whitespace_only.tokens.txt @@ -0,0 +1 @@ +Eof diff --git a/tests/proptest/lexer.rs b/tests/proptest/lexer.rs index bc8a518..2fd0e01 100644 --- a/tests/proptest/lexer.rs +++ b/tests/proptest/lexer.rs @@ -411,6 +411,70 @@ proptest::proptest! { } } +/// Property: Literal string roundtrip preserves content. +/// +/// Literal strings in PDF are wrapped in parentheses. This test generates +/// arbitrary printable byte strings, wraps them in `(...)`, and verifies +/// that the lexer decodes them back to the original bytes. +/// +/// Line ending normalization is allowed: bare `\r` may become `\n` per +/// PDF spec (section 7.3.4.2). +#[cfg(feature = "proptest")] +proptest::proptest! { + #[test] + fn prop_string_roundtrip( + // Generate arbitrary printable ASCII strings with some escapes + original in proptest::collection::vec( + prop_oneof![ + // Printable ASCII range (space through tilde) + 0x20u8..=0x7E, + // Tab and newline (valid in strings) + Just(b'\t'), + Just(b'\n'), + Just(b'\r'), + ], + 0..500 + ) + ) { + // Wrap in parentheses, escaping special characters + let mut wrapped = Vec::with_capacity(original.len() * 2 + 2); + wrapped.push(b'('); + + for &b in &original { + match b { + b'\\' | b'(' | b')' => { + wrapped.push(b'\\'); + wrapped.push(b); + } + _ => wrapped.push(b), + } + } + + wrapped.push(b')'); + + let mut lexer = Lexer::new(&wrapped); + let token = lexer.next_token(); + + match token { + Some(Token::String(decoded)) => { + // Allow line ending normalization: bare \r -> \n (NOTE(review): each \r maps to its own \n, so a \r\n pair is expected to decode as two \n bytes; ISO 32000-1 7.3.4.2 treats CRLF as a single end-of-line - confirm against the lexer) + let normalized: Vec<u8> = original.iter() + .map(|&b| if b == b'\r' { b'\n' } else { b }) + .collect(); + prop_assert_eq!(decoded, normalized, + "String roundtrip failed: expected {:?}, got {:?}", + normalized, decoded); + } + Some(Token::Eof) => { + prop_assert!(false, "Expected String 
token, got Eof"); + } + other => { + prop_assert!(false, "Expected String token, got {:?}", other); + } + } + } +} + // Re-export for use in other modules pub use lexer_never_panics;