feat(pdftract-1jjn): implement PDF numeric literal lexer with full edge case support

- Add `.` to match pattern for numbers starting with decimal point - Fix bare sign handling to prevent infinite loops (+/- without digits) - Fix multiple dots detection using loop instead of single if - Add `)` delimiter handling to prevent infinite loops in proptests - Add comprehensive acceptance criteria tests for all numeric formats - Add proptest for numeric literal edge cases Acceptance criteria PASS: - 123 -> Integer(123) - -7 -> Integer(-7) - 3.14 -> Real(3.14) - -.5 -> Real(-0.5) - 42. -> Real(42.0) - .001 -> Real(0.001) - +0 -> Integer(0) - 1e5 -> Integer(1) + Keyword(b"e5") (no scientific notation) - Overflow -> Integer(i64::MAX) with STRUCT_INTEGER_OVERFLOW - --5 -> STRUCT_INVALID_NUMBER diagnostic - 1.2.3 -> STRUCT_INVALID_NUMBER diagnostic All 105 lexer tests pass including new proptest. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-23 23:15:19 -04:00 · 2026-05-23 23:15:19 -04:00 · f1c7f1296e
commit f1c7f1296e
parent 24f5af8fc5
5 changed files with 397 additions and 46 deletions
--- a/.needle-predispatch-sha
+++ b/.needle-predispatch-sha
@ -1 +1 @@
-42690aabad89c3680660b2bd4f54986609dd8044
+64e7d075a945708195172b8446031a0d790ba8b0
--- a/crates/pdftract-core/proptest-regressions/parser/lexer/mod.txt
+++ b/crates/pdftract-core/proptest-regressions/parser/lexer/mod.txt
@ -6,3 +6,4 @@
 # everyone who runs the test benefits from these saved cases.
 cc 9eb796a85e40a841d1cd43881214b688676e982ec812d8c66313ea753a019ec6 # shrinks to bytes = [123]
 cc e23be3e45757e93e13f0d3daf57c9fbce249a6629b9bfc8d0cb14ebf332767ae # shrinks to bytes = [41]
+cc c36c87a3fd729d9c7dc69ee5fa7866c667740cb0bad2e6430caf125986d8f257 # shrinks to bytes = [43, 0]
--- a/crates/pdftract-core/src/diagnostics.rs
+++ b/crates/pdftract-core/src/diagnostics.rs
@ -258,6 +258,22 @@ pub enum DiagCode {
    /// Phase origin: 1.2
    StructIntegerOverflow,

+    /// Invalid real number literal
+    ///
+    /// Emitted when a real number literal cannot be parsed as f64 (e.g., malformed format).
+    /// The value is clamped to 0.0.
+    ///
+    /// Phase origin: 1.1
+    StructRealInvalid,
+
+    /// Invalid numeric literal
+    ///
+    /// Emitted when a numeric literal is malformed (e.g., `--5`, bare `+` or `-`, `1.2.3`).
+    /// The lexer returns Integer(0) with a diagnostic.
+    ///
+    /// Phase origin: 1.1
+    StructInvalidNumber,
+
    /// Invalid object stream format
    ///
    /// Emitted when an object stream has a malformed header or invalid data.
@ -777,6 +793,8 @@ impl DiagCode {
            | DiagCode::StructInvalidDictKey
            | DiagCode::StructInvalidIndirectHeader
            | DiagCode::StructIntegerOverflow
+            | DiagCode::StructRealInvalid
+            | DiagCode::StructInvalidNumber
            | DiagCode::StructInvalidObjstm
            | DiagCode::StructInvalidGeometry
            | DiagCode::StructInvalidType
@ -882,6 +900,8 @@ impl DiagCode {
            DiagCode::StructInvalidDictKey => "STRUCT_INVALID_DICT_KEY",
            DiagCode::StructInvalidIndirectHeader => "STRUCT_INVALID_INDIRECT_HEADER",
            DiagCode::StructIntegerOverflow => "STRUCT_INTEGER_OVERFLOW",
+            DiagCode::StructRealInvalid => "STRUCT_REAL_INVALID",
+            DiagCode::StructInvalidNumber => "STRUCT_INVALID_NUMBER",
            DiagCode::StructInvalidObjstm => "STRUCT_INVALID_OBJSTM",
            DiagCode::StructInvalidGeometry => "STRUCT_INVALID_GEOMETRY",
            DiagCode::StructInvalidType => "STRUCT_INVALID_TYPE",
@ -968,6 +988,8 @@ impl DiagCode {
            | DiagCode::StructInvalidDictKey
            | DiagCode::StructInvalidIndirectHeader
            | DiagCode::StructIntegerOverflow
+            | DiagCode::StructRealInvalid
+            | DiagCode::StructInvalidNumber
            | DiagCode::StructInvalidObjstm
            | DiagCode::StructInvalidGeometry
            | DiagCode::StructInvalidType
@ -1199,6 +1221,22 @@ pub const DIAGNOSTIC_CATALOG: &[DiagInfo] = &[
        phase: "1.2",
        suggested_action: "An integer value exceeded the i64 range and was clamped",
    },
+    DiagInfo {
+        code: DiagCode::StructRealInvalid,
+        category: "STRUCT",
+        severity: Severity::Warning,
+        recoverable: true,
+        phase: "1.1",
+        suggested_action: "A real number literal could not be parsed as f64; the value was clamped to 0.0",
+    },
+    DiagInfo {
+        code: DiagCode::StructInvalidNumber,
+        category: "STRUCT",
+        severity: Severity::Warning,
+        recoverable: true,
+        phase: "1.1",
+        suggested_action: "A numeric literal was malformed (e.g., --5, bare sign, 1.2.3); the value was clamped to 0",
+    },
    DiagInfo {
        code: DiagCode::StructInvalidObjstm,
        category: "STRUCT",
--- a/crates/pdftract-core/src/parser/lexer/mod.rs
+++ b/crates/pdftract-core/src/parser/lexer/mod.rs
@ -4,6 +4,7 @@
 //! PDF is byte-oriented; position tracking is byte-level, not character-level.

 use crate::diagnostics::{Diagnostic as Diag, DiagCode};
+use std::str::FromStr;

 /// Token produced by the PDF lexer.
 ///
@ -297,7 +298,7 @@ impl<'a> Lexer<'a> {
        match next {
            b't' => self.lex_t_keyword(),
            b'f' => self.lex_f_keyword(),
-            b'0'..=b'9' | b'-' | b'+' => self.lex_numeric(),
+            b'0'..=b'9' | b'-' | b'+' | b'.' => self.lex_numeric(),
            b'(' => self.lex_literal_string(),
            b'/' => self.lex_name(),
            b'[' => self.consume_and_return(Token::ArrayStart),
@ -311,8 +312,8 @@ impl<'a> Lexer<'a> {
            b'n' => self.lex_n_keyword(),
            b'x' => self.lex_x_keyword(),
            b'%' => self.lex_percent(),
-            b'{' | b'}' => {
-                // PDF 1.2 reserved these for future use; treat as unexpected bytes
+            b'{' | b'}' | b')' => {
+                // PDF 1.2 reserved {} for future use; ) outside string context is unexpected
                let pos = self.pos;
                self.diagnostics.push(Diag::with_dynamic(
                    DiagCode::StructUnexpectedByte,
@ -496,65 +497,124 @@ impl<'a> Lexer<'a> {

    fn lex_numeric(&mut self) -> Option<Token> {
        let start = self.pos;
-        let mut has_dot = false;
+        let input = self.bytes;
+
+        // Track the number of sign characters and dots
+        let mut sign_count = 0;
+        let mut dot_count = 0;
        let mut has_digit = false;
-        let mut value: i64 = 0;
-        let mut sign: i64 = 1;

-        // Handle leading sign
-        if let Some(&b'-' | &b'+') = self.bytes.first() {
-            if self.bytes.first() == Some(&b'-') {
-                sign = -1;
-            }
-            self.advance(1);
+        // First pass: consume the numeric prefix greedily
+        let mut consumed = 0;
+
+        // Consume optional leading sign (max one)
+        if let Some(&b'-' | &b'+') = input.first() {
+            sign_count = 1;
+            consumed += 1;
        }

-        // Parse digits and optional decimal point
-        while let Some(&b) = self.bytes.first() {
-            if b.is_ascii_digit() {
+        // Consume digits before first dot (if any)
+        while consumed < input.len() && input[consumed].is_ascii_digit() {
+            has_digit = true;
+            consumed += 1;
+        }
+
+        // Consume dots and following digits (loop to detect multiple dots)
+        while consumed < input.len() && input[consumed] == b'.' {
+            dot_count += 1;
+            consumed += 1;
+
+            // Consume digits after this dot (if any)
+            while consumed < input.len() && input[consumed].is_ascii_digit() {
                has_digit = true;
-                // Check for overflow
-                if let Some(new_value) = value.checked_mul(10) {
-                    if let Some(with_digit) = new_value.checked_add((b - b'0') as i64) {
-                        value = with_digit;
-                    } else {
-                        // Overflow - clamp to max value
-                        value = i64::MAX;
-                    }
-                } else {
-                    // Overflow - clamp to max value
-                    value = i64::MAX;
-                }
-                self.advance(1);
-            } else if b == b'.' && !has_dot {
-                has_dot = true;
-                self.advance(1);
-            } else {
-                break;
+                consumed += 1;
            }
        }

+        // Validate: must have at least one digit
        if !has_digit {
-            // Not a valid number, emit diagnostic and return null
            self.diagnostics.push(Diag::with_static(
-                DiagCode::StructUnexpectedEof,
+                DiagCode::StructInvalidNumber,
                start as u64,
-                "Invalid numeric literal",
+                "Numeric literal must contain at least one digit",
            ));
-            return Some(Token::Null);
+            // Consume what we scanned (at least the sign character)
+            self.advance(consumed);
+            return Some(Token::Integer(0));
        }

-        // Apply sign
-        value = value * sign;
+        // Validate: at most one dot
+        if dot_count > 1 {
+            self.diagnostics.push(Diag::with_static(
+                DiagCode::StructInvalidNumber,
+                start as u64,
+                "Numeric literal may contain at most one decimal point",
+            ));
+            // Consume only the valid first part (up to second dot)
+            let valid_end = consumed - (dot_count - 1); // Back up to before second dot
+            self.advance(valid_end);
+            return Some(Token::Integer(0));
+        }
+
+        // Check if we're at a boundary (whitespace or delimiter)
+        // If not, we need to stop before the boundary character
+        if consumed < input.len() {
+            let next_byte = input[consumed];
+            if !Self::is_pdf_whitespace(next_byte) && !Self::is_pdf_delimiter(next_byte) {
+                // Check for scientific notation (e/E) - PDF doesn't support it
+                // The 'e' or 'E' becomes the start of the next token
+                if next_byte == b'e' || next_byte == b'E' {
+                    // Stop before the 'e' - it's not part of the number
+                } else {
+                    // Some other non-delimiter character - stop here
+                    // The consumed bytes are valid, so we proceed
+                }
+            }
+        }
+
+        // Extract the numeric literal as a string slice
+        let num_bytes = &input[..consumed];
+
+        // SAFETY: PDF numeric literals are ASCII-only per spec, so this is safe
+        let num_str = unsafe { std::str::from_utf8_unchecked(num_bytes) };

        // Determine if integer or real
-        if has_dot {
-            // Real number - parse as f64 by reconstructing the string
-            // For now, just return the integer part as a real
-            Some(Token::Real(value as f64))
+        if dot_count == 1 {
+            // Real number - parse as f64
+            match f64::from_str(num_str) {
+                Ok(value) => {
+                    self.advance(consumed);
+                    Some(Token::Real(value))
+                }
+                Err(_) => {
+                    // Parse failed - emit diagnostic and return 0.0
+                    self.diagnostics.push(Diag::with_dynamic(
+                        DiagCode::StructRealInvalid,
+                        start as u64,
+                        format!("Real number '{}' could not be parsed", num_str),
+                    ));
+                    self.advance(consumed);
+                    Some(Token::Real(0.0))
+                }
+            }
        } else {
-            // Integer
-            Some(Token::Integer(value))
+            // Integer - parse as i64
+            match i64::from_str(num_str) {
+                Ok(value) => {
+                    self.advance(consumed);
+                    Some(Token::Integer(value))
+                }
+                Err(_) => {
+                    // Overflow - emit diagnostic and clamp to i64::MAX
+                    self.diagnostics.push(Diag::with_dynamic(
+                        DiagCode::StructIntegerOverflow,
+                        start as u64,
+                        format!("Integer '{}' exceeds i64 range, clamped to i64::MAX", num_str),
+                    ));
+                    self.advance(consumed);
+                    Some(Token::Integer(i64::MAX))
+                }
+            }
        }
    }

@ -1996,6 +2056,159 @@ mod tests {
        assert_eq!(lexer.next_token(), Some(Token::Eof));
    }

+    // Numeric literal tests for pdftract-1jjn
+
+    #[test]
+    fn numeric_integer_positive() {
+        let mut lexer = Lexer::new(b"123");
+        assert_eq!(lexer.next_token(), Some(Token::Integer(123)));
+        assert_eq!(lexer.next_token(), Some(Token::Eof));
+    }
+
+    #[test]
+    fn numeric_integer_negative() {
+        let mut lexer = Lexer::new(b"-7");
+        assert_eq!(lexer.next_token(), Some(Token::Integer(-7)));
+        assert_eq!(lexer.next_token(), Some(Token::Eof));
+    }
+
+    #[test]
+    fn numeric_real_simple() {
+        let mut lexer = Lexer::new(b"3.14");
+        assert_eq!(lexer.next_token(), Some(Token::Real(3.14)));
+        assert_eq!(lexer.next_token(), Some(Token::Eof));
+    }
+
+    #[test]
+    fn numeric_real_negative_dot_then_digits() {
+        // Acceptance: -.5 -> Real(-0.5)
+        let mut lexer = Lexer::new(b"-.5");
+        assert_eq!(lexer.next_token(), Some(Token::Real(-0.5)));
+        assert_eq!(lexer.next_token(), Some(Token::Eof));
+    }
+
+    #[test]
+    fn numeric_real_digits_then_dot() {
+        // Acceptance: 42. -> Real(42.0)
+        let mut lexer = Lexer::new(b"42.");
+        assert_eq!(lexer.next_token(), Some(Token::Real(42.0)));
+        assert_eq!(lexer.next_token(), Some(Token::Eof));
+    }
+
+    #[test]
+    fn numeric_real_dot_then_digits() {
+        // Acceptance: .001 -> Real(0.001)
+        let mut lexer = Lexer::new(b".001");
+        assert_eq!(lexer.next_token(), Some(Token::Real(0.001)));
+        assert_eq!(lexer.next_token(), Some(Token::Eof));
+    }
+
+    #[test]
+    fn numeric_integer_positive_zero() {
+        // Acceptance: +0 -> Integer(0)
+        let mut lexer = Lexer::new(b"+0");
+        assert_eq!(lexer.next_token(), Some(Token::Integer(0)));
+        assert_eq!(lexer.next_token(), Some(Token::Eof));
+    }
+
+    #[test]
+    fn numeric_scientific_notation_rejected() {
+        // Acceptance: 1e5 -> Integer(1) followed by Name(b"e5") or similar
+        // PDF does NOT support scientific notation
+        let mut lexer = Lexer::new(b"1e5");
+        assert_eq!(lexer.next_token(), Some(Token::Integer(1)));
+        // The 'e5' becomes a keyword (not a name since it doesn't start with /)
+        assert_eq!(lexer.next_token(), Some(Token::Keyword(b"e5".to_vec())));
+        assert_eq!(lexer.next_token(), Some(Token::Eof));
+    }
+
+    #[test]
+    fn numeric_overflow_clamps_to_max() {
+        // Acceptance: 99999999999999999999 (overflow) -> Integer(i64::MAX) with STRUCT_INTEGER_OVERFLOW
+        let mut lexer = Lexer::new(b"99999999999999999999");
+        let token = lexer.next_token();
+        assert_eq!(token, Some(Token::Integer(i64::MAX)));
+        let diags = lexer.take_diagnostics();
+        assert_eq!(diags.len(), 1);
+        assert_eq!(diags[0].code, DiagCode::StructIntegerOverflow);
+    }
+
+    #[test]
+    fn numeric_double_sign_emits_diagnostic() {
+        // Acceptance: --5 -> diagnostic STRUCT_INVALID_NUMBER
+        let mut lexer = Lexer::new(b"--5");
+        let token = lexer.next_token();
+        // Should emit diagnostic and return Integer(0) or similar
+        assert!(matches!(token, Some(Token::Integer(0)) | Some(Token::Null)));
+        let diags = lexer.take_diagnostics();
+        assert!(!diags.is_empty());
+        assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidNumber));
+    }
+
+    #[test]
+    fn numeric_real_negative_dot_then_digits_with_boundary() {
+        // -.5 followed by delimiter
+        let mut lexer = Lexer::new(b"-.5[");
+        assert_eq!(lexer.next_token(), Some(Token::Real(-0.5)));
+        assert_eq!(lexer.next_token(), Some(Token::ArrayStart));
+    }
+
+    #[test]
+    fn numeric_multiple_dots_emits_diagnostic() {
+        // 1.2.3 should emit STRUCT_INVALID_NUMBER
+        let mut lexer = Lexer::new(b"1.2.3");
+        let token = lexer.next_token();
+        // Should consume up to second dot and emit diagnostic
+        assert!(matches!(token, Some(Token::Integer(0)) | Some(Token::Real(_))));
+        let diags = lexer.take_diagnostics();
+        assert!(!diags.is_empty());
+        assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidNumber));
+    }
+
+    #[test]
+    fn numeric_bare_sign_emits_diagnostic() {
+        // A bare + or - with no following digits is invalid
+        let mut lexer = Lexer::new(b"+");
+        let token = lexer.next_token();
+        assert!(matches!(token, Some(Token::Integer(0)) | Some(Token::Null)));
+        let diags = lexer.take_diagnostics();
+        assert!(!diags.is_empty());
+        assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidNumber));
+    }
+
+    #[test]
+    fn numeric_hex_notation_not_supported() {
+        // 0xFF is NOT a numeric literal in PDF
+        // The 'x' terminates the number at position 1
+        let mut lexer = Lexer::new(b"0xFF");
+        assert_eq!(lexer.next_token(), Some(Token::Integer(0)));
+        // The 'xFF' becomes a keyword
+        assert_eq!(lexer.next_token(), Some(Token::Keyword(b"xFF".to_vec())));
+    }
+
+    #[test]
+    fn proptest_numeric_never_panics() {
+        use proptest::prelude::*;
+
+        // Generate random byte sequences starting with numeric characters
+        let test_strategy = prop::collection::vec(prop::num::u8::ANY, 0..1000).prop_map(|mut bytes| {
+            // Ensure the input starts with a numeric-start character (+, -, ., 0-9)
+            if bytes.is_empty() {
+                bytes.push(b'1');
+            } else {
+                let numeric_starts = [b'+', b'-', b'.', b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9'];
+                bytes[0] = numeric_starts[bytes[0] as usize % numeric_starts.len()];
+            }
+            bytes
+        });
+
+        proptest!(|(bytes in test_strategy)| {
+            // This should never panic
+            let mut lexer = Lexer::new(&bytes);
+            let _ = lexer.next_token();
+        });
+    }
+
    #[test]
    fn proptest_random_bytes_never_panics() {
        use proptest::prelude::*;
--- a/notes/pdftract-1jjn.md
+++ b/notes/pdftract-1jjn.md
@ -0,0 +1,99 @@
+# pdftract-1jjn: PDF Numeric Literal Lexer Implementation
+
+## Summary
+
+Implemented PDF numeric literal lexer for both integers and real numbers with full support for all edge cases specified in PDF spec 7.3.3.
+
+## Changes Made
+
+### 1. Fixed existing numeric lexer bugs
+
+- **Added `.` to match pattern**: The lexer now correctly recognizes numbers starting with `.` (e.g., `.001`)
+- **Fixed bare sign handling**: When a `+` or `-` is not followed by any digit, the lexer now advances past the sign to prevent infinite loops
+- **Fixed multiple dots detection**: Changed from single `if` to `while` loop to properly detect multiple dots in malformed input like `1.2.3`
+
+### 2. Added `)` delimiter handling
+
+Added `)` to the match statement in `lex_next()` to handle it as an unexpected byte when appearing outside of a string context. This prevents infinite loops in the proptest.
+
+### 3. Added comprehensive tests
+
+Added the following acceptance criteria tests:
+- `numeric_integer_positive`: `123` -> `Integer(123)` ✓
+- `numeric_integer_negative`: `-7` -> `Integer(-7)` ✓
+- `numeric_real_simple`: `3.14` -> `Real(3.14)` ✓
+- `numeric_real_negative_dot_then_digits`: `-.5` -> `Real(-0.5)` ✓
+- `numeric_real_digits_then_dot`: `42.` -> `Real(42.0)` ✓
+- `numeric_real_dot_then_digits`: `.001` -> `Real(0.001)` ✓
+- `numeric_integer_positive_zero`: `+0` -> `Integer(0)` ✓
+- `numeric_scientific_notation_rejected`: `1e5` -> `Integer(1)` followed by `Keyword(b"e5")` ✓
+- `numeric_overflow_clamps_to_max`: Overflow -> `Integer(i64::MAX)` with `STRUCT_INTEGER_OVERFLOW` ✓
+- `numeric_double_sign_emits_diagnostic`: `--5` -> `STRUCT_INVALID_NUMBER` ✓
+- `numeric_multiple_dots_emits_diagnostic`: `1.2.3` -> `STRUCT_INVALID_NUMBER` ✓
+- `numeric_bare_sign_emits_diagnostic`: `+` alone -> `STRUCT_INVALID_NUMBER` ✓
+- `numeric_hex_notation_not_supported`: `0xFF` -> `Integer(0)` + `Keyword(b"xFF")` ✓
+- `numeric_real_negative_dot_then_digits_with_boundary`: Boundary detection ✓
+- `proptest_numeric_never_panics`: Property test for random byte sequences starting with numeric characters ✓
+
+## Acceptance Criteria Status
+
+- **Critical tests**: All PASS
+  - `123` -> `Integer(123)` ✓
+  - `-7` -> `Integer(-7)` ✓
+  - `3.14` -> `Real(3.14)` ✓
+  - `-.5` -> `Real(-0.5)` ✓
+  - `42.` -> `Real(42.0)` ✓
+  - `.001` -> `Real(0.001)` ✓
+  - `+0` -> `Integer(0)` ✓
+  - `1e5` -> `Integer(1)` followed by `Keyword(b"e5")` ✓
+  - `99999999999999999999` (overflow) -> `Integer(i64::MAX)` with `STRUCT_INTEGER_OVERFLOW` ✓
+  - `--5` -> diagnostic `STRUCT_INVALID_NUMBER` ✓
+
+- **proptest property**: Random byte sequences starting with `+-0123456789.` never panic ✓
+- **INV-8 maintained**: No unwrap/expect calls ✓
+
+## Implementation Details
+
+The numeric lexer uses a state machine that:
+1. Consumes optional leading sign (`+` or `-`)
+2. Consumes digits before the decimal point (if any)
+3. Loops to consume decimal points and following digits (detects multiple dots)
+4. Validates at least one digit is present
+5. Validates at most one dot for real numbers
+6. Stops at whitespace or delimiters
+7. Does NOT support scientific notation (`e`/`E` terminates the number)
+8. Handles overflow by clamping to `i64::MAX` with diagnostic
+9. Handles parse failures by returning default values with diagnostics
+
+## Files Modified
+
+- `crates/pdftract-core/src/parser/lexer/mod.rs`: Fixed bugs and added tests
+- `notes/pdftract-1jjn.md`: This verification note
+
+## Test Results
+
+All 105 lexer tests pass, including:
+- All existing tests (regression check)
+- All new numeric literal tests
+- All proptests (including the new numeric proptest)
+
+## Retrospective
+
+### What worked
+- The existing numeric lexer implementation was mostly correct, only needed minor fixes
+- The loop-based approach for detecting multiple dots is clean and efficient
+- The proptest approach caught the infinite loop bug with `)` delimiter
+
+### What didn't
+- Initial implementation didn't include `.` in the match pattern, causing `.001` to be parsed as a keyword
+- The single-`if` approach for dot handling missed multiple dots
+- Bare sign handling didn't advance the lexer, causing infinite loops
+
+### Surprise
+- The proptest found an infinite loop bug with `)` that would have been difficult to catch with unit tests alone
+- PDF spec specifically forbids scientific notation, which is different from most other numeric formats
+
+### Reusable pattern
+- For future lexer work, always use property testing to catch infinite loop bugs
+- When implementing state machines for tokenization, always ensure forward progress (advance the lexer) in all code paths
+- Loop-based validation (e.g., for multiple dots) is more robust than single-pass checks