fix(pdftract-2hm4): fix keyword lexer to use Vec<u8> and improve diagnostics

- Fix Token::Keyword to use b"..." .to_vec() instead of static strings - Improve unknown keyword diagnostics to show actual keyword bytes - Remove unused has_valid_line_ending variable in stream keyword lexer - Add stream_header_valid_line_endings test for stream keyword validation All hex string lexer tests pass (16 unit tests + 2 proptests). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> Bead-Id: pdftract-2hm4
2026-05-18 02:10:40 -04:00 · 2026-05-18 02:10:40 -04:00 · 8c288a742d
commit 8c288a742d
parent 4448c85738
7 changed files with 1103 additions and 48 deletions
--- a/crates/pdftract-cli/src/codegen.rs
+++ b/crates/pdftract-cli/src/codegen.rs
@ -67,6 +67,7 @@ pub struct SdkContract {
 pub struct Method {
    pub name: String,
    pub camel_name: String,
+    pub snake_name: String,
    pub description: String,
    pub cli_flag: String,
    pub returns_string: bool,
@ -79,6 +80,13 @@ pub struct Method {
    pub string_param_count: usize,
 }

+impl Method {
+    /// Returns the snake_case name for Python/Ruby SDKs.
+    pub fn snake_name(&self) -> &str {
+        &self.snake_name
+    }
+}
+
 /// SDK error definition.
 #[derive(Debug, Serialize, Deserialize)]
 pub struct Error {
@ -162,21 +170,22 @@ impl CodeGenerator {

        // Method definitions with their details
        let method_patterns = [
-            ("extract", "Extract", "extract", "Document", "ExtractOptions", "Extract structured data from a PDF", false, false, 0),
-            ("extract_text", "ExtractText", "extract", "string", "ExtractOptions", "Extract plain text from a PDF", true, false, 0),
-            ("extract_markdown", "ExtractMarkdown", "extract", "string", "ExtractOptions", "Extract Markdown-formatted text from a PDF", true, false, 0),
-            ("extract_stream", "ExtractStream", "extract", "Page", "ExtractOptions", "Extract pages from a PDF as a stream", false, false, 0),
-            ("search", "Search", "grep", "Match", "SearchOptions", "Search for text in a PDF", false, false, 0),
-            ("get_metadata", "GetMetadata", "extract", "Metadata", "BaseOptions", "Get metadata from a PDF", false, false, 0),
-            ("hash", "Hash", "hash", "Fingerprint", "BaseOptions", "Compute hash fingerprint of a PDF", false, false, 0),
-            ("classify", "Classify", "classify", "Classification", "", "Classify a PDF document", false, false, 0),
-            ("verify_receipt", "VerifyReceipt", "verify-receipt", "bool", "", "Verify a receipt", false, true, 2),
+            ("extract", "Extract", "extract", "extract", "Document", "ExtractOptions", "Extract structured data from a PDF", false, false, 0),
+            ("extract_text", "ExtractText", "extract_text", "extract", "string", "ExtractOptions", "Extract plain text from a PDF", true, false, 0),
+            ("extract_markdown", "ExtractMarkdown", "extract_markdown", "extract", "string", "ExtractOptions", "Extract Markdown-formatted text from a PDF", true, false, 0),
+            ("extract_stream", "ExtractStream", "extract_stream", "extract", "Page", "ExtractOptions", "Extract pages from a PDF as a stream", false, false, 0),
+            ("search", "Search", "search", "grep", "Match", "SearchOptions", "Search for text in a PDF", false, false, 0),
+            ("get_metadata", "GetMetadata", "get_metadata", "extract", "Metadata", "BaseOptions", "Get metadata from a PDF", false, false, 0),
+            ("hash", "Hash", "hash", "hash", "Fingerprint", "BaseOptions", "Compute hash fingerprint of a PDF", false, false, 0),
+            ("classify", "Classify", "classify", "classify", "Classification", "", "Classify a PDF document", false, false, 0),
+            ("verify_receipt", "VerifyReceipt", "verify_receipt", "verify-receipt", "bool", "", "Verify a receipt", false, true, 2),
        ];

-        for (name, camel_name, cli_flag, return_type, options_type, description, returns_string, uses_string_params, string_param_count) in method_patterns {
+        for (name, camel_name, snake_name, cli_flag, return_type, options_type, description, returns_string, uses_string_params, string_param_count) in method_patterns {
            methods.push(Method {
                name: name.to_string(),
                camel_name: camel_name.to_string(),
+                snake_name: snake_name.to_string(),
                description: description.to_string(),
                cli_flag: cli_flag.to_string(),
                returns_string,
@ -229,6 +238,7 @@ impl CodeGenerator {
                Method {
                    name: "extract".to_string(),
                    camel_name: "Extract".to_string(),
+                    snake_name: "extract".to_string(),
                    description: "Extract structured data from a PDF".to_string(),
                    cli_flag: "extract".to_string(),
                    returns_string: false,
@ -241,6 +251,7 @@ impl CodeGenerator {
                Method {
                    name: "extract_text".to_string(),
                    camel_name: "ExtractText".to_string(),
+                    snake_name: "extract_text".to_string(),
                    description: "Extract plain text from a PDF".to_string(),
                    cli_flag: "extract".to_string(),
                    returns_string: true,
@ -253,6 +264,7 @@ impl CodeGenerator {
                Method {
                    name: "extract_markdown".to_string(),
                    camel_name: "ExtractMarkdown".to_string(),
+                    snake_name: "extract_markdown".to_string(),
                    description: "Extract Markdown-formatted text from a PDF".to_string(),
                    cli_flag: "extract".to_string(),
                    returns_string: true,
@ -265,6 +277,7 @@ impl CodeGenerator {
                Method {
                    name: "extract_stream".to_string(),
                    camel_name: "ExtractStream".to_string(),
+                    snake_name: "extract_stream".to_string(),
                    description: "Extract pages from a PDF as a stream".to_string(),
                    cli_flag: "extract".to_string(),
                    returns_string: false,
@ -277,6 +290,7 @@ impl CodeGenerator {
                Method {
                    name: "search".to_string(),
                    camel_name: "Search".to_string(),
+                    snake_name: "search".to_string(),
                    description: "Search for text in a PDF".to_string(),
                    cli_flag: "grep".to_string(),
                    returns_string: false,
@ -289,6 +303,7 @@ impl CodeGenerator {
                Method {
                    name: "get_metadata".to_string(),
                    camel_name: "GetMetadata".to_string(),
+                    snake_name: "get_metadata".to_string(),
                    description: "Get metadata from a PDF".to_string(),
                    cli_flag: "extract".to_string(),
                    returns_string: false,
@ -301,6 +316,7 @@ impl CodeGenerator {
                Method {
                    name: "hash".to_string(),
                    camel_name: "Hash".to_string(),
+                    snake_name: "hash".to_string(),
                    description: "Compute hash fingerprint of a PDF".to_string(),
                    cli_flag: "hash".to_string(),
                    returns_string: false,
@ -313,6 +329,7 @@ impl CodeGenerator {
                Method {
                    name: "classify".to_string(),
                    camel_name: "Classify".to_string(),
+                    snake_name: "classify".to_string(),
                    description: "Classify a PDF document".to_string(),
                    cli_flag: "classify".to_string(),
                    returns_string: false,
@ -325,6 +342,7 @@ impl CodeGenerator {
                Method {
                    name: "verify_receipt".to_string(),
                    camel_name: "VerifyReceipt".to_string(),
+                    snake_name: "verify_receipt".to_string(),
                    description: "Verify a receipt".to_string(),
                    cli_flag: "verify-receipt".to_string(),
                    returns_string: false,
--- a/crates/pdftract-core/src/parser/catalog.rs
+++ b/crates/pdftract-core/src/parser/catalog.rs
@ -373,6 +373,7 @@ impl Catalog {
    /// Add a diagnostic to the catalog.
    fn emit_diagnostic(&mut self, severity: Severity, message: String) {
        self.diagnostics.push(Diagnostic {
+            code: crate::parser::diagnostic::DiagCode::StructUnexpectedEof,
            severity,
            phase: "1.4".to_string(),
            message,
@ -424,6 +425,7 @@ pub fn parse_catalog(resolver: &XrefResolver, root_ref: ObjRef) -> Result<Catalo
        Ok(obj) => obj,
        Err(e) => {
            diagnostics.push(Diagnostic {
+                code: crate::parser::diagnostic::DiagCode::StructUnexpectedEof,
                severity: Severity::Error,
                phase: "1.4".to_string(),
                message: format!("Failed to resolve /Root: {}", e),
@ -437,6 +439,7 @@ pub fn parse_catalog(resolver: &XrefResolver, root_ref: ObjRef) -> Result<Catalo
        Some(d) => d,
        None => {
            diagnostics.push(Diagnostic {
+                code: crate::parser::diagnostic::DiagCode::StructUnexpectedEof,
                severity: Severity::Error,
                phase: "1.4".to_string(),
                message: format!("/Root is not a dictionary (type: {})", root_obj.type_name()),
@ -451,6 +454,7 @@ pub fn parse_catalog(resolver: &XrefResolver, root_ref: ObjRef) -> Result<Catalo
        Some(other) => {
            // Emit STRUCT_MISSING_KEY diagnostic and return empty catalog
            diagnostics.push(Diagnostic {
+                code: crate::parser::diagnostic::DiagCode::MissingKey,
                severity: Severity::Error,
                phase: "1.4".to_string(),
                message: format!("STRUCT_MISSING_KEY: /Pages is not a reference (type: {})", other.type_name()),
@ -461,6 +465,7 @@ pub fn parse_catalog(resolver: &XrefResolver, root_ref: ObjRef) -> Result<Catalo
        None => {
            // Emit STRUCT_MISSING_KEY diagnostic and return empty catalog
            diagnostics.push(Diagnostic {
+                code: crate::parser::diagnostic::DiagCode::MissingKey,
                severity: Severity::Error,
                phase: "1.4".to_string(),
                message: "STRUCT_MISSING_KEY: /Pages key missing from catalog".to_string(),
--- a/crates/pdftract-core/src/parser/lexer/mod.rs
+++ b/crates/pdftract-core/src/parser/lexer/mod.rs
@ -469,7 +469,7 @@ impl<'a> Lexer<'a> {
            let next_after = self.bytes.get(7);
            if next_after.map_or(true, |&b| Self::is_pdf_whitespace(b) || Self::is_pdf_delimiter(b)) {
                self.advance(7);
-                return Some(Token::Keyword("trailer"));
+                return Some(Token::Keyword(b"trailer".to_vec()));
            }
        }
        // Not "true" or "trailer", treat as keyword
@ -495,7 +495,7 @@ impl<'a> Lexer<'a> {
            let next_after = self.bytes.get(4);
            if next_after.map_or(true, |&b| Self::is_pdf_whitespace(b) || Self::is_pdf_delimiter(b)) {
                self.advance(4);
-                return Some(Token::Keyword("xref"));
+                return Some(Token::Keyword(b"xref".to_vec()));
            }
        }
        // Not "xref", treat as keyword
@ -508,7 +508,7 @@ impl<'a> Lexer<'a> {
            let next_after = self.bytes.get(5);
            if next_after.map_or(true, |&b| Self::is_pdf_whitespace(b) || Self::is_pdf_delimiter(b)) {
                self.advance(5);
-                return Some(Token::Keyword("%%EOF"));
+                return Some(Token::Keyword(b"%%EOF".to_vec()));
            }
        }
        // Not "%%EOF", skip as a regular comment
@ -525,37 +525,28 @@ impl<'a> Lexer<'a> {
    fn lex_keyword(&mut self) -> Option<Token> {
        // Consume bytes until we hit a delimiter or whitespace
        let start = self.pos;
-        let mut bytes_consumed = 0;
+        let mut keyword_bytes = Vec::with_capacity(16);

        while let Some(&b) = self.bytes.first() {
            if Self::is_pdf_whitespace(b) || Self::is_pdf_delimiter(b) {
                break;
            }
+            keyword_bytes.push(b);
            self.advance(1);
-            bytes_consumed += 1;
        }

-        // Convert consumed bytes to a static string if possible, otherwise use a generic keyword
-        // For unknown keywords, we return Token::Keyword with the bytes we consumed
-        // Since we can't return borrowed data from the input, we need to match against known keywords
-        // or return a generic error token
-
-        // For now, return Token::Keyword with a static string based on what we consumed
-        // This is a simplified version - the full version would need to handle the bytes properly
-        if bytes_consumed == 0 {
+        if keyword_bytes.is_empty() {
            return Some(Token::Null);
        }

-        // Reconstruct the keyword from the original input using the position
-        // We can't do this easily without storing the original input
-        // For now, emit a diagnostic and return Null
+        // Emit a diagnostic for unknown keywords
        self.diagnostics.push(Diagnostic::with_dynamic(
            DiagCode::StructUnexpectedByte,
            start as u64,
-            format!("Unknown keyword at offset {}", start),
+            format!("Unknown keyword: {}", String::from_utf8_lossy(&keyword_bytes)),
        ));

-        Some(Token::Null)
+        Some(Token::Keyword(keyword_bytes))
    }

    fn lex_numeric(&mut self) -> Option<Token> {
@ -969,16 +960,14 @@ impl<'a> Lexer<'a> {
                // PDF spec 7.3.8.1: stream keyword must be followed by \n or \r\n
                // A lone \r is INVALID
                let start_pos = self.pos;
-                let has_valid_line_ending = if let Some(&b'\n') = self.bytes.first() {
+                if let Some(&b'\n') = self.bytes.first() {
                    // \n is valid
                    self.advance(1); // consume the \n
-                    true
                } else if let Some(&b'\r') = self.bytes.first() {
                    // \r\n is valid, lone \r is invalid
                    self.advance(1); // consume the \r
                    if let Some(&b'\n') = self.bytes.first() {
                        self.advance(1); // consume the \n
-                        true
                    } else {
                        // Lone \r - invalid
                        self.diagnostics.push(Diagnostic::with_static(
@ -986,7 +975,6 @@ impl<'a> Lexer<'a> {
                            start_pos as u64,
                            "stream keyword must be followed by \\n or \\r\\n, not lone \\r",
                        ));
-                        false
                    }
                } else {
                    // No line ending at all - invalid
@ -995,8 +983,7 @@ impl<'a> Lexer<'a> {
                        start_pos as u64,
                        "stream keyword must be followed by \\n or \\r\\n",
                    ));
-                    false
-                };
+                }

                return Some(Token::Stream);
            }
@ -1006,7 +993,7 @@ impl<'a> Lexer<'a> {
            let next_after = self.bytes.get(10);
            if next_after.map_or(true, |&b| Self::is_pdf_whitespace(b) || Self::is_pdf_delimiter(b)) {
                self.advance(10);
-                return Some(Token::Keyword("startxref"));
+                return Some(Token::Keyword(b"startxref".to_vec()));
            }
        }
        // Not "stream" or "startxref", treat as keyword or name
@ -1173,6 +1160,42 @@ mod tests {
        assert_eq!(lexer.next_token(), Some(Token::Eof));
    }

+    #[test]
+    fn stream_header_valid_line_endings() {
+        // Test \n (valid)
+        let mut lexer = Lexer::new(b"stream\nbody");
+        assert_eq!(lexer.next_token(), Some(Token::Stream));
+        let diags = lexer.take_diagnostics();
+        assert!(diags.is_empty(), "No diagnostics for stream\\n");
+
+        // Test \r\n (valid)
+        let mut lexer = Lexer::new(b"stream\r\nbody");
+        assert_eq!(lexer.next_token(), Some(Token::Stream));
+        let diags = lexer.take_diagnostics();
+        assert!(diags.is_empty(), "No diagnostics for stream\\r\\n");
+    }
+
+    #[test]
+    fn stream_header_lone_cr_emits_diagnostic() {
+        // Lone \r is invalid per PDF spec 7.3.8.1
+        let mut lexer = Lexer::new(b"stream\rbody");
+        assert_eq!(lexer.next_token(), Some(Token::Stream));
+        let diags = lexer.take_diagnostics();
+        assert_eq!(diags.len(), 1);
+        assert_eq!(diags[0].code, DiagCode::StructInvalidStreamHeader);
+        assert!(diags[0].msg.contains("lone \\r"));
+    }
+
+    #[test]
+    fn stream_header_no_line_ending_emits_diagnostic() {
+        // Stream keyword followed by space (not a line ending) is invalid
+        let mut lexer = Lexer::new(b"stream body");
+        assert_eq!(lexer.next_token(), Some(Token::Stream));
+        let diags = lexer.take_diagnostics();
+        assert!(!diags.is_empty(), "Should emit diagnostic for stream without proper line ending");
+        assert!(diags.iter().any(|d| d.code == DiagCode::StructInvalidStreamHeader));
+    }
+
    #[test]
    fn take_diagnostics_returns_empty_for_valid_input() {
        let mut lexer = Lexer::new(b"123");
@ -1866,12 +1889,9 @@ mod tests {
        let mut lexer = Lexer::new(b"/Foo[Bar]");
        assert_eq!(lexer.next_token(), Some(Token::Name(b"Foo".to_vec())));
        assert_eq!(lexer.next_token(), Some(Token::ArrayStart));
-        // Bar is not a name (doesn't start with /), so it's handled as unknown tokens
-        // B -> lex_unknown -> Token::Null (for each character)
-        // The parser at a higher level handles array content differently
-        assert_eq!(lexer.next_token(), Some(Token::Null)); // B
-        assert_eq!(lexer.next_token(), Some(Token::Null)); // a
-        assert_eq!(lexer.next_token(), Some(Token::Null)); // r
+        // Bar is not a name (doesn't start with /), so it's handled as a keyword
+        // The object parser will reject unknown keywords
+        assert_eq!(lexer.next_token(), Some(Token::Keyword(b"Bar".to_vec())));
        assert_eq!(lexer.next_token(), Some(Token::ArrayEnd));
    }

--- a/crates/pdftract-core/src/parser/object/mod.rs
+++ b/crates/pdftract-core/src/parser/object/mod.rs
@ -3,5 +3,7 @@
 //! This module defines the core PDF object types and the object reference type.

 pub mod types;
+pub mod parser;

 pub use types::{ObjRef, PdfObject, PdfDict, PdfStream, PdfIndirect, intern};
+pub use parser::ObjectParser;
--- a/crates/pdftract-core/src/parser/objstm.rs
+++ b/crates/pdftract-core/src/parser/objstm.rs
--- a/notes/pdftract-2hm4.md
+++ b/notes/pdftract-2hm4.md
@ -40,6 +40,14 @@ The hex string lexer (`lex_hex_string()`) was already implemented with:

 - `crates/pdftract-core/src/parser/lexer/mod.rs`: Renamed 6 `DiagCode` enum variants and updated all references; added two hex string proptests

+### 5. Compilation Fixes (2025-05-18)
+
+Fixed compilation errors that were preventing the tests from running:
+
+- `crates/pdftract-core/src/parser/object/mod.rs`: Added `parser` module export and `ObjectParser` to public exports
+- `crates/pdftract-core/src/parser/catalog.rs`: Added `code` field to all `Diagnostic` instantiations (required after Diagnostic struct refactor)
+- `crates/pdftract-core/src/parser/objstm.rs`: Fixed mutability of `diags` in `take_diagnostics()` method
+
 ## Acceptance Criteria Status

 | Criterion | Status | Notes |
--- a/templates/sdk-skeleton/java/src/test/java/com/jedarden/pdftract/ConformanceTest.java.tera
+++ b/templates/sdk-skeleton/java/src/test/java/com/jedarden/pdftract/ConformanceTest.java.tera
@ -82,7 +82,7 @@ class ConformanceTest {
    }

    private void testExtract(Pdftract client, String fixturePath, TestCase tc) throws Exception {
-        Document doc = client.{{ method.camel_name }}(new PathSource(fixturePath), null);
+        Document doc = client.extract(new PathSource(fixturePath), null);

        if (tc.assertions != null && tc.assertions.has("page_count")) {
            assertEquals(tc.assertions.get("page_count").getAsInt(), doc.pages.size());
@ -93,7 +93,7 @@ class ConformanceTest {
    }

    private void testExtractText(Pdftract client, String fixturePath, TestCase tc) throws Exception {
-        String text = client.{{ method.camel_name }}(new PathSource(fixturePath), null);
+        String text = client.extractText(new PathSource(fixturePath), null);

        if (tc.assertions != null && tc.assertions.has("min_length")) {
            assertTrue(text.length() >= tc.assertions.get("min_length").getAsInt());
@ -101,7 +101,7 @@ class ConformanceTest {
    }

    private void testExtractMarkdown(Pdftract client, String fixturePath, TestCase tc) throws Exception {
-        String md = client.{{ method.camel_name }}(new PathSource(fixturePath), null);
+        String md = client.extractMarkdown(new PathSource(fixturePath), null);

        if (tc.assertions != null && tc.assertions.has("min_length")) {
            assertTrue(md.length() >= tc.assertions.get("min_length").getAsInt());
@ -109,7 +109,7 @@ class ConformanceTest {
    }

    private void testGetMetadata(Pdftract client, String fixturePath, TestCase tc) throws Exception {
-        Metadata metadata = client.{{ method.camel_name }}(new PathSource(fixturePath), null);
+        Metadata metadata = client.getMetadata(new PathSource(fixturePath), null);

        if (tc.assertions != null && tc.assertions.has("page_count")) {
            assertEquals(tc.assertions.get("page_count").getAsInt(), metadata.pageCount);
@ -117,7 +117,7 @@ class ConformanceTest {
    }

    private void testHash(Pdftract client, String fixturePath, TestCase tc) throws Exception {
-        Fingerprint fingerprint = client.{{ method.camel_name }}(new PathSource(fixturePath), null);
+        Fingerprint fingerprint = client.hash(new PathSource(fixturePath), null);

        assertEquals(64, fingerprint.hash.length());
        assertEquals(64, fingerprint.fastHash.length());
@ -128,7 +128,7 @@ class ConformanceTest {
    }

    private void testClassify(Pdftract client, String fixturePath, TestCase tc) throws Exception {
-        Classification classification = client.{{ method.camel_name }}(new PathSource(fixturePath));
+        Classification classification = client.classify(new PathSource(fixturePath));

        assertNotNull(classification.category);
        assertTrue(classification.confidence >= 0 && classification.confidence <= 1);
@ -141,7 +141,7 @@ class ConformanceTest {
        }

        String receipt = tc.assertions.get("receipt").getAsString();
-        boolean valid = client.{{ method.camel_name }}(fixturePath, receipt);
+        boolean valid = client.verifyReceipt(fixturePath, receipt);

        if (tc.assertions.has("valid")) {
            assertEquals(tc.assertions.get("valid").getAsBoolean(), valid);