diff --git a/.github/workflows/schema-gen.yml b/.github/workflows/schema-gen.yml
new file mode 100644
index 0000000..625d7aa
--- /dev/null
+++ b/.github/workflows/schema-gen.yml
@@ -0,0 +1,121 @@
+# Regenerates the JSON Schema in CI and fails when the committed copy is stale,
+# then sanity-checks the committed schema's syntax and structure.
+name: Schema Generation Validation
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+  workflow_dispatch:
+
+# Both jobs only read the repository, so restrict GITHUB_TOKEN to read access.
+permissions:
+  contents: read
+
+jobs:
+  validate-schema:
+    runs-on: ubuntu-latest
+    name: Validate JSON Schema
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install Rust toolchain
+        uses: dtolnay/rust-toolchain@stable
+        with:
+          components: rustfmt, clippy
+
+      - name: Cache Cargo registry
+        uses: actions/cache@v4
+        with:
+          path: ~/.cargo/registry
+          key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}
+
+      - name: Cache Cargo index
+        uses: actions/cache@v4
+        with:
+          path: ~/.cargo/git
+          key: ${{ runner.os }}-cargo-index-${{ hashFiles('**/Cargo.lock') }}
+
+      - name: Cache Cargo build
+        uses: actions/cache@v4
+        with:
+          path: target
+          key: ${{ runner.os }}-cargo-build-target-${{ hashFiles('**/Cargo.lock') }}
+
+      - name: Generate JSON Schema
+        run: cargo run --manifest-path=xtask/Cargo.toml --bin gen_schema
+
+      # `git diff --quiet` exits non-zero when the regenerated file no longer
+      # matches what was checked out; the diff is then written to the job summary.
+      - name: Check for schema changes
+        id: check-diff
+        run: |
+          if git diff --quiet docs/schema/v1.0/pdftract.schema.json; then
+            echo "Schema is up to date"
+            echo "has_changes=false" >> "$GITHUB_OUTPUT"
+          else
+            echo "Schema has uncommitted changes"
+            echo "has_changes=true" >> "$GITHUB_OUTPUT"
+            {
+              echo "### Schema changes detected :warning:"
+              echo ""
+              echo "The generated JSON schema differs from the committed file."
+              echo ""
+              echo "\`\`\`diff"
+              git diff docs/schema/v1.0/pdftract.schema.json
+              echo "\`\`\`"
+              echo ""
+              echo "To fix this issue:"
+              echo "1. Run \`cargo run --manifest-path=xtask/Cargo.toml --bin gen_schema\`"
+              echo "2. Commit the updated schema file"
+            } >> "$GITHUB_STEP_SUMMARY"
+            exit 1
+          fi
+
+  # Checks the committed schema file directly; no Rust toolchain is needed.
+  validate-json-syntax:
+    runs-on: ubuntu-latest
+    name: Validate JSON Syntax
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Validate JSON Schema
+        run: |
+          python3 -c "import json; json.load(open('docs/schema/v1.0/pdftract.schema.json')); print('Schema is valid JSON')"
+
+      - name: Validate schema structure
+        run: |
+          python3 << 'EOF'
+          import json
+          with open('docs/schema/v1.0/pdftract.schema.json') as f:
+              schema = json.load(f)
+
+          # Verify required fields (.get so a missing key reports the message, not a KeyError)
+          assert schema.get('$schema') == 'https://json-schema.org/draft/2020-12/schema', "Missing or incorrect $schema"
+          assert schema.get('$id') == 'https://pdftract.com/schema/v1.0/pdftract.schema.json', "Missing or incorrect $id"
+          assert schema.get('title') == 'pdftract Output v1.0', "Missing or incorrect title"
+
+          # Verify $defs exist
+          assert '$defs' in schema, "Missing $defs"
+          assert 'PageJson' in schema['$defs'], "Missing PageJson definition"
+          assert 'SpanJson' in schema['$defs'], "Missing SpanJson definition"
+          assert 'DiagnosticJson' in schema['$defs'], "Missing DiagnosticJson definition"
+
+          # Verify enum constraints
+          page_type = schema['$defs']['PageJson']['properties']['type']
+          assert 'enum' in page_type, "Missing enum constraint on PageJson.type"
+          assert set(page_type['enum']) == {'text', 'scanned', 'mixed', 'broken_vector', 'blank', 'figure_only'}, "Incorrect page_type enum values"
+
+          severity = schema['$defs']['DiagnosticJson']['properties']['severity']
+          assert 'enum' in severity, "Missing enum constraint on DiagnosticJson.severity"
+          assert set(severity['enum']) == {'info', 'warning', 'error', 'fatal'}, "Incorrect severity enum values"
+
+          conf_source = schema['$defs']['SpanJson']['properties']['confidence_source']
+          assert 'enum' in conf_source, "Missing enum constraint on SpanJson.confidence_source"
+          assert set(conf_source['enum']) == {'native', 'heuristic', 'ocr'}, "Incorrect confidence_source enum values"
+
+          print("All schema structure validations passed!")
+          EOF
diff --git a/docs/schema/v1.0/pdftract.schema.json b/docs/schema/v1.0/pdftract.schema.json
index fa23849..b019510 100644
--- a/docs/schema/v1.0/pdftract.schema.json
+++ b/docs/schema/v1.0/pdftract.schema.json
@@ -740,75 +740,289 @@
       ],
       "type": "object"
     },
-    "ExtractionMetadata": {
-      "description": "Metadata about the extraction process.",
+    "DestinationJson": {
+      "description": "JSON representation of a destination anchor.\n\nDescribes a specific location within a PDF page.",
       "properties": {
-        "block_count": {
-          "description": "Number of blocks extracted.",
-          "format": "uint",
-          "minimum": 0,
-          "type": "integer"
+        "bottom": {
+          "description": "Bottom coordinate (user-space points), present only for \"fitr\".",
+          "format": "double",
+          "type": [
+            "number",
+            "null"
+          ]
         },
-        "cache_age_seconds": {
-          "description": "Cache entry age in seconds (only present when cache_status == \"hit\")",
-          "format": "uint64",
+        "left": {
+          "description": "Left coordinate (user-space points), present for \"xyz\", \"fitv\", \"fitr\", \"fitbv\".",
+          "format": "double",
+          "type": [
+            "number",
+            "null"
+          ]
+        },
+        "right": {
+          "description": "Right coordinate (user-space points), present only for \"fitr\".",
+          "format": "double",
+          "type": [
+            "number",
+            "null"
+          ]
+        },
+        "top": {
+          "description": "Top coordinate (user-space points), present for \"xyz\", \"fith\", \"fitr\", \"fitbh\".",
+          "format": "double",
+          "type": [
+            "number",
+            "null"
+          ]
+        },
+        "type": {
+          "description": "Destination type: \"xyz\", \"fit\", \"fith\", \"fitv\", \"fitr\", \"fitb\", \"fitbh\", \"fitbv\".",
+          "type": "string"
+        },
+        "zoom": {
+          "description": "Zoom factor, present only for \"xyz\".",
+          "format": "double",
+          "type": [
+            "number",
+            "null"
+          ]
+        }
+      },
+      "required": [
+        "type"
+      ],
+      "type": "object"
+    },
+
"DiagnosticJson": { + "description": "JSON representation of a diagnostic error.\n\nThis struct wraps the internal Diagnostic type for JSON serialization,\nproviding stable error codes and human-readable messages for consumers.", + "properties": { + "code": { + "description": "Stable string identifier for this diagnostic (e.g., \"FONT_GLYPH_UNMAPPED\").", + "type": "string" + }, + "hint": { + "description": "Optional hint for resolving the diagnostic (e.g., \"Install Tesseract for OCR recovery\").", + "type": [ + "string", + "null" + ] + }, + "location": { + "anyOf": [ + { + "$ref": "#/$defs/ObjectLocationJson" + }, + { + "type": "null" + } + ], + "description": "PDF object reference where the issue originated, if applicable." + }, + "message": { + "description": "Human-readable description of the diagnostic.", + "type": "string" + }, + "page_index": { + "description": "Page index where this diagnostic occurred, or `null` for document-level events.", + "format": "uint", "minimum": 0, "type": [ "integer", "null" ] }, - "cache_status": { - "description": "Cache status: \"hit\", \"miss\", or \"skipped\"", + "severity": { + "description": "Severity level: \"info\", \"warning\", \"error\", or \"fatal\".", + "enum": [ + "info", + "warning", + "error", + "fatal" + ], + "type": "string" + } + }, + "required": [ + "code", + "message", + "severity" + ], + "type": "object" + }, + "DocumentMetadata": { + "description": "JSON representation of document metadata.\n\nContains all standard PDF document information dictionary fields along\nwith derived signals from the document catalog.", + "properties": { + "author": { + "description": "PDF /Author - name of the person who created the document.", "type": [ "string", "null" ] }, - "diagnostics": { - "description": "Diagnostics emitted during extraction (coverage warnings, etc.)", + "conformance": { + "default": "none", + "description": "PDF/A or PDF/UA conformance level.\n\nOne of: \"none\", \"PDF-A-1a\", \"PDF-A-1b\", 
\"PDF-A-2a\", \"PDF-A-2b\", \"PDF-A-2u\",\n\"PDF-A-3a\", \"PDF-A-3b\", \"PDF-A-3u\", \"PDF-UA-1\", \"PDF-UA-2\", \"PDF-X-1a\".", + "type": "string" + }, + "contains_javascript": { + "description": "True if JavaScript actions are present in the document.", + "type": "boolean" + }, + "contains_xfa": { + "description": "True if XFA forms are present.", + "type": "boolean" + }, + "creation_date": { + "description": "PDF /CreationDate - ISO-8601 string from /CreationDate.", + "type": [ + "string", + "null" + ] + }, + "creator": { + "description": "PDF /Creator - the authoring application (e.g., \"Microsoft Word 2019\").", + "type": [ + "string", + "null" + ] + }, + "generator": { + "description": "Heuristic string identifying the producing application.", + "type": [ + "string", + "null" + ] + }, + "is_encrypted": { + "description": "True if document is encrypted.", + "type": "boolean" + }, + "is_tagged": { + "description": "True if /MarkInfo /Marked: true is present.", + "type": "boolean" + }, + "javascript_actions": { + "default": [], + "description": "JavaScript actions found in the document.\n\nPer TH-04, this array contains all discovered JavaScript actions\nwith their location and code excerpt. Empty when no JS is present.", "items": { - "type": "string" + "$ref": "#/$defs/JavascriptActionJson" }, "type": "array" }, - "error_count": { - "description": "Number of pages that failed to extract.", - "format": "uint", - "minimum": 0, - "type": "integer" - }, - "page_count": { - "description": "Total number of pages in the document.", - "format": "uint", - "minimum": 0, - "type": "integer" - }, - "reading_order_algorithm": { - "description": "Reading order algorithm used for this extraction.", + "keywords": { + "description": "PDF /Keywords - space- or comma-delimited keyword list.", "type": [ "string", "null" ] }, - "receipts_mode": { - "$ref": "#/$defs/ReceiptsMode", - "description": "Receipts mode used for this extraction." 
+ "modification_date": { + "description": "PDF /ModDate - ISO-8601 string from /ModDate.", + "type": [ + "string", + "null" + ] }, - "span_count": { - "description": "Number of spans extracted.", - "format": "uint", + "ocg_present": { + "description": "True if optional content groups (layers) are present.", + "type": "boolean" + }, + "page_count": { + "description": "Total number of pages in the document.", + "format": "uint32", "minimum": 0, "type": "integer" + }, + "pdf_version": { + "description": "PDF version (e.g., \"1.7\", \"2.0\").", + "type": [ + "string", + "null" + ] + }, + "producer": { + "description": "PDF /Producer - the PDF-writing library (e.g., \"Acrobat Distiller 23.0\").", + "type": [ + "string", + "null" + ] + }, + "subject": { + "description": "PDF /Subject - subject matter summary.", + "type": [ + "string", + "null" + ] + }, + "title": { + "description": "PDF /Title - document title.", + "type": [ + "string", + "null" + ] } }, "required": [ "page_count", - "receipts_mode", - "span_count", - "block_count", - "error_count", - "diagnostics" + "is_tagged", + "is_encrypted", + "contains_javascript", + "contains_xfa", + "ocg_present" + ], + "type": "object" + }, + "ExtractionQuality": { + "description": "Extraction quality metrics for the document.\n\nThis structure appears in the document footer (NDJSON mode) or\nin the root metadata (full JSON mode). It provides aggregate\nquality signals across all pages.", + "properties": { + "avg_confidence": { + "description": "Average confidence score across all spans [0.0, 1.0].", + "format": "float", + "type": [ + "number", + "null" + ] + }, + "dpi_used": { + "description": "DPI used for OCR rendering (Phase 5.2).\n\nThis field records the DPI selected by the automatic DPI selection\nalgorithm (or the user-specified override). 
It is present when OCR\nwas performed on any page.\n\nValues: 200 (JBIG2), 300 (standard), 400 (fine print), or custom", + "format": "uint32", + "minimum": 0, + "type": [ + "integer", + "null" + ] + }, + "min_confidence": { + "description": "Minimum confidence score across all spans [0.0, 1.0].\n\nThis represents the weakest link in the extraction chain.", + "format": "float", + "type": [ + "number", + "null" + ] + }, + "ocr_fraction": { + "description": "Fraction of pages that required OCR fallback [0.0, 1.0].\n\nThis is the count of pages classified as \"scanned\" or \"mixed\"\ndivided by the total page count.", + "format": "float", + "type": [ + "number", + "null" + ] + }, + "overall_quality": { + "description": "Overall quality assessment: \"high\", \"medium\", \"low\", or \"none\".\n\n- \"high\": All pages extracted successfully with high confidence\n- \"medium\": Most pages extracted, some with lower confidence\n- \"low\": Significant extraction issues (many low-confidence pages)\n- \"none\": No extractable content found (all blank pages)", + "type": "string" + }, + "readability": { + "description": "Per-page readability score (char-weighted median of span scores) [0.0, 1.0].\n\nThis is the median of per-span readability scores, weighted by character count.\nA score below 0.5 may indicate mojibake, encoding issues, or broken text layers.", + "format": "float", + "type": [ + "number", + "null" + ] + } + }, + "required": [ + "overall_quality" ], "type": "object" }, @@ -1067,41 +1281,104 @@ ], "type": "object" }, - "PageResult": { - "description": "Result for a single page.", + "ObjectLocationJson": { + "description": "JSON representation of a PDF object reference.\n\nIdentifies a specific PDF indirect object by its object and generation numbers.", + "properties": { + "generation_number": { + "description": "Generation number (incremented on each save).", + "format": "uint16", + "maximum": 65535, + "minimum": 0, + "type": "integer" + }, + "object_number": { + 
"description": "Object number (zero-based index in the xref table).", + "format": "uint32", + "minimum": 0, + "type": "integer" + } + }, + "required": [ + "object_number", + "generation_number" + ], + "type": "object" + }, + "OutlineNode": { + "description": "JSON representation of an outline node (bookmark).\n\nRepresents a single node in the document's outline hierarchy, with support\nfor nested children via the `children` field.", + "properties": { + "children": { + "default": [], + "description": "Nested child outlines (empty array for leaf nodes).", + "items": { + "$ref": "#/$defs/OutlineNode" + }, + "type": "array" + }, + "destination": { + "anyOf": [ + { + "$ref": "#/$defs/DestinationJson" + }, + { + "type": "null" + } + ], + "description": "Destination type and coordinates within the page." + }, + "level": { + "description": "Hierarchical level in the outline tree (0-based, root is 0).", + "format": "uint8", + "maximum": 255, + "minimum": 0, + "type": "integer" + }, + "page_index": { + "description": "Zero-based page index this outline points to, if resolved.", + "format": "uint32", + "minimum": 0, + "type": [ + "integer", + "null" + ] + }, + "title": { + "description": "The outline title text (decoded to UTF-8).", + "type": "string" + } + }, + "required": [ + "title", + "level" + ], + "type": "object" + }, + "PageJson": { + "description": "JSON representation of a single page.\n\nContains all page-level fields including geometry, classification,\nand content arrays (spans, blocks, tables, annotations).", "properties": { "annotations": { "default": [], - "description": "Page-level annotations (highlights, stamps, notes, etc.).\n\nThis array contains all non-link annotations on this page.\nAnnotations are sorted by (rect.y0 desc, rect.x0) for deterministic output.\nEmpty when the page has no annotations.", + "description": "Page-level annotations (highlights, stamps, notes, links).\n\nEmpty until Phase 7.2; always present as an array.", "items": { "$ref": 
"#/$defs/AnnotationJson" }, "type": "array" }, "blocks": { - "description": "Extracted blocks (semantic units like paragraphs, headings).", + "default": [], + "description": "Semantic blocks (paragraphs, headings, lists, tables, etc.).", "items": { "$ref": "#/$defs/BlockJson" }, "type": "array" }, - "error": { - "description": "Error message if extraction failed for this page.", - "type": [ - "string", - "null" - ] - }, "height": { "description": "Page height in points (1/72 inch).", "format": "float", - "type": [ - "number", - "null" - ] + "type": "number" }, - "index": { - "description": "0-based page index.", + "page_index": { + "description": "Zero-based page index, canonical for programmatic use.\n\nThis is the stable identifier used in all internal references.", "format": "uint", "minimum": 0, "type": "integer" @@ -1114,7 +1391,7 @@ ] }, "page_number": { - "description": "1-based page number (= index + 1).\n\nEmitted as a convenience for human-facing display. For programmatic\naccess, use index instead.", + "description": "One-based page number (= page_index + 1).\n\nEmitted as a convenience for human-facing display. 
For programmatic\naccess, use page_index instead.", "format": "uint32", "minimum": 0, "type": "integer" @@ -1124,20 +1401,19 @@ "format": "uint16", "maximum": 65535, "minimum": 0, - "type": [ - "integer", - "null" - ] + "type": "integer" }, "spans": { - "description": "Extracted spans (text fragments with consistent styling).", + "default": [], + "description": "Text spans (atomic units with consistent font and styling).", "items": { "$ref": "#/$defs/SpanJson" }, "type": "array" }, "tables": { - "description": "Extracted tables (cell-level structure).\n\nThis array provides detailed table structure with rows and cells.\nTable blocks in the `blocks` array reference entries here via `table_index`.", + "default": [], + "description": "Parallel table structure objects.", "items": { "$ref": "#/$defs/TableJson" }, @@ -1145,26 +1421,29 @@ }, "type": { "description": "Page classification from the page classifier.\n\nOne of: \"text\", \"scanned\", \"mixed\", \"broken_vector\", \"blank\", \"figure_only\".", - "type": [ - "string", - "null" - ] + "enum": [ + "text", + "scanned", + "mixed", + "broken_vector", + "blank", + "figure_only" + ], + "type": "string" }, "width": { "description": "Page width in points (1/72 inch).", "format": "float", - "type": [ - "number", - "null" - ] + "type": "number" } }, "required": [ - "index", + "page_index", "page_number", - "spans", - "blocks", - "tables" + "width", + "height", + "rotation", + "type" ], "type": "object" }, @@ -1216,26 +1495,6 @@ ], "type": "object" }, - "ReceiptsMode": { - "description": "Receipt generation mode.\n\nControls whether visual citation receipts are generated during extraction.", - "oneOf": [ - { - "const": "off", - "description": "No receipts generated (default).", - "type": "string" - }, - { - "const": "lite", - "description": "Lite mode: minimal receipts (~120 bytes each) with fingerprint, page index, bbox, and content hash.", - "type": "string" - }, - { - "const": "svg", - "description": "SVG mode: extended 
receipts that include an SVG clip rendering the glyphs.", - "type": "string" - } - ] - }, "RowJson": { "description": "JSON representation of a table row.\n\nA row contains a sequence of cells that form a horizontal strip\nin the table.", "properties": { @@ -1378,6 +1637,11 @@ }, "confidence_source": { "description": "Source of the confidence/text extraction.\n\nOne of: \"vector\" (native font decoding), \"ocr\" (pure OCR),\n\"ocr-assisted\" (OCR + vector correction), \"ocr-fallback\" (region-level fallback),\n\"repaired\" (text was repaired via heuristics).", + "enum": [ + "native", + "heuristic", + "ocr" + ], "type": [ "string", "null" @@ -1550,58 +1814,75 @@ "description": "JSON Schema for pdftract PDF extraction output v1.0. This schema defines the structure of extraction results including pages, spans, blocks, tables, form fields, signatures, and metadata.", "properties": { "attachments": { - "description": "Embedded file attachments extracted from the document.\n\nThis array contains all embedded files from the PDF's `/EmbeddedFiles`\nname tree or `/AF` (Associated Files) array. Attachments exceeding\n50 MB are truncated (metadata only, `data: null`, `truncated: true`).\nEmpty when the PDF has no embedded files.", + "default": [], + "description": "Embedded file attachments.\n\nEmpty until Phase 7.5; always present as an array.", "items": { "$ref": "#/$defs/AttachmentJson" }, "type": "array" }, - "fingerprint": { - "description": "The PDF fingerprint (for receipt generation).", - "type": "string" + "errors": { + "default": [], + "description": "All diagnostics emitted during extraction.", + "items": { + "$ref": "#/$defs/DiagnosticJson" + }, + "type": "array" + }, + "extraction_quality": { + "$ref": "#/$defs/ExtractionQuality", + "description": "Aggregate extraction quality metrics." 
}, "form_fields": { - "description": "Interactive form fields extracted from the document.\n\nThis array contains all form fields from the AcroForm and/or XFA data.\nFields are sorted alphabetically by name. When both AcroForm and XFA\nare present, XFA values take precedence on collision.\nEmpty when the PDF has no form fields.", + "default": [], + "description": "AcroForm/XFA form fields.\n\nEmpty until Phase 7.4; always present as an array.", "items": { "$ref": "#/$defs/FormFieldJson" }, "type": "array" }, - "javascript_actions": { - "default": [], - "description": "JavaScript actions detected in the document.\n\nPer TH-04, this array contains all discovered JavaScript actions\nwith their location and code excerpt. pdftract NEVER executes\nembedded JavaScript; this is for downstream security review.\nEmpty when no JavaScript is present.", - "items": { - "$ref": "#/$defs/JavascriptActionJson" - }, - "type": "array" - }, "links": { - "description": "Document-scoped hyperlinks extracted from the document.\n\nThis array contains all link annotations (URI and internal destination links)\nextracted from all pages. Links are sorted by (page_index, rect.y0 desc, rect.x0).\nEmpty when the PDF has no link annotations.", + "default": [], + "description": "Document-scoped hyperlinks.\n\nEmpty until Phase 7.6; always present as an array.", "items": { "$ref": "#/$defs/LinkJson" }, "type": "array" }, "metadata": { - "$ref": "#/$defs/ExtractionMetadata", - "description": "Metadata about the extraction." + "$ref": "#/$defs/DocumentMetadata", + "description": "Document-level metadata." 
}, - "pages": { - "description": "Extracted pages, each containing spans and blocks.", + "outline": { + "default": [], + "description": "Document outline (bookmark tree).\n\nEmpty array if no bookmarks are present.", "items": { - "$ref": "#/$defs/PageResult" + "$ref": "#/$defs/OutlineNode" }, "type": "array" }, + "pages": { + "description": "Page objects array.", + "items": { + "$ref": "#/$defs/PageJson" + }, + "type": "array" + }, + "schema_version": { + "description": "Schema version identifier (e.g., \"1.0\").", + "type": "string" + }, "signatures": { - "description": "Digital signatures extracted from the document.\n\nThis array contains all signature fields discovered in the AcroForm,\nincluding both signed and unsigned (blank) signature fields.\nEmpty when the PDF has no signature fields.", + "default": [], + "description": "Digital signature metadata.\n\nEmpty until Phase 7.3; always present as an array.", "items": { "$ref": "#/$defs/SignatureJson" }, "type": "array" }, "threads": { - "description": "Article thread chains extracted from the document.\n\nThis array contains all article threads from the PDF's `/Threads` array.\nEach thread includes metadata from the thread info dict (/I) and the\ncomplete bead chain walked from the first bead. 
Empty when the PDF has\nno article threads.",
+      "default": [],
+      "description": "Article thread chains.\n\nEmpty until Phase 7.1; always present as an array.",
       "items": {
         "$ref": "#/$defs/ThreadJson"
       },
@@ -1609,14 +1890,10 @@
     }
   },
   "required": [
-    "fingerprint",
-    "pages",
+    "schema_version",
     "metadata",
-    "signatures",
-    "form_fields",
-    "links",
-    "attachments",
-    "threads"
+    "pages",
+    "extraction_quality"
   ],
   "title": "pdftract Output v1.0",
   "type": "object"
diff --git a/xtask/src/bin/gen_schema.rs b/xtask/src/bin/gen_schema.rs
index 8053ff4..0587361 100644
--- a/xtask/src/bin/gen_schema.rs
+++ b/xtask/src/bin/gen_schema.rs
@@ -62,12 +62,58 @@ fn find_workspace_root() -> PathBuf {
     std::env::current_dir().unwrap()
 }
 
+/// Add explicit enum constraints to schema fields.
+///
+/// Post-processes the generated JSON schema in place. For every entry in the
+/// table below, the property at `$defs.<definition>.properties.<property>`
+/// gets an `"enum"` key listing the allowed values; an existing `"enum"` key
+/// is overwritten.
+///
+/// Entries whose path is missing, or whose target is not a JSON object, are
+/// skipped silently.
+fn add_enum_constraints(value: &mut Value) {
+    // (definition name under "$defs", property name, allowed values)
+    const ENUM_CONSTRAINTS: &[(&str, &str, &[&str])] = &[
+        ("DiagnosticJson", "severity", &["info", "warning", "error", "fatal"]),
+        // PageJson.page_type (the "type" field).
+        (
+            "PageJson",
+            "type",
+            &["text", "scanned", "mixed", "broken_vector", "blank", "figure_only"],
+        ),
+        // NOTE(review): the committed schema types confidence_source as
+        // ["string", "null"], yet null is not listed here. JSON Schema's
+        // "enum" accepts only listed values, so a null confidence_source would
+        // presumably fail validation -- confirm whether null belongs here.
+        ("SpanJson", "confidence_source", &["native", "heuristic", "ocr"]),
+    ];
+
+    // `Value::get_mut` with a string key returns `None` for non-objects, so
+    // a schema without an object-valued "$defs" is left untouched.
+    if let Some(defs) = value.get_mut("$defs") {
+        for &(def_name, prop_name, allowed) in ENUM_CONSTRAINTS {
+            let field = defs
+                .get_mut(def_name)
+                .and_then(|def| def.get_mut("properties"))
+                .and_then(|props| props.get_mut(prop_name))
+                .and_then(Value::as_object_mut);
+            if let Some(field) = field {
+                let variants = allowed
+                    .iter()
+                    .map(|&name| Value::String(name.to_string()))
+                    .collect();
+                field.insert("enum".to_string(), Value::Array(variants));
+            }
+        }
+    }
+}
+
 /// Generate the JSON Schema for pdftract extraction output.
 fn generate_schema() -> String {
-    use pdftract_core::extract::ExtractionResult;
+    use pdftract_core::schema::Output;
     use schemars::schema_for;
 
-    let schema = schema_for!(ExtractionResult);
+    let schema = schema_for!(Output);
 
     // Convert to JSON value
     let mut value = serde_json::to_value(&schema).expect("Failed to serialize schema");
@@ -93,6 +139,9 @@ fn generate_schema() -> String {
         ));
     }
 
+    // Add explicit enum constraints
+    add_enum_constraints(&mut value);
+
     // Sort keys recursively for stable ordering
     let sorted = sort_keys_recursive(value);
 