feat(pdftract-zy2jx): generate JSON Schema from Rust output types

- Add schemars dependency to pdftract-core (v1.2)
- Add JsonSchema derives to output types (ExtractionResult, PageResult,
  ExtractionMetadata, SpanJson, BlockJson, CellJson, RowJson, TableJson,
  ExtractionQuality, Receipt, ReceiptsMode)
- Create xtask/src/bin/gen_schema.rs for schema generation
- Add gen-schema command to xtask main.rs
- Generate docs/schema/v1.0/pdftract.schema.json using Draft 2020-12

Schema includes:
- $schema: "https://json-schema.org/draft/2020-12/schema"
- $defs with all output type definitions
- Proper type annotations for all fields

Closes: pdftract-zy2jx
2026-05-24 01:29:14 -04:00 · 2026-05-24 01:29:14 -04:00 · 92e90af0b0
commit 92e90af0b0
parent d723427da7
10 changed files with 610 additions and 292 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -2302,7 +2302,7 @@ dependencies = [
 "pdftract-core",
 "regex",
 "reqwest",
- "schemars",
+ "schemars 0.8.22",
 "secrecy",
 "semver",
 "serde",
@ -2349,6 +2349,7 @@ dependencies = [
 "quick-xml",
 "rayon",
 "regex",
+ "schemars 1.2.1",
 "secrecy",
 "serde",
 "serde_json",
@ -2967,6 +2968,26 @@ dependencies = [
 "thiserror 1.0.69",
 ]

+[[package]]
+name = "ref-cast"
+version = "1.0.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d"
+dependencies = [
+ "ref-cast-impl",
+]
+
+[[package]]
+name = "ref-cast-impl"
+version = "1.0.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
 [[package]]
 name = "regex"
 version = "1.12.3"
@ -3170,7 +3191,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615"
 dependencies = [
 "dyn-clone",
- "schemars_derive",
+ "schemars_derive 0.8.22",
+ "serde",
+ "serde_json",
+]
+
+[[package]]
+name = "schemars"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc"
+dependencies = [
+ "dyn-clone",
+ "ref-cast",
+ "schemars_derive 1.2.1",
 "serde",
 "serde_json",
 ]
@ -3187,6 +3221,18 @@ dependencies = [
 "syn 2.0.117",
 ]

+[[package]]
+name = "schemars_derive"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d115b50f4aaeea07e79c1912f645c7513d81715d0420f8bc77a18c6260b307f"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "serde_derive_internals",
+ "syn 2.0.117",
+]
+
 [[package]]
 name = "scopeguard"
 version = "1.2.0"
--- a/crates/pdftract-core/Cargo.toml
+++ b/crates/pdftract-core/Cargo.toml
@ -21,6 +21,7 @@ regex = "1.10"
 secrecy = { workspace = true }
 serde = { version = "1.0", features = ["derive"], optional = true }
 serde_json = { version = "1.0", optional = true }
+schemars = { version = "1.2", features = ["derive"], optional = true }
 sha2 = "0.10"
 thiserror = { workspace = true }
 memchr = { workspace = true }
@ -38,7 +39,8 @@ quick-xml = { version = "0.36", optional = true }

 [features]
 default = ["serde"]
-serde = ["dep:serde", "dep:serde_json"]
+serde = ["dep:serde", "dep:serde_json", "dep:schemars"]
+schemars = ["dep:schemars", "serde"]
 receipts = []  # Enable visual citation receipts (SVG clip generation)
 ocr = ["dep:image", "dep:leptonica-plumbing", "dep:quick-xml"]  # Enable OCR path (image compositing + preprocessing + HOCR parsing)
 full-render = ["dep:pdfium-render", "ocr"]  # Enable PDFium-based rendering (requires ocr)
--- a/crates/pdftract-core/src/extract.rs
+++ b/crates/pdftract-core/src/extract.rs
@ -28,6 +28,8 @@ use anyhow::{Context, Result};
 use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 use serde_json::json;
+#[cfg(feature = "schemars")]
+use schemars::JsonSchema;
 use std::sync::Arc;
 use crate::parser::stream::FileSource;

@ -102,6 +104,7 @@ fn decode_page_content_streams(
 ///
 /// Contains the extracted pages, spans, blocks, and metadata.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct ExtractionResult {
    /// The PDF fingerprint (for receipt generation).
    pub fingerprint: String,
@ -113,6 +116,7 @@ pub struct ExtractionResult {

 /// Result for a single page.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct PageResult {
    /// 0-based page index.
    pub index: usize,
@ -177,6 +181,7 @@ impl From<PageResultInternal> for PageResult {

 /// Metadata about the extraction process.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct ExtractionMetadata {
    /// Total number of pages in the document.
    pub page_count: usize,
--- a/crates/pdftract-core/src/options.rs
+++ b/crates/pdftract-core/src/options.rs
@ -4,11 +4,14 @@
 //! including the receipts mode for cryptographic provenance tracking.

 use serde::{Deserialize, Serialize};
+#[cfg(feature = "schemars")]
+use schemars::JsonSchema;

 /// Receipt generation mode.
 ///
 /// Controls whether visual citation receipts are generated during extraction.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 #[serde(rename_all = "lowercase")]
 pub enum ReceiptsMode {
    /// No receipts generated (default).
--- a/crates/pdftract-core/src/receipts/mod.rs
+++ b/crates/pdftract-core/src/receipts/mod.rs
@ -26,6 +26,8 @@ pub mod svg;
 pub mod verifier;

 use serde::{Deserialize, Serialize};
+#[cfg(feature = "schemars")]
+use schemars::JsonSchema;

 /// A visual citation receipt for extracted text.
 ///
@ -59,6 +61,7 @@ use serde::{Deserialize, Serialize};
 /// }
 /// ```
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct Receipt {
    /// Phase 1.7 fingerprint of the source PDF.
    ///
--- a/crates/pdftract-core/src/schema/mod.rs
+++ b/crates/pdftract-core/src/schema/mod.rs
@ -18,6 +18,8 @@

 use serde::{Deserialize, Serialize};
 use serde_json::json;
+#[cfg(feature = "schemars")]
+use schemars::JsonSchema;

 use crate::receipts::Receipt;

@ -26,6 +28,7 @@ use crate::receipts::Receipt;
 /// A span is the smallest unit of extracted text, representing a
 /// contiguous run of text with consistent font and styling.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct SpanJson {
    /// The extracted text content.
    pub text: String,
@ -64,6 +67,7 @@ pub struct SpanJson {
 /// spans. Examples include paragraphs, headings, list items, and
 /// table cells.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct BlockJson {
    /// The block kind/type.
    ///
@ -112,6 +116,7 @@ pub type SpanRef = usize;
 /// A cell represents a single unit within a table row, containing
 /// its text content, bounding box, and position information.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct CellJson {
    /// Bounding box in PDF user-space points.
    ///
@ -163,6 +168,7 @@ fn default_one() -> u32 {
 /// A row contains a sequence of cells that form a horizontal strip
 /// in the table.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct RowJson {
    /// Bounding box in PDF user-space points.
    ///
@ -185,6 +191,7 @@ pub struct RowJson {
 /// provides the concatenated text and position, while the TableJson
 /// provides full cell-level structure.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct TableJson {
    /// Unique identifier for this table (e.g., "table_0").
    pub id: String,
@ -231,6 +238,7 @@ pub struct TableJson {
 /// in the root metadata (full JSON mode). It provides aggregate
 /// quality signals across all pages.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct ExtractionQuality {
    /// Overall quality assessment: "high", "medium", "low", or "none".
    ///
--- a/docs/schema/v1.0/pdftract.schema.json
+++ b/docs/schema/v1.0/pdftract.schema.json
@ -1,345 +1,489 @@
 {
-  "$schema": "http://json-schema.org/draft-07/schema#",
-  "$id": "https://pdftract.ardenone.com/schemas/v1.0/pdftract.schema.json",
-  "title": "PDFtract Extraction Output Schema v1.0",
-  "description": "JSON output schema for PDF text and structure extraction",
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "title": "ExtractionResult",
+  "description": "Result of a PDF extraction operation.\n\nContains the extracted pages, spans, blocks, and metadata.",
  "type": "object",
-  "required": ["fingerprint", "schema_version", "pages", "metadata"],
  "properties": {
    "fingerprint": {
-      "type": "string",
-      "description": "PDF fingerprint for verification (format: pdftract-v1:<hex>)"
-    },
-    "schema_version": {
-      "type": "string",
-      "description": "Schema version (e.g., '1.0')",
-      "enum": ["1.0"]
-    },
-    "pages": {
-      "type": "array",
-      "description": "Extracted pages",
-      "items": {
-        "$ref": "#/definitions/page"
-      }
+      "description": "The PDF fingerprint (for receipt generation).",
+      "type": "string"
    },
    "metadata": {
-      "$ref": "#/definitions/metadata"
+      "description": "Metadata about the extraction.",
+      "$ref": "#/$defs/ExtractionMetadata"
+    },
+    "pages": {
+      "description": "Extracted pages, each containing spans and blocks.",
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/PageResult"
+      }
    }
  },
-  "definitions": {
-    "page": {
+  "required": [
+    "fingerprint",
+    "pages",
+    "metadata"
+  ],
+  "$defs": {
+    "BlockJson": {
+      "description": "JSON representation of a structural block.\n\nA block is a higher-level semantic unit composed of one or more\nspans. Examples include paragraphs, headings, list items, and\ntable cells.",
      "type": "object",
-      "required": ["index", "spans", "blocks", "tables"],
      "properties": {
-        "index": {
-          "type": "integer",
-          "description": "0-based page index"
-        },
-        "spans": {
-          "type": "array",
-          "description": "Extracted text spans",
-          "items": {
-            "$ref": "#/definitions/span"
-          }
-        },
-        "blocks": {
-          "type": "array",
-          "description": "Extracted structural blocks",
-          "items": {
-            "$ref": "#/definitions/block"
-          }
-        },
-        "tables": {
-          "type": "array",
-          "description": "Extracted tables (cell-level structure)",
-          "items": {
-            "$ref": "#/definitions/table"
-          }
-        },
-        "error": {
-          "type": "string",
-          "description": "Error message if extraction failed for this page"
-        }
-      }
-    },
-    "span": {
-      "type": "object",
-      "required": ["text", "bbox", "font", "size"],
-      "properties": {
-        "text": {
-          "type": "string",
-          "description": "The extracted text content"
-        },
        "bbox": {
+          "description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
          "type": "array",
-          "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
          "items": {
-            "type": "number"
+            "type": "number",
+            "format": "double"
          },
-          "minItems": 4,
-          "maxItems": 4
+          "maxItems": 4,
+          "minItems": 4
        },
-        "font": {
-          "type": "string",
-          "description": "Font name or identifier"
-        },
-        "size": {
-          "type": "number",
-          "description": "Font size in points"
-        },
-        "confidence": {
-          "type": "number",
-          "description": "Confidence score (0.0 to 1.0) for OCR text",
-          "minimum": 0.0,
-          "maximum": 1.0
-        },
-        "receipt": {
-          "$ref": "#/definitions/receipt"
-        }
-      }
-    },
-    "block": {
-      "type": "object",
-      "required": ["kind", "text", "bbox"],
-      "properties": {
        "kind": {
-          "type": "string",
-          "description": "Block kind/type",
-          "enum": ["paragraph", "heading", "list", "table", "figure"]
-        },
-        "text": {
-          "type": "string",
-          "description": "The concatenated text content of all spans in the block"
-        },
-        "bbox": {
-          "type": "array",
-          "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
-          "items": {
-            "type": "number"
-          },
-          "minItems": 4,
-          "maxItems": 4
+          "description": "The block kind/type.\n\nCommon values: \"paragraph\", \"heading\", \"list\", \"table\", \"figure\".",
+          "type": "string"
        },
        "level": {
-          "type": "integer",
-          "description": "Heading level (1-6) for 'heading' kind blocks",
-          "minimum": 1,
-          "maximum": 6
-        },
-        "table_index": {
-          "type": "integer",
-          "description": "Table index for 'table' kind blocks (points to tables array)",
+          "description": "Optional heading level (1-6) for \"heading\" kind blocks.\n\nThis field is present only for heading blocks. For paragraphs\nand other block types, it is `null`.",
+          "type": [
+            "integer",
+            "null"
+          ],
+          "format": "uint8",
+          "maximum": 255,
          "minimum": 0
        },
        "receipt": {
-          "$ref": "#/definitions/receipt"
-        }
-      }
-    },
-    "table": {
-      "type": "object",
-      "required": ["id", "bbox", "rows", "header_rows", "detection_method", "continued", "continued_from_prev", "page_index"],
-      "properties": {
-        "id": {
-          "type": "string",
-          "description": "Unique identifier for this table (e.g., 'table_0')"
+          "description": "Optional cryptographic receipt for verification.\n\nThis field is present when `--receipts=lite` or `--receipts=svg`\nis enabled. When receipts are disabled, the field is `null`.",
+          "anyOf": [
+            {
+              "$ref": "#/$defs/Receipt"
+            },
+            {
+              "type": "null"
+            }
+          ]
        },
-        "bbox": {
-          "type": "array",
-          "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
-          "items": {
-            "type": "number"
-          },
-          "minItems": 4,
-          "maxItems": 4
-        },
-        "rows": {
-          "type": "array",
-          "description": "Rows in this table, ordered top-to-bottom",
-          "items": {
-            "$ref": "#/definitions/row"
-          }
-        },
-        "header_rows": {
-          "type": "integer",
-          "description": "Number of contiguous header rows at the top of the table",
+        "table_index": {
+          "description": "Optional table index for \"table\" kind blocks.\n\nThis field is present only for table blocks and points to the\ncorresponding entry in the page's `tables` array.",
+          "type": [
+            "integer",
+            "null"
+          ],
+          "format": "uint",
          "minimum": 0
        },
-        "detection_method": {
-          "type": "string",
-          "description": "Detection method used to identify this table",
-          "enum": ["line_based", "borderless"]
-        },
-        "continued": {
-          "type": "boolean",
-          "description": "Whether this table continues on the next page"
-        },
-        "continued_from_prev": {
-          "type": "boolean",
-          "description": "Whether this table is a continuation from the previous page"
-        },
-        "page_index": {
-          "type": "integer",
-          "description": "Zero-based page index where this table appears",
-          "minimum": 0
-        }
-      }
-    },
-    "row": {
-      "type": "object",
-      "required": ["bbox", "cells", "is_header"],
-      "properties": {
-        "bbox": {
-          "type": "array",
-          "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
-          "items": {
-            "type": "number"
-          },
-          "minItems": 4,
-          "maxItems": 4
-        },
-        "cells": {
-          "type": "array",
-          "description": "Cells in this row, ordered left-to-right",
-          "items": {
-            "$ref": "#/definitions/cell"
-          }
-        },
-        "is_header": {
-          "type": "boolean",
-          "description": "Whether this row is a header row"
-        }
-      }
-    },
-    "cell": {
-      "type": "object",
-      "required": ["bbox", "text", "spans", "row", "col", "rowspan", "colspan", "is_header_row"],
-      "properties": {
-        "bbox": {
-          "type": "array",
-          "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
-          "items": {
-            "type": "number"
-          },
-          "minItems": 4,
-          "maxItems": 4
-        },
        "text": {
-          "type": "string",
-          "description": "The concatenated text content of all spans in the cell"
-        },
-        "spans": {
+          "description": "The concatenated text content of all spans in the block.",
+          "type": "string"
+        }
+      },
+      "required": [
+        "kind",
+        "text",
+        "bbox"
+      ]
+    },
+    "CellJson": {
+      "description": "JSON representation of a table cell.\n\nA cell represents a single unit within a table row, containing\nits text content, bounding box, and position information.",
+      "type": "object",
+      "properties": {
+        "bbox": {
+          "description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
          "type": "array",
-          "description": "References to spans in the page's spans array",
          "items": {
-            "type": "integer"
-          }
-        },
-        "row": {
-          "type": "integer",
-          "description": "Zero-based row index within the table",
-          "minimum": 0
+            "type": "number",
+            "format": "double"
+          },
+          "maxItems": 4,
+          "minItems": 4
        },
        "col": {
+          "description": "Zero-based column index within the table.",
          "type": "integer",
-          "description": "Zero-based column index within the table",
+          "format": "uint",
+          "minimum": 0
+        },
+        "colspan": {
+          "description": "Number of columns this cell spans (default 1).\n\nValues greater than 1 indicate a merged cell that spans\nmultiple columns horizontally.",
+          "type": "integer",
+          "format": "uint32",
+          "default": 1,
+          "minimum": 0
+        },
+        "is_header_row": {
+          "description": "Whether this cell is in a header row.\n\nHeader cells are typically rendered differently (bold, centered)\nand may be reused when tables span multiple pages.",
+          "type": "boolean"
+        },
+        "row": {
+          "description": "Zero-based row index within the table.",
+          "type": "integer",
+          "format": "uint",
          "minimum": 0
        },
        "rowspan": {
+          "description": "Number of rows this cell spans (default 1).\n\nValues greater than 1 indicate a merged cell that spans\nmultiple rows vertically.",
          "type": "integer",
-          "description": "Number of rows this cell spans (default 1)",
-          "minimum": 1
+          "format": "uint32",
+          "default": 1,
+          "minimum": 0
        },
-        "colspan": {
-          "type": "integer",
-          "description": "Number of columns this cell spans (default 1)",
-          "minimum": 1
-        },
-        "is_header_row": {
-          "type": "boolean",
-          "description": "Whether this cell is in a header row"
-        }
-      }
-    },
-    "receipt": {
-      "type": "object",
-      "required": ["pdf_fingerprint", "page_index", "bbox", "content_hash", "extraction_version"],
-      "properties": {
-        "pdf_fingerprint": {
-          "type": "string",
-          "description": "The PDF fingerprint"
-        },
-        "page_index": {
-          "type": "integer",
-          "description": "The page index"
-        },
-        "bbox": {
+        "spans": {
+          "description": "References to spans in the page's `spans` array.\n\nThese indices point to the spans that make up this cell's content.",
          "type": "array",
-          "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
          "items": {
-            "type": "number"
-          },
-          "minItems": 4,
-          "maxItems": 4
+            "type": "integer",
+            "format": "uint",
+            "minimum": 0
+          }
        },
-        "content_hash": {
-          "type": "string",
-          "description": "SHA-256 hash of the content"
-        },
-        "extraction_version": {
-          "type": "string",
-          "description": "Version string of the extractor"
-        },
-        "svg_clip": {
-          "type": "string",
-          "description": "SVG clip path for verification (present only in SvgClip mode)"
+        "text": {
+          "description": "The concatenated text content of all spans in the cell.",
+          "type": "string"
        }
-      }
+      },
+      "required": [
+        "bbox",
+        "text",
+        "spans",
+        "row",
+        "col",
+        "is_header_row"
+      ]
    },
-    "metadata": {
+    "ExtractionMetadata": {
+      "description": "Metadata about the extraction process.",
      "type": "object",
-      "required": ["page_count", "span_count", "block_count"],
      "properties": {
-        "page_count": {
-          "type": "integer",
-          "description": "Total number of pages in the document"
-        },
-        "span_count": {
-          "type": "integer",
-          "description": "Number of spans extracted"
-        },
        "block_count": {
+          "description": "Number of blocks extracted.",
          "type": "integer",
-          "description": "Number of blocks extracted"
-        },
-        "cache_status": {
-          "type": "string",
-          "description": "Cache status: 'hit', 'miss', or 'skipped'",
-          "enum": ["hit", "miss", "skipped"]
+          "format": "uint",
+          "minimum": 0
        },
        "cache_age_seconds": {
-          "type": "integer",
-          "description": "Cache entry age in seconds (only present when cache_status == 'hit')",
+          "description": "Cache entry age in seconds (only present when cache_status == \"hit\")",
+          "type": [
+            "integer",
+            "null"
+          ],
+          "format": "uint64",
          "minimum": 0
        },
-        "error_count": {
-          "type": "integer",
-          "description": "Number of pages that failed to extract",
-          "minimum": 0
-        },
-        "reading_order_algorithm": {
-          "type": "string",
-          "description": "Reading order algorithm used for this extraction",
-          "enum": ["struct_tree", "xy_cut"]
+        "cache_status": {
+          "description": "Cache status: \"hit\", \"miss\", or \"skipped\"",
+          "type": [
+            "string",
+            "null"
+          ]
        },
        "diagnostics": {
+          "description": "Diagnostics emitted during extraction (coverage warnings, etc.)",
          "type": "array",
-          "description": "Diagnostics emitted during extraction",
          "items": {
            "type": "string"
          }
+        },
+        "error_count": {
+          "description": "Number of pages that failed to extract.",
+          "type": "integer",
+          "format": "uint",
+          "minimum": 0
+        },
+        "page_count": {
+          "description": "Total number of pages in the document.",
+          "type": "integer",
+          "format": "uint",
+          "minimum": 0
+        },
+        "reading_order_algorithm": {
+          "description": "Reading order algorithm used for this extraction.",
+          "type": [
+            "string",
+            "null"
+          ]
+        },
+        "receipts_mode": {
+          "description": "Receipts mode used for this extraction.",
+          "$ref": "#/$defs/ReceiptsMode"
+        },
+        "span_count": {
+          "description": "Number of spans extracted.",
+          "type": "integer",
+          "format": "uint",
+          "minimum": 0
        }
-      }
+      },
+      "required": [
+        "page_count",
+        "receipts_mode",
+        "span_count",
+        "block_count",
+        "error_count",
+        "diagnostics"
+      ]
+    },
+    "PageResult": {
+      "description": "Result for a single page.",
+      "type": "object",
+      "properties": {
+        "blocks": {
+          "description": "Extracted blocks (semantic units like paragraphs, headings).",
+          "type": "array",
+          "items": {
+            "$ref": "#/$defs/BlockJson"
+          }
+        },
+        "error": {
+          "description": "Error message if extraction failed for this page.",
+          "type": [
+            "string",
+            "null"
+          ]
+        },
+        "index": {
+          "description": "0-based page index.",
+          "type": "integer",
+          "format": "uint",
+          "minimum": 0
+        },
+        "spans": {
+          "description": "Extracted spans (text fragments with consistent styling).",
+          "type": "array",
+          "items": {
+            "$ref": "#/$defs/SpanJson"
+          }
+        },
+        "tables": {
+          "description": "Extracted tables (cell-level structure).\n\nThis array provides detailed table structure with rows and cells.\nTable blocks in the `blocks` array reference entries here via `table_index`.",
+          "type": "array",
+          "items": {
+            "$ref": "#/$defs/TableJson"
+          }
+        }
+      },
+      "required": [
+        "index",
+        "spans",
+        "blocks",
+        "tables"
+      ]
+    },
+    "Receipt": {
+      "description": "A visual citation receipt for extracted text.\n\nReceipts provide cryptographic proof that a piece of extracted text\noriginated from a specific region in a specific PDF. They can be\nverified independently by re-running pdftract on the original file.\n\n# Lite mode\n\nIn lite mode, `svg_clip` is `None` and the JSON output does not\ninclude the key at all (via `skip_serializing_if`). This keeps\nreceipts small (~120-180 bytes) for high-volume use cases like\nRAG citation pipelines.\n\n# SVG mode\n\nIn SVG mode, `svg_clip` contains a self-contained SVG element\nthat renders only the glyphs whose bboxes fall within the receipt\nbbox. The SVG is normalized to the bbox coordinate system and\ncan be rendered standalone in any browser.\n\n# Example\n\n```json\n{\n  \"pdf_fingerprint\": \"pdftract-v1:a7f3...\",\n  \"page_index\": 14,\n  \"bbox\": [220.0, 412.0, 412.0, 432.0],\n  \"content_hash\": \"sha256:9b21...\",\n  \"extraction_version\": \"1.0.0\"\n}\n```",
+      "type": "object",
+      "properties": {
+        "bbox": {
+          "description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where:\n- x0, y0: bottom-left corner\n- x1, y1: top-right corner\n- Units: PDF points (1/72 inch)\n\nThis is a copy of the parent span's bbox, included so the\nreceipt is self-contained.",
+          "type": "array",
+          "items": {
+            "type": "number",
+            "format": "double"
+          },
+          "maxItems": 4,
+          "minItems": 4
+        },
+        "content_hash": {
+          "description": "SHA-256 hash of the NFC-normalized text content.\n\nFormat: `\"sha256:\" + hex(SHA-256)`.\n\nThe text is normalized to NFC form before hashing to ensure\nstability across platforms that may use different Unicode\nnormalization forms (e.g., macOS HFS+/APFS sometimes round-trips\nthrough NFD).",
+          "type": "string"
+        },
+        "extraction_version": {
+          "description": "The pdftract version that produced this receipt.\n\nFormat: semver string (e.g., \"1.0.0\", \"1.0.0-rc.1\").\nTaken from `CARGO_PKG_VERSION` at compile time.",
+          "type": "string"
+        },
+        "page_index": {
+          "description": "0-based page index in the source PDF.\n\nMatches the page_index in the extraction schema.",
+          "type": "integer",
+          "format": "uint",
+          "minimum": 0
+        },
+        "pdf_fingerprint": {
+          "description": "Phase 1.7 fingerprint of the source PDF.\n\nFormat: `\"pdftract-v1:\" + hex(SHA-256)`.\nThe verifier compares this string literally (not parsed).",
+          "type": "string"
+        },
+        "svg_clip": {
+          "description": "Optional SVG clip rendering the glyphs in this receipt.\n\n- `None` in lite mode (the key is omitted from JSON entirely)\n- `Some(svg)` in SVG mode, where `svg` is a self-contained SVG element\n\nThe SVG coordinate system is normalized to the bbox itself,\nso it renders correctly in isolation.",
+          "type": [
+            "string",
+            "null"
+          ]
+        }
+      },
+      "required": [
+        "pdf_fingerprint",
+        "page_index",
+        "bbox",
+        "content_hash",
+        "extraction_version"
+      ]
+    },
+    "ReceiptsMode": {
+      "description": "Receipt generation mode.\n\nControls whether visual citation receipts are generated during extraction.",
+      "oneOf": [
+        {
+          "description": "No receipts generated (default).",
+          "type": "string",
+          "const": "off"
+        },
+        {
+          "description": "Lite mode: minimal receipts (~120 bytes each) with fingerprint, page index, bbox, and content hash.",
+          "type": "string",
+          "const": "lite"
+        },
+        {
+          "description": "SVG mode: extended receipts that include an SVG clip rendering the glyphs.",
+          "type": "string",
+          "const": "svg"
+        }
+      ]
+    },
+    "RowJson": {
+      "description": "JSON representation of a table row.\n\nA row contains a sequence of cells that form a horizontal strip\nin the table.",
+      "type": "object",
+      "properties": {
+        "bbox": {
+          "description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
+          "type": "array",
+          "items": {
+            "type": "number",
+            "format": "double"
+          },
+          "maxItems": 4,
+          "minItems": 4
+        },
+        "cells": {
+          "description": "Cells in this row, ordered left-to-right.",
+          "type": "array",
+          "items": {
+            "$ref": "#/$defs/CellJson"
+          }
+        },
+        "is_header": {
+          "description": "Whether this row is a header row.\n\nHeader rows are typically repeated when tables span multiple pages.",
+          "type": "boolean"
+        }
+      },
+      "required": [
+        "bbox",
+        "cells",
+        "is_header"
+      ]
+    },
+    "SpanJson": {
+      "description": "JSON representation of a text span.\n\nA span is the smallest unit of extracted text, representing a\ncontiguous run of text with consistent font and styling.",
+      "type": "object",
+      "properties": {
+        "bbox": {
+          "description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
+          "type": "array",
+          "items": {
+            "type": "number",
+            "format": "double"
+          },
+          "maxItems": 4,
+          "minItems": 4
+        },
+        "confidence": {
+          "description": "Optional confidence score (0.0 to 1.0).\n\nThis field is present when OCR is used or when the extraction\nhas uncertainty about the text. When confidence is not applicable,\nthis field is `null`.",
+          "type": [
+            "number",
+            "null"
+          ],
+          "format": "double"
+        },
+        "font": {
+          "description": "Font name or identifier.",
+          "type": "string"
+        },
+        "receipt": {
+          "description": "Optional cryptographic receipt for verification.\n\nThis field is present when `--receipts=lite` or `--receipts=svg`\nis enabled. When receipts are disabled, the field is `null`.",
+          "anyOf": [
+            {
+              "$ref": "#/$defs/Receipt"
+            },
+            {
+              "type": "null"
+            }
+          ]
+        },
+        "size": {
+          "description": "Font size in points.",
+          "type": "number",
+          "format": "double"
+        },
+        "text": {
+          "description": "The extracted text content.",
+          "type": "string"
+        }
+      },
+      "required": [
+        "text",
+        "bbox",
+        "font",
+        "size"
+      ]
+    },
+    "TableJson": {
+      "description": "JSON representation of a table.\n\nTables are emitted in parallel with table blocks - the block\nprovides the concatenated text and position, while the TableJson\nprovides full cell-level structure.",
+      "type": "object",
+      "properties": {
+        "bbox": {
+          "description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
+          "type": "array",
+          "items": {
+            "type": "number",
+            "format": "double"
+          },
+          "maxItems": 4,
+          "minItems": 4
+        },
+        "continued": {
+          "description": "Whether this table continues on the next page.\n\nSet to `true` when a table is split across pages and this\npage contains the first part.",
+          "type": "boolean"
+        },
+        "continued_from_prev": {
+          "description": "Whether this table is a continuation from the previous page.\n\nSet to `true` when a table is split across pages and this\npage contains a subsequent part.",
+          "type": "boolean"
+        },
+        "detection_method": {
+          "description": "Detection method used to identify this table.\n\n- \"line_based\": Table detected via ruling lines (borders)\n- \"borderless\": Table detected via x0 alignment heuristics",
+          "type": "string"
+        },
+        "header_rows": {
+          "description": "Number of contiguous header rows at the top of the table.\n\nHeader rows are typically repeated when tables span multiple pages.",
+          "type": "integer",
+          "format": "uint32",
+          "minimum": 0
+        },
+        "id": {
+          "description": "Unique identifier for this table (e.g., \"table_0\").",
+          "type": "string"
+        },
+        "page_index": {
+          "description": "Zero-based page index where this table appears.",
+          "type": "integer",
+          "format": "uint",
+          "minimum": 0
+        },
+        "rows": {
+          "description": "Rows in this table, ordered top-to-bottom.",
+          "type": "array",
+          "items": {
+            "$ref": "#/$defs/RowJson"
+          }
+        }
+      },
+      "required": [
+        "id",
+        "bbox",
+        "rows",
+        "header_rows",
+        "detection_method",
+        "continued",
+        "continued_from_prev",
+        "page_index"
+      ]
    }
  }
 }
--- a/xtask/Cargo.toml
+++ b/xtask/Cargo.toml
@ -11,6 +11,10 @@ publish = false
 name = "xtask"
 path = "src/main.rs"

+[[bin]]
+name = "gen_schema"
+path = "src/bin/gen_schema.rs"
+
 [dependencies]
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
@ -18,3 +22,5 @@ serde_yaml = "0.9"
 glob = "0.3"
 humantime = "2.1"
 lopdf = "0.34"
+schemars = "1.2"
+pdftract-core = { path = "../crates/pdftract-core", features = ["schemars"] }
--- a/xtask/src/bin/gen_schema.rs
+++ b/xtask/src/bin/gen_schema.rs
@ -0,0 +1,74 @@
+//! Generate JSON Schema from Rust output types.
+//!
+//! This binary generates the canonical JSON Schema for pdftract's
+//! extraction output, which is checked into the repository at
+//! docs/schema/v1.0/pdftract.schema.json.
+//!
+//! Usage: cargo run --bin gen_schema
+
+use std::fs;
+use std::path::PathBuf;
+
+/// Entry point: renders the schema and writes it to
+/// `docs/schema/v1.0/pdftract.schema.json` beneath the workspace root.
+///
+/// # Errors
+///
+/// Propagates any I/O error raised while creating the output directory
+/// or writing the schema file.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Resolve the root first, then render, matching the order callers observe on failure.
+    let root = find_workspace_root();
+    let rendered = generate_schema();
+
+    let target = root.join("docs/schema/v1.0/pdftract.schema.json");
+
+    // The versioned schema directory may not exist yet on a fresh checkout.
+    match target.parent() {
+        Some(dir) => fs::create_dir_all(dir)?,
+        None => {}
+    }
+
+    fs::write(&target, rendered)?;
+    println!("Generated schema at: {}", target.display());
+
+    Ok(())
+}
+
+/// Locate the workspace root directory.
+///
+/// The search starts at the current working directory (or at its parent when
+/// the current directory is named `xtask`) and walks upward, returning the
+/// first directory whose `Cargo.toml` contains the text `[workspace]`. When no
+/// such directory exists, the current working directory is returned instead.
+///
+/// # Panics
+///
+/// Panics if the current working directory cannot be determined.
+fn find_workspace_root() -> PathBuf {
+    let cwd = std::env::current_dir().unwrap();
+
+    // When launched from inside `xtask/`, begin the search one level up.
+    let start = if cwd.ends_with("xtask") {
+        cwd.parent().unwrap()
+    } else {
+        cwd.as_path()
+    };
+
+    start
+        .ancestors()
+        .find(|dir| {
+            let manifest = dir.join("Cargo.toml");
+            // An unreadable manifest is treated as empty, i.e. not a workspace root.
+            manifest.exists()
+                && fs::read_to_string(&manifest)
+                    .unwrap_or_default()
+                    .contains("[workspace]")
+        })
+        .map(|dir| dir.to_path_buf())
+        // No workspace manifest anywhere above: fall back to the working directory.
+        .unwrap_or_else(|| std::env::current_dir().unwrap())
+}
+
+/// Render the JSON Schema for pdftract's `ExtractionResult` output type
+/// as a pretty-printed JSON string.
+///
+/// # Panics
+///
+/// Panics with "Failed to serialize schema" if the schema cannot be serialized.
+fn generate_schema() -> String {
+    // Per the original note, `schema_for!` output already carries the `$schema`
+    // field, so nothing is patched into the document here.
+    let root_schema = schemars::schema_for!(pdftract_core::extract::ExtractionResult);
+
+    serde_json::to_string_pretty(&root_schema).expect("Failed to serialize schema")
+}
--- a/xtask/src/main.rs
+++ b/xtask/src/main.rs
@ -104,17 +104,19 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
        eprintln!("  doc-profiles                     Generate README skeletons for all profiles");
        eprintln!("  generate-stress-pdfs            Generate stress-test PDFs for memory ceiling testing");
        eprintln!("  generate-page-class-fixtures    Generate page classification test fixtures");
+        eprintln!("  gen-schema                      Generate JSON Schema from Rust output types");
        eprintln!("  memory-ceiling                  Run memory ceiling tests against perf/malformed corpora");
        std::process::exit(1);
    }

-    match args[1].as_str() {
+    let result = match args[1].as_str() {
        "doc-profile" => {
            if args.len() < 3 {
                eprintln!("Usage: xtask doc-profile <profile-name>");
                std::process::exit(1);
            }
            generate_profile_readme(&args[2])?;
+            Ok(())
        }
        "doc-profiles" => {
            let profiles_dir = find_workspace_root().join("profiles/builtin");
@ -127,20 +129,45 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
                    }
                }
            }
+            Ok(())
        }
        "generate-stress-pdfs" => {
            generate_stress_pdfs()?;
+            Ok(())
        }
        "generate-page-class-fixtures" => {
            generate_page_class_fixtures()?;
+            Ok(())
+        }
+        "gen-schema" => {
+            gen_schema()?;
+            Ok(())
        }
        "memory-ceiling" => {
            run_memory_ceiling_tests()?;
+            Ok(())
        }
        _ => {
            eprintln!("Unknown command: {}", args[1]);
            std::process::exit(1);
        }
+    };
+
+    result
+}
+
+/// Generate JSON Schema from Rust output types.
+///
+/// Delegates to the gen_schema binary.
+fn gen_schema() -> Result<(), Box<dyn std::error::Error>> {
+    // Invoke the gen_schema binary
+    let status = std::process::Command::new("cargo")
+        .args(["run", "--bin", "gen_schema"])
+        .current_dir(find_workspace_root())
+        .status()?;
+
+    if !status.success() {
+        return Err(format!("gen_schema failed with exit code: {:?}", status.code()).into());
    }

    Ok(())