From 92e90af0b070f08b82d2f8ab8bc2087d61fd868c Mon Sep 17 00:00:00 2001
From: jedarden <github@jedarden.com>
Date: Sun, 24 May 2026 01:29:14 -0400
Subject: [PATCH] feat(pdftract-zy2jx): generate JSON Schema from Rust output
 types

- Add optional schemars dependency (v1.2) to pdftract-core, with a `schemars` feature that gates the JsonSchema derives
- Add JsonSchema derives to output types (ExtractionResult, PageResult, ExtractionMetadata, SpanJson, BlockJson, CellJson, RowJson, TableJson, ExtractionQuality, Receipt, ReceiptsMode)
- Create xtask/src/bin/gen_schema.rs for schema generation
- Add gen-schema command to xtask main.rs
- Regenerate docs/schema/v1.0/pdftract.schema.json from the Rust types, replacing the previous draft-07 schema with Draft 2020-12

Schema includes:
- $schema: "https://json-schema.org/draft/2020-12/schema"
- $defs with all output type definitions
- Type annotations for all fields (integer formats and bounds; `Option` fields are nullable)

Closes: pdftract-zy2jx
---
 Cargo.lock                               |  50 +-
 crates/pdftract-core/Cargo.toml          |   4 +-
 crates/pdftract-core/src/extract.rs      |   5 +
 crates/pdftract-core/src/options.rs      |   3 +
 crates/pdftract-core/src/receipts/mod.rs |   3 +
 crates/pdftract-core/src/schema/mod.rs   |   8 +
 docs/schema/v1.0/pdftract.schema.json    | 720 ++++++++++++++---------
 xtask/Cargo.toml                         |   6 +
 xtask/src/bin/gen_schema.rs              |  74 +++
 xtask/src/main.rs                        |  29 +-
 10 files changed, 610 insertions(+), 292 deletions(-)
 create mode 100644 xtask/src/bin/gen_schema.rs

diff --git a/Cargo.lock b/Cargo.lock
index a5f762f..28f3e5e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2302,7 +2302,7 @@ dependencies = [
  "pdftract-core",
  "regex",
  "reqwest",
- "schemars",
+ "schemars 0.8.22",
  "secrecy",
  "semver",
  "serde",
@@ -2349,6 +2349,7 @@ dependencies = [
  "quick-xml",
  "rayon",
  "regex",
+ "schemars 1.2.1",
  "secrecy",
  "serde",
  "serde_json",
@@ -2967,6 +2968,26 @@ dependencies = [
  "thiserror 1.0.69",
 ]
 
+[[package]]
+name = "ref-cast"
+version = "1.0.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d"
+dependencies = [
+ "ref-cast-impl",
+]
+
+[[package]]
+name = "ref-cast-impl"
+version = "1.0.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
 [[package]]
 name = "regex"
 version = "1.12.3"
@@ -3170,7 +3191,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615"
 dependencies = [
  "dyn-clone",
- "schemars_derive",
+ "schemars_derive 0.8.22",
+ "serde",
+ "serde_json",
+]
+
+[[package]]
+name = "schemars"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc"
+dependencies = [
+ "dyn-clone",
+ "ref-cast",
+ "schemars_derive 1.2.1",
  "serde",
  "serde_json",
 ]
@@ -3187,6 +3221,18 @@ dependencies = [
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "schemars_derive"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d115b50f4aaeea07e79c1912f645c7513d81715d0420f8bc77a18c6260b307f"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "serde_derive_internals",
+ "syn 2.0.117",
+]
+
 [[package]]
 name = "scopeguard"
 version = "1.2.0"
diff --git a/crates/pdftract-core/Cargo.toml b/crates/pdftract-core/Cargo.toml
index 8fc188d..fb63901 100644
--- a/crates/pdftract-core/Cargo.toml
+++ b/crates/pdftract-core/Cargo.toml
@@ -21,6 +21,7 @@ regex = "1.10"
 secrecy = { workspace = true }
 serde = { version = "1.0", features = ["derive"], optional = true }
 serde_json = { version = "1.0", optional = true }
+schemars = { version = "1.2", features = ["derive"], optional = true }
 sha2 = "0.10"
 thiserror = { workspace = true }
 memchr = { workspace = true }
@@ -38,7 +39,8 @@ quick-xml = { version = "0.36", optional = true }
 
 [features]
 default = ["serde"]
-serde = ["dep:serde", "dep:serde_json"]
+serde = ["dep:serde", "dep:serde_json", "dep:schemars"]
+schemars = ["dep:schemars", "serde"]
 receipts = []  # Enable visual citation receipts (SVG clip generation)
 ocr = ["dep:image", "dep:leptonica-plumbing", "dep:quick-xml"]  # Enable OCR path (image compositing + preprocessing + HOCR parsing)
 full-render = ["dep:pdfium-render", "ocr"]  # Enable PDFium-based rendering (requires ocr)
diff --git a/crates/pdftract-core/src/extract.rs b/crates/pdftract-core/src/extract.rs
index 3cb0719..a5f264a 100644
--- a/crates/pdftract-core/src/extract.rs
+++ b/crates/pdftract-core/src/extract.rs
@@ -28,6 +28,8 @@ use anyhow::{Context, Result};
 use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 use serde_json::json;
+#[cfg(feature = "schemars")]
+use schemars::JsonSchema;
 use std::sync::Arc;
 use crate::parser::stream::FileSource;
 
@@ -102,6 +104,7 @@ fn decode_page_content_streams(
 ///
 /// Contains the extracted pages, spans, blocks, and metadata.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct ExtractionResult {
     /// The PDF fingerprint (for receipt generation).
     pub fingerprint: String,
@@ -113,6 +116,7 @@ pub struct ExtractionResult {
 
 /// Result for a single page.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct PageResult {
     /// 0-based page index.
     pub index: usize,
@@ -177,6 +181,7 @@ impl From<PageResultInternal> for PageResult {
 
 /// Metadata about the extraction process.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct ExtractionMetadata {
     /// Total number of pages in the document.
     pub page_count: usize,
diff --git a/crates/pdftract-core/src/options.rs b/crates/pdftract-core/src/options.rs
index adf6855..893d906 100644
--- a/crates/pdftract-core/src/options.rs
+++ b/crates/pdftract-core/src/options.rs
@@ -4,11 +4,14 @@
 //! including the receipts mode for cryptographic provenance tracking.
 
 use serde::{Deserialize, Serialize};
+#[cfg(feature = "schemars")]
+use schemars::JsonSchema;
 
 /// Receipt generation mode.
 ///
 /// Controls whether visual citation receipts are generated during extraction.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 #[serde(rename_all = "lowercase")]
 pub enum ReceiptsMode {
     /// No receipts generated (default).
diff --git a/crates/pdftract-core/src/receipts/mod.rs b/crates/pdftract-core/src/receipts/mod.rs
index 475051a..b1c2162 100644
--- a/crates/pdftract-core/src/receipts/mod.rs
+++ b/crates/pdftract-core/src/receipts/mod.rs
@@ -26,6 +26,8 @@ pub mod svg;
 pub mod verifier;
 
 use serde::{Deserialize, Serialize};
+#[cfg(feature = "schemars")]
+use schemars::JsonSchema;
 
 /// A visual citation receipt for extracted text.
 ///
@@ -59,6 +61,7 @@ use serde::{Deserialize, Serialize};
 /// }
 /// ```
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct Receipt {
     /// Phase 1.7 fingerprint of the source PDF.
     ///
diff --git a/crates/pdftract-core/src/schema/mod.rs b/crates/pdftract-core/src/schema/mod.rs
index 88e37bc..cbd0997 100644
--- a/crates/pdftract-core/src/schema/mod.rs
+++ b/crates/pdftract-core/src/schema/mod.rs
@@ -18,6 +18,8 @@
 
 use serde::{Deserialize, Serialize};
 use serde_json::json;
+#[cfg(feature = "schemars")]
+use schemars::JsonSchema;
 
 use crate::receipts::Receipt;
 
@@ -26,6 +28,7 @@ use crate::receipts::Receipt;
 /// A span is the smallest unit of extracted text, representing a
 /// contiguous run of text with consistent font and styling.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct SpanJson {
     /// The extracted text content.
     pub text: String,
@@ -64,6 +67,7 @@ pub struct SpanJson {
 /// spans. Examples include paragraphs, headings, list items, and
 /// table cells.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct BlockJson {
     /// The block kind/type.
     ///
@@ -112,6 +116,7 @@ pub type SpanRef = usize;
 /// A cell represents a single unit within a table row, containing
 /// its text content, bounding box, and position information.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct CellJson {
     /// Bounding box in PDF user-space points.
     ///
@@ -163,6 +168,7 @@ fn default_one() -> u32 {
 /// A row contains a sequence of cells that form a horizontal strip
 /// in the table.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct RowJson {
     /// Bounding box in PDF user-space points.
     ///
@@ -185,6 +191,7 @@ pub struct RowJson {
 /// provides the concatenated text and position, while the TableJson
 /// provides full cell-level structure.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct TableJson {
     /// Unique identifier for this table (e.g., "table_0").
     pub id: String,
@@ -231,6 +238,7 @@ pub struct TableJson {
 /// in the root metadata (full JSON mode). It provides aggregate
 /// quality signals across all pages.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
 pub struct ExtractionQuality {
     /// Overall quality assessment: "high", "medium", "low", or "none".
     ///
diff --git a/docs/schema/v1.0/pdftract.schema.json b/docs/schema/v1.0/pdftract.schema.json
index 36d4bd3..a1911aa 100644
--- a/docs/schema/v1.0/pdftract.schema.json
+++ b/docs/schema/v1.0/pdftract.schema.json
@@ -1,345 +1,489 @@
 {
-  "$schema": "http://json-schema.org/draft-07/schema#",
-  "$id": "https://pdftract.ardenone.com/schemas/v1.0/pdftract.schema.json",
-  "title": "PDFtract Extraction Output Schema v1.0",
-  "description": "JSON output schema for PDF text and structure extraction",
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "title": "ExtractionResult",
+  "description": "Result of a PDF extraction operation.\n\nContains the extracted pages, spans, blocks, and metadata.",
   "type": "object",
-  "required": ["fingerprint", "schema_version", "pages", "metadata"],
   "properties": {
     "fingerprint": {
-      "type": "string",
-      "description": "PDF fingerprint for verification (format: pdftract-v1:<hex>)"
-    },
-    "schema_version": {
-      "type": "string",
-      "description": "Schema version (e.g., '1.0')",
-      "enum": ["1.0"]
-    },
-    "pages": {
-      "type": "array",
-      "description": "Extracted pages",
-      "items": {
-        "$ref": "#/definitions/page"
-      }
+      "description": "The PDF fingerprint (for receipt generation).",
+      "type": "string"
     },
     "metadata": {
-      "$ref": "#/definitions/metadata"
+      "description": "Metadata about the extraction.",
+      "$ref": "#/$defs/ExtractionMetadata"
+    },
+    "pages": {
+      "description": "Extracted pages, each containing spans and blocks.",
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/PageResult"
+      }
     }
   },
-  "definitions": {
-    "page": {
+  "required": [
+    "fingerprint",
+    "pages",
+    "metadata"
+  ],
+  "$defs": {
+    "BlockJson": {
+      "description": "JSON representation of a structural block.\n\nA block is a higher-level semantic unit composed of one or more\nspans. Examples include paragraphs, headings, list items, and\ntable cells.",
       "type": "object",
-      "required": ["index", "spans", "blocks", "tables"],
       "properties": {
-        "index": {
-          "type": "integer",
-          "description": "0-based page index"
-        },
-        "spans": {
-          "type": "array",
-          "description": "Extracted text spans",
-          "items": {
-            "$ref": "#/definitions/span"
-          }
-        },
-        "blocks": {
-          "type": "array",
-          "description": "Extracted structural blocks",
-          "items": {
-            "$ref": "#/definitions/block"
-          }
-        },
-        "tables": {
-          "type": "array",
-          "description": "Extracted tables (cell-level structure)",
-          "items": {
-            "$ref": "#/definitions/table"
-          }
-        },
-        "error": {
-          "type": "string",
-          "description": "Error message if extraction failed for this page"
-        }
-      }
-    },
-    "span": {
-      "type": "object",
-      "required": ["text", "bbox", "font", "size"],
-      "properties": {
-        "text": {
-          "type": "string",
-          "description": "The extracted text content"
-        },
         "bbox": {
+          "description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
           "type": "array",
-          "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
           "items": {
-            "type": "number"
+            "type": "number",
+            "format": "double"
           },
-          "minItems": 4,
-          "maxItems": 4
+          "maxItems": 4,
+          "minItems": 4
         },
-        "font": {
-          "type": "string",
-          "description": "Font name or identifier"
-        },
-        "size": {
-          "type": "number",
-          "description": "Font size in points"
-        },
-        "confidence": {
-          "type": "number",
-          "description": "Confidence score (0.0 to 1.0) for OCR text",
-          "minimum": 0.0,
-          "maximum": 1.0
-        },
-        "receipt": {
-          "$ref": "#/definitions/receipt"
-        }
-      }
-    },
-    "block": {
-      "type": "object",
-      "required": ["kind", "text", "bbox"],
-      "properties": {
         "kind": {
-          "type": "string",
-          "description": "Block kind/type",
-          "enum": ["paragraph", "heading", "list", "table", "figure"]
-        },
-        "text": {
-          "type": "string",
-          "description": "The concatenated text content of all spans in the block"
-        },
-        "bbox": {
-          "type": "array",
-          "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
-          "items": {
-            "type": "number"
-          },
-          "minItems": 4,
-          "maxItems": 4
+          "description": "The block kind/type.\n\nCommon values: \"paragraph\", \"heading\", \"list\", \"table\", \"figure\".",
+          "type": "string"
         },
         "level": {
-          "type": "integer",
-          "description": "Heading level (1-6) for 'heading' kind blocks",
-          "minimum": 1,
-          "maximum": 6
-        },
-        "table_index": {
-          "type": "integer",
-          "description": "Table index for 'table' kind blocks (points to tables array)",
+          "description": "Optional heading level (1-6) for \"heading\" kind blocks.\n\nThis field is present only for heading blocks. For paragraphs\nand other block types, it is `null`.",
+          "type": [
+            "integer",
+            "null"
+          ],
+          "format": "uint8",
+          "maximum": 255,
           "minimum": 0
         },
         "receipt": {
-          "$ref": "#/definitions/receipt"
-        }
-      }
-    },
-    "table": {
-      "type": "object",
-      "required": ["id", "bbox", "rows", "header_rows", "detection_method", "continued", "continued_from_prev", "page_index"],
-      "properties": {
-        "id": {
-          "type": "string",
-          "description": "Unique identifier for this table (e.g., 'table_0')"
+          "description": "Optional cryptographic receipt for verification.\n\nThis field is present when `--receipts=lite` or `--receipts=svg`\nis enabled. When receipts are disabled, the field is `null`.",
+          "anyOf": [
+            {
+              "$ref": "#/$defs/Receipt"
+            },
+            {
+              "type": "null"
+            }
+          ]
         },
-        "bbox": {
-          "type": "array",
-          "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
-          "items": {
-            "type": "number"
-          },
-          "minItems": 4,
-          "maxItems": 4
-        },
-        "rows": {
-          "type": "array",
-          "description": "Rows in this table, ordered top-to-bottom",
-          "items": {
-            "$ref": "#/definitions/row"
-          }
-        },
-        "header_rows": {
-          "type": "integer",
-          "description": "Number of contiguous header rows at the top of the table",
+        "table_index": {
+          "description": "Optional table index for \"table\" kind blocks.\n\nThis field is present only for table blocks and points to the\ncorresponding entry in the page's `tables` array.",
+          "type": [
+            "integer",
+            "null"
+          ],
+          "format": "uint",
           "minimum": 0
         },
-        "detection_method": {
-          "type": "string",
-          "description": "Detection method used to identify this table",
-          "enum": ["line_based", "borderless"]
-        },
-        "continued": {
-          "type": "boolean",
-          "description": "Whether this table continues on the next page"
-        },
-        "continued_from_prev": {
-          "type": "boolean",
-          "description": "Whether this table is a continuation from the previous page"
-        },
-        "page_index": {
-          "type": "integer",
-          "description": "Zero-based page index where this table appears",
-          "minimum": 0
-        }
-      }
-    },
-    "row": {
-      "type": "object",
-      "required": ["bbox", "cells", "is_header"],
-      "properties": {
-        "bbox": {
-          "type": "array",
-          "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
-          "items": {
-            "type": "number"
-          },
-          "minItems": 4,
-          "maxItems": 4
-        },
-        "cells": {
-          "type": "array",
-          "description": "Cells in this row, ordered left-to-right",
-          "items": {
-            "$ref": "#/definitions/cell"
-          }
-        },
-        "is_header": {
-          "type": "boolean",
-          "description": "Whether this row is a header row"
-        }
-      }
-    },
-    "cell": {
-      "type": "object",
-      "required": ["bbox", "text", "spans", "row", "col", "rowspan", "colspan", "is_header_row"],
-      "properties": {
-        "bbox": {
-          "type": "array",
-          "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
-          "items": {
-            "type": "number"
-          },
-          "minItems": 4,
-          "maxItems": 4
-        },
         "text": {
-          "type": "string",
-          "description": "The concatenated text content of all spans in the cell"
-        },
-        "spans": {
+          "description": "The concatenated text content of all spans in the block.",
+          "type": "string"
+        }
+      },
+      "required": [
+        "kind",
+        "text",
+        "bbox"
+      ]
+    },
+    "CellJson": {
+      "description": "JSON representation of a table cell.\n\nA cell represents a single unit within a table row, containing\nits text content, bounding box, and position information.",
+      "type": "object",
+      "properties": {
+        "bbox": {
+          "description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
           "type": "array",
-          "description": "References to spans in the page's spans array",
           "items": {
-            "type": "integer"
-          }
-        },
-        "row": {
-          "type": "integer",
-          "description": "Zero-based row index within the table",
-          "minimum": 0
+            "type": "number",
+            "format": "double"
+          },
+          "maxItems": 4,
+          "minItems": 4
         },
         "col": {
+          "description": "Zero-based column index within the table.",
           "type": "integer",
-          "description": "Zero-based column index within the table",
+          "format": "uint",
+          "minimum": 0
+        },
+        "colspan": {
+          "description": "Number of columns this cell spans (default 1).\n\nValues greater than 1 indicate a merged cell that spans\nmultiple columns horizontally.",
+          "type": "integer",
+          "format": "uint32",
+          "default": 1,
+          "minimum": 0
+        },
+        "is_header_row": {
+          "description": "Whether this cell is in a header row.\n\nHeader cells are typically rendered differently (bold, centered)\nand may be reused when tables span multiple pages.",
+          "type": "boolean"
+        },
+        "row": {
+          "description": "Zero-based row index within the table.",
+          "type": "integer",
+          "format": "uint",
           "minimum": 0
         },
         "rowspan": {
+          "description": "Number of rows this cell spans (default 1).\n\nValues greater than 1 indicate a merged cell that spans\nmultiple rows vertically.",
           "type": "integer",
-          "description": "Number of rows this cell spans (default 1)",
-          "minimum": 1
+          "format": "uint32",
+          "default": 1,
+          "minimum": 0
         },
-        "colspan": {
-          "type": "integer",
-          "description": "Number of columns this cell spans (default 1)",
-          "minimum": 1
-        },
-        "is_header_row": {
-          "type": "boolean",
-          "description": "Whether this cell is in a header row"
-        }
-      }
-    },
-    "receipt": {
-      "type": "object",
-      "required": ["pdf_fingerprint", "page_index", "bbox", "content_hash", "extraction_version"],
-      "properties": {
-        "pdf_fingerprint": {
-          "type": "string",
-          "description": "The PDF fingerprint"
-        },
-        "page_index": {
-          "type": "integer",
-          "description": "The page index"
-        },
-        "bbox": {
+        "spans": {
+          "description": "References to spans in the page's `spans` array.\n\nThese indices point to the spans that make up this cell's content.",
           "type": "array",
-          "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
           "items": {
-            "type": "number"
-          },
-          "minItems": 4,
-          "maxItems": 4
+            "type": "integer",
+            "format": "uint",
+            "minimum": 0
+          }
         },
-        "content_hash": {
-          "type": "string",
-          "description": "SHA-256 hash of the content"
-        },
-        "extraction_version": {
-          "type": "string",
-          "description": "Version string of the extractor"
-        },
-        "svg_clip": {
-          "type": "string",
-          "description": "SVG clip path for verification (present only in SvgClip mode)"
+        "text": {
+          "description": "The concatenated text content of all spans in the cell.",
+          "type": "string"
         }
-      }
+      },
+      "required": [
+        "bbox",
+        "text",
+        "spans",
+        "row",
+        "col",
+        "is_header_row"
+      ]
     },
-    "metadata": {
+    "ExtractionMetadata": {
+      "description": "Metadata about the extraction process.",
       "type": "object",
-      "required": ["page_count", "span_count", "block_count"],
       "properties": {
-        "page_count": {
-          "type": "integer",
-          "description": "Total number of pages in the document"
-        },
-        "span_count": {
-          "type": "integer",
-          "description": "Number of spans extracted"
-        },
         "block_count": {
+          "description": "Number of blocks extracted.",
           "type": "integer",
-          "description": "Number of blocks extracted"
-        },
-        "cache_status": {
-          "type": "string",
-          "description": "Cache status: 'hit', 'miss', or 'skipped'",
-          "enum": ["hit", "miss", "skipped"]
+          "format": "uint",
+          "minimum": 0
         },
         "cache_age_seconds": {
-          "type": "integer",
-          "description": "Cache entry age in seconds (only present when cache_status == 'hit')",
+          "description": "Cache entry age in seconds (only present when cache_status == \"hit\")",
+          "type": [
+            "integer",
+            "null"
+          ],
+          "format": "uint64",
           "minimum": 0
         },
-        "error_count": {
-          "type": "integer",
-          "description": "Number of pages that failed to extract",
-          "minimum": 0
-        },
-        "reading_order_algorithm": {
-          "type": "string",
-          "description": "Reading order algorithm used for this extraction",
-          "enum": ["struct_tree", "xy_cut"]
+        "cache_status": {
+          "description": "Cache status: \"hit\", \"miss\", or \"skipped\"",
+          "type": [
+            "string",
+            "null"
+          ]
         },
         "diagnostics": {
+          "description": "Diagnostics emitted during extraction (coverage warnings, etc.)",
           "type": "array",
-          "description": "Diagnostics emitted during extraction",
           "items": {
             "type": "string"
           }
+        },
+        "error_count": {
+          "description": "Number of pages that failed to extract.",
+          "type": "integer",
+          "format": "uint",
+          "minimum": 0
+        },
+        "page_count": {
+          "description": "Total number of pages in the document.",
+          "type": "integer",
+          "format": "uint",
+          "minimum": 0
+        },
+        "reading_order_algorithm": {
+          "description": "Reading order algorithm used for this extraction.",
+          "type": [
+            "string",
+            "null"
+          ]
+        },
+        "receipts_mode": {
+          "description": "Receipts mode used for this extraction.",
+          "$ref": "#/$defs/ReceiptsMode"
+        },
+        "span_count": {
+          "description": "Number of spans extracted.",
+          "type": "integer",
+          "format": "uint",
+          "minimum": 0
         }
-      }
+      },
+      "required": [
+        "page_count",
+        "receipts_mode",
+        "span_count",
+        "block_count",
+        "error_count",
+        "diagnostics"
+      ]
+    },
+    "PageResult": {
+      "description": "Result for a single page.",
+      "type": "object",
+      "properties": {
+        "blocks": {
+          "description": "Extracted blocks (semantic units like paragraphs, headings).",
+          "type": "array",
+          "items": {
+            "$ref": "#/$defs/BlockJson"
+          }
+        },
+        "error": {
+          "description": "Error message if extraction failed for this page.",
+          "type": [
+            "string",
+            "null"
+          ]
+        },
+        "index": {
+          "description": "0-based page index.",
+          "type": "integer",
+          "format": "uint",
+          "minimum": 0
+        },
+        "spans": {
+          "description": "Extracted spans (text fragments with consistent styling).",
+          "type": "array",
+          "items": {
+            "$ref": "#/$defs/SpanJson"
+          }
+        },
+        "tables": {
+          "description": "Extracted tables (cell-level structure).\n\nThis array provides detailed table structure with rows and cells.\nTable blocks in the `blocks` array reference entries here via `table_index`.",
+          "type": "array",
+          "items": {
+            "$ref": "#/$defs/TableJson"
+          }
+        }
+      },
+      "required": [
+        "index",
+        "spans",
+        "blocks",
+        "tables"
+      ]
+    },
+    "Receipt": {
+      "description": "A visual citation receipt for extracted text.\n\nReceipts provide cryptographic proof that a piece of extracted text\noriginated from a specific region in a specific PDF. They can be\nverified independently by re-running pdftract on the original file.\n\n# Lite mode\n\nIn lite mode, `svg_clip` is `None` and the JSON output does not\ninclude the key at all (via `skip_serializing_if`). This keeps\nreceipts small (~120-180 bytes) for high-volume use cases like\nRAG citation pipelines.\n\n# SVG mode\n\nIn SVG mode, `svg_clip` contains a self-contained SVG element\nthat renders only the glyphs whose bboxes fall within the receipt\nbbox. The SVG is normalized to the bbox coordinate system and\ncan be rendered standalone in any browser.\n\n# Example\n\n```json\n{\n  \"pdf_fingerprint\": \"pdftract-v1:a7f3...\",\n  \"page_index\": 14,\n  \"bbox\": [220.0, 412.0, 412.0, 432.0],\n  \"content_hash\": \"sha256:9b21...\",\n  \"extraction_version\": \"1.0.0\"\n}\n```",
+      "type": "object",
+      "properties": {
+        "bbox": {
+          "description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where:\n- x0, y0: bottom-left corner\n- x1, y1: top-right corner\n- Units: PDF points (1/72 inch)\n\nThis is a copy of the parent span's bbox, included so the\nreceipt is self-contained.",
+          "type": "array",
+          "items": {
+            "type": "number",
+            "format": "double"
+          },
+          "maxItems": 4,
+          "minItems": 4
+        },
+        "content_hash": {
+          "description": "SHA-256 hash of the NFC-normalized text content.\n\nFormat: `\"sha256:\" + hex(SHA-256)`.\n\nThe text is normalized to NFC form before hashing to ensure\nstability across platforms that may use different Unicode\nnormalization forms (e.g., macOS HFS+/APFS sometimes round-trips\nthrough NFD).",
+          "type": "string"
+        },
+        "extraction_version": {
+          "description": "The pdftract version that produced this receipt.\n\nFormat: semver string (e.g., \"1.0.0\", \"1.0.0-rc.1\").\nTaken from `CARGO_PKG_VERSION` at compile time.",
+          "type": "string"
+        },
+        "page_index": {
+          "description": "0-based page index in the source PDF.\n\nMatches the page_index in the extraction schema.",
+          "type": "integer",
+          "format": "uint",
+          "minimum": 0
+        },
+        "pdf_fingerprint": {
+          "description": "Phase 1.7 fingerprint of the source PDF.\n\nFormat: `\"pdftract-v1:\" + hex(SHA-256)`.\nThe verifier compares this string literally (not parsed).",
+          "type": "string"
+        },
+        "svg_clip": {
+          "description": "Optional SVG clip rendering the glyphs in this receipt.\n\n- `None` in lite mode (the key is omitted from JSON entirely)\n- `Some(svg)` in SVG mode, where `svg` is a self-contained SVG element\n\nThe SVG coordinate system is normalized to the bbox itself,\nso it renders correctly in isolation.",
+          "type": [
+            "string",
+            "null"
+          ]
+        }
+      },
+      "required": [
+        "pdf_fingerprint",
+        "page_index",
+        "bbox",
+        "content_hash",
+        "extraction_version"
+      ]
+    },
+    "ReceiptsMode": {
+      "description": "Receipt generation mode.\n\nControls whether visual citation receipts are generated during extraction.",
+      "oneOf": [
+        {
+          "description": "No receipts generated (default).",
+          "type": "string",
+          "const": "off"
+        },
+        {
+          "description": "Lite mode: minimal receipts (~120 bytes each) with fingerprint, page index, bbox, and content hash.",
+          "type": "string",
+          "const": "lite"
+        },
+        {
+          "description": "SVG mode: extended receipts that include an SVG clip rendering the glyphs.",
+          "type": "string",
+          "const": "svg"
+        }
+      ]
+    },
+    "RowJson": {
+      "description": "JSON representation of a table row.\n\nA row contains a sequence of cells that form a horizontal strip\nin the table.",
+      "type": "object",
+      "properties": {
+        "bbox": {
+          "description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
+          "type": "array",
+          "items": {
+            "type": "number",
+            "format": "double"
+          },
+          "maxItems": 4,
+          "minItems": 4
+        },
+        "cells": {
+          "description": "Cells in this row, ordered left-to-right.",
+          "type": "array",
+          "items": {
+            "$ref": "#/$defs/CellJson"
+          }
+        },
+        "is_header": {
+          "description": "Whether this row is a header row.\n\nHeader rows are typically repeated when tables span multiple pages.",
+          "type": "boolean"
+        }
+      },
+      "required": [
+        "bbox",
+        "cells",
+        "is_header"
+      ]
+    },
+    "SpanJson": {
+      "description": "JSON representation of a text span.\n\nA span is the smallest unit of extracted text, representing a\ncontiguous run of text with consistent font and styling.",
+      "type": "object",
+      "properties": {
+        "bbox": {
+          "description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
+          "type": "array",
+          "items": {
+            "type": "number",
+            "format": "double"
+          },
+          "maxItems": 4,
+          "minItems": 4
+        },
+        "confidence": {
+          "description": "Optional confidence score (0.0 to 1.0).\n\nThis field is present when OCR is used or when the extraction\nhas uncertainty about the text. When confidence is not applicable,\nthis field is `null`.",
+          "type": [
+            "number",
+            "null"
+          ],
+          "format": "double"
+        },
+        "font": {
+          "description": "Font name or identifier.",
+          "type": "string"
+        },
+        "receipt": {
+          "description": "Optional cryptographic receipt for verification.\n\nThis field is present when `--receipts=lite` or `--receipts=svg`\nis enabled. When receipts are disabled, the field is `null`.",
+          "anyOf": [
+            {
+              "$ref": "#/$defs/Receipt"
+            },
+            {
+              "type": "null"
+            }
+          ]
+        },
+        "size": {
+          "description": "Font size in points.",
+          "type": "number",
+          "format": "double"
+        },
+        "text": {
+          "description": "The extracted text content.",
+          "type": "string"
+        }
+      },
+      "required": [
+        "text",
+        "bbox",
+        "font",
+        "size"
+      ]
+    },
+    "TableJson": {
+      "description": "JSON representation of a table.\n\nTables are emitted in parallel with table blocks - the block\nprovides the concatenated text and position, while the TableJson\nprovides full cell-level structure.",
+      "type": "object",
+      "properties": {
+        "bbox": {
+          "description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
+          "type": "array",
+          "items": {
+            "type": "number",
+            "format": "double"
+          },
+          "maxItems": 4,
+          "minItems": 4
+        },
+        "continued": {
+          "description": "Whether this table continues on the next page.\n\nSet to `true` when a table is split across pages and this\npage contains the first part.",
+          "type": "boolean"
+        },
+        "continued_from_prev": {
+          "description": "Whether this table is a continuation from the previous page.\n\nSet to `true` when a table is split across pages and this\npage contains a subsequent part.",
+          "type": "boolean"
+        },
+        "detection_method": {
+          "description": "Detection method used to identify this table.\n\n- \"line_based\": Table detected via ruling lines (borders)\n- \"borderless\": Table detected via x0 alignment heuristics",
+          "type": "string"
+        },
+        "header_rows": {
+          "description": "Number of contiguous header rows at the top of the table.\n\nHeader rows are typically repeated when tables span multiple pages.",
+          "type": "integer",
+          "format": "uint32",
+          "minimum": 0
+        },
+        "id": {
+          "description": "Unique identifier for this table (e.g., \"table_0\").",
+          "type": "string"
+        },
+        "page_index": {
+          "description": "Zero-based page index where this table appears.",
+          "type": "integer",
+          "format": "uint",
+          "minimum": 0
+        },
+        "rows": {
+          "description": "Rows in this table, ordered top-to-bottom.",
+          "type": "array",
+          "items": {
+            "$ref": "#/$defs/RowJson"
+          }
+        }
+      },
+      "required": [
+        "id",
+        "bbox",
+        "rows",
+        "header_rows",
+        "detection_method",
+        "continued",
+        "continued_from_prev",
+        "page_index"
+      ]
     }
   }
-}
+}
\ No newline at end of file
diff --git a/xtask/Cargo.toml b/xtask/Cargo.toml
index 1f3f0cd..a5ac24d 100644
--- a/xtask/Cargo.toml
+++ b/xtask/Cargo.toml
@@ -11,6 +11,10 @@ publish = false
 name = "xtask"
 path = "src/main.rs"
 
+[[bin]]
+name = "gen_schema"
+path = "src/bin/gen_schema.rs"
+
 [dependencies]
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
@@ -18,3 +22,5 @@ serde_yaml = "0.9"
 glob = "0.3"
 humantime = "2.1"
 lopdf = "0.34"
+schemars = "1.2"
+pdftract-core = { path = "../crates/pdftract-core", features = ["schemars"] }
diff --git a/xtask/src/bin/gen_schema.rs b/xtask/src/bin/gen_schema.rs
new file mode 100644
index 0000000..9c9ff11
--- /dev/null
+++ b/xtask/src/bin/gen_schema.rs
@@ -0,0 +1,74 @@
+//! Generate JSON Schema from Rust output types.
+//!
+//! This binary generates the canonical JSON Schema for pdftract's
+//! extraction output, which is checked into the repository at
+//! docs/schema/v1.0/pdftract.schema.json.
+//!
+//! Usage: cargo run --bin gen_schema
+
+use std::fs;
+use std::path::PathBuf;
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Resolve the repository root first, then render the schema text.
+    let repo_root = find_workspace_root();
+    let rendered = generate_schema();
+
+    // Checked-in location of the canonical schema file.
+    let target = repo_root.join("docs/schema/v1.0/pdftract.schema.json");
+
+    // Make sure the containing directory exists before writing.
+    if let Some(dir) = target.parent() {
+        fs::create_dir_all(dir)?;
+    }
+
+    // Overwrite any previous schema with the freshly rendered one.
+    fs::write(&target, rendered)?;
+
+    // Report where the file landed.
+    println!("Generated schema at: {}", target.display());
+
+    Ok(())
+}
+
+/// Locate the directory holding the workspace-level `Cargo.toml`.
+fn find_workspace_root() -> PathBuf {
+    let cwd = std::env::current_dir().unwrap();
+
+    // When launched from inside `xtask`, begin the search one level up.
+    let start = if cwd.ends_with("xtask") {
+        cwd.parent().unwrap().to_path_buf()
+    } else {
+        cwd
+    };
+
+    // Walk from `start` toward the filesystem root; the first directory
+    // whose Cargo.toml contains the text `[workspace]` wins.
+    let hit = start.ancestors().find(|dir| {
+        let manifest = dir.join("Cargo.toml");
+        // An unreadable manifest is treated as empty, i.e. not a match.
+        manifest.exists()
+            && fs::read_to_string(&manifest)
+                .unwrap_or_default()
+                .contains("[workspace]")
+    });
+
+    // No workspace manifest anywhere above: fall back to the working directory.
+    match hit {
+        Some(dir) => dir.to_path_buf(),
+        None => std::env::current_dir().unwrap(),
+    }
+}
+
+/// Render the JSON Schema of `ExtractionResult` as pretty-printed JSON text.
+fn generate_schema() -> String {
+    use pdftract_core::extract::ExtractionResult;
+
+    // Build the root schema for the top-level extraction output type.
+    // No `$schema` key is added by hand here; the macro output carries it.
+    let root = schemars::schema_for!(ExtractionResult);
+
+    // Pretty-print the schema; a serialization failure panics with the message below.
+    let rendered = serde_json::to_string_pretty(&root);
+    rendered.expect("Failed to serialize schema")
+}
diff --git a/xtask/src/main.rs b/xtask/src/main.rs
index 68b55d3..b13d682 100644
--- a/xtask/src/main.rs
+++ b/xtask/src/main.rs
@@ -104,17 +104,19 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         eprintln!("  doc-profiles                     Generate README skeletons for all profiles");
         eprintln!("  generate-stress-pdfs            Generate stress-test PDFs for memory ceiling testing");
         eprintln!("  generate-page-class-fixtures    Generate page classification test fixtures");
+        eprintln!("  gen-schema                      Generate JSON Schema from Rust output types");
         eprintln!("  memory-ceiling                  Run memory ceiling tests against perf/malformed corpora");
         std::process::exit(1);
     }
 
-    match args[1].as_str() {
+    let result = match args[1].as_str() {
         "doc-profile" => {
             if args.len() < 3 {
                 eprintln!("Usage: xtask doc-profile <profile-name>");
                 std::process::exit(1);
             }
             generate_profile_readme(&args[2])?;
+            Ok(())
         }
         "doc-profiles" => {
             let profiles_dir = find_workspace_root().join("profiles/builtin");
@@ -127,20 +129,45 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
                     }
                 }
             }
+            Ok(())
         }
         "generate-stress-pdfs" => {
             generate_stress_pdfs()?;
+            Ok(())
         }
         "generate-page-class-fixtures" => {
             generate_page_class_fixtures()?;
+            Ok(())
+        }
+        "gen-schema" => {
+            gen_schema()?;
+            Ok(())
         }
         "memory-ceiling" => {
             run_memory_ceiling_tests()?;
+            Ok(())
         }
         _ => {
             eprintln!("Unknown command: {}", args[1]);
             std::process::exit(1);
         }
+    };
+
+    result
+}
+
+/// Generate JSON Schema from Rust output types.
+///
+/// Delegates to the gen_schema binary.
+fn gen_schema() -> Result<(), Box<dyn std::error::Error>> {
+    // Invoke the gen_schema binary
+    let status = std::process::Command::new("cargo")
+        .args(["run", "--bin", "gen_schema"])
+        .current_dir(find_workspace_root())
+        .status()?;
+
+    if !status.success() {
+        return Err(format!("gen_schema failed with exit code: {:?}", status.code()).into());
     }
 
     Ok(())