feat(pdftract-zy2jx): generate JSON Schema from Rust output types

- Add schemars dependency to pdftract-core (v1.2)
- Add JsonSchema derives to output types (ExtractionResult, PageResult, ExtractionMetadata, SpanJson, BlockJson, CellJson, RowJson, TableJson, ExtractionQuality, Receipt, ReceiptsMode)
- Create xtask/src/bin/gen_schema.rs for schema generation
- Add gen-schema command to xtask main.rs
- Generate docs/schema/v1.0/pdftract.schema.json using Draft 2020-12

Schema includes:
- $schema: "https://json-schema.org/draft/2020-12/schema"
- $defs with all output type definitions
- Proper type annotations for all fields

Closes: pdftract-zy2jx
This commit is contained in:
jedarden 2026-05-24 01:29:14 -04:00
parent d723427da7
commit 92e90af0b0
10 changed files with 610 additions and 292 deletions

50
Cargo.lock generated
View file

@ -2302,7 +2302,7 @@ dependencies = [
"pdftract-core",
"regex",
"reqwest",
"schemars",
"schemars 0.8.22",
"secrecy",
"semver",
"serde",
@ -2349,6 +2349,7 @@ dependencies = [
"quick-xml",
"rayon",
"regex",
"schemars 1.2.1",
"secrecy",
"serde",
"serde_json",
@ -2967,6 +2968,26 @@ dependencies = [
"thiserror 1.0.69",
]
[[package]]
name = "ref-cast"
version = "1.0.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d"
dependencies = [
"ref-cast-impl",
]
[[package]]
name = "ref-cast-impl"
version = "1.0.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.117",
]
[[package]]
name = "regex"
version = "1.12.3"
@ -3170,7 +3191,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615"
dependencies = [
"dyn-clone",
"schemars_derive",
"schemars_derive 0.8.22",
"serde",
"serde_json",
]
[[package]]
name = "schemars"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc"
dependencies = [
"dyn-clone",
"ref-cast",
"schemars_derive 1.2.1",
"serde",
"serde_json",
]
@ -3187,6 +3221,18 @@ dependencies = [
"syn 2.0.117",
]
[[package]]
name = "schemars_derive"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d115b50f4aaeea07e79c1912f645c7513d81715d0420f8bc77a18c6260b307f"
dependencies = [
"proc-macro2",
"quote",
"serde_derive_internals",
"syn 2.0.117",
]
[[package]]
name = "scopeguard"
version = "1.2.0"

View file

@ -21,6 +21,7 @@ regex = "1.10"
secrecy = { workspace = true }
serde = { version = "1.0", features = ["derive"], optional = true }
serde_json = { version = "1.0", optional = true }
schemars = { version = "1.2", features = ["derive"], optional = true }
sha2 = "0.10"
thiserror = { workspace = true }
memchr = { workspace = true }
@ -38,7 +39,8 @@ quick-xml = { version = "0.36", optional = true }
[features]
default = ["serde"]
serde = ["dep:serde", "dep:serde_json"]
serde = ["dep:serde", "dep:serde_json", "dep:schemars"]
schemars = ["dep:schemars", "serde"]
receipts = [] # Enable visual citation receipts (SVG clip generation)
ocr = ["dep:image", "dep:leptonica-plumbing", "dep:quick-xml"] # Enable OCR path (image compositing + preprocessing + HOCR parsing)
full-render = ["dep:pdfium-render", "ocr"] # Enable PDFium-based rendering (requires ocr)

View file

@ -28,6 +28,8 @@ use anyhow::{Context, Result};
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use serde_json::json;
#[cfg(feature = "schemars")]
use schemars::JsonSchema;
use std::sync::Arc;
use crate::parser::stream::FileSource;
@ -102,6 +104,7 @@ fn decode_page_content_streams(
///
/// Contains the extracted pages, spans, blocks, and metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
pub struct ExtractionResult {
/// The PDF fingerprint (for receipt generation).
pub fingerprint: String,
@ -113,6 +116,7 @@ pub struct ExtractionResult {
/// Result for a single page.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
pub struct PageResult {
/// 0-based page index.
pub index: usize,
@ -177,6 +181,7 @@ impl From<PageResultInternal> for PageResult {
/// Metadata about the extraction process.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
pub struct ExtractionMetadata {
/// Total number of pages in the document.
pub page_count: usize,

View file

@ -4,11 +4,14 @@
//! including the receipts mode for cryptographic provenance tracking.
use serde::{Deserialize, Serialize};
#[cfg(feature = "schemars")]
use schemars::JsonSchema;
/// Receipt generation mode.
///
/// Controls whether visual citation receipts are generated during extraction.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
#[serde(rename_all = "lowercase")]
pub enum ReceiptsMode {
/// No receipts generated (default).

View file

@ -26,6 +26,8 @@ pub mod svg;
pub mod verifier;
use serde::{Deserialize, Serialize};
#[cfg(feature = "schemars")]
use schemars::JsonSchema;
/// A visual citation receipt for extracted text.
///
@ -59,6 +61,7 @@ use serde::{Deserialize, Serialize};
/// }
/// ```
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
pub struct Receipt {
/// Phase 1.7 fingerprint of the source PDF.
///

View file

@ -18,6 +18,8 @@
use serde::{Deserialize, Serialize};
use serde_json::json;
#[cfg(feature = "schemars")]
use schemars::JsonSchema;
use crate::receipts::Receipt;
@ -26,6 +28,7 @@ use crate::receipts::Receipt;
/// A span is the smallest unit of extracted text, representing a
/// contiguous run of text with consistent font and styling.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
pub struct SpanJson {
/// The extracted text content.
pub text: String,
@ -64,6 +67,7 @@ pub struct SpanJson {
/// spans. Examples include paragraphs, headings, list items, and
/// table cells.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
pub struct BlockJson {
/// The block kind/type.
///
@ -112,6 +116,7 @@ pub type SpanRef = usize;
/// A cell represents a single unit within a table row, containing
/// its text content, bounding box, and position information.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
pub struct CellJson {
/// Bounding box in PDF user-space points.
///
@ -163,6 +168,7 @@ fn default_one() -> u32 {
/// A row contains a sequence of cells that form a horizontal strip
/// in the table.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
pub struct RowJson {
/// Bounding box in PDF user-space points.
///
@ -185,6 +191,7 @@ pub struct RowJson {
/// provides the concatenated text and position, while the TableJson
/// provides full cell-level structure.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
pub struct TableJson {
/// Unique identifier for this table (e.g., "table_0").
pub id: String,
@ -231,6 +238,7 @@ pub struct TableJson {
/// in the root metadata (full JSON mode). It provides aggregate
/// quality signals across all pages.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
pub struct ExtractionQuality {
/// Overall quality assessment: "high", "medium", "low", or "none".
///

View file

@ -1,345 +1,489 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://pdftract.ardenone.com/schemas/v1.0/pdftract.schema.json",
"title": "PDFtract Extraction Output Schema v1.0",
"description": "JSON output schema for PDF text and structure extraction",
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "ExtractionResult",
"description": "Result of a PDF extraction operation.\n\nContains the extracted pages, spans, blocks, and metadata.",
"type": "object",
"required": ["fingerprint", "schema_version", "pages", "metadata"],
"properties": {
"fingerprint": {
"type": "string",
"description": "PDF fingerprint for verification (format: pdftract-v1:<hex>)"
},
"schema_version": {
"type": "string",
"description": "Schema version (e.g., '1.0')",
"enum": ["1.0"]
},
"pages": {
"type": "array",
"description": "Extracted pages",
"items": {
"$ref": "#/definitions/page"
}
"description": "The PDF fingerprint (for receipt generation).",
"type": "string"
},
"metadata": {
"$ref": "#/definitions/metadata"
"description": "Metadata about the extraction.",
"$ref": "#/$defs/ExtractionMetadata"
},
"pages": {
"description": "Extracted pages, each containing spans and blocks.",
"type": "array",
"items": {
"$ref": "#/$defs/PageResult"
}
}
},
"definitions": {
"page": {
"required": [
"fingerprint",
"pages",
"metadata"
],
"$defs": {
"BlockJson": {
"description": "JSON representation of a structural block.\n\nA block is a higher-level semantic unit composed of one or more\nspans. Examples include paragraphs, headings, list items, and\ntable cells.",
"type": "object",
"required": ["index", "spans", "blocks", "tables"],
"properties": {
"index": {
"type": "integer",
"description": "0-based page index"
},
"spans": {
"type": "array",
"description": "Extracted text spans",
"items": {
"$ref": "#/definitions/span"
}
},
"blocks": {
"type": "array",
"description": "Extracted structural blocks",
"items": {
"$ref": "#/definitions/block"
}
},
"tables": {
"type": "array",
"description": "Extracted tables (cell-level structure)",
"items": {
"$ref": "#/definitions/table"
}
},
"error": {
"type": "string",
"description": "Error message if extraction failed for this page"
}
}
},
"span": {
"type": "object",
"required": ["text", "bbox", "font", "size"],
"properties": {
"text": {
"type": "string",
"description": "The extracted text content"
},
"bbox": {
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
"type": "array",
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
"items": {
"type": "number"
},
"minItems": 4,
"maxItems": 4
},
"font": {
"type": "string",
"description": "Font name or identifier"
},
"size": {
"type": "number",
"description": "Font size in points"
"format": "double"
},
"confidence": {
"type": "number",
"description": "Confidence score (0.0 to 1.0) for OCR text",
"minimum": 0.0,
"maximum": 1.0
"maxItems": 4,
"minItems": 4
},
"receipt": {
"$ref": "#/definitions/receipt"
}
}
},
"block": {
"type": "object",
"required": ["kind", "text", "bbox"],
"properties": {
"kind": {
"type": "string",
"description": "Block kind/type",
"enum": ["paragraph", "heading", "list", "table", "figure"]
},
"text": {
"type": "string",
"description": "The concatenated text content of all spans in the block"
},
"bbox": {
"type": "array",
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
"items": {
"type": "number"
},
"minItems": 4,
"maxItems": 4
"description": "The block kind/type.\n\nCommon values: \"paragraph\", \"heading\", \"list\", \"table\", \"figure\".",
"type": "string"
},
"level": {
"type": "integer",
"description": "Heading level (1-6) for 'heading' kind blocks",
"minimum": 1,
"maximum": 6
},
"table_index": {
"type": "integer",
"description": "Table index for 'table' kind blocks (points to tables array)",
"description": "Optional heading level (1-6) for \"heading\" kind blocks.\n\nThis field is present only for heading blocks. For paragraphs\nand other block types, it is `null`.",
"type": [
"integer",
"null"
],
"format": "uint8",
"maximum": 255,
"minimum": 0
},
"receipt": {
"$ref": "#/definitions/receipt"
}
"description": "Optional cryptographic receipt for verification.\n\nThis field is present when `--receipts=lite` or `--receipts=svg`\nis enabled. When receipts are disabled, the field is `null`.",
"anyOf": [
{
"$ref": "#/$defs/Receipt"
},
{
"type": "null"
}
]
},
"table": {
"type": "object",
"required": ["id", "bbox", "rows", "header_rows", "detection_method", "continued", "continued_from_prev", "page_index"],
"properties": {
"id": {
"type": "string",
"description": "Unique identifier for this table (e.g., 'table_0')"
},
"bbox": {
"type": "array",
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
"items": {
"type": "number"
},
"minItems": 4,
"maxItems": 4
},
"rows": {
"type": "array",
"description": "Rows in this table, ordered top-to-bottom",
"items": {
"$ref": "#/definitions/row"
}
},
"header_rows": {
"type": "integer",
"description": "Number of contiguous header rows at the top of the table",
"table_index": {
"description": "Optional table index for \"table\" kind blocks.\n\nThis field is present only for table blocks and points to the\ncorresponding entry in the page's `tables` array.",
"type": [
"integer",
"null"
],
"format": "uint",
"minimum": 0
},
"detection_method": {
"type": "string",
"description": "Detection method used to identify this table",
"enum": ["line_based", "borderless"]
},
"continued": {
"type": "boolean",
"description": "Whether this table continues on the next page"
},
"continued_from_prev": {
"type": "boolean",
"description": "Whether this table is a continuation from the previous page"
},
"page_index": {
"type": "integer",
"description": "Zero-based page index where this table appears",
"minimum": 0
}
}
},
"row": {
"type": "object",
"required": ["bbox", "cells", "is_header"],
"properties": {
"bbox": {
"type": "array",
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
"items": {
"type": "number"
},
"minItems": 4,
"maxItems": 4
},
"cells": {
"type": "array",
"description": "Cells in this row, ordered left-to-right",
"items": {
"$ref": "#/definitions/cell"
}
},
"is_header": {
"type": "boolean",
"description": "Whether this row is a header row"
}
}
},
"cell": {
"type": "object",
"required": ["bbox", "text", "spans", "row", "col", "rowspan", "colspan", "is_header_row"],
"properties": {
"bbox": {
"type": "array",
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
"items": {
"type": "number"
},
"minItems": 4,
"maxItems": 4
},
"text": {
"type": "string",
"description": "The concatenated text content of all spans in the cell"
},
"spans": {
"type": "array",
"description": "References to spans in the page's spans array",
"items": {
"type": "integer"
"description": "The concatenated text content of all spans in the block.",
"type": "string"
}
},
"row": {
"type": "integer",
"description": "Zero-based row index within the table",
"minimum": 0
"required": [
"kind",
"text",
"bbox"
]
},
"CellJson": {
"description": "JSON representation of a table cell.\n\nA cell represents a single unit within a table row, containing\nits text content, bounding box, and position information.",
"type": "object",
"properties": {
"bbox": {
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
"type": "array",
"items": {
"type": "number",
"format": "double"
},
"maxItems": 4,
"minItems": 4
},
"col": {
"description": "Zero-based column index within the table.",
"type": "integer",
"description": "Zero-based column index within the table",
"format": "uint",
"minimum": 0
},
"colspan": {
"description": "Number of columns this cell spans (default 1).\n\nValues greater than 1 indicate a merged cell that spans\nmultiple columns horizontally.",
"type": "integer",
"format": "uint32",
"default": 1,
"minimum": 0
},
"is_header_row": {
"description": "Whether this cell is in a header row.\n\nHeader cells are typically rendered differently (bold, centered)\nand may be reused when tables span multiple pages.",
"type": "boolean"
},
"row": {
"description": "Zero-based row index within the table.",
"type": "integer",
"format": "uint",
"minimum": 0
},
"rowspan": {
"description": "Number of rows this cell spans (default 1).\n\nValues greater than 1 indicate a merged cell that spans\nmultiple rows vertically.",
"type": "integer",
"description": "Number of rows this cell spans (default 1)",
"minimum": 1
"format": "uint32",
"default": 1,
"minimum": 0
},
"colspan": {
"type": "integer",
"description": "Number of columns this cell spans (default 1)",
"minimum": 1
},
"is_header_row": {
"type": "boolean",
"description": "Whether this cell is in a header row"
}
}
},
"receipt": {
"type": "object",
"required": ["pdf_fingerprint", "page_index", "bbox", "content_hash", "extraction_version"],
"properties": {
"pdf_fingerprint": {
"type": "string",
"description": "The PDF fingerprint"
},
"page_index": {
"type": "integer",
"description": "The page index"
},
"bbox": {
"spans": {
"description": "References to spans in the page's `spans` array.\n\nThese indices point to the spans that make up this cell's content.",
"type": "array",
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
"items": {
"type": "number"
},
"minItems": 4,
"maxItems": 4
},
"content_hash": {
"type": "string",
"description": "SHA-256 hash of the content"
},
"extraction_version": {
"type": "string",
"description": "Version string of the extractor"
},
"svg_clip": {
"type": "string",
"description": "SVG clip path for verification (present only in SvgClip mode)"
}
"type": "integer",
"format": "uint",
"minimum": 0
}
},
"metadata": {
"text": {
"description": "The concatenated text content of all spans in the cell.",
"type": "string"
}
},
"required": [
"bbox",
"text",
"spans",
"row",
"col",
"is_header_row"
]
},
"ExtractionMetadata": {
"description": "Metadata about the extraction process.",
"type": "object",
"required": ["page_count", "span_count", "block_count"],
"properties": {
"page_count": {
"type": "integer",
"description": "Total number of pages in the document"
},
"span_count": {
"type": "integer",
"description": "Number of spans extracted"
},
"block_count": {
"description": "Number of blocks extracted.",
"type": "integer",
"description": "Number of blocks extracted"
},
"cache_status": {
"type": "string",
"description": "Cache status: 'hit', 'miss', or 'skipped'",
"enum": ["hit", "miss", "skipped"]
"format": "uint",
"minimum": 0
},
"cache_age_seconds": {
"type": "integer",
"description": "Cache entry age in seconds (only present when cache_status == 'hit')",
"description": "Cache entry age in seconds (only present when cache_status == \"hit\")",
"type": [
"integer",
"null"
],
"format": "uint64",
"minimum": 0
},
"error_count": {
"type": "integer",
"description": "Number of pages that failed to extract",
"minimum": 0
},
"reading_order_algorithm": {
"type": "string",
"description": "Reading order algorithm used for this extraction",
"enum": ["struct_tree", "xy_cut"]
"cache_status": {
"description": "Cache status: \"hit\", \"miss\", or \"skipped\"",
"type": [
"string",
"null"
]
},
"diagnostics": {
"description": "Diagnostics emitted during extraction (coverage warnings, etc.)",
"type": "array",
"description": "Diagnostics emitted during extraction",
"items": {
"type": "string"
}
}
}
},
"error_count": {
"description": "Number of pages that failed to extract.",
"type": "integer",
"format": "uint",
"minimum": 0
},
"page_count": {
"description": "Total number of pages in the document.",
"type": "integer",
"format": "uint",
"minimum": 0
},
"reading_order_algorithm": {
"description": "Reading order algorithm used for this extraction.",
"type": [
"string",
"null"
]
},
"receipts_mode": {
"description": "Receipts mode used for this extraction.",
"$ref": "#/$defs/ReceiptsMode"
},
"span_count": {
"description": "Number of spans extracted.",
"type": "integer",
"format": "uint",
"minimum": 0
}
},
"required": [
"page_count",
"receipts_mode",
"span_count",
"block_count",
"error_count",
"diagnostics"
]
},
"PageResult": {
"description": "Result for a single page.",
"type": "object",
"properties": {
"blocks": {
"description": "Extracted blocks (semantic units like paragraphs, headings).",
"type": "array",
"items": {
"$ref": "#/$defs/BlockJson"
}
},
"error": {
"description": "Error message if extraction failed for this page.",
"type": [
"string",
"null"
]
},
"index": {
"description": "0-based page index.",
"type": "integer",
"format": "uint",
"minimum": 0
},
"spans": {
"description": "Extracted spans (text fragments with consistent styling).",
"type": "array",
"items": {
"$ref": "#/$defs/SpanJson"
}
},
"tables": {
"description": "Extracted tables (cell-level structure).\n\nThis array provides detailed table structure with rows and cells.\nTable blocks in the `blocks` array reference entries here via `table_index`.",
"type": "array",
"items": {
"$ref": "#/$defs/TableJson"
}
}
},
"required": [
"index",
"spans",
"blocks",
"tables"
]
},
"Receipt": {
"description": "A visual citation receipt for extracted text.\n\nReceipts provide cryptographic proof that a piece of extracted text\noriginated from a specific region in a specific PDF. They can be\nverified independently by re-running pdftract on the original file.\n\n# Lite mode\n\nIn lite mode, `svg_clip` is `None` and the JSON output does not\ninclude the key at all (via `skip_serializing_if`). This keeps\nreceipts small (~120-180 bytes) for high-volume use cases like\nRAG citation pipelines.\n\n# SVG mode\n\nIn SVG mode, `svg_clip` contains a self-contained SVG element\nthat renders only the glyphs whose bboxes fall within the receipt\nbbox. The SVG is normalized to the bbox coordinate system and\ncan be rendered standalone in any browser.\n\n# Example\n\n```json\n{\n \"pdf_fingerprint\": \"pdftract-v1:a7f3...\",\n \"page_index\": 14,\n \"bbox\": [220.0, 412.0, 412.0, 432.0],\n \"content_hash\": \"sha256:9b21...\",\n \"extraction_version\": \"1.0.0\"\n}\n```",
"type": "object",
"properties": {
"bbox": {
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where:\n- x0, y0: bottom-left corner\n- x1, y1: top-right corner\n- Units: PDF points (1/72 inch)\n\nThis is a copy of the parent span's bbox, included so the\nreceipt is self-contained.",
"type": "array",
"items": {
"type": "number",
"format": "double"
},
"maxItems": 4,
"minItems": 4
},
"content_hash": {
"description": "SHA-256 hash of the NFC-normalized text content.\n\nFormat: `\"sha256:\" + hex(SHA-256)`.\n\nThe text is normalized to NFC form before hashing to ensure\nstability across platforms that may use different Unicode\nnormalization forms (e.g., macOS HFS+/APFS sometimes round-trips\nthrough NFD).",
"type": "string"
},
"extraction_version": {
"description": "The pdftract version that produced this receipt.\n\nFormat: semver string (e.g., \"1.0.0\", \"1.0.0-rc.1\").\nTaken from `CARGO_PKG_VERSION` at compile time.",
"type": "string"
},
"page_index": {
"description": "0-based page index in the source PDF.\n\nMatches the page_index in the extraction schema.",
"type": "integer",
"format": "uint",
"minimum": 0
},
"pdf_fingerprint": {
"description": "Phase 1.7 fingerprint of the source PDF.\n\nFormat: `\"pdftract-v1:\" + hex(SHA-256)`.\nThe verifier compares this string literally (not parsed).",
"type": "string"
},
"svg_clip": {
"description": "Optional SVG clip rendering the glyphs in this receipt.\n\n- `None` in lite mode (the key is omitted from JSON entirely)\n- `Some(svg)` in SVG mode, where `svg` is a self-contained SVG element\n\nThe SVG coordinate system is normalized to the bbox itself,\nso it renders correctly in isolation.",
"type": [
"string",
"null"
]
}
},
"required": [
"pdf_fingerprint",
"page_index",
"bbox",
"content_hash",
"extraction_version"
]
},
"ReceiptsMode": {
"description": "Receipt generation mode.\n\nControls whether visual citation receipts are generated during extraction.",
"oneOf": [
{
"description": "No receipts generated (default).",
"type": "string",
"const": "off"
},
{
"description": "Lite mode: minimal receipts (~120 bytes each) with fingerprint, page index, bbox, and content hash.",
"type": "string",
"const": "lite"
},
{
"description": "SVG mode: extended receipts that include an SVG clip rendering the glyphs.",
"type": "string",
"const": "svg"
}
]
},
"RowJson": {
"description": "JSON representation of a table row.\n\nA row contains a sequence of cells that form a horizontal strip\nin the table.",
"type": "object",
"properties": {
"bbox": {
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
"type": "array",
"items": {
"type": "number",
"format": "double"
},
"maxItems": 4,
"minItems": 4
},
"cells": {
"description": "Cells in this row, ordered left-to-right.",
"type": "array",
"items": {
"$ref": "#/$defs/CellJson"
}
},
"is_header": {
"description": "Whether this row is a header row.\n\nHeader rows are typically repeated when tables span multiple pages.",
"type": "boolean"
}
},
"required": [
"bbox",
"cells",
"is_header"
]
},
"SpanJson": {
"description": "JSON representation of a text span.\n\nA span is the smallest unit of extracted text, representing a\ncontiguous run of text with consistent font and styling.",
"type": "object",
"properties": {
"bbox": {
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
"type": "array",
"items": {
"type": "number",
"format": "double"
},
"maxItems": 4,
"minItems": 4
},
"confidence": {
"description": "Optional confidence score (0.0 to 1.0).\n\nThis field is present when OCR is used or when the extraction\nhas uncertainty about the text. When confidence is not applicable,\nthis field is `null`.",
"type": [
"number",
"null"
],
"format": "double"
},
"font": {
"description": "Font name or identifier.",
"type": "string"
},
"receipt": {
"description": "Optional cryptographic receipt for verification.\n\nThis field is present when `--receipts=lite` or `--receipts=svg`\nis enabled. When receipts are disabled, the field is `null`.",
"anyOf": [
{
"$ref": "#/$defs/Receipt"
},
{
"type": "null"
}
]
},
"size": {
"description": "Font size in points.",
"type": "number",
"format": "double"
},
"text": {
"description": "The extracted text content.",
"type": "string"
}
},
"required": [
"text",
"bbox",
"font",
"size"
]
},
"TableJson": {
"description": "JSON representation of a table.\n\nTables are emitted in parallel with table blocks - the block\nprovides the concatenated text and position, while the TableJson\nprovides full cell-level structure.",
"type": "object",
"properties": {
"bbox": {
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
"type": "array",
"items": {
"type": "number",
"format": "double"
},
"maxItems": 4,
"minItems": 4
},
"continued": {
"description": "Whether this table continues on the next page.\n\nSet to `true` when a table is split across pages and this\npage contains the first part.",
"type": "boolean"
},
"continued_from_prev": {
"description": "Whether this table is a continuation from the previous page.\n\nSet to `true` when a table is split across pages and this\npage contains a subsequent part.",
"type": "boolean"
},
"detection_method": {
"description": "Detection method used to identify this table.\n\n- \"line_based\": Table detected via ruling lines (borders)\n- \"borderless\": Table detected via x0 alignment heuristics",
"type": "string"
},
"header_rows": {
"description": "Number of contiguous header rows at the top of the table.\n\nHeader rows are typically repeated when tables span multiple pages.",
"type": "integer",
"format": "uint32",
"minimum": 0
},
"id": {
"description": "Unique identifier for this table (e.g., \"table_0\").",
"type": "string"
},
"page_index": {
"description": "Zero-based page index where this table appears.",
"type": "integer",
"format": "uint",
"minimum": 0
},
"rows": {
"description": "Rows in this table, ordered top-to-bottom.",
"type": "array",
"items": {
"$ref": "#/$defs/RowJson"
}
}
},
"required": [
"id",
"bbox",
"rows",
"header_rows",
"detection_method",
"continued",
"continued_from_prev",
"page_index"
]
}
}
}

View file

@ -11,6 +11,10 @@ publish = false
name = "xtask"
path = "src/main.rs"
[[bin]]
name = "gen_schema"
path = "src/bin/gen_schema.rs"
[dependencies]
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
@ -18,3 +22,5 @@ serde_yaml = "0.9"
glob = "0.3"
humantime = "2.1"
lopdf = "0.34"
schemars = "1.2"
pdftract-core = { path = "../crates/pdftract-core", features = ["schemars"] }

View file

@ -0,0 +1,74 @@
//! Generate JSON Schema from Rust output types.
//!
//! This binary generates the canonical JSON Schema for pdftract's
//! extraction output, which is checked into the repository at
//! docs/schema/v1.0/pdftract.schema.json.
//!
//! Usage: cargo run --bin gen_schema
use std::fs;
use std::path::PathBuf;
/// Entry point: regenerate the checked-in JSON Schema file.
///
/// Resolves the workspace root, renders the schema as a JSON string, and
/// writes it to `docs/schema/v1.0/pdftract.schema.json` beneath that root,
/// creating any missing intermediate directories first.
///
/// # Errors
///
/// Propagates any I/O error raised while creating the output directory or
/// writing the schema file.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let root = find_workspace_root();
    let rendered = generate_schema();

    // Output location, relative to the workspace root.
    let target = root.join("docs/schema/v1.0/pdftract.schema.json");

    // Ensure the containing directory exists before writing.
    if let Some(dir) = target.parent() {
        fs::create_dir_all(dir)?;
    }

    fs::write(&target, rendered)?;
    println!("Generated schema at: {}", target.display());

    Ok(())
}
/// Locate the workspace root directory.
///
/// The search starts at the process's current directory (or its parent,
/// when the current directory is named `xtask`) and walks toward the
/// filesystem root. The first directory whose `Cargo.toml` contains the
/// text `[workspace]` is returned. If no such directory is found, the
/// current directory is returned instead.
///
/// A `Cargo.toml` that exists but cannot be read is treated as empty and
/// therefore skipped.
///
/// # Panics
///
/// Panics if the current directory cannot be determined.
fn find_workspace_root() -> PathBuf {
    let cwd = std::env::current_dir().unwrap();

    // When launched from inside `xtask`, begin the search one level up.
    let start = if cwd.ends_with("xtask") {
        cwd.parent().unwrap().to_path_buf()
    } else {
        cwd
    };

    // `ancestors()` yields `start` itself first, then each parent in turn.
    start
        .ancestors()
        .find(|dir| {
            let manifest = dir.join("Cargo.toml");
            manifest.exists()
                && fs::read_to_string(&manifest)
                    .unwrap_or_default()
                    .contains("[workspace]")
        })
        .map(|dir| dir.to_path_buf())
        // No workspace manifest anywhere above: fall back to the current directory.
        .unwrap_or_else(|| std::env::current_dir().unwrap())
}
/// Generate the JSON Schema for pdftract extraction output.
fn generate_schema() -> String {
use pdftract_core::extract::ExtractionResult;
use schemars::schema_for;
let schema = schema_for!(ExtractionResult);
// Convert to JSON string
// The schema_for! macro already includes the $schema field
serde_json::to_string_pretty(&schema)
.expect("Failed to serialize schema")
}

View file

@ -104,17 +104,19 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
eprintln!(" doc-profiles Generate README skeletons for all profiles");
eprintln!(" generate-stress-pdfs Generate stress-test PDFs for memory ceiling testing");
eprintln!(" generate-page-class-fixtures Generate page classification test fixtures");
eprintln!(" gen-schema Generate JSON Schema from Rust output types");
eprintln!(" memory-ceiling Run memory ceiling tests against perf/malformed corpora");
std::process::exit(1);
}
match args[1].as_str() {
let result = match args[1].as_str() {
"doc-profile" => {
if args.len() < 3 {
eprintln!("Usage: xtask doc-profile <profile-name>");
std::process::exit(1);
}
generate_profile_readme(&args[2])?;
Ok(())
}
"doc-profiles" => {
let profiles_dir = find_workspace_root().join("profiles/builtin");
@ -127,20 +129,45 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
}
}
}
Ok(())
}
"generate-stress-pdfs" => {
generate_stress_pdfs()?;
Ok(())
}
"generate-page-class-fixtures" => {
generate_page_class_fixtures()?;
Ok(())
}
"gen-schema" => {
gen_schema()?;
Ok(())
}
"memory-ceiling" => {
run_memory_ceiling_tests()?;
Ok(())
}
_ => {
eprintln!("Unknown command: {}", args[1]);
std::process::exit(1);
}
};
result
}
/// Generate JSON Schema from Rust output types.
///
/// Delegates to the gen_schema binary.
fn gen_schema() -> Result<(), Box<dyn std::error::Error>> {
// Invoke the gen_schema binary
let status = std::process::Command::new("cargo")
.args(["run", "--bin", "gen_schema"])
.current_dir(find_workspace_root())
.status()?;
if !status.success() {
return Err(format!("gen_schema failed with exit code: {:?}", status.code()).into());
}
Ok(())