pdftract/docs/schema/v1.0/pdftract.schema.json
jedarden 92e90af0b0 feat(pdftract-zy2jx): generate JSON Schema from Rust output types
- Add schemars dependency to pdftract-core (v1.2)
- Add JsonSchema derives to output types (ExtractionResult, PageResult, ExtractionMetadata, SpanJson, BlockJson, CellJson, RowJson, TableJson, ExtractionQuality, Receipt, ReceiptsMode)
- Create xtask/src/bin/gen_schema.rs for schema generation
- Add gen-schema command to xtask main.rs
- Generate docs/schema/v1.0/pdftract.schema.json using Draft 2020-12

Schema includes:
- $schema: "https://json-schema.org/draft/2020-12/schema"
- $defs with all output type definitions
- Proper type annotations for all fields

Closes: pdftract-zy2jx
2026-05-24 01:29:14 -04:00

489 lines
No EOL
18 KiB
JSON

{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "ExtractionResult",
"description": "Result of a PDF extraction operation.\n\nContains the PDF fingerprint, the extracted pages (each with its spans, blocks, and tables), and metadata about the extraction.",
"type": "object",
"properties": {
"fingerprint": {
"description": "The PDF fingerprint (for receipt generation).",
"type": "string"
},
"metadata": {
"description": "Metadata about the extraction.",
"$ref": "#/$defs/ExtractionMetadata"
},
"pages": {
"description": "Extracted pages, each containing spans, blocks, and tables.",
"type": "array",
"items": {
"$ref": "#/$defs/PageResult"
}
}
},
"required": [
"fingerprint",
"pages",
"metadata"
],
"$defs": {
"BlockJson": {
"description": "JSON representation of a structural block.\n\nA block is a higher-level semantic unit composed of one or more\nspans. Examples include paragraphs, headings, lists, tables, and\nfigures (see `kind`).",
"type": "object",
"properties": {
"bbox": {
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
"type": "array",
"items": {
"type": "number",
"format": "double"
},
"maxItems": 4,
"minItems": 4
},
"kind": {
"description": "The block kind/type.\n\nCommon values: \"paragraph\", \"heading\", \"list\", \"table\", \"figure\".",
"type": "string"
},
"level": {
"description": "Optional heading level (1-6) for \"heading\" kind blocks.\n\nThis field is set only for heading blocks. For paragraphs\nand other block types, it is `null`.",
"type": [
"integer",
"null"
],
"format": "uint8",
"maximum": 255,
"minimum": 0
},
"receipt": {
"description": "Optional cryptographic receipt for verification.\n\nThis field is present when `--receipts=lite` or `--receipts=svg`\nis enabled. When receipts are disabled, the field is `null`.",
"anyOf": [
{
"$ref": "#/$defs/Receipt"
},
{
"type": "null"
}
]
},
"table_index": {
"description": "Optional table index for \"table\" kind blocks.\n\nThis field is present only for table blocks and points to the\ncorresponding entry in the page's `tables` array.",
"type": [
"integer",
"null"
],
"format": "uint",
"minimum": 0
},
"text": {
"description": "The concatenated text content of all spans in the block.",
"type": "string"
}
},
"required": [
"kind",
"text",
"bbox"
]
},
"CellJson": {
"description": "JSON representation of a table cell.\n\nA cell represents a single unit within a table row, containing\nits text content, bounding box, and position information.",
"type": "object",
"properties": {
"bbox": {
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
"type": "array",
"items": {
"type": "number",
"format": "double"
},
"maxItems": 4,
"minItems": 4
},
"col": {
"description": "Zero-based column index within the table.",
"type": "integer",
"format": "uint",
"minimum": 0
},
"colspan": {
"description": "Number of columns this cell spans (default 1).\n\nValues greater than 1 indicate a merged cell that spans\nmultiple columns horizontally.",
"type": "integer",
"format": "uint32",
"default": 1,
"minimum": 0
},
"is_header_row": {
"description": "Whether this cell is in a header row.\n\nHeader cells are typically rendered differently (bold, centered)\nand may be repeated when tables span multiple pages.",
"type": "boolean"
},
"row": {
"description": "Zero-based row index within the table.",
"type": "integer",
"format": "uint",
"minimum": 0
},
"rowspan": {
"description": "Number of rows this cell spans (default 1).\n\nValues greater than 1 indicate a merged cell that spans\nmultiple rows vertically.",
"type": "integer",
"format": "uint32",
"default": 1,
"minimum": 0
},
"spans": {
"description": "References to spans in the page's `spans` array.\n\nThese indices point to the spans that make up this cell's content.",
"type": "array",
"items": {
"type": "integer",
"format": "uint",
"minimum": 0
}
},
"text": {
"description": "The concatenated text content of all spans in the cell.",
"type": "string"
}
},
"required": [
"bbox",
"text",
"spans",
"row",
"col",
"is_header_row"
]
},
"ExtractionMetadata": {
"description": "Metadata about the extraction process.",
"type": "object",
"properties": {
"block_count": {
"description": "Number of blocks extracted.",
"type": "integer",
"format": "uint",
"minimum": 0
},
"cache_age_seconds": {
"description": "Cache entry age in seconds (only present when `cache_status` is \"hit\").",
"type": [
"integer",
"null"
],
"format": "uint64",
"minimum": 0
},
"cache_status": {
"description": "Cache status: \"hit\", \"miss\", or \"skipped\".",
"type": [
"string",
"null"
]
},
"diagnostics": {
"description": "Diagnostics emitted during extraction (coverage warnings, etc.)",
"type": "array",
"items": {
"type": "string"
}
},
"error_count": {
"description": "Number of pages that failed to extract.",
"type": "integer",
"format": "uint",
"minimum": 0
},
"page_count": {
"description": "Total number of pages in the document.",
"type": "integer",
"format": "uint",
"minimum": 0
},
"reading_order_algorithm": {
"description": "Reading order algorithm used for this extraction.",
"type": [
"string",
"null"
]
},
"receipts_mode": {
"description": "Receipts mode used for this extraction.",
"$ref": "#/$defs/ReceiptsMode"
},
"span_count": {
"description": "Number of spans extracted.",
"type": "integer",
"format": "uint",
"minimum": 0
}
},
"required": [
"page_count",
"receipts_mode",
"span_count",
"block_count",
"error_count",
"diagnostics"
]
},
"PageResult": {
"description": "Result for a single page.",
"type": "object",
"properties": {
"blocks": {
"description": "Extracted blocks (semantic units like paragraphs, headings).",
"type": "array",
"items": {
"$ref": "#/$defs/BlockJson"
}
},
"error": {
"description": "Error message if extraction failed for this page.",
"type": [
"string",
"null"
]
},
"index": {
"description": "0-based page index.",
"type": "integer",
"format": "uint",
"minimum": 0
},
"spans": {
"description": "Extracted spans (text fragments with consistent styling).",
"type": "array",
"items": {
"$ref": "#/$defs/SpanJson"
}
},
"tables": {
"description": "Extracted tables (cell-level structure).\n\nThis array provides detailed table structure with rows and cells.\nTable blocks in the `blocks` array reference entries here via `table_index`.",
"type": "array",
"items": {
"$ref": "#/$defs/TableJson"
}
}
},
"required": [
"index",
"spans",
"blocks",
"tables"
]
},
"Receipt": {
"description": "A visual citation receipt for extracted text.\n\nReceipts provide cryptographic proof that a piece of extracted text\noriginated from a specific region in a specific PDF. They can be\nverified independently by re-running pdftract on the original file.\n\n# Lite mode\n\nIn lite mode, `svg_clip` is `None` and the JSON output does not\ninclude the key at all (via `skip_serializing_if`). This keeps\nreceipts small (~120-180 bytes) for high-volume use cases like\nRAG citation pipelines.\n\n# SVG mode\n\nIn SVG mode, `svg_clip` contains a self-contained SVG element\nthat renders only the glyphs whose bboxes fall within the receipt\nbbox. The SVG is normalized to the bbox coordinate system and\ncan be rendered standalone in any browser.\n\n# Example\n\n```json\n{\n \"pdf_fingerprint\": \"pdftract-v1:a7f3...\",\n \"page_index\": 14,\n \"bbox\": [220.0, 412.0, 412.0, 432.0],\n \"content_hash\": \"sha256:9b21...\",\n \"extraction_version\": \"1.0.0\"\n}\n```",
"type": "object",
"properties": {
"bbox": {
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where:\n- x0, y0: bottom-left corner\n- x1, y1: top-right corner\n- Units: PDF points (1/72 inch)\n\nThis is a copy of the parent span's bbox, included so the\nreceipt is self-contained.",
"type": "array",
"items": {
"type": "number",
"format": "double"
},
"maxItems": 4,
"minItems": 4
},
"content_hash": {
"description": "SHA-256 hash of the NFC-normalized text content.\n\nFormat: `\"sha256:\" + hex(SHA-256)`.\n\nThe text is normalized to NFC form before hashing to ensure\nstability across platforms that may use different Unicode\nnormalization forms (e.g., macOS HFS+/APFS sometimes round-trips\nthrough NFD).",
"type": "string"
},
"extraction_version": {
"description": "The pdftract version that produced this receipt.\n\nFormat: semver string (e.g., \"1.0.0\", \"1.0.0-rc.1\").\nTaken from `CARGO_PKG_VERSION` at compile time.",
"type": "string"
},
"page_index": {
"description": "0-based page index in the source PDF.\n\nUses the same 0-based numbering as `PageResult.index` and\n`TableJson.page_index` in the extraction output.",
"type": "integer",
"format": "uint",
"minimum": 0
},
"pdf_fingerprint": {
"description": "Phase 1.7 fingerprint of the source PDF.\n\nFormat: `\"pdftract-v1:\" + hex(SHA-256)`.\nThe verifier compares this string literally (not parsed).",
"type": "string"
},
"svg_clip": {
"description": "Optional SVG clip rendering the glyphs in this receipt.\n\n- Lite mode: Rust `None`; the key is omitted from the JSON entirely\n- SVG mode: Rust `Some(svg)`; a string holding a self-contained SVG element\n\nThe SVG coordinate system is normalized to the bbox itself,\nso it renders correctly in isolation.",
"type": [
"string",
"null"
]
}
},
"required": [
"pdf_fingerprint",
"page_index",
"bbox",
"content_hash",
"extraction_version"
]
},
"ReceiptsMode": {
"description": "Receipt generation mode.\n\nControls whether visual citation receipts are generated during extraction.",
"oneOf": [
{
"description": "No receipts generated (default).",
"type": "string",
"const": "off"
},
{
"description": "Lite mode: minimal receipts (~120-180 bytes each) with fingerprint, page index, bbox, content hash, and extraction version.",
"type": "string",
"const": "lite"
},
{
"description": "SVG mode: extended receipts that include an SVG clip rendering the glyphs.",
"type": "string",
"const": "svg"
}
]
},
"RowJson": {
"description": "JSON representation of a table row.\n\nA row contains a sequence of cells that form a horizontal strip\nin the table.",
"type": "object",
"properties": {
"bbox": {
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
"type": "array",
"items": {
"type": "number",
"format": "double"
},
"maxItems": 4,
"minItems": 4
},
"cells": {
"description": "Cells in this row, ordered left-to-right.",
"type": "array",
"items": {
"$ref": "#/$defs/CellJson"
}
},
"is_header": {
"description": "Whether this row is a header row.\n\nHeader rows are typically repeated when tables span multiple pages.",
"type": "boolean"
}
},
"required": [
"bbox",
"cells",
"is_header"
]
},
"SpanJson": {
"description": "JSON representation of a text span.\n\nA span is the smallest unit of extracted text, representing a\ncontiguous run of text with consistent font and styling.",
"type": "object",
"properties": {
"bbox": {
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
"type": "array",
"items": {
"type": "number",
"format": "double"
},
"maxItems": 4,
"minItems": 4
},
"confidence": {
"description": "Optional confidence score (0.0 to 1.0).\n\nThis field is present when OCR is used or when the extraction\nhas uncertainty about the text. When confidence is not applicable,\nthis field is `null`.",
"type": [
"number",
"null"
],
"format": "double"
},
"font": {
"description": "Font name or identifier.",
"type": "string"
},
"receipt": {
"description": "Optional cryptographic receipt for verification.\n\nThis field is present when `--receipts=lite` or `--receipts=svg`\nis enabled. When receipts are disabled, the field is `null`.",
"anyOf": [
{
"$ref": "#/$defs/Receipt"
},
{
"type": "null"
}
]
},
"size": {
"description": "Font size in points.",
"type": "number",
"format": "double"
},
"text": {
"description": "The extracted text content.",
"type": "string"
}
},
"required": [
"text",
"bbox",
"font",
"size"
]
},
"TableJson": {
"description": "JSON representation of a table.\n\nTables are emitted in parallel with table blocks - the block\nprovides the concatenated text and position, while the TableJson\nprovides full cell-level structure.",
"type": "object",
"properties": {
"bbox": {
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
"type": "array",
"items": {
"type": "number",
"format": "double"
},
"maxItems": 4,
"minItems": 4
},
"continued": {
"description": "Whether this table continues on the next page.\n\nSet to `true` when a table is split across pages and the part\non this page is followed by another part on the next page.",
"type": "boolean"
},
"continued_from_prev": {
"description": "Whether this table is a continuation from the previous page.\n\nSet to `true` when a table is split across pages and this\npage contains a subsequent part.",
"type": "boolean"
},
"detection_method": {
"description": "Detection method used to identify this table.\n\n- \"line_based\": Table detected via ruling lines (borders)\n- \"borderless\": Table detected via x0 alignment heuristics",
"type": "string"
},
"header_rows": {
"description": "Number of contiguous header rows at the top of the table.\n\nHeader rows are typically repeated when tables span multiple pages.",
"type": "integer",
"format": "uint32",
"minimum": 0
},
"id": {
"description": "Unique identifier for this table (e.g., \"table_0\").",
"type": "string"
},
"page_index": {
"description": "Zero-based page index where this table appears.",
"type": "integer",
"format": "uint",
"minimum": 0
},
"rows": {
"description": "Rows in this table, ordered top-to-bottom.",
"type": "array",
"items": {
"$ref": "#/$defs/RowJson"
}
}
},
"required": [
"id",
"bbox",
"rows",
"header_rows",
"detection_method",
"continued",
"continued_from_prev",
"page_index"
]
}
}
}