From 92e90af0b070f08b82d2f8ab8bc2087d61fd868c Mon Sep 17 00:00:00 2001 From: jedarden Date: Sun, 24 May 2026 01:29:14 -0400 Subject: [PATCH] feat(pdftract-zy2jx): generate JSON Schema from Rust output types - Add schemars dependency to pdftract-core (v1.2) - Add JsonSchema derives to output types (ExtractionResult, PageResult, ExtractionMetadata, SpanJson, BlockJson, CellJson, RowJson, TableJson, ExtractionQuality, Receipt, ReceiptsMode) - Create xtask/src/bin/gen_schema.rs for schema generation - Add gen-schema command to xtask main.rs - Generate docs/schema/v1.0/pdftract.schema.json using Draft 2020-12 Schema includes: - $schema: "https://json-schema.org/draft/2020-12/schema" - $defs with all output type definitions - Proper type annotations for all fields Closes: pdftract-zy2jx --- Cargo.lock | 50 +- crates/pdftract-core/Cargo.toml | 4 +- crates/pdftract-core/src/extract.rs | 5 + crates/pdftract-core/src/options.rs | 3 + crates/pdftract-core/src/receipts/mod.rs | 3 + crates/pdftract-core/src/schema/mod.rs | 8 + docs/schema/v1.0/pdftract.schema.json | 720 ++++++++++++++--------- xtask/Cargo.toml | 6 + xtask/src/bin/gen_schema.rs | 74 +++ xtask/src/main.rs | 29 +- 10 files changed, 610 insertions(+), 292 deletions(-) create mode 100644 xtask/src/bin/gen_schema.rs diff --git a/Cargo.lock b/Cargo.lock index a5f762f..28f3e5e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2302,7 +2302,7 @@ dependencies = [ "pdftract-core", "regex", "reqwest", - "schemars", + "schemars 0.8.22", "secrecy", "semver", "serde", @@ -2349,6 +2349,7 @@ dependencies = [ "quick-xml", "rayon", "regex", + "schemars 1.2.1", "secrecy", "serde", "serde_json", @@ -2967,6 +2968,26 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "ref-cast" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d" +dependencies = [ + "ref-cast-impl", +] + +[[package]] +name = "ref-cast-impl" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "regex" version = "1.12.3" @@ -3170,7 +3191,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615" dependencies = [ "dyn-clone", - "schemars_derive", + "schemars_derive 0.8.22", + "serde", + "serde_json", +] + +[[package]] +name = "schemars" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc" +dependencies = [ + "dyn-clone", + "ref-cast", + "schemars_derive 1.2.1", "serde", "serde_json", ] @@ -3187,6 +3221,18 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "schemars_derive" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d115b50f4aaeea07e79c1912f645c7513d81715d0420f8bc77a18c6260b307f" +dependencies = [ + "proc-macro2", + "quote", + "serde_derive_internals", + "syn 2.0.117", +] + [[package]] name = "scopeguard" version = "1.2.0" diff --git a/crates/pdftract-core/Cargo.toml b/crates/pdftract-core/Cargo.toml index 8fc188d..fb63901 100644 --- a/crates/pdftract-core/Cargo.toml +++ b/crates/pdftract-core/Cargo.toml @@ -21,6 +21,7 @@ regex = "1.10" secrecy = { workspace = true } serde = { version = "1.0", features = ["derive"], optional = true } serde_json = { version = "1.0", optional = true } +schemars = { version = "1.2", features = ["derive"], optional = true } sha2 = "0.10" thiserror = { workspace = true } memchr = { workspace = true } @@ -38,7 +39,8 @@ quick-xml = { version = "0.36", optional = true } [features] default = ["serde"] -serde = ["dep:serde", "dep:serde_json"] +serde = ["dep:serde", "dep:serde_json", "dep:schemars"] +schemars = ["dep:schemars", "serde"] receipts = [] # Enable visual citation receipts (SVG clip generation) ocr = ["dep:image", "dep:leptonica-plumbing", "dep:quick-xml"] # Enable OCR path (image compositing + preprocessing + HOCR parsing) full-render = ["dep:pdfium-render", "ocr"] # Enable PDFium-based rendering (requires ocr) diff --git a/crates/pdftract-core/src/extract.rs b/crates/pdftract-core/src/extract.rs index 3cb0719..a5f264a 100644 --- a/crates/pdftract-core/src/extract.rs +++ b/crates/pdftract-core/src/extract.rs @@ -28,6 +28,8 @@ use anyhow::{Context, Result}; use rayon::prelude::*; use serde::{Deserialize, Serialize}; use serde_json::json; +#[cfg(feature = "schemars")] +use schemars::JsonSchema; use std::sync::Arc; use crate::parser::stream::FileSource; @@ -102,6 +104,7 @@ fn decode_page_content_streams( /// /// Contains the extracted pages, spans, blocks, and metadata. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] pub struct ExtractionResult { /// The PDF fingerprint (for receipt generation). pub fingerprint: String, @@ -113,6 +116,7 @@ pub struct ExtractionResult { /// Result for a single page. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] pub struct PageResult { /// 0-based page index. pub index: usize, @@ -177,6 +181,7 @@ impl From for PageResult { /// Metadata about the extraction process. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] pub struct ExtractionMetadata { /// Total number of pages in the document. pub page_count: usize, diff --git a/crates/pdftract-core/src/options.rs b/crates/pdftract-core/src/options.rs index adf6855..893d906 100644 --- a/crates/pdftract-core/src/options.rs +++ b/crates/pdftract-core/src/options.rs @@ -4,11 +4,14 @@ //! including the receipts mode for cryptographic provenance tracking. use serde::{Deserialize, Serialize}; +#[cfg(feature = "schemars")] +use schemars::JsonSchema; /// Receipt generation mode. /// /// Controls whether visual citation receipts are generated during extraction. #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] #[serde(rename_all = "lowercase")] pub enum ReceiptsMode { /// No receipts generated (default). diff --git a/crates/pdftract-core/src/receipts/mod.rs b/crates/pdftract-core/src/receipts/mod.rs index 475051a..b1c2162 100644 --- a/crates/pdftract-core/src/receipts/mod.rs +++ b/crates/pdftract-core/src/receipts/mod.rs @@ -26,6 +26,8 @@ pub mod svg; pub mod verifier; use serde::{Deserialize, Serialize}; +#[cfg(feature = "schemars")] +use schemars::JsonSchema; /// A visual citation receipt for extracted text. /// @@ -59,6 +61,7 @@ use serde::{Deserialize, Serialize}; /// } /// ``` #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] pub struct Receipt { /// Phase 1.7 fingerprint of the source PDF. /// diff --git a/crates/pdftract-core/src/schema/mod.rs b/crates/pdftract-core/src/schema/mod.rs index 88e37bc..cbd0997 100644 --- a/crates/pdftract-core/src/schema/mod.rs +++ b/crates/pdftract-core/src/schema/mod.rs @@ -18,6 +18,8 @@ use serde::{Deserialize, Serialize}; use serde_json::json; +#[cfg(feature = "schemars")] +use schemars::JsonSchema; use crate::receipts::Receipt; @@ -26,6 +28,7 @@ use crate::receipts::Receipt; /// A span is the smallest unit of extracted text, representing a /// contiguous run of text with consistent font and styling. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] pub struct SpanJson { /// The extracted text content. pub text: String, @@ -64,6 +67,7 @@ pub struct SpanJson { /// spans. Examples include paragraphs, headings, list items, and /// table cells. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] pub struct BlockJson { /// The block kind/type. /// @@ -112,6 +116,7 @@ pub type SpanRef = usize; /// A cell represents a single unit within a table row, containing /// its text content, bounding box, and position information. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] pub struct CellJson { /// Bounding box in PDF user-space points. /// @@ -163,6 +168,7 @@ fn default_one() -> u32 { /// A row contains a sequence of cells that form a horizontal strip /// in the table. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] pub struct RowJson { /// Bounding box in PDF user-space points. /// @@ -185,6 +191,7 @@ pub struct RowJson { /// provides the concatenated text and position, while the TableJson /// provides full cell-level structure. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] pub struct TableJson { /// Unique identifier for this table (e.g., "table_0"). pub id: String, @@ -231,6 +238,7 @@ pub struct TableJson { /// in the root metadata (full JSON mode). It provides aggregate /// quality signals across all pages. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] pub struct ExtractionQuality { /// Overall quality assessment: "high", "medium", "low", or "none". /// diff --git a/docs/schema/v1.0/pdftract.schema.json b/docs/schema/v1.0/pdftract.schema.json index 36d4bd3..a1911aa 100644 --- a/docs/schema/v1.0/pdftract.schema.json +++ b/docs/schema/v1.0/pdftract.schema.json @@ -1,345 +1,489 @@ { - "$schema": "http://json-schema.org/draft-07/schema#", - "$id": "https://pdftract.ardenone.com/schemas/v1.0/pdftract.schema.json", - "title": "PDFtract Extraction Output Schema v1.0", - "description": "JSON output schema for PDF text and structure extraction", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "ExtractionResult", + "description": "Result of a PDF extraction operation.\n\nContains the extracted pages, spans, blocks, and metadata.", "type": "object", - "required": ["fingerprint", "schema_version", "pages", "metadata"], "properties": { "fingerprint": { - "type": "string", - "description": "PDF fingerprint for verification (format: pdftract-v1:)" - }, - "schema_version": { - "type": "string", - "description": "Schema version (e.g., '1.0')", - "enum": ["1.0"] - }, - "pages": { - "type": "array", - "description": "Extracted pages", - "items": { - "$ref": "#/definitions/page" - } + "description": "The PDF fingerprint (for receipt generation).", + "type": "string" }, "metadata": { - "$ref": "#/definitions/metadata" + "description": "Metadata about the extraction.", + "$ref": "#/$defs/ExtractionMetadata" + }, + "pages": { + "description": "Extracted pages, each containing spans and blocks.", + "type": "array", + "items": { + "$ref": "#/$defs/PageResult" + } } }, - "definitions": { - "page": { + "required": [ + "fingerprint", + "pages", + "metadata" + ], + "$defs": { + "BlockJson": { + "description": "JSON representation of a structural block.\n\nA block is a higher-level semantic unit composed of one or more\nspans. Examples include paragraphs, headings, list items, and\ntable cells.", "type": "object", - "required": ["index", "spans", "blocks", "tables"], "properties": { - "index": { - "type": "integer", - "description": "0-based page index" - }, - "spans": { - "type": "array", - "description": "Extracted text spans", - "items": { - "$ref": "#/definitions/span" - } - }, - "blocks": { - "type": "array", - "description": "Extracted structural blocks", - "items": { - "$ref": "#/definitions/block" - } - }, - "tables": { - "type": "array", - "description": "Extracted tables (cell-level structure)", - "items": { - "$ref": "#/definitions/table" - } - }, - "error": { - "type": "string", - "description": "Error message if extraction failed for this page" - } - } - }, - "span": { - "type": "object", - "required": ["text", "bbox", "font", "size"], - "properties": { - "text": { - "type": "string", - "description": "The extracted text content" - }, "bbox": { + "description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.", "type": "array", - "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]", "items": { - "type": "number" + "type": "number", + "format": "double" }, - "minItems": 4, - "maxItems": 4 + "maxItems": 4, + "minItems": 4 }, - "font": { - "type": "string", - "description": "Font name or identifier" - }, - "size": { - "type": "number", - "description": "Font size in points" - }, - "confidence": { - "type": "number", - "description": "Confidence score (0.0 to 1.0) for OCR text", - "minimum": 0.0, - "maximum": 1.0 - }, - "receipt": { - "$ref": "#/definitions/receipt" - } - } - }, - "block": { - "type": "object", - "required": ["kind", "text", "bbox"], - "properties": { "kind": { - "type": "string", - "description": "Block kind/type", - "enum": ["paragraph", "heading", "list", "table", "figure"] - }, - "text": { - "type": "string", - "description": "The concatenated text content of all spans in the block" - }, - "bbox": { - "type": "array", - "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]", - "items": { - "type": "number" - }, - "minItems": 4, - "maxItems": 4 + "description": "The block kind/type.\n\nCommon values: \"paragraph\", \"heading\", \"list\", \"table\", \"figure\".", + "type": "string" }, "level": { - "type": "integer", - "description": "Heading level (1-6) for 'heading' kind blocks", - "minimum": 1, - "maximum": 6 - }, - "table_index": { - "type": "integer", - "description": "Table index for 'table' kind blocks (points to tables array)", + "description": "Optional heading level (1-6) for \"heading\" kind blocks.\n\nThis field is present only for heading blocks. For paragraphs\nand other block types, it is `null`.", + "type": [ + "integer", + "null" + ], + "format": "uint8", + "maximum": 255, "minimum": 0 }, "receipt": { - "$ref": "#/definitions/receipt" - } - } - }, - "table": { - "type": "object", - "required": ["id", "bbox", "rows", "header_rows", "detection_method", "continued", "continued_from_prev", "page_index"], - "properties": { - "id": { - "type": "string", - "description": "Unique identifier for this table (e.g., 'table_0')" + "description": "Optional cryptographic receipt for verification.\n\nThis field is present when `--receipts=lite` or `--receipts=svg`\nis enabled. When receipts are disabled, the field is `null`.", + "anyOf": [ + { + "$ref": "#/$defs/Receipt" + }, + { + "type": "null" + } + ] }, - "bbox": { - "type": "array", - "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]", - "items": { - "type": "number" - }, - "minItems": 4, - "maxItems": 4 - }, - "rows": { - "type": "array", - "description": "Rows in this table, ordered top-to-bottom", - "items": { - "$ref": "#/definitions/row" - } - }, - "header_rows": { - "type": "integer", - "description": "Number of contiguous header rows at the top of the table", + "table_index": { + "description": "Optional table index for \"table\" kind blocks.\n\nThis field is present only for table blocks and points to the\ncorresponding entry in the page's `tables` array.", + "type": [ + "integer", + "null" + ], + "format": "uint", "minimum": 0 }, - "detection_method": { - "type": "string", - "description": "Detection method used to identify this table", - "enum": ["line_based", "borderless"] - }, - "continued": { - "type": "boolean", - "description": "Whether this table continues on the next page" - }, - "continued_from_prev": { - "type": "boolean", - "description": "Whether this table is a continuation from the previous page" - }, - "page_index": { - "type": "integer", - "description": "Zero-based page index where this table appears", - "minimum": 0 - } - } - }, - "row": { - "type": "object", - "required": ["bbox", "cells", "is_header"], - "properties": { - "bbox": { - "type": "array", - "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]", - "items": { - "type": "number" - }, - "minItems": 4, - "maxItems": 4 - }, - "cells": { - "type": "array", - "description": "Cells in this row, ordered left-to-right", - "items": { - "$ref": "#/definitions/cell" - } - }, - "is_header": { - "type": "boolean", - "description": "Whether this row is a header row" - } - } - }, - "cell": { - "type": "object", - "required": ["bbox", "text", "spans", "row", "col", "rowspan", "colspan", "is_header_row"], - "properties": { - "bbox": { - "type": "array", - "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]", - "items": { - "type": "number" - }, - "minItems": 4, - "maxItems": 4 - }, "text": { - "type": "string", - "description": "The concatenated text content of all spans in the cell" - }, - "spans": { + "description": "The concatenated text content of all spans in the block.", + "type": "string" + } + }, + "required": [ + "kind", + "text", + "bbox" + ] + }, + "CellJson": { + "description": "JSON representation of a table cell.\n\nA cell represents a single unit within a table row, containing\nits text content, bounding box, and position information.", + "type": "object", + "properties": { + "bbox": { + "description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.", "type": "array", - "description": "References to spans in the page's spans array", "items": { - "type": "integer" - } - }, - "row": { - "type": "integer", - "description": "Zero-based row index within the table", - "minimum": 0 + "type": "number", + "format": "double" + }, + "maxItems": 4, + "minItems": 4 }, "col": { + "description": "Zero-based column index within the table.", "type": "integer", - "description": "Zero-based column index within the table", + "format": "uint", + "minimum": 0 + }, + "colspan": { + "description": "Number of columns this cell spans (default 1).\n\nValues greater than 1 indicate a merged cell that spans\nmultiple columns horizontally.", + "type": "integer", + "format": "uint32", + "default": 1, + "minimum": 0 + }, + "is_header_row": { + "description": "Whether this cell is in a header row.\n\nHeader cells are typically rendered differently (bold, centered)\nand may be reused when tables span multiple pages.", + "type": "boolean" + }, + "row": { + "description": "Zero-based row index within the table.", + "type": "integer", + "format": "uint", "minimum": 0 }, "rowspan": { + "description": "Number of rows this cell spans (default 1).\n\nValues greater than 1 indicate a merged cell that spans\nmultiple rows vertically.", "type": "integer", - "description": "Number of rows this cell spans (default 1)", - "minimum": 1 + "format": "uint32", + "default": 1, + "minimum": 0 }, - "colspan": { - "type": "integer", - "description": "Number of columns this cell spans (default 1)", - "minimum": 1 - }, - "is_header_row": { - "type": "boolean", - "description": "Whether this cell is in a header row" - } - } - }, - "receipt": { - "type": "object", - "required": ["pdf_fingerprint", "page_index", "bbox", "content_hash", "extraction_version"], - "properties": { - "pdf_fingerprint": { - "type": "string", - "description": "The PDF fingerprint" - }, - "page_index": { - "type": "integer", - "description": "The page index" - }, - "bbox": { + "spans": { + "description": "References to spans in the page's `spans` array.\n\nThese indices point to the spans that make up this cell's content.", "type": "array", - "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]", "items": { - "type": "number" - }, - "minItems": 4, - "maxItems": 4 + "type": "integer", + "format": "uint", + "minimum": 0 + } }, - "content_hash": { - "type": "string", - "description": "SHA-256 hash of the content" - }, - "extraction_version": { - "type": "string", - "description": "Version string of the extractor" - }, - "svg_clip": { - "type": "string", - "description": "SVG clip path for verification (present only in SvgClip mode)" + "text": { + "description": "The concatenated text content of all spans in the cell.", + "type": "string" } - } + }, + "required": [ + "bbox", + "text", + "spans", + "row", + "col", + "is_header_row" + ] }, - "metadata": { + "ExtractionMetadata": { + "description": "Metadata about the extraction process.", "type": "object", - "required": ["page_count", "span_count", "block_count"], "properties": { - "page_count": { - "type": "integer", - "description": "Total number of pages in the document" - }, - "span_count": { - "type": "integer", - "description": "Number of spans extracted" - }, "block_count": { + "description": "Number of blocks extracted.", "type": "integer", - "description": "Number of blocks extracted" - }, - "cache_status": { - "type": "string", - "description": "Cache status: 'hit', 'miss', or 'skipped'", - "enum": ["hit", "miss", "skipped"] + "format": "uint", + "minimum": 0 }, "cache_age_seconds": { - "type": "integer", - "description": "Cache entry age in seconds (only present when cache_status == 'hit')", + "description": "Cache entry age in seconds (only present when cache_status == \"hit\")", + "type": [ + "integer", + "null" + ], + "format": "uint64", "minimum": 0 }, - "error_count": { - "type": "integer", - "description": "Number of pages that failed to extract", - "minimum": 0 - }, - "reading_order_algorithm": { - "type": "string", - "description": "Reading order algorithm used for this extraction", - "enum": ["struct_tree", "xy_cut"] + "cache_status": { + "description": "Cache status: \"hit\", \"miss\", or \"skipped\"", + "type": [ + "string", + "null" + ] }, "diagnostics": { + "description": "Diagnostics emitted during extraction (coverage warnings, etc.)", "type": "array", - "description": "Diagnostics emitted during extraction", "items": { "type": "string" } + }, + "error_count": { + "description": "Number of pages that failed to extract.", + "type": "integer", + "format": "uint", + "minimum": 0 + }, + "page_count": { + "description": "Total number of pages in the document.", + "type": "integer", + "format": "uint", + "minimum": 0 + }, + "reading_order_algorithm": { + "description": "Reading order algorithm used for this extraction.", + "type": [ + "string", + "null" + ] + }, + "receipts_mode": { + "description": "Receipts mode used for this extraction.", + "$ref": "#/$defs/ReceiptsMode" + }, + "span_count": { + "description": "Number of spans extracted.", + "type": "integer", + "format": "uint", + "minimum": 0 } - } + }, + "required": [ + "page_count", + "receipts_mode", + "span_count", + "block_count", + "error_count", + "diagnostics" + ] + }, + "PageResult": { + "description": "Result for a single page.", + "type": "object", + "properties": { + "blocks": { + "description": "Extracted blocks (semantic units like paragraphs, headings).", + "type": "array", + "items": { + "$ref": "#/$defs/BlockJson" + } + }, + "error": { + "description": "Error message if extraction failed for this page.", + "type": [ + "string", + "null" + ] + }, + "index": { + "description": "0-based page index.", + "type": "integer", + "format": "uint", + "minimum": 0 + }, + "spans": { + "description": "Extracted spans (text fragments with consistent styling).", + "type": "array", + "items": { + "$ref": "#/$defs/SpanJson" + } + }, + "tables": { + "description": "Extracted tables (cell-level structure).\n\nThis array provides detailed table structure with rows and cells.\nTable blocks in the `blocks` array reference entries here via `table_index`.", + "type": "array", + "items": { + "$ref": "#/$defs/TableJson" + } + } + }, + "required": [ + "index", + "spans", + "blocks", + "tables" + ] + }, + "Receipt": { + "description": "A visual citation receipt for extracted text.\n\nReceipts provide cryptographic proof that a piece of extracted text\noriginated from a specific region in a specific PDF. They can be\nverified independently by re-running pdftract on the original file.\n\n# Lite mode\n\nIn lite mode, `svg_clip` is `None` and the JSON output does not\ninclude the key at all (via `skip_serializing_if`). This keeps\nreceipts small (~120-180 bytes) for high-volume use cases like\nRAG citation pipelines.\n\n# SVG mode\n\nIn SVG mode, `svg_clip` contains a self-contained SVG element\nthat renders only the glyphs whose bboxes fall within the receipt\nbbox. The SVG is normalized to the bbox coordinate system and\ncan be rendered standalone in any browser.\n\n# Example\n\n```json\n{\n \"pdf_fingerprint\": \"pdftract-v1:a7f3...\",\n \"page_index\": 14,\n \"bbox\": [220.0, 412.0, 412.0, 432.0],\n \"content_hash\": \"sha256:9b21...\",\n \"extraction_version\": \"1.0.0\"\n}\n```", + "type": "object", + "properties": { + "bbox": { + "description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where:\n- x0, y0: bottom-left corner\n- x1, y1: top-right corner\n- Units: PDF points (1/72 inch)\n\nThis is a copy of the parent span's bbox, included so the\nreceipt is self-contained.", + "type": "array", + "items": { + "type": "number", + "format": "double" + }, + "maxItems": 4, + "minItems": 4 + }, + "content_hash": { + "description": "SHA-256 hash of the NFC-normalized text content.\n\nFormat: `\"sha256:\" + hex(SHA-256)`.\n\nThe text is normalized to NFC form before hashing to ensure\nstability across platforms that may use different Unicode\nnormalization forms (e.g., macOS HFS+/APFS sometimes round-trips\nthrough NFD).", + "type": "string" + }, + "extraction_version": { + "description": "The pdftract version that produced this receipt.\n\nFormat: semver string (e.g., \"1.0.0\", \"1.0.0-rc.1\").\nTaken from `CARGO_PKG_VERSION` at compile time.", + "type": "string" + }, + "page_index": { + "description": "0-based page index in the source PDF.\n\nMatches the page_index in the extraction schema.", + "type": "integer", + "format": "uint", + "minimum": 0 + }, + "pdf_fingerprint": { + "description": "Phase 1.7 fingerprint of the source PDF.\n\nFormat: `\"pdftract-v1:\" + hex(SHA-256)`.\nThe verifier compares this string literally (not parsed).", + "type": "string" + }, + "svg_clip": { + "description": "Optional SVG clip rendering the glyphs in this receipt.\n\n- `None` in lite mode (the key is omitted from JSON entirely)\n- `Some(svg)` in SVG mode, where `svg` is a self-contained SVG element\n\nThe SVG coordinate system is normalized to the bbox itself,\nso it renders correctly in isolation.", + "type": [ + "string", + "null" + ] + } + }, + "required": [ + "pdf_fingerprint", + "page_index", + "bbox", + "content_hash", + "extraction_version" + ] + }, + "ReceiptsMode": { + "description": "Receipt generation mode.\n\nControls whether visual citation receipts are generated during extraction.", + "oneOf": [ + { + "description": "No receipts generated (default).", + "type": "string", + "const": "off" + }, + { + "description": "Lite mode: minimal receipts (~120 bytes each) with fingerprint, page index, bbox, and content hash.", + "type": "string", + "const": "lite" + }, + { + "description": "SVG mode: extended receipts that include an SVG clip rendering the glyphs.", + "type": "string", + "const": "svg" + } + ] + }, + "RowJson": { + "description": "JSON representation of a table row.\n\nA row contains a sequence of cells that form a horizontal strip\nin the table.", + "type": "object", + "properties": { + "bbox": { + "description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.", + "type": "array", + "items": { + "type": "number", + "format": "double" + }, + "maxItems": 4, + "minItems": 4 + }, + "cells": { + "description": "Cells in this row, ordered left-to-right.", + "type": "array", + "items": { + "$ref": "#/$defs/CellJson" + } + }, + "is_header": { + "description": "Whether this row is a header row.\n\nHeader rows are typically repeated when tables span multiple pages.", + "type": "boolean" + } + }, + "required": [ + "bbox", + "cells", + "is_header" + ] + }, + "SpanJson": { + "description": "JSON representation of a text span.\n\nA span is the smallest unit of extracted text, representing a\ncontiguous run of text with consistent font and styling.", + "type": "object", + "properties": { + "bbox": { + "description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.", + "type": "array", + "items": { + "type": "number", + "format": "double" + }, + "maxItems": 4, + "minItems": 4 + }, + "confidence": { + "description": "Optional confidence score (0.0 to 1.0).\n\nThis field is present when OCR is used or when the extraction\nhas uncertainty about the text. When confidence is not applicable,\nthis field is `null`.", + "type": [ + "number", + "null" + ], + "format": "double" + }, + "font": { + "description": "Font name or identifier.", + "type": "string" + }, + "receipt": { + "description": "Optional cryptographic receipt for verification.\n\nThis field is present when `--receipts=lite` or `--receipts=svg`\nis enabled. When receipts are disabled, the field is `null`.", + "anyOf": [ + { + "$ref": "#/$defs/Receipt" + }, + { + "type": "null" + } + ] + }, + "size": { + "description": "Font size in points.", + "type": "number", + "format": "double" + }, + "text": { + "description": "The extracted text content.", + "type": "string" + } + }, + "required": [ + "text", + "bbox", + "font", + "size" + ] + }, + "TableJson": { + "description": "JSON representation of a table.\n\nTables are emitted in parallel with table blocks - the block\nprovides the concatenated text and position, while the TableJson\nprovides full cell-level structure.", + "type": "object", + "properties": { + "bbox": { + "description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.", + "type": "array", + "items": { + "type": "number", + "format": "double" + }, + "maxItems": 4, + "minItems": 4 + }, + "continued": { + "description": "Whether this table continues on the next page.\n\nSet to `true` when a table is split across pages and this\npage contains the first part.", + "type": "boolean" + }, + "continued_from_prev": { + "description": "Whether this table is a continuation from the previous page.\n\nSet to `true` when a table is split across pages and this\npage contains a subsequent part.", + "type": "boolean" + }, + "detection_method": { + "description": "Detection method used to identify this table.\n\n- \"line_based\": Table detected via ruling lines (borders)\n- \"borderless\": Table detected via x0 alignment heuristics", + "type": "string" + }, + "header_rows": { + "description": "Number of contiguous header rows at the top of the table.\n\nHeader rows are typically repeated when tables span multiple pages.", + "type": "integer", + "format": "uint32", + "minimum": 0 + }, + "id": { + "description": "Unique identifier for this table (e.g., \"table_0\").", + "type": "string" + }, + "page_index": { + "description": "Zero-based page index where this table appears.", + "type": "integer", + "format": "uint", + "minimum": 0 + }, + "rows": { + "description": "Rows in this table, ordered top-to-bottom.", + "type": "array", + "items": { + "$ref": "#/$defs/RowJson" + } + } + }, + "required": [ + "id", + "bbox", + "rows", + "header_rows", + "detection_method", + "continued", + "continued_from_prev", + "page_index" + ] } } -} +} \ No newline at end of file diff --git a/xtask/Cargo.toml b/xtask/Cargo.toml index 1f3f0cd..a5ac24d 100644 --- a/xtask/Cargo.toml +++ b/xtask/Cargo.toml @@ -11,6 +11,10 @@ publish = false name = "xtask" path = "src/main.rs" +[[bin]] +name = "gen_schema" +path = "src/bin/gen_schema.rs" + [dependencies] serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" @@ -18,3 +22,5 @@ serde_yaml = "0.9" glob = "0.3" humantime = "2.1" lopdf = "0.34" +schemars = "1.2" +pdftract-core = { path = "../crates/pdftract-core", features = ["schemars"] } diff --git a/xtask/src/bin/gen_schema.rs b/xtask/src/bin/gen_schema.rs new file mode 100644 index 0000000..9c9ff11 --- /dev/null +++ b/xtask/src/bin/gen_schema.rs @@ -0,0 +1,74 @@ +//! Generate JSON Schema from Rust output types. +//! +//! This binary generates the canonical JSON Schema for pdftract's +//! extraction output, which is checked into the repository at +//! docs/schema/v1.0/pdftract.schema.json. +//! +//! Usage: cargo run --bin gen_schema + +use std::fs; +use std::path::PathBuf; + +fn main() -> Result<(), Box> { + // Find the workspace root + let workspace_root = find_workspace_root(); + + // Generate the schema + let schema_json = generate_schema(); + + // Write to docs/schema/v1.0/pdftract.schema.json + let schema_path = workspace_root.join("docs/schema/v1.0/pdftract.schema.json"); + + // Create the directory if it doesn't exist + if let Some(parent) = schema_path.parent() { + fs::create_dir_all(parent)?; + } + + fs::write(&schema_path, schema_json)?; + + println!("Generated schema at: {}", schema_path.display()); + + Ok(()) +} + +/// Find the workspace root by searching for Cargo.toml +fn find_workspace_root() -> PathBuf { + let mut current = std::env::current_dir().unwrap(); + + // If we're in the xtask directory, go to parent + if current.ends_with("xtask") { + current = current.parent().unwrap().to_path_buf(); + } + + // Search upward for Cargo.toml with workspace members + loop { + let cargo_toml = current.join("Cargo.toml"); + if cargo_toml.exists() { + let content = fs::read_to_string(&cargo_toml).unwrap_or_default(); + if content.contains("[workspace]") { + return current; + } + } + + match current.parent() { + Some(parent) => current = parent.to_path_buf(), + None => break, + } + } + + // Fallback: use current directory if not found + std::env::current_dir().unwrap() +} + +/// Generate the JSON Schema for pdftract extraction output. +fn generate_schema() -> String { + use pdftract_core::extract::ExtractionResult; + use schemars::schema_for; + + let schema = schema_for!(ExtractionResult); + + // Convert to JSON string + // The schema_for! macro already includes the $schema field + serde_json::to_string_pretty(&schema) + .expect("Failed to serialize schema") +} diff --git a/xtask/src/main.rs b/xtask/src/main.rs index 68b55d3..b13d682 100644 --- a/xtask/src/main.rs +++ b/xtask/src/main.rs @@ -104,17 +104,19 @@ fn main() -> Result<(), Box> { eprintln!(" doc-profiles Generate README skeletons for all profiles"); eprintln!(" generate-stress-pdfs Generate stress-test PDFs for memory ceiling testing"); eprintln!(" generate-page-class-fixtures Generate page classification test fixtures"); + eprintln!(" gen-schema Generate JSON Schema from Rust output types"); eprintln!(" memory-ceiling Run memory ceiling tests against perf/malformed corpora"); std::process::exit(1); } - match args[1].as_str() { + let result = match args[1].as_str() { "doc-profile" => { if args.len() < 3 { eprintln!("Usage: xtask doc-profile "); std::process::exit(1); } generate_profile_readme(&args[2])?; + Ok(()) } "doc-profiles" => { let profiles_dir = find_workspace_root().join("profiles/builtin"); @@ -127,20 +129,45 @@ fn main() -> Result<(), Box> { } } } + Ok(()) } "generate-stress-pdfs" => { generate_stress_pdfs()?; + Ok(()) } "generate-page-class-fixtures" => { generate_page_class_fixtures()?; + Ok(()) + } + "gen-schema" => { + gen_schema()?; + Ok(()) } "memory-ceiling" => { run_memory_ceiling_tests()?; + Ok(()) } _ => { eprintln!("Unknown command: {}", args[1]); std::process::exit(1); } + }; + + result +} + +/// Generate JSON Schema from Rust output types. +/// +/// Delegates to the gen_schema binary. +fn gen_schema() -> Result<(), Box> { + // Invoke the gen_schema binary + let status = std::process::Command::new("cargo") + .args(["run", "--bin", "gen_schema"]) + .current_dir(find_workspace_root()) + .status()?; + + if !status.success() { + return Err(format!("gen_schema failed with exit code: {:?}", status.code()).into()); } Ok(())