feat(pdftract-zy2jx): generate JSON Schema from Rust output types
- Add schemars dependency to pdftract-core (v1.2)
- Add JsonSchema derives to output types (ExtractionResult, PageResult, ExtractionMetadata, SpanJson, BlockJson, CellJson, RowJson, TableJson, ExtractionQuality, Receipt, ReceiptsMode)
- Create xtask/src/bin/gen_schema.rs for schema generation
- Add gen-schema command to xtask main.rs
- Generate docs/schema/v1.0/pdftract.schema.json using Draft 2020-12

Schema includes:
- $schema: "https://json-schema.org/draft/2020-12/schema"
- $defs with all output type definitions
- Proper type annotations for all fields

Closes: pdftract-zy2jx
This commit is contained in:
parent
d723427da7
commit
92e90af0b0
10 changed files with 610 additions and 292 deletions
50
Cargo.lock
generated
50
Cargo.lock
generated
|
|
@ -2302,7 +2302,7 @@ dependencies = [
|
|||
"pdftract-core",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"schemars",
|
||||
"schemars 0.8.22",
|
||||
"secrecy",
|
||||
"semver",
|
||||
"serde",
|
||||
|
|
@ -2349,6 +2349,7 @@ dependencies = [
|
|||
"quick-xml",
|
||||
"rayon",
|
||||
"regex",
|
||||
"schemars 1.2.1",
|
||||
"secrecy",
|
||||
"serde",
|
||||
"serde_json",
|
||||
|
|
@ -2967,6 +2968,26 @@ dependencies = [
|
|||
"thiserror 1.0.69",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ref-cast"
|
||||
version = "1.0.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d"
|
||||
dependencies = [
|
||||
"ref-cast-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ref-cast-impl"
|
||||
version = "1.0.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.12.3"
|
||||
|
|
@ -3170,7 +3191,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615"
|
||||
dependencies = [
|
||||
"dyn-clone",
|
||||
"schemars_derive",
|
||||
"schemars_derive 0.8.22",
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "schemars"
|
||||
version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc"
|
||||
dependencies = [
|
||||
"dyn-clone",
|
||||
"ref-cast",
|
||||
"schemars_derive 1.2.1",
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
|
@ -3187,6 +3221,18 @@ dependencies = [
|
|||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "schemars_derive"
|
||||
version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7d115b50f4aaeea07e79c1912f645c7513d81715d0420f8bc77a18c6260b307f"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"serde_derive_internals",
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "scopeguard"
|
||||
version = "1.2.0"
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ regex = "1.10"
|
|||
secrecy = { workspace = true }
|
||||
serde = { version = "1.0", features = ["derive"], optional = true }
|
||||
serde_json = { version = "1.0", optional = true }
|
||||
schemars = { version = "1.2", features = ["derive"], optional = true }
|
||||
sha2 = "0.10"
|
||||
thiserror = { workspace = true }
|
||||
memchr = { workspace = true }
|
||||
|
|
@ -38,7 +39,8 @@ quick-xml = { version = "0.36", optional = true }
|
|||
|
||||
[features]
|
||||
default = ["serde"]
|
||||
serde = ["dep:serde", "dep:serde_json"]
|
||||
serde = ["dep:serde", "dep:serde_json", "dep:schemars"]
|
||||
schemars = ["dep:schemars", "serde"]
|
||||
receipts = [] # Enable visual citation receipts (SVG clip generation)
|
||||
ocr = ["dep:image", "dep:leptonica-plumbing", "dep:quick-xml"] # Enable OCR path (image compositing + preprocessing + HOCR parsing)
|
||||
full-render = ["dep:pdfium-render", "ocr"] # Enable PDFium-based rendering (requires ocr)
|
||||
|
|
|
|||
|
|
@ -28,6 +28,8 @@ use anyhow::{Context, Result};
|
|||
use rayon::prelude::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::json;
|
||||
#[cfg(feature = "schemars")]
|
||||
use schemars::JsonSchema;
|
||||
use std::sync::Arc;
|
||||
use crate::parser::stream::FileSource;
|
||||
|
||||
|
|
@ -102,6 +104,7 @@ fn decode_page_content_streams(
|
|||
///
|
||||
/// Contains the extracted pages, spans, blocks, and metadata.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
|
||||
pub struct ExtractionResult {
|
||||
/// The PDF fingerprint (for receipt generation).
|
||||
pub fingerprint: String,
|
||||
|
|
@ -113,6 +116,7 @@ pub struct ExtractionResult {
|
|||
|
||||
/// Result for a single page.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
|
||||
pub struct PageResult {
|
||||
/// 0-based page index.
|
||||
pub index: usize,
|
||||
|
|
@ -177,6 +181,7 @@ impl From<PageResultInternal> for PageResult {
|
|||
|
||||
/// Metadata about the extraction process.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
|
||||
pub struct ExtractionMetadata {
|
||||
/// Total number of pages in the document.
|
||||
pub page_count: usize,
|
||||
|
|
|
|||
|
|
@ -4,11 +4,14 @@
|
|||
//! including the receipts mode for cryptographic provenance tracking.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
#[cfg(feature = "schemars")]
|
||||
use schemars::JsonSchema;
|
||||
|
||||
/// Receipt generation mode.
|
||||
///
|
||||
/// Controls whether visual citation receipts are generated during extraction.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum ReceiptsMode {
|
||||
/// No receipts generated (default).
|
||||
|
|
|
|||
|
|
@ -26,6 +26,8 @@ pub mod svg;
|
|||
pub mod verifier;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
#[cfg(feature = "schemars")]
|
||||
use schemars::JsonSchema;
|
||||
|
||||
/// A visual citation receipt for extracted text.
|
||||
///
|
||||
|
|
@ -59,6 +61,7 @@ use serde::{Deserialize, Serialize};
|
|||
/// }
|
||||
/// ```
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
|
||||
pub struct Receipt {
|
||||
/// Phase 1.7 fingerprint of the source PDF.
|
||||
///
|
||||
|
|
|
|||
|
|
@ -18,6 +18,8 @@
|
|||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::json;
|
||||
#[cfg(feature = "schemars")]
|
||||
use schemars::JsonSchema;
|
||||
|
||||
use crate::receipts::Receipt;
|
||||
|
||||
|
|
@ -26,6 +28,7 @@ use crate::receipts::Receipt;
|
|||
/// A span is the smallest unit of extracted text, representing a
|
||||
/// contiguous run of text with consistent font and styling.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
|
||||
pub struct SpanJson {
|
||||
/// The extracted text content.
|
||||
pub text: String,
|
||||
|
|
@ -64,6 +67,7 @@ pub struct SpanJson {
|
|||
/// spans. Examples include paragraphs, headings, list items, and
|
||||
/// table cells.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
|
||||
pub struct BlockJson {
|
||||
/// The block kind/type.
|
||||
///
|
||||
|
|
@ -112,6 +116,7 @@ pub type SpanRef = usize;
|
|||
/// A cell represents a single unit within a table row, containing
|
||||
/// its text content, bounding box, and position information.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
|
||||
pub struct CellJson {
|
||||
/// Bounding box in PDF user-space points.
|
||||
///
|
||||
|
|
@ -163,6 +168,7 @@ fn default_one() -> u32 {
|
|||
/// A row contains a sequence of cells that form a horizontal strip
|
||||
/// in the table.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
|
||||
pub struct RowJson {
|
||||
/// Bounding box in PDF user-space points.
|
||||
///
|
||||
|
|
@ -185,6 +191,7 @@ pub struct RowJson {
|
|||
/// provides the concatenated text and position, while the TableJson
|
||||
/// provides full cell-level structure.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
|
||||
pub struct TableJson {
|
||||
/// Unique identifier for this table (e.g., "table_0").
|
||||
pub id: String,
|
||||
|
|
@ -231,6 +238,7 @@ pub struct TableJson {
|
|||
/// in the root metadata (full JSON mode). It provides aggregate
|
||||
/// quality signals across all pages.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))]
|
||||
pub struct ExtractionQuality {
|
||||
/// Overall quality assessment: "high", "medium", "low", or "none".
|
||||
///
|
||||
|
|
|
|||
|
|
@ -1,345 +1,489 @@
|
|||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"$id": "https://pdftract.ardenone.com/schemas/v1.0/pdftract.schema.json",
|
||||
"title": "PDFtract Extraction Output Schema v1.0",
|
||||
"description": "JSON output schema for PDF text and structure extraction",
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"title": "ExtractionResult",
|
||||
"description": "Result of a PDF extraction operation.\n\nContains the extracted pages, spans, blocks, and metadata.",
|
||||
"type": "object",
|
||||
"required": ["fingerprint", "schema_version", "pages", "metadata"],
|
||||
"properties": {
|
||||
"fingerprint": {
|
||||
"type": "string",
|
||||
"description": "PDF fingerprint for verification (format: pdftract-v1:<hex>)"
|
||||
},
|
||||
"schema_version": {
|
||||
"type": "string",
|
||||
"description": "Schema version (e.g., '1.0')",
|
||||
"enum": ["1.0"]
|
||||
},
|
||||
"pages": {
|
||||
"type": "array",
|
||||
"description": "Extracted pages",
|
||||
"items": {
|
||||
"$ref": "#/definitions/page"
|
||||
}
|
||||
"description": "The PDF fingerprint (for receipt generation).",
|
||||
"type": "string"
|
||||
},
|
||||
"metadata": {
|
||||
"$ref": "#/definitions/metadata"
|
||||
"description": "Metadata about the extraction.",
|
||||
"$ref": "#/$defs/ExtractionMetadata"
|
||||
},
|
||||
"pages": {
|
||||
"description": "Extracted pages, each containing spans and blocks.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/$defs/PageResult"
|
||||
}
|
||||
}
|
||||
},
|
||||
"definitions": {
|
||||
"page": {
|
||||
"required": [
|
||||
"fingerprint",
|
||||
"pages",
|
||||
"metadata"
|
||||
],
|
||||
"$defs": {
|
||||
"BlockJson": {
|
||||
"description": "JSON representation of a structural block.\n\nA block is a higher-level semantic unit composed of one or more\nspans. Examples include paragraphs, headings, list items, and\ntable cells.",
|
||||
"type": "object",
|
||||
"required": ["index", "spans", "blocks", "tables"],
|
||||
"properties": {
|
||||
"index": {
|
||||
"type": "integer",
|
||||
"description": "0-based page index"
|
||||
},
|
||||
"spans": {
|
||||
"type": "array",
|
||||
"description": "Extracted text spans",
|
||||
"items": {
|
||||
"$ref": "#/definitions/span"
|
||||
}
|
||||
},
|
||||
"blocks": {
|
||||
"type": "array",
|
||||
"description": "Extracted structural blocks",
|
||||
"items": {
|
||||
"$ref": "#/definitions/block"
|
||||
}
|
||||
},
|
||||
"tables": {
|
||||
"type": "array",
|
||||
"description": "Extracted tables (cell-level structure)",
|
||||
"items": {
|
||||
"$ref": "#/definitions/table"
|
||||
}
|
||||
},
|
||||
"error": {
|
||||
"type": "string",
|
||||
"description": "Error message if extraction failed for this page"
|
||||
}
|
||||
}
|
||||
},
|
||||
"span": {
|
||||
"type": "object",
|
||||
"required": ["text", "bbox", "font", "size"],
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "The extracted text content"
|
||||
},
|
||||
"bbox": {
|
||||
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
|
||||
"type": "array",
|
||||
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
|
||||
"items": {
|
||||
"type": "number"
|
||||
"type": "number",
|
||||
"format": "double"
|
||||
},
|
||||
"minItems": 4,
|
||||
"maxItems": 4
|
||||
"maxItems": 4,
|
||||
"minItems": 4
|
||||
},
|
||||
"font": {
|
||||
"type": "string",
|
||||
"description": "Font name or identifier"
|
||||
},
|
||||
"size": {
|
||||
"type": "number",
|
||||
"description": "Font size in points"
|
||||
},
|
||||
"confidence": {
|
||||
"type": "number",
|
||||
"description": "Confidence score (0.0 to 1.0) for OCR text",
|
||||
"minimum": 0.0,
|
||||
"maximum": 1.0
|
||||
},
|
||||
"receipt": {
|
||||
"$ref": "#/definitions/receipt"
|
||||
}
|
||||
}
|
||||
},
|
||||
"block": {
|
||||
"type": "object",
|
||||
"required": ["kind", "text", "bbox"],
|
||||
"properties": {
|
||||
"kind": {
|
||||
"type": "string",
|
||||
"description": "Block kind/type",
|
||||
"enum": ["paragraph", "heading", "list", "table", "figure"]
|
||||
},
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "The concatenated text content of all spans in the block"
|
||||
},
|
||||
"bbox": {
|
||||
"type": "array",
|
||||
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
|
||||
"items": {
|
||||
"type": "number"
|
||||
},
|
||||
"minItems": 4,
|
||||
"maxItems": 4
|
||||
"description": "The block kind/type.\n\nCommon values: \"paragraph\", \"heading\", \"list\", \"table\", \"figure\".",
|
||||
"type": "string"
|
||||
},
|
||||
"level": {
|
||||
"type": "integer",
|
||||
"description": "Heading level (1-6) for 'heading' kind blocks",
|
||||
"minimum": 1,
|
||||
"maximum": 6
|
||||
},
|
||||
"table_index": {
|
||||
"type": "integer",
|
||||
"description": "Table index for 'table' kind blocks (points to tables array)",
|
||||
"description": "Optional heading level (1-6) for \"heading\" kind blocks.\n\nThis field is present only for heading blocks. For paragraphs\nand other block types, it is `null`.",
|
||||
"type": [
|
||||
"integer",
|
||||
"null"
|
||||
],
|
||||
"format": "uint8",
|
||||
"maximum": 255,
|
||||
"minimum": 0
|
||||
},
|
||||
"receipt": {
|
||||
"$ref": "#/definitions/receipt"
|
||||
}
|
||||
}
|
||||
},
|
||||
"table": {
|
||||
"type": "object",
|
||||
"required": ["id", "bbox", "rows", "header_rows", "detection_method", "continued", "continued_from_prev", "page_index"],
|
||||
"properties": {
|
||||
"id": {
|
||||
"type": "string",
|
||||
"description": "Unique identifier for this table (e.g., 'table_0')"
|
||||
"description": "Optional cryptographic receipt for verification.\n\nThis field is present when `--receipts=lite` or `--receipts=svg`\nis enabled. When receipts are disabled, the field is `null`.",
|
||||
"anyOf": [
|
||||
{
|
||||
"$ref": "#/$defs/Receipt"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
]
|
||||
},
|
||||
"bbox": {
|
||||
"type": "array",
|
||||
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
|
||||
"items": {
|
||||
"type": "number"
|
||||
},
|
||||
"minItems": 4,
|
||||
"maxItems": 4
|
||||
},
|
||||
"rows": {
|
||||
"type": "array",
|
||||
"description": "Rows in this table, ordered top-to-bottom",
|
||||
"items": {
|
||||
"$ref": "#/definitions/row"
|
||||
}
|
||||
},
|
||||
"header_rows": {
|
||||
"type": "integer",
|
||||
"description": "Number of contiguous header rows at the top of the table",
|
||||
"table_index": {
|
||||
"description": "Optional table index for \"table\" kind blocks.\n\nThis field is present only for table blocks and points to the\ncorresponding entry in the page's `tables` array.",
|
||||
"type": [
|
||||
"integer",
|
||||
"null"
|
||||
],
|
||||
"format": "uint",
|
||||
"minimum": 0
|
||||
},
|
||||
"detection_method": {
|
||||
"type": "string",
|
||||
"description": "Detection method used to identify this table",
|
||||
"enum": ["line_based", "borderless"]
|
||||
},
|
||||
"continued": {
|
||||
"type": "boolean",
|
||||
"description": "Whether this table continues on the next page"
|
||||
},
|
||||
"continued_from_prev": {
|
||||
"type": "boolean",
|
||||
"description": "Whether this table is a continuation from the previous page"
|
||||
},
|
||||
"page_index": {
|
||||
"type": "integer",
|
||||
"description": "Zero-based page index where this table appears",
|
||||
"minimum": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"row": {
|
||||
"type": "object",
|
||||
"required": ["bbox", "cells", "is_header"],
|
||||
"properties": {
|
||||
"bbox": {
|
||||
"type": "array",
|
||||
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
|
||||
"items": {
|
||||
"type": "number"
|
||||
},
|
||||
"minItems": 4,
|
||||
"maxItems": 4
|
||||
},
|
||||
"cells": {
|
||||
"type": "array",
|
||||
"description": "Cells in this row, ordered left-to-right",
|
||||
"items": {
|
||||
"$ref": "#/definitions/cell"
|
||||
}
|
||||
},
|
||||
"is_header": {
|
||||
"type": "boolean",
|
||||
"description": "Whether this row is a header row"
|
||||
}
|
||||
}
|
||||
},
|
||||
"cell": {
|
||||
"type": "object",
|
||||
"required": ["bbox", "text", "spans", "row", "col", "rowspan", "colspan", "is_header_row"],
|
||||
"properties": {
|
||||
"bbox": {
|
||||
"type": "array",
|
||||
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
|
||||
"items": {
|
||||
"type": "number"
|
||||
},
|
||||
"minItems": 4,
|
||||
"maxItems": 4
|
||||
},
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "The concatenated text content of all spans in the cell"
|
||||
},
|
||||
"spans": {
|
||||
"description": "The concatenated text content of all spans in the block.",
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"kind",
|
||||
"text",
|
||||
"bbox"
|
||||
]
|
||||
},
|
||||
"CellJson": {
|
||||
"description": "JSON representation of a table cell.\n\nA cell represents a single unit within a table row, containing\nits text content, bounding box, and position information.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"bbox": {
|
||||
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
|
||||
"type": "array",
|
||||
"description": "References to spans in the page's spans array",
|
||||
"items": {
|
||||
"type": "integer"
|
||||
}
|
||||
},
|
||||
"row": {
|
||||
"type": "integer",
|
||||
"description": "Zero-based row index within the table",
|
||||
"minimum": 0
|
||||
"type": "number",
|
||||
"format": "double"
|
||||
},
|
||||
"maxItems": 4,
|
||||
"minItems": 4
|
||||
},
|
||||
"col": {
|
||||
"description": "Zero-based column index within the table.",
|
||||
"type": "integer",
|
||||
"description": "Zero-based column index within the table",
|
||||
"format": "uint",
|
||||
"minimum": 0
|
||||
},
|
||||
"colspan": {
|
||||
"description": "Number of columns this cell spans (default 1).\n\nValues greater than 1 indicate a merged cell that spans\nmultiple columns horizontally.",
|
||||
"type": "integer",
|
||||
"format": "uint32",
|
||||
"default": 1,
|
||||
"minimum": 0
|
||||
},
|
||||
"is_header_row": {
|
||||
"description": "Whether this cell is in a header row.\n\nHeader cells are typically rendered differently (bold, centered)\nand may be reused when tables span multiple pages.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"row": {
|
||||
"description": "Zero-based row index within the table.",
|
||||
"type": "integer",
|
||||
"format": "uint",
|
||||
"minimum": 0
|
||||
},
|
||||
"rowspan": {
|
||||
"description": "Number of rows this cell spans (default 1).\n\nValues greater than 1 indicate a merged cell that spans\nmultiple rows vertically.",
|
||||
"type": "integer",
|
||||
"description": "Number of rows this cell spans (default 1)",
|
||||
"minimum": 1
|
||||
"format": "uint32",
|
||||
"default": 1,
|
||||
"minimum": 0
|
||||
},
|
||||
"colspan": {
|
||||
"type": "integer",
|
||||
"description": "Number of columns this cell spans (default 1)",
|
||||
"minimum": 1
|
||||
},
|
||||
"is_header_row": {
|
||||
"type": "boolean",
|
||||
"description": "Whether this cell is in a header row"
|
||||
}
|
||||
}
|
||||
},
|
||||
"receipt": {
|
||||
"type": "object",
|
||||
"required": ["pdf_fingerprint", "page_index", "bbox", "content_hash", "extraction_version"],
|
||||
"properties": {
|
||||
"pdf_fingerprint": {
|
||||
"type": "string",
|
||||
"description": "The PDF fingerprint"
|
||||
},
|
||||
"page_index": {
|
||||
"type": "integer",
|
||||
"description": "The page index"
|
||||
},
|
||||
"bbox": {
|
||||
"spans": {
|
||||
"description": "References to spans in the page's `spans` array.\n\nThese indices point to the spans that make up this cell's content.",
|
||||
"type": "array",
|
||||
"description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
|
||||
"items": {
|
||||
"type": "number"
|
||||
},
|
||||
"minItems": 4,
|
||||
"maxItems": 4
|
||||
"type": "integer",
|
||||
"format": "uint",
|
||||
"minimum": 0
|
||||
}
|
||||
},
|
||||
"content_hash": {
|
||||
"type": "string",
|
||||
"description": "SHA-256 hash of the content"
|
||||
},
|
||||
"extraction_version": {
|
||||
"type": "string",
|
||||
"description": "Version string of the extractor"
|
||||
},
|
||||
"svg_clip": {
|
||||
"type": "string",
|
||||
"description": "SVG clip path for verification (present only in SvgClip mode)"
|
||||
"text": {
|
||||
"description": "The concatenated text content of all spans in the cell.",
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"bbox",
|
||||
"text",
|
||||
"spans",
|
||||
"row",
|
||||
"col",
|
||||
"is_header_row"
|
||||
]
|
||||
},
|
||||
"metadata": {
|
||||
"ExtractionMetadata": {
|
||||
"description": "Metadata about the extraction process.",
|
||||
"type": "object",
|
||||
"required": ["page_count", "span_count", "block_count"],
|
||||
"properties": {
|
||||
"page_count": {
|
||||
"type": "integer",
|
||||
"description": "Total number of pages in the document"
|
||||
},
|
||||
"span_count": {
|
||||
"type": "integer",
|
||||
"description": "Number of spans extracted"
|
||||
},
|
||||
"block_count": {
|
||||
"description": "Number of blocks extracted.",
|
||||
"type": "integer",
|
||||
"description": "Number of blocks extracted"
|
||||
},
|
||||
"cache_status": {
|
||||
"type": "string",
|
||||
"description": "Cache status: 'hit', 'miss', or 'skipped'",
|
||||
"enum": ["hit", "miss", "skipped"]
|
||||
"format": "uint",
|
||||
"minimum": 0
|
||||
},
|
||||
"cache_age_seconds": {
|
||||
"type": "integer",
|
||||
"description": "Cache entry age in seconds (only present when cache_status == 'hit')",
|
||||
"description": "Cache entry age in seconds (only present when cache_status == \"hit\")",
|
||||
"type": [
|
||||
"integer",
|
||||
"null"
|
||||
],
|
||||
"format": "uint64",
|
||||
"minimum": 0
|
||||
},
|
||||
"error_count": {
|
||||
"type": "integer",
|
||||
"description": "Number of pages that failed to extract",
|
||||
"minimum": 0
|
||||
},
|
||||
"reading_order_algorithm": {
|
||||
"type": "string",
|
||||
"description": "Reading order algorithm used for this extraction",
|
||||
"enum": ["struct_tree", "xy_cut"]
|
||||
"cache_status": {
|
||||
"description": "Cache status: \"hit\", \"miss\", or \"skipped\"",
|
||||
"type": [
|
||||
"string",
|
||||
"null"
|
||||
]
|
||||
},
|
||||
"diagnostics": {
|
||||
"description": "Diagnostics emitted during extraction (coverage warnings, etc.)",
|
||||
"type": "array",
|
||||
"description": "Diagnostics emitted during extraction",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"error_count": {
|
||||
"description": "Number of pages that failed to extract.",
|
||||
"type": "integer",
|
||||
"format": "uint",
|
||||
"minimum": 0
|
||||
},
|
||||
"page_count": {
|
||||
"description": "Total number of pages in the document.",
|
||||
"type": "integer",
|
||||
"format": "uint",
|
||||
"minimum": 0
|
||||
},
|
||||
"reading_order_algorithm": {
|
||||
"description": "Reading order algorithm used for this extraction.",
|
||||
"type": [
|
||||
"string",
|
||||
"null"
|
||||
]
|
||||
},
|
||||
"receipts_mode": {
|
||||
"description": "Receipts mode used for this extraction.",
|
||||
"$ref": "#/$defs/ReceiptsMode"
|
||||
},
|
||||
"span_count": {
|
||||
"description": "Number of spans extracted.",
|
||||
"type": "integer",
|
||||
"format": "uint",
|
||||
"minimum": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"page_count",
|
||||
"receipts_mode",
|
||||
"span_count",
|
||||
"block_count",
|
||||
"error_count",
|
||||
"diagnostics"
|
||||
]
|
||||
},
|
||||
"PageResult": {
|
||||
"description": "Result for a single page.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"blocks": {
|
||||
"description": "Extracted blocks (semantic units like paragraphs, headings).",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/$defs/BlockJson"
|
||||
}
|
||||
},
|
||||
"error": {
|
||||
"description": "Error message if extraction failed for this page.",
|
||||
"type": [
|
||||
"string",
|
||||
"null"
|
||||
]
|
||||
},
|
||||
"index": {
|
||||
"description": "0-based page index.",
|
||||
"type": "integer",
|
||||
"format": "uint",
|
||||
"minimum": 0
|
||||
},
|
||||
"spans": {
|
||||
"description": "Extracted spans (text fragments with consistent styling).",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/$defs/SpanJson"
|
||||
}
|
||||
},
|
||||
"tables": {
|
||||
"description": "Extracted tables (cell-level structure).\n\nThis array provides detailed table structure with rows and cells.\nTable blocks in the `blocks` array reference entries here via `table_index`.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/$defs/TableJson"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"index",
|
||||
"spans",
|
||||
"blocks",
|
||||
"tables"
|
||||
]
|
||||
},
|
||||
"Receipt": {
|
||||
"description": "A visual citation receipt for extracted text.\n\nReceipts provide cryptographic proof that a piece of extracted text\noriginated from a specific region in a specific PDF. They can be\nverified independently by re-running pdftract on the original file.\n\n# Lite mode\n\nIn lite mode, `svg_clip` is `None` and the JSON output does not\ninclude the key at all (via `skip_serializing_if`). This keeps\nreceipts small (~120-180 bytes) for high-volume use cases like\nRAG citation pipelines.\n\n# SVG mode\n\nIn SVG mode, `svg_clip` contains a self-contained SVG element\nthat renders only the glyphs whose bboxes fall within the receipt\nbbox. The SVG is normalized to the bbox coordinate system and\ncan be rendered standalone in any browser.\n\n# Example\n\n```json\n{\n \"pdf_fingerprint\": \"pdftract-v1:a7f3...\",\n \"page_index\": 14,\n \"bbox\": [220.0, 412.0, 412.0, 432.0],\n \"content_hash\": \"sha256:9b21...\",\n \"extraction_version\": \"1.0.0\"\n}\n```",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"bbox": {
|
||||
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where:\n- x0, y0: bottom-left corner\n- x1, y1: top-right corner\n- Units: PDF points (1/72 inch)\n\nThis is a copy of the parent span's bbox, included so the\nreceipt is self-contained.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"format": "double"
|
||||
},
|
||||
"maxItems": 4,
|
||||
"minItems": 4
|
||||
},
|
||||
"content_hash": {
|
||||
"description": "SHA-256 hash of the NFC-normalized text content.\n\nFormat: `\"sha256:\" + hex(SHA-256)`.\n\nThe text is normalized to NFC form before hashing to ensure\nstability across platforms that may use different Unicode\nnormalization forms (e.g., macOS HFS+/APFS sometimes round-trips\nthrough NFD).",
|
||||
"type": "string"
|
||||
},
|
||||
"extraction_version": {
|
||||
"description": "The pdftract version that produced this receipt.\n\nFormat: semver string (e.g., \"1.0.0\", \"1.0.0-rc.1\").\nTaken from `CARGO_PKG_VERSION` at compile time.",
|
||||
"type": "string"
|
||||
},
|
||||
"page_index": {
|
||||
"description": "0-based page index in the source PDF.\n\nMatches the page_index in the extraction schema.",
|
||||
"type": "integer",
|
||||
"format": "uint",
|
||||
"minimum": 0
|
||||
},
|
||||
"pdf_fingerprint": {
|
||||
"description": "Phase 1.7 fingerprint of the source PDF.\n\nFormat: `\"pdftract-v1:\" + hex(SHA-256)`.\nThe verifier compares this string literally (not parsed).",
|
||||
"type": "string"
|
||||
},
|
||||
"svg_clip": {
|
||||
"description": "Optional SVG clip rendering the glyphs in this receipt.\n\n- `None` in lite mode (the key is omitted from JSON entirely)\n- `Some(svg)` in SVG mode, where `svg` is a self-contained SVG element\n\nThe SVG coordinate system is normalized to the bbox itself,\nso it renders correctly in isolation.",
|
||||
"type": [
|
||||
"string",
|
||||
"null"
|
||||
]
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"pdf_fingerprint",
|
||||
"page_index",
|
||||
"bbox",
|
||||
"content_hash",
|
||||
"extraction_version"
|
||||
]
|
||||
},
|
||||
"ReceiptsMode": {
|
||||
"description": "Receipt generation mode.\n\nControls whether visual citation receipts are generated during extraction.",
|
||||
"oneOf": [
|
||||
{
|
||||
"description": "No receipts generated (default).",
|
||||
"type": "string",
|
||||
"const": "off"
|
||||
},
|
||||
{
|
||||
"description": "Lite mode: minimal receipts (~120 bytes each) with fingerprint, page index, bbox, and content hash.",
|
||||
"type": "string",
|
||||
"const": "lite"
|
||||
},
|
||||
{
|
||||
"description": "SVG mode: extended receipts that include an SVG clip rendering the glyphs.",
|
||||
"type": "string",
|
||||
"const": "svg"
|
||||
}
|
||||
]
|
||||
},
|
||||
"RowJson": {
|
||||
"description": "JSON representation of a table row.\n\nA row contains a sequence of cells that form a horizontal strip\nin the table.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"bbox": {
|
||||
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"format": "double"
|
||||
},
|
||||
"maxItems": 4,
|
||||
"minItems": 4
|
||||
},
|
||||
"cells": {
|
||||
"description": "Cells in this row, ordered left-to-right.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/$defs/CellJson"
|
||||
}
|
||||
},
|
||||
"is_header": {
|
||||
"description": "Whether this row is a header row.\n\nHeader rows are typically repeated when tables span multiple pages.",
|
||||
"type": "boolean"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"bbox",
|
||||
"cells",
|
||||
"is_header"
|
||||
]
|
||||
},
|
||||
"SpanJson": {
|
||||
"description": "JSON representation of a text span.\n\nA span is the smallest unit of extracted text, representing a\ncontiguous run of text with consistent font and styling.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"bbox": {
|
||||
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"format": "double"
|
||||
},
|
||||
"maxItems": 4,
|
||||
"minItems": 4
|
||||
},
|
||||
"confidence": {
|
||||
"description": "Optional confidence score (0.0 to 1.0).\n\nThis field is present when OCR is used or when the extraction\nhas uncertainty about the text. When confidence is not applicable,\nthis field is `null`.",
|
||||
"type": [
|
||||
"number",
|
||||
"null"
|
||||
],
|
||||
"format": "double"
|
||||
},
|
||||
"font": {
|
||||
"description": "Font name or identifier.",
|
||||
"type": "string"
|
||||
},
|
||||
"receipt": {
|
||||
"description": "Optional cryptographic receipt for verification.\n\nThis field is present when `--receipts=lite` or `--receipts=svg`\nis enabled. When receipts are disabled, the field is `null`.",
|
||||
"anyOf": [
|
||||
{
|
||||
"$ref": "#/$defs/Receipt"
|
||||
},
|
||||
{
|
||||
"type": "null"
|
||||
}
|
||||
]
|
||||
},
|
||||
"size": {
|
||||
"description": "Font size in points.",
|
||||
"type": "number",
|
||||
"format": "double"
|
||||
},
|
||||
"text": {
|
||||
"description": "The extracted text content.",
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"text",
|
||||
"bbox",
|
||||
"font",
|
||||
"size"
|
||||
]
|
||||
},
|
||||
"TableJson": {
|
||||
"description": "JSON representation of a table.\n\nTables are emitted in parallel with table blocks - the block\nprovides the concatenated text and position, while the TableJson\nprovides full cell-level structure.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"bbox": {
|
||||
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "number",
|
||||
"format": "double"
|
||||
},
|
||||
"maxItems": 4,
|
||||
"minItems": 4
|
||||
},
|
||||
"continued": {
|
||||
"description": "Whether this table continues on the next page.\n\nSet to `true` when a table is split across pages and this\npage contains the first part.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"continued_from_prev": {
|
||||
"description": "Whether this table is a continuation from the previous page.\n\nSet to `true` when a table is split across pages and this\npage contains a subsequent part.",
|
||||
"type": "boolean"
|
||||
},
|
||||
"detection_method": {
|
||||
"description": "Detection method used to identify this table.\n\n- \"line_based\": Table detected via ruling lines (borders)\n- \"borderless\": Table detected via x0 alignment heuristics",
|
||||
"type": "string"
|
||||
},
|
||||
"header_rows": {
|
||||
"description": "Number of contiguous header rows at the top of the table.\n\nHeader rows are typically repeated when tables span multiple pages.",
|
||||
"type": "integer",
|
||||
"format": "uint32",
|
||||
"minimum": 0
|
||||
},
|
||||
"id": {
|
||||
"description": "Unique identifier for this table (e.g., \"table_0\").",
|
||||
"type": "string"
|
||||
},
|
||||
"page_index": {
|
||||
"description": "Zero-based page index where this table appears.",
|
||||
"type": "integer",
|
||||
"format": "uint",
|
||||
"minimum": 0
|
||||
},
|
||||
"rows": {
|
||||
"description": "Rows in this table, ordered top-to-bottom.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/$defs/RowJson"
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"id",
|
||||
"bbox",
|
||||
"rows",
|
||||
"header_rows",
|
||||
"detection_method",
|
||||
"continued",
|
||||
"continued_from_prev",
|
||||
"page_index"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -11,6 +11,10 @@ publish = false
|
|||
name = "xtask"
|
||||
path = "src/main.rs"
|
||||
|
||||
[[bin]]
|
||||
name = "gen_schema"
|
||||
path = "src/bin/gen_schema.rs"
|
||||
|
||||
[dependencies]
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0"
|
||||
|
|
@ -18,3 +22,5 @@ serde_yaml = "0.9"
|
|||
glob = "0.3"
|
||||
humantime = "2.1"
|
||||
lopdf = "0.34"
|
||||
schemars = "1.2"
|
||||
pdftract-core = { path = "../crates/pdftract-core", features = ["schemars"] }
|
||||
|
|
|
|||
74
xtask/src/bin/gen_schema.rs
Normal file
74
xtask/src/bin/gen_schema.rs
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
//! Generate JSON Schema from Rust output types.
|
||||
//!
|
||||
//! This binary generates the canonical JSON Schema for pdftract's
|
||||
//! extraction output, which is checked into the repository at
|
||||
//! docs/schema/v1.0/pdftract.schema.json.
|
||||
//!
|
||||
//! Usage: cargo run --bin gen_schema
|
||||
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
// Find the workspace root
|
||||
let workspace_root = find_workspace_root();
|
||||
|
||||
// Generate the schema
|
||||
let schema_json = generate_schema();
|
||||
|
||||
// Write to docs/schema/v1.0/pdftract.schema.json
|
||||
let schema_path = workspace_root.join("docs/schema/v1.0/pdftract.schema.json");
|
||||
|
||||
// Create the directory if it doesn't exist
|
||||
if let Some(parent) = schema_path.parent() {
|
||||
fs::create_dir_all(parent)?;
|
||||
}
|
||||
|
||||
fs::write(&schema_path, schema_json)?;
|
||||
|
||||
println!("Generated schema at: {}", schema_path.display());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Locate the Cargo workspace root for the current process.
///
/// The search starts at the current working directory (or at its parent when
/// the working directory is named `xtask`) and visits each ancestor in turn.
/// The first directory whose `Cargo.toml` contains the text `[workspace]` is
/// returned. When no ancestor qualifies, the current working directory is
/// returned instead.
///
/// # Panics
///
/// Panics if the current working directory cannot be determined.
fn find_workspace_root() -> PathBuf {
    let cwd = std::env::current_dir().unwrap();

    // When launched from inside `xtask/`, begin the search one level up.
    let start = if cwd.ends_with("xtask") {
        cwd.parent().unwrap().to_path_buf()
    } else {
        cwd
    };

    start
        .ancestors()
        .find(|dir| {
            let manifest = dir.join("Cargo.toml");
            // An unreadable manifest is treated as empty, i.e. not a workspace manifest.
            manifest.exists()
                && fs::read_to_string(&manifest)
                    .unwrap_or_default()
                    .contains("[workspace]")
        })
        .map(|dir| dir.to_path_buf())
        // Fallback: no workspace manifest anywhere above the starting point.
        .unwrap_or_else(|| std::env::current_dir().unwrap())
}
|
||||
|
||||
/// Generate the JSON Schema for pdftract extraction output.
|
||||
fn generate_schema() -> String {
|
||||
use pdftract_core::extract::ExtractionResult;
|
||||
use schemars::schema_for;
|
||||
|
||||
let schema = schema_for!(ExtractionResult);
|
||||
|
||||
// Convert to JSON string
|
||||
// The schema_for! macro already includes the $schema field
|
||||
serde_json::to_string_pretty(&schema)
|
||||
.expect("Failed to serialize schema")
|
||||
}
|
||||
|
|
@ -104,17 +104,19 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
eprintln!(" doc-profiles Generate README skeletons for all profiles");
|
||||
eprintln!(" generate-stress-pdfs Generate stress-test PDFs for memory ceiling testing");
|
||||
eprintln!(" generate-page-class-fixtures Generate page classification test fixtures");
|
||||
eprintln!(" gen-schema Generate JSON Schema from Rust output types");
|
||||
eprintln!(" memory-ceiling Run memory ceiling tests against perf/malformed corpora");
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
match args[1].as_str() {
|
||||
let result = match args[1].as_str() {
|
||||
"doc-profile" => {
|
||||
if args.len() < 3 {
|
||||
eprintln!("Usage: xtask doc-profile <profile-name>");
|
||||
std::process::exit(1);
|
||||
}
|
||||
generate_profile_readme(&args[2])?;
|
||||
Ok(())
|
||||
}
|
||||
"doc-profiles" => {
|
||||
let profiles_dir = find_workspace_root().join("profiles/builtin");
|
||||
|
|
@ -127,20 +129,45 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
"generate-stress-pdfs" => {
|
||||
generate_stress_pdfs()?;
|
||||
Ok(())
|
||||
}
|
||||
"generate-page-class-fixtures" => {
|
||||
generate_page_class_fixtures()?;
|
||||
Ok(())
|
||||
}
|
||||
"gen-schema" => {
|
||||
gen_schema()?;
|
||||
Ok(())
|
||||
}
|
||||
"memory-ceiling" => {
|
||||
run_memory_ceiling_tests()?;
|
||||
Ok(())
|
||||
}
|
||||
_ => {
|
||||
eprintln!("Unknown command: {}", args[1]);
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Generate JSON Schema from Rust output types.
|
||||
///
|
||||
/// Delegates to the gen_schema binary.
|
||||
fn gen_schema() -> Result<(), Box<dyn std::error::Error>> {
|
||||
// Invoke the gen_schema binary
|
||||
let status = std::process::Command::new("cargo")
|
||||
.args(["run", "--bin", "gen_schema"])
|
||||
.current_dir(find_workspace_root())
|
||||
.status()?;
|
||||
|
||||
if !status.success() {
|
||||
return Err(format!("gen_schema failed with exit code: {:?}", status.code()).into());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue