pdftract/docs/schema/v1.0/pdftract.schema.json
jedarden 23322f79d1
Some checks are pending
Schema Generation Validation / Validate JSON Schema (push) Waiting to run
Schema Generation Validation / Validate JSON Syntax (push) Waiting to run
feat(pdftract-2qw5j): add explicit enum constraints to JSON Schema
Add explicit enum constraints to page_type, severity, and confidence_source
fields in the generated JSON Schema for better validation.

Changes:
- Modified xtask/src/bin/gen_schema.rs to add explicit enum constraints
  during schema generation via add_enum_constraints() function
- page_type enum: ["text", "scanned", "mixed", "broken_vector", "blank", "figure_only"]
- severity enum: ["info", "warning", "error", "fatal"]
- confidence_source enum: ["native", "heuristic", "ocr"]
- Regenerated docs/schema/v1.0/pdftract.schema.json with enum constraints
- Added .github/workflows/schema-gen.yml CI workflow for schema validation

The CI workflow validates:
1. Generated schema matches committed file (fails on diff)
2. JSON syntax is valid
3. Schema structure is correct ($id, $schema, title, $defs)
4. Enum constraints are present and have correct values

This ensures schema changes are reviewable in PRs and forces
developers to commit the updated schema when type definitions change.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 02:47:54 -04:00

1900 lines
No EOL
66 KiB
JSON

{
"$defs": {
"AnnotationJson": {
"description": "JSON representation of a non-link annotation.\n\nRepresents markup annotations like highlights, text notes, stamps,\nand other non-link annotations.\n\nPer the plan (Phase 7.6.4), annotations are emitted at the page level in the\n`/pages[i]/annotations` array, sorted by (rect.y0 desc, rect.x0) for deterministic output.",
"properties": {
"author": {
"description": "The annotation's author (from /T).\n\nNone if /T is missing or not a string.",
"type": [
"string",
"null"
]
},
"color": {
"description": "The color array (from /C) as RGB/Grayscale components.\n\nNone if /C is missing. Length is 1 (grayscale), 3 (RGB), or 4 (CMYK).",
"items": {
"format": "float",
"type": "number"
},
"type": [
"array",
"null"
]
},
"contents": {
"description": "The annotation's content text (from /Contents).\n\nNone if /Contents is missing or not a string.",
"type": [
"string",
"null"
]
},
"modified": {
"description": "The modification date (from /M) as an ISO 8601 string.\n\nNone if /M is missing, malformed, or fails to parse.",
"type": [
"string",
"null"
]
},
"name_id": {
"description": "The name identifier (from /NM).\n\nNone if /NM is missing.",
"type": [
"string",
"null"
]
},
"opacity": {
"description": "The opacity (from /CA).\n\nNone if not specified (defaults to 1.0).",
"format": "float",
"type": [
"number",
"null"
]
},
"rect": {
"description": "Bounding box in PDF user-space points.\n\nFormat: [x0, y0, x1, y1] where (x0, y0) is the bottom-left corner.\nNone if the /Rect entry is missing or invalid.",
"items": {
"format": "float",
"type": "number"
},
"maxItems": 4,
"minItems": 4,
"type": [
"array",
"null"
]
},
"specific": {
"anyOf": [
{
"$ref": "#/$defs/AnnotationSpecificJson"
},
{
"type": "null"
}
],
"description": "Subtype-specific fields.\n\nThe presence and contents of this field depend on the annotation subtype:\n- TextMarkup (Highlight, Squiggly, StrikeOut, Underline): contains \"quads\" array\n- Stamp: contains \"name\" field\n- FreeText: contains \"da\" (default appearance) field\n- Text (sticky note): contains \"open\", \"state\", \"state_model\" fields\n- Ink: contains \"strokes\" array\n- Line: contains \"endpoints\" array\n- Polygon/PolyLine: contains \"vertices\" array\n- FileAttachment: contains \"fs_ref\" field\n- Other subtypes: null or omitted"
},
"subject": {
"description": "The subject (from /Subj).\n\nNone if /Subj is missing.",
"type": [
"string",
"null"
]
},
"type": {
"description": "Annotation subtype (e.g., \"Text\", \"Highlight\", \"Stamp\", \"FreeText\").\n\nPer INV: stable taxonomy of annotation subtypes.",
"type": "string"
}
},
"required": [
"type"
],
"type": "object"
},
"AnnotationSpecificJson": {
"description": "JSON representation of subtype-specific annotation fields.",
"oneOf": [
{
"description": "Text markup annotations (Highlight, Squiggly, StrikeOut, Underline).\n\nContains quad points for the highlighted regions.",
"properties": {
"kind": {
"const": "text_markup",
"type": "string"
},
"quads": {
"items": {
"items": {
"format": "float",
"type": "number"
},
"maxItems": 8,
"minItems": 8,
"type": "array"
},
"type": "array"
}
},
"required": [
"kind",
"quads"
],
"type": "object"
},
{
"description": "Stamp annotation with icon name.",
"properties": {
"kind": {
"const": "stamp",
"type": "string"
},
"name": {
"type": [
"string",
"null"
]
}
},
"required": [
"kind"
],
"type": "object"
},
{
"description": "FreeText annotation with default appearance string.",
"properties": {
"da": {
"type": [
"string",
"null"
]
},
"kind": {
"const": "free_text",
"type": "string"
}
},
"required": [
"kind"
],
"type": "object"
},
{
"description": "Text (sticky note) annotation.",
"properties": {
"kind": {
"const": "text",
"type": "string"
},
"open": {
"type": [
"boolean",
"null"
]
},
"state": {
"type": [
"string",
"null"
]
},
"state_model": {
"type": [
"string",
"null"
]
}
},
"required": [
"kind"
],
"type": "object"
},
{
"description": "Ink annotation with stroke paths.",
"properties": {
"kind": {
"const": "ink",
"type": "string"
},
"strokes": {
"items": {
"items": {
"items": {
"format": "float",
"type": "number"
},
"maxItems": 2,
"minItems": 2,
"type": "array"
},
"type": "array"
},
"type": "array"
}
},
"required": [
"kind",
"strokes"
],
"type": "object"
},
{
"description": "Line annotation with endpoints.",
"properties": {
"endpoints": {
"items": {
"format": "float",
"type": "number"
},
"maxItems": 4,
"minItems": 4,
"type": [
"array",
"null"
]
},
"kind": {
"const": "line",
"type": "string"
}
},
"required": [
"kind"
],
"type": "object"
},
{
"description": "Polygon or PolyLine annotation with vertices.",
"properties": {
"kind": {
"const": "polygon",
"type": "string"
},
"vertices": {
"items": {
"items": {
"format": "float",
"type": "number"
},
"maxItems": 2,
"minItems": 2,
"type": "array"
},
"type": "array"
}
},
"required": [
"kind",
"vertices"
],
"type": "object"
},
{
"description": "FileAttachment annotation.",
"properties": {
"fs_ref": {
"format": "uint32",
"minimum": 0,
"type": [
"integer",
"null"
]
},
"kind": {
"const": "file_attachment",
"type": "string"
}
},
"required": [
"kind"
],
"type": "object"
},
{
"description": "Other annotation types with no subtype-specific fields.",
"properties": {
"kind": {
"const": "other",
"type": "string"
}
},
"required": [
"kind"
],
"type": "object"
}
]
},
"AttachmentJson": {
"description": "JSON representation of an embedded file attachment.\n\nRepresents a single embedded file extracted from the PDF's\n`/EmbeddedFiles` name tree or `/AF` (Associated Files) array.\n\nPer the plan (Phase 7.5.3), attachments exceeding 50 MB are truncated\n(metadata only, `data: null`, `truncated: true`). The `data` field\ncontains base64-encoded content using RFC 4648 standard alphabet with\npadding and no line breaks.\n\nThe JSON Schema declares `contentEncoding: base64` for the `data` field,\nenabling JSON Schema validators and code generation tools to understand\nthe encoding.",
"properties": {
"checksum_md5": {
"description": "MD5 checksum from /Params /CheckSum as hex string (None if absent).\n\nPer PDF spec, /CheckSum is a 16-byte binary string (MD5), hex-encoded\nas 32 lowercase hex characters.",
"type": [
"string",
"null"
]
},
"created": {
"description": "Creation date from /Params /CreationDate as ISO 8601 string (None if absent).\n\nFormat: \"YYYY-MM-DDTHH:MM:SS+HH:MM\" or \"YYYY-MM-DDTHH:MM:SSZ\"",
"type": [
"string",
"null"
]
},
"data": {
"description": "Base64-encoded attachment content (null if truncated or empty).\n\nPer JSON Schema, this field has `contentEncoding: base64`, indicating\nthe string is base64-encoded binary data. Downstream tools can use this\ninformation to automatically decode the content.\n\n- `Some(base64_string)` when content <= 50 MB\n- `None` when `truncated: true` (content too large)\n\nIn the Python API (PyO3), this field is returned as a `bytes` object\n(PyO3 automatically decodes the base64 string).",
"type": [
"string",
"null"
]
},
"description": {
"description": "Description from /Desc (None if absent, not empty string).",
"type": [
"string",
"null"
]
},
"mime_type": {
"description": "MIME type from stream /Subtype (None if absent, no guessing from extension).",
"type": [
"string",
"null"
]
},
"modified": {
"description": "Modification date from /Params /ModDate as ISO 8601 string (None if absent).\n\nFormat: \"YYYY-MM-DDTHH:MM:SS+HH:MM\" or \"YYYY-MM-DDTHH:MM:SSZ\"",
"type": [
"string",
"null"
]
},
"name": {
"description": "Attachment filename from /UF (Unicode, preferred) or /F (system-independent).",
"type": "string"
},
"size": {
"description": "Original decoded size in bytes (always populated, even when truncated).\n\nThis is the size of the attachment content before base64 encoding.\nWhen `truncated: true`, this represents the full original size that\nwas not included in the output.",
"format": "uint64",
"minimum": 0,
"type": "integer"
},
"truncated": {
"description": "Whether the attachment content was truncated due to the 50 MB size limit.\n\nWhen `true`, the `data` field is `None` and only metadata is included.\nThe `size` field still reflects the original full size.",
"type": "boolean"
}
},
"required": [
"name",
"size",
"truncated"
],
"type": "object"
},
"BeadJson": {
"description": "A single bead in an article thread chain.\n\nRepresents one bead's position on a page, extracted during bead chain walking.\nPer PDF 1.7 Section 12.4.3, each bead contains a reference to its page and\na bounding rectangle defining the article region on that page.\n\n# Fields\n\n* `page_index` - 0-based index of the page containing this bead\n* `rect` - Bounding rectangle of the bead region in PDF user-space coordinates [x0, y0, x1, y1]",
"properties": {
"page_index": {
"description": "0-based page index where this bead is located.",
"format": "uint",
"minimum": 0,
"type": "integer"
},
"rect": {
"description": "Bounding rectangle in PDF user-space coordinates [x0, y0, x1, y1].\n\nPer PDF spec, the origin is at the bottom-left corner of the page.\nThis rect is NOT flipped to image-space coordinates.",
"items": {
"format": "float",
"type": "number"
},
"maxItems": 4,
"minItems": 4,
"type": "array"
}
},
"required": [
"page_index",
"rect"
],
"type": "object"
},
"BlockJson": {
"description": "JSON representation of a structural block.\n\nA block is a higher-level semantic unit composed of one or more\nspans. Examples include paragraphs, headings, list items, and\ntable cells.",
"properties": {
"bbox": {
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
"items": {
"format": "double",
"type": "number"
},
"maxItems": 4,
"minItems": 4,
"type": "array"
},
"kind": {
"description": "The block kind/type.\n\nCommon values: \"paragraph\", \"heading\", \"list\", \"table\", \"figure\".",
"type": "string"
},
"level": {
"description": "Optional heading level (1-6) for \"heading\" kind blocks.\n\nThis field is present only for heading blocks. For paragraphs\nand other block types, it is `null`.",
"format": "uint8",
"maximum": 255,
"minimum": 0,
"type": [
"integer",
"null"
]
},
"receipt": {
"anyOf": [
{
"$ref": "#/$defs/Receipt"
},
{
"type": "null"
}
],
"description": "Optional cryptographic receipt for verification.\n\nThis field is present when `--receipts=lite` or `--receipts=svg`\nis enabled. When receipts are disabled, the field is `null`."
},
"spans": {
"default": [],
"description": "References to spans in the page's `spans` array.\n\nThese indices point to the spans that make up this block's content.",
"items": {
"format": "uint",
"minimum": 0,
"type": "integer"
},
"type": "array"
},
"table_index": {
"description": "Optional table index for \"table\" kind blocks.\n\nThis field is present only for table blocks and points to the\ncorresponding entry in the page's `tables` array.",
"format": "uint",
"minimum": 0,
"type": [
"integer",
"null"
]
},
"text": {
"description": "The concatenated text content of all spans in the block.",
"type": "string"
}
},
"required": [
"kind",
"text",
"bbox"
],
"type": "object"
},
"CellJson": {
"description": "JSON representation of a table cell.\n\nA cell represents a single unit within a table row, containing\nits text content, bounding box, and position information.",
"properties": {
"bbox": {
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
"items": {
"format": "double",
"type": "number"
},
"maxItems": 4,
"minItems": 4,
"type": "array"
},
"col": {
"description": "Zero-based column index within the table.",
"format": "uint",
"minimum": 0,
"type": "integer"
},
"colspan": {
"default": 1,
"description": "Number of columns this cell spans (default 1).\n\nValues greater than 1 indicate a merged cell that spans\nmultiple columns horizontally.",
"format": "uint32",
"minimum": 0,
"type": "integer"
},
"is_header_row": {
"description": "Whether this cell is in a header row.\n\nHeader cells are typically rendered differently (bold, centered)\nand may be reused when tables span multiple pages.",
"type": "boolean"
},
"row": {
"description": "Zero-based row index within the table.",
"format": "uint",
"minimum": 0,
"type": "integer"
},
"rowspan": {
"default": 1,
"description": "Number of rows this cell spans (default 1).\n\nValues greater than 1 indicate a merged cell that spans\nmultiple rows vertically.",
"format": "uint32",
"minimum": 0,
"type": "integer"
},
"spans": {
"description": "References to spans in the page's `spans` array.\n\nThese indices point to the spans that make up this cell's content.",
"items": {
"format": "uint",
"minimum": 0,
"type": "integer"
},
"type": "array"
},
"text": {
"description": "The concatenated text content of all spans in the cell.",
"type": "string"
}
},
"required": [
"bbox",
"text",
"spans",
"row",
"col",
"is_header_row"
],
"type": "object"
},
"ChoiceValueJson": {
"anyOf": [
{
"description": "Single selected option.",
"type": "string"
},
{
"description": "Multiple selected options.",
"items": {
"type": "string"
},
"type": "array"
}
],
"description": "Choice field value representation.\n\nChoice fields can have either a single selected value or multiple\nselected values (for multi-select list boxes)."
},
"DestArrayJson": {
"description": "JSON representation of an explicit destination array.\n\nDescribes a specific location within a PDF page.",
"oneOf": [
{
"description": "XYZ destination with optional left, top, zoom.\n\nNull values mean \"retain current view\" for that parameter.",
"properties": {
"fit": {
"const": "xyz",
"type": "string"
},
"left": {
"format": "double",
"type": [
"number",
"null"
]
},
"top": {
"format": "double",
"type": [
"number",
"null"
]
},
"zoom": {
"format": "double",
"type": [
"number",
"null"
]
}
},
"required": [
"fit"
],
"type": "object"
},
{
"description": "Fit page to window.",
"properties": {
"fit": {
"const": "fit",
"type": "string"
}
},
"required": [
"fit"
],
"type": "object"
},
{
"description": "Fit horizontally with optional top coordinate.",
"properties": {
"fit": {
"const": "fith",
"type": "string"
},
"top": {
"format": "double",
"type": [
"number",
"null"
]
}
},
"required": [
"fit"
],
"type": "object"
},
{
"description": "Fit vertically with optional left coordinate.",
"properties": {
"fit": {
"const": "fitv",
"type": "string"
},
"left": {
"format": "double",
"type": [
"number",
"null"
]
}
},
"required": [
"fit"
],
"type": "object"
},
{
"description": "Fit rectangle (left, bottom, right, top).",
"properties": {
"bottom": {
"format": "double",
"type": "number"
},
"fit": {
"const": "fitr",
"type": "string"
},
"left": {
"format": "double",
"type": "number"
},
"right": {
"format": "double",
"type": "number"
},
"top": {
"format": "double",
"type": "number"
}
},
"required": [
"fit",
"left",
"bottom",
"right",
"top"
],
"type": "object"
},
{
"description": "Fit bounding box to window.",
"properties": {
"fit": {
"const": "fitb",
"type": "string"
}
},
"required": [
"fit"
],
"type": "object"
},
{
"description": "Fit bounding box horizontally with optional top coordinate.",
"properties": {
"fit": {
"const": "fitbh",
"type": "string"
},
"top": {
"format": "double",
"type": [
"number",
"null"
]
}
},
"required": [
"fit"
],
"type": "object"
},
{
"description": "Fit bounding box vertically with optional left coordinate.",
"properties": {
"fit": {
"const": "fitbv",
"type": "string"
},
"left": {
"format": "double",
"type": [
"number",
"null"
]
}
},
"required": [
"fit"
],
"type": "object"
}
],
"properties": {
"page_index": {
"description": "Zero-based page index within the document.",
"format": "uint",
"minimum": 0,
"type": "integer"
}
},
"required": [
"page_index"
],
"type": "object"
},
"DestinationJson": {
"description": "JSON representation of a destination anchor.\n\nDescribes a specific location within a PDF page.",
"properties": {
"bottom": {
"description": "Bottom coordinate (user-space points), present only for \"fitr\".",
"format": "double",
"type": [
"number",
"null"
]
},
"left": {
"description": "Left coordinate (user-space points), present for \"xyz\", \"fitv\", \"fitr\", \"fitbv\".",
"format": "double",
"type": [
"number",
"null"
]
},
"right": {
"description": "Right coordinate (user-space points), present only for \"fitr\".",
"format": "double",
"type": [
"number",
"null"
]
},
"top": {
"description": "Top coordinate (user-space points), present for \"xyz\", \"fith\", \"fitr\", \"fitbh\".",
"format": "double",
"type": [
"number",
"null"
]
},
"type": {
"description": "Destination type: \"xyz\", \"fit\", \"fith\", \"fitv\", \"fitr\", \"fitb\", \"fitbh\", \"fitbv\".",
"type": "string"
},
"zoom": {
"description": "Zoom factor, present only for \"xyz\".",
"format": "double",
"type": [
"number",
"null"
]
}
},
"required": [
"type"
],
"type": "object"
},
"DiagnosticJson": {
"description": "JSON representation of a diagnostic error.\n\nThis struct wraps the internal Diagnostic type for JSON serialization,\nproviding stable error codes and human-readable messages for consumers.",
"properties": {
"code": {
"description": "Stable string identifier for this diagnostic (e.g., \"FONT_GLYPH_UNMAPPED\").",
"type": "string"
},
"hint": {
"description": "Optional hint for resolving the diagnostic (e.g., \"Install Tesseract for OCR recovery\").",
"type": [
"string",
"null"
]
},
"location": {
"anyOf": [
{
"$ref": "#/$defs/ObjectLocationJson"
},
{
"type": "null"
}
],
"description": "PDF object reference where the issue originated, if applicable."
},
"message": {
"description": "Human-readable description of the diagnostic.",
"type": "string"
},
"page_index": {
"description": "Page index where this diagnostic occurred, or `null` for document-level events.",
"format": "uint",
"minimum": 0,
"type": [
"integer",
"null"
]
},
"severity": {
"description": "Severity level: \"info\", \"warning\", \"error\", or \"fatal\".",
"enum": [
"info",
"warning",
"error",
"fatal"
],
"type": "string"
}
},
"required": [
"code",
"message",
"severity"
],
"type": "object"
},
"DocumentMetadata": {
"description": "JSON representation of document metadata.\n\nContains all standard PDF document information dictionary fields along\nwith derived signals from the document catalog.",
"properties": {
"author": {
"description": "PDF /Author - name of the person who created the document.",
"type": [
"string",
"null"
]
},
"conformance": {
"default": "none",
"description": "PDF/A or PDF/UA conformance level.\n\nOne of: \"none\", \"PDF-A-1a\", \"PDF-A-1b\", \"PDF-A-2a\", \"PDF-A-2b\", \"PDF-A-2u\",\n\"PDF-A-3a\", \"PDF-A-3b\", \"PDF-A-3u\", \"PDF-UA-1\", \"PDF-UA-2\", \"PDF-X-1a\".",
"type": "string"
},
"contains_javascript": {
"description": "True if JavaScript actions are present in the document.",
"type": "boolean"
},
"contains_xfa": {
"description": "True if XFA forms are present.",
"type": "boolean"
},
"creation_date": {
"description": "PDF /CreationDate - ISO-8601 string from /CreationDate.",
"type": [
"string",
"null"
]
},
"creator": {
"description": "PDF /Creator - the authoring application (e.g., \"Microsoft Word 2019\").",
"type": [
"string",
"null"
]
},
"generator": {
"description": "Heuristic string identifying the producing application.",
"type": [
"string",
"null"
]
},
"is_encrypted": {
"description": "True if document is encrypted.",
"type": "boolean"
},
"is_tagged": {
"description": "True if /MarkInfo /Marked: true is present.",
"type": "boolean"
},
"javascript_actions": {
"default": [],
"description": "JavaScript actions found in the document.\n\nPer TH-04, this array contains all discovered JavaScript actions\nwith their location and code excerpt. Empty when no JS is present.",
"items": {
"$ref": "#/$defs/JavascriptActionJson"
},
"type": "array"
},
"keywords": {
"description": "PDF /Keywords - space- or comma-delimited keyword list.",
"type": [
"string",
"null"
]
},
"modification_date": {
"description": "PDF /ModDate - ISO-8601 string from /ModDate.",
"type": [
"string",
"null"
]
},
"ocg_present": {
"description": "True if optional content groups (layers) are present.",
"type": "boolean"
},
"page_count": {
"description": "Total number of pages in the document.",
"format": "uint32",
"minimum": 0,
"type": "integer"
},
"pdf_version": {
"description": "PDF version (e.g., \"1.7\", \"2.0\").",
"type": [
"string",
"null"
]
},
"producer": {
"description": "PDF /Producer - the PDF-writing library (e.g., \"Acrobat Distiller 23.0\").",
"type": [
"string",
"null"
]
},
"subject": {
"description": "PDF /Subject - subject matter summary.",
"type": [
"string",
"null"
]
},
"title": {
"description": "PDF /Title - document title.",
"type": [
"string",
"null"
]
}
},
"required": [
"page_count",
"is_tagged",
"is_encrypted",
"contains_javascript",
"contains_xfa",
"ocg_present"
],
"type": "object"
},
"ExtractionQuality": {
"description": "Extraction quality metrics for the document.\n\nThis structure appears in the document footer (NDJSON mode) or\nin the root metadata (full JSON mode). It provides aggregate\nquality signals across all pages.",
"properties": {
"avg_confidence": {
"description": "Average confidence score across all spans [0.0, 1.0].",
"format": "float",
"type": [
"number",
"null"
]
},
"dpi_used": {
"description": "DPI used for OCR rendering (Phase 5.2).\n\nThis field records the DPI selected by the automatic DPI selection\nalgorithm (or the user-specified override). It is present when OCR\nwas performed on any page.\n\nValues: 200 (JBIG2), 300 (standard), 400 (fine print), or custom",
"format": "uint32",
"minimum": 0,
"type": [
"integer",
"null"
]
},
"min_confidence": {
"description": "Minimum confidence score across all spans [0.0, 1.0].\n\nThis represents the weakest link in the extraction chain.",
"format": "float",
"type": [
"number",
"null"
]
},
"ocr_fraction": {
"description": "Fraction of pages that required OCR fallback [0.0, 1.0].\n\nThis is the count of pages classified as \"scanned\" or \"mixed\"\ndivided by the total page count.",
"format": "float",
"type": [
"number",
"null"
]
},
"overall_quality": {
"description": "Overall quality assessment: \"high\", \"medium\", \"low\", or \"none\".\n\n- \"high\": All pages extracted successfully with high confidence\n- \"medium\": Most pages extracted, some with lower confidence\n- \"low\": Significant extraction issues (many low-confidence pages)\n- \"none\": No extractable content found (all blank pages)",
"type": "string"
},
"readability": {
"description": "Per-page readability score (char-weighted median of span scores) [0.0, 1.0].\n\nThis is the median of per-span readability scores, weighted by character count.\nA score below 0.5 may indicate mojibake, encoding issues, or broken text layers.",
"format": "float",
"type": [
"number",
"null"
]
}
},
"required": [
"overall_quality"
],
"type": "object"
},
"FormFieldJson": {
"description": "JSON representation of a form field.\n\nThis struct represents a single interactive form field from the PDF's\nAcroForm or XFA data, including its type, value, and metadata.\n\nPer the plan (Phase 7.4), form fields are extracted from both AcroForm\nand XFA sources, with XFA values taking precedence on collision.",
"properties": {
"default": {
"anyOf": [
{
"$ref": "#/$defs/FormFieldValueJson"
},
{
"type": "null"
}
],
"description": "The default value (/DV entry) if present.\n\nMatches the structure of `value` but represents the field's default state."
},
"max_length": {
"description": "Maximum length for text fields (/MaxLen entry).\nOnly present for text fields that have a max length set.",
"format": "uint32",
"minimum": 0,
"type": [
"integer",
"null"
]
},
"multi_select": {
"description": "Whether this choice field supports multiple selections (bit 21 of /Ff).\nOnly present for choice fields.",
"type": [
"boolean",
"null"
]
},
"multiline": {
"description": "Whether this text field supports multiple lines (bit 13 of /Ff).\nOnly present for text fields.",
"type": [
"boolean",
"null"
]
},
"name": {
"description": "The absolute (dot-joined) field name from the AcroForm.\nExample: \"employer_signature\" or \"form.employee_sig\"",
"type": "string"
},
"options": {
"description": "Available options for choice fields.\n\nEach option is a [export_value, display_name] pair.\nOnly present for choice fields.",
"items": {
"items": {
"type": "string"
},
"maxItems": 2,
"minItems": 2,
"type": "array"
},
"type": [
"array",
"null"
]
},
"page_index": {
"description": "Zero-based page index where this field's widget appears.\n\nNone if the field has no visual representation (form-only field).",
"format": "uint",
"minimum": 0,
"type": [
"integer",
"null"
]
},
"pushbutton": {
"description": "Whether this button is a pushbutton (bit 26 of /Ff).\nOnly present for button fields.",
"type": [
"boolean",
"null"
]
},
"radio": {
"description": "Whether this button is a radio button (bit 25 of /Ff).\nOnly present for button fields.",
"type": [
"boolean",
"null"
]
},
"read_only": {
"description": "Whether this field is read-only (bit 1 of /Ff flags).",
"type": "boolean"
},
"rect": {
"description": "Bounding box in PDF user-space points.\n\nFormat: [x0, y0, x1, y1] where (x0, y0) is the bottom-left corner.\nNone if the field has no visual appearance.",
"items": {
"format": "float",
"type": "number"
},
"maxItems": 4,
"minItems": 4,
"type": [
"array",
"null"
]
},
"required": {
"description": "Whether this field is required (bit 2 of /Ff flags).",
"type": "boolean"
},
"selected": {
"description": "Selected state for button fields.\nTrue = checked/selected, False = unchecked.\nOnly present for button fields.",
"type": [
"boolean",
"null"
]
},
"state_name": {
"description": "Appearance state name for button fields.\nE.g., \"Yes\", \"Off\", or custom state names.\nOnly present for button fields.",
"type": [
"string",
"null"
]
},
"type": {
"$ref": "#/$defs/FormFieldTypeJson",
"description": "The field type variant (text, button, choice, or signature)."
},
"value": {
"$ref": "#/$defs/FormFieldValueJson",
"description": "The current value of the form field.\n\nThis field's structure varies by field_type:\n- text: string value\n- button: boolean selected state\n- choice: string or array of strings (for multi-select)\n- signature: signature reference number (or null if unsigned)"
}
},
"required": [
"name",
"type",
"value",
"required",
"read_only"
],
"type": "object"
},
"FormFieldTypeJson": {
"description": "Form field type discriminator.\n\nThis enum uses serde's \"tag\" representation to produce a JSON string\nindicating the field type.",
"oneOf": [
{
"const": "text",
"description": "Text field (/FT /Tx) - single-line or multi-line text input.",
"type": "string"
},
{
"const": "button",
"description": "Button field (/FT /Btn) - pushbutton, checkbox, or radio button.",
"type": "string"
},
{
"const": "choice",
"description": "Choice field (/FT /Ch) - dropdown or list box.",
"type": "string"
},
{
"const": "signature",
"description": "Signature field (/FT /Sig) - digital signature field.",
"type": "string"
}
]
},
"FormFieldValueJson": {
"anyOf": [
{
"description": "Text field value (string or null).",
"type": [
"string",
"null"
]
},
{
"description": "Button field value (boolean selected state).",
"type": "boolean"
},
{
"$ref": "#/$defs/ChoiceValueJson",
"description": "Choice field value (single string or array of strings for multi-select)."
},
{
"description": "Signature field value (signature reference number or null).",
"format": "uint32",
"minimum": 0,
"type": [
"integer",
"null"
]
}
],
"description": "Form field value representation.\n\nThis enum captures the current value of a form field, with the variant\ntype matching the field_type."
},
"JavascriptActionJson": {
"description": "JSON representation of a JavaScript action found in a PDF.\n\nRepresents a single JavaScript action discovered during extraction.\nPer TH-04, pdftract NEVER executes embedded JavaScript; this struct\nsurfaces the JS for downstream security review.",
"properties": {
"code_excerpt": {
"description": "Truncated excerpt of the JavaScript code (first 200 characters).\n\nThe excerpt is JSON-escaped and HTML-escaped if rendered in a web context.\nThis field contains the raw JS text for review, NOT executable code.",
"type": "string"
},
"location": {
"description": "Location of the JavaScript action in the PDF structure.\n\nExamples: \"catalog.openaction\", \"page.0.aa.O\", \"page.1.annot.0.A\".\nThe format is: <scope>.<index>.<path> where scope is \"catalog\" or \"page\",\nindex is the page number (for pages), and path is the dot-joined entry path.",
"type": "string"
}
},
"required": [
"location",
"code_excerpt"
],
"type": "object"
},
"LinkJson": {
"description": "JSON representation of a hyperlink annotation.\n\nRepresents either a URI hyperlink (external link) or an internal destination\nlink (named or explicit destination within the same document).\n\nPer the plan (Phase 7.6.4), links are emitted at the document level in the\n`/links` array, sorted by (page_index, rect.y0 desc, rect.x0) for deterministic output.",
"properties": {
"dest": {
"description": "The internal destination name (from /Dest as a name string).\n\nPresent for named destination links. Null for URI links or explicit destinations.",
"type": [
"string",
"null"
]
},
"dest_array": {
"anyOf": [
{
"$ref": "#/$defs/DestArrayJson"
},
{
"type": "null"
}
],
"description": "Explicit destination array (from /Dest as an array or resolved name tree).\n\nPresent when the link target can be resolved to explicit coordinates.\nNull for URI links or unresolved named destinations."
},
"page_index": {
"description": "Zero-based page index containing this link.",
"format": "uint",
"minimum": 0,
"type": "integer"
},
"rect": {
"description": "Bounding box in PDF user-space points.\n\nFormat: [x0, y0, x1, y1] where (x0, y0) is the bottom-left corner.",
"items": {
"format": "float",
"type": "number"
},
"maxItems": 4,
"minItems": 4,
"type": "array"
},
"uri": {
"description": "The URI target for external links (from /A /S /URI /URI).\n\nPresent for URI links and JavaScript actions (prefixed with \"javascript:\").\nNull for internal destination links.",
"type": [
"string",
"null"
]
}
},
"required": [
"page_index",
"rect"
],
"type": "object"
},
"ObjectLocationJson": {
"description": "JSON representation of a PDF object reference.\n\nIdentifies a specific PDF indirect object by its object and generation numbers.",
"properties": {
"generation_number": {
"description": "Generation number (incremented on each save).",
"format": "uint16",
"maximum": 65535,
"minimum": 0,
"type": "integer"
},
"object_number": {
"description": "Object number (zero-based index in the xref table).",
"format": "uint32",
"minimum": 0,
"type": "integer"
}
},
"required": [
"object_number",
"generation_number"
],
"type": "object"
},
"OutlineNode": {
"description": "JSON representation of an outline node (bookmark).\n\nRepresents a single node in the document's outline hierarchy, with support\nfor nested children via the `children` field.",
"properties": {
"children": {
"default": [],
"description": "Nested child outlines (empty array for leaf nodes).",
"items": {
"$ref": "#/$defs/OutlineNode"
},
"type": "array"
},
"destination": {
"anyOf": [
{
"$ref": "#/$defs/DestinationJson"
},
{
"type": "null"
}
],
"description": "Destination type and coordinates within the page."
},
"level": {
"description": "Hierarchical level in the outline tree (0-based, root is 0).",
"format": "uint8",
"maximum": 255,
"minimum": 0,
"type": "integer"
},
"page_index": {
"description": "Zero-based page index this outline points to, if resolved.",
"format": "uint32",
"minimum": 0,
"type": [
"integer",
"null"
]
},
"title": {
"description": "The outline title text (decoded to UTF-8).",
"type": "string"
}
},
"required": [
"title",
"level"
],
"type": "object"
},
"PageJson": {
"description": "JSON representation of a single page.\n\nContains all page-level fields including geometry, classification,\nand content arrays (spans, blocks, tables, annotations).",
"properties": {
"annotations": {
"default": [],
"description": "Page-level annotations (highlights, stamps, notes, links).\n\nEmpty until Phase 7.2; always present as an array.",
"items": {
"$ref": "#/$defs/AnnotationJson"
},
"type": "array"
},
"blocks": {
"default": [],
"description": "Semantic blocks (paragraphs, headings, lists, tables, etc.).",
"items": {
"$ref": "#/$defs/BlockJson"
},
"type": "array"
},
"height": {
"description": "Page height in points (1/72 inch).",
"format": "float",
"type": "number"
},
"page_index": {
"description": "Zero-based page index, canonical for programmatic use.\n\nThis is the stable identifier used in all internal references.",
"format": "uint",
"minimum": 0,
"type": "integer"
},
"page_label": {
"description": "Human-readable label from PDF /PageLabels number tree.\n\nExamples: \"iv\", \"A-3\", \"1\". Null if the PDF defines no page labels.",
"type": [
"string",
"null"
]
},
"page_number": {
"description": "One-based page number (= page_index + 1).\n\nEmitted as a convenience for human-facing display. For programmatic\naccess, use page_index instead.",
"format": "uint32",
"minimum": 0,
"type": "integer"
},
"rotation": {
"description": "Page rotation in degrees clockwise (0, 90, 180, or 270).",
"format": "uint16",
"maximum": 65535,
"minimum": 0,
"type": "integer"
},
"spans": {
"default": [],
"description": "Text spans (atomic units with consistent font and styling).",
"items": {
"$ref": "#/$defs/SpanJson"
},
"type": "array"
},
"tables": {
"default": [],
"description": "Parallel table structure objects.",
"items": {
"$ref": "#/$defs/TableJson"
},
"type": "array"
},
"type": {
"description": "Page classification from the page classifier.\n\nOne of: \"text\", \"scanned\", \"mixed\", \"broken_vector\", \"blank\", \"figure_only\".",
"enum": [
"text",
"scanned",
"mixed",
"broken_vector",
"blank",
"figure_only"
],
"type": "string"
},
"width": {
"description": "Page width in points (1/72 inch).",
"format": "float",
"type": "number"
}
},
"required": [
"page_index",
"page_number",
"width",
"height",
"rotation",
"type"
],
"type": "object"
},
"Receipt": {
"description": "A visual citation receipt for extracted text.\n\nReceipts provide cryptographic proof that a piece of extracted text\noriginated from a specific region in a specific PDF. They can be\nverified independently by re-running pdftract on the original file.\n\n# Lite mode\n\nIn lite mode, `svg_clip` is `None` and the JSON output does not\ninclude the key at all (via `skip_serializing_if`). This keeps\nreceipts small (~120-180 bytes) for high-volume use cases like\nRAG citation pipelines.\n\n# SVG mode\n\nIn SVG mode, `svg_clip` contains a self-contained SVG element\nthat renders only the glyphs whose bboxes fall within the receipt\nbbox. The SVG is normalized to the bbox coordinate system and\ncan be rendered standalone in any browser.\n\n# Example\n\n```json\n{\n \"pdf_fingerprint\": \"pdftract-v1:a7f3...\",\n \"page_index\": 14,\n \"bbox\": [220.0, 412.0, 412.0, 432.0],\n \"content_hash\": \"sha256:9b21...\",\n \"extraction_version\": \"1.0.0\"\n}\n```",
"properties": {
"bbox": {
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where:\n- x0, y0: bottom-left corner\n- x1, y1: top-right corner\n- Units: PDF points (1/72 inch)\n\nThis is a copy of the parent span's bbox, included so the\nreceipt is self-contained.",
"items": {
"format": "double",
"type": "number"
},
"maxItems": 4,
"minItems": 4,
"type": "array"
},
"content_hash": {
"description": "SHA-256 hash of the NFC-normalized text content.\n\nFormat: `\"sha256:\" + hex(SHA-256)`.\n\nThe text is normalized to NFC form before hashing to ensure\nstability across platforms that may use different Unicode\nnormalization forms (e.g., macOS HFS+/APFS sometimes round-trips\nthrough NFD).",
"type": "string"
},
"extraction_version": {
"description": "The pdftract version that produced this receipt.\n\nFormat: semver string (e.g., \"1.0.0\", \"1.0.0-rc.1\").\nTaken from `CARGO_PKG_VERSION` at compile time.",
"type": "string"
},
"page_index": {
"description": "0-based page index in the source PDF.\n\nMatches the page_index in the extraction schema.",
"format": "uint",
"minimum": 0,
"type": "integer"
},
"pdf_fingerprint": {
"description": "Phase 1.7 fingerprint of the source PDF.\n\nFormat: `\"pdftract-v1:\" + hex(SHA-256)`.\nThe verifier compares this string literally (not parsed).",
"type": "string"
},
"svg_clip": {
"description": "Optional SVG clip rendering the glyphs in this receipt.\n\n- `None` in lite mode (the key is omitted from JSON entirely)\n- `Some(svg)` in SVG mode, where `svg` is a self-contained SVG element\n\nThe SVG coordinate system is normalized to the bbox itself,\nso it renders correctly in isolation.",
"type": [
"string",
"null"
]
}
},
"required": [
"pdf_fingerprint",
"page_index",
"bbox",
"content_hash",
"extraction_version"
],
"type": "object"
},
"RowJson": {
"description": "JSON representation of a table row.\n\nA row contains a sequence of cells that form a horizontal strip\nin the table.",
"properties": {
"bbox": {
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
"items": {
"format": "double",
"type": "number"
},
"maxItems": 4,
"minItems": 4,
"type": "array"
},
"cells": {
"description": "Cells in this row, ordered left-to-right.",
"items": {
"$ref": "#/$defs/CellJson"
},
"type": "array"
},
"is_header": {
"description": "Whether this row is a header row.\n\nHeader rows are typically repeated when tables span multiple pages.",
"type": "boolean"
}
},
"required": [
"bbox",
"cells",
"is_header"
],
"type": "object"
},
"SignatureJson": {
"description": "JSON representation of a digital signature.\n\nThis struct represents a signature extracted from a PDF signature field,\nincluding signer identity, timestamp, and coverage information.\n\nPer the plan (Phase 7.3), pdftract does NOT perform cryptographic validation\nin v1. The `validation_status` field is always \"not_checked\" — future versions\nmay add \"valid\", \"invalid\", or \"indeterminate\" as cryptographic validation\nis implemented.",
"properties": {
"byte_range": {
"description": "The /ByteRange array defining which bytes of the file are signed.\n\nFormat: array of 4 integers [offset, length, offset, length] defining two byte ranges.\nNone if /ByteRange is missing or malformed.",
"items": {
"format": "uint64",
"minimum": 0,
"type": "integer"
},
"type": [
"array",
"null"
]
},
"coverage_fraction": {
"description": "Fraction of the file covered by the signature (0.0 to 1.0).\n\nComputed as `(byte_range[1] + byte_range[3]) / file_size`.\nNone if /ByteRange is missing, malformed, or file_size is unknown.\n\nValues < 1.0 indicate partial signatures (a common red flag for tampered docs).",
"format": "double",
"type": [
"number",
"null"
]
},
"field_name": {
"description": "The absolute (dot-joined) field name from the AcroForm.\nExample: \"employer_signature\" or \"form.employee_sig\"",
"type": "string"
},
"location": {
"description": "The location of signing from the /Location entry.\n\nNone if /Location is absent.",
"type": [
"string",
"null"
]
},
"reason": {
"description": "The reason for signing from the /Reason entry.\n\nNone if /Reason is absent.",
"type": [
"string",
"null"
]
},
"signer_name": {
"description": "The signer's name from the /Name entry in the signature dictionary.\n\nEmpty string if /Name is absent.",
"type": "string"
},
"signing_date": {
"description": "The signing date as an ISO 8601 string (RFC 3339 format).\n\nParsed from the PDF /M date string. None if the date is missing,\nmalformed, or the field is unsigned.\n\nFormat: \"YYYY-MM-DDTHH:MM:SS+HH:MM\" or \"YYYY-MM-DDTHH:MM:SSZ\"",
"type": [
"string",
"null"
]
},
"sub_filter": {
"description": "The signature format / filter from the /SubFilter entry.\n\nIndicates the signature format: \"adbe.pkcs7.detached\", \"adbe.x509.rsa.sha1\", etc.\nNone if /SubFilter is absent.",
"type": [
"string",
"null"
]
},
"validation_status": {
"description": "Validation status — always \"not_checked\" in v1.\n\nFuture versions may add \"valid\", \"invalid\", \"indeterminate\" as cryptographic\nvalidation is implemented. This is a string enum for schema stability.",
"type": "string"
}
},
"required": [
"field_name",
"signer_name",
"validation_status"
],
"type": "object"
},
"SpanJson": {
"description": "JSON representation of a text span.\n\nA span is the smallest unit of extracted text, representing a\ncontiguous run of text with consistent font and styling.\n\nPer INV-7 (confidence_source on every Span), all spans include\nthe confidence_source field to indicate how the text was extracted.",
"properties": {
"bbox": {
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
"items": {
"format": "double",
"type": "number"
},
"maxItems": 4,
"minItems": 4,
"type": "array"
},
"color": {
"description": "Fill color as CSS hex string (e.g., \"#1a1a1a\"), or null if not expressible as RGB.\n\nNull for spot colors, patterns, or complex color spaces that cannot be\naccurately represented as RGB hex.",
"type": [
"string",
"null"
]
},
"column": {
"description": "Column index (0-based) assigned by Phase 4.3 column detection.\n\nThis field is `None` for spans outside any detected column\n(e.g., full-width headings, inter-column gaps).",
"format": "uint32",
"minimum": 0,
"type": [
"integer",
"null"
]
},
"confidence": {
"description": "Optional confidence score (0.0 to 1.0).\n\nThis field is present when OCR is used or when the extraction\nhas uncertainty about the text. When confidence is not applicable,\nthis field is `null`.",
"format": "double",
"type": [
"number",
"null"
]
},
"confidence_source": {
"description": "Source of the confidence/text extraction.\n\nOne of: \"vector\" (native font decoding), \"ocr\" (pure OCR),\n\"ocr-assisted\" (OCR + vector correction), \"ocr-fallback\" (region-level fallback),\n\"repaired\" (text was repaired via heuristics).",
"enum": [
"native",
"heuristic",
"ocr"
],
"type": [
"string",
"null"
]
},
"flags": {
"default": [],
"description": "Set of style flags applied to this span.\n\nPossible values: \"bold\", \"italic\", \"smallcaps\", \"subscript\", \"superscript\".",
"items": {
"type": "string"
},
"type": "array"
},
"font": {
"description": "Font name or identifier.",
"type": "string"
},
"lang": {
"description": "BCP-47 language tag if detected, otherwise null.\n\nExamples: \"en\", \"en-US\", \"zh-Hans\". Null when language detection\nis not available or not applicable.",
"type": [
"string",
"null"
]
},
"receipt": {
"anyOf": [
{
"$ref": "#/$defs/Receipt"
},
{
"type": "null"
}
],
"description": "Optional cryptographic receipt for verification.\n\nThis field is present when `--receipts=lite` or `--receipts=svg`\nis enabled. When receipts are disabled, the field is `null`."
},
"rendering_mode": {
"description": "PDF Tr operator value (0-7) indicating the text rendering mode.\n\n0 = fill, 1 = stroke, 2 = fill then stroke, 3 = invisible,\n4 = fill to clip, 5 = stroke to clip, 6 = fill then stroke to clip,\n7 = clip.",
"format": "uint8",
"maximum": 255,
"minimum": 0,
"type": [
"integer",
"null"
]
},
"size": {
"description": "Font size in points.",
"format": "double",
"type": "number"
},
"text": {
"description": "The extracted text content.",
"type": "string"
}
},
"required": [
"text",
"bbox",
"font",
"size"
],
"type": "object"
},
"TableJson": {
"description": "JSON representation of a table.\n\nTables are emitted in parallel with table blocks - the block\nprovides the concatenated text and position, while the TableJson\nprovides full cell-level structure.",
"properties": {
"bbox": {
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
"items": {
"format": "double",
"type": "number"
},
"maxItems": 4,
"minItems": 4,
"type": "array"
},
"continued": {
"description": "Whether this table continues on the next page.\n\nSet to `true` when a table is split across pages and this\npage contains the first part.",
"type": "boolean"
},
"continued_from_prev": {
"description": "Whether this table is a continuation from the previous page.\n\nSet to `true` when a table is split across pages and this\npage contains a subsequent part.",
"type": "boolean"
},
"detection_method": {
"description": "Detection method used to identify this table.\n\n- \"line_based\": Table detected via ruling lines (borders)\n- \"borderless\": Table detected via x0 alignment heuristics",
"type": "string"
},
"header_rows": {
"description": "Number of contiguous header rows at the top of the table.\n\nHeader rows are typically repeated when tables span multiple pages.",
"format": "uint32",
"minimum": 0,
"type": "integer"
},
"id": {
"description": "Unique identifier for this table (e.g., \"table_0\").",
"type": "string"
},
"page_index": {
"description": "Zero-based page index where this table appears.",
"format": "uint",
"minimum": 0,
"type": "integer"
},
"rows": {
"description": "Rows in this table, ordered top-to-bottom.",
"items": {
"$ref": "#/$defs/RowJson"
},
"type": "array"
}
},
"required": [
"id",
"bbox",
"rows",
"header_rows",
"detection_method",
"continued",
"continued_from_prev",
"page_index"
],
"type": "object"
},
"ThreadJson": {
"description": "JSON representation of an article thread.\n\nRepresents a single article thread from the PDF's /Threads array,\nincluding metadata from the thread info dict (/I) and the complete\nbead chain walked from the first bead.\n\nPer the plan (Phase 7.7), threads are extracted and emitted at the\ndocument level in the `/threads` array. The bead chain is walked by\nfollowing `/N` (next bead) links from the first bead until termination.",
"properties": {
"author": {
"description": "Thread author from /I/Author.\n\n- `Some(\"\")` if /I/Author is present but empty string\n- `None` if /I is missing or /Author is absent",
"type": [
"string",
"null"
]
},
"beads": {
"default": [],
"description": "Beads in this thread chain, in traversal order.\n\nEach bead represents a region on a page that is part of this article.\nThe beads are ordered by following `/N` (next bead) links from the\nfirst bead through the chain until termination.",
"items": {
"$ref": "#/$defs/BeadJson"
},
"type": "array"
},
"keywords": {
"description": "Thread keywords from /I/Keywords.\n\nPer PDF spec, this is a comma-separated convention (not an array).\n- `Some(\"\")` if /I/Keywords is present but empty string\n- `None` if /I is missing or /Keywords is absent",
"type": [
"string",
"null"
]
},
"subject": {
"description": "Thread subject from /I/Subject.\n\n- `Some(\"\")` if /I/Subject is present but empty string\n- `None` if /I is missing or /Subject is absent",
"type": [
"string",
"null"
]
},
"title": {
"description": "Thread title from /I/Title.\n\n- `Some(\"\")` if /I/Title is present but empty string\n- `None` if /I is missing or /Title is absent",
"type": [
"string",
"null"
]
}
},
"type": "object"
}
},
"$id": "https://pdftract.com/schema/v1.0/pdftract.schema.json",
"$schema": "https://json-schema.org/draft/2020-12/schema",
"description": "JSON Schema for pdftract PDF extraction output v1.0. This schema defines the structure of extraction results including pages, spans, blocks, tables, form fields, signatures, and metadata.",
"properties": {
"attachments": {
"default": [],
"description": "Embedded file attachments.\n\nEmpty until Phase 7.5; always present as an array.",
"items": {
"$ref": "#/$defs/AttachmentJson"
},
"type": "array"
},
"errors": {
"default": [],
"description": "All diagnostics emitted during extraction.",
"items": {
"$ref": "#/$defs/DiagnosticJson"
},
"type": "array"
},
"extraction_quality": {
"$ref": "#/$defs/ExtractionQuality",
"description": "Aggregate extraction quality metrics."
},
"form_fields": {
"default": [],
"description": "AcroForm/XFA form fields.\n\nEmpty until Phase 7.4; always present as an array.",
"items": {
"$ref": "#/$defs/FormFieldJson"
},
"type": "array"
},
"links": {
"default": [],
"description": "Document-scoped hyperlinks.\n\nEmpty until Phase 7.6; always present as an array.",
"items": {
"$ref": "#/$defs/LinkJson"
},
"type": "array"
},
"metadata": {
"$ref": "#/$defs/DocumentMetadata",
"description": "Document-level metadata."
},
"outline": {
"default": [],
"description": "Document outline (bookmark tree).\n\nEmpty array if no bookmarks are present.",
"items": {
"$ref": "#/$defs/OutlineNode"
},
"type": "array"
},
"pages": {
"description": "Page objects array.",
"items": {
"$ref": "#/$defs/PageJson"
},
"type": "array"
},
"schema_version": {
"description": "Schema version identifier (e.g., \"1.0\").",
"type": "string"
},
"signatures": {
"default": [],
"description": "Digital signature metadata.\n\nEmpty until Phase 7.3; always present as an array.",
"items": {
"$ref": "#/$defs/SignatureJson"
},
"type": "array"
},
"threads": {
"default": [],
"description": "Article thread chains.\n\nEmpty until Phase 7.1; always present as an array.",
"items": {
"$ref": "#/$defs/ThreadJson"
},
"type": "array"
}
},
"required": [
"schema_version",
"metadata",
"pages",
"extraction_quality"
],
"title": "pdftract Output v1.0",
"type": "object"
}