Phase 7.7.3: Add threads field to ExtractionResult with ThreadJson schema integration.

Changes:
- Added ThreadJson and BeadJson structs to schema/mod.rs
- Added thread_to_json() function to threads/mod.rs
- Added build_page_ref_to_index() helper to parser/pages.rs
- Added threads field to ExtractionResult in extract.rs
- Implemented Phase 7.7 extraction logic with discover_threads/walk_beads
- Added threads_to_markdown() and collapse_page_ranges() to markdown.rs
- Updated JSON schema with ThreadJson and BeadJson definitions
- Added thread_to_py() and bead_to_py() conversions in pdftract-py
- Exported ThreadJson, BeadJson from lib.rs

All 32 threads module tests pass. All 35 markdown tests pass.

Verification: notes/pdftract-3h9xo.md

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1414 lines
No EOL
54 KiB
JSON
1414 lines
No EOL
54 KiB
JSON
{
|
|
"$defs": {
|
|
"AnnotationJson": {
|
|
"description": "JSON representation of a PDF annotation.\n\nThis struct represents a non-link annotation from a PDF page, such as\nhighlights, text notes, stamps, free text, ink drawings, lines, polygons,\nand file attachments.\n\nPer the plan (Phase 7.6.3), annotations are extracted after links and\nform fields, with sorting for deterministic output.",
|
|
"properties": {
|
|
"author": {
|
|
"description": "The annotation's author from the /T entry.\n\nNone if /T is missing or not a string.",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"color": {
|
|
"description": "The color array from /C as grayscale, RGB, or CMYK components.\n\nNone if /C is missing. Length is 1 (grayscale), 3 (RGB), or 4 (CMYK).",
|
|
"items": {
|
|
"format": "float",
|
|
"type": "number"
|
|
},
|
|
"type": [
|
|
"array",
|
|
"null"
|
|
]
|
|
},
|
|
"contents": {
|
|
"description": "The annotation's content text from /Contents.\n\nNone if /Contents is missing or not a string.",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"modified": {
|
|
"description": "The modification date from /M as an ISO 8601 string.\n\nNone if /M is missing, malformed, or fails to parse.\nFormat: \"YYYY-MM-DDTHH:MM:SS+HH:MM\" or \"YYYY-MM-DDTHH:MM:SSZ\"",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"name_id": {
|
|
"description": "The name identifier from /NM.\n\nNone if /NM is missing.",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"opacity": {
|
|
"description": "The opacity from /CA.\n\nNone if /CA is missing.",
|
|
"format": "float",
|
|
"type": [
|
|
"number",
|
|
"null"
|
|
]
|
|
},
|
|
"rect": {
|
|
"description": "The bounding rectangle [x0, y0, x1, y1] in PDF user-space units.\n\nNone if the /Rect entry is missing or invalid.",
|
|
"items": {
|
|
"format": "double",
|
|
"type": "number"
|
|
},
|
|
"maxItems": 4,
|
|
"minItems": 4,
|
|
"type": [
|
|
"array",
|
|
"null"
|
|
]
|
|
},
|
|
"specific": {
|
|
"anyOf": [
|
|
{
|
|
"$ref": "#/$defs/AnnotationSpecificJson"
|
|
},
|
|
{
|
|
"type": "null"
|
|
}
|
|
],
|
|
"description": "Subtype-specific fields.\n\nPresent only for annotation types that have additional data beyond\nthe common fields. For unsupported subtypes, this is null."
|
|
},
|
|
"subject": {
|
|
"description": "The subject from /Subj.\n\nNone if /Subj is missing.",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"type": {
|
|
"description": "The annotation subtype (e.g., \"Highlight\", \"Text\", \"Stamp\", \"FreeText\", \"Ink\", \"Line\", \"Polygon\", \"FileAttachment\").",
|
|
"type": "string"
|
|
}
|
|
},
|
|
"required": [
|
|
"type"
|
|
],
|
|
"type": "object"
|
|
},
|
|
"AnnotationSpecificJson": {
|
|
"description": "Subtype-specific annotation fields.\n\nThis enum captures the additional data present in specific annotation subtypes.",
|
|
"oneOf": [
|
|
{
|
|
"description": "Text markup annotations (Highlight, Underline, StrikeOut, Squiggly).\n\nContains the quadpoint arrays defining the marked regions.",
|
|
"properties": {
|
|
"quads": {
|
|
"description": "Array of quadpoint arrays [x0, y0, x1, y1, x2, y2, x3, y3] defining the marked regions.\n\nEach quad defines a quadrilateral region in PDF user-space coordinates.",
|
|
"items": {
|
|
"items": {
|
|
"format": "double",
|
|
"type": "number"
|
|
},
|
|
"maxItems": 8,
|
|
"minItems": 8,
|
|
"type": "array"
|
|
},
|
|
"type": "array"
|
|
}
|
|
},
|
|
"required": [
|
|
"quads"
|
|
],
|
|
"type": "object"
|
|
},
|
|
{
|
|
"description": "Stamp annotations.\n\nContains the stamp name from /Name.",
|
|
"properties": {
|
|
"name": {
|
|
"description": "The stamp name (e.g., \"Approved\", \"Draft\", \"Confidential\").",
|
|
"type": "string"
|
|
}
|
|
},
|
|
"required": [
|
|
"name"
|
|
],
|
|
"type": "object"
|
|
},
|
|
{
|
|
"description": "Free text annotations.\n\nContains the default appearance string from /DA.",
|
|
"properties": {
|
|
"da": {
|
|
"description": "The default appearance string.",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
}
|
|
},
|
|
"required": [
|
|
"da"
|
|
],
|
|
"type": "object"
|
|
},
|
|
{
|
|
"description": "Text annotations (sticky notes).\n\nContains the open state and state information.",
|
|
"properties": {
|
|
"open": {
|
|
"description": "Whether the note is initially open.",
|
|
"type": [
|
|
"boolean",
|
|
"null"
|
|
]
|
|
},
|
|
"state": {
|
|
"description": "The annotation state from /State (e.g., \"Reviewed\", \"Accepted\").",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"state_model": {
|
|
"description": "The state model from /StateModel (e.g., \"Marked\", \"Review\").",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
}
|
|
},
|
|
"type": "object"
|
|
},
|
|
{
|
|
"description": "Ink annotations (hand-drawn sketches).\n\nContains the stroke paths.",
|
|
"properties": {
|
|
"strokes": {
|
|
"description": "Array of stroke paths, where each stroke is an array of points.\n\nEach point is [x, y] in PDF user-space coordinates.",
|
|
"items": {
|
|
"items": {
|
|
"format": "double",
|
|
"type": "number"
|
|
},
|
|
"maxItems": 2,
|
|
"minItems": 2,
|
|
"type": "array"
|
|
},
|
|
"type": "array"
|
|
}
|
|
},
|
|
"required": [
|
|
"strokes"
|
|
],
|
|
"type": "object"
|
|
},
|
|
{
|
|
"description": "Line annotations.\n\nContains the line endpoints.",
|
|
"properties": {
|
|
"endpoints": {
|
|
"description": "The line endpoints as [[x0, y0], [x1, y1]].",
|
|
"items": {
|
|
"items": {
|
|
"format": "double",
|
|
"type": "number"
|
|
},
|
|
"maxItems": 2,
|
|
"minItems": 2,
|
|
"type": "array"
|
|
},
|
|
"maxItems": 2,
|
|
"minItems": 2,
|
|
"type": "array"
|
|
}
|
|
},
|
|
"required": [
|
|
"endpoints"
|
|
],
|
|
"type": "object"
|
|
},
|
|
{
|
|
"description": "Polygon annotations.\n\nContains the polygon vertices.",
|
|
"properties": {
|
|
"vertices": {
|
|
"description": "Array of [x, y] vertices defining the polygon.\n\nEach vertex is in PDF user-space coordinates.",
|
|
"items": {
|
|
"items": {
|
|
"format": "double",
|
|
"type": "number"
|
|
},
|
|
"maxItems": 2,
|
|
"minItems": 2,
|
|
"type": "array"
|
|
},
|
|
"type": "array"
|
|
}
|
|
},
|
|
"required": [
|
|
"vertices"
|
|
],
|
|
"type": "object"
|
|
},
|
|
{
|
|
"description": "File attachment annotations.\n\nContains the file specification reference.",
|
|
"properties": {
|
|
"fs_ref": {
|
|
"description": "The file specification reference number.\n\nComputed as (object_number << 16 | generation_number) as u32.",
|
|
"format": "uint32",
|
|
"minimum": 0,
|
|
"type": [
|
|
"integer",
|
|
"null"
|
|
]
|
|
}
|
|
},
|
|
"required": [
|
|
"fs_ref"
|
|
],
|
|
"type": "object"
|
|
}
|
|
]
|
|
},
|
|
"AttachmentJson": {
|
|
"description": "JSON representation of an embedded file attachment.\n\nRepresents a single embedded file extracted from the PDF's `/EmbeddedFiles`\nname tree or `/AF` (Associated Files) array.\n\nPer plan (Phase 7.5.3), attachments exceeding 50 MB are truncated\n(metadata only, data: null, truncated: true). The `data` field contains\nbase64-encoded content using RFC 4648 standard alphabet with padding\nand no line breaks. The JSON Schema declares `contentEncoding: base64`\nfor the `data` field, enabling JSON Schema validators and code generation\ntools to understand the encoding.",
|
|
"properties": {
|
|
"checksum_md5": {
|
|
"description": "MD5 checksum from /Params /CheckSum as hex string (None if absent).\n\nPer PDF spec, /CheckSum is a 16-byte binary string (MD5), hex-encoded\nas 32 lowercase hex characters.",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"created": {
|
|
"description": "Creation date from /Params /CreationDate as ISO 8601 string (None if absent).\n\nFormat: \"YYYY-MM-DDTHH:MM:SS+HH:MM\" or \"YYYY-MM-DDTHH:MM:SSZ\"",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"data": {
|
|
"description": "Base64-encoded attachment content (null if truncated or empty).\n\nPer JSON Schema, this field has `contentEncoding: base64`, indicating\nthe string is base64-encoded binary data. Downstream tools can use this\ninformation to automatically decode the content.\n\n- `Some(base64_string)` when content <= 50 MB\n- `None` when `truncated: true` (content too large)\n\nIn the Python API (PyO3), this field is returned as a `bytes` object\n(PyO3 automatically decodes the base64 string).",
|
|
"contentEncoding": "base64",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"description": {
|
|
"description": "Description from /Desc (None if absent, rather than an empty string).",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"mime_type": {
|
|
"description": "MIME type from stream /Subtype (None if absent, no guessing from extension).",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"modified": {
|
|
"description": "Modification date from /Params /ModDate as ISO 8601 string (None if absent).\n\nFormat: \"YYYY-MM-DDTHH:MM:SS+HH:MM\" or \"YYYY-MM-DDTHH:MM:SSZ\"",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"name": {
|
|
"description": "Attachment filename from /UF (Unicode, preferred) or /F (system-independent).",
|
|
"type": "string"
|
|
},
|
|
"size": {
|
|
"description": "Original decoded size in bytes (always populated, even when truncated).\n\nThis is the size of the attachment content before base64 encoding.\nWhen `truncated: true`, this represents the full original size that\nwas not included in the output.",
|
|
"format": "uint64",
|
|
"minimum": 0,
|
|
"type": "integer"
|
|
},
|
|
"truncated": {
|
|
"description": "Whether the attachment content was truncated due to the 50 MB size limit.\n\nWhen `true`, the `data` field is `None` and only metadata is included.\nThe `size` field still reflects the original full size.",
|
|
"type": "boolean"
|
|
}
|
|
},
|
|
"required": [
|
|
"name",
|
|
"size",
|
|
"truncated"
|
|
],
|
|
"type": "object"
|
|
},
|
|
"BlockJson": {
|
|
"description": "JSON representation of a structural block.\n\nA block is a higher-level semantic unit composed of one or more\nspans. Examples include paragraphs, headings, list items, and\ntable cells.",
|
|
"properties": {
|
|
"bbox": {
|
|
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
|
|
"items": {
|
|
"format": "double",
|
|
"type": "number"
|
|
},
|
|
"maxItems": 4,
|
|
"minItems": 4,
|
|
"type": "array"
|
|
},
|
|
"kind": {
|
|
"description": "The block kind/type.\n\nCommon values: \"paragraph\", \"heading\", \"list\", \"table\", \"figure\".",
|
|
"type": "string"
|
|
},
|
|
"level": {
|
|
"description": "Optional heading level (1-6) for \"heading\" kind blocks.\n\nThis field is present only for heading blocks. For paragraphs\nand other block types, it is `null`.",
|
|
"format": "uint8",
|
|
"maximum": 255,
|
|
"minimum": 0,
|
|
"type": [
|
|
"integer",
|
|
"null"
|
|
]
|
|
},
|
|
"receipt": {
|
|
"anyOf": [
|
|
{
|
|
"$ref": "#/$defs/Receipt"
|
|
},
|
|
{
|
|
"type": "null"
|
|
}
|
|
],
|
|
"description": "Optional cryptographic receipt for verification.\n\nThis field is present when `--receipts=lite` or `--receipts=svg`\nis enabled. When receipts are disabled, the field is `null`."
|
|
},
|
|
"table_index": {
|
|
"description": "Optional table index for \"table\" kind blocks.\n\nThis field is present only for table blocks and points to the\ncorresponding entry in the page's `tables` array.",
|
|
"format": "uint",
|
|
"minimum": 0,
|
|
"type": [
|
|
"integer",
|
|
"null"
|
|
]
|
|
},
|
|
"text": {
|
|
"description": "The concatenated text content of all spans in the block.",
|
|
"type": "string"
|
|
}
|
|
},
|
|
"required": [
|
|
"kind",
|
|
"text",
|
|
"bbox"
|
|
],
|
|
"type": "object"
|
|
},
|
|
"CellJson": {
|
|
"description": "JSON representation of a table cell.\n\nA cell represents a single unit within a table row, containing\nits text content, bounding box, and position information.",
|
|
"properties": {
|
|
"bbox": {
|
|
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
|
|
"items": {
|
|
"format": "double",
|
|
"type": "number"
|
|
},
|
|
"maxItems": 4,
|
|
"minItems": 4,
|
|
"type": "array"
|
|
},
|
|
"col": {
|
|
"description": "Zero-based column index within the table.",
|
|
"format": "uint",
|
|
"minimum": 0,
|
|
"type": "integer"
|
|
},
|
|
"colspan": {
|
|
"default": 1,
|
|
"description": "Number of columns this cell spans (default 1).\n\nValues greater than 1 indicate a merged cell that spans\nmultiple columns horizontally.",
|
|
"format": "uint32",
|
|
"minimum": 0,
|
|
"type": "integer"
|
|
},
|
|
"is_header_row": {
|
|
"description": "Whether this cell is in a header row.\n\nHeader cells are typically rendered differently (bold, centered)\nand may be reused when tables span multiple pages.",
|
|
"type": "boolean"
|
|
},
|
|
"row": {
|
|
"description": "Zero-based row index within the table.",
|
|
"format": "uint",
|
|
"minimum": 0,
|
|
"type": "integer"
|
|
},
|
|
"rowspan": {
|
|
"default": 1,
|
|
"description": "Number of rows this cell spans (default 1).\n\nValues greater than 1 indicate a merged cell that spans\nmultiple rows vertically.",
|
|
"format": "uint32",
|
|
"minimum": 0,
|
|
"type": "integer"
|
|
},
|
|
"spans": {
|
|
"description": "References to spans in the page's `spans` array.\n\nThese indices point to the spans that make up this cell's content.",
|
|
"items": {
|
|
"format": "uint",
|
|
"minimum": 0,
|
|
"type": "integer"
|
|
},
|
|
"type": "array"
|
|
},
|
|
"text": {
|
|
"description": "The concatenated text content of all spans in the cell.",
|
|
"type": "string"
|
|
}
|
|
},
|
|
"required": [
|
|
"bbox",
|
|
"text",
|
|
"spans",
|
|
"row",
|
|
"col",
|
|
"is_header_row"
|
|
],
|
|
"type": "object"
|
|
},
|
|
"ChoiceValueJson": {
|
|
"anyOf": [
|
|
{
|
|
"description": "Single selected option.",
|
|
"type": "string"
|
|
},
|
|
{
|
|
"description": "Multiple selected options.",
|
|
"items": {
|
|
"type": "string"
|
|
},
|
|
"type": "array"
|
|
}
|
|
],
|
|
"description": "Choice field value representation.\n\nChoice fields can have either a single selected value or multiple\nselected values (for multi-select list boxes)."
|
|
},
|
|
"DestArrayJson": {
|
|
"description": "Explicit destination array for internal links.\n\nThis struct represents an explicit destination in a PDF, which specifies\na target page and how that page should be displayed (fit type).",
|
|
"properties": {
|
|
"dest": {
|
|
"$ref": "#/$defs/DestTypeJson",
|
|
"description": "The fit type and associated coordinates for this destination."
|
|
},
|
|
"page_index": {
|
|
"description": "Zero-based page index for this destination.",
|
|
"format": "uint",
|
|
"minimum": 0,
|
|
"type": "integer"
|
|
}
|
|
},
|
|
"required": [
|
|
"page_index",
|
|
"dest"
|
|
],
|
|
"type": "object"
|
|
},
|
|
"DestTypeJson": {
|
|
"description": "Destination fit type enum.\n\nThis enum defines how a page should be displayed when navigating to\na destination. It corresponds to the PDF destination fit types.",
|
|
"oneOf": [
|
|
{
|
|
"description": "XYZ destination with optional left, top, and zoom.\n\nDisplay the page with the coordinates (left, top) positioned at the\nupper-left corner of the window and the page contents magnified by\nthe factor zoom. A null value for any of left, top, or zoom indicates\nthat the current value of that parameter should be retained unchanged.",
|
|
"properties": {
|
|
"left": {
|
|
"description": "The left coordinate in PDF user-space units.\n\nNull indicates the current left position should be retained.",
|
|
"format": "double",
|
|
"type": [
|
|
"number",
|
|
"null"
|
|
]
|
|
},
|
|
"top": {
|
|
"description": "The top coordinate in PDF user-space units.\n\nNull indicates the current top position should be retained.",
|
|
"format": "double",
|
|
"type": [
|
|
"number",
|
|
"null"
|
|
]
|
|
},
|
|
"zoom": {
|
|
"description": "The zoom factor.\n\nNull indicates the current zoom level should be retained.",
|
|
"format": "double",
|
|
"type": [
|
|
"number",
|
|
"null"
|
|
]
|
|
}
|
|
},
|
|
"required": [],
|
|
"type": "object"
|
|
},
|
|
{
|
|
"const": "Fit",
|
|
"description": "Fit destination — display the page with its contents magnified\njust enough to fit the entire page within the window both horizontally\nand vertically.",
|
|
"type": "string"
|
|
},
|
|
{
|
|
"description": "FitH destination with optional top coordinate.\n\nDisplay the page with the top coordinate positioned at the top edge\nof the window and the contents magnified just enough to fit the entire\nwidth of the page within the window.",
|
|
"properties": {
|
|
"top": {
|
|
"description": "The top coordinate in PDF user-space units.\n\nNull indicates the current top position should be retained.",
|
|
"format": "double",
|
|
"type": [
|
|
"number",
|
|
"null"
|
|
]
|
|
}
|
|
},
|
|
"required": [],
|
|
"type": "object"
|
|
},
|
|
{
|
|
"description": "FitV destination with optional left coordinate.\n\nDisplay the page with the left coordinate positioned at the left edge\nof the window and the contents magnified just enough to fit the entire\nheight of the page within the window.",
|
|
"properties": {
|
|
"left": {
|
|
"description": "The left coordinate in PDF user-space units.\n\nNull indicates the current left position should be retained.",
|
|
"format": "double",
|
|
"type": [
|
|
"number",
|
|
"null"
|
|
]
|
|
}
|
|
},
|
|
"required": [],
|
|
"type": "object"
|
|
},
|
|
{
|
|
"description": "FitR destination with bounding rectangle.\n\nDisplay the page with the specified rectangle magnified just enough\nto fit the entire rectangle within the window both horizontally and\nvertically.",
|
|
"properties": {
|
|
"bottom": {
|
|
"description": "The bottom coordinate in PDF user-space units.",
|
|
"format": "double",
|
|
"type": "number"
|
|
},
|
|
"left": {
|
|
"description": "The left coordinate in PDF user-space units.",
|
|
"format": "double",
|
|
"type": "number"
|
|
},
|
|
"right": {
|
|
"description": "The right coordinate in PDF user-space units.",
|
|
"format": "double",
|
|
"type": "number"
|
|
},
|
|
"top": {
|
|
"description": "The top coordinate in PDF user-space units.",
|
|
"format": "double",
|
|
"type": "number"
|
|
}
|
|
},
|
|
"required": [
|
|
"left",
|
|
"bottom",
|
|
"right",
|
|
"top"
|
|
],
|
|
"type": "object"
|
|
},
|
|
{
|
|
"const": "FitB",
|
|
"description": "FitB destination — display the page with its contents magnified\njust enough to fit its bounding box entirely within the window both\nhorizontally and vertically.",
|
|
"type": "string"
|
|
},
|
|
{
|
|
"description": "FitBH destination with optional top coordinate.\n\nDisplay the page with the top coordinate positioned at the top edge\nof the window and the contents magnified just enough to fit the entire\nwidth of its bounding box within the window.",
|
|
"properties": {
|
|
"top": {
|
|
"description": "The top coordinate in PDF user-space units.\n\nNull indicates the current top position should be retained.",
|
|
"format": "double",
|
|
"type": [
|
|
"number",
|
|
"null"
|
|
]
|
|
}
|
|
},
|
|
"required": [],
|
|
"type": "object"
|
|
},
|
|
{
|
|
"description": "FitBV destination with optional left coordinate.\n\nDisplay the page with the left coordinate positioned at the left edge\nof the window and the contents magnified just enough to fit the entire\nheight of its bounding box within the window.",
|
|
"properties": {
|
|
"left": {
|
|
"description": "The left coordinate in PDF user-space units.\n\nNull indicates the current left position should be retained.",
|
|
"format": "double",
|
|
"type": [
|
|
"number",
|
|
"null"
|
|
]
|
|
}
|
|
},
|
|
"required": [],
|
|
"type": "object"
|
|
}
|
|
]
|
|
},
|
|
"LinkJson": {
|
|
"description": "JSON representation of a PDF link annotation.\n\nThis struct represents a hyperlink from a PDF page, which can point to\na URI, a named destination, or an explicit destination array.\n\nPer the plan (Phase 7.6.2), links are extracted and sorted deterministically\nfor stable output.",
|
|
"properties": {
|
|
"dest": {
|
|
"description": "Named destination string (e.g., \"Chapter1\").\n\nNone if the link is not a named destination link.",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"dest_array": {
|
|
"anyOf": [
|
|
{
|
|
"$ref": "#/$defs/DestArrayJson"
|
|
},
|
|
{
|
|
"type": "null"
|
|
}
|
|
],
|
|
"description": "Explicit destination array with page index and fit type.\n\nNone if the link is not an explicit destination link."
|
|
},
|
|
"page_index": {
|
|
"description": "Zero-based page index containing this link.",
|
|
"format": "uint",
|
|
"minimum": 0,
|
|
"type": "integer"
|
|
},
|
|
"rect": {
|
|
"description": "The bounding rectangle [x0, y0, x1, y1] in PDF user-space units.\n\nFormat: [x0, y0, x1, y1] where (x0, y0) is the bottom-left corner and\n(x1, y1) is the top-right corner.",
|
|
"items": {
|
|
"format": "double",
|
|
"type": "number"
|
|
},
|
|
"maxItems": 4,
|
|
"minItems": 4,
|
|
"type": "array"
|
|
},
|
|
"uri": {
|
|
"description": "URI string for external links.\n\nNone if the link is not a URI link.",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
}
|
|
},
|
|
"required": [
|
|
"page_index",
|
|
"rect"
|
|
],
|
|
"type": "object"
|
|
},
|
|
"ExtractionMetadata": {
|
|
"description": "Metadata about the extraction process.",
|
|
"properties": {
|
|
"block_count": {
|
|
"description": "Number of blocks extracted.",
|
|
"format": "uint",
|
|
"minimum": 0,
|
|
"type": "integer"
|
|
},
|
|
"cache_age_seconds": {
|
|
"description": "Cache entry age in seconds (only present when cache_status == \"hit\")",
|
|
"format": "uint64",
|
|
"minimum": 0,
|
|
"type": [
|
|
"integer",
|
|
"null"
|
|
]
|
|
},
|
|
"cache_status": {
|
|
"description": "Cache status: \"hit\", \"miss\", or \"skipped\"",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"diagnostics": {
|
|
"description": "Diagnostics emitted during extraction (coverage warnings, etc.)",
|
|
"items": {
|
|
"type": "string"
|
|
},
|
|
"type": "array"
|
|
},
|
|
"error_count": {
|
|
"description": "Number of pages that failed to extract.",
|
|
"format": "uint",
|
|
"minimum": 0,
|
|
"type": "integer"
|
|
},
|
|
"page_count": {
|
|
"description": "Total number of pages in the document.",
|
|
"format": "uint",
|
|
"minimum": 0,
|
|
"type": "integer"
|
|
},
|
|
"reading_order_algorithm": {
|
|
"description": "Reading order algorithm used for this extraction.",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"receipts_mode": {
|
|
"$ref": "#/$defs/ReceiptsMode",
|
|
"description": "Receipts mode used for this extraction."
|
|
},
|
|
"span_count": {
|
|
"description": "Number of spans extracted.",
|
|
"format": "uint",
|
|
"minimum": 0,
|
|
"type": "integer"
|
|
}
|
|
},
|
|
"required": [
|
|
"page_count",
|
|
"receipts_mode",
|
|
"span_count",
|
|
"block_count",
|
|
"error_count",
|
|
"diagnostics"
|
|
],
|
|
"type": "object"
|
|
},
|
|
"FormFieldJson": {
|
|
"description": "JSON representation of a form field.\n\nThis struct represents a single interactive form field from the PDF's\nAcroForm or XFA data, including its type, value, and metadata.\n\nPer the plan (Phase 7.4), form fields are extracted from both AcroForm\nand XFA sources, with XFA values taking precedence on collision.",
|
|
"properties": {
|
|
"default": {
|
|
"anyOf": [
|
|
{
|
|
"$ref": "#/$defs/FormFieldValueJson"
|
|
},
|
|
{
|
|
"type": "null"
|
|
}
|
|
],
|
|
"description": "The default value (/DV entry) if present.\n\nMatches the structure of `value` but represents the field's default state."
|
|
},
|
|
"max_length": {
|
|
"description": "Maximum length for text fields (/MaxLen entry).\nOnly present for text fields that have a max length set.",
|
|
"format": "uint32",
|
|
"minimum": 0,
|
|
"type": [
|
|
"integer",
|
|
"null"
|
|
]
|
|
},
|
|
"multi_select": {
|
|
"description": "Whether this choice field supports multiple selections (bit 22 of /Ff).\nOnly present for choice fields.",
|
|
"type": [
|
|
"boolean",
|
|
"null"
|
|
]
|
|
},
|
|
"multiline": {
|
|
"description": "Whether this text field supports multiple lines (bit 13 of /Ff).\nOnly present for text fields.",
|
|
"type": [
|
|
"boolean",
|
|
"null"
|
|
]
|
|
},
|
|
"name": {
|
|
"description": "The absolute (dot-joined) field name from the AcroForm.\nExample: \"employer_signature\" or \"form.employee_sig\"",
|
|
"type": "string"
|
|
},
|
|
"options": {
|
|
"description": "Available options for choice fields.\n\nEach option is a [export_value, display_name] pair.\nOnly present for choice fields.",
|
|
"items": {
|
|
"items": {
|
|
"type": "string"
|
|
},
|
|
"maxItems": 2,
|
|
"minItems": 2,
|
|
"type": "array"
|
|
},
|
|
"type": [
|
|
"array",
|
|
"null"
|
|
]
|
|
},
|
|
"page_index": {
|
|
"description": "Zero-based page index where this field's widget appears.\n\nNone if the field has no visual representation (form-only field).",
|
|
"format": "uint",
|
|
"minimum": 0,
|
|
"type": [
|
|
"integer",
|
|
"null"
|
|
]
|
|
},
|
|
"pushbutton": {
|
|
"description": "Whether this button is a pushbutton (bit 17 of /Ff).\nOnly present for button fields.",
|
|
"type": [
|
|
"boolean",
|
|
"null"
|
|
]
|
|
},
|
|
"radio": {
|
|
"description": "Whether this button is a radio button (bit 16 of /Ff).\nOnly present for button fields.",
|
|
"type": [
|
|
"boolean",
|
|
"null"
|
|
]
|
|
},
|
|
"read_only": {
|
|
"description": "Whether this field is read-only (bit 1 of /Ff flags).",
|
|
"type": "boolean"
|
|
},
|
|
"rect": {
|
|
"description": "Bounding box in PDF user-space points.\n\nFormat: [x0, y0, x1, y1] where (x0, y0) is the bottom-left corner.\nNone if the field has no visual appearance.",
|
|
"items": {
|
|
"format": "float",
|
|
"type": "number"
|
|
},
|
|
"maxItems": 4,
|
|
"minItems": 4,
|
|
"type": [
|
|
"array",
|
|
"null"
|
|
]
|
|
},
|
|
"required": {
|
|
"description": "Whether this field is required (bit 2 of /Ff flags).",
|
|
"type": "boolean"
|
|
},
|
|
"selected": {
|
|
"description": "Selected state for button fields.\nTrue = checked/selected, False = unchecked.\nOnly present for button fields.",
|
|
"type": [
|
|
"boolean",
|
|
"null"
|
|
]
|
|
},
|
|
"state_name": {
|
|
"description": "Appearance state name for button fields.\nE.g., \"Yes\", \"Off\", or custom state names.\nOnly present for button fields.",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"type": {
|
|
"$ref": "#/$defs/FormFieldTypeJson",
|
|
"description": "The field type variant (text, button, choice, or signature)."
|
|
},
|
|
"value": {
|
|
"$ref": "#/$defs/FormFieldValueJson",
|
|
"description": "The current value of the form field.\n\nThis field's structure varies by the field's `type`:\n- text: string value\n- button: boolean selected state\n- choice: string or array of strings (for multi-select)\n- signature: signature reference number (or null if unsigned)"
|
|
}
|
|
},
|
|
"required": [
|
|
"name",
|
|
"type",
|
|
"value",
|
|
"required",
|
|
"read_only"
|
|
],
|
|
"type": "object"
|
|
},
|
|
"FormFieldTypeJson": {
|
|
"description": "Form field type discriminator.\n\nEach variant serializes as a plain lowercase JSON string\n(\"text\", \"button\", \"choice\", or \"signature\") indicating the field type.",
|
|
"oneOf": [
|
|
{
|
|
"const": "text",
|
|
"description": "Text field (/FT /Tx) - single-line or multi-line text input.",
|
|
"type": "string"
|
|
},
|
|
{
|
|
"const": "button",
|
|
"description": "Button field (/FT /Btn) - pushbutton, checkbox, or radio button.",
|
|
"type": "string"
|
|
},
|
|
{
|
|
"const": "choice",
|
|
"description": "Choice field (/FT /Ch) - dropdown or list box.",
|
|
"type": "string"
|
|
},
|
|
{
|
|
"const": "signature",
|
|
"description": "Signature field (/FT /Sig) - digital signature field.",
|
|
"type": "string"
|
|
}
|
|
]
|
|
},
|
|
"FormFieldValueJson": {
|
|
"anyOf": [
|
|
{
|
|
"description": "Text field value (string or null).",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
{
|
|
"description": "Button field value (boolean selected state).",
|
|
"type": "boolean"
|
|
},
|
|
{
|
|
"$ref": "#/$defs/ChoiceValueJson",
|
|
"description": "Choice field value (single string or array of strings for multi-select)."
|
|
},
|
|
{
|
|
"description": "Signature field value (signature reference number or null).",
|
|
"format": "uint32",
|
|
"minimum": 0,
|
|
"type": [
|
|
"integer",
|
|
"null"
|
|
]
|
|
}
|
|
],
|
|
"description": "Form field value representation.\n\nThis enum captures the current value of a form field, with the variant\ntype matching the field's `type`."
|
|
},
|
|
"PageResult": {
|
|
"description": "Result for a single page.",
|
|
"properties": {
|
|
"annotations": {
|
|
"description": "Non-link annotations on this page (highlights, notes, stamps, etc.).\n\nThis array contains all non-link annotations extracted from the page's\n/Annots array. Annotations are sorted deterministically by position\n(y0 descending, then x0 ascending) for stable output.",
|
|
"items": {
|
|
"$ref": "#/$defs/AnnotationJson"
|
|
},
|
|
"type": "array"
|
|
},
|
|
"blocks": {
|
|
"description": "Extracted blocks (semantic units like paragraphs, headings).",
|
|
"items": {
|
|
"$ref": "#/$defs/BlockJson"
|
|
},
|
|
"type": "array"
|
|
},
|
|
"error": {
|
|
"description": "Error message if extraction failed for this page.",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"index": {
|
|
"description": "0-based page index.",
|
|
"format": "uint",
|
|
"minimum": 0,
|
|
"type": "integer"
|
|
},
|
|
"spans": {
|
|
"description": "Extracted spans (text fragments with consistent styling).",
|
|
"items": {
|
|
"$ref": "#/$defs/SpanJson"
|
|
},
|
|
"type": "array"
|
|
},
|
|
"tables": {
|
|
"description": "Extracted tables (cell-level structure).\n\nThis array provides detailed table structure with rows and cells.\nTable blocks in the `blocks` array reference entries here via `table_index`.",
|
|
"items": {
|
|
"$ref": "#/$defs/TableJson"
|
|
},
|
|
"type": "array"
|
|
}
|
|
},
|
|
"required": [
|
|
"index",
|
|
"spans",
|
|
"blocks",
|
|
"tables"
|
|
],
|
|
"type": "object"
|
|
},
|
|
"Receipt": {
|
|
"description": "A visual citation receipt for extracted text.\n\nReceipts provide cryptographic proof that a piece of extracted text\noriginated from a specific region in a specific PDF. They can be\nverified independently by re-running pdftract on the original file.\n\n# Lite mode\n\nIn lite mode, `svg_clip` is `None` and the JSON output does not\ninclude the key at all (via `skip_serializing_if`). This keeps\nreceipts small (~120-180 bytes) for high-volume use cases like\nRAG citation pipelines.\n\n# SVG mode\n\nIn SVG mode, `svg_clip` contains a self-contained SVG element\nthat renders only the glyphs whose bboxes fall within the receipt\nbbox. The SVG is normalized to the bbox coordinate system and\ncan be rendered standalone in any browser.\n\n# Example\n\n```json\n{\n \"pdf_fingerprint\": \"pdftract-v1:a7f3...\",\n \"page_index\": 14,\n \"bbox\": [220.0, 412.0, 412.0, 432.0],\n \"content_hash\": \"sha256:9b21...\",\n \"extraction_version\": \"1.0.0\"\n}\n```",
|
|
"properties": {
|
|
"bbox": {
|
|
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where:\n- x0, y0: bottom-left corner\n- x1, y1: top-right corner\n- Units: PDF points (1/72 inch)\n\nThis is a copy of the parent span's bbox, included so the\nreceipt is self-contained.",
|
|
"items": {
|
|
"format": "double",
|
|
"type": "number"
|
|
},
|
|
"maxItems": 4,
|
|
"minItems": 4,
|
|
"type": "array"
|
|
},
|
|
"content_hash": {
|
|
"description": "SHA-256 hash of the NFC-normalized text content.\n\nFormat: `\"sha256:\" + hex(SHA-256)`.\n\nThe text is normalized to NFC form before hashing to ensure\nstability across platforms that may use different Unicode\nnormalization forms (e.g., macOS HFS+/APFS sometimes round-trips\nthrough NFD).",
|
|
"type": "string"
|
|
},
|
|
"extraction_version": {
|
|
"description": "The pdftract version that produced this receipt.\n\nFormat: semver string (e.g., \"1.0.0\", \"1.0.0-rc.1\").\nTaken from `CARGO_PKG_VERSION` at compile time.",
|
|
"type": "string"
|
|
},
|
|
"page_index": {
|
|
"description": "0-based page index in the source PDF.\n\nMatches the page_index in the extraction schema.",
|
|
"format": "uint",
|
|
"minimum": 0,
|
|
"type": "integer"
|
|
},
|
|
"pdf_fingerprint": {
|
|
"description": "Phase 1.7 fingerprint of the source PDF.\n\nFormat: `\"pdftract-v1:\" + hex(SHA-256)`.\nThe verifier compares this string literally (not parsed).",
|
|
"type": "string"
|
|
},
|
|
"svg_clip": {
|
|
"description": "Optional SVG clip rendering the glyphs in this receipt.\n\n- `None` in lite mode (the key is omitted from JSON entirely)\n- `Some(svg)` in SVG mode, where `svg` is a self-contained SVG element\n\nThe SVG coordinate system is normalized to the bbox itself,\nso it renders correctly in isolation.",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
}
|
|
},
|
|
"required": [
|
|
"pdf_fingerprint",
|
|
"page_index",
|
|
"bbox",
|
|
"content_hash",
|
|
"extraction_version"
|
|
],
|
|
"type": "object"
|
|
},
|
|
"ReceiptsMode": {
|
|
"description": "Receipt generation mode.\n\nControls whether visual citation receipts are generated during extraction.",
|
|
"oneOf": [
|
|
{
|
|
"const": "off",
|
|
"description": "No receipts generated (default).",
|
|
"type": "string"
|
|
},
|
|
{
|
|
"const": "lite",
|
|
"description": "Lite mode: minimal receipts (~120 bytes each) with fingerprint, page index, bbox, and content hash.",
|
|
"type": "string"
|
|
},
|
|
{
|
|
"const": "svg",
|
|
"description": "SVG mode: extended receipts that include an SVG clip rendering the glyphs.",
|
|
"type": "string"
|
|
}
|
|
]
|
|
},
|
|
"RowJson": {
|
|
"description": "JSON representation of a table row.\n\nA row contains a sequence of cells that form a horizontal strip\nin the table.",
|
|
"properties": {
|
|
"bbox": {
|
|
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
|
|
"items": {
|
|
"format": "double",
|
|
"type": "number"
|
|
},
|
|
"maxItems": 4,
|
|
"minItems": 4,
|
|
"type": "array"
|
|
},
|
|
"cells": {
|
|
"description": "Cells in this row, ordered left-to-right.",
|
|
"items": {
|
|
"$ref": "#/$defs/CellJson"
|
|
},
|
|
"type": "array"
|
|
},
|
|
"is_header": {
|
|
"description": "Whether this row is a header row.\n\nHeader rows are typically repeated when tables span multiple pages.",
|
|
"type": "boolean"
|
|
}
|
|
},
|
|
"required": [
|
|
"bbox",
|
|
"cells",
|
|
"is_header"
|
|
],
|
|
"type": "object"
|
|
},
|
|
"SignatureJson": {
|
|
"description": "JSON representation of a digital signature.\n\nThis struct represents a signature extracted from a PDF signature field,\nincluding signer identity, timestamp, and coverage information.\n\nPer the plan (Phase 7.3), pdftract does NOT perform cryptographic validation\nin v1. The `validation_status` field is always \"not_checked\" — future versions\nmay add \"valid\", \"invalid\", or \"indeterminate\" as cryptographic validation\nis implemented.",
|
|
"properties": {
|
|
"byte_range": {
|
|
"description": "The /ByteRange array defining which bytes of the file are signed.\n\nFormat: array of 4 integers [offset, length, offset, length] defining two byte ranges.\nNone if /ByteRange is missing or malformed.",
|
|
"items": {
|
|
"format": "uint64",
|
|
"minimum": 0,
|
|
"type": "integer"
|
|
},
|
|
"type": [
|
|
"array",
|
|
"null"
|
|
]
|
|
},
|
|
"coverage_fraction": {
|
|
"description": "Fraction of the file covered by the signature (0.0 to 1.0).\n\nComputed as `(byte_range[1] + byte_range[3]) / file_size`.\nNone if /ByteRange is missing, malformed, or file_size is unknown.\n\nValues < 1.0 indicate partial signatures (a common red flag for tampered docs).",
|
|
"format": "double",
|
|
"type": [
|
|
"number",
|
|
"null"
|
|
]
|
|
},
|
|
"field_name": {
|
|
"description": "The absolute (dot-joined) field name from the AcroForm.\nExample: \"employer_signature\" or \"form.employee_sig\"",
|
|
"type": "string"
|
|
},
|
|
"location": {
|
|
"description": "The location of signing from the /Location entry.\n\nNone if /Location is absent.",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"reason": {
|
|
"description": "The reason for signing from the /Reason entry.\n\nNone if /Reason is absent.",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"signer_name": {
|
|
"description": "The signer's name from the /Name entry in the signature dictionary.\n\nEmpty string if /Name is absent.",
|
|
"type": "string"
|
|
},
|
|
"signing_date": {
|
|
"description": "The signing date as an ISO 8601 string (RFC 3339 format).\n\nParsed from the PDF /M date string. None if the date is missing,\nmalformed, or the field is unsigned.\n\nFormat: \"YYYY-MM-DDTHH:MM:SS+HH:MM\" or \"YYYY-MM-DDTHH:MM:SSZ\"",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"sub_filter": {
|
|
"description": "The signature format / filter from the /SubFilter entry.\n\nIndicates the signature format: \"adbe.pkcs7.detached\", \"adbe.x509.rsa.sha1\", etc.\nNone if /SubFilter is absent.",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"validation_status": {
|
|
"description": "Validation status — always \"not_checked\" in v1.\n\nFuture versions may add \"valid\", \"invalid\", \"indeterminate\" as cryptographic\nvalidation is implemented. This is a string enum for schema stability.",
|
|
"type": "string"
|
|
}
|
|
},
|
|
"required": [
|
|
"field_name",
|
|
"signer_name",
|
|
"validation_status"
|
|
],
|
|
"type": "object"
|
|
},
|
|
"SpanJson": {
|
|
"description": "JSON representation of a text span.\n\nA span is the smallest unit of extracted text, representing a\ncontiguous run of text with consistent font and styling.\n\n# TODO: Phase 6.1 - Add confidence_source field\n\nWhen the `confidence_source` field is added to the schema (per plan line 363, 1662),\nit should include \"ocr-fallback\" as a valid value for spans emitted via\nPhase 5.5.3 region-level fallback. The internal `SpanSource::OcrFallback` variant\nin `hybrid.rs` maps to this value.",
|
|
"properties": {
|
|
"bbox": {
|
|
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
|
|
"items": {
|
|
"format": "double",
|
|
"type": "number"
|
|
},
|
|
"maxItems": 4,
|
|
"minItems": 4,
|
|
"type": "array"
|
|
},
|
|
"column": {
|
|
"description": "Column index (0-based) assigned by Phase 4.3 column detection.\n\nThis field is `None` for spans outside any detected column\n(e.g., full-width headings, inter-column gaps).",
|
|
"format": "uint32",
|
|
"minimum": 0,
|
|
"type": [
|
|
"integer",
|
|
"null"
|
|
]
|
|
},
|
|
"confidence": {
|
|
"description": "Optional confidence score (0.0 to 1.0).\n\nThis field is present when OCR is used or when the extraction\nhas uncertainty about the text. When confidence is not applicable,\nthis field is `null`.",
|
|
"format": "double",
|
|
"type": [
|
|
"number",
|
|
"null"
|
|
]
|
|
},
|
|
"font": {
|
|
"description": "Font name or identifier.",
|
|
"type": "string"
|
|
},
|
|
"receipt": {
|
|
"anyOf": [
|
|
{
|
|
"$ref": "#/$defs/Receipt"
|
|
},
|
|
{
|
|
"type": "null"
|
|
}
|
|
],
|
|
"description": "Optional cryptographic receipt for verification.\n\nThis field is present when `--receipts=lite` or `--receipts=svg`\nis enabled. When receipts are disabled, the field is `null`."
|
|
},
|
|
"size": {
|
|
"description": "Font size in points.",
|
|
"format": "double",
|
|
"type": "number"
|
|
},
|
|
"text": {
|
|
"description": "The extracted text content.",
|
|
"type": "string"
|
|
}
|
|
},
|
|
"required": [
|
|
"text",
|
|
"bbox",
|
|
"font",
|
|
"size"
|
|
],
|
|
"type": "object"
|
|
},
|
|
"TableJson": {
|
|
"description": "JSON representation of a table.\n\nTables are emitted in parallel with table blocks - the block\nprovides the concatenated text and position, while the TableJson\nprovides full cell-level structure.",
|
|
"properties": {
|
|
"bbox": {
|
|
"description": "Bounding box in PDF user-space points.\n\nFormat: `[x0, y0, x1, y1]` where (x0, y0) is the bottom-left\ncorner and (x1, y1) is the top-right corner.",
|
|
"items": {
|
|
"format": "double",
|
|
"type": "number"
|
|
},
|
|
"maxItems": 4,
|
|
"minItems": 4,
|
|
"type": "array"
|
|
},
|
|
"continued": {
|
|
"description": "Whether this table continues on the next page.\n\nSet to `true` when a table is split across pages and this\npage contains the first part.",
|
|
"type": "boolean"
|
|
},
|
|
"continued_from_prev": {
|
|
"description": "Whether this table is a continuation from the previous page.\n\nSet to `true` when a table is split across pages and this\npage contains a subsequent part.",
|
|
"type": "boolean"
|
|
},
|
|
"detection_method": {
|
|
"description": "Detection method used to identify this table.\n\n- \"line_based\": Table detected via ruling lines (borders)\n- \"borderless\": Table detected via x0 alignment heuristics",
|
|
"type": "string"
|
|
},
|
|
"header_rows": {
|
|
"description": "Number of contiguous header rows at the top of the table.\n\nHeader rows are typically repeated when tables span multiple pages.",
|
|
"format": "uint32",
|
|
"minimum": 0,
|
|
"type": "integer"
|
|
},
|
|
"id": {
|
|
"description": "Unique identifier for this table (e.g., \"table_0\").",
|
|
"type": "string"
|
|
},
|
|
"page_index": {
|
|
"description": "Zero-based page index where this table appears.",
|
|
"format": "uint",
|
|
"minimum": 0,
|
|
"type": "integer"
|
|
},
|
|
"rows": {
|
|
"description": "Rows in this table, ordered top-to-bottom.",
|
|
"items": {
|
|
"$ref": "#/$defs/RowJson"
|
|
},
|
|
"type": "array"
|
|
}
|
|
},
|
|
"required": [
|
|
"id",
|
|
"bbox",
|
|
"rows",
|
|
"header_rows",
|
|
"detection_method",
|
|
"continued",
|
|
"continued_from_prev",
|
|
"page_index"
|
|
],
|
|
"type": "object"
|
|
},
|
|
"BeadJson": {
|
|
"description": "A single bead in an article thread chain.\n\nRepresents one bead's position on a page, extracted during bead chain walking.\nPer PDF 1.7 Section 12.4.3, each bead contains a reference to its page and\na bounding rectangle defining the article region on that page.",
|
|
"properties": {
|
|
"page_index": {
|
|
"description": "0-based page index where this bead is located.",
|
|
"format": "uint",
|
|
"minimum": 0,
|
|
"type": "integer"
|
|
},
|
|
"rect": {
|
|
"description": "Bounding rectangle in PDF user-space coordinates [x0, y0, x1, y1].\n\nPer PDF spec, the origin is at the bottom-left corner of the page.\nThis rect is NOT flipped to image-space coordinates.",
|
|
"items": {
|
|
"format": "float",
|
|
"type": "number"
|
|
},
|
|
"maxItems": 4,
|
|
"minItems": 4,
|
|
"type": "array"
|
|
}
|
|
},
|
|
"required": [
|
|
"page_index",
|
|
"rect"
|
|
],
|
|
"type": "object"
|
|
},
|
|
"ThreadJson": {
|
|
"description": "JSON representation of an article thread.\n\nRepresents a single article thread from the PDF's /Threads array,\nincluding metadata from the thread info dict (/I) and the complete\nbead chain walked from the first bead.\n\nPer the plan (Phase 7.7), threads are extracted and emitted at the\ndocument level in the `/threads` array. The bead chain is walked by\nfollowing `/N` (next bead) links from the first bead until termination.",
|
|
"properties": {
|
|
"author": {
|
|
"description": "Thread author from /I/Author.\n\n- `Some(\"\")` if /I/Author is present but empty string\n- `None` if /I is missing or /Author is absent",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"beads": {
|
|
"description": "Beads in this thread chain, in traversal order.\n\nEach bead represents a region on a page that is part of this article.\nThe beads are ordered by following `/N` (next bead) links from the\nfirst bead through the chain until termination.",
|
|
"items": {
|
|
"$ref": "#/$defs/BeadJson"
|
|
},
|
|
"type": "array"
|
|
},
|
|
"keywords": {
|
|
"description": "Thread keywords from /I/Keywords.\n\nPer PDF spec, this is a comma-separated convention (not an array).\n- `Some(\"\")` if /I/Keywords is present but empty string\n- `None` if /I is missing or /Keywords is absent",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"subject": {
|
|
"description": "Thread subject from /I/Subject.\n\n- `Some(\"\")` if /I/Subject is present but empty string\n- `None` if /I is missing or /Subject is absent",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
},
|
|
"title": {
|
|
"description": "Thread title from /I/Title.\n\n- `Some(\"\")` if /I/Title is present but empty string\n- `None` if /I is missing or /Title is absent",
|
|
"type": [
|
|
"string",
|
|
"null"
|
|
]
|
|
}
|
|
},
|
|
"type": "object"
|
|
}
|
|
},
|
|
"$id": "https://pdftract.com/schema/v1.0/pdftract.schema.json",
|
|
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
"description": "JSON Schema for pdftract PDF extraction output v1.0. This schema defines the structure of extraction results including pages, spans, blocks, tables, form fields, signatures, and metadata.",
|
|
"properties": {
|
|
"fingerprint": {
|
|
"description": "The PDF fingerprint (for receipt generation).",
|
|
"type": "string"
|
|
},
|
|
"form_fields": {
|
|
"description": "Interactive form fields extracted from the document.\n\nThis array contains all form fields from the AcroForm and/or XFA data.\nFields are sorted alphabetically by name. When both AcroForm and XFA\nare present, XFA values take precedence on collision.\nEmpty when the PDF has no form fields.",
|
|
"items": {
|
|
"$ref": "#/$defs/FormFieldJson"
|
|
},
|
|
"type": "array"
|
|
},
|
|
"links": {
|
|
"description": "Hyperlink annotations extracted from the document.\n\nThis array contains all link annotations from all pages, sorted\ndeterministically by page_index and position for stable output.\nEmpty when the PDF has no link annotations.",
|
|
"items": {
|
|
"$ref": "#/$defs/LinkJson"
|
|
},
|
|
"type": "array"
|
|
},
|
|
"attachments": {
|
|
"description": "Embedded file attachments extracted from the document.\n\nThis array contains all embedded files from the /EmbeddedFiles name tree\nor /AF (Associated Files) array. Attachments exceeding 50 MB are\ntruncated (metadata only, data: null, truncated: true). Empty when the\nPDF has no embedded files.",
|
|
"items": {
|
|
"$ref": "#/$defs/AttachmentJson"
|
|
},
|
|
"type": "array"
|
|
},
|
|
"metadata": {
|
|
"$ref": "#/$defs/ExtractionMetadata",
|
|
"description": "Metadata about the extraction."
|
|
},
|
|
"pages": {
|
|
"description": "Extracted pages, each containing spans and blocks.",
|
|
"items": {
|
|
"$ref": "#/$defs/PageResult"
|
|
},
|
|
"type": "array"
|
|
},
|
|
"signatures": {
|
|
"description": "Digital signatures extracted from the document.\n\nThis array contains all signature fields discovered in the AcroForm,\nincluding both signed and unsigned (blank) signature fields.\nEmpty when the PDF has no signature fields.",
|
|
"items": {
|
|
"$ref": "#/$defs/SignatureJson"
|
|
},
|
|
"type": "array"
|
|
}
|
|
},
|
|
"required": [
|
|
"fingerprint",
|
|
"pages",
|
|
"metadata",
|
|
"signatures",
|
|
"form_fields",
|
|
"links"
|
|
],
|
|
"title": "pdftract Output v1.0",
|
|
"type": "object"
|
|
} |