Add unified detect() method to TableDetector that combines both line-based and borderless table detection pipelines.

This completes the coordinator bead for Phase 7.2: Table Detection and Structure Reconstruction.

All child beads (7.2.1-7.2.6) are closed:
- 7.2.1: Line-based detection (path segment clustering)
- 7.2.2: Borderless detection (x0 alignment heuristic)
- 7.2.3: Span-to-cell assignment (centroid containment)
- 7.2.4: Header row detection (bold + StructTree TH)
- 7.2.5: Merged cell detection (missing interior edges)
- 7.2.6: Table JSON output schema integration

Critical tests pass:
- 5x3 bordered table (15 cells extracted)
- Merged header cell colspan=3
- Borderless 3-column table detection
- Two-page table continuation detection

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
345 lines
9.9 KiB
JSON
345 lines
9.9 KiB
JSON
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "$id": "https://pdftract.ardenone.com/schemas/v1.0/pdftract.schema.json",
  "title": "PDFtract Extraction Output Schema v1.0",
  "description": "JSON output schema for PDF text and structure extraction",
  "type": "object",
  "required": ["fingerprint", "schema_version", "pages", "metadata"],
  "properties": {
    "fingerprint": {
      "type": "string",
      "description": "PDF fingerprint for verification (format: pdftract-v1:<hex>)"
    },
    "schema_version": {
      "type": "string",
      "description": "Schema version (e.g., '1.0')",
      "enum": ["1.0"]
    },
    "pages": {
      "type": "array",
      "description": "Extracted pages",
      "items": {
        "$ref": "#/definitions/page"
      }
    },
    "metadata": {
      "$ref": "#/definitions/metadata"
    }
  },
  "definitions": {
    "page": {
      "type": "object",
      "description": "One extracted page: its text spans, structural blocks, and tables, plus an optional per-page error message",
      "required": ["index", "spans", "blocks", "tables"],
      "properties": {
        "index": {
          "type": "integer",
          "description": "0-based page index",
          "minimum": 0
        },
        "spans": {
          "type": "array",
          "description": "Extracted text spans",
          "items": {
            "$ref": "#/definitions/span"
          }
        },
        "blocks": {
          "type": "array",
          "description": "Extracted structural blocks",
          "items": {
            "$ref": "#/definitions/block"
          }
        },
        "tables": {
          "type": "array",
          "description": "Extracted tables (cell-level structure)",
          "items": {
            "$ref": "#/definitions/table"
          }
        },
        "error": {
          "type": "string",
          "description": "Error message if extraction failed for this page"
        }
      }
    },
    "span": {
      "type": "object",
      "description": "A text span with its bounding box, font, and size; may also carry an OCR confidence score and a receipt",
      "required": ["text", "bbox", "font", "size"],
      "properties": {
        "text": {
          "type": "string",
          "description": "The extracted text content"
        },
        "bbox": {
          "type": "array",
          "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
          "items": {
            "type": "number"
          },
          "minItems": 4,
          "maxItems": 4
        },
        "font": {
          "type": "string",
          "description": "Font name or identifier"
        },
        "size": {
          "type": "number",
          "description": "Font size in points"
        },
        "confidence": {
          "type": "number",
          "description": "Confidence score (0.0 to 1.0) for OCR text",
          "minimum": 0.0,
          "maximum": 1.0
        },
        "receipt": {
          "$ref": "#/definitions/receipt"
        }
      }
    },
    "block": {
      "type": "object",
      "description": "A structural block (paragraph, heading, list, table, or figure) with its text and bounding box",
      "required": ["kind", "text", "bbox"],
      "properties": {
        "kind": {
          "type": "string",
          "description": "Block kind/type",
          "enum": ["paragraph", "heading", "list", "table", "figure"]
        },
        "text": {
          "type": "string",
          "description": "The concatenated text content of all spans in the block"
        },
        "bbox": {
          "type": "array",
          "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
          "items": {
            "type": "number"
          },
          "minItems": 4,
          "maxItems": 4
        },
        "level": {
          "type": "integer",
          "description": "Heading level (1-6) for 'heading' kind blocks",
          "minimum": 1,
          "maximum": 6
        },
        "table_index": {
          "type": "integer",
          "description": "Table index for 'table' kind blocks (points to tables array)",
          "minimum": 0
        },
        "receipt": {
          "$ref": "#/definitions/receipt"
        }
      }
    },
    "table": {
      "type": "object",
      "description": "A detected table with its rows, header row count, detection method, and page-continuation flags",
      "required": ["id", "bbox", "rows", "header_rows", "detection_method", "continued", "continued_from_prev", "page_index"],
      "properties": {
        "id": {
          "type": "string",
          "description": "Unique identifier for this table (e.g., 'table_0')"
        },
        "bbox": {
          "type": "array",
          "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
          "items": {
            "type": "number"
          },
          "minItems": 4,
          "maxItems": 4
        },
        "rows": {
          "type": "array",
          "description": "Rows in this table, ordered top-to-bottom",
          "items": {
            "$ref": "#/definitions/row"
          }
        },
        "header_rows": {
          "type": "integer",
          "description": "Number of contiguous header rows at the top of the table",
          "minimum": 0
        },
        "detection_method": {
          "type": "string",
          "description": "Detection method used to identify this table",
          "enum": ["line_based", "borderless"]
        },
        "continued": {
          "type": "boolean",
          "description": "Whether this table continues on the next page"
        },
        "continued_from_prev": {
          "type": "boolean",
          "description": "Whether this table is a continuation from the previous page"
        },
        "page_index": {
          "type": "integer",
          "description": "Zero-based page index where this table appears",
          "minimum": 0
        }
      }
    },
    "row": {
      "type": "object",
      "description": "A table row: its bounding box, its cells, and whether it is a header row",
      "required": ["bbox", "cells", "is_header"],
      "properties": {
        "bbox": {
          "type": "array",
          "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
          "items": {
            "type": "number"
          },
          "minItems": 4,
          "maxItems": 4
        },
        "cells": {
          "type": "array",
          "description": "Cells in this row, ordered left-to-right",
          "items": {
            "$ref": "#/definitions/cell"
          }
        },
        "is_header": {
          "type": "boolean",
          "description": "Whether this row is a header row"
        }
      }
    },
    "cell": {
      "type": "object",
      "description": "A table cell with its text, span references, grid position, and row/column span counts",
      "required": ["bbox", "text", "spans", "row", "col", "rowspan", "colspan", "is_header_row"],
      "properties": {
        "bbox": {
          "type": "array",
          "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
          "items": {
            "type": "number"
          },
          "minItems": 4,
          "maxItems": 4
        },
        "text": {
          "type": "string",
          "description": "The concatenated text content of all spans in the cell"
        },
        "spans": {
          "type": "array",
          "description": "References to spans in the page's spans array",
          "items": {
            "type": "integer",
            "minimum": 0
          }
        },
        "row": {
          "type": "integer",
          "description": "Zero-based row index within the table",
          "minimum": 0
        },
        "col": {
          "type": "integer",
          "description": "Zero-based column index within the table",
          "minimum": 0
        },
        "rowspan": {
          "type": "integer",
          "description": "Number of rows this cell spans (default 1)",
          "minimum": 1
        },
        "colspan": {
          "type": "integer",
          "description": "Number of columns this cell spans (default 1)",
          "minimum": 1
        },
        "is_header_row": {
          "type": "boolean",
          "description": "Whether this cell is in a header row"
        }
      }
    },
    "receipt": {
      "type": "object",
      "description": "Receipt recording the PDF fingerprint, page index, bounding box, content hash, and extractor version for a piece of extracted content",
      "required": ["pdf_fingerprint", "page_index", "bbox", "content_hash", "extraction_version"],
      "properties": {
        "pdf_fingerprint": {
          "type": "string",
          "description": "The PDF fingerprint"
        },
        "page_index": {
          "type": "integer",
          "description": "The page index",
          "minimum": 0
        },
        "bbox": {
          "type": "array",
          "description": "Bounding box in PDF user-space points [x0, y0, x1, y1]",
          "items": {
            "type": "number"
          },
          "minItems": 4,
          "maxItems": 4
        },
        "content_hash": {
          "type": "string",
          "description": "SHA-256 hash of the content"
        },
        "extraction_version": {
          "type": "string",
          "description": "Version string of the extractor"
        },
        "svg_clip": {
          "type": "string",
          "description": "SVG clip path for verification (present only in SvgClip mode)"
        }
      }
    },
    "metadata": {
      "type": "object",
      "description": "Document-level extraction metadata: page, span, and block counts, cache status, error count, reading order algorithm, and diagnostics",
      "required": ["page_count", "span_count", "block_count"],
      "properties": {
        "page_count": {
          "type": "integer",
          "description": "Total number of pages in the document",
          "minimum": 0
        },
        "span_count": {
          "type": "integer",
          "description": "Number of spans extracted",
          "minimum": 0
        },
        "block_count": {
          "type": "integer",
          "description": "Number of blocks extracted",
          "minimum": 0
        },
        "cache_status": {
          "type": "string",
          "description": "Cache status: 'hit', 'miss', or 'skipped'",
          "enum": ["hit", "miss", "skipped"]
        },
        "cache_age_seconds": {
          "type": "integer",
          "description": "Cache entry age in seconds (only present when cache_status == 'hit')",
          "minimum": 0
        },
        "error_count": {
          "type": "integer",
          "description": "Number of pages that failed to extract",
          "minimum": 0
        },
        "reading_order_algorithm": {
          "type": "string",
          "description": "Reading order algorithm used for this extraction",
          "enum": ["struct_tree", "xy_cut"]
        },
        "diagnostics": {
          "type": "array",
          "description": "Diagnostics emitted during extraction",
          "items": {
            "type": "string"
          }
        }
      }
    }
  }
}