pdftract/docs/schema/v1.0/grep-jsonl.schema.json
jedarden 47df769e4b feat(pdftract-5ls35): implement JSON-Lines output sink for grep
Implement the --json output sink for pdftract grep with JSON-Lines
format (one match per line). Includes MatchEvent, FileOnlyEvent,
CountEvent structs and JsonSink line-buffered writer.

Key features:
- MatchEvent with all fields (path, page_index, bbox, match_text,
  span_text, span_confidence, pdf_fingerprint, crosses_spans)
- crosses_spans omitted when false via skip_serializing_if
- NaN/Infinity in span_confidence replaced with null
- page_index is 0-based (machine convention)
- FileOnlyEvent for -l mode, CountEvent for -c mode
- Line-buffered writes with immediate flush
- JSON schema at docs/schema/v1.0/grep-jsonl.schema.json

Closes: pdftract-5ls35
2026-05-25 02:05:17 -04:00

97 lines
3 KiB
JSON

{
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://pdftract.jedarden.com/schemas/v1.0/grep-jsonl.schema.json",
"title": "pdftract grep JSON-Lines output",
"description": "One event per line in JSON Lines format. Each line is a separate JSON object: a match with full metadata (default mode), a file-only event (-l), or a per-file count (-c).",
"type": "object",
"anyOf": [
{
"$ref": "#/definitions/MatchEvent"
},
{
"$ref": "#/definitions/FileOnlyEvent"
},
{
"$ref": "#/definitions/CountEvent"
}
],
"definitions": {
"MatchEvent": {
  "description": "A single grep match with full metadata (default output mode)",
  "type": "object",
  "required": ["path", "page_index", "bbox", "match_text", "span_text", "pdf_fingerprint"],
  "properties": {
    "path": {
      "type": "string",
      "description": "Path to the PDF file (relative if input was relative, absolute if input was absolute)"
    },
    "page_index": {
      "type": "integer",
      "minimum": 0,
      "description": "0-based page index (machine convention; human output flips to 1-based)"
    },
    "bbox": {
      "type": "array",
      "items": {
        "type": "number"
      },
      "minItems": 4,
      "maxItems": 4,
      "description": "Bounding box in PDF user-space coordinates [x0, y0, x1, y1]"
    },
    "match_text": {
      "type": "string",
      "description": "The matched text substring"
    },
    "span_text": {
      "type": "string",
      "description": "The full span text containing the match"
    },
    "span_confidence": {
      "type": ["number", "null"],
      "minimum": 0.0,
      "maximum": 1.0,
      "description": "Confidence score (0.0 to 1.0). Null or omitted if not applicable (e.g., non-OCR text); NaN/Infinity values are serialized as null."
    },
    "pdf_fingerprint": {
      "type": "string",
      "pattern": "^pdftract-v1:[0-9a-f]{64}$",
      "description": "PDF structural fingerprint for deduplication across runs"
    },
    "crosses_spans": {
      "type": "boolean",
      "description": "Whether the match crosses multiple spans. Omitted when false to keep typical lines short."
    }
  }
},
"FileOnlyEvent": {
  "type": "object",
  "description": "File-only event emitted with -l (files-with-matches) flag",
  "additionalProperties": false,
  "required": [
    "path"
  ],
  "properties": {
    "path": {
      "description": "Path to the PDF file with at least one match",
      "type": "string"
    }
  }
},
"CountEvent": {
  "type": "object",
  "description": "Count event emitted with -c (count) flag",
  "additionalProperties": false,
  "required": [
    "path",
    "count"
  ],
  "properties": {
    "path": {
      "description": "Path to the PDF file",
      "type": "string"
    },
    "count": {
      "description": "Number of matches in this file",
      "type": "integer",
      "minimum": 0
    }
  }
}
}
}