pdftract/tests/sdk-conformance/schema.json
jedarden a3178a3960 test(pdftract-1527): add shared SDK conformance suite with 32 test cases
Add tests/sdk-conformance/ containing the shared, language-neutral test
specification for all pdftract SDKs. The suite includes 32 cases covering
all 9 contract methods (extract, extract_text, extract_markdown,
extract_stream, search, get_metadata, hash, classify, verify_receipt)
across vector, scanned, encrypted, fillable-form, mixed, large, broken,
and remote PDFs.

- cases.json: 32 test cases with id, fixture, method, options, expected,
  tolerances, feature tags, and min_schema_version
- schema.json: JSON Schema (draft-07) for validating test case structure
- validate_suite.py: Validation script that checks structure and fixture
  existence
- fixtures/: Test PDFs organized by category (symlinks to classifier
  fixtures for shared files)

See notes/pdftract-1527.md for verification details.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-18 01:17:42 -04:00

186 lines
6.3 KiB
JSON

{
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://github.com/jedarden/pdftract/schemas/sdk-conformance-v1.json",
"title": "pdftract SDK Conformance Suite Schema",
"description": "Schema for the pdftract SDK conformance test suite. Defines the structure of test cases that all SDK implementations must pass.",
"type": "object",
"required": ["version", "schema_version", "cases"],
"properties": {
"version": {
"type": "string",
"description": "Version of the conformance suite itself. Bumping this triggers coordinated SDK releases.",
"pattern": "^\\d+\\.\\d+\\.\\d+$"
},
"schema_version": {
"type": "string",
"description": "The pdftract output schema version this suite targets.",
"pattern": "^\\d+\\.\\d+$"
},
"cases": {
"type": "array",
"description": "Array of conformance test cases.",
"items": {
"type": "object",
"required": ["id", "fixture", "method", "options", "expected"],
"properties": {
"id": {
"type": "string",
"description": "Unique identifier for this test case. Use kebab-case.",
"pattern": "^[a-z0-9]+(-[a-z0-9]+)*$"
},
"fixture": {
"type": "string",
"description": "Path to the test fixture PDF, relative to the fixtures directory, or a remote URL."
},
"method": {
"type": "string",
"description": "The SDK method being tested.",
"enum": [
"extract",
"extract_text",
"extract_markdown",
"extract_stream",
"search",
"get_metadata",
"hash",
"classify",
"verify_receipt"
]
},
"options": {
"type": "object",
"description": "Options to pass to the method. Varies by method.",
"properties": {
"ocr_language": {
"type": "string",
"description": "ISO 639-3 language code for OCR."
},
"ocr_threshold": {
"type": "number",
"description": "Confidence threshold for OCR (0-1).",
"minimum": 0,
"maximum": 1
},
"preserve_layout": {
"type": "boolean",
"description": "Preserve original reading order and layout."
},
"extract_images": {
"type": "boolean",
"description": "Extract embedded images."
},
"image_format": {
"type": "string",
"description": "Format for extracted images.",
"enum": ["png", "jpg", "webp"]
},
"min_image_size": {
"type": "integer",
"description": "Minimum dimension for image extraction.",
"minimum": 1
},
"password": {
"type": "string",
"description": "Password for encrypted PDFs."
},
"timeout": {
"type": "integer",
"description": "Maximum seconds to wait for the operation.",
"minimum": 1
},
"max_pages": {
"type": "integer",
"description": "Maximum pages to process for streaming.",
"minimum": 1
},
"pattern": {
"type": "string",
"description": "Search pattern."
},
"case_insensitive": {
"type": "boolean",
"description": "Ignore case when matching."
},
"regex": {
"type": "boolean",
"description": "Treat pattern as regular expression."
},
"whole_word": {
"type": "boolean",
"description": "Match only whole words."
},
"max_results": {
"type": ["integer", "null"],
"description": "Maximum matches to return.",
"minimum": 1
},
"receipt": {
"type": "string",
"description": "Path to receipt file for verify_receipt."
}
}
},
"expected": {
"type": "object",
"description": "Expected results. Structure varies by method. Nested fields are addressed using JSONPath-like syntax.",
"additionalProperties": true
},
"tolerances": {
"type": "object",
"description": "Per-field tolerances for numeric comparisons. Each key is a field path, which may use JSONPath wildcard syntax.",
"additionalProperties": {
"type": "object",
"properties": {
"abs": {
"type": "number",
"description": "Absolute tolerance."
},
"rel": {
"type": "number",
"description": "Relative tolerance (as a fraction, e.g., 0.01 for 1%)."
}
}
}
},
"feature": {
"type": "string",
"description": "Feature tag for this test. SDKs without this feature may skip the test.",
"enum": [
"vector",
"ocr",
"decrypt",
"forms",
"mixed",
"large",
"unicode",
"vertical",
"math",
"tables",
"code",
"headings",
"stream",
"search",
"metadata",
"xmp",
"hash",
"classify",
"receipt",
"error-handling",
"remote"
]
},
"min_schema_version": {
"type": "string",
"description": "Minimum pdftract schema version required for this test.",
"pattern": "^\\d+\\.\\d+$"
},
"skip_reason": {
"type": "string",
"description": "If present, this test is skipped. The value should document the reason for skipping."
}
}
},
"minItems": 1
}
}
}