Add tests/sdk-conformance/ containing the shared, language-neutral test specification for all pdftract SDKs. The suite includes 32 cases covering all 9 contract methods (extract, extract_text, extract_markdown, extract_stream, search, get_metadata, hash, classify, verify_receipt) across vector, scanned, encrypted, fillable-form, mixed, large, broken, and remote PDFs.

- cases.json: 32 test cases with id, fixture, method, options, expected, tolerances, feature tags, and min_schema_version
- schema.json: JSON Schema (draft-07) for validating test case structure
- validate_suite.py: Validation script that checks structure and fixture existence
- fixtures/: Test PDFs organized by category (symlinks to classifier fixtures for shared files)

See notes/pdftract-1527.md for verification details.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
186 lines
6.3 KiB
JSON
186 lines
6.3 KiB
JSON
{
|
|
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
"$id": "https://github.com/jedarden/pdftract/schemas/sdk-conformance-v1.json",
|
|
"title": "pdftract SDK Conformance Suite Schema",
|
|
"description": "Schema for the pdftract SDK conformance test suite. Defines the structure of test cases that all SDK implementations must pass.",
|
|
"type": "object",
|
|
"required": ["version", "schema_version", "cases"],
|
|
"properties": {
|
|
"version": {
|
|
"type": "string",
|
|
"description": "Version of the conformance suite itself, in MAJOR.MINOR.PATCH form. Bumping it triggers coordinated SDK releases.",
|
|
"pattern": "^\\d+\\.\\d+\\.\\d+$"
|
|
},
|
|
"schema_version": {
|
|
"type": "string",
|
|
"description": "The pdftract output schema version this suite targets, in MAJOR.MINOR form.",
|
|
"pattern": "^\\d+\\.\\d+$"
|
|
},
|
|
"cases": {
|
|
"type": "array",
|
|
"description": "Array of conformance test cases.",
|
|
"items": {
|
|
"type": "object",
|
|
"required": ["id", "fixture", "method", "options", "expected"],
|
|
"properties": {
|
|
"id": {
|
|
"type": "string",
|
|
"description": "Unique identifier for this test case. Must be lowercase kebab-case: lowercase letters and digits separated by single hyphens.",
|
|
"pattern": "^[a-z0-9]+(-[a-z0-9]+)*$"
|
|
},
|
|
"fixture": {
|
|
"type": "string",
|
|
"description": "Path to the test fixture PDF, relative to the fixtures directory, or a remote URL."
|
|
},
|
|
"method": {
|
|
"type": "string",
|
|
"description": "The SDK method being tested.",
|
|
"enum": [
|
|
"extract",
|
|
"extract_text",
|
|
"extract_markdown",
|
|
"extract_stream",
|
|
"search",
|
|
"get_metadata",
|
|
"hash",
|
|
"classify",
|
|
"verify_receipt"
|
|
]
|
|
},
|
|
"options": {
|
|
"type": "object",
|
|
"description": "Options to pass to the method. Varies by method.",
|
|
"properties": {
|
|
"ocr_language": {
|
|
"type": "string",
|
|
"description": "ISO 639-3 language code for OCR."
|
|
},
|
|
"ocr_threshold": {
|
|
"type": "number",
|
|
"description": "Confidence threshold for OCR (0-1).",
|
|
"minimum": 0,
|
|
"maximum": 1
|
|
},
|
|
"preserve_layout": {
|
|
"type": "boolean",
|
|
"description": "Preserve original reading order and layout."
|
|
},
|
|
"extract_images": {
|
|
"type": "boolean",
|
|
"description": "Extract embedded images."
|
|
},
|
|
"image_format": {
|
|
"type": "string",
|
|
"description": "Format for extracted images.",
|
|
"enum": ["png", "jpg", "webp"]
|
|
},
|
|
"min_image_size": {
|
|
"type": "integer",
|
|
"description": "Minimum dimension for image extraction.",
|
|
"minimum": 1
|
|
},
|
|
"password": {
|
|
"type": "string",
|
|
"description": "Password for encrypted PDFs."
|
|
},
|
|
"timeout": {
|
|
"type": "integer",
|
|
"description": "Maximum number of seconds to wait for the operation.",
|
|
"minimum": 1
|
|
},
|
|
"max_pages": {
|
|
"type": "integer",
|
|
"description": "Maximum number of pages to process when streaming.",
|
|
"minimum": 1
|
|
},
|
|
"pattern": {
|
|
"type": "string",
|
|
"description": "Search pattern."
|
|
},
|
|
"case_insensitive": {
|
|
"type": "boolean",
|
|
"description": "Ignore case when matching."
|
|
},
|
|
"regex": {
|
|
"type": "boolean",
|
|
"description": "Treat pattern as regular expression."
|
|
},
|
|
"whole_word": {
|
|
"type": "boolean",
|
|
"description": "Match only whole words."
|
|
},
|
|
"max_results": {
|
|
"type": ["integer", "null"],
|
|
"description": "Maximum number of matches to return.",
|
|
"minimum": 1
|
|
},
|
|
"receipt": {
|
|
"type": "string",
|
|
"description": "Path to the receipt file for verify_receipt."
|
|
}
|
|
}
|
|
},
|
|
"expected": {
|
|
"type": "object",
|
|
"description": "Expected results. The structure varies by method. Nested fields are addressed with JSONPath-like syntax.",
|
|
"additionalProperties": true
|
|
},
|
|
"tolerances": {
|
|
"type": "object",
|
|
"description": "Per-field tolerances for numeric comparisons. Each key is a JSONPath-like field path, which may contain wildcards; each value gives the tolerance for the matching fields.",
|
|
"additionalProperties": {
|
|
"type": "object",
|
|
"properties": {
|
|
"abs": {
|
|
"type": "number",
|
|
"description": "Absolute tolerance."
|
|
},
|
|
"rel": {
|
|
"type": "number",
|
|
"description": "Relative tolerance (as a fraction, e.g., 0.01 for 1%)."
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"feature": {
|
|
"type": "string",
|
|
"description": "Feature tag for this test. SDKs without this feature may skip the test.",
|
|
"enum": [
|
|
"vector",
|
|
"ocr",
|
|
"decrypt",
|
|
"forms",
|
|
"mixed",
|
|
"large",
|
|
"unicode",
|
|
"vertical",
|
|
"math",
|
|
"tables",
|
|
"code",
|
|
"headings",
|
|
"stream",
|
|
"search",
|
|
"metadata",
|
|
"xmp",
|
|
"hash",
|
|
"classify",
|
|
"receipt",
|
|
"error-handling",
|
|
"remote"
|
|
]
|
|
},
|
|
"min_schema_version": {
|
|
"type": "string",
|
|
"description": "Minimum pdftract schema version required for this test, in MAJOR.MINOR form.",
|
|
"pattern": "^\\d+\\.\\d+$"
|
|
},
|
|
"skip_reason": {
|
|
"type": "string",
|
|
"description": "If present, this test is skipped. The value should explain why."
|
|
}
|
|
}
|
|
},
|
|
"minItems": 1
|
|
}
|
|
}
|
|
}
|