pdftract/tests/sdk-conformance/cases.json
jedarden a3178a3960 test(pdftract-1527): add shared SDK conformance suite with 32 test cases
Add tests/sdk-conformance/ containing the shared, language-neutral test
specification for all pdftract SDKs. The suite includes 32 cases covering
all 9 contract methods (extract, extract_text, extract_markdown,
extract_stream, search, get_metadata, hash, classify, verify_receipt)
across vector, scanned, encrypted, fillable-form, mixed, large, broken,
and remote PDFs.

- cases.json: 32 test cases with id, fixture, method, options, expected,
  tolerances, feature tags, and min_schema_version
- schema.json: JSON Schema (draft-07) for validating test case structure
- validate_suite.py: Validation script that checks structure and fixture
  existence
- fixtures/: Test PDFs organized by category (symlinks to classifier
  fixtures for shared files)

See notes/pdftract-1527.md for verification details.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-18 01:17:42 -04:00

610 lines
16 KiB
JSON

{
"version": "1.0.0",
"schema_version": "1.0",
"cases": [
{
"id": "extract-vector-scientific-paper",
"fixture": "scientific_paper/01.pdf",
"method": "extract",
"options": {
"ocr_language": "eng",
"ocr_threshold": 0.7,
"preserve_layout": false,
"extract_images": false
},
"expected": {
"schema_version": "1.0",
"metadata.page_count": 1,
"pages.length": 1,
"pages[0].page_index": 0,
"pages[0].width": {"min": 500, "max": 700},
"pages[0].height": {"min": 700, "max": 900},
"pages[0].rotation": 0,
"pages[0].spans.length": {"min": 1},
"pages[0].blocks.length": {"min": 1},
"pages[0].blocks[0].kind": "heading",
"errors.length": 0
},
"tolerances": {
"pages[*].blocks[*].bbox": {"abs": 0.5},
"pages[*].spans[*].bbox": {"abs": 0.5}
},
"feature": "vector",
"min_schema_version": "1.0"
},
{
"id": "extract-scanned-receipt",
"fixture": "misc/01.pdf",
"method": "extract",
"options": {
"ocr_language": "eng",
"ocr_threshold": 0.7,
"preserve_layout": false,
"extract_images": false
},
"expected": {
"schema_version": "1.0",
"metadata.page_count": 1,
"pages.length": 1,
"pages[0].page_index": 0,
"pages[0].page_type": "scanned",
"pages[0].spans.length": {"min": 1},
"pages[0].blocks.length": {"min": 1},
"pages[0].blocks[0].kind": "paragraph",
"errors.length": 0
},
"tolerances": {
"pages[*].blocks[*].bbox": {"abs": 1.0},
"pages[*].spans[*].bbox": {"abs": 1.0},
"pages[*].spans[*].confidence": {"abs": 0.2}
},
"feature": "ocr",
"min_schema_version": "1.0"
},
{
"id": "extract-encrypted-pdf",
"fixture": "encrypted/encrypted.pdf",
"method": "extract",
"options": {
"password": "test123",
"ocr_language": "eng",
"ocr_threshold": 0.7,
"preserve_layout": false,
"extract_images": false
},
"expected": {
"schema_version": "1.0",
"metadata.is_encrypted": true,
"pages.length": {"min": 1},
"errors.length": 0
},
"tolerances": {},
"feature": "decrypt",
"min_schema_version": "1.0"
},
{
"id": "extract-fillable-form",
"fixture": "fillable-form/form.pdf",
"method": "extract",
"options": {
"ocr_language": "eng",
"ocr_threshold": 0.7,
"preserve_layout": false,
"extract_images": false
},
"expected": {
"schema_version": "1.0",
"metadata.page_count": 1,
"form_fields.length": {"min": 1},
"pages.length": 1,
"errors.length": 0
},
"tolerances": {},
"feature": "forms",
"min_schema_version": "1.0"
},
{
"id": "extract-mixed-vector-scanned",
"fixture": "mixed/mixed.pdf",
"method": "extract",
"options": {
"ocr_language": "eng",
"ocr_threshold": 0.7,
"preserve_layout": false,
"extract_images": false
},
"expected": {
"schema_version": "1.0",
"metadata.page_count": {"min": 2},
"pages.length": {"min": 2},
"pages[0].page_type": "mixed",
"errors.length": 0
},
"tolerances": {},
"feature": "mixed",
"min_schema_version": "1.0"
},
{
"id": "extract-large-document",
"fixture": "large/100pages.pdf",
"method": "extract",
"options": {
"ocr_language": "eng",
"ocr_threshold": 0.7,
"preserve_layout": false,
"extract_images": false,
"timeout": 120
},
"expected": {
"schema_version": "1.0",
"metadata.page_count": 100,
"pages.length": 100,
"errors.length": 0
},
"tolerances": {},
"feature": "large",
"min_schema_version": "1.0"
},
{
"id": "extract-text-unicode-heavy",
"fixture": "scientific_paper/02.pdf",
"method": "extract_text",
"options": {
"ocr_language": "eng",
"ocr_threshold": 0.7,
"preserve_layout": false
},
"expected": {
"output_type": "string",
"min_length": 50,
"contains": ["Abstract", "Introduction"]
},
"tolerances": {},
"feature": "unicode",
"min_schema_version": "1.0"
},
{
"id": "extract-text-vertical-writing",
"fixture": "vertical/vertical.pdf",
"method": "extract_text",
"options": {
"ocr_language": "jpn",
"ocr_threshold": 0.7,
"preserve_layout": true
},
"expected": {
"output_type": "string",
"min_length": 10
},
"tolerances": {},
"feature": "vertical",
"min_schema_version": "1.0"
},
{
"id": "extract-text-math-content",
"fixture": "scientific_paper/03.pdf",
"method": "extract_text",
"options": {
"ocr_language": "eng",
"ocr_threshold": 0.7,
"preserve_layout": false
},
"expected": {
"output_type": "string",
"min_length": 100,
"contains": ["equation", "formula"]
},
"tolerances": {},
"feature": "math",
"min_schema_version": "1.0"
},
{
"id": "extract-markdown-table-heavy",
"fixture": "contract/01.pdf",
"method": "extract_markdown",
"options": {
"ocr_language": "eng",
"ocr_threshold": 0.7,
"preserve_layout": false
},
"expected": {
"output_type": "string",
"min_length": 100,
"contains": ["|", "AGREEMENT"]
},
"tolerances": {},
"feature": "tables",
"min_schema_version": "1.0"
},
{
"id": "extract-markdown-code-block",
"fixture": "code/code.pdf",
"method": "extract_markdown",
"options": {
"ocr_language": "eng",
"ocr_threshold": 0.7,
"preserve_layout": false
},
"expected": {
"output_type": "string",
"min_length": 50,
"contains": ["```", "function", "return"]
},
"tolerances": {},
"feature": "code",
"min_schema_version": "1.0"
},
{
"id": "extract-markdown-nested-heading",
"fixture": "scientific_paper/04.pdf",
"method": "extract_markdown",
"options": {
"ocr_language": "eng",
"ocr_threshold": 0.7,
"preserve_layout": false
},
"expected": {
"output_type": "string",
"min_length": 100,
"contains": ["#", "##", "###"]
},
"tolerances": {},
"feature": "headings",
"min_schema_version": "1.0"
},
{
"id": "extract-stream-page-at-a-time",
"fixture": "scientific_paper/05.pdf",
"method": "extract_stream",
"options": {
"ocr_language": "eng",
"ocr_threshold": 0.7,
"preserve_layout": false
},
"expected": {
"output_type": "iterator",
"frame_count": {"min": 3},
"first_frame_type": "header",
"last_frame_type": "footer",
"page_frames": {"min": 1}
},
"tolerances": {},
"feature": "stream",
"min_schema_version": "1.0"
},
{
"id": "extract-stream-cancellation",
"fixture": "large/50pages.pdf",
"method": "extract_stream",
"options": {
"ocr_language": "eng",
"ocr_threshold": 0.7,
"preserve_layout": false,
"max_pages": 5
},
"expected": {
"output_type": "iterator",
"page_frames": {"max": 6}
},
"tolerances": {},
"feature": "stream",
"min_schema_version": "1.0"
},
{
"id": "extract-stream-ndjson-format",
"fixture": "scientific_paper/06.pdf",
"method": "extract_stream",
"options": {
"ocr_language": "eng",
"ocr_threshold": 0.7,
"preserve_layout": false
},
"expected": {
"output_type": "iterator",
"frame_count": {"min": 3},
"header_frame_has_schema_version": true,
"header_frame_has_total_pages": true
},
"tolerances": {},
"feature": "stream",
"min_schema_version": "1.0"
},
{
"id": "search-literal-pattern",
"fixture": "scientific_paper/07.pdf",
"method": "search",
"options": {
"pattern": "Abstract",
"case_insensitive": false,
"regex": false,
"whole_word": false,
"max_results": null
},
"expected": {
"output_type": "iterator",
"min_matches": 1,
"first_match_page": 0,
"first_match_text": "Abstract"
},
"tolerances": {},
"feature": "search",
"min_schema_version": "1.0"
},
{
"id": "search-regex-pattern",
"fixture": "scientific_paper/08.pdf",
"method": "search",
"options": {
"pattern": "\\b\\d{4}\\b",
"case_insensitive": false,
"regex": true,
"whole_word": false,
"max_results": null
},
"expected": {
"output_type": "iterator",
"min_matches": 1
},
"tolerances": {},
"feature": "search",
"min_schema_version": "1.0"
},
{
"id": "search-case-insensitive",
"fixture": "invoice/01.pdf",
"method": "search",
"options": {
"pattern": "invoice",
"case_insensitive": true,
"regex": false,
"whole_word": false,
"max_results": null
},
"expected": {
"output_type": "iterator",
"min_matches": 1
},
"tolerances": {},
"feature": "search",
"min_schema_version": "1.0"
},
{
"id": "search-no-match",
"fixture": "scientific_paper/09.pdf",
"method": "search",
"options": {
"pattern": "nonexistent_pattern_xyz123",
"case_insensitive": false,
"regex": false,
"whole_word": false,
"max_results": null
},
"expected": {
"output_type": "iterator",
"match_count": 0
},
"tolerances": {},
"feature": "search",
"min_schema_version": "1.0"
},
{
"id": "get-metadata-complete",
"fixture": "scientific_paper/10.pdf",
"method": "get_metadata",
"options": {
"timeout": 30
},
"expected": {
"metadata.page_count": 1,
"metadata.has_title": true,
"metadata.has_author": true,
"metadata.has_creator": true
},
"tolerances": {},
"feature": "metadata",
"min_schema_version": "1.0"
},
{
"id": "get-metadata-minimal",
"fixture": "misc/02.pdf",
"method": "get_metadata",
"options": {
"timeout": 30
},
"expected": {
"metadata.page_count": 1,
"metadata.title": null,
"metadata.author": null
},
"tolerances": {},
"feature": "metadata",
"min_schema_version": "1.0"
},
{
"id": "get-metadata-xmp-only",
"fixture": "xmp/xmp-metadata.pdf",
"method": "get_metadata",
"options": {
"timeout": 30
},
"expected": {
"metadata.page_count": 1,
"metadata.has_xmp": true
},
"tolerances": {},
"feature": "xmp",
"min_schema_version": "1.0"
},
{
"id": "hash-same-file-same-hash",
"fixture": "scientific_paper/11.pdf",
"method": "hash",
"options": {
"timeout": 30
},
"expected": {
"hash_type": "sha256",
"hash.length": 64,
"page_count": 1,
"fast_hash.length": 64,
"fast_hash_different_from_hash": true
},
"tolerances": {},
"feature": "hash",
"min_schema_version": "1.0"
},
{
"id": "hash-content-stability",
"fixture": "scientific_paper/12.pdf",
"method": "hash",
"options": {
"timeout": 30
},
"expected": {
"hash_type": "sha256",
"hash.length": 64,
"content_hash_stable": true
},
"tolerances": {},
"feature": "hash",
"min_schema_version": "1.0"
},
{
"id": "classify-academic-paper",
"fixture": "scientific_paper/13.pdf",
"method": "classify",
"options": {},
"expected": {
"category": "scientific_paper",
"confidence": {"min": 0.7},
"tags.length": {"min": 1},
"heuristics.has_abstract": true,
"heuristics.has_references": true
},
"tolerances": {
"confidence": {"abs": 0.2}
},
"feature": "classify",
"min_schema_version": "1.0"
},
{
"id": "classify-scientific-paper",
"fixture": "scientific_paper/14.pdf",
"method": "classify",
"options": {},
"expected": {
"category": "scientific_paper",
"confidence": {"min": 0.7},
"tags.length": {"min": 1},
"heuristics.has_methods": true,
"heuristics.has_results": true
},
"tolerances": {
"confidence": {"abs": 0.2}
},
"feature": "classify",
"min_schema_version": "1.0"
},
{
"id": "classify-scanned-receipt",
"fixture": "misc/03.pdf",
"method": "classify",
"options": {},
"expected": {
"category": "receipt",
"confidence": {"min": 0.7},
"tags.length": {"min": 1},
"heuristics.is_scanned": true
},
"tolerances": {
"confidence": {"abs": 0.2}
},
"feature": "classify",
"min_schema_version": "1.0"
},
{
"id": "classify-fillable-form",
"fixture": "fillable-form/form.pdf",
"method": "classify",
"options": {},
"expected": {
"category": "form",
"confidence": {"min": 0.7},
"tags.length": {"min": 1},
"heuristics.has_form_fields": true
},
"tolerances": {
"confidence": {"abs": 0.2}
},
"feature": "classify",
"min_schema_version": "1.0"
},
{
"id": "verify-receipt-valid",
"fixture": "receipts/valid-receipt.pdf",
"method": "verify_receipt",
"options": {
"receipt": "receipts/valid-receipt.receipt.json"
},
"expected": {
"valid": true
},
"tolerances": {},
"feature": "receipt",
"min_schema_version": "1.0"
},
{
"id": "verify-receipt-tampered",
"fixture": "receipts/tampered-receipt.pdf",
"method": "verify_receipt",
"options": {
"receipt": "receipts/tampered-receipt.receipt.json"
},
"expected": {
"valid": false
},
"tolerances": {},
"feature": "receipt",
"min_schema_version": "1.0"
},
{
"id": "extract-broken-pdf",
"fixture": "broken/corrupt.pdf",
"method": "extract",
"options": {
"ocr_language": "eng",
"ocr_threshold": 0.7,
"preserve_layout": false,
"extract_images": false
},
"expected": {
"errors.length": {"min": 1},
"errors[0].severity": "error"
},
"tolerances": {},
"feature": "error-handling",
"min_schema_version": "1.0"
},
{
"id": "extract-remote-pdf",
"fixture": "https://arxiv.org/pdf/2201.00001.pdf",
"method": "extract",
"options": {
"ocr_language": "eng",
"ocr_threshold": 0.7,
"preserve_layout": false,
"extract_images": false,
"timeout": 60
},
"expected": {
"schema_version": "1.0",
"metadata.page_count": {"min": 1},
"pages.length": {"min": 1},
"errors.length": 0
},
"tolerances": {},
"feature": "remote",
"min_schema_version": "1.0"
}
]
}