Add tests/sdk-conformance/ containing the shared, language-neutral test specification for all pdftract SDKs.

The suite includes 32 cases covering all 9 contract methods (extract, extract_text, extract_markdown, extract_stream, search, get_metadata, hash, classify, verify_receipt) across vector, scanned, encrypted, fillable-form, mixed, large, broken, and remote PDFs.

- cases.json: 32 test cases with id, fixture, method, options, expected, tolerances, feature tags, and min_schema_version
- schema.json: JSON Schema (draft-07) for validating test case structure
- validate_suite.py: Validation script that checks structure and fixture existence
- fixtures/: Test PDFs organized by category (symlinks to classifier fixtures for shared files)

See notes/pdftract-1527.md for verification details.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
610 lines
16 KiB
JSON
610 lines
16 KiB
JSON
{
|
|
"version": "1.0.0",
|
|
"schema_version": "1.0",
|
|
"cases": [
|
|
{
|
|
"id": "extract-vector-scientific-paper",
|
|
"fixture": "scientific_paper/01.pdf",
|
|
"method": "extract",
|
|
"options": {
|
|
"ocr_language": "eng",
|
|
"ocr_threshold": 0.7,
|
|
"preserve_layout": false,
|
|
"extract_images": false
|
|
},
|
|
"expected": {
|
|
"schema_version": "1.0",
|
|
"metadata.page_count": 1,
|
|
"pages.length": 1,
|
|
"pages[0].page_index": 0,
|
|
"pages[0].width": {"min": 500, "max": 700},
|
|
"pages[0].height": {"min": 700, "max": 900},
|
|
"pages[0].rotation": 0,
|
|
"pages[0].spans.length": {"min": 1},
|
|
"pages[0].blocks.length": {"min": 1},
|
|
"pages[0].blocks[0].kind": "heading",
|
|
"errors.length": 0
|
|
},
|
|
"tolerances": {
|
|
"pages[*].blocks[*].bbox": {"abs": 0.5},
|
|
"pages[*].spans[*].bbox": {"abs": 0.5}
|
|
},
|
|
"feature": "vector",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "extract-scanned-receipt",
|
|
"fixture": "misc/01.pdf",
|
|
"method": "extract",
|
|
"options": {
|
|
"ocr_language": "eng",
|
|
"ocr_threshold": 0.7,
|
|
"preserve_layout": false,
|
|
"extract_images": false
|
|
},
|
|
"expected": {
|
|
"schema_version": "1.0",
|
|
"metadata.page_count": 1,
|
|
"pages.length": 1,
|
|
"pages[0].page_index": 0,
|
|
"pages[0].page_type": "scanned",
|
|
"pages[0].spans.length": {"min": 1},
|
|
"pages[0].blocks.length": {"min": 1},
|
|
"pages[0].blocks[0].kind": "paragraph",
|
|
"errors.length": 0
|
|
},
|
|
"tolerances": {
|
|
"pages[*].blocks[*].bbox": {"abs": 1.0},
|
|
"pages[*].spans[*].bbox": {"abs": 1.0},
|
|
"pages[*].spans[*].confidence": {"abs": 0.2}
|
|
},
|
|
"feature": "ocr",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "extract-encrypted-pdf",
|
|
"fixture": "encrypted/encrypted.pdf",
|
|
"method": "extract",
|
|
"options": {
|
|
"password": "test123",
|
|
"ocr_language": "eng",
|
|
"ocr_threshold": 0.7,
|
|
"preserve_layout": false,
|
|
"extract_images": false
|
|
},
|
|
"expected": {
|
|
"schema_version": "1.0",
|
|
"metadata.is_encrypted": true,
|
|
"pages.length": {"min": 1},
|
|
"errors.length": 0
|
|
},
|
|
"tolerances": {},
|
|
"feature": "decrypt",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "extract-fillable-form",
|
|
"fixture": "fillable-form/form.pdf",
|
|
"method": "extract",
|
|
"options": {
|
|
"ocr_language": "eng",
|
|
"ocr_threshold": 0.7,
|
|
"preserve_layout": false,
|
|
"extract_images": false
|
|
},
|
|
"expected": {
|
|
"schema_version": "1.0",
|
|
"metadata.page_count": 1,
|
|
"form_fields.length": {"min": 1},
|
|
"pages.length": 1,
|
|
"errors.length": 0
|
|
},
|
|
"tolerances": {},
|
|
"feature": "forms",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "extract-mixed-vector-scanned",
|
|
"fixture": "mixed/mixed.pdf",
|
|
"method": "extract",
|
|
"options": {
|
|
"ocr_language": "eng",
|
|
"ocr_threshold": 0.7,
|
|
"preserve_layout": false,
|
|
"extract_images": false
|
|
},
|
|
"expected": {
|
|
"schema_version": "1.0",
|
|
"metadata.page_count": {"min": 2},
|
|
"pages.length": {"min": 2},
|
|
"pages[0].page_type": "mixed",
|
|
"errors.length": 0
|
|
},
|
|
"tolerances": {},
|
|
"feature": "mixed",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "extract-large-document",
|
|
"fixture": "large/100pages.pdf",
|
|
"method": "extract",
|
|
"options": {
|
|
"ocr_language": "eng",
|
|
"ocr_threshold": 0.7,
|
|
"preserve_layout": false,
|
|
"extract_images": false,
|
|
"timeout": 120
|
|
},
|
|
"expected": {
|
|
"schema_version": "1.0",
|
|
"metadata.page_count": 100,
|
|
"pages.length": 100,
|
|
"errors.length": 0
|
|
},
|
|
"tolerances": {},
|
|
"feature": "large",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "extract-text-unicode-heavy",
|
|
"fixture": "scientific_paper/02.pdf",
|
|
"method": "extract_text",
|
|
"options": {
|
|
"ocr_language": "eng",
|
|
"ocr_threshold": 0.7,
|
|
"preserve_layout": false
|
|
},
|
|
"expected": {
|
|
"output_type": "string",
|
|
"min_length": 50,
|
|
"contains": ["Abstract", "Introduction"]
|
|
},
|
|
"tolerances": {},
|
|
"feature": "unicode",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "extract-text-vertical-writing",
|
|
"fixture": "vertical/vertical.pdf",
|
|
"method": "extract_text",
|
|
"options": {
|
|
"ocr_language": "jpn",
|
|
"ocr_threshold": 0.7,
|
|
"preserve_layout": true
|
|
},
|
|
"expected": {
|
|
"output_type": "string",
|
|
"min_length": 10
|
|
},
|
|
"tolerances": {},
|
|
"feature": "vertical",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "extract-text-math-content",
|
|
"fixture": "scientific_paper/03.pdf",
|
|
"method": "extract_text",
|
|
"options": {
|
|
"ocr_language": "eng",
|
|
"ocr_threshold": 0.7,
|
|
"preserve_layout": false
|
|
},
|
|
"expected": {
|
|
"output_type": "string",
|
|
"min_length": 100,
|
|
"contains": ["equation", "formula"]
|
|
},
|
|
"tolerances": {},
|
|
"feature": "math",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "extract-markdown-table-heavy",
|
|
"fixture": "contract/01.pdf",
|
|
"method": "extract_markdown",
|
|
"options": {
|
|
"ocr_language": "eng",
|
|
"ocr_threshold": 0.7,
|
|
"preserve_layout": false
|
|
},
|
|
"expected": {
|
|
"output_type": "string",
|
|
"min_length": 100,
|
|
"contains": ["|", "AGREEMENT"]
|
|
},
|
|
"tolerances": {},
|
|
"feature": "tables",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "extract-markdown-code-block",
|
|
"fixture": "code/code.pdf",
|
|
"method": "extract_markdown",
|
|
"options": {
|
|
"ocr_language": "eng",
|
|
"ocr_threshold": 0.7,
|
|
"preserve_layout": false
|
|
},
|
|
"expected": {
|
|
"output_type": "string",
|
|
"min_length": 50,
|
|
"contains": ["```", "function", "return"]
|
|
},
|
|
"tolerances": {},
|
|
"feature": "code",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "extract-markdown-nested-heading",
|
|
"fixture": "scientific_paper/04.pdf",
|
|
"method": "extract_markdown",
|
|
"options": {
|
|
"ocr_language": "eng",
|
|
"ocr_threshold": 0.7,
|
|
"preserve_layout": false
|
|
},
|
|
"expected": {
|
|
"output_type": "string",
|
|
"min_length": 100,
|
|
"contains": ["#", "##", "###"]
|
|
},
|
|
"tolerances": {},
|
|
"feature": "headings",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "extract-stream-page-at-a-time",
|
|
"fixture": "scientific_paper/05.pdf",
|
|
"method": "extract_stream",
|
|
"options": {
|
|
"ocr_language": "eng",
|
|
"ocr_threshold": 0.7,
|
|
"preserve_layout": false
|
|
},
|
|
"expected": {
|
|
"output_type": "iterator",
|
|
"frame_count": {"min": 3},
|
|
"first_frame_type": "header",
|
|
"last_frame_type": "footer",
|
|
"page_frames": {"min": 1}
|
|
},
|
|
"tolerances": {},
|
|
"feature": "stream",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "extract-stream-cancellation",
|
|
"fixture": "large/50pages.pdf",
|
|
"method": "extract_stream",
|
|
"options": {
|
|
"ocr_language": "eng",
|
|
"ocr_threshold": 0.7,
|
|
"preserve_layout": false,
|
|
"max_pages": 5
|
|
},
|
|
"expected": {
|
|
"output_type": "iterator",
|
|
"page_frames": {"max": 6}
|
|
},
|
|
"tolerances": {},
|
|
"feature": "stream",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "extract-stream-ndjson-format",
|
|
"fixture": "scientific_paper/06.pdf",
|
|
"method": "extract_stream",
|
|
"options": {
|
|
"ocr_language": "eng",
|
|
"ocr_threshold": 0.7,
|
|
"preserve_layout": false
|
|
},
|
|
"expected": {
|
|
"output_type": "iterator",
|
|
"frame_count": {"min": 3},
|
|
"header_frame_has_schema_version": true,
|
|
"header_frame_has_total_pages": true
|
|
},
|
|
"tolerances": {},
|
|
"feature": "stream",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "search-literal-pattern",
|
|
"fixture": "scientific_paper/07.pdf",
|
|
"method": "search",
|
|
"options": {
|
|
"pattern": "Abstract",
|
|
"case_insensitive": false,
|
|
"regex": false,
|
|
"whole_word": false,
|
|
"max_results": null
|
|
},
|
|
"expected": {
|
|
"output_type": "iterator",
|
|
"min_matches": 1,
|
|
"first_match_page": 0,
|
|
"first_match_text": "Abstract"
|
|
},
|
|
"tolerances": {},
|
|
"feature": "search",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "search-regex-pattern",
|
|
"fixture": "scientific_paper/08.pdf",
|
|
"method": "search",
|
|
"options": {
|
|
"pattern": "\\b\\d{4}\\b",
|
|
"case_insensitive": false,
|
|
"regex": true,
|
|
"whole_word": false,
|
|
"max_results": null
|
|
},
|
|
"expected": {
|
|
"output_type": "iterator",
|
|
"min_matches": 1
|
|
},
|
|
"tolerances": {},
|
|
"feature": "search",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "search-case-insensitive",
|
|
"fixture": "invoice/01.pdf",
|
|
"method": "search",
|
|
"options": {
|
|
"pattern": "invoice",
|
|
"case_insensitive": true,
|
|
"regex": false,
|
|
"whole_word": false,
|
|
"max_results": null
|
|
},
|
|
"expected": {
|
|
"output_type": "iterator",
|
|
"min_matches": 1
|
|
},
|
|
"tolerances": {},
|
|
"feature": "search",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "search-no-match",
|
|
"fixture": "scientific_paper/09.pdf",
|
|
"method": "search",
|
|
"options": {
|
|
"pattern": "nonexistent_pattern_xyz123",
|
|
"case_insensitive": false,
|
|
"regex": false,
|
|
"whole_word": false,
|
|
"max_results": null
|
|
},
|
|
"expected": {
|
|
"output_type": "iterator",
|
|
"match_count": 0
|
|
},
|
|
"tolerances": {},
|
|
"feature": "search",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "get-metadata-complete",
|
|
"fixture": "scientific_paper/10.pdf",
|
|
"method": "get_metadata",
|
|
"options": {
|
|
"timeout": 30
|
|
},
|
|
"expected": {
|
|
"metadata.page_count": 1,
|
|
"metadata.has_title": true,
|
|
"metadata.has_author": true,
|
|
"metadata.has_creator": true
|
|
},
|
|
"tolerances": {},
|
|
"feature": "metadata",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "get-metadata-minimal",
|
|
"fixture": "misc/02.pdf",
|
|
"method": "get_metadata",
|
|
"options": {
|
|
"timeout": 30
|
|
},
|
|
"expected": {
|
|
"metadata.page_count": 1,
|
|
"metadata.title": null,
|
|
"metadata.author": null
|
|
},
|
|
"tolerances": {},
|
|
"feature": "metadata",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "get-metadata-xmp-only",
|
|
"fixture": "xmp/xmp-metadata.pdf",
|
|
"method": "get_metadata",
|
|
"options": {
|
|
"timeout": 30
|
|
},
|
|
"expected": {
|
|
"metadata.page_count": 1,
|
|
"metadata.has_xmp": true
|
|
},
|
|
"tolerances": {},
|
|
"feature": "xmp",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "hash-same-file-same-hash",
|
|
"fixture": "scientific_paper/11.pdf",
|
|
"method": "hash",
|
|
"options": {
|
|
"timeout": 30
|
|
},
|
|
"expected": {
|
|
"hash_type": "sha256",
|
|
"hash.length": 64,
|
|
"page_count": 1,
|
|
"fast_hash.length": 64,
|
|
"fast_hash_different_from_hash": true
|
|
},
|
|
"tolerances": {},
|
|
"feature": "hash",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "hash-content-stability",
|
|
"fixture": "scientific_paper/12.pdf",
|
|
"method": "hash",
|
|
"options": {
|
|
"timeout": 30
|
|
},
|
|
"expected": {
|
|
"hash_type": "sha256",
|
|
"hash.length": 64,
|
|
"content_hash_stable": true
|
|
},
|
|
"tolerances": {},
|
|
"feature": "hash",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "classify-academic-paper",
|
|
"fixture": "scientific_paper/13.pdf",
|
|
"method": "classify",
|
|
"options": {},
|
|
"expected": {
|
|
"category": "scientific_paper",
|
|
"confidence": {"min": 0.7},
|
|
"tags.length": {"min": 1},
|
|
"heuristics.has_abstract": true,
|
|
"heuristics.has_references": true
|
|
},
|
|
"tolerances": {
|
|
"confidence": {"abs": 0.2}
|
|
},
|
|
"feature": "classify",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "classify-scientific-paper",
|
|
"fixture": "scientific_paper/14.pdf",
|
|
"method": "classify",
|
|
"options": {},
|
|
"expected": {
|
|
"category": "scientific_paper",
|
|
"confidence": {"min": 0.7},
|
|
"tags.length": {"min": 1},
|
|
"heuristics.has_methods": true,
|
|
"heuristics.has_results": true
|
|
},
|
|
"tolerances": {
|
|
"confidence": {"abs": 0.2}
|
|
},
|
|
"feature": "classify",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "classify-scanned-receipt",
|
|
"fixture": "misc/03.pdf",
|
|
"method": "classify",
|
|
"options": {},
|
|
"expected": {
|
|
"category": "receipt",
|
|
"confidence": {"min": 0.7},
|
|
"tags.length": {"min": 1},
|
|
"heuristics.is_scanned": true
|
|
},
|
|
"tolerances": {
|
|
"confidence": {"abs": 0.2}
|
|
},
|
|
"feature": "classify",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "classify-fillable-form",
|
|
"fixture": "fillable-form/form.pdf",
|
|
"method": "classify",
|
|
"options": {},
|
|
"expected": {
|
|
"category": "form",
|
|
"confidence": {"min": 0.7},
|
|
"tags.length": {"min": 1},
|
|
"heuristics.has_form_fields": true
|
|
},
|
|
"tolerances": {
|
|
"confidence": {"abs": 0.2}
|
|
},
|
|
"feature": "classify",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "verify-receipt-valid",
|
|
"fixture": "receipts/valid-receipt.pdf",
|
|
"method": "verify_receipt",
|
|
"options": {
|
|
"receipt": "receipts/valid-receipt.receipt.json"
|
|
},
|
|
"expected": {
|
|
"valid": true
|
|
},
|
|
"tolerances": {},
|
|
"feature": "receipt",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "verify-receipt-tampered",
|
|
"fixture": "receipts/tampered-receipt.pdf",
|
|
"method": "verify_receipt",
|
|
"options": {
|
|
"receipt": "receipts/tampered-receipt.receipt.json"
|
|
},
|
|
"expected": {
|
|
"valid": false
|
|
},
|
|
"tolerances": {},
|
|
"feature": "receipt",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "extract-broken-pdf",
|
|
"fixture": "broken/corrupt.pdf",
|
|
"method": "extract",
|
|
"options": {
|
|
"ocr_language": "eng",
|
|
"ocr_threshold": 0.7,
|
|
"preserve_layout": false,
|
|
"extract_images": false
|
|
},
|
|
"expected": {
|
|
"errors.length": {"min": 1},
|
|
"errors[0].severity": "error"
|
|
},
|
|
"tolerances": {},
|
|
"feature": "error-handling",
|
|
"min_schema_version": "1.0"
|
|
},
|
|
{
|
|
"id": "extract-remote-pdf",
|
|
"fixture": "https://arxiv.org/pdf/2201.00001.pdf",
|
|
"method": "extract",
|
|
"options": {
|
|
"ocr_language": "eng",
|
|
"ocr_threshold": 0.7,
|
|
"preserve_layout": false,
|
|
"extract_images": false,
|
|
"timeout": 60
|
|
},
|
|
"expected": {
|
|
"schema_version": "1.0",
|
|
"metadata.page_count": {"min": 1},
|
|
"pages.length": {"min": 1},
|
|
"errors.length": 0
|
|
},
|
|
"tolerances": {},
|
|
"feature": "remote",
|
|
"min_schema_version": "1.0"
|
|
}
|
|
]
|
|
}
|