pdftract/tests/sdk-conformance/validate_suite.py
jedarden a3178a3960 test(pdftract-1527): add shared SDK conformance suite with 32 test cases
Add tests/sdk-conformance/ containing the shared, language-neutral test
specification for all pdftract SDKs. The suite includes 32 cases covering
all 9 contract methods (extract, extract_text, extract_markdown,
extract_stream, search, get_metadata, hash, classify, verify_receipt)
across vector, scanned, encrypted, fillable-form, mixed, large, broken,
and remote PDFs.

- cases.json: 32 test cases with id, fixture, method, options, expected,
  tolerances, feature tags, and min_schema_version
- schema.json: JSON Schema (draft-07) for validating test case structure
- validate_suite.py: Validation script that checks structure and fixture
  existence
- fixtures/: Test PDFs organized by category (symlinks to classifier
  fixtures for shared files)

See notes/pdftract-1527.md for verification details.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-18 01:17:42 -04:00

114 lines
4 KiB
Python
Executable file

#!/usr/bin/env python3
"""Validate the SDK conformance suite against its schema."""
import json
import sys
from pathlib import Path
def validate_schema_structure(cases, *, min_cases=30):
    """Check the structure of a parsed conformance suite without jsonschema.

    Validation stops at the first problem found. Malformed input (wrong JSON
    types anywhere in the document) is reported through the return value
    rather than by raising.

    Args:
        cases: The parsed suite document; expected to be a dict with the
            keys "version", "schema_version" and "cases".
        min_cases: Minimum number of entries required in ``cases["cases"]``.

    Returns:
        A ``(valid, error)`` tuple: ``(True, "")`` when the structure is
        acceptable, otherwise ``(False, message)`` describing the first
        violation encountered.
    """
    # A non-object document (e.g. JSON ``null`` or a number) would otherwise
    # raise TypeError on the membership tests below.
    if not isinstance(cases, dict):
        return False, "Top-level JSON value must be an object"

    for field in ("version", "schema_version", "cases"):
        if field not in cases:
            return False, f"Missing required top-level field: {field}"

    case_list = cases["cases"]
    if not isinstance(case_list, list):
        return False, "cases must be an array"

    if len(case_list) < min_cases:
        return False, f"Expected at least {min_cases} cases, got {len(case_list)}"

    valid_methods = frozenset({
        "extract", "extract_text", "extract_markdown", "extract_stream",
        "search", "get_metadata", "hash", "classify", "verify_receipt",
    })
    valid_features = frozenset({
        "vector", "ocr", "decrypt", "forms", "mixed", "large",
        "unicode", "vertical", "math", "tables", "code", "headings",
        "stream", "search", "metadata", "xmp", "hash", "classify",
        "receipt", "error-handling", "remote",
    })
    required_case_fields = ("id", "fixture", "method", "options", "expected")

    for i, case in enumerate(case_list):
        # ``field not in case`` on a string would be a substring test and on
        # a number would raise, so insist on an object first.
        if not isinstance(case, dict):
            return False, f"Case {i}: must be an object"

        for field in required_case_fields:
            if field not in case:
                return False, f"Case {i}: Missing required field: {field}"

        # The isinstance checks keep unhashable JSON values (lists, objects)
        # from raising TypeError in the set membership tests.
        method = case["method"]
        if not isinstance(method, str) or method not in valid_methods:
            return False, f"Case {i}: Invalid method: {method}"

        if "feature" in case:
            feature = case["feature"]
            if not isinstance(feature, str) or feature not in valid_features:
                return False, f"Case {i}: Invalid feature: {feature}"

        if "min_schema_version" in case and not isinstance(
            case["min_schema_version"], str
        ):
            return False, f"Case {i}: min_schema_version must be a string"

        if not isinstance(case["options"], dict):
            return False, f"Case {i}: options must be an object"

        if not isinstance(case["expected"], dict):
            return False, f"Case {i}: expected must be an object"

        if "tolerances" in case and not isinstance(case["tolerances"], dict):
            return False, f"Case {i}: tolerances must be an object"

    return True, ""
def main():
    """Validate cases.json next to this script and print a coverage summary.

    Exits with status 1 when the suite file cannot be read or parsed, when
    structural validation fails, or when two cases share an ID. Local fixtures
    that are missing on disk are reported as a warning only; remote
    (http/https) fixtures are not checked.
    """
    script_dir = Path(__file__).parent
    cases_path = script_dir / "cases.json"

    # Report an unreadable or malformed suite as a validation failure instead
    # of letting a traceback escape. JSON is UTF-8 by specification, so do not
    # depend on the locale's default encoding.
    try:
        with open(cases_path, encoding="utf-8") as f:
            cases = json.load(f)
    except OSError as exc:
        print(f"Validation failed: cannot read {cases_path}: {exc}")
        sys.exit(1)
    except ValueError as exc:
        # json.JSONDecodeError and UnicodeDecodeError both derive from ValueError.
        print(f"Validation failed: {cases_path} is not valid JSON: {exc}")
        sys.exit(1)

    valid, error = validate_schema_structure(cases)
    if not valid:
        print(f"Validation failed: {error}")
        sys.exit(1)

    suite = cases["cases"]

    # Check for duplicate case IDs in a single pass.
    seen_ids = set()
    duplicates = set()
    for case in suite:
        case_id = case["id"]
        if case_id in seen_ids:
            duplicates.add(case_id)
        else:
            seen_ids.add(case_id)
    if duplicates:
        print(f"Error: Duplicate case IDs: {duplicates}")
        sys.exit(1)

    # Verify fixtures exist
    fixtures_dir = script_dir / "fixtures"
    missing_fixtures = []
    for case in suite:
        fixture = case["fixture"]
        if fixture.startswith(("http://", "https://")):
            continue  # Skip remote URLs
        if not (fixtures_dir / fixture).exists():
            missing_fixtures.append(fixture)

    # Missing fixtures are deliberately non-fatal: only a warning is printed.
    if missing_fixtures:
        print(f"Warning: {len(missing_fixtures)} fixture(s) not found:")
        for fixture in missing_fixtures[:5]:  # Show first 5
            print(f" - {fixture}")
        if len(missing_fixtures) > 5:
            print(f" ... and {len(missing_fixtures) - 5} more")

    print(f"Validation passed: {len(suite)} test cases")

    print("Methods covered:")
    methods = {}
    for case in suite:
        methods[case["method"]] = methods.get(case["method"], 0) + 1
    for method, count in sorted(methods.items()):
        print(f" {method}: {count}")

    print("\nFeatures covered:")
    features = {}
    for case in suite:
        # Cases without a feature tag are counted under "general".
        feat = case.get("feature", "general")
        features[feat] = features.get(feat, 0) + 1
    for feature, count in sorted(features.items()):
        print(f" {feature}: {count}")
# Run the validation only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()