Add tests/sdk-conformance/ containing the shared, language-neutral test specification for all pdftract SDKs. The suite includes 32 cases covering all 9 contract methods (extract, extract_text, extract_markdown, extract_stream, search, get_metadata, hash, classify, verify_receipt) across vector, scanned, encrypted, fillable-form, mixed, large, broken, and remote PDFs.

- cases.json: 32 test cases with id, fixture, method, options, expected, tolerances, feature tags, and min_schema_version
- schema.json: JSON Schema (draft-07) for validating test case structure
- validate_suite.py: validation script that checks structure and fixture existence
- fixtures/: test PDFs organized by category (symlinks to classifier fixtures for shared files)

See notes/pdftract-1527.md for verification details.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
114 lines
4 KiB
Python
Executable file
114 lines
4 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""Validate the SDK conformance suite against its schema."""
|
|
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
def validate_schema_structure(cases, *, min_cases=30):
    """Check the basic structure of a parsed conformance suite.

    This is a hand-rolled structural check so the script does not need the
    third-party ``jsonschema`` package. Malformed input is always reported
    through the return value rather than by raising.

    Args:
        cases: The parsed contents of the suite file. Expected to be a dict
            with ``version``, ``schema_version`` and ``cases`` keys, where
            ``cases`` is a list of per-case dicts.
        min_cases: Minimum number of entries required in ``cases["cases"]``.

    Returns:
        A ``(valid, error)`` tuple. ``valid`` is True and ``error`` is ``""``
        when every check passes; otherwise ``valid`` is False and ``error``
        describes the first problem found.
    """
    if not isinstance(cases, dict):
        return False, "Top-level value must be an object"

    for field in ("version", "schema_version", "cases"):
        if field not in cases:
            return False, f"Missing required top-level field: {field}"

    case_list = cases["cases"]
    if not isinstance(case_list, list):
        return False, "cases must be an array"

    if len(case_list) < min_cases:
        return False, f"Expected at least {min_cases} cases, got {len(case_list)}"

    valid_methods = {
        "extract", "extract_text", "extract_markdown", "extract_stream",
        "search", "get_metadata", "hash", "classify", "verify_receipt",
    }

    valid_features = {
        "vector", "ocr", "decrypt", "forms", "mixed", "large",
        "unicode", "vertical", "math", "tables", "code", "headings",
        "stream", "search", "metadata", "xmp", "hash", "classify",
        "receipt", "error-handling", "remote",
    }

    required_case_fields = ("id", "fixture", "method", "options", "expected")

    for i, case in enumerate(case_list):
        if not isinstance(case, dict):
            return False, f"Case {i}: must be an object"

        for field in required_case_fields:
            if field not in case:
                return False, f"Case {i}: Missing required field: {field}"

        # The fixture is later treated as a relative path or URL string.
        if not isinstance(case["fixture"], str):
            return False, f"Case {i}: fixture must be a string"

        # The isinstance guard keeps unhashable JSON values (lists, objects)
        # from raising TypeError in the set membership test.
        method = case["method"]
        if not isinstance(method, str) or method not in valid_methods:
            return False, f"Case {i}: Invalid method: {method}"

        if "feature" in case:
            feature = case["feature"]
            if not isinstance(feature, str) or feature not in valid_features:
                return False, f"Case {i}: Invalid feature: {feature}"

        if "min_schema_version" in case:
            if not isinstance(case["min_schema_version"], str):
                return False, f"Case {i}: min_schema_version must be a string"

        if not isinstance(case["options"], dict):
            return False, f"Case {i}: options must be an object"

        if not isinstance(case["expected"], dict):
            return False, f"Case {i}: expected must be an object"

        if "tolerances" in case and not isinstance(case["tolerances"], dict):
            return False, f"Case {i}: tolerances must be an object"

    return True, ""
|
|
|
|
def main():
    """Validate ``cases.json`` next to this script and print a summary.

    Loads the suite, runs the structural validation, rejects duplicate case
    IDs, reports local fixtures missing under ``fixtures/`` (as a warning
    only), and prints per-method and per-feature case counts.

    Exits with status 1 when the suite cannot be loaded, fails structural
    validation, or contains duplicate case IDs.
    """
    script_dir = Path(__file__).parent
    cases_path = script_dir / "cases.json"

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    try:
        with open(cases_path, encoding="utf-8") as f:
            cases = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Validation failed: could not load {cases_path}: {exc}")
        sys.exit(1)

    valid, error = validate_schema_structure(cases)
    if not valid:
        print(f"Validation failed: {error}")
        sys.exit(1)

    case_list = cases["cases"]

    # Check for duplicate case IDs
    case_ids = [case["id"] for case in case_list]
    duplicates = {
        case_id for case_id in case_ids if case_ids.count(case_id) > 1
    }
    if duplicates:
        print(f"Error: Duplicate case IDs: {duplicates}")
        sys.exit(1)

    # Verify fixtures exist
    fixtures_dir = script_dir / "fixtures"
    missing_fixtures = []
    for case in case_list:
        fixture = case["fixture"]
        if fixture.startswith(("http://", "https://")):
            continue  # Skip remote URLs
        if not (fixtures_dir / fixture).exists():
            missing_fixtures.append(fixture)

    # Missing fixtures are reported but do not fail the run.
    if missing_fixtures:
        print(f"Warning: {len(missing_fixtures)} fixture(s) not found:")
        for fixture in missing_fixtures[:5]:  # Show first 5
            print(f" - {fixture}")
        if len(missing_fixtures) > 5:
            print(f" ... and {len(missing_fixtures) - 5} more")

    print(f"Validation passed: {len(case_list)} test cases")
    print("Methods covered:")
    methods = {}
    for case in case_list:
        methods[case["method"]] = methods.get(case["method"], 0) + 1
    for method, count in sorted(methods.items()):
        print(f" {method}: {count}")

    print("\nFeatures covered:")
    features = {}
    for case in case_list:
        # Cases without a feature tag are grouped under "general".
        feat = case.get("feature", "general")
        features[feat] = features.get(feat, 0) + 1
    for feature, count in sorted(features.items()):
        print(f" {feature}: {count}")
|
|
|
|
# Run the validator only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|