Add explicit enum constraints to page_type, severity, and confidence_source fields in the generated JSON Schema for better validation.

Changes:
- Modified xtask/src/bin/gen_schema.rs to add explicit enum constraints during schema generation via add_enum_constraints() function
- page_type enum: ["text", "scanned", "mixed", "broken_vector", "blank", "figure_only"]
- severity enum: ["info", "warning", "error", "fatal"]
- confidence_source enum: ["native", "heuristic", "ocr"]
- Regenerated docs/schema/v1.0/pdftract.schema.json with enum constraints
- Added .github/workflows/schema-gen.yml CI workflow for schema validation

The CI workflow validates:
1. Generated schema matches committed file (fails on diff)
2. JSON syntax is valid
3. Schema structure is correct ($id, $schema, title, $defs)
4. Enum constraints are present and have correct values

This ensures schema changes are reviewable in PRs and forces developers to commit the updated schema when type definitions change.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
110 lines
4.4 KiB
YAML
110 lines
4.4 KiB
YAML
# Checks that the committed JSON Schema matches what the generator produces
# and that it carries the expected top-level metadata and enum constraints.
name: Schema Generation Validation

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
  workflow_dispatch:

# Both jobs only read the repository, so the token gets read-only scope.
permissions:
  contents: read

env:
  # Single source of truth for the schema location used by every step below.
  SCHEMA_PATH: docs/schema/v1.0/pdftract.schema.json

jobs:
  # Regenerates the schema and fails when the result differs from the
  # committed file.
  validate-schema:
    name: Validate JSON Schema
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install Rust toolchain
        uses: dtolnay/rust-toolchain@stable
        with:
          components: rustfmt, clippy

      # One cache entry for registry, git sources and build output; the
      # restore key lets a changed Cargo.lock still start from a warm cache.
      # NOTE(review): if xtask is not a member of the root workspace its
      # build output lands in xtask/target instead of target - confirm.
      - name: Cache Cargo registry, git sources and build output
        uses: actions/cache@v4
        with:
          path: |
            ~/.cargo/registry
            ~/.cargo/git
            target
          key: ${{ runner.os }}-cargo-schema-${{ hashFiles('**/Cargo.lock') }}
          restore-keys: |
            ${{ runner.os }}-cargo-schema-

      - name: Generate JSON Schema
        run: cargo run --manifest-path=xtask/Cargo.toml --bin gen_schema

      - name: Check for schema changes
        id: check-diff
        shell: bash
        run: |
          # Register the file as intent-to-add so that a generated but never
          # committed schema shows up in `git diff`; a plain `git diff`
          # ignores untracked files. This is a no-op for a tracked file and
          # fails the step if the generator did not produce the file at all.
          git add --intent-to-add -- "$SCHEMA_PATH"

          if git diff --quiet -- "$SCHEMA_PATH"; then
            echo "Schema is up to date"
            echo "has_changes=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          echo "Schema has uncommitted changes"
          echo "has_changes=true" >> "$GITHUB_OUTPUT"
          {
            echo "### Schema changes detected :warning:"
            echo ""
            echo "The generated JSON schema differs from the committed file."
            echo ""
            echo '```diff'
            git diff -- "$SCHEMA_PATH"
            echo '```'
            echo ""
            echo "To fix this issue:"
            echo '1. Run `cargo run --manifest-path=xtask/Cargo.toml --bin gen_schema`'
            echo "2. Commit the updated schema file"
          } >> "$GITHUB_STEP_SUMMARY"
          exit 1

  # Validates the committed schema file itself; needs no Rust toolchain.
  validate-json-syntax:
    name: Validate JSON Syntax
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Validate JSON Schema
        run: |
          python3 -m json.tool "$SCHEMA_PATH" > /dev/null
          echo "Schema is valid JSON"

      - name: Validate schema structure
        run: |
          python3 << 'EOF'
          import json
          import os
          import sys

          with open(os.environ["SCHEMA_PATH"], encoding="utf-8") as f:
              schema = json.load(f)

          # Collect every failure instead of stopping at the first one, and
          # look keys up with .get() so that a missing key is reported with
          # its own message rather than as a bare KeyError.
          failures = []

          def check(ok, message):
              if not ok:
                  failures.append(message)

          # Verify required fields
          check(
              schema.get("$schema") == "https://json-schema.org/draft/2020-12/schema",
              "Missing or incorrect $schema",
          )
          check(
              schema.get("$id") == "https://pdftract.com/schema/v1.0/pdftract.schema.json",
              "Missing or incorrect $id",
          )
          check(
              schema.get("title") == "pdftract Output v1.0",
              "Missing or incorrect title",
          )

          # Verify $defs exist
          defs = schema.get("$defs")
          check(isinstance(defs, dict), "Missing $defs")
          if not isinstance(defs, dict):
              defs = {}
          for def_name in ("PageJson", "SpanJson", "DiagnosticJson"):
              check(def_name in defs, f"Missing {def_name} definition")

          # Verify enum constraints: (definition, property, label, values)
          expected_enums = [
              (
                  "PageJson",
                  "type",
                  "page_type",
                  {"text", "scanned", "mixed", "broken_vector", "blank", "figure_only"},
              ),
              (
                  "DiagnosticJson",
                  "severity",
                  "severity",
                  {"info", "warning", "error", "fatal"},
              ),
              (
                  "SpanJson",
                  "confidence_source",
                  "confidence_source",
                  {"native", "heuristic", "ocr"},
              ),
          ]
          for def_name, prop, label, expected in expected_enums:
              prop_schema = defs.get(def_name, {}).get("properties", {}).get(prop, {})
              if "enum" not in prop_schema:
                  failures.append(f"Missing enum constraint on {def_name}.{prop}")
                  continue
              check(
                  set(prop_schema["enum"]) == expected,
                  f"Incorrect {label} enum values",
              )

          if failures:
              sys.exit("Schema structure validation failed:\n- " + "\n- ".join(failures))

          print("All schema structure validations passed!")
          EOF