# Verifies that the committed JSON schema is (a) identical to what the
# generator produces and (b) well-formed with the expected structure.
name: Schema Generation Validation

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
  workflow_dispatch:

# Both jobs only read the checked-out repository, so the token is restricted
# to read-only access.
permissions:
  contents: read

jobs:
  # Regenerates the schema and fails if the result differs from the file
  # committed in the repository.
  validate-schema:
    name: Validate JSON Schema
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install Rust toolchain
        uses: dtolnay/rust-toolchain@stable
        with:
          components: rustfmt, clippy

      # The three caches below are all keyed on the hash of every Cargo.lock
      # in the repository, so they are invalidated together when
      # dependencies change.
      - name: Cache Cargo registry
        uses: actions/cache@v4
        with:
          path: ~/.cargo/registry
          key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}

      - name: Cache Cargo index
        uses: actions/cache@v4
        with:
          path: ~/.cargo/git
          key: ${{ runner.os }}-cargo-index-${{ hashFiles('**/Cargo.lock') }}

      - name: Cache Cargo build
        uses: actions/cache@v4
        with:
          path: target
          key: ${{ runner.os }}-cargo-build-target-${{ hashFiles('**/Cargo.lock') }}

      - name: Generate JSON Schema
        run: cargo run --manifest-path=xtask/Cargo.toml --bin gen_schema

      # Compares the freshly generated schema with the committed one. On a
      # mismatch the diff and fix instructions are written to the job summary
      # and the step fails.
      - name: Check for schema changes
        id: check-diff
        run: |
          schema_path="docs/schema/v1.0/pdftract.schema.json"

          if git diff --quiet -- "$schema_path"; then
            echo "Schema is up to date"
            echo "has_changes=false" >> "$GITHUB_OUTPUT"
          else
            echo "Schema has uncommitted changes"
            echo "has_changes=true" >> "$GITHUB_OUTPUT"

            # Single grouped redirect: everything printed inside the braces
            # is appended to the job summary.
            {
              echo "### Schema changes detected :warning:"
              echo ""
              echo "The generated JSON schema differs from the committed file."
              echo ""
              echo '```diff'
              git diff -- "$schema_path"
              echo '```'
              echo ""
              echo "To fix this issue:"
              echo '1. Run `cargo run --manifest-path=xtask/Cargo.toml --bin gen_schema`'
              echo "2. Commit the updated schema file"
            } >> "$GITHUB_STEP_SUMMARY"

            exit 1
          fi

  # Checks the committed schema file itself: it must parse as JSON and carry
  # the expected identifiers, definitions and enum constraints.
  validate-json-syntax:
    name: Validate JSON Syntax
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Validate JSON Schema
        run: |
          python3 -c "import json; json.load(open('docs/schema/v1.0/pdftract.schema.json')); print('Schema is valid JSON')"

      - name: Validate schema structure
        run: |
          python3 << 'EOF'
          import json

          with open('docs/schema/v1.0/pdftract.schema.json') as f:
              schema = json.load(f)

          # Verify required fields. .get() is used so that a missing key fails
          # with the assertion message below rather than a bare KeyError.
          assert schema.get('$schema') == 'https://json-schema.org/draft/2020-12/schema', "Missing or incorrect $schema"
          assert schema.get('$id') == 'https://pdftract.com/schema/v1.0/pdftract.schema.json', "Missing or incorrect $id"
          assert schema.get('title') == 'pdftract Output v1.0', "Missing or incorrect title"

          # Verify $defs exist
          assert '$defs' in schema, "Missing $defs"
          defs = schema['$defs']
          for def_name in ('PageJson', 'SpanJson', 'DiagnosticJson'):
              assert def_name in defs, f"Missing {def_name} definition"

          # Verify enum constraints.
          # Each entry: (definition, property, label used in the error, allowed values)
          expected_enums = [
              ('PageJson', 'type', 'page_type',
               {'text', 'scanned', 'mixed', 'broken_vector', 'blank', 'figure_only'}),
              ('DiagnosticJson', 'severity', 'severity',
               {'info', 'warning', 'error', 'fatal'}),
              ('SpanJson', 'confidence_source', 'confidence_source',
               {'native', 'heuristic', 'ocr'}),
          ]
          for def_name, prop, label, allowed in expected_enums:
              # A missing 'properties' map or property is reported as a missing
              # enum constraint instead of raising KeyError.
              prop_schema = defs[def_name].get('properties', {}).get(prop, {})
              assert 'enum' in prop_schema, f"Missing enum constraint on {def_name}.{prop}"
              assert set(prop_schema['enum']) == allowed, f"Incorrect {label} enum values"

          print("All schema structure validations passed!")
          EOF