# pdftract/.ci/argo-workflows/pdftract-ci.yaml

# pdftract-ci WorkflowTemplate
#
# This template orchestrates the CI/CD pipeline for pdftract, a Rust PDF text extraction
# library with PyO3 Python bindings and a CLI binary. The pipeline builds, tests, runs
# quality checks, benchmarks, and publishes releases across multiple targets.
#
# === Webhook Payload Schema ===
# Triggered via GitHub webhook -> WorkflowEventBinding (out of scope for this bead).
# Expected webhook payload schema:
#
#   {
#     "ref": "refs/heads/main" | "refs/tags/v0.1.0",
#     "repository": {
#       "full_name": "jedarden/pdftract",
#       "html_url": "https://github.com/jedarden/pdftract"
#     },
#     "head_commit": {
#       "id": "abc123...",
#       "message": "Commit message"
#     },
#     "sender": {
#       "login": "username"
#     }
#   }
#
# === Parameter Reference ===
# - commit-sha: Full Git commit SHA (40 hex chars)
# - ref: Git ref (branch: "refs/heads/*", tag: "refs/tags/v*")
# - repo-url: GitHub repository URL
# - is-tag: Boolean ("true" if ref is a tag, "false" otherwise)
#
# === DAG Structure ===
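# setup -> [parallel: build-matrix, test-matrix, quality-matrix, bench-matrix] -> publish-if-tag
# bench-matrix -> benchmark-pr-comment (only when pr-number is non-empty)
# build-matrix -> regression-corpus -> publish-if-tag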
#
# - setup: Clone repo, fetch dependencies, warm cargo cache
# - build-matrix: Cross-compile for 5 targets (x86_64/aarch64 Linux musl, macOS x64/ARM64, Windows x64)
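# - test-matrix: Run unit tests (default features, all features) and proptest property tests
# - quality-matrix: Linting (clippy, fmt), MSRV build check (Rust 1.78), security audit (cargo-audit)
# - bench-matrix: Competitive benchmarks (hyperfine) of pdftract against pdfminer.six, pypdf, pdfplumber
# - benchmark-pr-comment: Post the benchmark summary on the pull request when pr-number is set
# - regression-corpus: Run the built binary over the regression corpus in 8 shards and compare CER to baselines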
# - publish-if-tag: On tags only, upload binaries to GitHub Releases
#
# === Subsequent Phase 0 Beads ===
# Each bead fills in a distinct set of templates without colliding:
# - pdftract-xxxx: setup step, volume mount points, cache warming logic
# - pdftract-yyyy: build-matrix templates (5 target builds with cross)
# - pdftract-zzzz: test-matrix templates (feature combinations)
# - pdftract-wwww: quality-matrix templates (clippy, fmt, audit)
# - pdftract-vvvv: bench-matrix templates (cargo bench)
# - pdftract-uuuu: publish-if-tag template (gh release create)
#
# Resource identity: an Argo WorkflowTemplate named pdftract-ci.
apiVersion: argoproj.io/v1alpha1
kind: WorkflowTemplate
metadata:
  # Name and namespace under which the template is registered.
  namespace: argo-workflows
  name: pdftract-ci
  # Kubernetes recommended app labels.
  labels:
    app.kubernetes.io/part-of: pdftract
    app.kubernetes.io/component: ci
    app.kubernetes.io/name: pdftract-ci
spec:
  # Template executed when the workflow starts.
  entrypoint: pipeline
  # Workflow-level exit handler: runs the on-exit template after the workflow
  # finishes, regardless of outcome. This is the supported location for it.
  onExit: on-exit
  serviceAccountName: argo-workflow

  # podGC is an object carrying a strategy, not a bare string.
  podGC:
    strategy: OnPodCompletion
  # Retention of finished workflows: 30 min after success, 2 h after failure.
  # The per-outcome settings live under ttlStrategy.
  ttlStrategy:
    secondsAfterSuccess: 1800
    secondsAfterFailure: 7200

  # Workflow parameters and their defaults. All values are strings.
  arguments:
    parameters:
      # --- Source revision ---
      - name: commit-sha
        description: "Full Git commit SHA (40 hex chars)"
        value: ''
      - name: ref
        description: "Git ref (branch: 'refs/heads/*', tag: 'refs/tags/v*')"
        value: 'refs/heads/main'
      - name: repo-url
        description: "GitHub repository URL"
        value: 'https://github.com/jedarden/pdftract.git'
      - name: is-tag
        description: "Boolean ('true' if ref is a tag, 'false' otherwise)"
        value: 'false'
      # --- Regression and benchmark reporting ---
      - name: regression-mode
        description: "Regression mode: 'gate' (PR) fails on CER > 0.5%, 'update' (merge) refreshes baselines"
        value: 'gate'
      - name: pr-number
        description: "Pull request number for posting benchmark comments (empty skips commenting)"
        value: ''
      # --- Property testing ---
      - name: proptest-seed
        description: "Proptest seed for reproducibility (empty = auto-generate)"
        value: ''
      - name: proptest-cases
        description: "Number of proptest cases per module (default: 10000)"
        value: '10000'

  # Per-workflow PersistentVolumeClaims, all ReadWriteOnce on the sata-large
  # storage class.
  volumeClaimTemplates:
    # Cargo registry and per-step target directories (mounted at /cache/cargo).
    - metadata:
        name: cargo-cache
      spec:
        storageClassName: sata-large
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 50Gi
    # Repository checkout (mounted at /workspace).
    - metadata:
        name: workspace
      spec:
        storageClassName: sata-large
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 10Gi
    # Hand-off area between steps (mounted at /shared by build-cer-diff).
    - metadata:
        name: shared-artifacts
      spec:
        storageClassName: sata-large
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 1Gi
    # Claim reserved for regression results.
    - metadata:
        name: regression-results
      spec:
        storageClassName: sata-large
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 2Gi

  # Registry credentials: the .dockerconfigjson key of the docker-hub-registry
  # secret is projected as a file named config.json.
  volumes:
    - name: docker-config
      secret:
        secretName: docker-hub-registry
        items:
          - path: config.json
            key: '.dockerconfigjson'

  # Labels stamped on every pod of the workflow; commit-sha carries the
  # commit-sha workflow parameter.
  podMetadata:
    labels:
      commit-sha: '{{workflow.parameters.commit-sha}}'
      app.kubernetes.io/name: pdftract-ci

  # Pod spec patch for the workflow's pods: images are pulled with the
  # docker-hub-registry secret and containers run as non-root UID 1000 with
  # fsGroup 1000.
  # NOTE(review): bench-matrix runs `apt-get install`, which normally requires
  # root - confirm that step works under runAsNonRoot / runAsUser 1000.
  podSpecPatch: |
    imagePullSecrets:
      - name: docker-hub-registry
    securityContext:
      runAsNonRoot: true
      runAsUser: 1000
      fsGroup: 1000

  templates:
    # === Top-level DAG ===
    # Setup runs first, then all matrices run in parallel, then publish if tagged
    - name: pipeline
      # Entry-point DAG: setup runs first, the matrices fan out from it, and
      # publish-if-tag runs last (tag builds only).
      dag:
        # A DAG template accepts only target/tasks/failFast, so an exit handler
        # cannot be declared at this level; exit handlers are configured through
        # spec.onExit or per-task hooks.
        tasks:
          - name: setup
            template: setup

          - name: build-matrix
            template: build-matrix
            dependencies: [setup]
            # Task-level exit hook: print the build report once the build
            # matrix finishes, whatever its outcome.
            hooks:
              exit:
                template: build-matrix-exit

          - name: test-matrix
            template: test-matrix
            dependencies: [setup]

          - name: quality-matrix
            template: quality-matrix
            dependencies: [setup]

          - name: bench-matrix
            template: bench-matrix
            # build-matrix is listed because this task consumes its binary;
            # a task may only read outputs of tasks it depends on.
            dependencies: [setup, build-matrix]
            arguments:
              artifacts:
                - name: pdftract-binary
                  # NOTE(review): the nested `tasks.build-matrix.tasks.<inner>`
                  # form is likely not resolvable from this DAG - confirm, and
                  # have build-matrix export the artifact via its own outputs
                  # if so.
                  from: "{{tasks.build-matrix.tasks.build-linux-x86_64-musl.outputs.artifacts.pdftract-binary}}"

          - name: benchmark-pr-comment
            template: benchmark-pr-comment
            dependencies: [bench-matrix]
            # Both sides are quoted so the expression still parses when
            # pr-number is empty (otherwise it renders as ` != ""`).
            when: "'{{workflow.parameters.pr-number}}' != ''"
            arguments:
              artifacts:
                - name: benchmark-comment
                  from: "{{tasks.bench-matrix.outputs.artifacts.benchmark-comment}}"

          - name: regression-corpus
            template: regression-corpus
            dependencies: [build-matrix]

          # Runs only for tag builds, after every other gate has finished.
          - name: publish-if-tag
            template: publish-if-tag
            dependencies: [build-matrix, test-matrix, quality-matrix, bench-matrix, regression-corpus]
            when: "{{workflow.parameters.is-tag}} == true"

    # === Exit Handler ===
    # Reports workflow status (success/failure) with details
    - name: on-exit
      # Prints a short report: workflow name, commit SHA and ref.
      activeDeadlineSeconds: 60
      script:
        image: alpine:3.19
        command:
          - sh
        source: |
          #!/bin/sh
          # Abort on the first failing command.
          set -e
          echo "=== Workflow Exit Report ==="
          echo "Workflow: {{workflow.name}}"
          echo "Commit: {{workflow.parameters.commit-sha}}"
          echo "Ref: {{workflow.parameters.ref}}"
          echo "Status available in workflow metadata"

    # === Setup Step ===
    # Clones repo, fetches dependencies, warms cargo cache
    # Filled in by subsequent Phase 0 bead
    #
    # CRITICAL: All cargo commands in this workflow MUST use --locked (or --locked --frozen)
    # to enforce the workspace Cargo.lock policy. See CONTRIBUTING.md for details.
    - name: setup
      # Placeholder step: only echoes what the real implementation should do
      # (clone repo-url into /workspace and check out commit-sha), then exits 0.
      activeDeadlineSeconds: 600
      container:
        image: alpine:3.19
        resources:
          limits:
            cpu: 1000m
            memory: 2Gi
          requests:
            cpu: 500m
            memory: 1Gi
        # The workspace and cargo cache are mounted already so the eventual
        # implementation can populate them.
        volumeMounts:
          - mountPath: /workspace
            name: workspace
          - mountPath: /cache/cargo
            name: cargo-cache
        command:
          - sh
          - '-c'
        args:
          - |
            # Placeholder: clone repo to /workspace, warm cargo cache
            echo "Setup step - to be implemented by Phase 0 sibling bead"
            echo "Should clone {{workflow.parameters.repo-url}} to /workspace"
            echo "Should checkout {{workflow.parameters.commit-sha}}"
            exit 0

    # === Build Matrix ===
    # Cross-compile for 5 targets using cross (Docker-based)
    # Targets: x86_64-unknown-linux-musl, aarch64-unknown-linux-musl,
    #          x86_64-apple-darwin, aarch64-apple-darwin, x86_64-pc-windows-gnu
    - name: build-matrix
      # Fans out one build-target invocation per target triple. Every task sets
      # continueOn.failed so one failing target does not stop the others.
      activeDeadlineSeconds: 3600
      dag:
        # DAG templates accept only target/tasks/failFast; the former
        # `onExit: build-matrix-exit` key is not part of that schema. Attach
        # build-matrix-exit as an exit hook on the task that invokes this
        # template.
        tasks:
          - name: build-linux-x86_64-musl
            template: build-target
            arguments:
              parameters:
                - name: target
                  value: "x86_64-unknown-linux-musl"
                - name: cross-image
                  value: "ghcr.io/cross-rs/x86_64-unknown-linux-musl:main"
                - name: strip-cmd
                  value: "x86_64-linux-musl-strip"
                - name: ext
                  value: ""
            continueOn:
              failed: true
          - name: build-linux-aarch64-musl
            template: build-target
            arguments:
              parameters:
                - name: target
                  value: "aarch64-unknown-linux-musl"
                - name: cross-image
                  value: "ghcr.io/cross-rs/aarch64-unknown-linux-musl:main"
                - name: strip-cmd
                  value: "aarch64-linux-musl-strip"
                - name: ext
                  value: ""
            continueOn:
              failed: true
          # NOTE(review): confirm that the two apple-darwin images below are
          # actually published under ghcr.io/cross-rs.
          - name: build-darwin-x86_64
            template: build-target
            arguments:
              parameters:
                - name: target
                  value: "x86_64-apple-darwin"
                - name: cross-image
                  value: "ghcr.io/cross-rs/x86_64-apple-darwin:main"
                - name: strip-cmd
                  value: "x86_64-apple-darwin-strip"
                - name: ext
                  value: ""
            continueOn:
              failed: true
          - name: build-darwin-aarch64
            template: build-target
            arguments:
              parameters:
                - name: target
                  value: "aarch64-apple-darwin"
                - name: cross-image
                  value: "ghcr.io/cross-rs/aarch64-apple-darwin:main"
                - name: strip-cmd
                  value: "aarch64-apple-darwin-strip"
                - name: ext
                  value: ""
            continueOn:
              failed: true
          # Windows is the only target with a file extension on the binary.
          - name: build-windows-x86_64-gnu
            template: build-target
            arguments:
              parameters:
                - name: target
                  value: "x86_64-pc-windows-gnu"
                - name: cross-image
                  value: "ghcr.io/cross-rs/x86_64-pc-windows-gnu:main"
                - name: strip-cmd
                  value: "x86_64-w64-mingw32-strip"
                - name: ext
                  value: ".exe"
            continueOn:
              failed: true

    # === Build Target Template ===
    # Single target build using cross (Docker-based)
    # Uses ghcr.io/cross-rs/<target>:main images which have cross pre-installed
    - name: build-target
      # Builds the pdftract binary for one target triple, strips it, copies it
      # to /artifacts and publishes it as the pdftract-binary output artifact.
      #
      # Inputs:
      #   target      - Rust target triple passed to `cross build --target`
      #   cross-image - container image the build runs in
      #   strip-cmd   - strip executable used on the finished binary
      #   ext         - binary file extension ("" or ".exe")
      #
      # NOTE(review): confirm that the cross-rs images provide `cross` and
      # `cargo`; this script invokes both directly inside the image.
      inputs:
        parameters:
          - name: target
          - name: cross-image
          - name: strip-cmd
          - name: ext
      activeDeadlineSeconds: 3600
      container:
        image: "{{inputs.parameters.cross-image}}"
        command: [bash, -c]
        args:
          - |
            set -eo pipefail

            TARGET="{{inputs.parameters.target}}"
            STRIP_CMD="{{inputs.parameters.strip-cmd}}"
            EXT="{{inputs.parameters.ext}}"

            echo "=========================================="
            echo "Building pdftract for target: $TARGET"
            echo "=========================================="

            cd /workspace

            # Reproducible build timestamp: commit time of HEAD, or 0 when git
            # is unavailable.
            export SOURCE_DATE_EPOCH=$(git log -1 --format=%ct 2>/dev/null || echo 0)
            export CARGO_HOME="/cache/cargo/registry"
            # One target directory per triple on the cache volume.
            export CARGO_TARGET_DIR="/cache/cargo/target-$TARGET"

            echo "SOURCE_DATE_EPOCH=$SOURCE_DATE_EPOCH"
            echo "CARGO_HOME=$CARGO_HOME"
            echo "CARGO_TARGET_DIR=$CARGO_TARGET_DIR"

            echo "=== Running cargo build with cross ==="
            cross build --release --target "$TARGET" --locked --features default,serve,decrypt

            # Build output lands under CARGO_TARGET_DIR, not ./target, because
            # the variable is exported above.
            RELEASE_DIR="$CARGO_TARGET_DIR/$TARGET/release"
            BINARY_PATH="$RELEASE_DIR/pdftract$EXT"

            if [ ! -f "$BINARY_PATH" ]; then
              echo "ERROR: Binary not found at $BINARY_PATH" >&2
              echo "Contents of target directory:"
              ls -la "$RELEASE_DIR/" || true
              exit 1
            fi

            echo "=== Binary size before strip ==="
            ls -lh "$BINARY_PATH"

            echo "=== Stripping binary ==="
            # Best effort: an unstripped binary is still a valid artifact.
            "$STRIP_CMD" "$BINARY_PATH" || {
              echo "WARNING: Strip command failed, continuing with unstripped binary" >&2
            }

            echo "=== Binary size after strip ==="
            ls -lh "$BINARY_PATH"

            mkdir -p /artifacts
            cp "$BINARY_PATH" "/artifacts/pdftract-$TARGET$EXT"

            echo "=== Final artifact ==="
            ls -lh /artifacts/

            # GNU stat first, BSD stat as fallback.
            SIZE=$(stat -c%s "/artifacts/pdftract-$TARGET$EXT" 2>/dev/null || stat -f%z "/artifacts/pdftract-$TARGET$EXT")
            echo "Binary size: $SIZE bytes"

            # The 4 MB (4194304 bytes) budget is advisory; exceeding it only warns.
            if [ "$SIZE" -gt 4194304 ]; then
              echo "WARNING: Binary exceeds 4 MB budget ($SIZE bytes)"
            else
              echo "Binary within 4 MB budget"
            fi

            echo "=== Build complete ==="
        volumeMounts:
          - name: workspace
            mountPath: /workspace
          - name: cargo-cache
            mountPath: /cache/cargo
          - name: docker-config
            mountPath: /root/.docker
        resources:
          requests:
            cpu: 2000m
            memory: 4Gi
          limits:
            cpu: 4000m
            memory: 8Gi
      outputs:
        artifacts:
          - name: pdftract-binary
            path: /artifacts/pdftract-{{inputs.parameters.target}}{{inputs.parameters.ext}}

    # === Build Matrix Exit Handler ===
    - name: build-matrix-exit
      # Prints a short build report containing the commit SHA.
      script:
        image: alpine:3.19
        command:
          - sh
        source: |
          #!/bin/sh
          # Report only; nothing here inspects the build results.
          echo "=== Build Matrix Exit Report ==="
          echo "Commit: {{workflow.parameters.commit-sha}}"
          echo "All binaries available as artifacts"

    # === Test Matrix ===
    # Run cargo test across feature combinations and proptest
    # - default features unit tests
    # - all features unit tests
    # - proptest property tests (10,000 cases per module)
    #
    # CRITICAL: All cargo commands MUST use --locked (or --locked --frozen)
    - name: test-matrix
      # Runs the unit tests twice (default features, then all features) and
      # then the property tests. The proptest seed and case count come from the
      # proptest-seed / proptest-cases workflow parameters and are exported as
      # PROPTEST_SEED / PROPTEST_CASES.
      activeDeadlineSeconds: 3600
      container:
        image: rust:1.83-bookworm
        command: [bash, -c]
        args:
          - |
            set -eo pipefail

            echo "=========================================="
            echo "Test Matrix"
            echo "=========================================="

            cd /workspace
            export CARGO_HOME="/cache/cargo/registry"
            export CARGO_TARGET_DIR="/cache/cargo/target-test"

            # Use the provided seed, or derive a 16-hex-char one from the clock
            # and print it so a failing run can be reproduced.
            SEED="{{workflow.parameters.proptest-seed}}"
            if [ -z "$SEED" ]; then
              SEED=$(date +%s%N | sha256sum | head -c 16)
              echo "Generated proptest seed: $SEED"
            else
              echo "Using provided proptest seed: $SEED"
            fi
            export PROPTEST_SEED="$SEED"

            # Set proptest case count
            CASES="{{workflow.parameters.proptest-cases}}"
            echo "Proptest cases per module: $CASES"
            export PROPTEST_CASES="$CASES"

            echo "=== Running unit tests (default features) ==="
            cargo test --locked --lib --bins

            echo "=== Running unit tests (all features) ==="
            cargo test --locked --all-features --lib --bins

            echo "=== Running property tests (proptest) ==="
            echo "Seed: $PROPTEST_SEED | Cases: $PROPTEST_CASES"
            # --locked keeps this invocation under the same Cargo.lock policy
            # as the cargo test runs above.
            # NOTE(review): confirm that cargo-nextest is available in this
            # image and that `--proptest` is an accepted nextest flag.
            cargo nextest run --locked --features proptest --proptest --profile=ci-proptest || {
              EXIT_CODE=$?
              echo "ERROR: Property tests failed!"
              echo "Check proptest-regressions/ for new minimal counterexamples"
              exit "$EXIT_CODE"
            }

            echo "=== All tests passed ==="
            echo "Unit tests: PASS"
            echo "Property tests: PASS ($CASES cases per module)"
        volumeMounts:
          - name: workspace
            mountPath: /workspace
          - name: cargo-cache
            mountPath: /cache/cargo
        resources:
          requests:
            cpu: 2000m
            memory: 4Gi
          limits:
            cpu: 4000m
            memory: 8Gi

    # === Quality Matrix ===
    # Run linting (clippy, fmt), security audit (cargo-audit), dependency review,
    # and MSRV check (build with rust:1.78-slim to detect new-Rust feature usage)
    #
    # CRITICAL: All cargo commands MUST use --locked (or --locked --frozen)
    - name: quality-matrix
      # Three independent checks; none declares dependencies on the others.
      activeDeadlineSeconds: 900
      dag:
        tasks:
          # Lints and formatting.
          - template: clippy-fmt
            name: clippy-fmt
          # Build on the minimum supported Rust version.
          - template: msrv-check
            name: msrv-check
          # Dependency vulnerability scan.
          - template: cargo-audit
            name: cargo-audit

    # === Clippy and Fmt Check ===
    # Runs clippy with MSRV-aware lints and verifies formatting
    - name: clippy-fmt
      # Runs clippy over all targets and features with warnings denied, then
      # verifies formatting with `cargo fmt --check`.
      activeDeadlineSeconds: 600
      container:
        image: rust:1.83-bookworm
        command: [bash, -c]
        args:
          - |
            set -eo pipefail

            echo "=========================================="
            echo "Clippy and Format Check"
            echo "=========================================="

            cd /workspace
            export CARGO_HOME="/cache/cargo/registry"
            export CARGO_TARGET_DIR="/cache/cargo/target-clippy"

            # The official rust image installs the minimal rustup profile,
            # which has neither clippy nor rustfmt. Adding them is a no-op
            # when they are already present.
            echo "=== Ensuring clippy and rustfmt components ==="
            rustup component add clippy rustfmt

            echo "=== Running clippy with MSRV = 1.78 ==="
            cargo clippy --locked --all-targets --all-features -- -D warnings

            echo "=== Running fmt check ==="
            cargo fmt --check

            echo "=== Clippy and fmt checks passed ==="
        volumeMounts:
          - name: workspace
            mountPath: /workspace
          - name: cargo-cache
            mountPath: /cache/cargo
        resources:
          requests:
            cpu: 1000m
            memory: 2Gi
          limits:
            cpu: 2000m
            memory: 4Gi

    # === MSRV Check ===
    # Builds with rust:1.78-slim to verify no newer Rust features are used.
    # This gate prevents silent MSRV drift that would break downstream consumers
    # on older toolchains.
    - name: msrv-check
      # Compiles the workspace with default features on the rust:1.78-slim
      # toolchain image; a compile failure fails the step.
      activeDeadlineSeconds: 600
      container:
        image: rust:1.78-slim
        resources:
          limits:
            cpu: 2000m
            memory: 4Gi
          requests:
            cpu: 1000m
            memory: 2Gi
        volumeMounts:
          - mountPath: /workspace
            name: workspace
          - mountPath: /cache/cargo
            name: cargo-cache
        command:
          - bash
          - '-c'
        args:
          - |
            set -eo pipefail

            echo "=========================================="
            echo "MSRV Check (Rust 1.78)"
            echo "=========================================="

            # Dedicated target dir so MSRV artifacts do not mix with others.
            export CARGO_TARGET_DIR="/cache/cargo/target-msrv"
            export CARGO_HOME="/cache/cargo/registry"
            cd /workspace

            echo "=== Building with Rust 1.78 (MSRV) ==="
            rustc --version

            # Default-feature workspace build; compile errors surface MSRV drift.
            cargo build --workspace --features default --locked

            echo "=== MSRV check passed ==="
            echo "No Rust 1.79+ features detected"

    # === Cargo Audit ===
    # Runs cargo-audit to check for security vulnerabilities in dependencies
    - name: cargo-audit
      # Installs cargo-audit when it is not already on PATH, then runs it
      # against the workspace.
      activeDeadlineSeconds: 300
      container:
        image: rust:1.83-bookworm
        resources:
          limits:
            cpu: 1000m
            memory: 2Gi
          requests:
            cpu: 500m
            memory: 1Gi
        volumeMounts:
          - mountPath: /workspace
            name: workspace
          - mountPath: /cache/cargo
            name: cargo-cache
        command:
          - bash
          - '-c'
        args:
          - |
            set -eo pipefail

            echo "=========================================="
            echo "Security Audit (cargo-audit)"
            echo "=========================================="

            export CARGO_HOME="/cache/cargo/registry"
            cd /workspace

            # Install on demand; skipped when the binary is already on PATH.
            command -v cargo-audit &> /dev/null || {
              echo "Installing cargo-audit..."
              cargo install cargo-audit --locked
            }

            echo "=== Running cargo audit ==="
            cargo audit --locked

            echo "=== Security audit passed ==="

    # === Bench Matrix ===
    # Competitive benchmarks: pdftract vs pdfminer.six, pypdf, pdfplumber
    # Runs hyperfine against 50-PDF corpus (25 vector + 25 raster)
    # Enforces regression gate (>10%) and 10x-faster gate (vs pdfminer)
    - name: bench-matrix
      # Runs benches/competitors/run-benchmarks.sh with the pdftract binary
      # received as an input artifact, then exports the JSON results and the
      # markdown PR comment as output artifacts.
      #
      # Inputs:
      #   pdftract-binary (artifact) - placed at /tmp/pdftract-binary
      # Outputs:
      #   benchmark-results (artifact) - /workspace/benchmark-results.json
      #   benchmark-comment (artifact) - /workspace/benchmark-comment.md
      inputs:
        artifacts:
          - name: pdftract-binary
            path: /tmp/pdftract-binary
      activeDeadlineSeconds: 3600
      container:
        image: python:3.11-slim-bookworm
        command: [bash, -c]
        args:
          - |
            set -eo pipefail

            echo "=========================================="
            echo "Competitive Benchmark Matrix"
            echo "=========================================="

            cd /workspace

            # hyperfine drives the timing runs, jq summarises the results, and
            # git is needed for the `git show` baseline lookup below.
            echo "=== Installing hyperfine ==="
            apt-get update -qq
            apt-get install -y hyperfine jq git

            # Install competitor tools
            echo "=== Installing competitor tools ==="
            pip install --no-cache-dir -r benches/competitors/requirements.txt

            # Get pdftract binary from build-matrix artifact
            echo "=== Installing pdftract binary ==="
            PDFTRACT_ARTIFACT="/tmp/pdftract-binary"
            if [ -f "$PDFTRACT_ARTIFACT" ]; then
              cp "$PDFTRACT_ARTIFACT" /usr/local/bin/pdftract
              chmod +x /usr/local/bin/pdftract
              echo "pdftract binary installed from artifact"
            else
              echo "WARNING: pdftract binary not found in artifacts, using PATH"
            fi

            # Verify pdftract is available (warning only; the benchmark script
            # decides whether the run fails).
            if ! command -v pdftract &> /dev/null; then
              echo "WARNING: pdftract not found in PATH, benchmarks will fail"
            else
              pdftract --version || echo "WARNING: pdftract --version failed"
            fi

            # Prefer the baseline committed on main; fall back to the copy in
            # the checked-out tree.
            echo "=== Fetching baseline from main branch ==="
            mkdir -p /tmp/baseline
            if git show main:benches/baselines/main.json > /tmp/baseline/main.json 2>/dev/null; then
              export BASELINE="/tmp/baseline/main.json"
              echo "Baseline loaded from main branch"
            else
              echo "WARNING: Could not fetch baseline from main, using local file"
              export BASELINE="benches/baselines/main.json"
            fi

            # Run benchmarks
            echo "=== Running competitive benchmarks ==="
            cd benches/competitors

            # Output locations exported for run-benchmarks.sh.
            export OUTPUT="/tmp/benchmark-results.json"
            export COMMENT="/tmp/benchmark-comment.md"

            # Exit code 1 is reported as a gate failure, anything else as an
            # execution failure; both fail the step.
            bash run-benchmarks.sh || {
              EXIT_CODE=$?
              if [ $EXIT_CODE -eq 1 ]; then
                echo "ERROR: Benchmark gates failed!"
                exit 1
              else
                echo "ERROR: Benchmark execution failed with code $EXIT_CODE"
                exit 1
              fi
            }

            # Copy results to workspace for artifacts
            cp "$OUTPUT" /workspace/benchmark-results.json
            cp "$COMMENT" /workspace/benchmark-comment.md

            echo "=== Benchmark complete ==="
            echo "Results:"
            jq -r '[.[] | select(.tool == "pdftract") | .mean_ms] | length' "$OUTPUT" | xargs -I {} echo "  pdftract results: {}"
            jq -r '[.[] | select(.tool == "pdfminer") | .mean_ms] | length' "$OUTPUT" | xargs -I {} echo "  pdfminer results: {}"

            echo "=== All gates passed ==="
        volumeMounts:
          - name: workspace
            mountPath: /workspace
          - name: cargo-cache
            mountPath: /cache/cargo
        resources:
          requests:
            cpu: 2000m
            memory: 4Gi
          limits:
            cpu: 4000m
            memory: 8Gi
      # outputs belongs to the template, not to the container.
      outputs:
        artifacts:
          - name: benchmark-results
            path: /workspace/benchmark-results.json
          - name: benchmark-comment
            path: /workspace/benchmark-comment.md

    # === Benchmark PR Comment ===
    # Posts benchmark results as a comment on the pull request
    # Only runs when pr-number parameter is non-empty
    - name: benchmark-pr-comment
      # Posts the benchmark markdown as a comment on the pull request named by
      # the pr-number workflow parameter, using the GitHub issues comments API.
      #
      # Inputs:
      #   benchmark-comment (artifact) - markdown body, placed at
      #                                  /tmp/benchmark-comment.md
      # Env:
      #   GH_TOKEN - read from the github-webhook-secret secret (key: token)
      inputs:
        artifacts:
          - name: benchmark-comment
            path: /tmp/benchmark-comment.md
      # Raised from 60 s to leave room for the package install below.
      activeDeadlineSeconds: 300
      container:
        image: debian:12
        command: [sh, -c]
        args:
          - |
            set -e
            PR_NUMBER="{{workflow.parameters.pr-number}}"
            COMMENT_FILE="/tmp/benchmark-comment.md"
            PAYLOAD_FILE="/tmp/benchmark-comment.json"

            echo "=== Posting benchmark comment to PR #$PR_NUMBER ==="

            # Fail early when the artifact was not delivered.
            if [ ! -f "$COMMENT_FILE" ]; then
              echo "ERROR: Benchmark comment file not found"
              exit 1
            fi

            # The base debian image has neither curl nor jq.
            apt-get update -qq
            apt-get install -y -qq --no-install-recommends ca-certificates curl jq

            # Build the JSON payload straight from the file. Passing the text
            # through `echo` would let sh interpret backslash sequences in the
            # markdown.
            jq -R -s '{body: .}' "$COMMENT_FILE" > "$PAYLOAD_FILE"

            # --fail turns HTTP error responses into a non-zero exit so the
            # success message below is only printed when the API accepted it.
            curl --fail --silent --show-error -X POST \
              -H "Authorization: token ${GH_TOKEN}" \
              -H "Accept: application/vnd.github.v3+json" \
              -H "Content-Type: application/json" \
              --data-binary "@${PAYLOAD_FILE}" \
              "https://api.github.com/repos/jedarden/pdftract/issues/${PR_NUMBER}/comments"

            echo "=== Benchmark comment posted successfully ==="
        env:
          - name: GH_TOKEN
            valueFrom:
              secretKeyRef:
                name: github-webhook-secret
                key: token
        resources:
          requests:
            cpu: 100m
            memory: 256Mi
          limits:
            cpu: 500m
            memory: 512Mi

    # === Regression Corpus ===
    # Run pdftract binary against 500-PDF private regression corpus via ARMOR proxy
    # Compares per-document CER against baseline; fails if delta > 0.5%
    - name: regression-corpus
      # Builds the cer-diff tool once, then fans out 8 regression-shard tasks
      # (shard-index 0..7, shard-total 8) that each receive the pdftract binary.
      activeDeadlineSeconds: 600
      dag:
        # DAG templates accept only target/tasks/failFast; the former
        # `onExit: regression-corpus-exit` key is not part of that schema.
        # Attach regression-corpus-exit as an exit hook on the task that
        # invokes this template.
        tasks:
          - name: build-cer-diff
            template: build-cer-diff
          - name: regression-shards
            template: regression-shard
            dependencies: [build-cer-diff]
            # One task per shard index, inclusive range 0..7.
            withSequence:
              start: "0"
              end: "7"
            arguments:
              parameters:
                - name: shard-index
                  value: "{{item}}"
                - name: shard-total
                  value: "8"
              artifacts:
                - name: pdftract-binary
                  # NOTE(review): build-matrix is not a task of this DAG, so
                  # this reference is likely not resolvable here - confirm
                  # whether the binary should arrive as an input artifact of
                  # this template instead.
                  from: "{{tasks.build-matrix.tasks.build-linux-x86_64-musl.outputs.artifacts.pdftract-binary}}"

    # === Build CER Diff Tool ===
    # Build the cer-diff binary for comparing extraction outputs
    - name: build-cer-diff
      # Builds the cer-diff binary from the pdftract-cer-diff package in release
      # mode and copies it to /shared/cer-diff on the shared-artifacts volume.
      activeDeadlineSeconds: 300
      container:
        image: rust:1.83-bookworm
        command: [bash, -c]
        args:
          - |
            set -eo pipefail
            echo "=== Building cer-diff tool ==="
            cd /workspace
            export CARGO_HOME="/cache/cargo/registry"
            export CARGO_TARGET_DIR="/cache/cargo/target-cer-diff"
            cargo build --release --bin cer-diff --package pdftract-cer-diff --locked
            # Build output lands under CARGO_TARGET_DIR, not ./target, because
            # the variable is exported above.
            cp "$CARGO_TARGET_DIR/release/cer-diff" /shared/cer-diff
            echo "=== cer-diff binary ready ==="
            ls -lh /shared/cer-diff
        volumeMounts:
          - name: workspace
            mountPath: /workspace
          - name: cargo-cache
            mountPath: /cache/cargo
          - name: shared-artifacts
            mountPath: /shared
        resources:
          requests:
            cpu: 1000m
            memory: 2Gi
          limits:
            cpu: 2000m
            memory: 4Gi

    # === Regression Shard ===
    # Process a subset of the regression corpus (1 of 8 shards)
    - name: regression-shard
      inputs:
        parameters:
          - name: shard-index
          - name: shard-total
        artifacts:
          - name: pdftract-binary
            path: /tmp/pdftract-binary
      activeDeadlineSeconds: 360
      container:
        image: pdftract-test-glibc:1.78
        command: [bash, -c]
        args:
          - |
            set -eo pipefail

            SHARD_INDEX="{{inputs.parameters.shard-index}}"
            SHARD_TOTAL="{{inputs.parameters.shard-total}}"
            THRESHOLD="0.005"
            REGRESSION_MODE="{{workflow.parameters.regression-mode}}"

            echo "=========================================="
            echo "Regression Shard: $SHARD_INDEX / $SHARD_TOTAL"
            echo "Mode: $REGRESSION_MODE"
            echo "=========================================="

            # Configure AWS CLI for ARMOR proxy
            export AWS_ACCESS_KEY_ID="$ARMOR_ACCESS_KEY_ID"
            export AWS_SECRET_ACCESS_KEY="$ARMOR_SECRET_ACCESS_KEY"
            export AWS_ENDPOINT_URL="http://armor.armor.svc.cluster.local:9000"

            # Download pdftract binary
            echo "=== Downloading pdftract binary ==="
            PDFTRACT_ARTIFACT="/tmp/pdftract-binary"
            if [ -f "$PDFTRACT_ARTIFACT" ]; then
              cp "$PDFTRACT_ARTIFACT" ./pdftract-x86_64-unknown-linux-musl
              chmod +x pdftract-x86_64-unknown-linux-musl
              echo "Binary downloaded from artifact"
            else
              echo "ERROR: pdftract binary not found in artifacts"
              exit 1
            fi

            ./pdftract-x86_64-unknown-linux-musl --version || echo "Binary check passed"

            # Copy cer-diff to PATH
            cp /shared/cer-diff /usr/local/bin/cer-diff
            chmod +x /usr/local/bin/cer-diff
            cer-diff --help || true

            # Create output directory
            mkdir -p /regression/results

            # List corpus files for this shard
            echo "=== Fetching corpus document list ==="
            aws s3 ls --endpoint-url="$AWS_ENDPOINT_URL" "s3://pdftract-regression-corpus/v1/" | \
              awk '{print $NF}' | grep '\.pdf$' > /tmp/all_docs.txt

            TOTAL_DOCS=$(wc -l < /tmp/all_docs.txt)
            echo "Total documents in corpus: $TOTAL_DOCS"

            # Calculate shard boundaries
            DOCS_PER_SHARD=$(( (TOTAL_DOCS + SHARD_TOTAL - 1) / SHARD_TOTAL ))
            START_LINE=$((SHARD_INDEX * DOCS_PER_SHARD + 1))
            END_LINE=$((START_LINE + DOCS_PER_SHARD - 1))

            echo "Shard $SHARD_INDEX: processing lines $START_LINE to $END_LINE"

            # Extract shard documents
            sed -n "${START_LINE},${END_LINE}p" /tmp/all_docs.txt > /tmp/shard_docs.txt
            SHARD_DOC_COUNT=$(wc -l < /tmp/shard_docs.txt)
            echo "Documents in this shard: $SHARD_DOC_COUNT"

            # Process each document
            PASS_COUNT=0
            FAIL_COUNT=0
            PROCESSED=0

            while IFS= read -r pdf_name; do
              [ -z "$pdf_name" ] && continue
              PROCESSED=$((PROCESSED + 1))

              SHA256="${pdf_name%.pdf}"
              PDF_PATH="s3://pdftract-regression-corpus/v1/${pdf_name}"
              BASELINE_PATH="s3://pdftract-regression-corpus/baselines/${SHA256}.json"

              echo "[$PROCESSED/$SHARD_DOC_COUNT] Processing: $pdf_name"

              # Download PDF
              aws s3 cp --endpoint-url="$AWS_ENDPOINT_URL" "$PDF_PATH" "/tmp/${pdf_name}" || {
                echo "ERROR: Failed to download PDF: $pdf_name"
                continue
              }

              # Run pdftract extraction
              if ! ./pdftract-x86_64-unknown-linux-musl extract --json --pages all "/tmp/${pdf_name}" > /tmp/actual.json 2>/dev/null; then
                echo "ERROR: Extraction failed for: $pdf_name"
                continue
              fi

              # Fetch or compute baseline
              if [ "$REGRESSION_MODE" = "update" ]; then
                # Update mode: save current output as new baseline
                aws s3 cp --endpoint-url="$AWS_ENDPOINT_URL" /tmp/actual.json "$BASELINE_PATH"
                RESULT="{\"sha\":\"$SHA256\",\"cer_delta\":0.0,\"pass\":true,\"mode\":\"update\"}"
              else
                # Gate mode: compare against baseline
                if ! aws s3 cp --endpoint-url="$AWS_ENDPOINT_URL" "$BASELINE_PATH" /tmp/baseline.json 2>/dev/null; then
                  echo "WARN: No baseline found for: $pdf_name (new corpus doc?)"
                  RESULT="{\"sha\":\"$SHA256\",\"cer_delta\":0.0,\"pass\":true,\"note\":\"no_baseline\"}"
                else
                  # Compute CER
                  CER_OUTPUT=$(cer-diff --sha "$SHA256" /tmp/actual.json /tmp/baseline.json --threshold "$THRESHOLD")
                  EXIT_CODE=$?

                  if [ $EXIT_CODE -eq 0 ]; then
                    PASS_COUNT=$((PASS_COUNT + 1))
                  else
                    FAIL_COUNT=$((FAIL_COUNT + 1))
                  fi

                  RESULT="$CER_OUTPUT"
                fi
              fi

              # Write result to JSONL
              echo "$RESULT" >> "/regression/results/shard-${SHARD_INDEX}.jsonl"

              # Cleanup
              rm -f "/tmp/${pdf_name}" /tmp/actual.json /tmp/baseline.json
            done < /tmp/shard_docs.txt

            echo "=========================================="
            echo "Shard $SHARD_INDEX complete"
            echo "Processed: $PROCESSED"
            echo "Passed: $PASS_COUNT"
            echo "Failed: $FAIL_COUNT"
            echo "=========================================="

            # Merge shard results into main output
            if [ -f "/regression/results/shard-${SHARD_INDEX}.jsonl" ]; then
              cat "/regression/results/shard-${SHARD_INDEX}.jsonl" >> "/regression/regression-results.jsonl"
            fi

            # Fail shard if any document exceeded threshold
            if [ "$FAIL_COUNT" -gt 0 ] && [ "$REGRESSION_MODE" = "gate" ]; then
              echo "ERROR: $FAIL_COUNT documents exceeded CER threshold"
              exit 1
            fi

        env:
          - name: ARMOR_ACCESS_KEY_ID
            valueFrom:
              secretKeyRef:
                name: b2-readonly
                key: access-key-id
                optional: true
          - name: ARMOR_SECRET_ACCESS_KEY
            valueFrom:
              secretKeyRef:
                name: b2-readonly
                key: secret-access-key
                optional: true
        volumeMounts:
          - name: workspace
            mountPath: /workspace
          - name: shared-artifacts
            mountPath: /shared
          - name: regression-results
            mountPath: /regression
        resources:
          requests:
            cpu: 1000m
            memory: 2Gi
          limits:
            cpu: 2000m
            memory: 4Gi
      # Output artifacts are declared on the template itself, as a sibling of
      # the container spec. Nested inside the container mapping (next to
      # env/volumeMounts/resources) the key is not part of the container schema
      # and the per-shard artifact would not be registered.
      # The artifact is optional because a shard that processes no documents
      # never creates its JSONL file.
      outputs:
        artifacts:
          - name: shard-results
            path: /regression/results/shard-{{inputs.parameters.shard-index}}.jsonl
            optional: true

    # === Regression Corpus Exit Handler ===
    # Prints a short report about the regression run: the commit, the
    # regression mode, and (when present) the line count plus the first five
    # lines of the merged results file on the regression-results volume.
    # The merged file is also exported as the optional `regression-results`
    # artifact.
    - name: regression-corpus-exit
      script:
        image: debian:12
        command: [sh]
        env:
          # Workflow parameters are handed to the script through the
          # environment instead of being templated into the shell source, so
          # their contents are only ever treated as data, never parsed as shell.
          - name: COMMIT_SHA
            value: "{{workflow.parameters.commit-sha}}"
          - name: REGRESSION_MODE
            value: "{{workflow.parameters.regression-mode}}"
        source: |
          #!/bin/sh
          set -e
          RESULTS_FILE="/regression/regression-results.jsonl"

          echo "=== Regression Corpus Exit Report ==="
          printf 'Commit: %s\n' "$COMMIT_SHA"
          printf 'Regression mode: %s\n' "$REGRESSION_MODE"
          echo "Results artifacts available from all shards"

          if [ -f "$RESULTS_FILE" ]; then
            echo "Total results lines: $(wc -l < "$RESULTS_FILE")"
            echo "=== Sample results (first 5) ==="
            # Best-effort preview: a failing head must not fail the report.
            head -5 "$RESULTS_FILE" || true
          else
            # Make the absence of merged results visible instead of silently
            # printing nothing.
            echo "WARN: no merged results file found at $RESULTS_FILE"
          fi
        volumeMounts:
          - name: regression-results
            mountPath: /regression
        resources:
          requests:
            cpu: 200m
            memory: 256Mi
          limits:
            cpu: 500m
            memory: 512Mi
      outputs:
        artifacts:
          - name: regression-results
            path: /regression/regression-results.jsonl
            optional: true

    # === Publish If Tag ===
    # Release stage: on milestone tags the binaries are uploaded to GitHub
    # Releases. The body below is a no-op placeholder; the real implementation
    # is supplied by a subsequent Phase 0 bead.
    #
    # CRITICAL: every cargo command MUST be run with --locked (or
    # --locked --frozen). The build step already uses --locked, so artifacts
    # are reproducible. This step only uploads pre-built binaries to GitHub
    # Releases.
    - name: publish-if-tag
      # Hard cap of 10 minutes for the whole step.
      activeDeadlineSeconds: 600
      container:
        image: alpine:3.19
        command:
          - sh
          - "-c"
        args:
          - |
            # Placeholder: publish step
            echo "Publish step - to be implemented by Phase 0 sibling bead"
            exit 0
        env:
          # Read from key `token` of the `github-webhook-secret` Secret.
          # The placeholder script does not use it yet.
          - name: GH_TOKEN
            valueFrom:
              secretKeyRef:
                name: github-webhook-secret
                key: token
        volumeMounts:
          - name: workspace
            mountPath: /workspace
          - name: cargo-cache
            mountPath: /cache/cargo
        resources:
          requests:
            cpu: 500m
            memory: 1Gi
          limits:
            cpu: 1000m
            memory: 2Gi