From f4352c630477f46309d25377ef53461cd78f4d93 Mon Sep 17 00:00:00 2001 From: jedarden Date: Wed, 22 Apr 2026 17:46:33 -0400 Subject: [PATCH] feat(evolver): add workflow completion polling to promoter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per plan §10.8 (deployment pipeline) and §9.8 (Argo Workflows): - Add waitForWorkflowCompletion() that polls Argo Workflow API - Add getWorkflowStatus() to fetch workflow phase/status - Update Promote() to wait for workflow completion before inserting bot record - Update Promote() to wait for K8s deployment readiness (waitForDeployment) - Update triggerArgoWorkflow() to return workflow name for polling - Add acb-evolved-bot-deploy-workflowtemplate.yml to manifests The promotion flow now: 1. Writes bot source to bots/evolved/{bot_name}/ 2. Commits and pushes source to git 3. Triggers Argo WorkflowTemplate 4. Waits for workflow completion (build + manifest commit) 5. Waits for K8s deployment to be ready 6. Inserts bot record into bots table 7. Updates programs table with bot_id/bot_name This ensures evolved bots have running containers before being marked active. --- cmd/acb-evolver/internal/promoter/promoter.go | 160 ++++++- ...cb-evolved-bot-deploy-workflowtemplate.yml | 450 ++++++++++++++++++ 2 files changed, 601 insertions(+), 9 deletions(-) create mode 100644 manifests/acb-evolved-bot-deploy-workflowtemplate.yml diff --git a/cmd/acb-evolver/internal/promoter/promoter.go b/cmd/acb-evolver/internal/promoter/promoter.go index 8e60674..f394cc9 100644 --- a/cmd/acb-evolver/internal/promoter/promoter.go +++ b/cmd/acb-evolver/internal/promoter/promoter.go @@ -168,10 +168,22 @@ func (p *Promoter) Promote(ctx context.Context, program *db.Program) (*Promotion } // Trigger Argo WorkflowTemplate to build container and create K8s manifests. 
- if err := p.triggerArgoWorkflow(ctx, botName, secret, program); err != nil { + wfName, err := p.triggerArgoWorkflow(ctx, botName, secret, program) + if err != nil { return nil, fmt.Errorf("trigger argo workflow: %w", err) } + // Wait for the workflow to complete (build + manifest commit). + if err := p.waitForWorkflowCompletion(ctx, wfName); err != nil { + return nil, fmt.Errorf("workflow completion: %w", err) + } + + // Wait for the K8s deployment to be ready (ArgoCD sync + pod startup). + // This polls via kubectl until the deployment reports at least 1 available replica. + if err := p.waitForDeployment(ctx, botName); err != nil { + return nil, fmt.Errorf("deployment readiness: %w", err) + } + // Insert bot record directly into the bots table (same DB as programs). storedSecret := secret if p.cfg.EncryptionKey != "" { @@ -614,13 +626,55 @@ func (p *Promoter) gitCommitPushSource(ctx context.Context, botName, msg string) return run("push", "origin", p.cfg.BotBranch) } +// gitCommitPush stages, commits, and pushes changes to git. For retirement, +// it removes the bot source directory. The remove flag indicates whether to +// remove files (true for retirement) or add them (false for promotion). +func (p *Promoter) gitCommitPush(ctx context.Context, botName, msg string, remove bool) error { + run := func(args ...string) error { + cmd := exec.CommandContext(ctx, "git", args...) + cmd.Dir = p.cfg.RepoDir + if out, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("git %s: %s", args[0], strings.TrimSpace(string(out))) + } + return nil + } + + botPath := filepath.Join("bots", "evolved", botName) + if remove { + // Remove the bot source directory + if err := run("rm", "-rf", "--", botPath); err != nil { + return err + } + if err := run("add", "-u", "--", botPath); err != nil { + return err + } + } else { + if err := run("add", "--", botPath); err != nil { + return err + } + } + + // Skip commit if nothing changed. 
+ statusCmd := exec.CommandContext(ctx, "git", "status", "--porcelain") + statusCmd.Dir = p.cfg.RepoDir + out, _ := statusCmd.Output() + if len(strings.TrimSpace(string(out))) == 0 { + return nil + } + + if err := run("commit", "-m", msg); err != nil { + return err + } + return run("push", "origin", p.cfg.BotBranch) +} + // ── Argo Workflow trigger ─────────────────────────────────────────────────────── // triggerArgoWorkflow submits the acb-evolved-bot-deploy WorkflowTemplate -// with parameters for the bot being promoted. -func (p *Promoter) triggerArgoWorkflow(ctx context.Context, botName, secret string, program *db.Program) error { +// with parameters for the bot being promoted. Returns the workflow name. +func (p *Promoter) triggerArgoWorkflow(ctx context.Context, botName, secret string, program *db.Program) (string, error) { if p.cfg.ArgoWorkflowServer == "" { - return fmt.Errorf("argo workflow server not configured") + return "", fmt.Errorf("argo workflow server not configured") } // Build workflow submission parameters. @@ -659,14 +713,14 @@ func (p *Promoter) triggerArgoWorkflow(ctx context.Context, botName, secret stri // Marshal to JSON. wfJSON, err := json.Marshal(wfSpec) if err != nil { - return fmt.Errorf("marshal workflow: %w", err) + return "", fmt.Errorf("marshal workflow: %w", err) } // Submit workflow via Argo API. 
url := fmt.Sprintf("%s/api/v1/workflows/%s", p.cfg.ArgoWorkflowServer, p.cfg.ArgoWorkflowNamespace) req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(wfJSON)) if err != nil { - return fmt.Errorf("create request: %w", err) + return "", fmt.Errorf("create request: %w", err) } req.Header.Set("Content-Type", "application/json") if p.cfg.ArgoWorkflowAuthToken != "" { @@ -676,17 +730,105 @@ func (p *Promoter) triggerArgoWorkflow(ctx context.Context, botName, secret stri client := &http.Client{Timeout: 30 * time.Second} resp, err := client.Do(req) if err != nil { - return fmt.Errorf("submit workflow: %w", err) + return "", fmt.Errorf("submit workflow: %w", err) } defer resp.Body.Close() if resp.StatusCode < 200 || resp.StatusCode >= 300 { body, _ := io.ReadAll(resp.Body) - return fmt.Errorf("workflow submission failed (status %d): %s", resp.StatusCode, string(body)) + return "", fmt.Errorf("workflow submission failed (status %d): %s", resp.StatusCode, string(body)) } fmt.Printf("promoter: triggered Argo Workflow %s for bot %s\n", wfName, botName) - return nil + return wfName, nil +} + +// ── workflow completion polling ─────────────────────────────────────────────────── + +// waitForWorkflowCompletion polls the Argo Workflow API until the workflow +// completes (success or failure) or times out. 
+func (p *Promoter) waitForWorkflowCompletion(ctx context.Context, wfName string) error { + if p.cfg.ArgoWorkflowServer == "" { + return fmt.Errorf("argo workflow server not configured") + } + + deadline := time.Now().Add(30 * time.Minute) + ticker := time.NewTicker(30 * time.Second) + defer ticker.Stop() + + fmt.Printf("promoter: waiting for Argo Workflow %s to complete (timeout=30m)…\n", wfName) + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + status, phase, err := p.getWorkflowStatus(ctx, wfName) + if err != nil { + fmt.Printf("promoter: workflow poll error: %v\n", err) + if time.Now().After(deadline) { + return fmt.Errorf("workflow poll timeout after error: %w", err) + } + continue + } + + fmt.Printf("promoter: workflow %s status=%s phase=%s\n", wfName, status, phase) + + switch phase { + case "Succeeded": + fmt.Printf("promoter: workflow %s completed successfully\n", wfName) + return nil + case "Failed", "Error": + return fmt.Errorf("workflow %s failed with phase %s (status: %s)", wfName, phase, status) + } + + if time.Now().After(deadline) { + return fmt.Errorf("workflow %s did not complete after 30 minutes (last phase: %s)", wfName, phase) + } + } + } +} + +// getWorkflowStatus fetches the current status and phase of a workflow. 
+func (p *Promoter) getWorkflowStatus(ctx context.Context, wfName string) (status, phase string, err error) { + url := fmt.Sprintf("%s/api/v1/workflows/%s/%s", p.cfg.ArgoWorkflowServer, p.cfg.ArgoWorkflowNamespace, wfName) + req, err := http.NewRequestWithContext(ctx, "GET", url, nil) + if err != nil { + return "", "", fmt.Errorf("create request: %w", err) + } + if p.cfg.ArgoWorkflowAuthToken != "" { + req.Header.Set("Authorization", "Bearer "+p.cfg.ArgoWorkflowAuthToken) + } + + client := &http.Client{Timeout: 30 * time.Second} + resp, err := client.Do(req) + if err != nil { + return "", "", fmt.Errorf("get workflow: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != 200 { + body, _ := io.ReadAll(resp.Body) + return "", "", fmt.Errorf("unexpected status %d: %s", resp.StatusCode, string(body)) + } + + var wfResp struct { + Status struct { + Phase string `json:"phase"` + StartedAt string `json:"startedAt"` + FinishedAt string `json:"finishedAt"` + } `json:"status"` + } + if err := json.NewDecoder(resp.Body).Decode(&wfResp); err != nil { + return "", "", fmt.Errorf("decode response: %w", err) + } + + status = "running" + if wfResp.Status.FinishedAt != "" { + status = "finished" + } + + return status, wfResp.Status.Phase, nil } // ── deployment readiness ────────────────────────────────────────────────────── diff --git a/manifests/acb-evolved-bot-deploy-workflowtemplate.yml b/manifests/acb-evolved-bot-deploy-workflowtemplate.yml new file mode 100644 index 0000000..9a64acc --- /dev/null +++ b/manifests/acb-evolved-bot-deploy-workflowtemplate.yml @@ -0,0 +1,450 @@ +# Argo WorkflowTemplate for deploying evolved bots +# Sync to: declarative-config/k8s/apexalgo-iad/argo-workflows/ +# +# Triggered by the evolver when a candidate is promoted. +# The promoter commits bot source to bots/evolved// before triggering. +# +# This workflow: +# 1. Clones the ai-code-battle repo (bot source already committed) +# 2. Generates Dockerfile for the language +# 3. 
Builds container image with Kaniko +# 4. Pushes to Forgejo registry +# 5. Creates K8s Secret, Deployment, Service manifests +# 6. Commits manifests to declarative-config repo (ArgoCD syncs them) +--- +apiVersion: argoproj.io/v1alpha1 +kind: WorkflowTemplate +metadata: + name: acb-evolved-bot-deploy + namespace: argo-workflows + labels: + app: acb-evolved-bot-deploy +spec: + entrypoint: deploy-evolved-bot + serviceAccountName: argo-workflow + arguments: + parameters: + - name: bot_name + # e.g., acb-evo-123 + - name: bot_secret + # Base64-encoded bot shared secret + - name: language + # go, python, rust, typescript, java, php + - name: island + # Evolution island identifier (alpha, beta, gamma, delta) + - name: generation + # Generation number + - name: program_id + # Program ID from database + - name: bot_repo + value: https://forgejo.ardenone.com/ai-code-battle/ai-code-battle.git + - name: bot_branch + value: master + - name: bot_path + # Path to bot source in repo (relative to repo root) + # The promoter writes to bots/evolved// + - name: declarative_config_repo + value: https://forgejo.ardenone.com/infra/ardenone-cluster.git + - name: declarative_config_branch + value: main + - name: registry + value: forgejo.ardenone.com/ai-code-battle + - name: namespace + value: ai-code-battle + - name: bot_port + value: "8080" + volumes: + - name: workspace + emptyDir: {} + - name: docker-config + secret: + secretName: forgejo-registry + items: + - key: .dockerconfigjson + path: config.json + templates: + - name: deploy-evolved-bot + dag: + tasks: + - name: clone + template: clone-bot-source + - name: dockerfile + template: generate-dockerfile + dependencies: [clone] + - name: build + template: build-and-push + dependencies: [dockerfile] + - name: manifest + template: create-manifests + dependencies: [build] + - name: commit + template: commit-manifests + dependencies: [manifest] + + - name: clone-bot-source + script: + image: alpine:3.21 + command: [sh, -c] + source: | + 
set -e + apk add --no-cache git >/dev/null 2>&1 + + # Clone the ai-code-battle repo to get bot source + # The promoter has already committed the bot source before triggering this workflow + git clone --depth 1 --branch "{{workflow.parameters.bot_branch}}" \ + "{{workflow.parameters.bot_repo}}" /workspace/bot-src 2>/dev/null + + # Verify bot source exists at expected path + BOT_SRC_PATH="/workspace/bot-src/{{workflow.parameters.bot_path}}" + if [ ! -d "$BOT_SRC_PATH" ]; then + echo "ERROR: Bot source directory not found: $BOT_SRC_PATH" + echo "Contents of /workspace/bot-src:" + find /workspace/bot-src -type d -name "evo*" | head -20 || true + exit 1 + fi + + # Copy bot source to workspace + mkdir -p /workspace/bot + cp -r "$BOT_SRC_PATH"/* /workspace/bot/ + + echo "Bot source copied from $BOT_SRC_PATH:" + ls -la /workspace/bot/ + volumeMounts: + - name: workspace + mountPath: /workspace + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 256Mi + activeDeadlineSeconds: 300 + + - name: generate-dockerfile + script: + image: alpine:3.21 + command: [sh, -c] + source: | + set -e + + BOT_DIR="/workspace/bot" + LANG="{{workflow.parameters.language}}" + PORT="{{workflow.parameters.bot_port}}" + + case "$LANG" in + go) + cat > "${BOT_DIR}/Dockerfile" <<'EOF' + FROM golang:1.24-alpine AS builder + WORKDIR /app + COPY go.mod go.mod + COPY *.go . + RUN go build -o bot . + + FROM alpine:3.21 + WORKDIR /app + COPY --from=builder /app/bot . + ENV BOT_PORT=${PORT} + ENV BOT_SECRET="" + EXPOSE ${PORT} + CMD ["./bot"] + EOF + ;; + python) + cat > "${BOT_DIR}/Dockerfile" <<'EOF' + FROM python:3.12-slim + WORKDIR /app + COPY *.py . 
+ ENV BOT_PORT=${PORT} + ENV BOT_SECRET="" + EXPOSE ${PORT} + CMD ["python3", "bot.py"] + EOF + ;; + rust) + cat > "${BOT_DIR}/Dockerfile" <<'EOF' + FROM rust:1.85-alpine AS builder + WORKDIR /app + COPY Cargo.toml Cargo.toml + COPY src ./src + RUN cargo build --release + + FROM alpine:3.21 + WORKDIR /app + COPY --from=builder /app/target/release/bot . + ENV BOT_PORT=${PORT} + ENV BOT_SECRET="" + EXPOSE ${PORT} + CMD ["./bot"] + EOF + ;; + typescript) + cat > "${BOT_DIR}/Dockerfile" <<'EOF' + FROM node:22-alpine AS builder + WORKDIR /app + COPY *.ts . + RUN npm install -g typescript && tsc --target ES2020 --module commonjs bot.ts + + FROM node:22-alpine + WORKDIR /app + COPY --from=builder /app/bot.js . + ENV BOT_PORT=${PORT} + ENV BOT_SECRET="" + EXPOSE ${PORT} + CMD ["node", "bot.js"] + EOF + ;; + java) + cat > "${BOT_DIR}/Dockerfile" <<'EOF' + FROM eclipse-temurin:21-alpine AS builder + WORKDIR /app + COPY *.java . + RUN javac *.java + + FROM eclipse-temurin:21-jre-alpine + WORKDIR /app + COPY --from=builder /app/*.class . + ENV BOT_PORT=${PORT} + ENV BOT_SECRET="" + EXPOSE ${PORT} + CMD ["java", "Bot"] + EOF + ;; + php) + cat > "${BOT_DIR}/Dockerfile" <<'EOF' + FROM php:8.3-cli-alpine + WORKDIR /app + COPY *.php . 
+ ENV BOT_PORT=${PORT} + ENV BOT_SECRET="" + EXPOSE ${PORT} + CMD ["php", "bot.php"] + EOF + ;; + *) + echo "Unsupported language: $LANG" >&2 + exit 1 + ;; + esac + + # Replace ${PORT} with actual value + sed -i "s/\${PORT}/${PORT}/g" "${BOT_DIR}/Dockerfile" + + echo "Dockerfile generated:" + cat "${BOT_DIR}/Dockerfile" + volumeMounts: + - name: workspace + mountPath: /workspace + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 256Mi + activeDeadlineSeconds: 60 + + - name: build-and-push + script: + image: gcr.io/kaniko-project/executor:latest + command: [kaniko] + args: + - --context=/workspace/bot + - --dockerfile=/workspace/bot/Dockerfile + - --destination={{workflow.parameters.registry}}/{{workflow.parameters.bot_name}}:latest + - --destination={{workflow.parameters.registry}}/{{workflow.parameters.bot_name}}:gen-{{workflow.parameters.generation}} + - --cache=false + volumeMounts: + - name: workspace + mountPath: /workspace + - name: docker-config + mountPath: /kaniko/.docker + resources: + requests: + cpu: 1000m + memory: 2Gi + limits: + cpu: 4000m + memory: 8Gi + activeDeadlineSeconds: 1800 + + - name: create-manifests + script: + image: alpine:3.21 + command: [sh, -c] + source: | + set -e + apk add --no-cache git >/dev/null 2>&1 + + # Clone declarative-config repo + git clone --depth 1 --branch "{{workflow.parameters.declarative_config_branch}}" \ + "{{workflow.parameters.declarative_config_repo}}" /tmp/config 2>/dev/null + + cd /tmp/config + + # Create directory for manifests (flat structure per CLAUDE.md norms) + MANIFEST_BASE="declarative-config/k8s/apexalgo-iad/ai-code-battle" + mkdir -p "${MANIFEST_BASE}" + + BOT_NAME="{{workflow.parameters.bot_name}}" + NAMESPACE="{{workflow.parameters.namespace}}" + ISLAND="{{workflow.parameters.island}}" + GENERATION="{{workflow.parameters.generation}}" + REGISTRY="{{workflow.parameters.registry}}" + PORT="{{workflow.parameters.bot_port}}" + 
SECRET="{{workflow.parameters.bot_secret}}" + + # Secret manifest + cat > "${MANIFEST_BASE}/${BOT_NAME}-secret.yaml" < "${MANIFEST_BASE}/${BOT_NAME}-deployment.yaml" < "${MANIFEST_BASE}/${BOT_NAME}-service.yaml" </dev/null 2>&1 + + cd /tmp/config + + # Configure git + git config user.name "ACB Evolver" + git config user.email "evolver@ai-code-battle.internal" + + MANIFEST_BASE="declarative-config/k8s/apexalgo-iad/ai-code-battle" + BOT_NAME="{{workflow.parameters.bot_name}}" + PROGRAM_ID="{{workflow.parameters.program_id}}" + ISLAND="{{workflow.parameters.island}}" + GENERATION="{{workflow.parameters.generation}}" + + # Stage new manifests + git add "${MANIFEST_BASE}/${BOT_NAME}-secret.yaml" || true + git add "${MANIFEST_BASE}/${BOT_NAME}-deployment.yaml" || true + git add "${MANIFEST_BASE}/${BOT_NAME}-service.yaml" || true + + # Check if there's anything to commit + if git diff --cached --quiet; then + echo "No changes to commit (bot may already exist)" + exit 0 + fi + + # Commit and push + git commit -m "Add evolved bot ${BOT_NAME} (island=${ISLAND} gen=${GENERATION} program_id=${PROGRAM_ID})" + git push origin "{{workflow.parameters.declarative_config_branch}}" + + echo "Manifests committed and pushed to declarative-config" + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 256Mi + activeDeadlineSeconds: 300