feat(evolver): add workflow completion polling to promoter

Per plan §10.8 (deployment pipeline) and §9.8 (Argo Workflows):

- Add waitForWorkflowCompletion() that polls Argo Workflow API
- Add getWorkflowStatus() to fetch workflow phase/status
- Update Promote() to wait for workflow completion before inserting bot record
- Update Promote() to wait for K8s deployment readiness (waitForDeployment)
- Update triggerArgoWorkflow() to return workflow name for polling
- Add acb-evolved-bot-deploy-workflowtemplate.yml to manifests

The promotion flow now:
1. Writes bot source to bots/evolved/<bot_name>/
2. Commits and pushes source to git
3. Triggers Argo WorkflowTemplate
4. Waits for workflow completion (build + manifest commit)
5. Waits for K8s deployment to be ready
6. Inserts bot record into bots table
7. Updates programs table with bot_id/bot_name

This ensures evolved bots have running containers before being marked active.
This commit is contained in:
jedarden 2026-04-22 17:46:33 -04:00
parent b4a975f5bf
commit f4352c6304
2 changed files with 601 additions and 9 deletions

View file

@ -168,10 +168,22 @@ func (p *Promoter) Promote(ctx context.Context, program *db.Program) (*Promotion
}
// Trigger Argo WorkflowTemplate to build container and create K8s manifests.
if err := p.triggerArgoWorkflow(ctx, botName, secret, program); err != nil {
wfName, err := p.triggerArgoWorkflow(ctx, botName, secret, program)
if err != nil {
return nil, fmt.Errorf("trigger argo workflow: %w", err)
}
// Wait for the workflow to complete (build + manifest commit).
if err := p.waitForWorkflowCompletion(ctx, wfName); err != nil {
return nil, fmt.Errorf("workflow completion: %w", err)
}
// Wait for the K8s deployment to be ready (ArgoCD sync + pod startup).
// This polls via kubectl until the deployment reports at least 1 available replica.
if err := p.waitForDeployment(ctx, botName); err != nil {
return nil, fmt.Errorf("deployment readiness: %w", err)
}
// Insert bot record directly into the bots table (same DB as programs).
storedSecret := secret
if p.cfg.EncryptionKey != "" {
@ -614,13 +626,55 @@ func (p *Promoter) gitCommitPushSource(ctx context.Context, botName, msg string)
return run("push", "origin", p.cfg.BotBranch)
}
// gitCommitPush stages, commits, and pushes changes to the bot source
// directory bots/evolved/<botName> inside p.cfg.RepoDir.
//
// When remove is true (retirement) the directory is deleted from both the
// index and the working tree; when false (promotion) its contents are staged.
// If nothing ends up staged the function returns nil without committing or
// pushing. A failing git command is returned as an error that carries the
// command's combined output and wraps the underlying exec error.
func (p *Promoter) gitCommitPush(ctx context.Context, botName, msg string, remove bool) error {
	run := func(args ...string) error {
		cmd := exec.CommandContext(ctx, "git", args...)
		cmd.Dir = p.cfg.RepoDir
		if out, err := cmd.CombinedOutput(); err != nil {
			return fmt.Errorf("git %s: %s: %w", args[0], strings.TrimSpace(string(out)), err)
		}
		return nil
	}

	botPath := filepath.Join("bots", "evolved", botName)
	if remove {
		// `git rm` stages the deletion itself. A follow-up `git add -u` on
		// the now-missing path would abort with "pathspec did not match any
		// files", so it is intentionally not run. --ignore-unmatch keeps
		// retirement idempotent when the directory was never committed.
		if err := run("rm", "-r", "-f", "--ignore-unmatch", "--", botPath); err != nil {
			return err
		}
	} else if err := run("add", "--", botPath); err != nil {
		return err
	}

	// Skip the commit if nothing is staged. Only the index is inspected:
	// unrelated untracked or unstaged files must not trigger an empty commit.
	stagedCmd := exec.CommandContext(ctx, "git", "diff", "--cached", "--name-only")
	stagedCmd.Dir = p.cfg.RepoDir
	staged, err := stagedCmd.Output()
	if err != nil {
		// Do not treat a failing git as "nothing to commit".
		return fmt.Errorf("git diff --cached: %w", err)
	}
	if len(strings.TrimSpace(string(staged))) == 0 {
		return nil
	}

	if err := run("commit", "-m", msg); err != nil {
		return err
	}
	return run("push", "origin", p.cfg.BotBranch)
}
// ── Argo Workflow trigger ───────────────────────────────────────────────────────
// triggerArgoWorkflow submits the acb-evolved-bot-deploy WorkflowTemplate
// with parameters for the bot being promoted.
func (p *Promoter) triggerArgoWorkflow(ctx context.Context, botName, secret string, program *db.Program) error {
// with parameters for the bot being promoted. Returns the workflow name.
func (p *Promoter) triggerArgoWorkflow(ctx context.Context, botName, secret string, program *db.Program) (string, error) {
if p.cfg.ArgoWorkflowServer == "" {
return fmt.Errorf("argo workflow server not configured")
return "", fmt.Errorf("argo workflow server not configured")
}
// Build workflow submission parameters.
@ -659,14 +713,14 @@ func (p *Promoter) triggerArgoWorkflow(ctx context.Context, botName, secret stri
// Marshal to JSON.
wfJSON, err := json.Marshal(wfSpec)
if err != nil {
return fmt.Errorf("marshal workflow: %w", err)
return "", fmt.Errorf("marshal workflow: %w", err)
}
// Submit workflow via Argo API.
url := fmt.Sprintf("%s/api/v1/workflows/%s", p.cfg.ArgoWorkflowServer, p.cfg.ArgoWorkflowNamespace)
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(wfJSON))
if err != nil {
return fmt.Errorf("create request: %w", err)
return "", fmt.Errorf("create request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
if p.cfg.ArgoWorkflowAuthToken != "" {
@ -676,17 +730,105 @@ func (p *Promoter) triggerArgoWorkflow(ctx context.Context, botName, secret stri
client := &http.Client{Timeout: 30 * time.Second}
resp, err := client.Do(req)
if err != nil {
return fmt.Errorf("submit workflow: %w", err)
return "", fmt.Errorf("submit workflow: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
body, _ := io.ReadAll(resp.Body)
return fmt.Errorf("workflow submission failed (status %d): %s", resp.StatusCode, string(body))
return "", fmt.Errorf("workflow submission failed (status %d): %s", resp.StatusCode, string(body))
}
fmt.Printf("promoter: triggered Argo Workflow %s for bot %s\n", wfName, botName)
return nil
return wfName, nil
}
// ── workflow completion polling ───────────────────────────────────────────────────
// waitForWorkflowCompletion blocks until the named Argo Workflow reaches a
// terminal phase, the context is cancelled, or 30 minutes have elapsed.
//
// The workflow is polled every 30 seconds via getWorkflowStatus. Poll errors
// are logged and retried until the deadline has passed. It returns nil only
// when the phase is "Succeeded"; "Failed" and "Error" phases, the timeout,
// and context cancellation all produce an error.
func (p *Promoter) waitForWorkflowCompletion(ctx context.Context, wfName string) error {
	if p.cfg.ArgoWorkflowServer == "" {
		return fmt.Errorf("argo workflow server not configured")
	}

	giveUpAt := time.Now().Add(30 * time.Minute)
	poll := time.NewTicker(30 * time.Second)
	defer poll.Stop()

	fmt.Printf("promoter: waiting for Argo Workflow %s to complete (timeout=30m)…\n", wfName)

	for {
		// Wait for the next tick, bailing out early on cancellation.
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-poll.C:
		}

		status, phase, pollErr := p.getWorkflowStatus(ctx, wfName)
		if pollErr != nil {
			fmt.Printf("promoter: workflow poll error: %v\n", pollErr)
			if time.Now().After(giveUpAt) {
				return fmt.Errorf("workflow poll timeout after error: %w", pollErr)
			}
			continue
		}

		fmt.Printf("promoter: workflow %s status=%s phase=%s\n", wfName, status, phase)

		if phase == "Succeeded" {
			fmt.Printf("promoter: workflow %s completed successfully\n", wfName)
			return nil
		}
		if phase == "Failed" || phase == "Error" {
			return fmt.Errorf("workflow %s failed with phase %s (status: %s)", wfName, phase, status)
		}

		// Still pending/running: keep polling unless the deadline has passed.
		if time.Now().After(giveUpAt) {
			return fmt.Errorf("workflow %s did not complete after 30 minutes (last phase: %s)", wfName, phase)
		}
	}
}
// getWorkflowStatus issues a GET against the Argo Workflow API for wfName in
// the configured namespace and reports its state.
//
// status is "finished" when the response carries a non-empty
// status.finishedAt and "running" otherwise; phase is the raw status.phase
// string from the response. Any transport failure, non-200 response, or
// undecodable body is returned as an error with empty status and phase.
func (p *Promoter) getWorkflowStatus(ctx context.Context, wfName string) (status, phase string, err error) {
	endpoint := fmt.Sprintf("%s/api/v1/workflows/%s/%s", p.cfg.ArgoWorkflowServer, p.cfg.ArgoWorkflowNamespace, wfName)

	req, err := http.NewRequestWithContext(ctx, "GET", endpoint, nil)
	if err != nil {
		return "", "", fmt.Errorf("create request: %w", err)
	}
	// The bearer token is optional; unauthenticated servers leave it empty.
	if p.cfg.ArgoWorkflowAuthToken != "" {
		req.Header.Set("Authorization", "Bearer "+p.cfg.ArgoWorkflowAuthToken)
	}

	httpClient := &http.Client{Timeout: 30 * time.Second}
	resp, err := httpClient.Do(req)
	if err != nil {
		return "", "", fmt.Errorf("get workflow: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best-effort read: the body is only used to enrich the error text.
		raw, _ := io.ReadAll(resp.Body)
		return "", "", fmt.Errorf("unexpected status %d: %s", resp.StatusCode, string(raw))
	}

	// Only the status sub-object of the workflow resource is decoded.
	var payload struct {
		Status struct {
			Phase      string `json:"phase"`
			StartedAt  string `json:"startedAt"`
			FinishedAt string `json:"finishedAt"`
		} `json:"status"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
		return "", "", fmt.Errorf("decode response: %w", err)
	}

	if payload.Status.FinishedAt == "" {
		return "running", payload.Status.Phase, nil
	}
	return "finished", payload.Status.Phase, nil
}
// ── deployment readiness ──────────────────────────────────────────────────────

View file

@ -0,0 +1,450 @@
# Argo WorkflowTemplate for deploying evolved bots
# Sync to: declarative-config/k8s/apexalgo-iad/argo-workflows/
#
# Triggered by the evolver when a candidate is promoted.
# The promoter commits bot source to bots/evolved/<bot_name>/ before triggering.
#
# This workflow:
# 1. Clones the ai-code-battle repo (bot source already committed)
# 2. Generates Dockerfile for the language
# 3. Builds container image with Kaniko
# 4. Pushes to Forgejo registry
# 5. Creates K8s Secret, Deployment, Service manifests
# 6. Commits manifests to declarative-config repo (ArgoCD syncs them)
---
apiVersion: argoproj.io/v1alpha1
kind: WorkflowTemplate
metadata:
  name: acb-evolved-bot-deploy
  namespace: argo-workflows
  labels:
    app: acb-evolved-bot-deploy
spec:
  entrypoint: deploy-evolved-bot
  serviceAccountName: argo-workflow
  arguments:
    parameters:
      - name: bot_name
        # e.g., acb-evo-123
      - name: bot_secret
        # Base64-encoded bot shared secret
      - name: language
        # go, python, rust, typescript, java, php
      - name: island
        # Evolution island identifier (alpha, beta, gamma, delta)
      - name: generation
        # Generation number
      - name: program_id
        # Program ID from database
      - name: bot_repo
        value: https://forgejo.ardenone.com/ai-code-battle/ai-code-battle.git
      - name: bot_branch
        value: master
      - name: bot_path
        # Path to bot source in repo (relative to repo root)
        # The promoter writes to bots/evolved/<bot_name>/
      - name: declarative_config_repo
        value: https://forgejo.ardenone.com/infra/ardenone-cluster.git
      - name: declarative_config_branch
        value: main
      - name: registry
        value: forgejo.ardenone.com/ai-code-battle
      - name: namespace
        value: ai-code-battle
      - name: bot_port
        value: "8080"
  # Every DAG task runs in its own pod. An emptyDir volume is created per pod,
  # so it cannot carry files from one task to the next. A per-workflow PVC is
  # used instead so that clone -> dockerfile -> build and manifest -> commit
  # all see the same /workspace.
  # NOTE(review): assumes the cluster has a default StorageClass and that 1Gi
  # is enough for bot source plus the declarative-config checkout - confirm.
  volumeClaimTemplates:
    - metadata:
        name: workspace
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 1Gi
  volumes:
    - name: docker-config
      secret:
        secretName: forgejo-registry
        items:
          - key: .dockerconfigjson
            path: config.json
  templates:
    - name: deploy-evolved-bot
      dag:
        tasks:
          - name: clone
            template: clone-bot-source
          - name: dockerfile
            template: generate-dockerfile
            dependencies: [clone]
          - name: build
            template: build-and-push
            dependencies: [dockerfile]
          - name: manifest
            template: create-manifests
            dependencies: [build]
          - name: commit
            template: commit-manifests
            dependencies: [manifest]

    # Clones the bot repository and copies the bot's source into /workspace/bot.
    - name: clone-bot-source
      activeDeadlineSeconds: 300
      script:
        image: alpine:3.21
        # Argo appends the path of the staged script file to the command, so
        # the interpreter is invoked directly (no -c).
        command: [sh]
        source: |
          set -e
          apk add --no-cache git >/dev/null 2>&1
          # Clone the ai-code-battle repo to get bot source.
          # The promoter has already committed the bot source before triggering this workflow.
          # stderr is left visible so clone failures can be diagnosed from the task log.
          rm -rf /workspace/bot-src /workspace/bot
          git clone --depth 1 --branch "{{workflow.parameters.bot_branch}}" \
            "{{workflow.parameters.bot_repo}}" /workspace/bot-src
          # Verify bot source exists at expected path
          BOT_SRC_PATH="/workspace/bot-src/{{workflow.parameters.bot_path}}"
          if [ ! -d "$BOT_SRC_PATH" ]; then
            echo "ERROR: Bot source directory not found: $BOT_SRC_PATH"
            echo "Contents of /workspace/bot-src:"
            find /workspace/bot-src -type d -name "evo*" | head -20 || true
            exit 1
          fi
          # Copy bot source to workspace ("/." also copies dotfiles and does
          # not fail on an empty directory, unlike a "/*" glob)
          mkdir -p /workspace/bot
          cp -a "$BOT_SRC_PATH"/. /workspace/bot/
          echo "Bot source copied from $BOT_SRC_PATH:"
          ls -la /workspace/bot/
        volumeMounts:
          - name: workspace
            mountPath: /workspace
        resources:
          requests:
            cpu: 100m
            memory: 128Mi
          limits:
            cpu: 500m
            memory: 256Mi

    # Writes a language-specific Dockerfile into /workspace/bot.
    - name: generate-dockerfile
      activeDeadlineSeconds: 60
      script:
        image: alpine:3.21
        command: [sh]
        source: |
          set -e
          BOT_DIR="/workspace/bot"
          # Not named LANG: that would clobber the locale variable.
          BOT_LANG="{{workflow.parameters.language}}"
          PORT="{{workflow.parameters.bot_port}}"
          # The heredocs are quoted ('EOF') so their content is written
          # literally; the ${PORT} placeholder is substituted by sed below.
          case "$BOT_LANG" in
            go)
              cat > "${BOT_DIR}/Dockerfile" <<'EOF'
          FROM golang:1.24-alpine AS builder
          WORKDIR /app
          COPY go.mod go.mod
          COPY *.go .
          RUN go build -o bot .
          FROM alpine:3.21
          WORKDIR /app
          COPY --from=builder /app/bot .
          ENV BOT_PORT=${PORT}
          ENV BOT_SECRET=""
          EXPOSE ${PORT}
          CMD ["./bot"]
          EOF
              ;;
            python)
              cat > "${BOT_DIR}/Dockerfile" <<'EOF'
          FROM python:3.12-slim
          WORKDIR /app
          COPY *.py .
          ENV BOT_PORT=${PORT}
          ENV BOT_SECRET=""
          EXPOSE ${PORT}
          CMD ["python3", "bot.py"]
          EOF
              ;;
            rust)
              cat > "${BOT_DIR}/Dockerfile" <<'EOF'
          FROM rust:1.85-alpine AS builder
          WORKDIR /app
          COPY Cargo.toml Cargo.toml
          COPY src ./src
          RUN cargo build --release
          FROM alpine:3.21
          WORKDIR /app
          COPY --from=builder /app/target/release/bot .
          ENV BOT_PORT=${PORT}
          ENV BOT_SECRET=""
          EXPOSE ${PORT}
          CMD ["./bot"]
          EOF
              ;;
            typescript)
              cat > "${BOT_DIR}/Dockerfile" <<'EOF'
          FROM node:22-alpine AS builder
          WORKDIR /app
          COPY *.ts .
          RUN npm install -g typescript && tsc --target ES2020 --module commonjs bot.ts
          FROM node:22-alpine
          WORKDIR /app
          COPY --from=builder /app/bot.js .
          ENV BOT_PORT=${PORT}
          ENV BOT_SECRET=""
          EXPOSE ${PORT}
          CMD ["node", "bot.js"]
          EOF
              ;;
            java)
              cat > "${BOT_DIR}/Dockerfile" <<'EOF'
          FROM eclipse-temurin:21-alpine AS builder
          WORKDIR /app
          COPY *.java .
          RUN javac *.java
          FROM eclipse-temurin:21-jre-alpine
          WORKDIR /app
          COPY --from=builder /app/*.class .
          ENV BOT_PORT=${PORT}
          ENV BOT_SECRET=""
          EXPOSE ${PORT}
          CMD ["java", "Bot"]
          EOF
              ;;
            php)
              cat > "${BOT_DIR}/Dockerfile" <<'EOF'
          FROM php:8.3-cli-alpine
          WORKDIR /app
          COPY *.php .
          ENV BOT_PORT=${PORT}
          ENV BOT_SECRET=""
          EXPOSE ${PORT}
          CMD ["php", "bot.php"]
          EOF
              ;;
            *)
              echo "Unsupported language: $BOT_LANG" >&2
              exit 1
              ;;
          esac
          # Replace ${PORT} with actual value
          sed -i "s/\${PORT}/${PORT}/g" "${BOT_DIR}/Dockerfile"
          echo "Dockerfile generated:"
          cat "${BOT_DIR}/Dockerfile"
        volumeMounts:
          - name: workspace
            mountPath: /workspace
        resources:
          requests:
            cpu: 100m
            memory: 128Mi
          limits:
            cpu: 500m
            memory: 256Mi

    # Builds the image with Kaniko and pushes it under :latest and :gen-<n>.
    # This is a container template (not a script template): there is no
    # script source, and the kaniko image's binary is /kaniko/executor.
    - name: build-and-push
      activeDeadlineSeconds: 1800
      container:
        image: gcr.io/kaniko-project/executor:latest
        command: [/kaniko/executor]
        args:
          - --context=/workspace/bot
          - --dockerfile=/workspace/bot/Dockerfile
          - --destination={{workflow.parameters.registry}}/{{workflow.parameters.bot_name}}:latest
          - --destination={{workflow.parameters.registry}}/{{workflow.parameters.bot_name}}:gen-{{workflow.parameters.generation}}
          - --cache=false
        volumeMounts:
          - name: workspace
            mountPath: /workspace
          - name: docker-config
            mountPath: /kaniko/.docker
        resources:
          requests:
            cpu: 1000m
            memory: 2Gi
          limits:
            cpu: 4000m
            memory: 8Gi

    # Clones the declarative-config repo into /workspace/config and writes the
    # Secret, Deployment and Service manifests for the bot into it. The
    # checkout lives on the shared workspace volume so that commit-manifests
    # (a separate pod) can commit and push it.
    - name: create-manifests
      activeDeadlineSeconds: 300
      script:
        image: alpine:3.21
        command: [sh]
        source: |
          set -e
          apk add --no-cache git >/dev/null 2>&1
          # Clone declarative-config repo (stderr left visible for diagnosis)
          rm -rf /workspace/config
          git clone --depth 1 --branch "{{workflow.parameters.declarative_config_branch}}" \
            "{{workflow.parameters.declarative_config_repo}}" /workspace/config
          cd /workspace/config
          # Create directory for manifests (flat structure per CLAUDE.md norms)
          MANIFEST_BASE="declarative-config/k8s/apexalgo-iad/ai-code-battle"
          mkdir -p "${MANIFEST_BASE}"
          BOT_NAME="{{workflow.parameters.bot_name}}"
          NAMESPACE="{{workflow.parameters.namespace}}"
          ISLAND="{{workflow.parameters.island}}"
          GENERATION="{{workflow.parameters.generation}}"
          REGISTRY="{{workflow.parameters.registry}}"
          PORT="{{workflow.parameters.bot_port}}"
          SECRET="{{workflow.parameters.bot_secret}}"
          # Secret manifest (bot_secret is expected to be base64 already,
          # because it is written under `data`, not `stringData`)
          cat > "${MANIFEST_BASE}/${BOT_NAME}-secret.yaml" <<EOF
          apiVersion: v1
          kind: Secret
          metadata:
            name: ${BOT_NAME}-secret
            namespace: ${NAMESPACE}
            labels:
              app.kubernetes.io/name: ${BOT_NAME}
              app.kubernetes.io/part-of: ai-code-battle
              app.kubernetes.io/component: evolved-bot
          type: Opaque
          data:
            bot-secret: ${SECRET}
          EOF
          # Deployment manifest
          cat > "${MANIFEST_BASE}/${BOT_NAME}-deployment.yaml" <<EOF
          apiVersion: apps/v1
          kind: Deployment
          metadata:
            name: ${BOT_NAME}
            namespace: ${NAMESPACE}
            labels:
              app.kubernetes.io/name: ${BOT_NAME}
              app.kubernetes.io/part-of: ai-code-battle
              app.kubernetes.io/component: evolved-bot
              acb/island: ${ISLAND}
          spec:
            replicas: 1
            selector:
              matchLabels:
                app.kubernetes.io/name: ${BOT_NAME}
            template:
              metadata:
                labels:
                  app.kubernetes.io/name: ${BOT_NAME}
                  app.kubernetes.io/part-of: ai-code-battle
                  app.kubernetes.io/component: evolved-bot
                  acb/island: ${ISLAND}
              spec:
                containers:
                  - name: bot
                    image: ${REGISTRY}/${BOT_NAME}:latest
                    env:
                      - name: BOT_PORT
                        value: "${PORT}"
                      - name: BOT_SECRET
                        valueFrom:
                          secretKeyRef:
                            name: ${BOT_NAME}-secret
                            key: bot-secret
                    ports:
                      - name: http
                        containerPort: ${PORT}
                        protocol: TCP
                    livenessProbe:
                      httpGet:
                        path: /health
                        port: http
                      initialDelaySeconds: 5
                      periodSeconds: 30
                    readinessProbe:
                      httpGet:
                        path: /health
                        port: http
                      initialDelaySeconds: 3
                      periodSeconds: 10
                    resources:
                      requests:
                        cpu: 50m
                        memory: 64Mi
                      limits:
                        memory: 128Mi
                restartPolicy: Always
          EOF
          # Service manifest
          cat > "${MANIFEST_BASE}/${BOT_NAME}-service.yaml" <<EOF
          apiVersion: v1
          kind: Service
          metadata:
            name: ${BOT_NAME}
            namespace: ${NAMESPACE}
            labels:
              app.kubernetes.io/name: ${BOT_NAME}
              app.kubernetes.io/part-of: ai-code-battle
              app.kubernetes.io/component: evolved-bot
          spec:
            type: ClusterIP
            selector:
              app.kubernetes.io/name: ${BOT_NAME}
            ports:
              - name: http
                port: ${PORT}
                targetPort: http
                protocol: TCP
          EOF
          echo "Manifests created in ${MANIFEST_BASE}:"
          ls -la "${MANIFEST_BASE}" | grep "${BOT_NAME}" || true
        volumeMounts:
          - name: workspace
            mountPath: /workspace
        resources:
          requests:
            cpu: 100m
            memory: 128Mi
          limits:
            cpu: 500m
            memory: 256Mi

    # Commits the manifests written by create-manifests and pushes them to the
    # declarative-config repo so ArgoCD can sync them.
    # NOTE(review): no push credentials are configured in this template; the
    # push presumably relies on credentials provided elsewhere - confirm.
    - name: commit-manifests
      activeDeadlineSeconds: 300
      script:
        image: alpine:3.21
        command: [sh]
        source: |
          set -e
          apk add --no-cache git >/dev/null 2>&1
          cd /workspace/config
          # Configure git
          git config user.name "ACB Evolver"
          git config user.email "evolver@ai-code-battle.internal"
          MANIFEST_BASE="declarative-config/k8s/apexalgo-iad/ai-code-battle"
          BOT_NAME="{{workflow.parameters.bot_name}}"
          PROGRAM_ID="{{workflow.parameters.program_id}}"
          ISLAND="{{workflow.parameters.island}}"
          GENERATION="{{workflow.parameters.generation}}"
          # Stage new manifests. A missing manifest is a hard error: silently
          # ignoring it would let the workflow succeed without deploying.
          git add "${MANIFEST_BASE}/${BOT_NAME}-secret.yaml"
          git add "${MANIFEST_BASE}/${BOT_NAME}-deployment.yaml"
          git add "${MANIFEST_BASE}/${BOT_NAME}-service.yaml"
          # Check if there's anything to commit
          if git diff --cached --quiet; then
            echo "No changes to commit (bot may already exist)"
            exit 0
          fi
          # Commit and push
          git commit -m "Add evolved bot ${BOT_NAME} (island=${ISLAND} gen=${GENERATION} program_id=${PROGRAM_ID})"
          git push origin "{{workflow.parameters.declarative_config_branch}}"
          echo "Manifests committed and pushed to declarative-config"
        volumeMounts:
          - name: workspace
            mountPath: /workspace
        resources:
          requests:
            cpu: 100m
            memory: 128Mi
          limits:
            cpu: 500m
            memory: 256Mi