feat(bot): add Pacifist bot (JavaScript) — non-aggressive attrition archetype
PacifistBot never attacks; it survives by maximizing distance from enemies and retreating toward own core when cornered. Pure evasion strategy that wins via opponent elimination by third parties. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
5362b6c011
commit
5a1130c77a
12 changed files with 674 additions and 1 deletions
12
bots/pacifist/Dockerfile
Normal file
12
bots/pacifist/Dockerfile
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
# PacifistBot image: a plain Node.js HTTP server. No install step is run;
# only package.json and the three source files are copied in.
FROM node:22-alpine

WORKDIR /app

COPY package.json ./
# With more than one source, the COPY destination must be a directory and
# should end with "/" so every builder treats it as one.
COPY index.js strategy.js grid.js ./

ENV BOT_PORT=8080
# Placeholder only: index.js exits at startup when BOT_SECRET is empty, so a
# real secret has to be supplied at run time.
ENV BOT_SECRET=""

EXPOSE 8080

CMD ["node", "index.js"]
|
||||
34
bots/pacifist/grid.js
Normal file
34
bots/pacifist/grid.js
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
/**
|
||||
* Grid utility functions for AI Code Battle.
|
||||
* Toroidal distance calculations, wrap-around single-step moves, and position keys.
|
||||
*/
|
||||
|
||||
/**
 * Shortest separation between two coordinates on an axis that wraps around.
 *
 * @param {number} a - First coordinate on the axis.
 * @param {number} b - Second coordinate on the axis.
 * @param {number} size - Length of the axis (row count or column count).
 * @returns {number} The smaller of the direct and the wrap-around separation.
 */
function toroidalDelta(a, b, size) {
  const d = Math.abs(a - b);
  return Math.min(d, size - d);
}

/**
 * Squared Euclidean distance between two cells on the toroidal grid.
 *
 * @param {number} r1 - Row of the first cell.
 * @param {number} c1 - Column of the first cell.
 * @param {number} r2 - Row of the second cell.
 * @param {number} c2 - Column of the second cell.
 * @param {number} rows - Number of rows in the grid.
 * @param {number} cols - Number of columns in the grid.
 * @returns {number} dr^2 + dc^2, using wrap-around deltas on both axes.
 */
function distance2(r1, c1, r2, c2, rows, cols) {
  const dr = toroidalDelta(r1, r2, rows);
  const dc = toroidalDelta(c1, c2, cols);
  // Both deltas must be squared; adding the raw column delta under-weights
  // horizontal separation and breaks comparisons against a squared radius.
  return dr * dr + dc * dc;
}
|
||||
|
||||
/**
 * Manhattan distance between two cells, measured with wrap-around on both axes.
 *
 * @returns {number} Sum of the toroidal row gap and the toroidal column gap.
 */
function manhattan(r1, c1, r2, c2, rows, cols) {
  const rowGap = toroidalDelta(r1, r2, rows);
  const colGap = toroidalDelta(c1, c2, cols);
  return rowGap + colGap;
}
|
||||
|
||||
/**
 * Cell reached by taking one step from (row, col) in a compass direction,
 * wrapping around the grid edges.
 *
 * @param {number} row - Starting row.
 * @param {number} col - Starting column.
 * @param {string} dir - One of "N", "E", "S", "W".
 * @param {number} rows - Number of rows in the grid.
 * @param {number} cols - Number of columns in the grid.
 * @returns {number[]|undefined} `[row, col]` of the destination, or
 *   `undefined` when `dir` is not one of the four directions.
 */
function moveDir(row, col, dir, rows, cols) {
  if (dir === "N") {
    return [(row - 1 + rows) % rows, col];
  }
  if (dir === "E") {
    return [row, (col + 1) % cols];
  }
  if (dir === "S") {
    return [(row + 1) % rows, col];
  }
  if (dir === "W") {
    return [row, (col - 1 + cols) % cols];
  }
  return undefined;
}
|
||||
|
||||
/**
 * String key "row,col" for a cell, suitable for use in a Set or Map.
 *
 * @param {number} r - Row.
 * @param {number} c - Column.
 * @returns {string} The two coordinates joined by a comma.
 */
function posKey(r, c) {
  return "".concat(r, ",", c);
}
|
||||
|
||||
module.exports = { distance2, manhattan, moveDir, posKey };
|
||||
109
bots/pacifist/index.js
Normal file
109
bots/pacifist/index.js
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
/**
|
||||
* PacifistBot - Non-aggressive attrition archetype for AI Code Battle.
|
||||
*
|
||||
* Never attacks. Survives by evasion and hopes to outlast opponents
|
||||
* whose bots kill each other off.
|
||||
*
|
||||
* Uses the JavaScript starter kit pattern (zero external dependencies).
|
||||
*/
|
||||
|
||||
const http = require("http");
|
||||
const crypto = require("crypto");
|
||||
const { computeMoves } = require("./strategy");
|
||||
|
||||
const PORT = parseInt(process.env.BOT_PORT || "8080", 10);
|
||||
const SECRET = process.env.BOT_SECRET || "";
|
||||
|
||||
if (!SECRET) {
|
||||
console.error("ERROR: BOT_SECRET environment variable is required");
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
// --- HMAC helpers ---
|
||||
|
||||
/**
 * Check the HMAC-SHA256 signature of an incoming request.
 *
 * The signed string is `matchId.turn.timestamp.sha256hex(body)`, keyed with
 * SECRET.
 *
 * @param {Buffer|string} body - Raw request body.
 * @param {string} matchId - Match id taken from the request headers.
 * @param {string} turn - Turn number taken from the request headers.
 * @param {string} timestamp - Timestamp taken from the request headers.
 * @param {string} signature - Hex-encoded signature supplied by the caller.
 * @returns {boolean} True only when the supplied signature matches; false for
 *   any mismatch, including missing, malformed or wrong-length signatures.
 */
function verifySignature(body, matchId, turn, timestamp, signature) {
  if (typeof signature !== "string") return false;

  const bodyHash = crypto.createHash("sha256").update(body).digest("hex");
  const signingString = `${matchId}.${turn}.${timestamp}.${bodyHash}`;
  const expected = crypto
    .createHmac("sha256", SECRET)
    .update(signingString)
    .digest();

  const provided = Buffer.from(signature, "hex");
  // timingSafeEqual throws when the byte lengths differ, so a truncated or
  // non-hex header would otherwise become an uncaught exception instead of
  // a rejected request.
  if (provided.length !== expected.length) return false;

  return crypto.timingSafeEqual(provided, expected);
}
|
||||
|
||||
/**
 * Produce the hex HMAC-SHA256 signature for an outgoing response.
 *
 * The signed string is `matchId.turn.sha256hex(body)`, keyed with SECRET.
 *
 * @param {Buffer|string} body - Response body to sign.
 * @param {string} matchId - Match id the response belongs to.
 * @param {number|string} turn - Turn number the response belongs to.
 * @returns {string} Hex-encoded signature.
 */
function signResponse(body, matchId, turn) {
  const hasher = crypto.createHash("sha256");
  hasher.update(body);
  const bodyDigest = hasher.digest("hex");

  const hmac = crypto.createHmac("sha256", SECRET);
  hmac.update("".concat(matchId, ".", turn, ".", bodyDigest));
  return hmac.digest("hex");
}
|
||||
|
||||
// --- HTTP server ---
|
||||
|
||||
/**
 * HTTP entry point.
 *
 *   GET  /health -> 200 "OK"
 *   POST /turn   -> verifies the request signature, runs the strategy and
 *                   returns `{ moves }` as JSON with an X-ACB-Signature header
 *   anything else -> 404
 */
const server = http.createServer((req, res) => {
  if (req.method === "GET" && req.url === "/health") {
    res.writeHead(200, { "Content-Type": "text/plain" });
    res.end("OK");
    return;
  }

  if (req.method === "POST" && req.url === "/turn") {
    const chunks = [];
    req.on("data", (chunk) => chunks.push(chunk));
    req.on("end", () => {
      const body = Buffer.concat(chunks);

      const matchId = req.headers["x-acb-match-id"] || "";
      const turn = req.headers["x-acb-turn"] || "0";
      const timestamp = req.headers["x-acb-timestamp"] || "";
      const signature = req.headers["x-acb-signature"] || "";

      // A throw inside this callback is uncaught and would take the whole
      // process down, so a verification failure of any kind means "reject".
      let authentic = false;
      try {
        authentic =
          Boolean(signature) &&
          verifySignature(body, matchId, turn, timestamp, signature);
      } catch {
        authentic = false;
      }
      if (!authentic) {
        res.writeHead(401, { "Content-Type": "text/plain" });
        res.end("Invalid signature");
        return;
      }

      let state;
      try {
        state = JSON.parse(body.toString());
      } catch {
        res.writeHead(400, { "Content-Type": "text/plain" });
        res.end("Invalid JSON");
        return;
      }

      // Valid JSON can still have the wrong shape (e.g. `null` or a missing
      // `config`); answer with an error instead of crashing the server.
      let moves;
      try {
        moves = computeMoves(state);
      } catch (err) {
        console.error("computeMoves failed:", err);
        res.writeHead(500, { "Content-Type": "text/plain" });
        res.end("Strategy error");
        return;
      }

      const responseBody = JSON.stringify({ moves });
      const responseSig = signResponse(
        Buffer.from(responseBody),
        matchId,
        parseInt(turn, 10)
      );

      console.log(`Turn ${state.turn}: ${moves.length} moves`);

      res.writeHead(200, {
        "Content-Type": "application/json",
        "X-ACB-Signature": responseSig,
      });
      res.end(responseBody);
    });
    return;
  }

  res.writeHead(404);
  res.end("Not Found");
});
|
||||
|
||||
server.listen(PORT, () => {
|
||||
console.log(`PacifistBot listening on port ${PORT}`);
|
||||
});
|
||||
12
bots/pacifist/package.json
Normal file
12
bots/pacifist/package.json
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
{
|
||||
"name": "pacifist-bot",
|
||||
"version": "1.0.0",
|
||||
"description": "PacifistBot - Non-aggressive attrition archetype for AI Code Battle",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"start": "node index.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=20.0.0"
|
||||
}
|
||||
}
|
||||
162
bots/pacifist/strategy.js
Normal file
162
bots/pacifist/strategy.js
Normal file
|
|
@ -0,0 +1,162 @@
|
|||
/**
|
||||
* PacifistBot strategy: pure evasion, never attacks.
|
||||
*
|
||||
* - Each bot moves to maximize distance from the nearest visible enemy.
|
||||
* - If cornered (enemy within attack radius), retreat toward own core.
|
||||
* - Never initiates combat; no moves toward enemies.
|
||||
* - Avoids self-collision (two friendly bots on same tile).
|
||||
* - Spawning is automatic (handled by the engine), so we conserve energy
|
||||
* by not rushing into contested energy nodes.
|
||||
*/
|
||||
|
||||
const { distance2, manhattan, moveDir, posKey } = require("./grid");
|
||||
|
||||
const DIRECTIONS = ["N", "E", "S", "W"];
|
||||
|
||||
/**
 * Decide this turn's moves for every bot we own.
 *
 * Each bot scores the four neighbouring tiles and steps onto the best one:
 *   - walls, enemy-occupied tiles and tiles already claimed by an earlier
 *     friendly bot this turn are never candidates;
 *   - with enemies visible, larger squared distance to the nearest enemy
 *     (x10) and to all enemies combined (x0.5) scores higher, and any step
 *     that reduces the nearest-enemy distance costs 20;
 *   - when an enemy is within `attack_radius2` and we own an active core,
 *     each tile of progress toward the nearest core is worth 15;
 *   - with no enemies visible, tiles nearer an own active core score higher.
 * Bots nearest to an enemy choose first. A bot with no candidate tile emits
 * no move and stays where it is.
 *
 * @param {object} state - Game state for this turn. Reads `config.rows`,
 *   `config.cols`, `config.attack_radius2`, `you.id`, `bots[]` (`owner`,
 *   `position`), `walls[]` (`row`, `col`) and `cores[]` (`owner`, `active`,
 *   `position`).
 * @returns {Array<{position: {row: number, col: number}, direction: string}>}
 *   One entry per bot that moves.
 */
function computeMoves(state) {
  const { rows, cols, attack_radius2 } = state.config;
  const myId = state.you.id;

  // Partition bots
  const myBots = [];
  const enemyBots = [];
  for (const bot of state.bots) {
    if (bot.owner === myId) myBots.push(bot);
    else enemyBots.push(bot);
  }
  if (myBots.length === 0) return [];

  const walls = new Set(state.walls.map((w) => posKey(w.row, w.col)));

  // Own active cores — the tiles a cornered bot retreats toward
  const myCores = state.cores.filter((c) => c.owner === myId && c.active);
  const hasCores = myCores.length > 0;

  const enemyPos = enemyBots.map((b) => b.position);
  const hasEnemies = enemyPos.length > 0;
  // Keyed lookup replaces a linear scan of enemies for every candidate tile.
  const enemyTiles = new Set(enemyPos.map((e) => posKey(e.row, e.col)));

  // Each bot's current nearest-enemy distance is needed both for ordering and
  // for the "moving closer" penalty, so compute it once per bot.
  const currentEnemyDist = new Map(
    myBots.map((bot) => [
      bot,
      nearestEnemyDist(bot.position, enemyPos, rows, cols),
    ])
  );

  // Bots closest to enemies get first pick of tiles (they need to flee first)
  myBots.sort((a, b) => currentEnemyDist.get(a) - currentEnemyDist.get(b));

  // Tiles already claimed this turn, to avoid two friendly bots colliding
  const committed = new Set();
  const moves = [];

  for (const bot of myBots) {
    const br = bot.position.row;
    const bc = bot.position.col;

    // Cornered — an enemy is within attack radius of the current tile
    const cornered = isInDanger(br, bc, enemyPos, rows, cols, attack_radius2);
    const currentMinDist = currentEnemyDist.get(bot);
    // Only the cornered branch reads this, so skip the work otherwise.
    const currentCoreDist =
      cornered && hasCores ? nearestCoreDist(br, bc, myCores, rows, cols) : 0;

    let bestDir = null;
    let bestKey = null;
    let bestScore = -Infinity;

    for (const dir of DIRECTIONS) {
      const [nr, nc] = moveDir(br, bc, dir, rows, cols);
      const nk = posKey(nr, nc);

      // Skip walls, enemy-occupied tiles (stepping there would cause combat)
      // and tiles an earlier friendly bot already claimed.
      if (walls.has(nk) || enemyTiles.has(nk) || committed.has(nk)) continue;

      let score = 0;

      if (hasEnemies) {
        // Primary: maximize minimum distance to any enemy
        const minDist = nearestEnemyDist(
          { row: nr, col: nc },
          enemyPos,
          rows,
          cols
        );
        score += minDist * 10;

        // Bonus: also increase total distance to all enemies
        let totalDist = 0;
        for (const e of enemyPos) {
          totalDist += distance2(nr, nc, e.row, e.col, rows, cols);
        }
        score += totalDist * 0.5;

        // Penalty: moving closer to enemies
        if (minDist < currentMinDist) {
          score -= 20;
        }
      }

      if (cornered && hasCores) {
        // When cornered, strong preference for moving toward own core
        const coreDist = nearestCoreDist(nr, nc, myCores, rows, cols);
        score += (currentCoreDist - coreDist) * 15;
      } else if (!hasEnemies && hasCores) {
        // No enemies visible — drift toward own core area
        score -= nearestCoreDist(nr, nc, myCores, rows, cols) * 2;
      }

      // Strict ">" keeps the earliest direction in DIRECTIONS on ties.
      if (score > bestScore) {
        bestScore = score;
        bestDir = dir;
        bestKey = nk;
      }
    }

    if (bestDir) {
      // Candidates were filtered against `committed`, so this tile is free.
      committed.add(bestKey);
      moves.push({
        position: { row: br, col: bc },
        direction: bestDir,
      });
    } else {
      // No candidate tile: hold position and mark the tile as taken for the
      // bots that choose after this one.
      // NOTE(review): an earlier bot may already have claimed this tile, in
      // which case the two still end up together — confirm how the engine
      // resolves that.
      committed.add(posKey(br, bc));
    }
  }

  return moves;
}
|
||||
|
||||
/**
 * Smallest `distance2` value from `pos` to any enemy position.
 *
 * @param {{row: number, col: number}} pos - Cell to measure from.
 * @param {Array<{row: number, col: number}>} enemyPos - Enemy positions.
 * @param {number} rows - Number of rows in the grid.
 * @param {number} cols - Number of columns in the grid.
 * @returns {number} The minimum distance, or Infinity when there are no enemies.
 */
function nearestEnemyDist(pos, enemyPos, rows, cols) {
  return enemyPos.reduce((closest, enemy) => {
    const gap = distance2(pos.row, pos.col, enemy.row, enemy.col, rows, cols);
    return gap < closest ? gap : closest;
  }, Infinity);
}
|
||||
|
||||
/**
 * Whether any enemy lies within the given `distance2` threshold of (r, c).
 *
 * @param {number} r - Row of the cell being checked.
 * @param {number} c - Column of the cell being checked.
 * @param {Array<{row: number, col: number}>} enemyPos - Enemy positions.
 * @param {number} rows - Number of rows in the grid.
 * @param {number} cols - Number of columns in the grid.
 * @param {number} attackRadius2 - Threshold compared against `distance2`.
 * @returns {boolean} True when at least one enemy is at or inside the threshold.
 */
function isInDanger(r, c, enemyPos, rows, cols, attackRadius2) {
  return enemyPos.some(
    (enemy) => distance2(r, c, enemy.row, enemy.col, rows, cols) <= attackRadius2
  );
}
|
||||
|
||||
/**
 * Smallest `manhattan` distance from (r, c) to any of the given cores.
 *
 * @param {number} r - Row of the cell to measure from.
 * @param {number} c - Column of the cell to measure from.
 * @param {Array<{position: {row: number, col: number}}>} cores - Cores to consider.
 * @param {number} rows - Number of rows in the grid.
 * @param {number} cols - Number of columns in the grid.
 * @returns {number} The minimum distance, or Infinity when `cores` is empty.
 */
function nearestCoreDist(r, c, cores, rows, cols) {
  return cores.reduce((closest, { position }) => {
    const steps = manhattan(r, c, position.row, position.col, rows, cols);
    return steps < closest ? steps : closest;
  }, Infinity);
}
|
||||
|
||||
module.exports = { computeMoves };
|
||||
|
|
@ -218,6 +218,7 @@ func (m *Matchmaker) tickHealthChecker(ctx context.Context) {
|
|||
|
||||
client := &http.Client{Timeout: time.Duration(m.cfg.BotTimeoutSecs) * time.Second}
|
||||
|
||||
var activeCount, failingCount int
|
||||
for _, bot := range bots {
|
||||
healthy := false
|
||||
resp, err := client.Get(bot.Endpoint + "/health")
|
||||
|
|
@ -227,6 +228,7 @@ func (m *Matchmaker) tickHealthChecker(ctx context.Context) {
|
|||
}
|
||||
|
||||
if healthy {
|
||||
activeCount++
|
||||
if bot.Status == "inactive" || bot.ConsecFails > 0 {
|
||||
m.db.ExecContext(ctx,
|
||||
`UPDATE bots SET status = 'active', consec_fails = 0, last_active = NOW()
|
||||
|
|
@ -237,6 +239,7 @@ func (m *Matchmaker) tickHealthChecker(ctx context.Context) {
|
|||
}
|
||||
}
|
||||
} else {
|
||||
failingCount++
|
||||
newFails := bot.ConsecFails + 1
|
||||
newStatus := bot.Status
|
||||
if newFails >= m.cfg.MaxConsecFails {
|
||||
|
|
@ -248,10 +251,13 @@ func (m *Matchmaker) tickHealthChecker(ctx context.Context) {
|
|||
if newStatus != bot.Status {
|
||||
log.Printf("health-checker: %s marked inactive after %d failures", bot.ID, newFails)
|
||||
m.alerter.BotMarkedInactive(ctx, bot.ID, newFails)
|
||||
metrics.BotCrashed.Inc()
|
||||
metrics.BotCrashed.Inc()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
metrics.BotsActive.Set(float64(activeCount))
|
||||
metrics.BotsFailing.Set(float64(failingCount))
|
||||
}
|
||||
|
||||
// tickStaleReaper re-enqueues jobs that have been running too long.
|
||||
|
|
|
|||
|
|
@ -190,6 +190,7 @@ func (w *Worker) pollAndExecute(ctx context.Context) error {
|
|||
}
|
||||
|
||||
w.metrics.RecordJobClaimed()
|
||||
metrics.WorkerJobsClaimedTotal.Inc()
|
||||
w.logger.Printf("Claimed job %s, executing match...", job.ID)
|
||||
|
||||
// Execute the match
|
||||
|
|
@ -197,6 +198,7 @@ func (w *Worker) pollAndExecute(ctx context.Context) error {
|
|||
result, replay, err := w.executeMatch(ctx, claimData)
|
||||
if err != nil {
|
||||
w.metrics.RecordMatchError()
|
||||
metrics.WorkerMatchErrorsTotal.Inc()
|
||||
w.logger.Printf("Match execution failed: %v", err)
|
||||
// Mark job as failed
|
||||
if failErr := w.db.FailJob(ctx, job.ID, w.cfg.WorkerID, err.Error()); failErr != nil {
|
||||
|
|
@ -207,6 +209,8 @@ func (w *Worker) pollAndExecute(ctx context.Context) error {
|
|||
}
|
||||
w.metrics.RecordMatch(time.Since(matchStart))
|
||||
metrics.MatchThroughput.Inc()
|
||||
metrics.WorkerMatchesTotal.Inc()
|
||||
metrics.WorkerMatchDuration.Observe(time.Since(matchStart).Seconds())
|
||||
// Upload replay to B2
|
||||
replayURL := ""
|
||||
if w.b2 != nil {
|
||||
|
|
|
|||
|
|
@ -32,6 +32,9 @@ spec:
|
|||
ports:
|
||||
- containerPort: 8080
|
||||
protocol: TCP
|
||||
- containerPort: 9090
|
||||
protocol: TCP
|
||||
name: metrics
|
||||
env:
|
||||
- name: ACB_LISTEN_ADDR
|
||||
value: ":8080"
|
||||
|
|
@ -90,6 +93,10 @@ metadata:
|
|||
labels:
|
||||
app.kubernetes.io/name: acb-api
|
||||
app.kubernetes.io/part-of: ai-code-battle
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "9090"
|
||||
prometheus.io/path: "/metrics"
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
|
|
@ -99,6 +106,10 @@ spec:
|
|||
targetPort: 8080
|
||||
protocol: TCP
|
||||
name: http
|
||||
- port: 9090
|
||||
targetPort: 9090
|
||||
protocol: TCP
|
||||
name: metrics
|
||||
---
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: IngressRoute
|
||||
|
|
|
|||
|
|
@ -36,6 +36,10 @@ spec:
|
|||
image: ronaldraygun/acb-evolver:latest
|
||||
imagePullPolicy: Always
|
||||
args: ["run", "-continuous"]
|
||||
ports:
|
||||
- containerPort: 9090
|
||||
protocol: TCP
|
||||
name: metrics
|
||||
env:
|
||||
- name: ACB_DATABASE_URL
|
||||
valueFrom:
|
||||
|
|
|
|||
226
manifests/acb-metrics-monitoring.yml
Normal file
226
manifests/acb-metrics-monitoring.yml
Normal file
|
|
@ -0,0 +1,226 @@
|
|||
# Prometheus monitoring infrastructure for AI Code Battle per plan §9.9.
|
||||
#
|
||||
# Includes:
|
||||
# - Services exposing metrics port for all services
|
||||
# - ServiceMonitor for Prometheus Operator scraping
|
||||
# - PrometheusRule with alert thresholds per §9.9
|
||||
#
|
||||
# Staging file — sync to declarative-config/k8s/apexalgo-iad/ai-code-battle/
|
||||
|
||||
# --- Services with metrics port ---
|
||||
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: acb-matchmaker-metrics
|
||||
namespace: ai-code-battle
|
||||
labels:
|
||||
app.kubernetes.io/name: acb-matchmaker
|
||||
app.kubernetes.io/part-of: ai-code-battle
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "9090"
|
||||
prometheus.io/path: "/metrics"
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app.kubernetes.io/name: acb-matchmaker
|
||||
ports:
|
||||
- port: 9090
|
||||
targetPort: 9090
|
||||
protocol: TCP
|
||||
name: metrics
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: acb-worker-metrics
|
||||
namespace: ai-code-battle
|
||||
labels:
|
||||
app.kubernetes.io/name: acb-worker
|
||||
app.kubernetes.io/part-of: ai-code-battle
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "9090"
|
||||
prometheus.io/path: "/metrics"
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app.kubernetes.io/name: acb-worker
|
||||
ports:
|
||||
- port: 9090
|
||||
targetPort: 9090
|
||||
protocol: TCP
|
||||
name: metrics
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: acb-index-builder-metrics
|
||||
namespace: ai-code-battle
|
||||
labels:
|
||||
app.kubernetes.io/name: acb-index-builder
|
||||
app.kubernetes.io/part-of: ai-code-battle
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "9090"
|
||||
prometheus.io/path: "/metrics"
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app.kubernetes.io/name: acb-index-builder
|
||||
ports:
|
||||
- port: 9090
|
||||
targetPort: 9090
|
||||
protocol: TCP
|
||||
name: metrics
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: acb-evolver-metrics
|
||||
namespace: ai-code-battle
|
||||
labels:
|
||||
app.kubernetes.io/name: acb-evolver
|
||||
app.kubernetes.io/part-of: ai-code-battle
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "9090"
|
||||
prometheus.io/path: "/metrics"
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app.kubernetes.io/name: acb-evolver
|
||||
ports:
|
||||
- port: 9090
|
||||
targetPort: 9090
|
||||
protocol: TCP
|
||||
name: metrics
|
||||
|
||||
---
|
||||
# --- ServiceMonitor for Prometheus Operator ---
|
||||
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: acb-services
|
||||
namespace: ai-code-battle
|
||||
labels:
|
||||
app.kubernetes.io/part-of: ai-code-battle
|
||||
release: prometheus
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/part-of: ai-code-battle
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- ai-code-battle
|
||||
endpoints:
|
||||
- port: metrics
|
||||
path: /metrics
|
||||
interval: 30s
|
||||
|
||||
---
|
||||
# --- PrometheusRule with alerts per §9.9 ---
|
||||
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: acb-alerts
|
||||
namespace: ai-code-battle
|
||||
labels:
|
||||
app.kubernetes.io/part-of: ai-code-battle
|
||||
release: prometheus
|
||||
spec:
|
||||
groups:
|
||||
- name: acb.match.alerts
|
||||
rules:
|
||||
# Fires when fewer than 10 matches per hour complete, sustained for an hour.
# The rate is scaled to matches/hour so that $value is readable in the
# description; as a per-second figure it always printed as "0.00".
- alert: ACBMatchThroughputLow
  expr: rate(acb_match_throughput_total[1h]) * 3600 < 10
  for: 1h
  labels:
    severity: warning
    component: matchmaker
  annotations:
    summary: "Match throughput below target"
    description: "Match completion rate is {{ $value | printf \"%.2f\" }} per hour (<10/hour target) for over 1 hour."
|
||||
|
||||
- alert: ACBQueueDepthHigh
|
||||
expr: acb_job_queue_depth > 50
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
component: matchmaker
|
||||
annotations:
|
||||
summary: "Job queue depth is high"
|
||||
description: "Valkey job queue has {{ $value }} pending jobs for over 30 minutes. Workers may be overloaded or stuck."
|
||||
|
||||
# Fires when more than half of all health-checked bots are failing.
# acb_bots_active counts bots that passed the last check and acb_bots_failing
# counts those that did not, so the share of failing bots is
# failing / (active + failing). With no bots at all the ratio is NaN and the
# alert stays silent.
- alert: ACBBotHealthFailing
  expr: acb_bots_failing / (acb_bots_active + acb_bots_failing) > 0.5
  for: 15m
  labels:
    severity: critical
    component: matchmaker
  annotations:
    summary: "More than 50% of bots are failing health checks"
    description: "{{ $value | humanizePercentage }} of health-checked bots are failing (>50% threshold). Check bot deployments."
|
||||
|
||||
- alert: ACBStaleJobsHigh
|
||||
expr: acb_job_stale_count > 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
component: matchmaker
|
||||
annotations:
|
||||
summary: "High stale job count"
|
||||
description: "{{ $value }} stale jobs found in the last reaper cycle. Workers may be crashing or unable to complete matches."
|
||||
|
||||
- alert: ACBR2UsageHigh
|
||||
expr: acb_r2_bytes_used > 8 * 1024 * 1024 * 1024
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
component: index-builder
|
||||
annotations:
|
||||
summary: "R2 warm cache approaching free tier cap"
|
||||
description: "R2 warm cache is using {{ $value | printf \"%.1f\" }} bytes (>8 GB). Free tier cap is 10 GB. Pruning may not be keeping up."
|
||||
|
||||
- alert: ACBIndexBuildSlow
|
||||
expr: histogram_quantile(0.95, rate(acb_index_build_duration_seconds_bucket[30m])) > 300
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
component: index-builder
|
||||
annotations:
|
||||
summary: "Index build cycle taking too long"
|
||||
description: "P95 index build duration is {{ $value | printf \"%.0f\" }}s. Should be under 300s."
|
||||
|
||||
- alert: ACBReplayUploadSlow
|
||||
expr: histogram_quantile(0.95, rate(acb_replay_upload_latency_seconds_bucket[15m])) > 30
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
component: worker
|
||||
annotations:
|
||||
summary: "B2 replay uploads are slow"
|
||||
description: "P95 replay upload latency is {{ $value | printf \"%.1f\" }}s. Check B2 connectivity."
|
||||
|
||||
- alert: ACBEvolverStalled
|
||||
expr: rate(acb_evolver_generations_total[30m]) == 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
component: evolver
|
||||
annotations:
|
||||
summary: "Evolver has not completed any generations"
|
||||
description: "No evolution generations completed in the last 30 minutes. Check evolver logs for errors."
|
||||
|
||||
- alert: ACBWorkerMatchErrorsHigh
|
||||
expr: rate(acb_worker_match_errors_total[15m]) > rate(acb_worker_matches_total[15m]) * 0.1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
component: worker
|
||||
annotations:
|
||||
summary: "Worker match error rate is high"
|
||||
description: "More than 10% of match attempts are failing. Check engine logs and bot endpoints."
|
||||
|
|
@ -72,6 +72,43 @@ var (
|
|||
Name: "acb_http_requests_total",
|
||||
Help: "Total number of HTTP requests served.",
|
||||
}, []string{"method", "path", "status"})
|
||||
|
||||
// BotsActive tracks the number of currently active bots (matchmaker health checker).
|
||||
BotsActive = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||
Name: "acb_bots_active",
|
||||
Help: "Number of bots currently in active status.",
|
||||
})
|
||||
|
||||
// BotsFailing tracks the number of bots failing health checks.
|
||||
BotsFailing = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||
Name: "acb_bots_failing",
|
||||
Help: "Number of bots currently failing health checks.",
|
||||
})
|
||||
|
||||
// WorkerMatchesTotal counts matches executed by the worker.
|
||||
WorkerMatchesTotal = prometheus.NewCounter(prometheus.CounterOpts{
|
||||
Name: "acb_worker_matches_total",
|
||||
Help: "Total matches executed by this worker.",
|
||||
})
|
||||
|
||||
// WorkerMatchErrorsTotal counts match execution errors.
|
||||
WorkerMatchErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{
|
||||
Name: "acb_worker_match_errors_total",
|
||||
Help: "Total match execution errors.",
|
||||
})
|
||||
|
||||
// WorkerJobsClaimedTotal counts jobs claimed by the worker.
|
||||
WorkerJobsClaimedTotal = prometheus.NewCounter(prometheus.CounterOpts{
|
||||
Name: "acb_worker_jobs_claimed_total",
|
||||
Help: "Total jobs claimed by this worker.",
|
||||
})
|
||||
|
||||
// WorkerMatchDuration tracks match execution time.
|
||||
WorkerMatchDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
|
||||
Name: "acb_worker_match_duration_seconds",
|
||||
Help: "Match execution duration in seconds.",
|
||||
Buckets: []float64{1, 5, 10, 30, 60, 120, 300, 600},
|
||||
})
|
||||
)
|
||||
|
||||
func init() {
|
||||
|
|
@ -85,6 +122,12 @@ func init() {
|
|||
EvolverGenerations,
|
||||
IndexBuildDuration,
|
||||
HTTPRequestsTotal,
|
||||
BotsActive,
|
||||
BotsFailing,
|
||||
WorkerMatchesTotal,
|
||||
WorkerMatchErrorsTotal,
|
||||
WorkerJobsClaimedTotal,
|
||||
WorkerMatchDuration,
|
||||
)
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -49,6 +49,12 @@ func TestMetricsEndpoint(t *testing.T) {
|
|||
"acb_evolver_generations_total",
|
||||
"acb_index_build_duration_seconds",
|
||||
"acb_http_requests_total",
|
||||
"acb_bots_active",
|
||||
"acb_bots_failing",
|
||||
"acb_worker_matches_total",
|
||||
"acb_worker_match_errors_total",
|
||||
"acb_worker_jobs_claimed_total",
|
||||
"acb_worker_match_duration_seconds",
|
||||
}
|
||||
for _, name := range expectedMetrics {
|
||||
if !strings.Contains(body, name) {
|
||||
|
|
@ -102,3 +108,47 @@ func TestHistogramObserved(t *testing.T) {
|
|||
t.Error("index build duration histogram not found")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBotHealthGauges(t *testing.T) {
|
||||
BotsActive.Set(12)
|
||||
BotsFailing.Set(3)
|
||||
|
||||
h := Handler()
|
||||
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
|
||||
w := httptest.NewRecorder()
|
||||
h.ServeHTTP(w, req)
|
||||
|
||||
body := w.Body.String()
|
||||
if !strings.Contains(body, "acb_bots_active 12") {
|
||||
t.Error("bots_active gauge not found with expected value")
|
||||
}
|
||||
if !strings.Contains(body, "acb_bots_failing 3") {
|
||||
t.Error("bots_failing gauge not found with expected value")
|
||||
}
|
||||
}
|
||||
|
||||
func TestWorkerMetrics(t *testing.T) {
|
||||
WorkerMatchesTotal.Inc()
|
||||
WorkerMatchErrorsTotal.Inc()
|
||||
WorkerJobsClaimedTotal.Inc()
|
||||
WorkerMatchDuration.Observe(45.0)
|
||||
|
||||
h := Handler()
|
||||
req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
|
||||
w := httptest.NewRecorder()
|
||||
h.ServeHTTP(w, req)
|
||||
|
||||
body := w.Body.String()
|
||||
if !strings.Contains(body, "acb_worker_matches_total ") {
|
||||
t.Error("worker matches total counter not found")
|
||||
}
|
||||
if !strings.Contains(body, "acb_worker_match_errors_total ") {
|
||||
t.Error("worker match errors counter not found")
|
||||
}
|
||||
if !strings.Contains(body, "acb_worker_jobs_claimed_total ") {
|
||||
t.Error("worker jobs claimed counter not found")
|
||||
}
|
||||
if !strings.Contains(body, "acb_worker_match_duration_seconds_bucket") {
|
||||
t.Error("worker match duration histogram not found")
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue