diff --git a/.env.example b/.env.example index b03c513..5a4ad91 100644 --- a/.env.example +++ b/.env.example @@ -34,6 +34,27 @@ BOT_SECRET_GUARDIAN=dev-secret-guardian BOT_SECRET_SWARM=dev-secret-swarm BOT_SECRET_HUNTER=dev-secret-hunter +# =========================================== +# Go API Server Configuration +# =========================================== +# PostgreSQL connection URL +ACB_DATABASE_URL=postgres://localhost:5432/acb?sslmode=disable + +# Valkey (Redis-compatible) address +ACB_VALKEY_ADDR=localhost:6379 + +# AES-256-GCM key for encrypting bot shared secrets at rest (32 bytes, hex-encoded) +ACB_ENCRYPTION_KEY= + +# =========================================== +# Alerting Webhooks (optional) +# =========================================== +# Discord webhook URL for operational alerts (bot health, stale jobs, errors) +ACB_DISCORD_WEBHOOK= + +# Slack incoming webhook URL for operational alerts +ACB_SLACK_WEBHOOK= + # =========================================== # Worker Configuration # =========================================== diff --git a/PROGRESS.md b/PROGRESS.md index a3fb22d..3aa3768 100644 --- a/PROGRESS.md +++ b/PROGRESS.md @@ -7,6 +7,22 @@ **Last Updated: 2026-03-26** ### Recent Changes (2026-03-26) +- Added Discord/Slack alerting webhooks to Go API server (`cmd/acb-api/alerts.go`): + - `Alerter` module sends notifications to Discord and/or Slack incoming webhook URLs + - Discord embeds with color-coded severity (blue=info, yellow=warning, red=error) + timestamps + - Slack attachments with color-coded severity + footer + - Rate limiting with per-key dedup cooldown (5 min default) to prevent alert storms + - Garbage collection of expired dedup entries + - Helper methods: `BotMarkedInactive`, `BotRecovered`, `StaleJobsReaped`, `MatchError` + - Integrated into health checker ticker (alerts on bot inactive/recovered transitions) + - Integrated into stale job reaper ticker (alerts when stale jobs re-enqueued) + - Config via `ACB_DISCORD_WEBHOOK` and `ACB_SLACK_WEBHOOK` env vars + - 15 unit tests: enabled detection, Discord/Slack payload format, color codes, rate limiting, + cooldown expiry, no-dedup bypass, webhook errors, both-webhook dispatch, helper methods, GC + - Updated `.env.example` with Go API and alerting webhook configuration + - All tests pass (45 API tests total, 15 new + 30 existing) + +### Previous Changes (2026-03-26) - Added Traefik IngressRoute, cert-manager Certificate, and CI/CD pipeline manifests (`deploy/k8s/`): - `ingress/acb-api-ingressroute.yaml` — Traefik IngressRoute for `api.aicodebattle.com` with CORS middleware (allow origins for aicodebattle.com), security headers, rate limiting (100 req/min burst 200) @@ -175,6 +191,13 @@ - `service-account.yaml` - CI ServiceAccount + RBAC (pods, workflows access) - DAG builds all 10 images in parallel: acb-api, acb-worker, acb-indexer, 6 strategy bots, plus site build - [x] Registry credentials SealedSecret template (`deploy/k8s/sealed-secrets/registry-credentials.yaml`) +- [x] Discord/Slack alerting webhooks (`cmd/acb-api/alerts.go`) + - Alerter module with Discord embeds and Slack attachments + - Color-coded severity levels (info/warning/error) + - Per-key rate limiting with configurable cooldown + - Integrated into health checker and stale job reaper tickers + - Helper methods for common alert events + - 15 unit tests covering all functionality ### Remaining Phase 6 Work (requires Cloudflare account access) diff --git a/cmd/acb-api/alerts.go b/cmd/acb-api/alerts.go new file mode 100644 index 0000000..e0a7d2f --- /dev/null +++ b/cmd/acb-api/alerts.go @@ -0,0 +1,223 @@ +package main + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "log" + "net/http" + "strings" + "sync" + "time" +) + +// AlertLevel indicates severity for color-coding in webhook messages. +type AlertLevel int + +const ( + AlertInfo AlertLevel = iota // blue / informational + AlertWarning // yellow / warning + AlertError // red / error +) + +// Alerter sends notifications to configured Discord and/or Slack webhooks. +type Alerter struct { + discordURL string + slackURL string + client *http.Client + + // Rate limiting: max 1 alert per key per cooldown period. + mu sync.Mutex + cooldown time.Duration + sent map[string]time.Time +} + +// NewAlerter creates an Alerter. If both URLs are empty, Send is a no-op. +func NewAlerter(discordURL, slackURL string) *Alerter { + return &Alerter{ + discordURL: discordURL, + slackURL: slackURL, + client: &http.Client{Timeout: 10 * time.Second}, + cooldown: 5 * time.Minute, + sent: make(map[string]time.Time), + } +} + +// Enabled returns true if at least one webhook URL is configured. +func (a *Alerter) Enabled() bool { + return a.discordURL != "" || a.slackURL != "" +} + +// Send dispatches an alert to all configured webhooks. The dedupKey is used +// for rate limiting — identical keys within the cooldown window are suppressed. +func (a *Alerter) Send(ctx context.Context, level AlertLevel, title, message, dedupKey string) { + if !a.Enabled() { + return + } + + if dedupKey != "" && !a.shouldSend(dedupKey) { + return + } + + if a.discordURL != "" { + if err := a.sendDiscord(ctx, level, title, message); err != nil { + log.Printf("alert: discord send error: %v", err) + } + } + + if a.slackURL != "" { + if err := a.sendSlack(ctx, level, title, message); err != nil { + log.Printf("alert: slack send error: %v", err) + } + } +} + +// shouldSend checks rate limiting. Returns true if the alert should be sent. +func (a *Alerter) shouldSend(key string) bool { + a.mu.Lock() + defer a.mu.Unlock() + + now := time.Now() + + // Garbage collect expired entries periodically + if len(a.sent) > 100 { + for k, t := range a.sent { + if now.Sub(t) > a.cooldown { + delete(a.sent, k) + } + } + } + + if last, ok := a.sent[key]; ok && now.Sub(last) < a.cooldown { + return false + } + a.sent[key] = now + return true +} + +// discordPayload is the Discord webhook message format. +type discordPayload struct { + Embeds []discordEmbed `json:"embeds"` +} + +type discordEmbed struct { + Title string `json:"title"` + Description string `json:"description"` + Color int `json:"color"` + Timestamp string `json:"timestamp"` +} + +func (a *Alerter) sendDiscord(ctx context.Context, level AlertLevel, title, message string) error { + color := 0x3498db // blue + switch level { + case AlertWarning: + color = 0xf39c12 // yellow/orange + case AlertError: + color = 0xe74c3c // red + } + + payload := discordPayload{ + Embeds: []discordEmbed{{ + Title: fmt.Sprintf("[ACB] %s", title), + Description: message, + Color: color, + Timestamp: time.Now().UTC().Format(time.RFC3339), + }}, + } + + return a.postJSON(ctx, a.discordURL, payload) +} + +// slackPayload is the Slack incoming webhook format. +type slackPayload struct { + Attachments []slackAttachment `json:"attachments"` +} + +type slackAttachment struct { + Color string `json:"color"` + Title string `json:"title"` + Text string `json:"text"` + Footer string `json:"footer"` + Ts int64 `json:"ts"` +} + +func (a *Alerter) sendSlack(ctx context.Context, level AlertLevel, title, message string) error { + color := "#3498db" + switch level { + case AlertWarning: + color = "#f39c12" + case AlertError: + color = "#e74c3c" + } + + payload := slackPayload{ + Attachments: []slackAttachment{{ + Color: color, + Title: fmt.Sprintf("[ACB] %s", title), + Text: message, + Footer: "AI Code Battle", + Ts: time.Now().Unix(), + }}, + } + + return a.postJSON(ctx, a.slackURL, payload) +} + +func (a *Alerter) postJSON(ctx context.Context, url string, payload any) error { + body, err := json.Marshal(payload) + if err != nil { + return fmt.Errorf("marshal payload: %w", err) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body)) + if err != nil { + return fmt.Errorf("create request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + + resp, err := a.client.Do(req) + if err != nil { + return fmt.Errorf("send webhook: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode >= 300 { + return fmt.Errorf("webhook returned status %d", resp.StatusCode) + } + return nil +} + +// Alert helper methods for common events. + +func (a *Alerter) BotMarkedInactive(ctx context.Context, botID string, failCount int) { + a.Send(ctx, AlertWarning, + "Bot Marked Inactive", + fmt.Sprintf("Bot `%s` marked inactive after %d consecutive health check failures.", botID, failCount), + "bot-inactive:"+botID, + ) +} + +func (a *Alerter) BotRecovered(ctx context.Context, botID string) { + a.Send(ctx, AlertInfo, + "Bot Recovered", + fmt.Sprintf("Bot `%s` is back online and marked active.", botID), + "bot-recovered:"+botID, + ) +} + +func (a *Alerter) StaleJobsReaped(ctx context.Context, jobIDs []string) { + a.Send(ctx, AlertWarning, + "Stale Jobs Re-enqueued", + fmt.Sprintf("%d stale job(s) re-enqueued: %s", len(jobIDs), strings.Join(jobIDs, ", ")), + "stale-jobs", + ) +} + +func (a *Alerter) MatchError(ctx context.Context, matchID, reason string) { + a.Send(ctx, AlertError, + "Match Error", + fmt.Sprintf("Match `%s` failed: %s", matchID, reason), + "match-error:"+matchID, + ) +} diff --git a/cmd/acb-api/alerts_test.go b/cmd/acb-api/alerts_test.go new file mode 100644 index 0000000..6f7522d --- /dev/null +++ b/cmd/acb-api/alerts_test.go @@ -0,0 +1,393 @@ +package main + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "sync" + "testing" + "time" +) + +func TestAlerterEnabled(t *testing.T) { + tests := []struct { + name string + discord string + slack string + want bool + }{ + {"both empty", "", "", false}, + {"discord only", "http://discord.example.com", "", true}, + {"slack only", "", "http://slack.example.com", true}, + {"both set", "http://discord.example.com", "http://slack.example.com", true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + a := NewAlerter(tt.discord, tt.slack) + if got := a.Enabled(); got != tt.want { + t.Errorf("Enabled() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestAlerterSendNoOp(t *testing.T) { + // With no webhook URLs, Send should be a no-op (no panic, no error). + a := NewAlerter("", "") + a.Send(context.Background(), AlertError, "Test", "message", "key") +} + +func TestAlerterSendDiscord(t *testing.T) { + var received discordPayload + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + t.Errorf("method = %s, want POST", r.Method) + } + if ct := r.Header.Get("Content-Type"); ct != "application/json" { + t.Errorf("content-type = %s, want application/json", ct) + } + json.NewDecoder(r.Body).Decode(&received) + w.WriteHeader(http.StatusNoContent) + })) + defer ts.Close() + + a := NewAlerter(ts.URL, "") + a.Send(context.Background(), AlertError, "Test Alert", "Something broke", "") + + if len(received.Embeds) != 1 { + t.Fatalf("embeds count = %d, want 1", len(received.Embeds)) + } + embed := received.Embeds[0] + if embed.Title != "[ACB] Test Alert" { + t.Errorf("title = %q, want %q", embed.Title, "[ACB] Test Alert") + } + if embed.Description != "Something broke" { + t.Errorf("description = %q, want %q", embed.Description, "Something broke") + } + if embed.Color != 0xe74c3c { + t.Errorf("color = %#x, want %#x (red/error)", embed.Color, 0xe74c3c) + } + if embed.Timestamp == "" { + t.Error("timestamp should not be empty") + } +} + +func TestAlerterSendSlack(t *testing.T) { + var received slackPayload + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + json.NewDecoder(r.Body).Decode(&received) + w.WriteHeader(http.StatusOK) + })) + defer ts.Close() + + a := NewAlerter("", ts.URL) + a.Send(context.Background(), AlertWarning, "Warning", "Watch out", "") + + if len(received.Attachments) != 1 { + t.Fatalf("attachments count = %d, want 1", len(received.Attachments)) + } + att := received.Attachments[0] + if att.Title != "[ACB] Warning" { + t.Errorf("title = %q, want %q", att.Title, "[ACB] Warning") + } + if att.Text != "Watch out" { + t.Errorf("text = %q, want %q", att.Text, "Watch out") + } + if att.Color != "#f39c12" { + t.Errorf("color = %q, want %q (warning)", att.Color, "#f39c12") + } + if att.Footer != "AI Code Battle" { + t.Errorf("footer = %q, want %q", att.Footer, "AI Code Battle") + } +} + +func TestAlerterColorCodes(t *testing.T) { + tests := []struct { + level AlertLevel + wantDiscord int + wantSlack string + }{ + {AlertInfo, 0x3498db, "#3498db"}, + {AlertWarning, 0xf39c12, "#f39c12"}, + {AlertError, 0xe74c3c, "#e74c3c"}, + } + + for _, tt := range tests { + var discordReceived discordPayload + var slackReceived slackPayload + + discordSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + json.NewDecoder(r.Body).Decode(&discordReceived) + w.WriteHeader(http.StatusNoContent) + })) + slackSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + json.NewDecoder(r.Body).Decode(&slackReceived) + w.WriteHeader(http.StatusOK) + })) + + a := NewAlerter(discordSrv.URL, slackSrv.URL) + a.Send(context.Background(), tt.level, "Test", "msg", "") + + if len(discordReceived.Embeds) > 0 && discordReceived.Embeds[0].Color != tt.wantDiscord { + t.Errorf("level %d: discord color = %#x, want %#x", tt.level, discordReceived.Embeds[0].Color, tt.wantDiscord) + } + if len(slackReceived.Attachments) > 0 && slackReceived.Attachments[0].Color != tt.wantSlack { + t.Errorf("level %d: slack color = %q, want %q", tt.level, slackReceived.Attachments[0].Color, tt.wantSlack) + } + + discordSrv.Close() + slackSrv.Close() + } +} + +func TestAlerterRateLimiting(t *testing.T) { + var callCount int + var mu sync.Mutex + + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + mu.Lock() + callCount++ + mu.Unlock() + w.WriteHeader(http.StatusNoContent) + })) + defer ts.Close() + + a := NewAlerter(ts.URL, "") + a.cooldown = 1 * time.Hour // long cooldown for test + + ctx := context.Background() + + // First send should go through + a.Send(ctx, AlertInfo, "Test", "msg1", "same-key") + mu.Lock() + if callCount != 1 { + t.Errorf("after first send: count = %d, want 1", callCount) + } + mu.Unlock() + + // Second send with same key should be suppressed + a.Send(ctx, AlertInfo, "Test", "msg2", "same-key") + mu.Lock() + if callCount != 1 { + t.Errorf("after duplicate send: count = %d, want 1 (suppressed)", callCount) + } + mu.Unlock() + + // Different key should go through + a.Send(ctx, AlertInfo, "Test", "msg3", "other-key") + mu.Lock() + if callCount != 2 { + t.Errorf("after different key: count = %d, want 2", callCount) + } + mu.Unlock() +} + +func TestAlerterNoDedupKeyAlwaysSends(t *testing.T) { + var callCount int + var mu sync.Mutex + + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + mu.Lock() + callCount++ + mu.Unlock() + w.WriteHeader(http.StatusNoContent) + })) + defer ts.Close() + + a := NewAlerter(ts.URL, "") + ctx := context.Background() + + // Empty dedup key should always send + a.Send(ctx, AlertInfo, "Test", "msg1", "") + a.Send(ctx, AlertInfo, "Test", "msg2", "") + a.Send(ctx, AlertInfo, "Test", "msg3", "") + + mu.Lock() + if callCount != 3 { + t.Errorf("without dedup key: count = %d, want 3", callCount) + } + mu.Unlock() +} + +func TestAlerterRateLimitExpiry(t *testing.T) { + var callCount int + var mu sync.Mutex + + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + mu.Lock() + callCount++ + mu.Unlock() + w.WriteHeader(http.StatusNoContent) + })) + defer ts.Close() + + a := NewAlerter(ts.URL, "") + a.cooldown = 10 * time.Millisecond // very short for test + + ctx := context.Background() + + a.Send(ctx, AlertInfo, "Test", "msg1", "expire-key") + mu.Lock() + c1 := callCount + mu.Unlock() + + time.Sleep(20 * time.Millisecond) + + // After cooldown, same key should send again + a.Send(ctx, AlertInfo, "Test", "msg2", "expire-key") + mu.Lock() + c2 := callCount + mu.Unlock() + + if c1 != 1 || c2 != 2 { + t.Errorf("rate limit expiry: counts = (%d, %d), want (1, 2)", c1, c2) + } +} + +func TestAlerterWebhookError(t *testing.T) { + // Server that returns 500 — should not panic + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer ts.Close() + + a := NewAlerter(ts.URL, ts.URL) + // Should log errors but not panic + a.Send(context.Background(), AlertError, "Test", "msg", "") +} + +func TestAlerterBothWebhooks(t *testing.T) { + var discordCalls, slackCalls int + var mu sync.Mutex + + discordSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + mu.Lock() + discordCalls++ + mu.Unlock() + w.WriteHeader(http.StatusNoContent) + })) + defer discordSrv.Close() + + slackSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + mu.Lock() + slackCalls++ + mu.Unlock() + w.WriteHeader(http.StatusOK) + })) + defer slackSrv.Close() + + a := NewAlerter(discordSrv.URL, slackSrv.URL) + a.Send(context.Background(), AlertInfo, "Test", "msg", "") + + mu.Lock() + defer mu.Unlock() + if discordCalls != 1 { + t.Errorf("discord calls = %d, want 1", discordCalls) + } + if slackCalls != 1 { + t.Errorf("slack calls = %d, want 1", slackCalls) + } +} + +func TestHelperBotMarkedInactive(t *testing.T) { + var received discordPayload + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + json.NewDecoder(r.Body).Decode(&received) + w.WriteHeader(http.StatusNoContent) + })) + defer ts.Close() + + a := NewAlerter(ts.URL, "") + a.BotMarkedInactive(context.Background(), "bot_abc123", 3) + + if len(received.Embeds) != 1 { + t.Fatal("expected 1 embed") + } + if received.Embeds[0].Color != 0xf39c12 { + t.Errorf("bot inactive should be warning (yellow), got %#x", received.Embeds[0].Color) + } +} + +func TestHelperBotRecovered(t *testing.T) { + var received discordPayload + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + json.NewDecoder(r.Body).Decode(&received) + w.WriteHeader(http.StatusNoContent) + })) + defer ts.Close() + + a := NewAlerter(ts.URL, "") + a.BotRecovered(context.Background(), "bot_abc123") + + if len(received.Embeds) != 1 { + t.Fatal("expected 1 embed") + } + if received.Embeds[0].Color != 0x3498db { + t.Errorf("bot recovered should be info (blue), got %#x", received.Embeds[0].Color) + } +} + +func TestHelperStaleJobsReaped(t *testing.T) { + var received discordPayload + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + json.NewDecoder(r.Body).Decode(&received) + w.WriteHeader(http.StatusNoContent) + })) + defer ts.Close() + + a := NewAlerter(ts.URL, "") + a.StaleJobsReaped(context.Background(), []string{"j_001", "j_002"}) + + if len(received.Embeds) != 1 { + t.Fatal("expected 1 embed") + } + if received.Embeds[0].Color != 0xf39c12 { + t.Errorf("stale jobs should be warning (yellow), got %#x", received.Embeds[0].Color) + } +} + +func TestHelperMatchError(t *testing.T) { + var received discordPayload + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + json.NewDecoder(r.Body).Decode(&received) + w.WriteHeader(http.StatusNoContent) + })) + defer ts.Close() + + a := NewAlerter(ts.URL, "") + a.MatchError(context.Background(), "m_12345678", "bot timeout") + + if len(received.Embeds) != 1 { + t.Fatal("expected 1 embed") + } + if received.Embeds[0].Color != 0xe74c3c { + t.Errorf("match error should be error (red), got %#x", received.Embeds[0].Color) + } +} + +func TestShouldSendGarbageCollects(t *testing.T) { + a := NewAlerter("http://unused", "") + a.cooldown = 1 * time.Millisecond + + // Fill beyond 100 entries to trigger GC + for i := 0; i < 110; i++ { + a.sent[string(rune('a'+i%26))+string(rune('0'+i/26))] = time.Now().Add(-time.Hour) + } + + // This should trigger cleanup and succeed + got := a.shouldSend("new-key") + if !got { + t.Error("shouldSend returned false for new key after GC should have cleaned expired entries") + } + + // Old expired entries should have been cleaned + a.mu.Lock() + count := len(a.sent) + a.mu.Unlock() + // Should only have "new-key" left (all others expired) + if count > 10 { + t.Errorf("after GC: %d entries remain, expected most to be cleaned", count) + } +} diff --git a/cmd/acb-api/main.go b/cmd/acb-api/main.go index 0709fbe..4ea558c 100644 --- a/cmd/acb-api/main.go +++ b/cmd/acb-api/main.go @@ -26,6 +26,8 @@ type Config struct { ValkeyPassword string WorkerAPIKey string // API key workers use to submit results EncryptionKey string // AES-256-GCM key for shared secret encryption + DiscordWebhook string // Discord webhook URL for alerts + SlackWebhook string // Slack webhook URL for alerts MatchmakerSecs int HealthCheckSecs int ReaperSecs int @@ -42,6 +44,8 @@ func loadConfig() Config { ValkeyPassword: os.Getenv("ACB_VALKEY_PASSWORD"), WorkerAPIKey: os.Getenv("ACB_WORKER_API_KEY"), EncryptionKey: os.Getenv("ACB_ENCRYPTION_KEY"), + DiscordWebhook: os.Getenv("ACB_DISCORD_WEBHOOK"), + SlackWebhook: os.Getenv("ACB_SLACK_WEBHOOK"), MatchmakerSecs: envInt("ACB_MATCHMAKER_INTERVAL", 60), HealthCheckSecs: envInt("ACB_HEALTHCHECK_INTERVAL", 900), ReaperSecs: envInt("ACB_REAPER_INTERVAL", 300), @@ -71,9 +75,10 @@ func main() { defer rdb.Close() srv := &Server{ - cfg: cfg, - db: db, - rdb: rdb, + cfg: cfg, + db: db, + rdb: rdb, + alerter: NewAlerter(cfg.DiscordWebhook, cfg.SlackWebhook), } mux := http.NewServeMux() diff --git a/cmd/acb-api/server.go b/cmd/acb-api/server.go index cdad9a2..e31c2da 100644 --- a/cmd/acb-api/server.go +++ b/cmd/acb-api/server.go @@ -9,9 +9,10 @@ import ( ) type Server struct { - cfg Config - db *sql.DB - rdb *redis.Client + cfg Config + db *sql.DB + rdb *redis.Client + alerter *Alerter } func (s *Server) RegisterRoutes(mux *http.ServeMux) { diff --git a/cmd/acb-api/server_test.go b/cmd/acb-api/server_test.go index c266961..8c60812 100644 --- a/cmd/acb-api/server_test.go +++ b/cmd/acb-api/server_test.go @@ -18,6 +18,7 @@ func newTestServer() *Server { BotTimeoutSecs: 5, MaxConsecFails: 3, }, + alerter: NewAlerter("", ""), } } @@ -71,7 +72,7 @@ func TestAuthenticateWorker(t *testing.T) { } func TestAuthenticateWorker_NoKeyConfigured(t *testing.T) { - srv := &Server{cfg: Config{WorkerAPIKey: ""}} + srv := &Server{cfg: Config{WorkerAPIKey: ""}, alerter: NewAlerter("", "")} req := httptest.NewRequest("GET", "/", nil) if !srv.authenticateWorker(req) { t.Error("with no key configured, all requests should be authenticated") diff --git a/cmd/acb-api/tickers.go b/cmd/acb-api/tickers.go index 90bfa25..ab7d361 100644 --- a/cmd/acb-api/tickers.go +++ b/cmd/acb-api/tickers.go @@ -220,6 +220,9 @@ func (s *Server) tickHealthChecker(ctx context.Context) { `UPDATE bots SET status = 'active', consec_fails = 0, last_active = NOW() WHERE bot_id = $1`, bot.ID) log.Printf("health-checker: %s recovered → active", bot.ID) + if bot.Status == "inactive" { + s.alerter.BotRecovered(ctx, bot.ID) + } } } else { newFails := bot.ConsecFails + 1 @@ -232,6 +235,7 @@ func (s *Server) tickHealthChecker(ctx context.Context) { newStatus, newFails, bot.ID) if newStatus != bot.Status { log.Printf("health-checker: %s marked inactive after %d failures", bot.ID, newFails) + s.alerter.BotMarkedInactive(ctx, bot.ID, newFails) } } } @@ -285,6 +289,7 @@ func (s *Server) tickStaleReaper(ctx context.Context) { if len(staleJobs) > 0 { log.Printf("stale-reaper: processed %d stale jobs", len(staleJobs)) + s.alerter.StaleJobsReaped(ctx, staleJobs) } }