Add Discord/Slack alerting webhooks to Go API server
Implements the monitoring alerting deliverable from Phase 6. The Alerter module sends color-coded notifications to Discord and/or Slack webhooks for operational events: bot health transitions, stale job re-enqueues, and match errors. Includes per-key rate limiting to prevent alert storms. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
e88749b134
commit
4aca8add20
8 changed files with 679 additions and 7 deletions
21
.env.example
21
.env.example
|
|
@ -34,6 +34,27 @@ BOT_SECRET_GUARDIAN=dev-secret-guardian
|
|||
BOT_SECRET_SWARM=dev-secret-swarm
|
||||
BOT_SECRET_HUNTER=dev-secret-hunter
|
||||
|
||||
# ===========================================
|
||||
# Go API Server Configuration
|
||||
# ===========================================
|
||||
# PostgreSQL connection URL
|
||||
ACB_DATABASE_URL=postgres://localhost:5432/acb?sslmode=disable
|
||||
|
||||
# Valkey (Redis-compatible) address
|
||||
ACB_VALKEY_ADDR=localhost:6379
|
||||
|
||||
# AES-256-GCM key for encrypting bot shared secrets at rest (32 bytes, hex-encoded)
|
||||
ACB_ENCRYPTION_KEY=
|
||||
|
||||
# ===========================================
|
||||
# Alerting Webhooks (optional)
|
||||
# ===========================================
|
||||
# Discord webhook URL for operational alerts (bot health, stale jobs, errors)
|
||||
ACB_DISCORD_WEBHOOK=
|
||||
|
||||
# Slack incoming webhook URL for operational alerts
|
||||
ACB_SLACK_WEBHOOK=
|
||||
|
||||
# ===========================================
|
||||
# Worker Configuration
|
||||
# ===========================================
|
||||
|
|
|
|||
23
PROGRESS.md
23
PROGRESS.md
|
|
@ -7,6 +7,22 @@
|
|||
**Last Updated: 2026-03-26**
|
||||
|
||||
### Recent Changes (2026-03-26)
|
||||
- Added Discord/Slack alerting webhooks to Go API server (`cmd/acb-api/alerts.go`):
|
||||
- `Alerter` module sends notifications to Discord and/or Slack incoming webhook URLs
|
||||
- Discord embeds with color-coded severity (blue=info, yellow=warning, red=error) + timestamps
|
||||
- Slack attachments with color-coded severity + footer
|
||||
- Rate limiting with per-key dedup cooldown (5 min default) to prevent alert storms
|
||||
- Garbage collection of expired dedup entries
|
||||
- Helper methods: `BotMarkedInactive`, `BotRecovered`, `StaleJobsReaped`, `MatchError`
|
||||
- Integrated into health checker ticker (alerts on bot inactive/recovered transitions)
|
||||
- Integrated into stale job reaper ticker (alerts when stale jobs re-enqueued)
|
||||
- Config via `ACB_DISCORD_WEBHOOK` and `ACB_SLACK_WEBHOOK` env vars
|
||||
- 15 unit tests: enabled detection, Discord/Slack payload format, color codes, rate limiting,
|
||||
cooldown expiry, no-dedup bypass, webhook errors, both-webhook dispatch, helper methods, GC
|
||||
- Updated `.env.example` with Go API and alerting webhook configuration
|
||||
- All tests pass (45 API tests total, 15 new + 30 existing)
|
||||
|
||||
### Previous Changes (2026-03-26)
|
||||
- Added Traefik IngressRoute, cert-manager Certificate, and CI/CD pipeline manifests (`deploy/k8s/`):
|
||||
- `ingress/acb-api-ingressroute.yaml` — Traefik IngressRoute for `api.aicodebattle.com`
|
||||
with CORS middleware (allow origins for aicodebattle.com), security headers, rate limiting (100 req/min burst 200)
|
||||
|
|
@ -175,6 +191,13 @@
|
|||
- `service-account.yaml` - CI ServiceAccount + RBAC (pods, workflows access)
|
||||
- DAG builds all 10 images in parallel: acb-api, acb-worker, acb-indexer, 6 strategy bots, plus site build
|
||||
- [x] Registry credentials SealedSecret template (`deploy/k8s/sealed-secrets/registry-credentials.yaml`)
|
||||
- [x] Discord/Slack alerting webhooks (`cmd/acb-api/alerts.go`)
|
||||
- Alerter module with Discord embeds and Slack attachments
|
||||
- Color-coded severity levels (info/warning/error)
|
||||
- Per-key rate limiting with configurable cooldown
|
||||
- Integrated into health checker and stale job reaper tickers
|
||||
- Helper methods for common alert events
|
||||
- 15 unit tests covering all functionality
|
||||
|
||||
### Remaining Phase 6 Work (requires Cloudflare account access)
|
||||
|
||||
|
|
|
|||
223
cmd/acb-api/alerts.go
Normal file
223
cmd/acb-api/alerts.go
Normal file
|
|
@ -0,0 +1,223 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// AlertLevel indicates severity for color-coding in webhook messages.
|
||||
type AlertLevel int
|
||||
|
||||
const (
|
||||
AlertInfo AlertLevel = iota // blue / informational
|
||||
AlertWarning // yellow / warning
|
||||
AlertError // red / error
|
||||
)
|
||||
|
||||
// Alerter sends notifications to configured Discord and/or Slack webhooks.
|
||||
type Alerter struct {
|
||||
discordURL string
|
||||
slackURL string
|
||||
client *http.Client
|
||||
|
||||
// Rate limiting: max 1 alert per key per cooldown period.
|
||||
mu sync.Mutex
|
||||
cooldown time.Duration
|
||||
sent map[string]time.Time
|
||||
}
|
||||
|
||||
// NewAlerter creates an Alerter. If both URLs are empty, Send is a no-op.
|
||||
func NewAlerter(discordURL, slackURL string) *Alerter {
|
||||
return &Alerter{
|
||||
discordURL: discordURL,
|
||||
slackURL: slackURL,
|
||||
client: &http.Client{Timeout: 10 * time.Second},
|
||||
cooldown: 5 * time.Minute,
|
||||
sent: make(map[string]time.Time),
|
||||
}
|
||||
}
|
||||
|
||||
// Enabled returns true if at least one webhook URL is configured.
|
||||
func (a *Alerter) Enabled() bool {
|
||||
return a.discordURL != "" || a.slackURL != ""
|
||||
}
|
||||
|
||||
// Send dispatches an alert to all configured webhooks. The dedupKey is used
|
||||
// for rate limiting — identical keys within the cooldown window are suppressed.
|
||||
func (a *Alerter) Send(ctx context.Context, level AlertLevel, title, message, dedupKey string) {
|
||||
if !a.Enabled() {
|
||||
return
|
||||
}
|
||||
|
||||
if dedupKey != "" && !a.shouldSend(dedupKey) {
|
||||
return
|
||||
}
|
||||
|
||||
if a.discordURL != "" {
|
||||
if err := a.sendDiscord(ctx, level, title, message); err != nil {
|
||||
log.Printf("alert: discord send error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
if a.slackURL != "" {
|
||||
if err := a.sendSlack(ctx, level, title, message); err != nil {
|
||||
log.Printf("alert: slack send error: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// shouldSend checks rate limiting. Returns true if the alert should be sent.
|
||||
func (a *Alerter) shouldSend(key string) bool {
|
||||
a.mu.Lock()
|
||||
defer a.mu.Unlock()
|
||||
|
||||
now := time.Now()
|
||||
|
||||
// Garbage collect expired entries periodically
|
||||
if len(a.sent) > 100 {
|
||||
for k, t := range a.sent {
|
||||
if now.Sub(t) > a.cooldown {
|
||||
delete(a.sent, k)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if last, ok := a.sent[key]; ok && now.Sub(last) < a.cooldown {
|
||||
return false
|
||||
}
|
||||
a.sent[key] = now
|
||||
return true
|
||||
}
|
||||
|
||||
// discordPayload is the Discord webhook message format.
|
||||
type discordPayload struct {
|
||||
Embeds []discordEmbed `json:"embeds"`
|
||||
}
|
||||
|
||||
type discordEmbed struct {
|
||||
Title string `json:"title"`
|
||||
Description string `json:"description"`
|
||||
Color int `json:"color"`
|
||||
Timestamp string `json:"timestamp"`
|
||||
}
|
||||
|
||||
func (a *Alerter) sendDiscord(ctx context.Context, level AlertLevel, title, message string) error {
|
||||
color := 0x3498db // blue
|
||||
switch level {
|
||||
case AlertWarning:
|
||||
color = 0xf39c12 // yellow/orange
|
||||
case AlertError:
|
||||
color = 0xe74c3c // red
|
||||
}
|
||||
|
||||
payload := discordPayload{
|
||||
Embeds: []discordEmbed{{
|
||||
Title: fmt.Sprintf("[ACB] %s", title),
|
||||
Description: message,
|
||||
Color: color,
|
||||
Timestamp: time.Now().UTC().Format(time.RFC3339),
|
||||
}},
|
||||
}
|
||||
|
||||
return a.postJSON(ctx, a.discordURL, payload)
|
||||
}
|
||||
|
||||
// slackPayload is the Slack incoming webhook format.
|
||||
type slackPayload struct {
|
||||
Attachments []slackAttachment `json:"attachments"`
|
||||
}
|
||||
|
||||
type slackAttachment struct {
|
||||
Color string `json:"color"`
|
||||
Title string `json:"title"`
|
||||
Text string `json:"text"`
|
||||
Footer string `json:"footer"`
|
||||
Ts int64 `json:"ts"`
|
||||
}
|
||||
|
||||
func (a *Alerter) sendSlack(ctx context.Context, level AlertLevel, title, message string) error {
|
||||
color := "#3498db"
|
||||
switch level {
|
||||
case AlertWarning:
|
||||
color = "#f39c12"
|
||||
case AlertError:
|
||||
color = "#e74c3c"
|
||||
}
|
||||
|
||||
payload := slackPayload{
|
||||
Attachments: []slackAttachment{{
|
||||
Color: color,
|
||||
Title: fmt.Sprintf("[ACB] %s", title),
|
||||
Text: message,
|
||||
Footer: "AI Code Battle",
|
||||
Ts: time.Now().Unix(),
|
||||
}},
|
||||
}
|
||||
|
||||
return a.postJSON(ctx, a.slackURL, payload)
|
||||
}
|
||||
|
||||
func (a *Alerter) postJSON(ctx context.Context, url string, payload any) error {
|
||||
body, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal payload: %w", err)
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return fmt.Errorf("create request: %w", err)
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
resp, err := a.client.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("send webhook: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode >= 300 {
|
||||
return fmt.Errorf("webhook returned status %d", resp.StatusCode)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Alert helper methods for common events.
|
||||
|
||||
func (a *Alerter) BotMarkedInactive(ctx context.Context, botID string, failCount int) {
|
||||
a.Send(ctx, AlertWarning,
|
||||
"Bot Marked Inactive",
|
||||
fmt.Sprintf("Bot `%s` marked inactive after %d consecutive health check failures.", botID, failCount),
|
||||
"bot-inactive:"+botID,
|
||||
)
|
||||
}
|
||||
|
||||
func (a *Alerter) BotRecovered(ctx context.Context, botID string) {
|
||||
a.Send(ctx, AlertInfo,
|
||||
"Bot Recovered",
|
||||
fmt.Sprintf("Bot `%s` is back online and marked active.", botID),
|
||||
"bot-recovered:"+botID,
|
||||
)
|
||||
}
|
||||
|
||||
func (a *Alerter) StaleJobsReaped(ctx context.Context, jobIDs []string) {
|
||||
a.Send(ctx, AlertWarning,
|
||||
"Stale Jobs Re-enqueued",
|
||||
fmt.Sprintf("%d stale job(s) re-enqueued: %s", len(jobIDs), strings.Join(jobIDs, ", ")),
|
||||
"stale-jobs",
|
||||
)
|
||||
}
|
||||
|
||||
func (a *Alerter) MatchError(ctx context.Context, matchID, reason string) {
|
||||
a.Send(ctx, AlertError,
|
||||
"Match Error",
|
||||
fmt.Sprintf("Match `%s` failed: %s", matchID, reason),
|
||||
"match-error:"+matchID,
|
||||
)
|
||||
}
|
||||
393
cmd/acb-api/alerts_test.go
Normal file
393
cmd/acb-api/alerts_test.go
Normal file
|
|
@ -0,0 +1,393 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestAlerterEnabled(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
discord string
|
||||
slack string
|
||||
want bool
|
||||
}{
|
||||
{"both empty", "", "", false},
|
||||
{"discord only", "http://discord.example.com", "", true},
|
||||
{"slack only", "", "http://slack.example.com", true},
|
||||
{"both set", "http://discord.example.com", "http://slack.example.com", true},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
a := NewAlerter(tt.discord, tt.slack)
|
||||
if got := a.Enabled(); got != tt.want {
|
||||
t.Errorf("Enabled() = %v, want %v", got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestAlerterSendNoOp(t *testing.T) {
|
||||
// With no webhook URLs, Send should be a no-op (no panic, no error).
|
||||
a := NewAlerter("", "")
|
||||
a.Send(context.Background(), AlertError, "Test", "message", "key")
|
||||
}
|
||||
|
||||
func TestAlerterSendDiscord(t *testing.T) {
|
||||
var received discordPayload
|
||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPost {
|
||||
t.Errorf("method = %s, want POST", r.Method)
|
||||
}
|
||||
if ct := r.Header.Get("Content-Type"); ct != "application/json" {
|
||||
t.Errorf("content-type = %s, want application/json", ct)
|
||||
}
|
||||
json.NewDecoder(r.Body).Decode(&received)
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}))
|
||||
defer ts.Close()
|
||||
|
||||
a := NewAlerter(ts.URL, "")
|
||||
a.Send(context.Background(), AlertError, "Test Alert", "Something broke", "")
|
||||
|
||||
if len(received.Embeds) != 1 {
|
||||
t.Fatalf("embeds count = %d, want 1", len(received.Embeds))
|
||||
}
|
||||
embed := received.Embeds[0]
|
||||
if embed.Title != "[ACB] Test Alert" {
|
||||
t.Errorf("title = %q, want %q", embed.Title, "[ACB] Test Alert")
|
||||
}
|
||||
if embed.Description != "Something broke" {
|
||||
t.Errorf("description = %q, want %q", embed.Description, "Something broke")
|
||||
}
|
||||
if embed.Color != 0xe74c3c {
|
||||
t.Errorf("color = %#x, want %#x (red/error)", embed.Color, 0xe74c3c)
|
||||
}
|
||||
if embed.Timestamp == "" {
|
||||
t.Error("timestamp should not be empty")
|
||||
}
|
||||
}
|
||||
|
||||
func TestAlerterSendSlack(t *testing.T) {
|
||||
var received slackPayload
|
||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
json.NewDecoder(r.Body).Decode(&received)
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer ts.Close()
|
||||
|
||||
a := NewAlerter("", ts.URL)
|
||||
a.Send(context.Background(), AlertWarning, "Warning", "Watch out", "")
|
||||
|
||||
if len(received.Attachments) != 1 {
|
||||
t.Fatalf("attachments count = %d, want 1", len(received.Attachments))
|
||||
}
|
||||
att := received.Attachments[0]
|
||||
if att.Title != "[ACB] Warning" {
|
||||
t.Errorf("title = %q, want %q", att.Title, "[ACB] Warning")
|
||||
}
|
||||
if att.Text != "Watch out" {
|
||||
t.Errorf("text = %q, want %q", att.Text, "Watch out")
|
||||
}
|
||||
if att.Color != "#f39c12" {
|
||||
t.Errorf("color = %q, want %q (warning)", att.Color, "#f39c12")
|
||||
}
|
||||
if att.Footer != "AI Code Battle" {
|
||||
t.Errorf("footer = %q, want %q", att.Footer, "AI Code Battle")
|
||||
}
|
||||
}
|
||||
|
||||
func TestAlerterColorCodes(t *testing.T) {
|
||||
tests := []struct {
|
||||
level AlertLevel
|
||||
wantDiscord int
|
||||
wantSlack string
|
||||
}{
|
||||
{AlertInfo, 0x3498db, "#3498db"},
|
||||
{AlertWarning, 0xf39c12, "#f39c12"},
|
||||
{AlertError, 0xe74c3c, "#e74c3c"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
var discordReceived discordPayload
|
||||
var slackReceived slackPayload
|
||||
|
||||
discordSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
json.NewDecoder(r.Body).Decode(&discordReceived)
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}))
|
||||
slackSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
json.NewDecoder(r.Body).Decode(&slackReceived)
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
|
||||
a := NewAlerter(discordSrv.URL, slackSrv.URL)
|
||||
a.Send(context.Background(), tt.level, "Test", "msg", "")
|
||||
|
||||
if len(discordReceived.Embeds) > 0 && discordReceived.Embeds[0].Color != tt.wantDiscord {
|
||||
t.Errorf("level %d: discord color = %#x, want %#x", tt.level, discordReceived.Embeds[0].Color, tt.wantDiscord)
|
||||
}
|
||||
if len(slackReceived.Attachments) > 0 && slackReceived.Attachments[0].Color != tt.wantSlack {
|
||||
t.Errorf("level %d: slack color = %q, want %q", tt.level, slackReceived.Attachments[0].Color, tt.wantSlack)
|
||||
}
|
||||
|
||||
discordSrv.Close()
|
||||
slackSrv.Close()
|
||||
}
|
||||
}
|
||||
|
||||
func TestAlerterRateLimiting(t *testing.T) {
|
||||
var callCount int
|
||||
var mu sync.Mutex
|
||||
|
||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
mu.Lock()
|
||||
callCount++
|
||||
mu.Unlock()
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}))
|
||||
defer ts.Close()
|
||||
|
||||
a := NewAlerter(ts.URL, "")
|
||||
a.cooldown = 1 * time.Hour // long cooldown for test
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
// First send should go through
|
||||
a.Send(ctx, AlertInfo, "Test", "msg1", "same-key")
|
||||
mu.Lock()
|
||||
if callCount != 1 {
|
||||
t.Errorf("after first send: count = %d, want 1", callCount)
|
||||
}
|
||||
mu.Unlock()
|
||||
|
||||
// Second send with same key should be suppressed
|
||||
a.Send(ctx, AlertInfo, "Test", "msg2", "same-key")
|
||||
mu.Lock()
|
||||
if callCount != 1 {
|
||||
t.Errorf("after duplicate send: count = %d, want 1 (suppressed)", callCount)
|
||||
}
|
||||
mu.Unlock()
|
||||
|
||||
// Different key should go through
|
||||
a.Send(ctx, AlertInfo, "Test", "msg3", "other-key")
|
||||
mu.Lock()
|
||||
if callCount != 2 {
|
||||
t.Errorf("after different key: count = %d, want 2", callCount)
|
||||
}
|
||||
mu.Unlock()
|
||||
}
|
||||
|
||||
func TestAlerterNoDedupKeyAlwaysSends(t *testing.T) {
|
||||
var callCount int
|
||||
var mu sync.Mutex
|
||||
|
||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
mu.Lock()
|
||||
callCount++
|
||||
mu.Unlock()
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}))
|
||||
defer ts.Close()
|
||||
|
||||
a := NewAlerter(ts.URL, "")
|
||||
ctx := context.Background()
|
||||
|
||||
// Empty dedup key should always send
|
||||
a.Send(ctx, AlertInfo, "Test", "msg1", "")
|
||||
a.Send(ctx, AlertInfo, "Test", "msg2", "")
|
||||
a.Send(ctx, AlertInfo, "Test", "msg3", "")
|
||||
|
||||
mu.Lock()
|
||||
if callCount != 3 {
|
||||
t.Errorf("without dedup key: count = %d, want 3", callCount)
|
||||
}
|
||||
mu.Unlock()
|
||||
}
|
||||
|
||||
func TestAlerterRateLimitExpiry(t *testing.T) {
|
||||
var callCount int
|
||||
var mu sync.Mutex
|
||||
|
||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
mu.Lock()
|
||||
callCount++
|
||||
mu.Unlock()
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}))
|
||||
defer ts.Close()
|
||||
|
||||
a := NewAlerter(ts.URL, "")
|
||||
a.cooldown = 10 * time.Millisecond // very short for test
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
a.Send(ctx, AlertInfo, "Test", "msg1", "expire-key")
|
||||
mu.Lock()
|
||||
c1 := callCount
|
||||
mu.Unlock()
|
||||
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
|
||||
// After cooldown, same key should send again
|
||||
a.Send(ctx, AlertInfo, "Test", "msg2", "expire-key")
|
||||
mu.Lock()
|
||||
c2 := callCount
|
||||
mu.Unlock()
|
||||
|
||||
if c1 != 1 || c2 != 2 {
|
||||
t.Errorf("rate limit expiry: counts = (%d, %d), want (1, 2)", c1, c2)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAlerterWebhookError(t *testing.T) {
|
||||
// Server that returns 500 — should not panic
|
||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
}))
|
||||
defer ts.Close()
|
||||
|
||||
a := NewAlerter(ts.URL, ts.URL)
|
||||
// Should log errors but not panic
|
||||
a.Send(context.Background(), AlertError, "Test", "msg", "")
|
||||
}
|
||||
|
||||
func TestAlerterBothWebhooks(t *testing.T) {
|
||||
var discordCalls, slackCalls int
|
||||
var mu sync.Mutex
|
||||
|
||||
discordSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
mu.Lock()
|
||||
discordCalls++
|
||||
mu.Unlock()
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}))
|
||||
defer discordSrv.Close()
|
||||
|
||||
slackSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
mu.Lock()
|
||||
slackCalls++
|
||||
mu.Unlock()
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer slackSrv.Close()
|
||||
|
||||
a := NewAlerter(discordSrv.URL, slackSrv.URL)
|
||||
a.Send(context.Background(), AlertInfo, "Test", "msg", "")
|
||||
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
if discordCalls != 1 {
|
||||
t.Errorf("discord calls = %d, want 1", discordCalls)
|
||||
}
|
||||
if slackCalls != 1 {
|
||||
t.Errorf("slack calls = %d, want 1", slackCalls)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHelperBotMarkedInactive(t *testing.T) {
|
||||
var received discordPayload
|
||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
json.NewDecoder(r.Body).Decode(&received)
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}))
|
||||
defer ts.Close()
|
||||
|
||||
a := NewAlerter(ts.URL, "")
|
||||
a.BotMarkedInactive(context.Background(), "bot_abc123", 3)
|
||||
|
||||
if len(received.Embeds) != 1 {
|
||||
t.Fatal("expected 1 embed")
|
||||
}
|
||||
if received.Embeds[0].Color != 0xf39c12 {
|
||||
t.Errorf("bot inactive should be warning (yellow), got %#x", received.Embeds[0].Color)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHelperBotRecovered(t *testing.T) {
|
||||
var received discordPayload
|
||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
json.NewDecoder(r.Body).Decode(&received)
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}))
|
||||
defer ts.Close()
|
||||
|
||||
a := NewAlerter(ts.URL, "")
|
||||
a.BotRecovered(context.Background(), "bot_abc123")
|
||||
|
||||
if len(received.Embeds) != 1 {
|
||||
t.Fatal("expected 1 embed")
|
||||
}
|
||||
if received.Embeds[0].Color != 0x3498db {
|
||||
t.Errorf("bot recovered should be info (blue), got %#x", received.Embeds[0].Color)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHelperStaleJobsReaped(t *testing.T) {
|
||||
var received discordPayload
|
||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
json.NewDecoder(r.Body).Decode(&received)
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}))
|
||||
defer ts.Close()
|
||||
|
||||
a := NewAlerter(ts.URL, "")
|
||||
a.StaleJobsReaped(context.Background(), []string{"j_001", "j_002"})
|
||||
|
||||
if len(received.Embeds) != 1 {
|
||||
t.Fatal("expected 1 embed")
|
||||
}
|
||||
if received.Embeds[0].Color != 0xf39c12 {
|
||||
t.Errorf("stale jobs should be warning (yellow), got %#x", received.Embeds[0].Color)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHelperMatchError(t *testing.T) {
|
||||
var received discordPayload
|
||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
json.NewDecoder(r.Body).Decode(&received)
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}))
|
||||
defer ts.Close()
|
||||
|
||||
a := NewAlerter(ts.URL, "")
|
||||
a.MatchError(context.Background(), "m_12345678", "bot timeout")
|
||||
|
||||
if len(received.Embeds) != 1 {
|
||||
t.Fatal("expected 1 embed")
|
||||
}
|
||||
if received.Embeds[0].Color != 0xe74c3c {
|
||||
t.Errorf("match error should be error (red), got %#x", received.Embeds[0].Color)
|
||||
}
|
||||
}
|
||||
|
||||
func TestShouldSendGarbageCollects(t *testing.T) {
|
||||
a := NewAlerter("http://unused", "")
|
||||
a.cooldown = 1 * time.Millisecond
|
||||
|
||||
// Fill beyond 100 entries to trigger GC
|
||||
for i := 0; i < 110; i++ {
|
||||
a.sent[string(rune('a'+i%26))+string(rune('0'+i/26))] = time.Now().Add(-time.Hour)
|
||||
}
|
||||
|
||||
// This should trigger cleanup and succeed
|
||||
got := a.shouldSend("new-key")
|
||||
if !got {
|
||||
t.Error("shouldSend returned false for new key after GC should have cleaned expired entries")
|
||||
}
|
||||
|
||||
// Old expired entries should have been cleaned
|
||||
a.mu.Lock()
|
||||
count := len(a.sent)
|
||||
a.mu.Unlock()
|
||||
// Should only have "new-key" left (all others expired)
|
||||
if count > 10 {
|
||||
t.Errorf("after GC: %d entries remain, expected most to be cleaned", count)
|
||||
}
|
||||
}
|
||||
|
|
@ -26,6 +26,8 @@ type Config struct {
|
|||
ValkeyPassword string
|
||||
WorkerAPIKey string // API key workers use to submit results
|
||||
EncryptionKey string // AES-256-GCM key for shared secret encryption
|
||||
DiscordWebhook string // Discord webhook URL for alerts
|
||||
SlackWebhook string // Slack webhook URL for alerts
|
||||
MatchmakerSecs int
|
||||
HealthCheckSecs int
|
||||
ReaperSecs int
|
||||
|
|
@ -42,6 +44,8 @@ func loadConfig() Config {
|
|||
ValkeyPassword: os.Getenv("ACB_VALKEY_PASSWORD"),
|
||||
WorkerAPIKey: os.Getenv("ACB_WORKER_API_KEY"),
|
||||
EncryptionKey: os.Getenv("ACB_ENCRYPTION_KEY"),
|
||||
DiscordWebhook: os.Getenv("ACB_DISCORD_WEBHOOK"),
|
||||
SlackWebhook: os.Getenv("ACB_SLACK_WEBHOOK"),
|
||||
MatchmakerSecs: envInt("ACB_MATCHMAKER_INTERVAL", 60),
|
||||
HealthCheckSecs: envInt("ACB_HEALTHCHECK_INTERVAL", 900),
|
||||
ReaperSecs: envInt("ACB_REAPER_INTERVAL", 300),
|
||||
|
|
@ -71,9 +75,10 @@ func main() {
|
|||
defer rdb.Close()
|
||||
|
||||
srv := &Server{
|
||||
cfg: cfg,
|
||||
db: db,
|
||||
rdb: rdb,
|
||||
cfg: cfg,
|
||||
db: db,
|
||||
rdb: rdb,
|
||||
alerter: NewAlerter(cfg.DiscordWebhook, cfg.SlackWebhook),
|
||||
}
|
||||
|
||||
mux := http.NewServeMux()
|
||||
|
|
|
|||
|
|
@ -9,9 +9,10 @@ import (
|
|||
)
|
||||
|
||||
type Server struct {
|
||||
cfg Config
|
||||
db *sql.DB
|
||||
rdb *redis.Client
|
||||
cfg Config
|
||||
db *sql.DB
|
||||
rdb *redis.Client
|
||||
alerter *Alerter
|
||||
}
|
||||
|
||||
func (s *Server) RegisterRoutes(mux *http.ServeMux) {
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@ func newTestServer() *Server {
|
|||
BotTimeoutSecs: 5,
|
||||
MaxConsecFails: 3,
|
||||
},
|
||||
alerter: NewAlerter("", ""),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -71,7 +72,7 @@ func TestAuthenticateWorker(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestAuthenticateWorker_NoKeyConfigured(t *testing.T) {
|
||||
srv := &Server{cfg: Config{WorkerAPIKey: ""}}
|
||||
srv := &Server{cfg: Config{WorkerAPIKey: ""}, alerter: NewAlerter("", "")}
|
||||
req := httptest.NewRequest("GET", "/", nil)
|
||||
if !srv.authenticateWorker(req) {
|
||||
t.Error("with no key configured, all requests should be authenticated")
|
||||
|
|
|
|||
|
|
@ -220,6 +220,9 @@ func (s *Server) tickHealthChecker(ctx context.Context) {
|
|||
`UPDATE bots SET status = 'active', consec_fails = 0, last_active = NOW()
|
||||
WHERE bot_id = $1`, bot.ID)
|
||||
log.Printf("health-checker: %s recovered → active", bot.ID)
|
||||
if bot.Status == "inactive" {
|
||||
s.alerter.BotRecovered(ctx, bot.ID)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
newFails := bot.ConsecFails + 1
|
||||
|
|
@ -232,6 +235,7 @@ func (s *Server) tickHealthChecker(ctx context.Context) {
|
|||
newStatus, newFails, bot.ID)
|
||||
if newStatus != bot.Status {
|
||||
log.Printf("health-checker: %s marked inactive after %d failures", bot.ID, newFails)
|
||||
s.alerter.BotMarkedInactive(ctx, bot.ID, newFails)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -285,6 +289,7 @@ func (s *Server) tickStaleReaper(ctx context.Context) {
|
|||
|
||||
if len(staleJobs) > 0 {
|
||||
log.Printf("stale-reaper: processed %d stale jobs", len(staleJobs))
|
||||
s.alerter.StaleJobsReaped(ctx, staleJobs)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue