ai-code-battle/metrics/metrics.go
jedarden e88c108010 feat(acb-enrichment): implement AI replay enrichment service
Implements the acb-enrichment service (plan §13.3) that generates AI
commentary for featured matches.

Key features:
- LLM client (OpenAI/Anthropic API compatible)
- Replay fetch from B2/R2 storage
- Structured commentary output (key_moments array with turn,
  description, significance, tags)
- Rate limiting to control LLM costs
- Match selection based on:
  - Minimum turn count
  - Win probability crossings
  - Upset threshold
  - Close finishes

Components:
- cmd/acb-enrichment/main.go - service entry point
- cmd/acb-enrichment/config.go - configuration from env vars
- cmd/acb-enrichment/service.go - orchestration logic
- internal/db/store.go - database access for match selection
- internal/llm/client.go - OpenAI-compatible LLM client
- internal/selector/selector.go - match selection with priority
- internal/generator/generator.go - commentary generation
- internal/storage/client.go - S3-compatible storage client
- Dockerfile - container image
- manifests/acb-enrichment-deployment.yml - K8s deployment
- metrics/metrics.go - Prometheus metrics for enrichment

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-04 02:22:28 -04:00

221 lines
7.4 KiB
Go

// Package metrics defines Prometheus metrics for AI Code Battle services per plan §9.9.
//
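// All services import this package to expose a /metrics endpoint on an
// internal port (default :9090). The metrics cover the monitoring signals
// listed in the plan, plus additional per-service collectors (bot health,
// worker, rate limiting, enrichment) declared alongside them below.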
package metrics
import (
"net/http"
"os"
"strconv"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
// §9.9 metric definitions. Every collector declared here is handed to
// prometheus.MustRegister by init(), so it is registered exactly once per
// process. All metric names share the "acb_" prefix.
var (
// MatchThroughput counts completed matches (worker increments per result).
// Unlabelled counter.
MatchThroughput = prometheus.NewCounter(prometheus.CounterOpts{
Name: "acb_match_throughput_total",
Help: "Total number of matches completed.",
})
// JobQueueDepth tracks the Valkey job queue length (matchmaker updates each tick).
// Unlabelled gauge.
JobQueueDepth = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "acb_job_queue_depth",
Help: "Current number of pending jobs in the Valkey queue.",
})
// BotCrashed counts bots marked as crashed by the health checker.
// Unlabelled counter.
BotCrashed = prometheus.NewCounter(prometheus.CounterOpts{
Name: "acb_bot_crashed_total",
Help: "Total number of bot crash events detected by the health checker.",
})
// StaleJobCount is the number of stale jobs found in the last reaper cycle.
// It is a gauge, so each cycle is expected to overwrite the previous value.
StaleJobCount = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "acb_job_stale_count",
Help: "Number of stale jobs found in the most recent reaper cycle.",
})
// R2BytesUsed tracks the R2 warm cache size in bytes (index-builder updates).
// Unlabelled gauge.
R2BytesUsed = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "acb_r2_bytes_used",
Help: "Total bytes used in the R2 warm cache.",
})
// ReplayUploadLatency tracks B2 replay upload duration in seconds.
// Uses the client library's default histogram buckets (prometheus.DefBuckets).
ReplayUploadLatency = prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "acb_replay_upload_latency_seconds",
Help: "Latency of replay uploads to B2 in seconds.",
Buckets: prometheus.DefBuckets,
})
// EvolverGenerations counts evolution cycles completed.
// Unlabelled counter.
EvolverGenerations = prometheus.NewCounter(prometheus.CounterOpts{
Name: "acb_evolver_generations_total",
Help: "Total number of evolution generations completed.",
})
// IndexBuildDuration tracks how long each index build cycle takes, in
// seconds. Explicit buckets span 1s to 600s (10 minutes).
IndexBuildDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "acb_index_build_duration_seconds",
Help: "Duration of index build cycles in seconds.",
Buckets: []float64{1, 5, 10, 30, 60, 120, 300, 600},
})
// HTTPRequestsTotal counts HTTP requests served by the API.
// Labels, in order: "method", "path", "status". HTTPMiddleware in this
// package is the visible writer of this metric.
HTTPRequestsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "acb_http_requests_total",
Help: "Total number of HTTP requests served.",
}, []string{"method", "path", "status"})
// BotsActive tracks the number of currently active bots (matchmaker health checker).
// Unlabelled gauge.
BotsActive = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "acb_bots_active",
Help: "Number of bots currently in active status.",
})
// BotsFailing tracks the number of bots failing health checks.
// Unlabelled gauge.
BotsFailing = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "acb_bots_failing",
Help: "Number of bots currently failing health checks.",
})
// WorkerMatchesTotal counts matches executed by the worker.
// Unlabelled counter; the help text scopes it to "this worker", i.e. the
// exporting process.
WorkerMatchesTotal = prometheus.NewCounter(prometheus.CounterOpts{
Name: "acb_worker_matches_total",
Help: "Total matches executed by this worker.",
})
// WorkerMatchErrorsTotal counts match execution errors.
// Unlabelled counter.
WorkerMatchErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{
Name: "acb_worker_match_errors_total",
Help: "Total match execution errors.",
})
// WorkerJobsClaimedTotal counts jobs claimed by the worker.
// Unlabelled counter.
WorkerJobsClaimedTotal = prometheus.NewCounter(prometheus.CounterOpts{
Name: "acb_worker_jobs_claimed_total",
Help: "Total jobs claimed by this worker.",
})
// WorkerMatchDuration tracks match execution time in seconds.
// Shares the 1s..600s bucket layout used by IndexBuildDuration.
WorkerMatchDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "acb_worker_match_duration_seconds",
Help: "Match execution duration in seconds.",
Buckets: []float64{1, 5, 10, 30, 60, 120, 300, 600},
})
// RateLimitHits counts requests rejected by rate limiting.
// Single label: "endpoint".
RateLimitHits = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "acb_rate_limit_hits_total",
Help: "Total number of requests rejected by rate limiting.",
}, []string{"endpoint"})
// EnrichmentCycles counts enrichment cycles completed.
// Single label: "status". NOTE(review): the set of status values is chosen
// by the caller and is not visible in this file.
EnrichmentCycles = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "acb_enrichment_cycles_total",
Help: "Total number of enrichment cycles completed.",
}, []string{"status"})
// EnrichmentProcessed counts matches processed for enrichment.
// Unlabelled counter.
EnrichmentProcessed = prometheus.NewCounter(prometheus.CounterOpts{
Name: "acb_enrichment_processed_total",
Help: "Total number of matches processed for enrichment.",
})
// EnrichmentGenerated counts successful commentaries generated.
// Unlabelled counter.
EnrichmentGenerated = prometheus.NewCounter(prometheus.CounterOpts{
Name: "acb_enrichment_generated_total",
Help: "Total number of commentaries successfully generated.",
})
// EnrichmentCycleDuration tracks enrichment cycle duration in seconds.
// Explicit buckets span 30s to 1800s (30 minutes).
EnrichmentCycleDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "acb_enrichment_cycle_duration_seconds",
Help: "Duration of enrichment cycles in seconds.",
Buckets: []float64{30, 60, 120, 300, 600, 900, 1800},
})
// EnrichmentLLMRequests counts LLM API requests for enrichment.
// Single label: "status". NOTE(review): status values are caller-defined
// and not visible in this file.
EnrichmentLLMRequests = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "acb_enrichment_llm_requests_total",
Help: "Total number of LLM requests for enrichment.",
}, []string{"status"})
)
// init registers every collector declared in this package in one
// prometheus.MustRegister call. MustRegister panics if any registration
// fails, so a bad metric definition surfaces at process start rather than at
// scrape time.
func init() {
	// Order mirrors the declaration order of the package-level variables.
	collectors := []prometheus.Collector{
		// Plan §9.9 signals.
		MatchThroughput,
		JobQueueDepth,
		BotCrashed,
		StaleJobCount,
		R2BytesUsed,
		ReplayUploadLatency,
		EvolverGenerations,
		IndexBuildDuration,
		HTTPRequestsTotal,
		// Bot health.
		BotsActive,
		BotsFailing,
		// Worker.
		WorkerMatchesTotal,
		WorkerMatchErrorsTotal,
		WorkerJobsClaimedTotal,
		WorkerMatchDuration,
		// Rate limiting.
		RateLimitHits,
		// Enrichment.
		EnrichmentCycles,
		EnrichmentProcessed,
		EnrichmentGenerated,
		EnrichmentCycleDuration,
		EnrichmentLLMRequests,
	}
	prometheus.MustRegister(collectors...)
}
// Handler returns an http.Handler that serves two routes:
//
//   - /metrics: the Prometheus exposition endpoint (promhttp.Handler).
//   - /health:  a fixed 200 response with the JSON body {"status":"ok"}.
//
// Each call builds a fresh ServeMux, so callers may mount the result on any
// server they like.
func Handler() http.Handler {
	mux := http.NewServeMux()
	mux.Handle("/metrics", promhttp.Handler())
	mux.HandleFunc("/health", func(w http.ResponseWriter, _ *http.Request) {
		// Declare the body as JSON explicitly; without this net/http sniffs
		// the payload and labels it text/plain.
		w.Header().Set("Content-Type", "application/json")
		w.WriteHeader(http.StatusOK)
		// A failed write means the client has gone away; there is nothing
		// useful to do about it for a health probe.
		_, _ = w.Write([]byte(`{"status":"ok"}`))
	})
	return mux
}
// StartServer starts a Prometheus metrics HTTP server in a background
// goroutine and returns the server so the caller can shut it down gracefully
// (Shutdown or Close ends the goroutine).
//
// The listen address is read from the ACB_METRICS_ADDR environment variable,
// falling back to ":9090" when it is unset or empty.
//
// If the listener fails for any reason other than a deliberate shutdown
// (for example the port is already in use), the error is written to standard
// error instead of being dropped, so a missing /metrics endpoint is
// diagnosable.
func StartServer() *http.Server {
	addr := os.Getenv("ACB_METRICS_ADDR")
	if addr == "" {
		addr = ":9090"
	}
	srv := &http.Server{
		Addr:    addr,
		Handler: Handler(),
		// Bound the time a client may take to send request headers so idle
		// or slow connections cannot pin server goroutines indefinitely.
		ReadHeaderTimeout: 10 * time.Second,
	}
	go func() {
		// ListenAndServe always returns a non-nil error; ErrServerClosed is
		// the expected result of Shutdown/Close and is not worth reporting.
		if err := srv.ListenAndServe(); err != http.ErrServerClosed {
			_, _ = os.Stderr.WriteString("metrics: server on " + addr + " stopped: " + err.Error() + "\n")
		}
	}()
	return srv
}
// HTTPMiddleware decorates next so that every request it serves increments
// HTTPRequestsTotal once, labelled with the request method, the raw URL path
// and the response status code. Handlers that never call WriteHeader are
// counted as 200.
//
// NOTE(review): the "path" label is r.URL.Path verbatim, so each distinct
// path creates a separate time series.
func HTTPMiddleware(next http.Handler) http.Handler {
	counted := func(w http.ResponseWriter, r *http.Request) {
		// The start time is captured but not consumed yet; it is kept so a
		// latency metric can be added without touching the call order.
		began := time.Now()
		_ = began

		recorder := &statusWriter{ResponseWriter: w, status: http.StatusOK}
		next.ServeHTTP(recorder, r)

		code := strconv.Itoa(recorder.status)
		HTTPRequestsTotal.WithLabelValues(r.Method, r.URL.Path, code).Inc()
	}
	return http.HandlerFunc(counted)
}
// statusWriter wraps http.ResponseWriter to capture the status code that is
// actually sent to the client.
//
// net/http honours only the first final WriteHeader call (and treats a Write
// without a prior WriteHeader as an implicit 200); later WriteHeader calls
// are ignored. statusWriter mirrors that rule so the recorded status matches
// the response on the wire rather than the last value a handler happened to
// pass.
//
// Creators should initialise status to http.StatusOK so that handlers which
// write nothing at all are still reported as 200.
type statusWriter struct {
	http.ResponseWriter
	status      int
	wroteHeader bool // true once a final status has been committed
}

// WriteHeader records code if it is the first final status, then forwards
// the call to the wrapped writer unchanged.
func (w *statusWriter) WriteHeader(code int) {
	// 1xx responses other than 101 are informational: they may be sent any
	// number of times before the final status and must not be recorded.
	informational := code >= 100 && code <= 199 && code != http.StatusSwitchingProtocols
	if !informational && !w.wroteHeader {
		w.status = code
		w.wroteHeader = true
	}
	w.ResponseWriter.WriteHeader(code)
}

// Write forwards b to the wrapped writer. The first Write commits the
// response status, so any later WriteHeader call is not recorded.
func (w *statusWriter) Write(b []byte) (int, error) {
	if !w.wroteHeader {
		w.wroteHeader = true
		if w.status == 0 {
			// No status was preset or written: net/http sends an implicit 200.
			w.status = http.StatusOK
		}
	}
	return w.ResponseWriter.Write(b)
}

// Unwrap returns the wrapped writer so http.ResponseController can reach
// optional capabilities (flush, hijack, deadlines) that the wrapper hides.
func (w *statusWriter) Unwrap() http.ResponseWriter {
	return w.ResponseWriter
}