Implements the acb-enrichment service (plan §13.3), which generates AI commentary for featured matches.

Key features:
- LLM client (OpenAI/Anthropic API compatible)
- Replay fetch from B2/R2 storage
- Structured commentary output (key_moments array with turn, description, significance, tags)
- Rate limiting to control LLM costs
- Match selection based on:
  - Minimum turn count
  - Win probability crossings
  - Upset threshold
  - Close finishes

Components:
- cmd/acb-enrichment/main.go - service entry point
- cmd/acb-enrichment/config.go - configuration from env vars
- cmd/acb-enrichment/service.go - orchestration logic
- internal/db/store.go - database access for match selection
- internal/llm/client.go - OpenAI-compatible LLM client
- internal/selector/selector.go - match selection with priority
- internal/generator/generator.go - commentary generation
- internal/storage/client.go - S3-compatible storage client
- Dockerfile - container image
- manifests/acb-enrichment-deployment.yml - K8s deployment
- metrics/metrics.go - Prometheus metrics for enrichment

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
221 lines
7.4 KiB
Go
221 lines
7.4 KiB
Go
// Package metrics defines Prometheus metrics for AI Code Battle services per plan §9.9.
|
|
//
|
|
// All services import this package to expose a /metrics endpoint on an
|
|
// internal port (default :9090). The metrics match the 9 monitoring signals
|
|
// listed in the plan.
|
|
package metrics
|
|
|
|
import (
|
|
"net/http"
|
|
"os"
|
|
"strconv"
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/promhttp"
|
|
)
|
|
|
|
// §9.9 metric definitions — registered once at init time.
|
|
var (
|
|
// MatchThroughput counts completed matches (worker increments per result).
|
|
MatchThroughput = prometheus.NewCounter(prometheus.CounterOpts{
|
|
Name: "acb_match_throughput_total",
|
|
Help: "Total number of matches completed.",
|
|
})
|
|
|
|
// JobQueueDepth tracks the Valkey job queue length (matchmaker updates each tick).
|
|
JobQueueDepth = prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Name: "acb_job_queue_depth",
|
|
Help: "Current number of pending jobs in the Valkey queue.",
|
|
})
|
|
|
|
// BotCrashed counts bots marked as crashed by the health checker.
|
|
BotCrashed = prometheus.NewCounter(prometheus.CounterOpts{
|
|
Name: "acb_bot_crashed_total",
|
|
Help: "Total number of bot crash events detected by the health checker.",
|
|
})
|
|
|
|
// StaleJobCount is the number of stale jobs found in the last reaper cycle.
|
|
StaleJobCount = prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Name: "acb_job_stale_count",
|
|
Help: "Number of stale jobs found in the most recent reaper cycle.",
|
|
})
|
|
|
|
// R2BytesUsed tracks the R2 warm cache size in bytes (index-builder updates).
|
|
R2BytesUsed = prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Name: "acb_r2_bytes_used",
|
|
Help: "Total bytes used in the R2 warm cache.",
|
|
})
|
|
|
|
// ReplayUploadLatency tracks B2 replay upload duration.
|
|
ReplayUploadLatency = prometheus.NewHistogram(prometheus.HistogramOpts{
|
|
Name: "acb_replay_upload_latency_seconds",
|
|
Help: "Latency of replay uploads to B2 in seconds.",
|
|
Buckets: prometheus.DefBuckets,
|
|
})
|
|
|
|
// EvolverGenerations counts evolution cycles completed.
|
|
EvolverGenerations = prometheus.NewCounter(prometheus.CounterOpts{
|
|
Name: "acb_evolver_generations_total",
|
|
Help: "Total number of evolution generations completed.",
|
|
})
|
|
|
|
// IndexBuildDuration tracks how long each index build cycle takes.
|
|
IndexBuildDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
|
|
Name: "acb_index_build_duration_seconds",
|
|
Help: "Duration of index build cycles in seconds.",
|
|
Buckets: []float64{1, 5, 10, 30, 60, 120, 300, 600},
|
|
})
|
|
|
|
// HTTPRequestsTotal counts HTTP requests served by the API.
|
|
HTTPRequestsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
Name: "acb_http_requests_total",
|
|
Help: "Total number of HTTP requests served.",
|
|
}, []string{"method", "path", "status"})
|
|
|
|
// BotsActive tracks the number of currently active bots (matchmaker health checker).
|
|
BotsActive = prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Name: "acb_bots_active",
|
|
Help: "Number of bots currently in active status.",
|
|
})
|
|
|
|
// BotsFailing tracks the number of bots failing health checks.
|
|
BotsFailing = prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Name: "acb_bots_failing",
|
|
Help: "Number of bots currently failing health checks.",
|
|
})
|
|
|
|
// WorkerMatchesTotal counts matches executed by the worker.
|
|
WorkerMatchesTotal = prometheus.NewCounter(prometheus.CounterOpts{
|
|
Name: "acb_worker_matches_total",
|
|
Help: "Total matches executed by this worker.",
|
|
})
|
|
|
|
// WorkerMatchErrorsTotal counts match execution errors.
|
|
WorkerMatchErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{
|
|
Name: "acb_worker_match_errors_total",
|
|
Help: "Total match execution errors.",
|
|
})
|
|
|
|
// WorkerJobsClaimedTotal counts jobs claimed by the worker.
|
|
WorkerJobsClaimedTotal = prometheus.NewCounter(prometheus.CounterOpts{
|
|
Name: "acb_worker_jobs_claimed_total",
|
|
Help: "Total jobs claimed by this worker.",
|
|
})
|
|
|
|
// WorkerMatchDuration tracks match execution time.
|
|
WorkerMatchDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
|
|
Name: "acb_worker_match_duration_seconds",
|
|
Help: "Match execution duration in seconds.",
|
|
Buckets: []float64{1, 5, 10, 30, 60, 120, 300, 600},
|
|
})
|
|
|
|
// RateLimitHits counts requests rejected by rate limiting.
|
|
RateLimitHits = prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
Name: "acb_rate_limit_hits_total",
|
|
Help: "Total number of requests rejected by rate limiting.",
|
|
}, []string{"endpoint"})
|
|
|
|
// EnrichmentCycles counts enrichment cycles completed.
|
|
EnrichmentCycles = prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
Name: "acb_enrichment_cycles_total",
|
|
Help: "Total number of enrichment cycles completed.",
|
|
}, []string{"status"})
|
|
|
|
// EnrichmentProcessed counts matches processed for enrichment.
|
|
EnrichmentProcessed = prometheus.NewCounter(prometheus.CounterOpts{
|
|
Name: "acb_enrichment_processed_total",
|
|
Help: "Total number of matches processed for enrichment.",
|
|
})
|
|
|
|
// EnrichmentGenerated counts successful commentaries generated.
|
|
EnrichmentGenerated = prometheus.NewCounter(prometheus.CounterOpts{
|
|
Name: "acb_enrichment_generated_total",
|
|
Help: "Total number of commentaries successfully generated.",
|
|
})
|
|
|
|
// EnrichmentCycleDuration tracks enrichment cycle duration.
|
|
EnrichmentCycleDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
|
|
Name: "acb_enrichment_cycle_duration_seconds",
|
|
Help: "Duration of enrichment cycles in seconds.",
|
|
Buckets: []float64{30, 60, 120, 300, 600, 900, 1800},
|
|
})
|
|
|
|
// EnrichmentLLMRequests counts LLM API requests for enrichment.
|
|
EnrichmentLLMRequests = prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
Name: "acb_enrichment_llm_requests_total",
|
|
Help: "Total number of LLM requests for enrichment.",
|
|
}, []string{"status"})
|
|
)
|
|
|
|
func init() {
|
|
prometheus.MustRegister(
|
|
MatchThroughput,
|
|
JobQueueDepth,
|
|
BotCrashed,
|
|
StaleJobCount,
|
|
R2BytesUsed,
|
|
ReplayUploadLatency,
|
|
EvolverGenerations,
|
|
IndexBuildDuration,
|
|
HTTPRequestsTotal,
|
|
BotsActive,
|
|
BotsFailing,
|
|
WorkerMatchesTotal,
|
|
WorkerMatchErrorsTotal,
|
|
WorkerJobsClaimedTotal,
|
|
WorkerMatchDuration,
|
|
RateLimitHits,
|
|
EnrichmentCycles,
|
|
EnrichmentProcessed,
|
|
EnrichmentGenerated,
|
|
EnrichmentCycleDuration,
|
|
EnrichmentLLMRequests,
|
|
)
|
|
}
|
|
|
|
// Handler returns an http.Handler that serves /metrics.
|
|
func Handler() http.Handler {
|
|
mux := http.NewServeMux()
|
|
mux.Handle("/metrics", promhttp.Handler())
|
|
mux.HandleFunc("/health", func(w http.ResponseWriter, _ *http.Request) {
|
|
w.WriteHeader(http.StatusOK)
|
|
w.Write([]byte(`{"status":"ok"}`))
|
|
})
|
|
return mux
|
|
}
|
|
|
|
// StartServer starts a Prometheus metrics HTTP server. Returns the server
|
|
// so the caller can shut it down gracefully. The address defaults to
|
|
// ACB_METRICS_ADDR env var, falling back to ":9090".
|
|
func StartServer() *http.Server {
|
|
addr := os.Getenv("ACB_METRICS_ADDR")
|
|
if addr == "" {
|
|
addr = ":9090"
|
|
}
|
|
srv := &http.Server{Addr: addr, Handler: Handler()}
|
|
go srv.ListenAndServe()
|
|
return srv
|
|
}
|
|
|
|
// HTTPMiddleware wraps an http.Handler to count requests via HTTPRequestsTotal.
|
|
func HTTPMiddleware(next http.Handler) http.Handler {
|
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
start := time.Now()
|
|
sw := &statusWriter{ResponseWriter: w, status: http.StatusOK}
|
|
next.ServeHTTP(sw, r)
|
|
HTTPRequestsTotal.WithLabelValues(r.Method, r.URL.Path, strconv.Itoa(sw.status)).Inc()
|
|
_ = start
|
|
})
|
|
}
|
|
|
|
// statusWriter wraps http.ResponseWriter to capture the status code that is
// actually sent to the client.
//
// The status field should be initialised to http.StatusOK by the creator so
// that handlers which never call WriteHeader are recorded as 200.
type statusWriter struct {
	http.ResponseWriter
	status int
	// wroteHeader is set once the final status is fixed, either by an
	// explicit WriteHeader or implicitly by the first Write.
	wroteHeader bool
}

// WriteHeader records code and forwards the call to the wrapped writer.
//
// net/http sends only the first final status and ignores later calls, so only
// the first final code is recorded. Informational 1xx codes (other than 101
// Switching Protocols) may precede the final status and are forwarded without
// being recorded.
func (w *statusWriter) WriteHeader(code int) {
	informational := code >= 100 && code <= 199 && code != http.StatusSwitchingProtocols
	if !w.wroteHeader && !informational {
		w.status = code
		w.wroteHeader = true
	}
	w.ResponseWriter.WriteHeader(code)
}

// Write forwards b to the wrapped writer. A Write before any WriteHeader
// implicitly commits the response with the status already held in w.status,
// so the status is latched here to stop a late WriteHeader from overwriting
// it.
func (w *statusWriter) Write(b []byte) (int, error) {
	w.wroteHeader = true
	return w.ResponseWriter.Write(b)
}

// Unwrap returns the wrapped ResponseWriter so that http.ResponseController
// can reach optional interfaces (Flush, Hijack, deadlines) on the underlying
// writer.
func (w *statusWriter) Unwrap() http.ResponseWriter {
	return w.ResponseWriter
}
|