Extracted from ardenone-cluster/containers/zai-proxy and
ardenone-cluster/containers/zai-proxy-dashboard.

- proxy/: OpenAI-compatible ZAI reverse proxy (Go, v1.10.0)
  - Token counting, rate limiting, Prometheus metrics, canary support
- dashboard/: Metrics dashboard backend + React frontend (Go, v1.0.0)
  - Prometheus collector, SQLite storage, SSE live updates
- docs/: Operational notes, research, and plan subdirs

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
244 lines
8.1 KiB
Go
244 lines
8.1 KiB
Go
package main
|
|
|
|
import (
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/promauto"
|
|
)
|
|
|
|
var (
|
|
// Request metrics
|
|
requestsTotal = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "zai_proxy_requests_total",
|
|
Help: "Total number of requests by method, path, status code, and variant",
|
|
},
|
|
[]string{"method", "path", "status_code", "variant"},
|
|
)
|
|
|
|
requestDuration = promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "zai_proxy_request_duration_seconds",
|
|
Help: "Request duration in seconds",
|
|
Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 30, 60, 120, 300},
|
|
},
|
|
[]string{"method", "path", "status_code", "variant"},
|
|
)
|
|
|
|
requestSize = promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "zai_proxy_request_size_bytes",
|
|
Help: "Request size in bytes",
|
|
Buckets: prometheus.ExponentialBuckets(100, 10, 8),
|
|
},
|
|
[]string{"method", "path", "variant"},
|
|
)
|
|
|
|
responseSize = promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "zai_proxy_response_size_bytes",
|
|
Help: "Response size in bytes",
|
|
Buckets: prometheus.ExponentialBuckets(100, 10, 8),
|
|
},
|
|
[]string{"method", "path", "status_code", "variant"},
|
|
)
|
|
|
|
concurrentRequests = promauto.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Name: "zai_proxy_concurrent_requests",
|
|
Help: "Number of requests currently being processed",
|
|
},
|
|
[]string{"variant"},
|
|
)
|
|
|
|
upstreamErrors = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "zai_proxy_upstream_errors_total",
|
|
Help: "Total number of upstream errors by type",
|
|
},
|
|
[]string{"error_type", "variant"},
|
|
)
|
|
|
|
maxWorkers = promauto.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Name: "zai_proxy_max_workers",
|
|
Help: "Maximum number of concurrent workers allowed",
|
|
},
|
|
[]string{"variant"},
|
|
)
|
|
|
|
workerUtilization = promauto.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Name: "zai_proxy_worker_utilization_ratio",
|
|
Help: "Current worker utilization ratio (concurrent_requests/max_workers)",
|
|
},
|
|
[]string{"variant"},
|
|
)
|
|
|
|
// Rate limiting metrics
|
|
rateLimitCurrentRate = promauto.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Name: "zai_proxy_rate_limit_requests_per_second",
|
|
Help: "Current rate limit in requests per second",
|
|
},
|
|
[]string{"variant"},
|
|
)
|
|
|
|
rateLimitWaitTime = promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "zai_proxy_rate_limit_wait_seconds",
|
|
Help: "Time spent waiting for rate limiter",
|
|
Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1, 2, 5, 10},
|
|
},
|
|
[]string{"variant"},
|
|
)
|
|
|
|
rateLimitAdjustments = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "zai_proxy_rate_limit_adjustments_total",
|
|
Help: "Total number of rate limit adjustments",
|
|
},
|
|
[]string{"direction", "variant"}, // "increase" or "decrease"
|
|
)
|
|
|
|
rateLimitRejections = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "zai_proxy_rate_limit_rejections_total",
|
|
Help: "Total number of requests rejected due to rate limiting",
|
|
},
|
|
[]string{"variant"},
|
|
)
|
|
|
|
retryAttempts = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "zai_proxy_retry_attempts_total",
|
|
Help: "Total number of retry attempts",
|
|
},
|
|
[]string{"reason", "variant"}, // "429" or "network_error"
|
|
)
|
|
|
|
// Token counting metrics
|
|
tokensTotal = promauto.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "zai_proxy_tokens_total",
|
|
Help: "Total number of tokens processed by direction (input/output), model, deployment variant, and pricing tier",
|
|
},
|
|
[]string{"direction", "model", "variant", "pricing_tier"},
|
|
)
|
|
|
|
tokenCountDuration = promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "zai_proxy_token_count_duration_seconds",
|
|
Help: "Duration of token counting operations",
|
|
Buckets: []float64{.0001, .0005, .001, .005, .01, .025, .05, .1},
|
|
},
|
|
[]string{"variant"},
|
|
)
|
|
|
|
// Token rate metrics - tracks tokens per second throughput
|
|
// This measures how fast tokens are being processed (tokenization speed)
|
|
tokenRateSeconds = promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "zai_proxy_token_rate_seconds",
|
|
Help: "Token processing rate histogram - tracks time taken to process tokens (lower is faster). Labels: direction (input/output), model (glm-4, etc.), variant (stable/canary)",
|
|
Buckets: []float64{.00001, .00005, .0001, .0005, .001, .005, .01, .05, .1},
|
|
},
|
|
[]string{"direction", "model", "variant"},
|
|
)
|
|
|
|
// Alternative token rate metric - tokens per second throughput
|
|
tokenRate = promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "zai_proxy_token_rate",
|
|
Help: "Token processing rate in tokens per second (throughput). Labels: direction (input/output), model (glm-4, etc.), variant (stable/canary)",
|
|
Buckets: []float64{10, 50, 100, 250, 500, 1000, 2500, 5000, 10000, 25000, 50000, 100000},
|
|
},
|
|
[]string{"direction", "model", "variant"},
|
|
)
|
|
|
|
// Build info metric for version tracking
|
|
buildInfo = promauto.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Name: "zai_proxy_build_info",
|
|
Help: "Build information including version, variant, commit, and build time",
|
|
},
|
|
[]string{"version", "variant", "commit", "build_time"},
|
|
)
|
|
)
|
|
|
|
// easternLocation is the time zone in which pricing-tier hours are evaluated.
// It is resolved once at package initialization so GetPricingTier does not
// consult the time zone database on every call.
var easternLocation = loadEasternLocation()

// loadEasternLocation returns the America/New_York zone, which follows US
// Eastern daylight saving rules. When the zone database cannot be loaded (for
// example in a minimal container image that ships no tzdata), it deliberately
// falls back to a fixed UTC-5 offset, which is what this code used previously.
func loadEasternLocation() *time.Location {
	if loc, err := time.LoadLocation("America/New_York"); err == nil {
		return loc
	}
	return time.FixedZone("ET", -5*3600)
}

// pricingTierAt reports the pricing tier in effect at instant t: "peak" when
// the Eastern wall-clock hour is in [2, 6), "off_peak" otherwise.
func pricingTierAt(t time.Time) string {
	if hour := t.In(easternLocation).Hour(); hour >= 2 && hour < 6 {
		return "peak"
	}
	return "off_peak"
}

// GetPricingTier returns "peak" during 02:00-06:00 ET, "off_peak" otherwise.
// Z.AI Coding Plan: 1x off-peak, 2x peak.
//
// The window is evaluated against Eastern wall-clock time including daylight
// saving time. A fixed UTC-5 offset would shift the window by one hour while
// DST is in effect.
func GetPricingTier() string {
	return pricingTierAt(time.Now())
}
|
|
|
|
// RecordInputTokens records input token count metrics
|
|
func RecordInputTokens(model string, version string, count int) {
|
|
if count > 0 {
|
|
tokensTotal.WithLabelValues("input", model, version, GetPricingTier()).Add(float64(count))
|
|
}
|
|
}
|
|
|
|
// RecordOutputTokens records output token count metrics
|
|
func RecordOutputTokens(model string, version string, count int) {
|
|
if count > 0 {
|
|
tokensTotal.WithLabelValues("output", model, version, GetPricingTier()).Add(float64(count))
|
|
}
|
|
}
|
|
|
|
// RecordTokenRate records token processing rate metrics
|
|
// This function records BOTH time-based and throughput-based token rate metrics
|
|
// Parameters:
|
|
// - direction: "input" or "output"
|
|
// - model: tokenizer model name (e.g., "glm-4", "claude-3")
|
|
// - version: deployment variant ("stable" or "canary")
|
|
// - duration: time taken to process the tokens
|
|
// - tokenCount: number of tokens processed
|
|
func RecordTokenRate(direction string, model string, version string, duration time.Duration, tokenCount int) {
|
|
if tokenCount <= 0 || duration <= 0 {
|
|
return
|
|
}
|
|
|
|
// Record time-based metric (seconds taken to process tokens)
|
|
tokenRateSeconds.WithLabelValues(direction, model, version).Observe(duration.Seconds())
|
|
|
|
// Record throughput-based metric (tokens per second)
|
|
tokensPerSecond := float64(tokenCount) / duration.Seconds()
|
|
tokenRate.WithLabelValues(direction, model, version).Observe(tokensPerSecond)
|
|
}
|
|
|
|
// RecordInputTokenRate records input token processing rate
|
|
// This is a convenience wrapper for RecordTokenRate with direction="input"
|
|
func RecordInputTokenRate(model string, version string, duration time.Duration, tokenCount int) {
|
|
RecordTokenRate("input", model, version, duration, tokenCount)
|
|
}
|
|
|
|
// RecordOutputTokenRate records output token processing rate
|
|
// This is a convenience wrapper for RecordTokenRate with direction="output"
|
|
func RecordOutputTokenRate(model string, version string, duration time.Duration, tokenCount int) {
|
|
RecordTokenRate("output", model, version, duration, tokenCount)
|
|
}
|
|
|
|
// RecordUsage records all four token counts from a UsageData in a single call.
|
|
// Directions: "input", "output", "cache_read", "cache_write".
|
|
func RecordUsage(model, variant string, usage UsageData) {
|
|
tier := GetPricingTier()
|
|
if usage.InputTokens > 0 {
|
|
tokensTotal.WithLabelValues("input", model, variant, tier).Add(float64(usage.InputTokens))
|
|
}
|
|
if usage.OutputTokens > 0 {
|
|
tokensTotal.WithLabelValues("output", model, variant, tier).Add(float64(usage.OutputTokens))
|
|
}
|
|
if usage.CacheReadTokens > 0 {
|
|
tokensTotal.WithLabelValues("cache_read", model, variant, tier).Add(float64(usage.CacheReadTokens))
|
|
}
|
|
if usage.CacheWriteTokens > 0 {
|
|
tokensTotal.WithLabelValues("cache_write", model, variant, tier).Add(float64(usage.CacheWriteTokens))
|
|
}
|
|
}
|