zai-proxy/proxy/metrics.go

package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// Prometheus collectors exported by the proxy. Every collector is created
// through promauto, so it is registered as a side effect of package
// initialization. All metric names share the "zai_proxy_" prefix, and almost
// every collector carries a "variant" label.
var (
	// Request metrics

	// requestsTotal counts requests, partitioned by method, path, status_code,
	// and variant.
	requestsTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "zai_proxy_requests_total",
			Help: "Total number of requests by method, path, status code, and variant",
		},
		[]string{"method", "path", "status_code", "variant"},
	)

	// requestDuration is a histogram of request durations in seconds. The
	// explicit buckets run from 5 ms up to 300 s.
	requestDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "zai_proxy_request_duration_seconds",
			Help:    "Request duration in seconds",
			Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 30, 60, 120, 300},
		},
		[]string{"method", "path", "status_code", "variant"},
	)

	// requestSize is a histogram of request sizes in bytes. It has eight
	// buckets growing by a factor of ten, starting at 100 bytes. Unlike the
	// other request metrics it has no status_code label.
	requestSize = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "zai_proxy_request_size_bytes",
			Help:    "Request size in bytes",
			Buckets: prometheus.ExponentialBuckets(100, 10, 8),
		},
		[]string{"method", "path", "variant"},
	)

	// responseSize is a histogram of response sizes in bytes, using the same
	// bucket layout as requestSize.
	responseSize = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "zai_proxy_response_size_bytes",
			Help:    "Response size in bytes",
			Buckets: prometheus.ExponentialBuckets(100, 10, 8),
		},
		[]string{"method", "path", "status_code", "variant"},
	)

	// concurrentRequests is a gauge of the requests currently in flight, per
	// variant.
	concurrentRequests = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "zai_proxy_concurrent_requests",
			Help: "Number of requests currently being processed",
		},
		[]string{"variant"},
	)

	// upstreamErrors counts upstream errors, partitioned by error_type and
	// variant.
	upstreamErrors = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "zai_proxy_upstream_errors_total",
			Help: "Total number of upstream errors by type",
		},
		[]string{"error_type", "variant"},
	)

	// maxWorkers is a gauge of the configured ceiling on concurrent workers,
	// per variant.
	maxWorkers = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "zai_proxy_max_workers",
			Help: "Maximum number of concurrent workers allowed",
		},
		[]string{"variant"},
	)

	// workerUtilization is a gauge of concurrent_requests divided by
	// max_workers, per variant.
	workerUtilization = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "zai_proxy_worker_utilization_ratio",
			Help: "Current worker utilization ratio (concurrent_requests/max_workers)",
		},
		[]string{"variant"},
	)

	// Rate limiting metrics

	// rateLimitCurrentRate is a gauge of the rate limit currently in effect,
	// in requests per second.
	rateLimitCurrentRate = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "zai_proxy_rate_limit_requests_per_second",
			Help: "Current rate limit in requests per second",
		},
		[]string{"variant"},
	)

	// rateLimitWaitTime is a histogram of the time spent waiting for the rate
	// limiter, in seconds. The explicit buckets run from 1 ms up to 10 s.
	rateLimitWaitTime = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "zai_proxy_rate_limit_wait_seconds",
			Help:    "Time spent waiting for rate limiter",
			Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1, 2, 5, 10},
		},
		[]string{"variant"},
	)

	// rateLimitAdjustments counts changes made to the rate limit, partitioned
	// by the direction of the change.
	rateLimitAdjustments = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "zai_proxy_rate_limit_adjustments_total",
			Help: "Total number of rate limit adjustments",
		},
		[]string{"direction", "variant"}, // direction is "increase" or "decrease"
	)

	// rateLimitRejections counts requests rejected because of rate limiting.
	rateLimitRejections = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "zai_proxy_rate_limit_rejections_total",
			Help: "Total number of requests rejected due to rate limiting",
		},
		[]string{"variant"},
	)

	// retryAttempts counts retry attempts, partitioned by the reason for the
	// retry.
	retryAttempts = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "zai_proxy_retry_attempts_total",
			Help: "Total number of retry attempts",
		},
		[]string{"reason", "variant"}, // reason is "429" or "network_error"
	)

	// Token counting metrics

	// tokensTotal counts processed tokens, partitioned by direction, model,
	// variant, and pricing_tier.
	tokensTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "zai_proxy_tokens_total",
			Help: "Total number of tokens processed by direction (input/output), model, deployment variant, and pricing tier",
		},
		[]string{"direction", "model", "variant", "pricing_tier"},
	)

	// tokenCountDuration is a histogram of how long token counting operations
	// take, in seconds. The explicit buckets run from 0.1 ms up to 100 ms.
	tokenCountDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "zai_proxy_token_count_duration_seconds",
			Help:    "Duration of token counting operations",
			Buckets: []float64{.0001, .0005, .001, .005, .01, .025, .05, .1},
		},
		[]string{"variant"},
	)

	// tokenRateSeconds is the time-based token metric. Despite the "rate" in
	// its name, it is observed in seconds (the time taken to process a batch
	// of tokens), so lower values mean faster processing. The explicit buckets
	// run from 10 microseconds up to 100 ms.
	tokenRateSeconds = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "zai_proxy_token_rate_seconds",
			Help:    "Token processing rate histogram - tracks time taken to process tokens (lower is faster). Labels: direction (input/output), model (glm-4, etc.), variant (stable/canary)",
			Buckets: []float64{.00001, .00005, .0001, .0005, .001, .005, .01, .05, .1},
		},
		[]string{"direction", "model", "variant"},
	)

	// tokenRate is the throughput-based companion to tokenRateSeconds: a
	// histogram of tokens processed per second. The explicit buckets run from
	// 10 up to 100000 tokens per second.
	tokenRate = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "zai_proxy_token_rate",
			Help:    "Token processing rate in tokens per second (throughput). Labels: direction (input/output), model (glm-4, etc.), variant (stable/canary)",
			Buckets: []float64{10, 50, 100, 250, 500, 1000, 2500, 5000, 10000, 25000, 50000, 100000},
		},
		[]string{"direction", "model", "variant"},
	)

	// buildInfo is a gauge used for version tracking. The build details are
	// carried in its labels (version, variant, commit, build_time) rather
	// than in its value.
	buildInfo = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "zai_proxy_build_info",
			Help: "Build information including version, variant, commit, and build time",
		},
		[]string{"version", "variant", "commit", "build_time"},
	)
)

// pricingLocation is the time zone in which the peak pricing window is
// evaluated. It is resolved once at package initialization because
// time.LoadLocation consults the time zone database on every call.
var pricingLocation = loadPricingLocation()

// loadPricingLocation returns the US Eastern time zone, including its
// daylight-saving rules. When the time zone database is unavailable (for
// example in a minimal container image) it falls back to a fixed UTC-5 zone,
// which is correct only while standard time is in effect.
func loadPricingLocation() *time.Location {
	if loc, err := time.LoadLocation("America/New_York"); err == nil {
		return loc
	}
	// Best-effort fallback: a fixed offset cannot follow DST, but it keeps
	// the metric usable instead of failing at startup.
	return time.FixedZone("ET", -5*3600)
}

// pricingTierAt reports the pricing tier in effect at instant t.
func pricingTierAt(t time.Time) string {
	// The peak window is half-open: 02:00 inclusive to 06:00 exclusive.
	if hour := t.In(pricingLocation).Hour(); hour >= 2 && hour < 6 {
		return "peak"
	}
	return "off_peak"
}

// GetPricingTier returns "peak" during 02:00-06:00 ET, "off_peak" otherwise.
// Z.AI Coding Plan: 1x off-peak, 2x peak.
//
// "ET" is evaluated as US Eastern wall-clock time, so the window follows
// daylight saving time instead of being pinned to UTC-5 all year.
func GetPricingTier() string {
	return pricingTierAt(time.Now())
}

// RecordInputTokens records input token count metrics
func RecordInputTokens(model string, version string, count int) {
	if count > 0 {
		tokensTotal.WithLabelValues("input", model, version, GetPricingTier()).Add(float64(count))
	}
}

// RecordOutputTokens records output token count metrics
func RecordOutputTokens(model string, version string, count int) {
	if count > 0 {
		tokensTotal.WithLabelValues("output", model, version, GetPricingTier()).Add(float64(count))
	}
}

// RecordTokenRate feeds one measurement into both token rate histograms:
// tokenRateSeconds receives the elapsed time in seconds, and tokenRate
// receives the throughput in tokens per second.
//
// Parameters:
//   - direction: "input" or "output"
//   - model: tokenizer model name (e.g., "glm-4", "claude-3")
//   - version: deployment variant ("stable" or "canary")
//   - duration: time taken to process the tokens
//   - tokenCount: number of tokens processed
//
// Nothing is recorded unless both duration and tokenCount are positive.
func RecordTokenRate(direction string, model string, version string, duration time.Duration, tokenCount int) {
	if duration <= 0 || tokenCount <= 0 {
		return
	}

	elapsed := duration.Seconds()
	labels := []string{direction, model, version}

	// Time-based view: seconds spent on this batch of tokens.
	tokenRateSeconds.WithLabelValues(labels...).Observe(elapsed)

	// Throughput view: tokens handled per second over the same interval.
	tokenRate.WithLabelValues(labels...).Observe(float64(tokenCount) / elapsed)
}

// RecordInputTokenRate records the token processing rate for input tokens.
// It forwards its arguments unchanged to RecordTokenRate with the direction
// fixed to "input".
func RecordInputTokenRate(model string, version string, duration time.Duration, tokenCount int) {
	const direction = "input"
	RecordTokenRate(direction, model, version, duration, tokenCount)
}

// RecordOutputTokenRate records the token processing rate for output tokens.
// It forwards its arguments unchanged to RecordTokenRate with the direction
// fixed to "output".
func RecordOutputTokenRate(model string, version string, duration time.Duration, tokenCount int) {
	const direction = "output"
	RecordTokenRate(direction, model, version, duration, tokenCount)
}

// RecordUsage adds the four token counts carried by usage to the tokensTotal
// counter in one call, under the directions "input", "output", "cache_read",
// and "cache_write". The pricing tier is sampled once and shared by all four
// directions; a count that is not positive is skipped.
func RecordUsage(model, variant string, usage UsageData) {
	tier := GetPricingTier()

	// add increments the counter for one direction, ignoring empty counts.
	add := func(direction string, tokens float64) {
		if tokens > 0 {
			tokensTotal.WithLabelValues(direction, model, variant, tier).Add(tokens)
		}
	}

	add("input", float64(usage.InputTokens))
	add("output", float64(usage.OutputTokens))
	add("cache_read", float64(usage.CacheReadTokens))
	add("cache_write", float64(usage.CacheWriteTokens))
}