package main import ( "time" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" ) var ( // Request metrics requestsTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "zai_proxy_requests_total", Help: "Total number of requests by method, path, status code, and variant", }, []string{"method", "path", "status_code", "variant"}, ) requestDuration = promauto.NewHistogramVec( prometheus.HistogramOpts{ Name: "zai_proxy_request_duration_seconds", Help: "Request duration in seconds", Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 30, 60, 120, 300}, }, []string{"method", "path", "status_code", "variant"}, ) requestSize = promauto.NewHistogramVec( prometheus.HistogramOpts{ Name: "zai_proxy_request_size_bytes", Help: "Request size in bytes", Buckets: prometheus.ExponentialBuckets(100, 10, 8), }, []string{"method", "path", "variant"}, ) responseSize = promauto.NewHistogramVec( prometheus.HistogramOpts{ Name: "zai_proxy_response_size_bytes", Help: "Response size in bytes", Buckets: prometheus.ExponentialBuckets(100, 10, 8), }, []string{"method", "path", "status_code", "variant"}, ) concurrentRequests = promauto.NewGaugeVec( prometheus.GaugeOpts{ Name: "zai_proxy_concurrent_requests", Help: "Number of requests currently being processed", }, []string{"variant"}, ) upstreamErrors = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "zai_proxy_upstream_errors_total", Help: "Total number of upstream errors by type", }, []string{"error_type", "variant"}, ) maxWorkers = promauto.NewGaugeVec( prometheus.GaugeOpts{ Name: "zai_proxy_max_workers", Help: "Maximum number of concurrent workers allowed", }, []string{"variant"}, ) workerUtilization = promauto.NewGaugeVec( prometheus.GaugeOpts{ Name: "zai_proxy_worker_utilization_ratio", Help: "Current worker utilization ratio (concurrent_requests/max_workers)", }, []string{"variant"}, ) // Rate limiting metrics rateLimitCurrentRate = promauto.NewGaugeVec( prometheus.GaugeOpts{ Name: "zai_proxy_rate_limit_requests_per_second", Help: "Current rate limit in requests per second", }, []string{"variant"}, ) rateLimitWaitTime = promauto.NewHistogramVec( prometheus.HistogramOpts{ Name: "zai_proxy_rate_limit_wait_seconds", Help: "Time spent waiting for rate limiter", Buckets: []float64{.001, .005, .01, .025, .05, .1, .25, .5, 1, 2, 5, 10}, }, []string{"variant"}, ) rateLimitAdjustments = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "zai_proxy_rate_limit_adjustments_total", Help: "Total number of rate limit adjustments", }, []string{"direction", "variant"}, // "increase" or "decrease" ) rateLimitRejections = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "zai_proxy_rate_limit_rejections_total", Help: "Total number of requests rejected due to rate limiting", }, []string{"variant"}, ) retryAttempts = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "zai_proxy_retry_attempts_total", Help: "Total number of retry attempts", }, []string{"reason", "variant"}, // "429" or "network_error" ) // Token counting metrics tokensTotal = promauto.NewCounterVec( prometheus.CounterOpts{ Name: "zai_proxy_tokens_total", Help: "Total number of tokens processed by direction (input/output), model, deployment variant, and pricing tier", }, []string{"direction", "model", "variant", "pricing_tier"}, ) tokenCountDuration = promauto.NewHistogramVec( prometheus.HistogramOpts{ Name: "zai_proxy_token_count_duration_seconds", Help: "Duration of token counting operations", Buckets: []float64{.0001, .0005, .001, .005, .01, .025, .05, .1}, }, []string{"variant"}, ) // Token rate metrics - tracks tokens per second throughput // This measures how fast tokens are being processed (tokenization speed) tokenRateSeconds = promauto.NewHistogramVec( prometheus.HistogramOpts{ Name: "zai_proxy_token_rate_seconds", Help: "Token processing rate histogram - tracks time taken to process tokens (lower is faster). Labels: direction (input/output), model (glm-4, etc.), variant (stable/canary)", Buckets: []float64{.00001, .00005, .0001, .0005, .001, .005, .01, .05, .1}, }, []string{"direction", "model", "variant"}, ) // Alternative token rate metric - tokens per second throughput tokenRate = promauto.NewHistogramVec( prometheus.HistogramOpts{ Name: "zai_proxy_token_rate", Help: "Token processing rate in tokens per second (throughput). Labels: direction (input/output), model (glm-4, etc.), variant (stable/canary)", Buckets: []float64{10, 50, 100, 250, 500, 1000, 2500, 5000, 10000, 25000, 50000, 100000}, }, []string{"direction", "model", "variant"}, ) // Build info metric for version tracking buildInfo = promauto.NewGaugeVec( prometheus.GaugeOpts{ Name: "zai_proxy_build_info", Help: "Build information including version, variant, commit, and build time", }, []string{"version", "variant", "commit", "build_time"}, ) ) // GetPricingTier returns "peak" during 02:00-06:00 ET, "off_peak" otherwise. // Z.AI Coding Plan: 1x off-peak, 2x peak. func GetPricingTier() string { et := time.FixedZone("ET", -5*3600) hour := time.Now().In(et).Hour() if hour >= 2 && hour < 6 { return "peak" } return "off_peak" } // RecordInputTokens records input token count metrics func RecordInputTokens(model string, version string, count int) { if count > 0 { tokensTotal.WithLabelValues("input", model, version, GetPricingTier()).Add(float64(count)) } } // RecordOutputTokens records output token count metrics func RecordOutputTokens(model string, version string, count int) { if count > 0 { tokensTotal.WithLabelValues("output", model, version, GetPricingTier()).Add(float64(count)) } } // RecordTokenRate records token processing rate metrics // This function records BOTH time-based and throughput-based token rate metrics // Parameters: // - direction: "input" or "output" // - model: tokenizer model name (e.g., "glm-4", "claude-3") // - version: deployment variant ("stable" or "canary") // - duration: time taken to process the tokens // - tokenCount: number of tokens processed func RecordTokenRate(direction string, model string, version string, duration time.Duration, tokenCount int) { if tokenCount <= 0 || duration <= 0 { return } // Record time-based metric (seconds taken to process tokens) tokenRateSeconds.WithLabelValues(direction, model, version).Observe(duration.Seconds()) // Record throughput-based metric (tokens per second) tokensPerSecond := float64(tokenCount) / duration.Seconds() tokenRate.WithLabelValues(direction, model, version).Observe(tokensPerSecond) } // RecordInputTokenRate records input token processing rate // This is a convenience wrapper for RecordTokenRate with direction="input" func RecordInputTokenRate(model string, version string, duration time.Duration, tokenCount int) { RecordTokenRate("input", model, version, duration, tokenCount) } // RecordOutputTokenRate records output token processing rate // This is a convenience wrapper for RecordTokenRate with direction="output" func RecordOutputTokenRate(model string, version string, duration time.Duration, tokenCount int) { RecordTokenRate("output", model, version, duration, tokenCount) } // RecordUsage records all four token counts from a UsageData in a single call. // Directions: "input", "output", "cache_read", "cache_write". func RecordUsage(model, variant string, usage UsageData) { tier := GetPricingTier() if usage.InputTokens > 0 { tokensTotal.WithLabelValues("input", model, variant, tier).Add(float64(usage.InputTokens)) } if usage.OutputTokens > 0 { tokensTotal.WithLabelValues("output", model, variant, tier).Add(float64(usage.OutputTokens)) } if usage.CacheReadTokens > 0 { tokensTotal.WithLabelValues("cache_read", model, variant, tier).Add(float64(usage.CacheReadTokens)) } if usage.CacheWriteTokens > 0 { tokensTotal.WithLabelValues("cache_write", model, variant, tier).Add(float64(usage.CacheWriteTokens)) } }