zai-proxy/dashboard/model/metrics.go
jedarden 9799d75d2b feat(dashboard): add cache token tracking and running totals panel
Adds cache_read and cache_write token directions throughout the
observability stack so Anthropic prompt-cache billing is visible.

- model/metrics.go: TokensCacheRead, TokensCacheWrite, TokenRateCacheRead,
  TokenRateCacheWrite fields on MetricSnapshot
- collector: reads direction=cache_read/cache_write from
  zai_proxy_tokens_total Prometheus metric
- frontend types.ts: matching TS fields
- TokenPanel: rewritten to show all 4 directions (input, output,
  cache_read, cache_write) on the rate chart; running-total summary
  strip above the chart shows window totals (e.g. "5h window: 1.2M
  input / 340k output / 89k cache_read / 12k cache_write")

Also updates docs/plan/plan.md to accurately document the full
dashboard architecture (backend API, storage schema, SSE hub,
frontend panels, Grafana layer, env vars).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-16 23:08:28 -04:00

102 lines
4.8 KiB
Go

// Package model defines the data structures for metrics snapshots.
package model
import (
"encoding/json"
"time"
)
// MetricSnapshot is one point-in-time sample of the metrics collected from a
// zai-proxy instance. Each field is encoded to JSON under the name given by
// its struct tag.
type MetricSnapshot struct {
	// Timestamp is the Unix timestamp in milliseconds.
	Timestamp int64 `json:"timestamp"`
	// Variant is "production" or "canary".
	Variant string `json:"variant"`

	// Requests2xx is the total number of 2xx requests.
	Requests2xx float64 `json:"requests_2xx"`
	// Requests4xx is the total number of 4xx requests.
	Requests4xx float64 `json:"requests_4xx"`
	// Requests5xx is the total number of 5xx requests.
	Requests5xx float64 `json:"requests_5xx"`

	// TokensInput is the total number of input tokens.
	TokensInput float64 `json:"tokens_input"`
	// TokensOutput is the total number of output tokens.
	TokensOutput float64 `json:"tokens_output"`
	// TokensCacheRead is the total number of cache-read tokens.
	TokensCacheRead float64 `json:"tokens_cache_read"`
	// TokensCacheWrite is the total number of cache-write tokens.
	TokensCacheWrite float64 `json:"tokens_cache_write"`

	// ConcurrentRequests is the current number of concurrent requests.
	ConcurrentRequests float64 `json:"concurrent_requests"`
	// MaxWorkers is the maximum number of workers.
	MaxWorkers float64 `json:"max_workers"`

	// RateLimitRps is the current rate limit in requests per second.
	RateLimitRps float64 `json:"rate_limit_rps"`
	// RateLimitRejections is the total number of rate limit rejections.
	RateLimitRejections float64 `json:"rate_limit_rejections"`
	// RateLimitAdjIncrease is the total number of rate limit increases.
	RateLimitAdjIncrease float64 `json:"rate_limit_adj_increase"`
	// RateLimitAdjDecrease is the total number of rate limit decreases.
	RateLimitAdjDecrease float64 `json:"rate_limit_adj_decrease"`

	// UpstreamErrors is the total number of upstream errors.
	UpstreamErrors float64 `json:"upstream_errors"`
	// RetryAttempts is the total number of retry attempts.
	RetryAttempts float64 `json:"retry_attempts"`

	// LatencyP50 is the p50 request latency in milliseconds.
	LatencyP50 float64 `json:"latency_p50"`
	// LatencyP95 is the p95 request latency in milliseconds.
	LatencyP95 float64 `json:"latency_p95"`
	// LatencyP99 is the p99 request latency in milliseconds.
	LatencyP99 float64 `json:"latency_p99"`

	// RequestSizeAvg is the average request size in bytes.
	RequestSizeAvg float64 `json:"request_size_avg"`
	// ResponseSizeAvg is the average response size in bytes.
	ResponseSizeAvg float64 `json:"response_size_avg"`

	// TokenRateIn is the input token rate in tokens per second.
	TokenRateIn float64 `json:"token_rate_in"`
	// TokenRateOut is the output token rate in tokens per second.
	TokenRateOut float64 `json:"token_rate_out"`
	// TokenRateCacheRead is the cache-read token rate in tokens per second.
	TokenRateCacheRead float64 `json:"token_rate_cache_read"`
	// TokenRateCacheWrite is the cache-write token rate in tokens per second.
	TokenRateCacheWrite float64 `json:"token_rate_cache_write"`

	// ReqRate is the request rate in requests per second.
	ReqRate float64 `json:"req_rate"`
	// ErrorRatePct is the error rate as a percentage.
	ErrorRatePct float64 `json:"error_rate_pct"`
	// WorkerUtilization is the worker utilization ratio (0-1).
	WorkerUtilization float64 `json:"worker_utilization"`

	// StatusCodeRates holds per-status-code request rates in requests per
	// second. It is left out of the JSON output when empty.
	StatusCodeRates map[string]float64 `json:"status_code_rates,omitempty"`
}
// ToJSON encodes the snapshot with json.Marshal and returns the resulting
// bytes together with any encoding error.
func (s *MetricSnapshot) ToJSON() ([]byte, error) {
	encoded, err := json.Marshal(s)
	return encoded, err
}
// FromJSON decodes a snapshot from JSON bytes.
//
// On success it returns the decoded snapshot and a nil error. If data cannot
// be decoded into a MetricSnapshot, it returns a nil snapshot together with
// the error reported by json.Unmarshal.
func FromJSON(data []byte) (*MetricSnapshot, error) {
	var s MetricSnapshot
	if err := json.Unmarshal(data, &s); err != nil {
		// json.Unmarshal may have filled in some fields before failing
		// (for example on a type mismatch); do not hand that partially
		// populated value back to the caller.
		return nil, err
	}
	return &s, nil
}
// VariantStatus is the health summary reported for a single variant.
type VariantStatus struct {
	// Health flag and time of the last scrape.
	Healthy    bool      `json:"healthy"`
	LastScrape time.Time `json:"last_scrape"`

	// Request rate, error rate and p50 latency.
	ReqRate      float64 `json:"req_rate"`
	ErrorRatePct float64 `json:"error_rate_pct"`
	LatencyP50Ms float64 `json:"latency_p50_ms"`

	// Concurrency, worker utilization and rate limit.
	Concurrent        float64 `json:"concurrent"`
	WorkerUtilization float64 `json:"worker_utilization"`
	RateLimitRps      float64 `json:"rate_limit_rps"`

	// Token rates.
	TokenRateIn  float64 `json:"token_rate_in"`
	TokenRateOut float64 `json:"token_rate_out"`
}
// StatusResponse is the body returned by /api/status. A variant whose pointer
// is nil is left out of the JSON output.
type StatusResponse struct {
	// Production is the status of the production variant.
	Production *VariantStatus `json:"production,omitempty"`
	// Canary is the status of the canary variant.
	Canary *VariantStatus `json:"canary,omitempty"`
}
// MarshalJSON implements json.Marshaler for StatusResponse. The receiver is
// converted to a locally declared type that has the same fields but no
// methods, so the nested json.Marshal call encodes the fields directly
// instead of calling MarshalJSON again.
func (r *StatusResponse) MarshalJSON() ([]byte, error) {
	type plain StatusResponse
	view := (*plain)(r)
	return json.Marshal(view)
}
// SSEMessage is a single message sent over SSE.
type SSEMessage struct {
	// Type identifies the kind of message.
	Type string `json:"type"`
	// Data is the snapshot carried by the message; omitted from JSON when nil.
	Data *MetricSnapshot `json:"data,omitempty"`

	// The remaining fields are for "connected" messages and are omitted from
	// JSON when empty.
	ScrapeInterval int      `json:"scrape_interval,omitempty"`
	Variants       []string `json:"variants,omitempty"`
}
// HistogramBucket is one bucket of a Prometheus histogram.
type HistogramBucket struct {
	// UpperBound is the bucket's upper bound; Count is the count recorded
	// for the bucket.
	UpperBound, Count float64
}
// Histogram is a parsed Prometheus histogram.
type Histogram struct {
	// Buckets holds the histogram's buckets.
	Buckets []HistogramBucket
	// Sum and Count are the histogram's sum and count values.
	Sum, Count float64
}