feat(dashboard): add cache token tracking and running totals panel

Adds cache_read and cache_write token directions throughout the observability stack so Anthropic prompt-cache billing is visible. - model/metrics.go: TokensCacheRead, TokensCacheWrite, TokenRateCacheRead, TokenRateCacheWrite fields on MetricSnapshot - collector: reads direction=cache_read/cache_write from zai_proxy_tokens_total Prometheus metric - frontend types.ts: matching TS fields - TokenPanel: rewritten to show all 4 directions (input, output, cache_read, cache_write) on the rate chart; running-total summary strip above the chart shows window totals (e.g. "5h window: 1.2M input / 340k output / 89k cache_read / 12k cache_write") Also updates docs/plan/plan.md to accurately document the full dashboard architecture (backend API, storage schema, SSE hub, frontend panels, Grafana layer, env vars). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-16 23:08:28 -04:00 · 2026-05-16 23:08:28 -04:00 · 9799d75d2b
commit 9799d75d2b
parent dee82a76a3
5 changed files with 456 additions and 192 deletions
--- a/dashboard/collector/collector.go
+++ b/dashboard/collector/collector.go
@ -259,6 +259,8 @@ func (c *Collector) buildSnapshot(cur, prev map[string][]MetricValue, now, prevT

 	s.TokensInput = sumMetric("zai_proxy_tokens_total", map[string]string{"direction": "input"})
 	s.TokensOutput = sumMetric("zai_proxy_tokens_total", map[string]string{"direction": "output"})
+	s.TokensCacheRead = sumMetric("zai_proxy_tokens_total", map[string]string{"direction": "cache_read"})
+	s.TokensCacheWrite = sumMetric("zai_proxy_tokens_total", map[string]string{"direction": "cache_write"})
 	s.ConcurrentRequests = sumMetric("zai_proxy_concurrent_requests", nil)
 	s.MaxWorkers = sumMetric("zai_proxy_max_workers", nil)
 	s.RateLimitRps = sumMetric("zai_proxy_rate_limit_requests_per_second", nil)
@ -273,6 +275,8 @@ func (c *Collector) buildSnapshot(cur, prev map[string][]MetricValue, now, prevT
 	s.ReqRate = prefixRate("zai_proxy_requests_total", "status_code", "")
 	s.TokenRateIn = rate("zai_proxy_tokens_total", map[string]string{"direction": "input"})
 	s.TokenRateOut = rate("zai_proxy_tokens_total", map[string]string{"direction": "output"})
+	s.TokenRateCacheRead = rate("zai_proxy_tokens_total", map[string]string{"direction": "cache_read"})
+	s.TokenRateCacheWrite = rate("zai_proxy_tokens_total", map[string]string{"direction": "cache_write"})

 	// Per-status-code rates
 	s.StatusCodeRates = make(map[string]float64)
--- a/dashboard/frontend/src/components/panels/TokenPanel.tsx
+++ b/dashboard/frontend/src/components/panels/TokenPanel.tsx
@ -21,22 +21,36 @@ interface TokenPanelProps {
 interface ChartDataPoint {
  timestamp: number;
  time: string;
-  // Production data
-  token_rate_in_prod?: number;
-  token_rate_out_prod?: number;
-  // Canary data
-  token_rate_in_canary?: number;
-  token_rate_out_canary?: number;
-  // Single variant
+  // Single-variant keys
  token_rate_in?: number;
  token_rate_out?: number;
+  token_rate_cache_read?: number;
+  token_rate_cache_write?: number;
+  // Both-variant keys (prod)
+  token_rate_in_prod?: number;
+  token_rate_out_prod?: number;
+  token_rate_cache_read_prod?: number;
+  token_rate_cache_write_prod?: number;
+  // Both-variant keys (canary)
+  token_rate_in_canary?: number;
+  token_rate_out_canary?: number;
+  token_rate_cache_read_canary?: number;
+  token_rate_cache_write_canary?: number;
 }

 const COLORS = {
-  input: '#06b6d4',  // cyan
-  output: '#8b5cf6', // purple
+  input:       '#06b6d4', // cyan
+  output:      '#8b5cf6', // purple
+  cache_read:  '#f59e0b', // amber
+  cache_write: '#10b981', // emerald
 };

+/**
+ * Formats a token count compactly for display.
+ *
+ * Values that round to a million or more are shown as millions with two
+ * decimals ("1.23M"), values that round to a thousand or more as thousands
+ * with one decimal ("4.5k"), and anything smaller as a whole number.
+ *
+ * @param n Token count to format.
+ * @returns The compact, human-readable string.
+ */
+function formatTokenCount(n: number): string {
+  // Thresholds sit where the next-lower format would round up past its range,
+  // so 999_950 renders as "1.00M" (not "1000.0k") and 999.5 as "1.0k" (not "1000").
+  if (n >= 999_950) return `${(n / 1_000_000).toFixed(2)}M`;
+  if (n >= 999.5) return `${(n / 1_000).toFixed(1)}k`;
+  return n.toFixed(0);
+}
+
 const CustomTooltip = ({
  active,
  payload,
@ -46,15 +60,12 @@ const CustomTooltip = ({
  payload?: Array<{ name: string; value: number; color: string }>;
  label?: string;
 }) => {
-  if (!active || !payload) {
-    return null;
-  }
-
+  if (!active || !payload) return null;
  return (
    <div className="bg-slate-800 border border-slate-600 rounded-lg p-3 shadow-lg">
      <p className="text-slate-400 text-xs mb-2">{label}</p>
-      {payload.map((entry, index) => (
-        <p key={index} className="text-sm" style={{ color: entry.color }}>
+      {payload.map((entry, i) => (
+        <p key={i} className="text-sm" style={{ color: entry.color }}>
          {entry.name}: {entry.value.toFixed(0)} tok/s
        </p>
      ))}
@ -62,73 +73,106 @@ const CustomTooltip = ({
  );
 };

+/**
+ * Total increase of a cumulative counter field over the visible window.
+ *
+ * Snapshots are grouped by their `variant`, so when the window mixes production
+ * and canary samples each variant's counter is only differenced against itself;
+ * the per-variant increases are then summed. Within a variant the increase is
+ * accumulated sample-to-sample, and a drop in the counter is treated as a reset
+ * (the post-reset value counts as new growth), so a reset in the middle of the
+ * window is not lost.
+ *
+ * @param data  Snapshots for the window; assumed to be in ascending timestamp
+ *              order (they are not re-sorted here).
+ * @param field Name of a numeric cumulative-counter field on MetricSnapshot.
+ * @returns The summed increase; 0 when no variant has at least two samples.
+ */
+function windowTotal(data: MetricSnapshot[], field: keyof MetricSnapshot): number {
+  const lastSeen = new Map<string, number>(); // previous counter value per variant
+  let total = 0;
+  for (const snapshot of data) {
+    const value = Number(snapshot[field]);
+    if (!Number.isFinite(value)) continue; // field missing or not numeric on this snapshot
+    const key = String(snapshot.variant);
+    const previous = lastSeen.get(key);
+    if (previous !== undefined) {
+      // A decrease means the counter restarted; everything since the restart is new.
+      total += value >= previous ? value - previous : value;
+    }
+    lastSeen.set(key, value);
+  }
+  return total;
+}
+
 export function TokenPanel({ data, variant, height = 180 }: TokenPanelProps) {
-  // Calculate current value from latest data
-  const currentValue = useMemo(() => {
-    if (data.length === 0) return null;
-    const latest = data[data.length - 1];
-    return latest.token_rate_in + latest.token_rate_out;
-  }, [data]);
-
-  // Transform data for chart
-  const chartData = useMemo(() => {
-    if (variant === 'both') {
-      // Group by timestamp and separate by variant
-      const grouped = new Map<number, ChartDataPoint>();
-
-      for (const snapshot of data) {
-        const existing = grouped.get(snapshot.timestamp) || {
-          timestamp: snapshot.timestamp,
-          time: new Date(snapshot.timestamp).toLocaleTimeString('en-US', {
-            hour: '2-digit',
-            minute: '2-digit',
-            hour12: false,
-          }),
-        };
-
-        if (snapshot.variant === 'production') {
-          existing.token_rate_in_prod = snapshot.token_rate_in;
-          existing.token_rate_out_prod = snapshot.token_rate_out;
-        } else {
-          existing.token_rate_in_canary = snapshot.token_rate_in;
-          existing.token_rate_out_canary = snapshot.token_rate_out;
-        }
-
-        grouped.set(snapshot.timestamp, existing);
-      }
-
-      return Array.from(grouped.values()).sort((a, b) => a.timestamp - b.timestamp);
-    } else {
-      return data.map((snapshot) => ({
-        timestamp: snapshot.timestamp,
-        time: new Date(snapshot.timestamp).toLocaleTimeString('en-US', {
-          hour: '2-digit',
-          minute: '2-digit',
-          hour12: false,
-        }),
-        token_rate_in: snapshot.token_rate_in,
-        token_rate_out: snapshot.token_rate_out,
-      }));
-    }
+  // Running totals over the visible window
+  const totals = useMemo(() => {
+    const filtered = variant === 'both' ? data : data.filter((s) => s.variant === variant);
+    return {
+      input:       windowTotal(filtered, 'tokens_input'),
+      output:      windowTotal(filtered, 'tokens_output'),
+      cache_read:  windowTotal(filtered, 'tokens_cache_read'),
+      cache_write: windowTotal(filtered, 'tokens_cache_write'),
+    };
  }, [data, variant]);

-  const yAxisFormatter = (value: number) => {
-    if (value >= 1000) {
-      return `${(value / 1000).toFixed(1)}k`;
+  // Duration label for the summary (window span in hours)
+  const windowHours = useMemo(() => {
+    if (data.length < 2) return null;
+    const spanMs = data[data.length - 1].timestamp - data[0].timestamp;
+    const hrs = spanMs / 3_600_000;
+    return hrs >= 1 ? `${hrs.toFixed(0)}h` : `${Math.round(hrs * 60)}m`;
+  }, [data]);
+
+  const chartData = useMemo<ChartDataPoint[]>(() => {
+    if (variant === 'both') {
+      const grouped = new Map<number, ChartDataPoint>();
+      for (const s of data) {
+        const pt = grouped.get(s.timestamp) ?? {
+          timestamp: s.timestamp,
+          time: new Date(s.timestamp).toLocaleTimeString('en-US', {
+            hour: '2-digit', minute: '2-digit', hour12: false,
+          }),
+        };
+        if (s.variant === 'production') {
+          pt.token_rate_in_prod = s.token_rate_in;
+          pt.token_rate_out_prod = s.token_rate_out;
+          pt.token_rate_cache_read_prod = s.token_rate_cache_read;
+          pt.token_rate_cache_write_prod = s.token_rate_cache_write;
+        } else {
+          pt.token_rate_in_canary = s.token_rate_in;
+          pt.token_rate_out_canary = s.token_rate_out;
+          pt.token_rate_cache_read_canary = s.token_rate_cache_read;
+          pt.token_rate_cache_write_canary = s.token_rate_cache_write;
+        }
+        grouped.set(s.timestamp, pt);
+      }
+      return Array.from(grouped.values()).sort((a, b) => a.timestamp - b.timestamp);
    }
-    return value.toFixed(0);
-  };
+    return data.map((s) => ({
+      timestamp: s.timestamp,
+      time: new Date(s.timestamp).toLocaleTimeString('en-US', {
+        hour: '2-digit', minute: '2-digit', hour12: false,
+      }),
+      token_rate_in:         s.token_rate_in,
+      token_rate_out:        s.token_rate_out,
+      token_rate_cache_read: s.token_rate_cache_read,
+      token_rate_cache_write: s.token_rate_cache_write,
+    }));
+  }, [data, variant]);
+
+  const yAxisFormatter = (v: number) =>
+    v >= 1000 ? `${(v / 1000).toFixed(1)}k` : v.toFixed(0);

  return (
    <div className="panel">
+      {/* Header: title + running totals summary */}
      <div className="flex items-start justify-between mb-2">
        <h3 className="panel-header">Token Throughput</h3>
-        {currentValue !== null && (
-          <div className="text-right">
-            <div className="panel-value">{formatRate(currentValue)}/s</div>
-          </div>
+        {windowHours && (
+          <span className="text-xs text-slate-500">{windowHours} window</span>
        )}
      </div>
+
+      {/* Running totals grid */}
+      <div className="grid grid-cols-4 gap-1 mb-3 text-center">
+        <div>
+          <div className="text-xs font-medium" style={{ color: COLORS.input }}>Input</div>
+          <div className="font-mono text-xs text-slate-200">{formatTokenCount(totals.input)}</div>
+        </div>
+        <div>
+          <div className="text-xs font-medium" style={{ color: COLORS.output }}>Output</div>
+          <div className="font-mono text-xs text-slate-200">{formatTokenCount(totals.output)}</div>
+        </div>
+        <div>
+          <div className="text-xs font-medium" style={{ color: COLORS.cache_read }}>Cache↑</div>
+          <div className="font-mono text-xs text-slate-200">{formatTokenCount(totals.cache_read)}</div>
+        </div>
+        <div>
+          <div className="text-xs font-medium" style={{ color: COLORS.cache_write }}>Cache↓</div>
+          <div className="font-mono text-xs text-slate-200">{formatTokenCount(totals.cache_write)}</div>
+        </div>
+      </div>
+
+      {/* Rate time series */}
      <ResponsiveContainer width="100%" height={height}>
        <LineChart data={chartData} margin={{ top: 5, right: 20, left: 0, bottom: 5 }}>
          <CartesianGrid strokeDasharray="3 3" stroke="#334155" />
@ -156,65 +200,21 @@ export function TokenPanel({ data, variant, height = 180 }: TokenPanelProps) {

          {variant === 'both' ? (
            <>
-              {/* Production - solid lines */}
-              <Line
-                type="monotone"
-                dataKey="token_rate_in_prod"
-                name="Input (prod)"
-                stroke={COLORS.input}
-                strokeWidth={2}
-                dot={false}
-                activeDot={{ r: 4 }}
-              />
-              <Line
-                type="monotone"
-                dataKey="token_rate_out_prod"
-                name="Output (prod)"
-                stroke={COLORS.output}
-                strokeWidth={2}
-                dot={false}
-                activeDot={{ r: 4 }}
-              />
-              {/* Canary - dashed lines */}
-              <Line
-                type="monotone"
-                dataKey="token_rate_in_canary"
-                name="Input (canary)"
-                stroke={COLORS.input}
-                strokeWidth={2}
-                strokeDasharray="5 5"
-                dot={false}
-              />
-              <Line
-                type="monotone"
-                dataKey="token_rate_out_canary"
-                name="Output (canary)"
-                stroke={COLORS.output}
-                strokeWidth={2}
-                strokeDasharray="5 5"
-                dot={false}
-              />
+              <Line type="monotone" dataKey="token_rate_in_prod"         name="In (prod)"         stroke={COLORS.input}       strokeWidth={2} dot={false} activeDot={{ r: 3 }} />
+              <Line type="monotone" dataKey="token_rate_out_prod"        name="Out (prod)"        stroke={COLORS.output}      strokeWidth={2} dot={false} activeDot={{ r: 3 }} />
+              <Line type="monotone" dataKey="token_rate_cache_read_prod" name="Cache↑ (prod)"     stroke={COLORS.cache_read}  strokeWidth={1} dot={false} strokeDasharray="4 2" />
+              <Line type="monotone" dataKey="token_rate_cache_write_prod" name="Cache↓ (prod)"    stroke={COLORS.cache_write} strokeWidth={1} dot={false} strokeDasharray="4 2" />
+              <Line type="monotone" dataKey="token_rate_in_canary"       name="In (canary)"       stroke={COLORS.input}       strokeWidth={2} dot={false} strokeDasharray="5 5" />
+              <Line type="monotone" dataKey="token_rate_out_canary"      name="Out (canary)"      stroke={COLORS.output}      strokeWidth={2} dot={false} strokeDasharray="5 5" />
+              <Line type="monotone" dataKey="token_rate_cache_read_canary" name="Cache↑ (canary)" stroke={COLORS.cache_read}  strokeWidth={1} dot={false} strokeDasharray="2 4" />
+              <Line type="monotone" dataKey="token_rate_cache_write_canary" name="Cache↓ (canary)" stroke={COLORS.cache_write} strokeWidth={1} dot={false} strokeDasharray="2 4" />
            </>
          ) : (
            <>
-              <Line
-                type="monotone"
-                dataKey="token_rate_in"
-                name="Input"
-                stroke={COLORS.input}
-                strokeWidth={2}
-                dot={false}
-                activeDot={{ r: 4 }}
-              />
-              <Line
-                type="monotone"
-                dataKey="token_rate_out"
-                name="Output"
-                stroke={COLORS.output}
-                strokeWidth={2}
-                dot={false}
-                activeDot={{ r: 4 }}
-              />
+              <Line type="monotone" dataKey="token_rate_in"         name="Input"      stroke={COLORS.input}       strokeWidth={2} dot={false} activeDot={{ r: 4 }} />
+              <Line type="monotone" dataKey="token_rate_out"        name="Output"     stroke={COLORS.output}      strokeWidth={2} dot={false} activeDot={{ r: 4 }} />
+              <Line type="monotone" dataKey="token_rate_cache_read" name="Cache Read"  stroke={COLORS.cache_read}  strokeWidth={1} dot={false} strokeDasharray="4 2" />
+              <Line type="monotone" dataKey="token_rate_cache_write" name="Cache Write" stroke={COLORS.cache_write} strokeWidth={1} dot={false} strokeDasharray="4 2" />
            </>
          )}
        </LineChart>
--- a/dashboard/frontend/src/lib/types.ts
+++ b/dashboard/frontend/src/lib/types.ts
@ -11,6 +11,8 @@ export interface MetricSnapshot {
  requests_5xx: number;
  tokens_input: number;
  tokens_output: number;
+  tokens_cache_read: number;
+  tokens_cache_write: number;
  concurrent_requests: number;
  max_workers: number;
  rate_limit_rps: number;
@ -26,6 +28,8 @@ export interface MetricSnapshot {
  response_size_avg: number;
  token_rate_in: number; // tokens/s
  token_rate_out: number;
+  token_rate_cache_read: number;
+  token_rate_cache_write: number;
  req_rate: number; // requests/s
  error_rate_pct: number; // percentage
  worker_utilization: number; // ratio 0-1
--- a/dashboard/model/metrics.go
+++ b/dashboard/model/metrics.go
@ -16,6 +16,8 @@ type MetricSnapshot struct {
 	Requests5xx           float64 `json:"requests_5xx"`           // Total 5xx requests
 	TokensInput           float64 `json:"tokens_input"`           // Total input tokens
 	TokensOutput          float64 `json:"tokens_output"`          // Total output tokens
+	TokensCacheRead       float64 `json:"tokens_cache_read"`      // Total cache-read tokens
+	TokensCacheWrite      float64 `json:"tokens_cache_write"`     // Total cache-write tokens
 	ConcurrentRequests    float64 `json:"concurrent_requests"`    // Current concurrent requests
 	MaxWorkers            float64 `json:"max_workers"`            // Maximum workers
 	RateLimitRps          float64 `json:"rate_limit_rps"`         // Current rate limit (req/s)
@ -29,8 +31,10 @@ type MetricSnapshot struct {
 	LatencyP99            float64 `json:"latency_p99"`            // Request latency p99 (ms)
 	RequestSizeAvg        float64 `json:"request_size_avg"`       // Average request size (bytes)
 	ResponseSizeAvg       float64 `json:"response_size_avg"`      // Average response size (bytes)
-	TokenRateIn           float64 `json:"token_rate_in"`          // Input token rate (tokens/s)
-	TokenRateOut          float64 `json:"token_rate_out"`         // Output token rate (tokens/s)
+	TokenRateIn           float64 `json:"token_rate_in"`           // Input token rate (tokens/s)
+	TokenRateOut          float64 `json:"token_rate_out"`          // Output token rate (tokens/s)
+	TokenRateCacheRead    float64 `json:"token_rate_cache_read"`   // Cache-read token rate (tokens/s)
+	TokenRateCacheWrite   float64 `json:"token_rate_cache_write"`  // Cache-write token rate (tokens/s)
 	ReqRate               float64 `json:"req_rate"`               // Request rate (req/s)
 	ErrorRatePct          float64 `json:"error_rate_pct"`         // Error rate percentage
 	WorkerUtilization     float64 `json:"worker_utilization"`     // Worker utilization ratio (0-1)
--- a/docs/plan/plan.md
+++ b/docs/plan/plan.md
@ -1,26 +1,52 @@
 # ZAI Proxy Ecosystem — Plan

+**Last updated:** 2026-05-16
+**Version:** proxy/1.10.0, dashboard/1.0.0
+
 ## Objective

-Provide a stable, observable endpoint for LLM agents to access the Z.AI API without exposing the Z.AI API key as an environment variable or in any other plaintext form accessible to the calling process. The proxy is the sole keeper of the credential; agents authenticate via a shared secret (proxy API key) that carries no Z.AI billing rights on its own.
+Provide a stable, observable endpoint for LLM agents to access the Z.AI API without
+exposing the Z.AI API key to calling processes. The proxy is the sole keeper of the
+credential; agents reach it via cluster-internal DNS — isolation is enforced at the
+network layer, not via per-agent authentication.
+
+## Security Model
+
+| Threat | Mitigation |
+|--------|------------|
+| Agent exfiltrates Z.AI key | Key never leaves proxy pod; agents reach the proxy only via cluster-internal DNS (not public); key is not in agent env, logs, or metrics |
+| Network path to proxy compromised | Proxy is not reachable outside the cluster except via Tailscale ingress; no public IP |
+| Log scraping leaks key | Z.AI key is never logged; incoming Authorization header is overwritten before forwarding, never echoed |
+| Metric label leakage | No credential values in metric labels |
+| Runaway agent burns quota | Global adaptive rate limiter + 429 backoff + `MAX_WORKERS` concurrency cap |
+| Z.AI quota exhaustion | 429 counter triggers alerts before quota fully consumed |
+| Malformed upstream response | Proxy validates response body before committing; retries on empty/truncated JSON |
+
+**What the proxy does NOT do:**
+
+- Validate per-agent credentials (no proxy-key authentication). Any pod that can reach the
+  proxy via cluster DNS is treated as authorized. Access control is the cluster's responsibility.
+- Cache or store responses.
+- Load-balance across multiple Z.AI accounts.

 ## Architecture

 ```
 LLM Agent (Claude Code, NEEDLE worker, etc.)
    │
-    │  POST /v1/chat/completions
-    │  Authorization: Bearer <proxy-key>   ← agent's credential (not the Z.AI key)
+    │  POST /v1/messages  (or any path)
+    │  Authorization: Bearer <any-value>     ← overwritten; not validated
    ▼
 ┌─────────────────────────────────────────────────────┐
 │                    zai-proxy                        │
 │                                                     │
-│  • Validates proxy-key                              │
-│  • Rewrites Authorization → Bearer <zai-api-key>   │
-│  • Rate-limits (token bucket per key)               │
-│  • Counts tokens (request + response)               │
+│  • Overwrites Authorization → Bearer <zai-api-key>  │
+│  • Enforces concurrency cap (MAX_WORKERS)           │
+│  • Global adaptive AIMD rate limiter                │
+│  • Counts tokens (tiktoken / API-reported)          │
+│  • Validates response body; retries on truncation   │
 │  • Records metrics (Prometheus)                     │
-│  • Translates request/response format if needed     │
+│  • TranslateRequest: no-op (Z.AI is Claude-native)  │
 │                                                     │
 └──────────────────┬──────────────────────────────────┘
                   │  HTTPS
@ -28,7 +54,9 @@ LLM Agent (Claude Code, NEEDLE worker, etc.)
           api.z.ai  (Z.AI upstream)
 ```

-The Z.AI API key lives **only** as a Kubernetes Secret (sealed-secrets encrypted at rest, injected as an env var into the proxy pod only). No agent process, worker, or tool ever sees the upstream key.
+The Z.AI API key lives **only** as a Kubernetes Secret (sealed-secrets encrypted at rest,
+injected as an env var into the proxy pod only). No agent process, worker, or tool ever
+sees the upstream key.

 ## Components

@ -36,81 +64,278 @@ The Z.AI API key lives **only** as a Kubernetes Secret (sealed-secrets encrypted

 The core component. Handles:

- **Credential isolation:** accepts `Authorization: Bearer <proxy-key>`, injects the real Z.AI key upstream. Proxy keys are hashed and stored in config; compromise of a proxy key cannot be used to bill or enumerate usage independently.
- **Token counting:** both request and response token counts via tiktoken (for OpenAI-compat models) and GLM tokenizer (for GLM series). Token counts feed the metrics pipeline.
- **Rate limiting:** configurable token-bucket per proxy key. Prevents a runaway agent from exhausting the Z.AI quota. Returns 429 when the bucket is empty.
- **Prometheus metrics:** exposes `/metrics` with request counts, latency histograms, token usage, error rates, and rate-limit hit counts.
- **Request/response translation:** normalises differences between the OpenAI wire format and Z.AI's dialect so agents using standard OpenAI client libraries work without modification.
- **Canary support:** runs two deployment variants (production + canary) simultaneously; traffic split is controlled by the Kubernetes service config, not the proxy itself.
+- **Credential injection:** overwrites the incoming `Authorization` header with
+  `Bearer <ZAI_API_KEY>`. No incoming credential is validated — access is controlled
+  entirely by network policy (cluster-internal DNS + Tailscale boundary).
+
+- **Concurrency cap:** `MAX_WORKERS` (default 10) bounds the number of in-flight
+  requests. Requests beyond the cap receive 503 immediately.
+
+- **Global adaptive rate limiter (AIMD/EWMA):**
+  A single token-bucket limiter serves all traffic. Every 30-second window it inspects
+  the 429 rate from the upstream and adjusts:
+  - If 429-rate > 5 %: updates the estimated ceiling via EWMA
+    (`alpha = 0.3` by default), then drops to `ceiling × (1 − hold_margin)`.
+  - If 429-rate < 1 %: converges toward the hold position in 50 % steps per window;
+    after `probe_interval` clean windows, probes above the ceiling to detect upward shifts.
+  - Rate is bounded by `[RATE_LIMIT_MIN, RATE_LIMIT_MAX]` (defaults: 1–50 req/s).
+  - Parameters tunable via env: `RATE_LIMIT_CEILING_ALPHA`, `RATE_LIMIT_HOLD_MARGIN`,
+    `RATE_LIMIT_PROBE_INTERVAL`.
+  - Reset endpoint: `POST /admin/reset-rate-limit` resets to initial rate (unauthenticated).
+
+- **Retry logic:** on network error, 429, or truncated/empty response body, the proxy
+  retries up to `MAX_RETRIES` times (default 3) with exponential backoff (1 s, 2 s, 4 s).
+  If a 429 carries `Retry-After`, that delay is honoured before the next attempt.
+
+- **Response validation:**
+  - Non-streaming: reads the full body before committing; retries if empty or invalid JSON.
+  - Streaming: peeks the first 4 KiB; retries if the stream opens with zero bytes.
+  - 422 responses are not retried — they indicate a structural request problem.
+    Full request/response bodies are logged for diagnosis.
+
+- **Token counting:** prefers API-reported usage from the response body
+  (`usage.input_tokens`, `usage.output_tokens`, `usage.cache_read_input_tokens`,
+  `usage.cache_creation_input_tokens`). Falls back to tiktoken cl100k_base local counting
+  if the response carries no usage block; further falls back to `SimpleTokenCounter` if
+  tiktoken fails to initialise. Enabled via `TOKEN_COUNTING_ENABLED` (default `true`).
+
+- **Request translation:** `TranslateRequest` is a documented **no-op**. Z.AI natively
+  accepts the Anthropic Claude wire format (including `thinking`, `cache_control`,
+  `system` arrays). Prior field-stripping translations caused 422 errors and were removed.
+
+- **Prometheus metrics:** exposes `/metrics` with request counts, latency histograms,
+  token usage by direction and pricing tier, rate-limiter state, retry counts,
+  and build info.
+
+- **Deployment variants:** `DEPLOYMENT_VARIANT` env distinguishes metric streams from
+  production and canary pods. All Prometheus metrics carry a `variant` label.
+
+- **Canary support:** two Deployments share the `devpod` namespace. The canary
+  (`zai-proxy-v2`) currently carries all production traffic (original `zai-proxy`
+  Deployment is scaled to 0). A `zai-proxy-canary` Service enables weighted traffic
+  splits for testing new versions.

 ### dashboard/ — Metrics Dashboard (Go + React)

-The observability layer. Scrapes the proxy's Prometheus endpoint, persists aggregated data in SQLite, and serves a live React frontend via SSE.
+The observability layer. Three subsystems work together:

-Panels:
- Request rate (req/s)
- Token throughput (tokens/s, split by direction)
- Latency (p50/p95/p99)
- Error rate (4xx, 5xx, 429 broken out separately)
- Rate-limit hit rate
- Concurrency (in-flight requests)
+```
+zai-proxy /metrics
+      │
+      │  HTTP scrape every 5 s (per SCRAPE_TARGETS)
+      ▼
+┌──────────────────────────────────────────────┐
+│  Collector (goroutine per target)            │
+│  • Parses Prometheus text format             │
+│  • Computes per-interval rates (req/s etc.)  │
+│  • Infers variant from target URL            │
+│    ("test"/"canary" → canary, else prod)     │
+│  • Handles counter resets                    │
+└──────────┬───────────────────────────────────┘
+           │ MetricSnapshot channel
+    ┌──────┴──────┐
+    ▼             ▼
+┌────────┐   ┌─────────────────────────────────┐
+│Storage │   │  SSE Hub (broadcast to clients) │
+│        │   │  • "connected" event on join     │
+│5s/24h  │   │    (scrape_interval, variants)   │
+│1m/7d   │   │  • 30 s keepalive heartbeat      │
+│SQLite  │   │  • Drops slow consumers          │
+│WAL     │   └─────────────────────────────────┘
+└────────┘
+      │
+      ▼
+REST API
+  GET /api/events              SSE stream (live)
+  GET /api/metrics?range=&variant=  Historical snapshots
+  GET /api/status              Latest snapshot per variant
+  GET /api/config              Scrape interval + targets
+  GET /healthz                 Health check
+```

-## Telemetry & Error Tracking
+**Storage schema (SQLite, WAL mode):**
+
+| Table | Resolution | Retention |
+|-------|-----------|-----------|
+| `metrics_5s` | 5 s | 24 h |
+| `metrics_1m` | 1 min averages | 7 d |
+
+`QueryRange` automatically selects the table: `metrics_5s` for ranges ≤ 1 h,
+`metrics_1m` for longer ranges. Downsampling runs every 10 minutes. Retention
+purge runs every 10 minutes.
+
+> **Note:** The deployment uses `emptyDir` for `/data` — dashboard history is
+> lost on pod restart. A PVC is commented out in the manifest for future use.
+
+**REST API parameters:**
+
+- `GET /api/metrics?range={5m,15m,1h,6h,24h,7d}&variant={production,canary,all}`
+- Returns a JSON array of `MetricSnapshot` objects
+
+**Snapshot fields computed by collector:**
+
+| Field | Description |
+|-------|-------------|
+| `req_rate` | Requests per second (counter rate over interval) |
+| `token_rate_in/out` | Input/output tokens per second |
+| `token_rate_cache_read/cache_write` | Cache-read/cache-write tokens per second |
+| `error_rate_pct` | `5xx / total * 100` |
+| `latency_p50/p95/p99` | Histogram quantiles (ms) |
+| `request_size_avg` / `response_size_avg` | Histogram mean (bytes) |
+| `status_code_rates` | Per-status-code req/s map |
+| `rate_limit_rps` | Current limiter rate |
+| `rate_limit_adj_increase/decrease` | AIMD adjustment counters |
+| `worker_utilization` | `concurrent / max_workers` |
+
+**Frontend (React/Vite/Tailwind, embedded in binary via `//go:embed`):**
+
+Six panels in a 2×3 responsive grid, each wrapped in an error boundary:
+
+| Panel | What it shows |
+|-------|---------------|
+| Request Rate | req/s time series |
+| Latency | p50 / p95 / p99 (ms) time series |
+| Tokens | Input, output, cache-read and cache-write token rates (tokens/s), plus running totals over the visible window |
+| Concurrency | In-flight requests vs MAX_WORKERS |
+| Rate Limiter | Current rate, AIMD adjustments, rejections |
+| Errors | Error rate %, upstream errors by type |
+
+Global controls:
+- **Variant toggle:** Production / Canary / Both — filters all panels
+- **Time range selector:** 5 m / 15 m / 1 h / 6 h / 24 h
+- **Theme toggle:** Dark / Light
+- **Status bar:** connection state, req/s, p50, token rate, error %, workers; stale-data indicators per variant
+- **Loading skeleton:** shown until first SSE data arrives
+- **Auto-reconnect:** exponential backoff with countdown timer + manual reconnect button
+- **History backfill:** on connect, fetches REST history for the current time range before live SSE data arrives
+
+**Dashboard environment variables:**
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `SCRAPE_TARGETS` | `http://zai-proxy.mcp.svc.cluster.local:8080/metrics` | Comma-separated scrape URLs |
+| `SCRAPE_INTERVAL` | `5s` | How often to scrape |
+| `SCRAPE_TIMEOUT` | `3s` | Per-scrape HTTP timeout |
+| `LISTEN_ADDR` | `:8080` | Dashboard listen address |
+| `DB_PATH` | `/data/dashboard.db` | SQLite file path |
+| `RETENTION_5S` | `24h` | High-resolution data retention |
+| `RETENTION_1M` | `168h` (7d) | Downsampled data retention |
+
+> The default `SCRAPE_TARGETS` hardcodes the `mcp` namespace. In deployments where
+> the proxy runs in a different namespace (e.g., `devpod`), override via env.
+
+### Grafana — Prometheus Dashboard (separate from the React dashboard)
+
+A Grafana dashboard ConfigMap lives at
+`k8s/ardenone-cluster/monitoring/grafana-dashboard-zai-proxy.yml` and queries
+Prometheus directly. Panels:
+
+| Panel | Query |
+|-------|-------|
+| Total Requests (1h) | `increase(zai_proxy_requests_total[1h])` |
+| Error Rate | `rate(4xx+5xx) / rate(total)` |
+| 429 Errors (1h) | `increase(requests_total{status_code="429"}[1h])` |
+| Response Time p90 | `histogram_quantile(0.90, ...)` |
+| Worker Utilization | `sum(zai_proxy_worker_utilization_ratio)` |
+| Rate Limit (current) | `zai_proxy_rate_limit_requests_per_second` |
+| Concurrent Requests | `sum(zai_proxy_concurrent_requests)` |
+| Success Rate | `rate(2xx) / rate(total)` |
+| Request Rate by Status | by `status_code` label |
+| Concurrent vs Max Workers | concurrent + max_workers overlay |
+| Duration Percentiles | p50 / p90 / p99 |
+| Request/Response Size p90 | histogram_quantile on size histograms |
+| Upstream Errors | by `error_type` label |
+| Rate Limit Behavior | retries by reason + adjustments by direction |
+| Token panels | total / input / output `increase(...[1h])` |
+
+## Telemetry & Metrics

 ### Token counting

-Every request and response passes through the token counter before forwarding/returning. The proxy records:
+The proxy records token usage after every request. API-reported counts are preferred;
+tiktoken is the fallback.

 | Metric | Labels |
 |--------|--------|
-| `zai_proxy_tokens_total` | `direction=request\|response`, `model`, `key_id` |
-| `zai_proxy_request_duration_seconds` | `model`, `status_code`, `key_id` |
-| `zai_proxy_requests_total` | `model`, `status_code`, `key_id` |
+| `zai_proxy_tokens_total` | `direction={input,output,cache_read,cache_write}`, `model`, `variant`, `pricing_tier={peak,off_peak}` |
+| `zai_proxy_request_duration_seconds` | `method`, `path`, `status_code`, `variant` |
+| `zai_proxy_requests_total` | `method`, `path`, `status_code`, `variant` |
+| `zai_proxy_request_size_bytes` | `method`, `path`, `variant` |
+| `zai_proxy_response_size_bytes` | `method`, `path`, `status_code`, `variant` |
+| `zai_proxy_concurrent_requests` | `variant` |
+| `zai_proxy_max_workers` | `variant` |
+| `zai_proxy_worker_utilization_ratio` | `variant` |
+| `zai_proxy_token_count_duration_seconds` | `variant` |
+| `zai_proxy_token_rate_seconds` | `direction`, `model`, `variant` |
+| `zai_proxy_token_rate` | `direction`, `model`, `variant` |
+| `zai_proxy_build_info` | `version`, `variant`, `commit`, `build_time` |

-Token counts are also written to the response `X-Tokens-Used` header so the calling agent can track its own consumption without querying the dashboard.
+**Pricing tier:** `GetPricingTier()` returns `peak` between 02:00 and 06:00 ET (the Z.AI 2×
+pricing window) and `off_peak` otherwise. The tier is applied to all `tokensTotal` observations.

-### Error rate tracking
+**Token header:** the input token count is also set in the `X-Token-Input` response header so
+agents can track their own consumption without querying the dashboard.

-Upstream errors (4xx/5xx from Z.AI) are classified and exposed as:
+### Rate-limiter metrics

-| Metric | Description |
-|--------|-------------|
-| `zai_proxy_upstream_errors_total{code="429"}` | Rate-limit responses from Z.AI — indicates quota pressure |
-| `zai_proxy_upstream_errors_total{code="5xx"}` | Z.AI server errors |
-| `zai_proxy_upstream_errors_total{code="4xx"}` | Malformed requests, auth failures |
-| `zai_proxy_rate_limited_total` | Requests dropped by the proxy's own rate limiter (before hitting Z.AI) |
+| Metric | Labels | Description |
+|--------|--------|-------------|
+| `zai_proxy_rate_limit_requests_per_second` | `variant` | Current limiter rate |
+| `zai_proxy_rate_limit_wait_seconds` | `variant` | Time waiting in the limiter |
+| `zai_proxy_rate_limit_adjustments_total` | `direction={increase,decrease,probe}`, `variant` | Algorithm decisions |
+| `zai_proxy_rate_limit_rejections_total` | `variant` | Requests rejected (capacity) |
+| `zai_proxy_retry_attempts_total` | `reason={retry,network_error,429,truncated_response,empty_streaming}`, `variant` | Retry causes |
+| `zai_proxy_upstream_errors_total` | `error_type={422,429,truncated_response,empty_streaming,upstream_connection,write_error,read_error,request_creation}`, `variant` | Error taxonomy |

-429s from Z.AI are given special treatment: the proxy applies automatic back-off and surfaces a `Retry-After` header to the agent, giving agents a signal to pause rather than spin.
+### Error classification
+
+| Upstream condition | Proxy action |
+|-------------------|--------------|
+| 429 + Retry-After | Wait for the `Retry-After` delay, then retry (up to MAX_RETRIES) |
+| 429 no header | Exponential backoff retry |
+| 422 | Log bodies, no retry, return 422 to client |
+| Empty/invalid JSON body (2xx) | Retry; 502 after MAX_RETRIES |
+| Empty streaming response | Retry; 502 after MAX_RETRIES |
+| Network error | Retry; 502 after MAX_RETRIES |
+| Other 4xx/5xx | Pass through; no retry |

 ### Dashboard alerting targets (future)

- 429 rate from Z.AI > 5% of requests over 5m → alert (quota approaching)
- Proxy-side 429s > 10% → alert (agent is over rate limit)
- p95 latency > 10s → alert (upstream degradation)
- Error rate > 2% → alert
+- 429 rate from Z.AI > 5 % over 5 m → alert (quota pressure)
+- p95 latency > 10 s → alert (upstream degradation)
+- Error rate > 2 % → alert

-## Security Model
+## Environment Variables

-| Threat | Mitigation |
-|--------|------------|
-| Agent exfiltrates Z.AI key | Key never leaves proxy pod; not in agent env, not in logs, not in metrics |
-| Proxy key compromise | Proxy key has no Z.AI billing rights; can be rotated without touching Z.AI |
-| Log scraping | Z.AI key is never logged; proxy key is masked in access logs |
-| Metric label leakage | `key_id` label is a hash, not the raw proxy key |
-| Runaway agent burns quota | Per-key rate limiter + 429 back-off |
-| Z.AI quota exhaustion | 429 counter triggers alerts before quota is fully consumed |
+See [`docs/notes/ENVIRONMENT_VARIABLES.md`](../notes/ENVIRONMENT_VARIABLES.md) for the full
+reference. Key variables:
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `ZAI_API_KEY` | required | Upstream Z.AI API key |
+| `DEPLOYMENT_VARIANT` | `production` | Metric stream tag |
+| `MAX_WORKERS` | `10` | Concurrency cap |
+| `TOKEN_COUNTING_ENABLED` | `true` | Enable/disable token counting |
+| `TOKENIZER_MODEL` | `glm-4` | Model label for token metrics |
+| `RATE_LIMIT_INITIAL` | `10.0` | Starting rate (req/s) |
+| `RATE_LIMIT_MIN` | `1.0` | Floor rate |
+| `RATE_LIMIT_MAX` | `50.0` | Ceiling cap |
+| `RATE_LIMIT_CEILING_ALPHA` | `0.3` | EWMA smoothing factor |
+| `RATE_LIMIT_HOLD_MARGIN` | `0.02` | Fraction to hold below the estimated ceiling (0.02 = 2 %) |
+| `RATE_LIMIT_PROBE_INTERVAL` | `10` | Probe above ceiling every N clean windows |
+| `MAX_RETRIES` | `3` | Max retry attempts |
+| `ZAI_TARGET_URL` | `https://api.z.ai/api/anthropic` | Upstream URL |

 ## Repository Layout

 ```
 zai-proxy/                          (git.ardenone.com/jedarden/zai-proxy)
 ├── proxy/                          Go module: git.ardenone.com/jedarden/zai-proxy
-│   ├── main.go                     HTTP server, routing, auth middleware
-│   ├── translator.go               Request/response format translation
-│   ├── bodyparser.go               Body parsing, streaming support
-│   ├── tokenizer.go                Token counting (tiktoken + GLM)
-│   ├── metrics.go                  Prometheus instrumentation
-│   ├── evaluation/                 Offline eval harness
+│   ├── main.go                     HTTP server, routing, rate limiter, retry logic
+│   ├── translator.go               No-op (Z.AI natively speaks the Claude wire format)
+│   ├── bodyparser.go               Body parsing, streaming capture, usage injection
+│   ├── tokenizer.go                Token counting (tiktoken cl100k_base + GLM fallback)
+│   ├── metrics.go                  Prometheus instrumentation + pricing tier logic
+│   ├── evaluation/                 Offline eval harness (token count accuracy vs Anthropic API)
 │   ├── cmd/evaluate/               CLI for batch evaluation
 │   ├── cmd/demo-eval/              Demo evaluation runner
 │   ├── scripts/                    Load test, canary integration, benchmarks
@ -139,7 +364,8 @@ Build templates live in `jedarden/declarative-config → k8s/iad-ci/argo-workflo
 | `zai-proxy-build` | `proxy/` | `ronaldraygun/zai-proxy:{VERSION}` |
 | `zai-proxy-dashboard-build` | `dashboard/` | `ronaldraygun/zai-proxy-dashboard:{VERSION}` |

-Both templates clone from the public `git.ardenone.com/jedarden/zai-proxy` repo (no auth required). Versions are read from `proxy/VERSION` and `dashboard/VERSION` respectively.
+Both templates clone from `git.ardenone.com/jedarden/zai-proxy` (no auth required).
+Versions are read from `proxy/VERSION` and `dashboard/VERSION` respectively.

 Triggering a build:
 ```bash
@ -157,16 +383,42 @@ EOF

 ## Deployment

-Both components deploy to the `devpod` namespace on `ardenone-cluster` via ArgoCD from `jedarden/declarative-config`.
+Both components deploy to the `devpod` namespace on `ardenone-cluster` via ArgoCD from
+`jedarden/declarative-config`.

 Key manifests:
- `k8s/ardenone-cluster/devpod/zai-proxy.yml` — production Deployment + Service
- `k8s/ardenone-cluster/devpod/zai-proxy-v2.yml` — canary Deployment
+- `k8s/ardenone-cluster/devpod/zai-proxy.yml` — original Deployment (currently replicas=0)
+- `k8s/ardenone-cluster/devpod/zai-proxy-v2.yml` — active production Deployment
 - `k8s/ardenone-cluster/devpod/zai-proxy-canary-deployment.yml` — canary config
+- `k8s/ardenone-cluster/devpod/zai-proxy-canary-service.yml` — weighted traffic split
 - `k8s/ardenone-cluster/devpod/zai-proxy-tailscale.yml` — Tailscale ingress
- `k8s/ardenone-cluster/devpod/zai-api-key.sealedsecret.yml` — encrypted Z.AI API key
+- `k8s/ardenone-cluster/devpod/zai-proxy-servicemonitor.yml` — Prometheus scrape target
+- `k8s/ardenone-cluster/monitoring/grafana-dashboard-zai-proxy.yml` — Grafana dashboard

-The Z.AI API key flows: OpenBao → ESO ExternalSecret → K8s Secret → proxy pod env (read once at startup, never written to any metric, log, or response).
+The Z.AI API key flows: OpenBao → ESO ExternalSecret → K8s Secret → proxy pod env
+(read once at startup; never written to any metric, log, or response).
+
+Workers reach the proxy via cluster-internal DNS:
+- Production: `http://zai-proxy.devpod.svc.cluster.local:8080/api/anthropic`
+- Canary: `http://zai-proxy-test.devpod.svc.cluster.local:8080/api/anthropic`
+
+## Operations
+
+| Document | What it covers |
+|----------|----------------|
+| `docs/notes/ENVIRONMENT_VARIABLES.md` | Full env var reference |
+| `docs/notes/DEPLOYMENT.md` | Production/canary dual-deploy workflow |
+| `docs/notes/CANARY_PROMOTION_PROCEDURE.md` | Step-by-step canary promotion |
+| `docs/notes/CANARY_PROMOTION_CHECKLIST.md` | Go/no-go checklist |
+| `docs/notes/CANARY_ROLLBACK_PROCEDURE.md` | Rollback triggers and steps |
+| `docs/notes/CANARY_TROUBLESHOOTING_GUIDE.md` | Common canary issues |
+| `docs/notes/REGRESSION_TESTING.md` | Regression test suite overview |
+| `docs/notes/REGRESSION_TEST_GUIDE.md` | Running regression tests |
+| `docs/notes/TOKEN_COUNTING.md` | Token counting design and validation |
+| `docs/notes/TOKENIZER_CONFIGURATION.md` | Tokenizer tuning |
+| `docs/notes/MONITORING_SETUP.md` | Grafana + Prometheus setup |
+| `docs/notes/zai-proxy-rate-limiting.md` | Adaptive rate limiter deep-dive |
+| `docs/notes/TROUBLESHOOTING.md` | General troubleshooting |

 ## Migration Status