From dee82a76a3148739a1cf6200961e83bd2826474b Mon Sep 17 00:00:00 2001 From: jedarden Date: Sat, 16 May 2026 16:03:50 -0400 Subject: [PATCH] chore: update module paths and add evaluation package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - proxy/go.mod: github.com/ardenone/zai-proxy → git.ardenone.com/jedarden/zai-proxy - dashboard/go.mod: github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard → git.ardenone.com/jedarden/zai-proxy/dashboard - Update all Go import paths in proxy/ and dashboard/ to match new module paths - Add proxy/evaluation/ package (was missing from initial commit) - Add docs/plan/plan.md with architecture, security model, telemetry design, and migration checklist Co-Authored-By: Claude Sonnet 4.6 --- dashboard/api/middleware.go | 2 +- dashboard/api/router.go | 4 +- dashboard/api/sse.go | 4 +- dashboard/api/sse_test.go | 2 +- dashboard/collector/collector.go | 2 +- dashboard/collector/parser.go | 2 +- dashboard/collector/parser_test.go | 2 +- dashboard/go.mod | 2 +- dashboard/main.go | 8 +- dashboard/main_test.go | 6 +- dashboard/storage/storage.go | 2 +- dashboard/storage/storage_test.go | 2 +- docs/plan/plan.md | 179 +++++++++ proxy/cmd/demo-eval/main.go | 2 +- proxy/cmd/evaluate/main.go | 2 +- proxy/evaluation/.env.example | 9 + proxy/evaluation/.gitignore | 26 ++ proxy/evaluation/EXAMPLE_USAGE.md | 243 ++++++++++++ proxy/evaluation/README.md | 206 ++++++++++ proxy/evaluation/evaluator.go | 347 ++++++++++++++++ proxy/evaluation/evaluator_test.go | 507 ++++++++++++++++++++++++ proxy/evaluation/pyproject.toml | 26 ++ proxy/evaluation/report.go | 500 +++++++++++++++++++++++ proxy/evaluation/requirements.txt | 25 ++ proxy/evaluation/run_evaluation.py | 325 +++++++++++++++ proxy/evaluation/test_cases.go | 166 ++++++++ proxy/evaluation/zai_eval/__init__.py | 6 + proxy/evaluation/zai_eval/cli.py | 222 +++++++++++ proxy/evaluation/zai_eval/client.py | 211 ++++++++++ 
proxy/evaluation/zai_eval/metrics.py | 168 ++++++++ proxy/evaluation/zai_eval/models.py | 199 ++++++++++ proxy/evaluation/zai_eval/report.py | 312 +++++++++++++++ proxy/evaluation/zai_eval/test_cases.py | 213 ++++++++++ proxy/go.mod | 2 +- 34 files changed, 3912 insertions(+), 22 deletions(-) create mode 100644 docs/plan/plan.md create mode 100644 proxy/evaluation/.env.example create mode 100644 proxy/evaluation/.gitignore create mode 100644 proxy/evaluation/EXAMPLE_USAGE.md create mode 100644 proxy/evaluation/README.md create mode 100644 proxy/evaluation/evaluator.go create mode 100644 proxy/evaluation/evaluator_test.go create mode 100644 proxy/evaluation/pyproject.toml create mode 100644 proxy/evaluation/report.go create mode 100644 proxy/evaluation/requirements.txt create mode 100755 proxy/evaluation/run_evaluation.py create mode 100644 proxy/evaluation/test_cases.go create mode 100644 proxy/evaluation/zai_eval/__init__.py create mode 100644 proxy/evaluation/zai_eval/cli.py create mode 100644 proxy/evaluation/zai_eval/client.py create mode 100644 proxy/evaluation/zai_eval/metrics.py create mode 100644 proxy/evaluation/zai_eval/models.py create mode 100644 proxy/evaluation/zai_eval/report.py create mode 100644 proxy/evaluation/zai_eval/test_cases.py diff --git a/dashboard/api/middleware.go b/dashboard/api/middleware.go index 31cda56..ad46abb 100644 --- a/dashboard/api/middleware.go +++ b/dashboard/api/middleware.go @@ -7,7 +7,7 @@ import ( "runtime" "time" - "github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard/logger" + "git.ardenone.com/jedarden/zai-proxy/dashboard/logger" ) // Middleware is a function that wraps an http.Handler. 
diff --git a/dashboard/api/router.go b/dashboard/api/router.go index 6a25328..e12b0f6 100644 --- a/dashboard/api/router.go +++ b/dashboard/api/router.go @@ -7,8 +7,8 @@ import ( "strconv" "time" - "github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard/model" - "github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard/storage" + "git.ardenone.com/jedarden/zai-proxy/dashboard/model" + "git.ardenone.com/jedarden/zai-proxy/dashboard/storage" ) // Router sets up the HTTP routes. diff --git a/dashboard/api/sse.go b/dashboard/api/sse.go index 0667826..75e9570 100644 --- a/dashboard/api/sse.go +++ b/dashboard/api/sse.go @@ -8,8 +8,8 @@ import ( "sync" "time" - "github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard/logger" - "github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard/model" + "git.ardenone.com/jedarden/zai-proxy/dashboard/logger" + "git.ardenone.com/jedarden/zai-proxy/dashboard/model" ) var sseLog = logger.Component("sse") diff --git a/dashboard/api/sse_test.go b/dashboard/api/sse_test.go index da753cd..1c5f254 100644 --- a/dashboard/api/sse_test.go +++ b/dashboard/api/sse_test.go @@ -9,7 +9,7 @@ import ( "testing" "time" - "github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard/model" + "git.ardenone.com/jedarden/zai-proxy/dashboard/model" ) func newTestSSEHub() *SSEHub { diff --git a/dashboard/collector/collector.go b/dashboard/collector/collector.go index 2093029..c605865 100644 --- a/dashboard/collector/collector.go +++ b/dashboard/collector/collector.go @@ -12,7 +12,7 @@ import ( "sync" "time" - "github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard/model" + "git.ardenone.com/jedarden/zai-proxy/dashboard/model" ) // Collector scrapes Prometheus metrics from zai-proxy endpoints. 
diff --git a/dashboard/collector/parser.go b/dashboard/collector/parser.go index e31ba27..0544ff6 100644 --- a/dashboard/collector/parser.go +++ b/dashboard/collector/parser.go @@ -12,7 +12,7 @@ import ( dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" - "github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard/model" + "git.ardenone.com/jedarden/zai-proxy/dashboard/model" ) // Parser parses Prometheus exposition format text into metric families. diff --git a/dashboard/collector/parser_test.go b/dashboard/collector/parser_test.go index 868eda2..e20552b 100644 --- a/dashboard/collector/parser_test.go +++ b/dashboard/collector/parser_test.go @@ -4,7 +4,7 @@ import ( "math" "testing" - "github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard/model" + "git.ardenone.com/jedarden/zai-proxy/dashboard/model" ) func TestParser_ParseCounter(t *testing.T) { diff --git a/dashboard/go.mod b/dashboard/go.mod index 6878239..53c6c3d 100644 --- a/dashboard/go.mod +++ b/dashboard/go.mod @@ -1,4 +1,4 @@ -module github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard +module git.ardenone.com/jedarden/zai-proxy/dashboard go 1.23 diff --git a/dashboard/main.go b/dashboard/main.go index 74ddaf4..9ee4a57 100644 --- a/dashboard/main.go +++ b/dashboard/main.go @@ -10,10 +10,10 @@ import ( "os/signal" "syscall" - "github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard/api" - "github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard/collector" - "github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard/logger" - "github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard/storage" + "git.ardenone.com/jedarden/zai-proxy/dashboard/api" + "git.ardenone.com/jedarden/zai-proxy/dashboard/collector" + "git.ardenone.com/jedarden/zai-proxy/dashboard/logger" + "git.ardenone.com/jedarden/zai-proxy/dashboard/storage" ) //go:embed frontend/dist/* diff --git a/dashboard/main_test.go 
b/dashboard/main_test.go index 84446ed..dfe5a0e 100644 --- a/dashboard/main_test.go +++ b/dashboard/main_test.go @@ -8,9 +8,9 @@ import ( "strings" "testing" - "github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard/api" - "github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard/logger" - "github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard/storage" + "git.ardenone.com/jedarden/zai-proxy/dashboard/api" + "git.ardenone.com/jedarden/zai-proxy/dashboard/logger" + "git.ardenone.com/jedarden/zai-proxy/dashboard/storage" ) // TestIndexRedirectLoopBug verifies that / and /index.html return 200 OK diff --git a/dashboard/storage/storage.go b/dashboard/storage/storage.go index 52f5257..3af2745 100644 --- a/dashboard/storage/storage.go +++ b/dashboard/storage/storage.go @@ -10,7 +10,7 @@ import ( "sync" "time" - "github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard/model" + "git.ardenone.com/jedarden/zai-proxy/dashboard/model" ) // Storage provides SQLite-based metric persistence. diff --git a/dashboard/storage/storage_test.go b/dashboard/storage/storage_test.go index 5201892..d56efa4 100644 --- a/dashboard/storage/storage_test.go +++ b/dashboard/storage/storage_test.go @@ -8,7 +8,7 @@ import ( _ "modernc.org/sqlite" - "github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard/model" + "git.ardenone.com/jedarden/zai-proxy/dashboard/model" ) func TestStorage_WriteAndRead(t *testing.T) { diff --git a/docs/plan/plan.md b/docs/plan/plan.md new file mode 100644 index 0000000..401b122 --- /dev/null +++ b/docs/plan/plan.md @@ -0,0 +1,179 @@ +# ZAI Proxy Ecosystem — Plan + +## Objective + +Provide a stable, observable endpoint for LLM agents to access the Z.AI API without exposing the Z.AI API key as an environment variable or in any other plaintext form accessible to the calling process. 
The proxy is the sole keeper of the credential; agents authenticate via a shared secret (proxy API key) that carries no Z.AI billing rights on its own. + +## Architecture + +``` +LLM Agent (Claude Code, NEEDLE worker, etc.) + │ + │ POST /v1/chat/completions + │ Authorization: Bearer ← agent's credential (not the Z.AI key) + ▼ +┌─────────────────────────────────────────────────────┐ +│ zai-proxy │ +│ │ +│ • Validates proxy-key │ +│ • Rewrites Authorization → Bearer │ +│ • Rate-limits (token bucket per key) │ +│ • Counts tokens (request + response) │ +│ • Records metrics (Prometheus) │ +│ • Translates request/response format if needed │ +│ │ +└──────────────────┬──────────────────────────────────┘ + │ HTTPS + ▼ + api.z.ai (Z.AI upstream) +``` + +The Z.AI API key lives **only** as a Kubernetes Secret (sealed-secrets encrypted at rest, injected as an env var into the proxy pod only). No agent process, worker, or tool ever sees the upstream key. + +## Components + +### proxy/ — Reverse Proxy (Go) + +The core component. Handles: + +- **Credential isolation:** accepts `Authorization: Bearer `, injects the real Z.AI key upstream. Proxy keys are hashed and stored in config; compromise of a proxy key cannot be used to bill or enumerate usage independently. +- **Token counting:** both request and response token counts via tiktoken (for OpenAI-compat models) and GLM tokenizer (for GLM series). Token counts feed the metrics pipeline. +- **Rate limiting:** configurable token-bucket per proxy key. Prevents a runaway agent from exhausting the Z.AI quota. Returns 429 when the bucket is empty. +- **Prometheus metrics:** exposes `/metrics` with request counts, latency histograms, token usage, error rates, and rate-limit hit counts. +- **Request/response translation:** normalises differences between the OpenAI wire format and Z.AI's dialect so agents using standard OpenAI client libraries work without modification. 
+- **Canary support:** runs two deployment variants (production + canary) simultaneously; traffic split is controlled by the Kubernetes service config, not the proxy itself. + +### dashboard/ — Metrics Dashboard (Go + React) + +The observability layer. Scrapes the proxy's Prometheus endpoint, persists aggregated data in SQLite, and serves a live React frontend via SSE. + +Panels: +- Request rate (req/s) +- Token throughput (tokens/s, split by direction) +- Latency (p50/p95/p99) +- Error rate (4xx, 5xx, 429 broken out separately) +- Rate-limit hit rate +- Concurrency (in-flight requests) + +## Telemetry & Error Tracking + +### Token counting + +Every request and response passes through the token counter before forwarding/returning. The proxy records: + +| Metric | Labels | +|--------|--------| +| `zai_proxy_tokens_total` | `direction=request\|response`, `model`, `key_id` | +| `zai_proxy_request_duration_seconds` | `model`, `status_code`, `key_id` | +| `zai_proxy_requests_total` | `model`, `status_code`, `key_id` | + +Token counts are also written to the response `X-Tokens-Used` header so the calling agent can track its own consumption without querying the dashboard. + +### Error rate tracking + +Upstream errors (4xx/5xx from Z.AI) are classified and exposed as: + +| Metric | Description | +|--------|-------------| +| `zai_proxy_upstream_errors_total{code="429"}` | Rate-limit responses from Z.AI — indicates quota pressure | +| `zai_proxy_upstream_errors_total{code="5xx"}` | Z.AI server errors | +| `zai_proxy_upstream_errors_total{code="4xx"}` | Malformed requests, auth failures | +| `zai_proxy_rate_limited_total` | Requests dropped by the proxy's own rate limiter (before hitting Z.AI) | + +429s from Z.AI are given special treatment: the proxy applies automatic back-off and surfaces a `Retry-After` header to the agent, giving agents a signal to pause rather than spin. 
+ +### Dashboard alerting targets (future) + +- 429 rate from Z.AI > 5% of requests over 5m → alert (quota approaching) +- Proxy-side 429s > 10% → alert (agent is over rate limit) +- p95 latency > 10s → alert (upstream degradation) +- Error rate > 2% → alert + +## Security Model + +| Threat | Mitigation | +|--------|------------| +| Agent exfiltrates Z.AI key | Key never leaves proxy pod; not in agent env, not in logs, not in metrics | +| Proxy key compromise | Proxy key has no Z.AI billing rights; can be rotated without touching Z.AI | +| Log scraping | Z.AI key is never logged; proxy key is masked in access logs | +| Metric label leakage | `key_id` label is a hash, not the raw proxy key | +| Runaway agent burns quota | Per-key rate limiter + 429 back-off | +| Z.AI quota exhaustion | 429 counter triggers alerts before quota is fully consumed | + +## Repository Layout + +``` +zai-proxy/ (git.ardenone.com/jedarden/zai-proxy) +├── proxy/ Go module: git.ardenone.com/jedarden/zai-proxy +│ ├── main.go HTTP server, routing, auth middleware +│ ├── translator.go Request/response format translation +│ ├── bodyparser.go Body parsing, streaming support +│ ├── tokenizer.go Token counting (tiktoken + GLM) +│ ├── metrics.go Prometheus instrumentation +│ ├── evaluation/ Offline eval harness +│ ├── cmd/evaluate/ CLI for batch evaluation +│ ├── cmd/demo-eval/ Demo evaluation runner +│ ├── scripts/ Load test, canary integration, benchmarks +│ ├── tests/ Integration and regression test suites +│ └── Dockerfile Production image +├── dashboard/ Go module: git.ardenone.com/jedarden/zai-proxy/dashboard +│ ├── main.go HTTP server + SSE broadcaster +│ ├── collector/ Prometheus scraper + parser +│ ├── api/ REST + SSE handlers +│ ├── storage/ SQLite persistence layer +│ ├── model/ Shared metric data types +│ ├── logger/ Structured logger +│ └── frontend/ React/Vite/Tailwind dashboard UI +└── docs/ + ├── plan/plan.md This document + ├── notes/ Deployment, operations, canary procedures + └── 
research/ Tokenizer research, metrics references +``` + +## CI/CD + +Build templates live in `jedarden/declarative-config → k8s/iad-ci/argo-workflows/`: + +| Template | Builds | Pushes to | +|----------|--------|-----------| +| `zai-proxy-build` | `proxy/` | `ronaldraygun/zai-proxy:{VERSION}` | +| `zai-proxy-dashboard-build` | `dashboard/` | `ronaldraygun/zai-proxy-dashboard:{VERSION}` | + +Both templates clone from the public `git.ardenone.com/jedarden/zai-proxy` repo (no auth required). Versions are read from `proxy/VERSION` and `dashboard/VERSION` respectively. + +Triggering a build: +```bash +kubectl --kubeconfig=/home/coding/.kube/iad-ci.kubeconfig create -f - < 0 { + result.InputTokenPercentDiff = float64(result.InputTokenDiff) / float64(anthropicResp.TokenUsage.InputTokens) * 100 + } + if anthropicResp.TokenUsage.OutputTokens > 0 { + result.OutputTokenPercentDiff = float64(result.OutputTokenDiff) / float64(anthropicResp.TokenUsage.OutputTokens) * 100 + } + } + + // Compare response structure (basic check) + result.ResponseStructureMatch = e.compareResponseStructure(zaiResp.Body, anthropicResp.Body) + + return result +} + +// compareResponseStructure performs basic structural comparison +func (e *Evaluator) compareResponseStructure(zaiBody, anthropicBody []byte) bool { + var zaiMap, anthropicMap map[string]interface{} + + if err := json.Unmarshal(zaiBody, &zaiMap); err != nil { + return false + } + if err := json.Unmarshal(anthropicBody, &anthropicMap); err != nil { + return false + } + + // Compare top-level keys + if len(zaiMap) != len(anthropicMap) { + return false + } + + for key := range zaiMap { + if _, ok := anthropicMap[key]; !ok { + return false + } + } + + return true +} + +// RunTests executes multiple test cases and returns aggregated metrics +func (e *Evaluator) RunTests(tests []TestRequest) ([]ComparisonResult, *EvaluationMetrics) { + results := make([]ComparisonResult, len(tests)) + + for i, test := range tests { + results[i] = e.RunTest(test) 
+ } + + metrics := e.calculateMetrics(results) + + return results, metrics +} + +// calculateMetrics computes aggregated metrics from comparison results +func (e *Evaluator) calculateMetrics(results []ComparisonResult) *EvaluationMetrics { + return e.CalculateMetricsFromResults(results) +} + +// CalculateMetricsFromResults computes aggregated metrics from comparison results (public method) +func (e *Evaluator) CalculateMetricsFromResults(results []ComparisonResult) *EvaluationMetrics { + metrics := &EvaluationMetrics{ + TotalTests: len(results), + StructureMatchCount: 0, + } + + var inputTokenSum, outputTokenSum float64 + var inputPercentSum, outputPercentSum float64 + var validInputCount, validOutputCount int + + for _, result := range results { + if result.ZaiResponse.Error == nil && result.AnthropicResponse.Error == nil { + metrics.SuccessfulTests++ + } + + if result.ResponseStructureMatch { + metrics.StructureMatchCount++ + } + + // Calculate MAE and average percentage differences + if result.AnthropicResponse.TokenUsage != nil && result.ZaiResponse.TokenUsage != nil { + inputTokenSum += absFloat(float64(result.InputTokenDiff)) + outputTokenSum += absFloat(float64(result.OutputTokenDiff)) + + if result.AnthropicResponse.TokenUsage.InputTokens > 0 { + inputPercentSum += absFloat(result.InputTokenPercentDiff) + validInputCount++ + } + if result.AnthropicResponse.TokenUsage.OutputTokens > 0 { + outputPercentSum += absFloat(result.OutputTokenPercentDiff) + validOutputCount++ + } + } + } + + if validInputCount > 0 { + metrics.InputTokenMAE = inputTokenSum / float64(validInputCount) + metrics.InputTokenAvgPercentDiff = inputPercentSum / float64(validInputCount) + } + if validOutputCount > 0 { + metrics.OutputTokenMAE = outputTokenSum / float64(validOutputCount) + metrics.OutputTokenAvgPercentDiff = outputPercentSum / float64(validOutputCount) + } + + return metrics +} + +func absFloat(x float64) float64 { + if x < 0 { + return -x + } + return x +} diff --git 
a/proxy/evaluation/evaluator_test.go b/proxy/evaluation/evaluator_test.go new file mode 100644 index 0000000..c23b25f --- /dev/null +++ b/proxy/evaluation/evaluator_test.go @@ -0,0 +1,507 @@ +package evaluation + +import ( + "encoding/json" + "testing" +) + +// TestGetTestCases verifies test cases are properly defined +func TestGetTestCases(t *testing.T) { + tests := GetTestCases() + + if len(tests) < 10 { + t.Errorf("Expected at least 10 test cases, got %d", len(tests)) + } + + for i, tc := range tests { + if tc.Name == "" { + t.Errorf("Test case %d: missing name", i) + } + if tc.Request.Model == "" { + t.Errorf("Test case %d (%s): missing model", i, tc.Name) + } + if len(tc.Request.Messages) == 0 { + t.Errorf("Test case %d (%s): no messages", i, tc.Name) + } + if tc.Request.MaxTokens <= 0 { + t.Errorf("Test case %d (%s): invalid max_tokens", i, tc.Name) + } + } +} + +// TestEvaluatorCreation tests evaluator initialization +func TestEvaluatorCreation(t *testing.T) { + e := NewEvaluator("http://localhost:8080", "https://api.anthropic.com", "test-key-1", "test-key-2") + + if e == nil { + t.Fatal("NewEvaluator returned nil") + } + + if e.ZaiEndpoint != "http://localhost:8080" { + t.Errorf("Expected ZaiEndpoint 'http://localhost:8080', got '%s'", e.ZaiEndpoint) + } + + if e.ZaiAPIKey != "test-key-1" { + t.Errorf("Expected ZaiAPIKey 'test-key-1', got '%s'", e.ZaiAPIKey) + } + + if e.Client == nil { + t.Error("Client is nil") + } +} + +// TestExtractJSONTokenUsage tests token extraction from JSON responses +func TestExtractJSONTokenUsage(t *testing.T) { + e := NewEvaluator("", "", "", "") + + tests := []struct { + name string + body string + expectInput int + expectOutput int + expectNil bool + }{ + { + name: "Valid response with usage", + body: `{"id":"msg_123","type":"message","usage":{"input_tokens":100,"output_tokens":50}}`, + expectInput: 100, + expectOutput: 50, + expectNil: false, + }, + { + name: "Response with zero tokens", + body: 
`{"id":"msg_123","usage":{"input_tokens":0,"output_tokens":0}}`, + expectInput: 0, + expectOutput: 0, + expectNil: false, + }, + { + name: "Response without usage", + body: `{"id":"msg_123","type":"message"}`, + expectNil: true, + }, + { + name: "Invalid JSON", + body: `{invalid json}`, + expectNil: true, + }, + { + name: "Empty body", + body: ``, + expectNil: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + result := e.extractJSONTokenUsage([]byte(tc.body)) + + if tc.expectNil { + if result != nil { + t.Errorf("Expected nil result, got %+v", result) + } + } else { + if result == nil { + t.Fatal("Expected non-nil result, got nil") + } + if result.InputTokens != tc.expectInput { + t.Errorf("InputTokens: got %d, want %d", result.InputTokens, tc.expectInput) + } + if result.OutputTokens != tc.expectOutput { + t.Errorf("OutputTokens: got %d, want %d", result.OutputTokens, tc.expectOutput) + } + } + }) + } +} + +// TestExtractSSETokenUsage tests token extraction from SSE responses +func TestExtractSSETokenUsage(t *testing.T) { + e := NewEvaluator("", "", "", "") + + tests := []struct { + name string + body string + expectInput int + expectOutput int + expectNil bool + }{ + { + name: "Valid SSE with usage in message_delta", + body: `data: {"type":"message_start"} +data: {"type":"content_block_delta","delta":{"text":"Hello"}} +data: {"type":"message_delta","usage":{"input_tokens":10,"output_tokens":20}} +data: {"type":"message_stop"}`, + expectInput: 10, + expectOutput: 20, + expectNil: false, + }, + { + name: "SSE without usage", + body: `data: {"type":"message_start"} +data: {"type":"message_stop"}`, + expectNil: true, + }, + { + name: "Empty SSE", + body: ``, + expectNil: true, + }, + { + name: "SSE with [DONE]", + body: `data: {"type":"content_block_delta","delta":{"text":"Hi"}} +data: [DONE] +data: {"type":"message_stop"}`, + expectNil: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + result := 
e.extractSSETokenUsage([]byte(tc.body)) + + if tc.expectNil { + if result != nil { + t.Errorf("Expected nil result, got %+v", result) + } + } else { + if result == nil { + t.Fatal("Expected non-nil result, got nil") + } + if result.InputTokens != tc.expectInput { + t.Errorf("InputTokens: got %d, want %d", result.InputTokens, tc.expectInput) + } + if result.OutputTokens != tc.expectOutput { + t.Errorf("OutputTokens: got %d, want %d", result.OutputTokens, tc.expectOutput) + } + } + }) + } +} + +// TestCompareResponseStructure tests structural comparison +func TestCompareResponseStructure(t *testing.T) { + e := NewEvaluator("", "", "", "") + + tests := []struct { + name string + zaiBody string + anthropicBody string + expectMatch bool + }{ + { + name: "Identical structure", + zaiBody: `{"id":"msg_123","type":"message","content":[],"role":"assistant"}`, + anthropicBody: `{"id":"msg_456","type":"message","content":[],"role":"assistant"}`, + expectMatch: true, + }, + { + name: "Different number of keys", + zaiBody: `{"id":"msg_123","type":"message"}`, + anthropicBody: `{"id":"msg_456","type":"message","extra":"field"}`, + expectMatch: false, + }, + { + name: "Different key names", + zaiBody: `{"id":"msg_123","type":"message"}`, + anthropicBody: `{"id":"msg_456","content":"message"}`, + expectMatch: false, + }, + { + name: "Invalid JSON in zai", + zaiBody: `{invalid}`, + anthropicBody: `{"id":"msg_456"}`, + expectMatch: false, + }, + { + name: "Invalid JSON in anthropic", + zaiBody: `{"id":"msg_123"}`, + anthropicBody: `{invalid}`, + expectMatch: false, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + result := e.compareResponseStructure([]byte(tc.zaiBody), []byte(tc.anthropicBody)) + if result != tc.expectMatch { + t.Errorf("compareResponseStructure() = %v, want %v", result, tc.expectMatch) + } + }) + } +} + +// TestCalculateMetrics tests metrics calculation +func TestCalculateMetrics(t *testing.T) { + e := NewEvaluator("", "", "", "") + + 
results := []ComparisonResult{ + { + TestName: "Test 1", + ZaiResponse: ResponseData{ + TokenUsage: &TokenUsage{InputTokens: 100, OutputTokens: 50}, + }, + AnthropicResponse: ResponseData{ + TokenUsage: &TokenUsage{InputTokens: 105, OutputTokens: 52}, + }, + InputTokenDiff: -5, + OutputTokenDiff: -2, + InputTokenPercentDiff: -5.0, + OutputTokenPercentDiff: -4.0, + ResponseStructureMatch: true, + }, + { + TestName: "Test 2", + ZaiResponse: ResponseData{ + TokenUsage: &TokenUsage{InputTokens: 200, OutputTokens: 100}, + }, + AnthropicResponse: ResponseData{ + TokenUsage: &TokenUsage{InputTokens: 190, OutputTokens: 95}, + }, + InputTokenDiff: 10, + OutputTokenDiff: 5, + InputTokenPercentDiff: 5.0, + OutputTokenPercentDiff: 5.0, + ResponseStructureMatch: true, + }, + { + TestName: "Test 3 (no token data)", + ZaiResponse: ResponseData{}, + AnthropicResponse: ResponseData{}, + ResponseStructureMatch: false, + }, + } + + metrics := e.calculateMetrics(results) + + if metrics.TotalTests != 3 { + t.Errorf("TotalTests: got %d, want 3", metrics.TotalTests) + } + + // First two tests have token data + if metrics.InputTokenMAE != 7.5 { // (5 + 10) / 2 + t.Errorf("InputTokenMAE: got %.2f, want 7.5", metrics.InputTokenMAE) + } + + if metrics.OutputTokenMAE != 3.5 { // (2 + 5) / 2 + t.Errorf("OutputTokenMAE: got %.2f, want 3.5", metrics.OutputTokenMAE) + } + + if metrics.InputTokenAvgPercentDiff != 5.0 { + t.Errorf("InputTokenAvgPercentDiff: got %.2f, want 5.0", metrics.InputTokenAvgPercentDiff) + } + + if metrics.OutputTokenAvgPercentDiff != 4.5 { + t.Errorf("OutputTokenAvgPercentDiff: got %.2f, want 4.5", metrics.OutputTokenAvgPercentDiff) + } + + if metrics.StructureMatchCount != 2 { + t.Errorf("StructureMatchCount: got %d, want 2", metrics.StructureMatchCount) + } +} + +// TestReportGeneration tests report generation +func TestReportGeneration(t *testing.T) { + results := []ComparisonResult{ + { + TestName: "Sample Test", + ZaiResponse: ResponseData{ + TokenUsage: 
&TokenUsage{InputTokens: 100, OutputTokens: 50}, + Duration: 100000000, // 100ms + }, + AnthropicResponse: ResponseData{ + TokenUsage: &TokenUsage{InputTokens: 100, OutputTokens: 50}, + Duration: 90000000, // 90ms + }, + InputTokenMatch: true, + OutputTokenMatch: true, + InputTokenDiff: 0, + OutputTokenDiff: 0, + InputTokenPercentDiff: 0.0, + OutputTokenPercentDiff: 0.0, + ResponseStructureMatch: true, + }, + } + + metrics := &EvaluationMetrics{ + TotalTests: 1, + SuccessfulTests: 1, + InputTokenMAE: 0.0, + OutputTokenMAE: 0.0, + InputTokenAvgPercentDiff: 0.0, + OutputTokenAvgPercentDiff: 0.0, + StructureMatchCount: 1, + } + + reporter := NewReportGenerator(results, metrics) + + // Test text report generation + textReport := reporter.GenerateTextReport() + if textReport == "" { + t.Error("GenerateTextReport() returned empty string") + } + + if len(textReport) < 100 { + t.Errorf("Text report too short: %d characters", len(textReport)) + } + + // Check for expected sections + expectedSections := []string{ + "EXECUTIVE SUMMARY", + "TOKEN ACCURACY METRICS", + "DETAILED TEST RESULTS", + "ANALYSIS AND RECOMMENDATIONS", + } + + for _, section := range expectedSections { + if !contains(textReport, section) { + t.Errorf("Text report missing section: %s", section) + } + } + + // Test JSON report generation + jsonReport, err := reporter.GenerateJSONReport() + if err != nil { + t.Errorf("GenerateJSONReport() error: %v", err) + } + + if len(jsonReport) == 0 { + t.Error("JSON report is empty") + } + + var jsonData map[string]interface{} + if err := json.Unmarshal(jsonReport, &jsonData); err != nil { + t.Errorf("JSON report is invalid: %v", err) + } + + // Verify required fields + requiredFields := []string{"generated_at", "metrics", "test_results", "interpretation"} + for _, field := range requiredFields { + if _, ok := jsonData[field]; !ok { + t.Errorf("JSON report missing field: %s", field) + } + } +} + +// TestPatternAnalysis tests pattern identification +func 
TestPatternAnalysis(t *testing.T) { + results := []ComparisonResult{ + { + TestName: "Test 1", + ZaiResponse: ResponseData{ + TokenUsage: &TokenUsage{InputTokens: 105, OutputTokens: 52}, + }, + AnthropicResponse: ResponseData{ + TokenUsage: &TokenUsage{InputTokens: 100, OutputTokens: 50}, + }, + InputTokenDiff: 5, + OutputTokenDiff: 2, + }, + { + TestName: "Test 2", + ZaiResponse: ResponseData{ + TokenUsage: &TokenUsage{InputTokens: 110, OutputTokens: 55}, + }, + AnthropicResponse: ResponseData{ + TokenUsage: &TokenUsage{InputTokens: 100, OutputTokens: 50}, + }, + InputTokenDiff: 10, + OutputTokenDiff: 5, + }, + } + + metrics := &EvaluationMetrics{ + TotalTests: 2, + SuccessfulTests: 2, + InputTokenMAE: 7.5, + OutputTokenMAE: 3.5, + StructureMatchCount: 2, + } + + reporter := NewReportGenerator(results, metrics) + patterns := reporter.identifyPatterns() + + // Should detect Z.AI consistently higher + if !contains(patterns, "Z.AI consistently reports higher input tokens") { + t.Error("Pattern analysis should detect Z.AI consistently higher for input tokens") + } + + if !contains(patterns, "Z.AI consistently reports higher output tokens") { + t.Error("Pattern analysis should detect Z.AI consistently higher for output tokens") + } +} + +// TestRecommendations tests recommendation generation +func TestRecommendations(t *testing.T) { + tests := []struct { + name string + metrics *EvaluationMetrics + expectRecommendation bool + expectedKeyword string + }{ + { + name: "High MAE should recommend tokenizer review", + metrics: &EvaluationMetrics{ + TotalTests: 10, + SuccessfulTests: 10, + InputTokenMAE: 15.0, + OutputTokenMAE: 2.0, + StructureMatchCount: 10, + }, + expectRecommendation: true, + expectedKeyword: "tokenizer", + }, + { + name: "Low success rate should recommend error handling review", + metrics: &EvaluationMetrics{ + TotalTests: 10, + SuccessfulTests: 8, + InputTokenMAE: 2.0, + OutputTokenMAE: 2.0, + StructureMatchCount: 10, + }, + expectRecommendation: true, + 
// contains reports whether substr occurs anywhere within s. An empty substr
// is contained in every string, including the empty string.
//
// It has the same result as strings.Contains. The scan is iterative so that
// long report texts do not cost one stack frame per byte, and it is kept
// local so the test file needs no additional import.
func contains(s, substr string) bool {
	n := len(substr)
	for start := 0; start+n <= len(s); start++ {
		if s[start:start+n] == substr {
			return true
		}
	}
	return false
}
evaluation + +import ( + "bytes" + "encoding/json" + "fmt" + "os" + "text/template" + "time" +) + +// ReportGenerator creates evaluation reports +type ReportGenerator struct { + results []ComparisonResult + metrics *EvaluationMetrics +} + +// NewReportGenerator creates a new report generator +func NewReportGenerator(results []ComparisonResult, metrics *EvaluationMetrics) *ReportGenerator { + return &ReportGenerator{ + results: results, + metrics: metrics, + } +} + +// GenerateTextReport creates a text-based evaluation report +func (rg *ReportGenerator) GenerateTextReport() string { + var buf bytes.Buffer + + buf.WriteString("╔══════════════════════════════════════════════════════════════════════════════╗\n") + buf.WriteString("║ Z.AI PROXY EVALUATION REPORT ║\n") + buf.WriteString(fmt.Sprintf("║ Generated: %s ║\n", time.Now().Format("2006-01-02 15:04:05"))) + buf.WriteString("╚══════════════════════════════════════════════════════════════════════════════╝\n\n") + + // Executive Summary + buf.WriteString("## EXECUTIVE SUMMARY\n\n") + buf.WriteString(fmt.Sprintf("Total Tests Run: %d\n", rg.metrics.TotalTests)) + buf.WriteString(fmt.Sprintf("Successful Tests: %d (%.1f%%)\n", rg.metrics.SuccessfulTests, float64(rg.metrics.SuccessfulTests)/float64(rg.metrics.TotalTests)*100)) + buf.WriteString(fmt.Sprintf("Structure Match Rate: %d (%.1f%%)\n", rg.metrics.StructureMatchCount, float64(rg.metrics.StructureMatchCount)/float64(rg.metrics.TotalTests)*100)) + buf.WriteString("\n") + + // Token Accuracy Metrics + buf.WriteString("## TOKEN ACCURACY METRICS\n\n") + buf.WriteString("┌────────────────────────┬──────────────┬──────────────┬──────────────────┐\n") + buf.WriteString("│ Metric │ Input Tokens │ Output Tokens │ Difference │\n") + buf.WriteString("├────────────────────────┼──────────────┼──────────────┼──────────────────┤\n") + + if rg.metrics.InputTokenMAE > 0 { + buf.WriteString(fmt.Sprintf("│ Mean Absolute Error │ %12.2f │ %12.2f │ │\n", rg.metrics.InputTokenMAE, 
rg.metrics.OutputTokenMAE)) + } + if rg.metrics.InputTokenAvgPercentDiff > 0 { + buf.WriteString(fmt.Sprintf("│ Avg Percent Diff │ %11.2f%% │ %11.2f%% │ │\n", rg.metrics.InputTokenAvgPercentDiff, rg.metrics.OutputTokenAvgPercentDiff)) + } + + buf.WriteString("└────────────────────────┴──────────────┴──────────────┴──────────────────┘\n\n") + + // Detailed Test Results + buf.WriteString("## DETAILED TEST RESULTS\n\n") + + for i, result := range rg.results { + buf.WriteString(fmt.Sprintf("### Test %d: %s\n\n", i+1, result.TestName)) + + // Status + zaiStatus := "✓ OK" + if result.ZaiResponse.Error != nil { + zaiStatus = fmt.Sprintf("✗ Error: %s", result.ZaiResponse.Error) + } + anthropicStatus := "✓ OK" + if result.AnthropicResponse.Error != nil { + anthropicStatus = fmt.Sprintf("✗ Error: %s", result.AnthropicResponse.Error) + } + + buf.WriteString(fmt.Sprintf("Z.AI Status: %s\n", zaiStatus)) + buf.WriteString(fmt.Sprintf("Anthropic Status: %s\n", anthropicStatus)) + + // Response times + buf.WriteString(fmt.Sprintf("Z.AI Response: %v\n", result.ZaiResponse.Duration)) + buf.WriteString(fmt.Sprintf("Anthropic Response: %v\n", result.AnthropicResponse.Duration)) + + // Token comparison + if result.ZaiResponse.TokenUsage != nil && result.AnthropicResponse.TokenUsage != nil { + buf.WriteString("\nToken Comparison:\n") + buf.WriteString("┌─────────────────────┬──────────┬──────────┬──────────┬────────────┐\n") + buf.WriteString("│ Direction │ Z.AI │ Anthropic│ Diff │ %% Diff │\n") + buf.WriteString("├─────────────────────┼──────────┼──────────┼──────────┼────────────┤\n") + buf.WriteString(fmt.Sprintf("│ Input Tokens │ %8d │ %8d │ %8d │ %9.2f%% │\n", + result.ZaiResponse.TokenUsage.InputTokens, + result.AnthropicResponse.TokenUsage.InputTokens, + result.InputTokenDiff, + result.InputTokenPercentDiff)) + buf.WriteString(fmt.Sprintf("│ Output Tokens │ %8d │ %8d │ %8d │ %9.2f%% │\n", + result.ZaiResponse.TokenUsage.OutputTokens, + 
result.AnthropicResponse.TokenUsage.OutputTokens, + result.OutputTokenDiff, + result.OutputTokenPercentDiff)) + buf.WriteString("└─────────────────────┴──────────┴──────────┴──────────┴────────────┘\n") + + // Match indicators + inputMatch := "✓" + if !result.InputTokenMatch { + inputMatch = "✗" + } + outputMatch := "✓" + if !result.OutputTokenMatch { + outputMatch = "✗" + } + buf.WriteString(fmt.Sprintf("\nInput Tokens Match: %s Output Tokens Match: %s\n", inputMatch, outputMatch)) + } else { + buf.WriteString("\n⚠ Token usage data not available for comparison\n") + if result.ZaiResponse.TokenUsage == nil { + buf.WriteString(" - Z.AI token usage: Not available\n") + } + if result.AnthropicResponse.TokenUsage == nil { + buf.WriteString(" - Anthropic token usage: Not available\n") + } + } + + // Structure match + structureMatch := "✓" + if !result.ResponseStructureMatch { + structureMatch = "✗" + } + buf.WriteString(fmt.Sprintf("Structure Match: %s\n\n", structureMatch)) + + // Response snippets (truncated) + if len(result.ZaiResponse.Body) > 0 && len(result.AnthropicResponse.Body) > 0 { + buf.WriteString("Response Preview:\n") + buf.WriteString("Z.AI Response:\n") + buf.WriteString(formatJSONPreview(result.ZaiResponse.Body, 200)) + buf.WriteString("\nAnthropic Response:\n") + buf.WriteString(formatJSONPreview(result.AnthropicResponse.Body, 200)) + buf.WriteString("\n") + } + + buf.WriteString("---\n\n") + } + + // Analysis and Recommendations + buf.WriteString("## ANALYSIS AND RECOMMENDATIONS\n\n") + buf.WriteString(rg.generateAnalysis()) + + return buf.String() +} + +// generateAnalysis creates analysis based on metrics +func (rg *ReportGenerator) generateAnalysis() string { + var buf bytes.Buffer + + // Token accuracy analysis + if rg.metrics.InputTokenMAE > 10 || rg.metrics.OutputTokenMAE > 10 { + buf.WriteString("### ⚠ Token Counting Accuracy Concerns\n\n") + if rg.metrics.InputTokenMAE > 10 { + buf.WriteString(fmt.Sprintf("- Input token MAE (%.2f) exceeds 
threshold of 10 tokens\n", rg.metrics.InputTokenMAE)) + } + if rg.metrics.OutputTokenMAE > 10 { + buf.WriteString(fmt.Sprintf("- Output token MAE (%.2f) exceeds threshold of 10 tokens\n", rg.metrics.OutputTokenMAE)) + } + buf.WriteString("- Recommendation: Review tokenizer configuration and consider model-specific encoding\n\n") + } else if rg.metrics.InputTokenMAE > 0 { + buf.WriteString("### ✓ Token Counting Accuracy\n\n") + buf.WriteString("Token counts are within acceptable tolerance levels.\n") + buf.WriteString(fmt.Sprintf("- Input MAE: %.2f tokens\n", rg.metrics.InputTokenMAE)) + buf.WriteString(fmt.Sprintf("- Output MAE: %.2f tokens\n\n", rg.metrics.OutputTokenMAE)) + } + + // Percentage difference analysis + if rg.metrics.InputTokenAvgPercentDiff > 5 || rg.metrics.OutputTokenAvgPercentDiff > 5 { + buf.WriteString("### ⚠ Percentage Difference Analysis\n\n") + if rg.metrics.InputTokenAvgPercentDiff > 5 { + buf.WriteString(fmt.Sprintf("- Average input token difference (%.2f%%) exceeds 5%% threshold\n", rg.metrics.InputTokenAvgPercentDiff)) + } + if rg.metrics.OutputTokenAvgPercentDiff > 5 { + buf.WriteString(fmt.Sprintf("- Average output token difference (%.2f%%) exceeds 5%% threshold\n", rg.metrics.OutputTokenAvgPercentDiff)) + } + buf.WriteString("- Recommendation: Investigate systematic biases in token counting\n\n") + } + + // Success rate analysis + successRate := float64(rg.metrics.SuccessfulTests) / float64(rg.metrics.TotalTests) * 100 + if successRate < 100 { + buf.WriteString(fmt.Sprintf("### ⚠ Success Rate: %.1f%%\n\n", successRate)) + buf.WriteString("Some tests failed. Review error logs above for details.\n\n") + } + + // Structure match analysis + structureRate := float64(rg.metrics.StructureMatchCount) / float64(rg.metrics.TotalTests) * 100 + if structureRate < 100 { + buf.WriteString(fmt.Sprintf("### ⚠ Structure Match Rate: %.1f%%\n\n", structureRate)) + buf.WriteString("Some responses have different structures. 
This may indicate:\n") + buf.WriteString("- Different response formats between endpoints\n") + buf.WriteString("- Missing or extra fields in responses\n\n") + } + + // Pattern analysis + buf.WriteString("### Pattern Analysis\n\n") + buf.WriteString(rg.identifyPatterns()) + + return buf.String() +} + +// identifyPatterns identifies systematic patterns in discrepancies +func (rg *ReportGenerator) identifyPatterns() string { + var buf bytes.Buffer + + inputConsistent := 0 + inputZaiHigher := 0 + inputZaiLower := 0 + outputConsistent := 0 + outputZaiHigher := 0 + outputZaiLower := 0 + streamingTests := 0 + nonStreamingTests := 0 + + for _, result := range rg.results { + if result.ZaiResponse.TokenUsage != nil && result.AnthropicResponse.TokenUsage != nil { + if result.InputTokenDiff == 0 { + inputConsistent++ + } else if result.InputTokenDiff > 0 { + inputZaiHigher++ + } else { + inputZaiLower++ + } + + if result.OutputTokenDiff == 0 { + outputConsistent++ + } else if result.OutputTokenDiff > 0 { + outputZaiHigher++ + } else { + outputZaiLower++ + } + } + + if result.ZaiResponse.TokenUsage != nil { + // Check if streaming by looking at the test + for _, test := range GetTestCases() { + if test.Name == result.TestName { + if test.Stream { + streamingTests++ + } else { + nonStreamingTests++ + } + break + } + } + } + } + + buf.WriteString("#### Input Token Patterns\n") + buf.WriteString(fmt.Sprintf("- Exact matches: %d\n", inputConsistent)) + buf.WriteString(fmt.Sprintf("- Z.AI higher: %d\n", inputZaiHigher)) + buf.WriteString(fmt.Sprintf("- Z.AI lower: %d\n", inputZaiLower)) + + if inputZaiHigher > inputZaiLower*2 { + buf.WriteString("→ Pattern: Z.AI consistently reports higher input tokens\n") + buf.WriteString(" Possible cause: Different tokenization algorithm or encoding\n") + } else if inputZaiLower > inputZaiHigher*2 { + buf.WriteString("→ Pattern: Z.AI consistently reports lower input tokens\n") + buf.WriteString(" Possible cause: Undercounting or missing tokens in 
calculation\n") + } else if inputConsistent == 0 { + buf.WriteString("→ Pattern: No exact matches found\n") + buf.WriteString(" Possible cause: Systematic difference in counting methodology\n") + } + + buf.WriteString("\n#### Output Token Patterns\n") + buf.WriteString(fmt.Sprintf("- Exact matches: %d\n", outputConsistent)) + buf.WriteString(fmt.Sprintf("- Z.AI higher: %d\n", outputZaiHigher)) + buf.WriteString(fmt.Sprintf("- Z.AI lower: %d\n", outputZaiLower)) + + if outputZaiHigher > outputZaiLower*2 { + buf.WriteString("→ Pattern: Z.AI consistently reports higher output tokens\n") + buf.WriteString(" Possible cause: Counting control tokens or metadata\n") + } else if outputZaiLower > outputZaiHigher*2 { + buf.WriteString("→ Pattern: Z.AI consistently reports lower output tokens\n") + buf.WriteString(" Possible cause: Truncation or incomplete capture\n") + } + + buf.WriteString(fmt.Sprintf("\n#### Test Type Distribution\n")) + buf.WriteString(fmt.Sprintf("- Streaming tests: %d\n", streamingTests)) + buf.WriteString(fmt.Sprintf("- Non-streaming tests: %d\n", nonStreamingTests)) + + return buf.String() +} + +// formatJSONPreview creates a formatted preview of JSON response +func formatJSONPreview(data []byte, maxLen int) string { + var prettyJSON bytes.Buffer + if err := json.Indent(&prettyJSON, data, "", " "); err != nil { + return string(data) + } + + preview := prettyJSON.String() + if len(preview) > maxLen { + return preview[:maxLen] + "..." 
+ } + return preview +} + +// SaveToFile saves the report to a file +func (rg *ReportGenerator) SaveToFile(filename string) error { + report := rg.GenerateTextReport() + return os.WriteFile(filename, []byte(report), 0644) +} + +// GenerateJSONReport creates a JSON report for programmatic consumption +func (rg *ReportGenerator) GenerateJSONReport() ([]byte, error) { + report := struct { + GeneratedAt string `json:"generated_at"` + Metrics *EvaluationMetrics `json:"metrics"` + TestResults []ComparisonResult `json:"test_results"` + Interpretation map[string]interface{} `json:"interpretation"` + }{ + GeneratedAt: time.Now().Format(time.RFC3339), + Metrics: rg.metrics, + TestResults: rg.results, + Interpretation: map[string]interface{}{ + "overall_accuracy": rg.calculateOverallAccuracy(), + "recommendations": rg.getRecommendations(), + "patterns": rg.identifyPatterns(), + }, + } + + return json.MarshalIndent(report, "", " ") +} + +// calculateOverallAccuracy calculates an overall accuracy score +func (rg *ReportGenerator) calculateOverallAccuracy() map[string]float64 { + accuracy := make(map[string]float64) + + if rg.metrics.TotalTests > 0 { + accuracy["success_rate"] = float64(rg.metrics.SuccessfulTests) / float64(rg.metrics.TotalTests) * 100 + accuracy["structure_match_rate"] = float64(rg.metrics.StructureMatchCount) / float64(rg.metrics.TotalTests) * 100 + } + + // Token accuracy (inverse of MAE, scaled) + if rg.metrics.InputTokenMAE > 0 { + accuracy["input_token_accuracy"] = 100 - min(rg.metrics.InputTokenAvgPercentDiff, 100) + } + if rg.metrics.OutputTokenMAE > 0 { + accuracy["output_token_accuracy"] = 100 - min(rg.metrics.OutputTokenAvgPercentDiff, 100) + } + + return accuracy +} + +// getRecommendations returns actionable recommendations +func (rg *ReportGenerator) getRecommendations() []string { + var recommendations []string + + if rg.metrics.InputTokenMAE > 10 { + recommendations = append(recommendations, "Input token counting has high variance - verify 
tokenizer model matches Anthropic's") + } + + if rg.metrics.OutputTokenMAE > 10 { + recommendations = append(recommendations, "Output token counting has high variance - check SSE parsing logic") + } + + successRate := float64(rg.metrics.SuccessfulTests) / float64(rg.metrics.TotalTests) + if successRate < 1.0 { + recommendations = append(recommendations, "Some requests failed - review error handling and retry logic") + } + + structureRate := float64(rg.metrics.StructureMatchCount) / float64(rg.metrics.TotalTests) + if structureRate < 1.0 { + recommendations = append(recommendations, "Response structure mismatches detected - verify response forwarding") + } + + if len(recommendations) == 0 { + recommendations = append(recommendations, "All metrics within acceptable ranges - no immediate action required") + } + + return recommendations +} + +// GenerateHTMLReport creates an HTML formatted report +func (rg *ReportGenerator) GenerateHTMLReport() (string, error) { + const htmlTemplate = ` + + + + Z.AI Proxy Evaluation Report + + + +
+

🔍 Z.AI Proxy Evaluation Report

+

Generated: {{.Timestamp}}

+ +

Executive Summary

+
+
+
Total Tests
+
{{.TotalTests}}
+
+
+
Success Rate
+
{{.SuccessRate}}%
+
+
+
Input Token MAE
+
{{.InputMAE}}
+
+
+
Output Token MAE
+
{{.OutputMAE}}
+
+
+ +

Detailed Results

+ {{range .Results}} +
+

{{.TestName}}

+ + + + + + + + {{if .ZaiResponse.TokenUsage}} + + + + + + + + + + + + + {{end}} +
MetricZ.AIAnthropicDifference
Input Tokens{{.ZaiResponse.TokenUsage.InputTokens}}{{.AnthropicResponse.TokenUsage.InputTokens}}{{.InputTokenDiff}} ({{.InputTokenPercentDiff}}%)
Output Tokens{{.ZaiResponse.TokenUsage.OutputTokens}}{{.AnthropicResponse.TokenUsage.OutputTokens}}{{.OutputTokenDiff}} ({{.OutputTokenPercentDiff}}%)
+
+ {{end}} +
+ + +` + + data := struct { + Timestamp string + TotalTests int + SuccessRate float64 + HighSuccessRate bool + InputMAE float64 + OutputMAE float64 + LowInputMAE bool + LowOutputMAE bool + Results []ComparisonResult + }{ + Timestamp: time.Now().Format("2006-01-02 15:04:05"), + TotalTests: rg.metrics.TotalTests, + SuccessRate: float64(rg.metrics.SuccessfulTests) / float64(rg.metrics.TotalTests) * 100, + HighSuccessRate: rg.metrics.SuccessfulTests == rg.metrics.TotalTests, + InputMAE: rg.metrics.InputTokenMAE, + OutputMAE: rg.metrics.OutputTokenMAE, + LowInputMAE: rg.metrics.InputTokenMAE < 10, + LowOutputMAE: rg.metrics.OutputTokenMAE < 10, + Results: rg.results, + } + + tmpl, err := template.New("report").Parse(htmlTemplate) + if err != nil { + return "", err + } + + var buf bytes.Buffer + if err := tmpl.Execute(&buf, data); err != nil { + return "", err + } + + return buf.String(), nil +} + +func min(a, b float64) float64 { + if a < b { + return a + } + return b +} diff --git a/proxy/evaluation/requirements.txt b/proxy/evaluation/requirements.txt new file mode 100644 index 0000000..9d6fe1f --- /dev/null +++ b/proxy/evaluation/requirements.txt @@ -0,0 +1,25 @@ +# Evaluation Framework Requirements +# Python 3.11+ + +# HTTP client +httpx>=0.27.0 + +# Data analysis +pandas>=2.2.0 +numpy>=1.26.0 + +# Statistics +scipy>=1.13.0 + +# CLI and output +rich>=13.7.0 +typer>=0.9.0 + +# JSON handling +pydantic>=2.6.0 + +# Async support +asyncio>=3.4.3 + +# Environment variables +python-dotenv>=1.0.0 diff --git a/proxy/evaluation/run_evaluation.py b/proxy/evaluation/run_evaluation.py new file mode 100755 index 0000000..cb48642 --- /dev/null +++ b/proxy/evaluation/run_evaluation.py @@ -0,0 +1,325 @@ +#!/usr/bin/env python3 +"""Z.AI Proxy Evaluation Framework - CLI Entry Point + +Compares token counts from z.ai proxy with Anthropic API responses. 
+""" + +import argparse +import json +import os +import sys +from datetime import datetime +from pathlib import Path + +# Add evaluation package to path +sys.path.insert(0, str(Path(__file__).parent)) + +from zai_eval.client import DualClient +from zai_eval.test_cases import get_test_cases +from zai_eval.models import EvaluationResult, EvaluationReport + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Evaluate z.ai proxy token counting against Anthropic API" + ) + parser.add_argument( + "--proxy-url", + default=os.getenv("ZAI_PROXY_URL", "http://localhost:8080"), + help="Z.AI proxy URL (default: from ZAI_PROXY_URL or http://localhost:8080)" + ) + parser.add_argument( + "--proxy-key", + default=os.getenv("ZAI_API_KEY"), + help="Z.AI API key (default: from ZAI_API_KEY)" + ) + parser.add_argument( + "--anthropic-key", + default=os.getenv("ANTHROPIC_API_KEY"), + help="Anthropic API key (default: from ANTHROPIC_API_KEY)" + ) + parser.add_argument( + "--output-dir", + default="evaluation/results", + help="Output directory for reports (default: evaluation/results)" + ) + parser.add_argument( + "--test-name", + help="Run only a specific test case by name" + ) + parser.add_argument( + "--verbose", "-v", + action="store_true", + help="Enable verbose output" + ) + return parser.parse_args() + + +def run_evaluation(args): + """Run the evaluation suite.""" + # Validate required parameters + if not args.proxy_key: + print("Error: Z.AI API key required. Set ZAI_API_KEY or use --proxy-key") + sys.exit(1) + if not args.anthropic_key: + print("Error: Anthropic API key required. 
Set ANTHROPIC_API_KEY or use --anthropic-key") + sys.exit(1) + + print("=" * 70) + print("Z.AI Proxy Evaluation Framework") + print("=" * 70) + print(f"Proxy URL: {args.proxy_url}") + print(f"Anthropic API: https://api.anthropic.com") + print(f"Output Directory: {args.output_dir}") + print() + + # Create output directory + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + # Get test cases + all_tests = get_test_cases() + if args.test_name: + tests = [t for t in all_tests if t.name == args.test_name] + if not tests: + print(f"Error: Test case '{args.test_name}' not found") + print(f"Available tests: {', '.join(t.name for t in all_tests)}") + sys.exit(1) + else: + tests = all_tests + + print(f"Running {len(tests)} test case(s)...") + print() + + # Create dual client + client = DualClient(args.proxy_url, args.proxy_key, args.anthropic_key) + + # Run evaluation + results = [] + for i, test in enumerate(tests, 1): + print(f"[{i}/{len(tests)}] {test.name}: {test.description}") + if args.verbose: + print(f" Model: {test.model}") + print(f" Max tokens: {test.max_tokens}") + print(f" Stream: {test.stream}") + + # Execute parallel requests + proxy_resp, anthropic_resp = client.evaluate_request( + model=test.model, + messages=test.messages, + max_tokens=test.max_tokens, + stream=test.stream, + temperature=test.temperature, + ) + + # Create result + result = EvaluationResult( + request_name=test.name, + proxy_response=proxy_resp, + anthropic_response=anthropic_resp, + input_match=False, + output_match=False, + total_match=False, + ) + result.calculate_metrics() + + results.append(result) + + # Show result + status = "✓" if (proxy_resp.status_code == 200 and anthropic_resp.status_code == 200) else "✗" + print(f" Status: {status}") + print(f" Proxy: {proxy_resp.status_code} | " + f"In: {proxy_resp.input_tokens or 'N/A':>4} | " + f"Out: {proxy_resp.output_tokens or 'N/A':>4} | " + f"Latency: {proxy_resp.latency_ms:.0f}ms") + print(f" 
Anthropic: {anthropic_resp.status_code} | " + f"In: {anthropic_resp.input_tokens or 'N/A':>4} | " + f"Out: {anthropic_resp.output_tokens or 'N/A':>4} | " + f"Latency: {anthropic_resp.latency_ms:.0f}ms") + + if proxy_resp.status_code == 200 and anthropic_resp.status_code == 200: + match_indicator = "✓" if result.input_match else "✗" + print(f" Input match: {match_indicator} (diff: {result.input_diff}, {result.input_pct_diff:.1f}%)") + match_indicator = "✓" if result.output_match else "✗" + print(f" Output match: {match_indicator} (diff: {result.output_diff}, {result.output_pct_diff:.1f}%)") + elif proxy_resp.error: + print(f" Proxy error: {proxy_resp.error}") + elif anthropic_resp.error: + print(f" Anthropic error: {anthropic_resp.error}") + print() + + # Generate report + print("Generating report...") + report = EvaluationReport( + total_requests=len(tests), + successful_requests=0, + failed_requests=0, + results=results, + ) + report.calculate_summary_metrics() + + # Print summary + print() + print("=" * 70) + print("EVALUATION SUMMARY") + print("=" * 70) + print(f"Total tests: {report.total_requests}") + print(f"Successful: {report.successful_requests}") + print(f"Failed: {report.failed_requests}") + print() + print("Token Accuracy:") + print(f" Input tokens: {report.input_token_accuracy:.1f}%") + print(f" Output tokens: {report.output_token_accuracy:.1f}%") + print(f" Overall: {report.overall_accuracy:.1f}%") + print() + print("Mean Absolute Error:") + print(f" Input tokens: {report.input_mae:.2f}") + print(f" Output tokens: {report.output_mae:.2f}") + print(f" Total tokens: {report.total_mae:.2f}") + print() + print("Systematic Bias:") + print(f" Input bias: {report.input_bias_mean:+.2f} (positive = proxy overcounts)") + print(f" Output bias: {report.output_bias_mean:+.2f} (positive = proxy overcounts)") + print() + print("Latency:") + print(f" Avg proxy: {report.avg_proxy_latency_ms:.0f}ms") + print(f" Avg Anthropic: {report.avg_anthropic_latency_ms:.0f}ms") + 
print() + + # Save reports + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # JSON report + json_file = output_dir / f"evaluation_report_{timestamp}.json" + with open(json_file, "w") as f: + json.dump(report.dict(), f, indent=2, default=str) + print(f"✓ JSON report saved: {json_file}") + + # Text report + text_file = output_dir / f"evaluation_report_{timestamp}.txt" + with open(text_file, "w") as f: + f.write(generate_text_report(report)) + print(f"✓ Text report saved: {text_file}") + + # Analysis + print() + print("=" * 70) + print("ANALYSIS") + print("=" * 70) + print(generate_analysis(report)) + + return report + + +def generate_text_report(report: EvaluationReport) -> str: + """Generate a detailed text report.""" + lines = [ + "Z.AI Proxy Evaluation Report", + "=" * 70, + f"Generated: {report.timestamp.isoformat()}", + "", + "EXECUTIVE SUMMARY", + "-" * 70, + f"Total Tests: {report.total_requests}", + f"Successful: {report.successful_requests}", + f"Failed: {report.failed_requests}", + "", + "TOKEN ACCURACY METRICS", + "-" * 70, + f"Input Token Accuracy: {report.input_token_accuracy:.1f}%", + f"Output Token Accuracy: {report.output_token_accuracy:.1f}%", + f"Overall Accuracy: {report.overall_accuracy:.1f}%", + "", + "MEAN ABSOLUTE ERROR", + "-" * 70, + f"Input MAE: {report.input_mae:.2f} tokens", + f"Output MAE: {report.output_mae:.2f} tokens", + f"Total MAE: {report.total_mae:.2f} tokens", + "", + "SYSTEMATIC BIAS", + "-" * 70, + f"Input Bias: {report.input_bias_mean:+.2f} (positive = proxy overcounts)", + f"Output Bias: {report.output_bias_mean:+.2f} (positive = proxy overcounts)", + "", + "LATENCY", + "-" * 70, + f"Avg Proxy Latency: {report.avg_proxy_latency_ms:.0f}ms", + f"Avg Anthropic Latency: {report.avg_anthropic_latency_ms:.0f}ms", + "", + "DETAILED RESULTS", + "-" * 70, + ] + + for result in report.results: + lines.extend([ + "", + f"Test: {result.request_name}", + f" Proxy: Status={result.proxy_response.status_code} | " + 
f"In={result.proxy_response.input_tokens or 'N/A':>4} | " + f"Out={result.proxy_response.output_tokens or 'N/A':>4}", + f" Anthropic: Status={result.anthropic_response.status_code} | " + f"In={result.anthropic_response.input_tokens or 'N/A':>4} | " + f"Out={result.anthropic_response.output_tokens or 'N/A':>4}", + f" Match: Input={result.input_match} | Output={result.output_match}", + f" Diff: Input={result.input_diff} ({result.input_pct_diff:.1f}%) | " + f"Output={result.output_diff} ({result.output_pct_diff:.1f}%)", + ]) + + lines.extend(["", "", "ANALYSIS", "-" * 70]) + lines.append(generate_analysis(report)) + + return "\n".join(lines) + + +def generate_analysis(report: EvaluationReport) -> str: + """Generate analysis and recommendations.""" + lines = [] + + # Token accuracy assessment + if report.input_token_accuracy >= 95: + lines.append("✓ Input token counting is excellent (≥95% accuracy)") + elif report.input_token_accuracy >= 80: + lines.append("⚠ Input token counting needs attention (80-95% accuracy)") + else: + lines.append("✗ Input token counting has significant issues (<80% accuracy)") + + if report.output_token_accuracy >= 95: + lines.append("✓ Output token counting is excellent (≥95% accuracy)") + elif report.output_token_accuracy >= 80: + lines.append("⚠ Output token counting needs attention (80-95% accuracy)") + else: + lines.append("✗ Output token counting has significant issues (<80% accuracy)") + + # MAE assessment + if report.input_mae > 10: + lines.append(f"⚠ High input token MAE ({report.input_mae:.2f}) - review tokenizer configuration") + if report.output_mae > 10: + lines.append(f"⚠ High output token MAE ({report.output_mae:.2f}) - check SSE parsing logic") + + # Bias analysis + if abs(report.input_bias_mean) > 5: + direction = "overcounts" if report.input_bias_mean > 0 else "undercounts" + lines.append(f"⚠ Proxy consistently {direction} input tokens by {abs(report.input_bias_mean):.2f} on average") + if abs(report.output_bias_mean) > 5: + 
direction = "overcounts" if report.output_bias_mean > 0 else "undercounts" + lines.append(f"⚠ Proxy consistently {direction} output tokens by {abs(report.output_bias_mean):.2f} on average") + + # Pattern analysis + input_matches = sum(1 for r in report.results if r.input_match) + output_matches = sum(1 for r in report.results if r.output_match) + + if input_matches == 0: + lines.append("⚠ No exact input token matches found - systematic difference detected") + if output_matches == 0: + lines.append("⚠ No exact output token matches found - systematic difference detected") + + return "\n".join(lines) + + +if __name__ == "__main__": + args = parse_args() + report = run_evaluation(args) + + # Exit with error if any tests failed + if report.failed_requests > 0: + sys.exit(1) diff --git a/proxy/evaluation/test_cases.go b/proxy/evaluation/test_cases.go new file mode 100644 index 0000000..8062538 --- /dev/null +++ b/proxy/evaluation/test_cases.go @@ -0,0 +1,166 @@ +package evaluation + +// GetTestCases returns a diverse set of test cases for evaluation +func GetTestCases() []TestRequest { + return []TestRequest{ + { + Name: "Simple greeting", + Request: ClaudeRequest{ + Model: "claude-3-sonnet-20240229", + MaxTokens: 100, + Messages: []Message{ + {Role: "user", Content: "Hello! 
How are you?"}, + }, + }, + Stream: false, + }, + { + Name: "Code generation request", + Request: ClaudeRequest{ + Model: "claude-3-sonnet-20240229", + MaxTokens: 500, + Messages: []Message{ + {Role: "user", Content: "Write a Python function to calculate fibonacci numbers"}, + }, + }, + Stream: false, + }, + { + Name: "Multi-turn conversation", + Request: ClaudeRequest{ + Model: "claude-3-sonnet-20240229", + MaxTokens: 300, + Messages: []Message{ + {Role: "user", Content: "What is the capital of France?"}, + {Role: "assistant", Content: "The capital of France is Paris."}, + {Role: "user", Content: "What is its population?"}, + }, + }, + Stream: false, + }, + { + Name: "Long context input", + Request: ClaudeRequest{ + Model: "claude-3-sonnet-20240229", + MaxTokens: 200, + Messages: []Message{ + {Role: "user", Content: generateLongText(500)}, + }, + }, + Stream: false, + }, + { + Name: "JSON response request", + Request: ClaudeRequest{ + Model: "claude-3-sonnet-20240229", + MaxTokens: 300, + Messages: []Message{ + {Role: "user", Content: "List 5 colors in JSON format with their hex codes"}, + }, + }, + Stream: false, + }, + { + Name: "Streaming response", + Request: ClaudeRequest{ + Model: "claude-3-sonnet-20240229", + MaxTokens: 200, + Messages: []Message{ + {Role: "user", Content: "Tell me a short story about a robot"}, + }, + }, + Stream: true, + }, + { + Name: "Technical documentation", + Request: ClaudeRequest{ + Model: "claude-3-sonnet-20240229", + MaxTokens: 400, + Messages: []Message{ + {Role: "user", Content: "Explain the concept of recursion in computer science with examples"}, + }, + }, + Stream: false, + }, + { + Name: "Creative writing", + Request: ClaudeRequest{ + Model: "claude-3-sonnet-20240229", + MaxTokens: 350, + Messages: []Message{ + {Role: "user", Content: "Write a haiku about cloud computing"}, + }, + }, + Stream: false, + }, + { + Name: "Data analysis request", + Request: ClaudeRequest{ + Model: "claude-3-sonnet-20240229", + MaxTokens: 400, + 
Messages: []Message{ + {Role: "user", Content: "Analyze the pros and cons of microservices vs monolithic architecture"}, + }, + }, + Stream: false, + }, + { + Name: "Short response", + Request: ClaudeRequest{ + Model: "claude-3-sonnet-20240229", + MaxTokens: 50, + Messages: []Message{ + {Role: "user", Content: "What is 2+2?"}, + }, + }, + Stream: false, + }, + { + Name: "Medium response", + Request: ClaudeRequest{ + Model: "claude-3-sonnet-20240229", + MaxTokens: 300, + Messages: []Message{ + {Role: "user", Content: "Explain the difference between TCP and UDP"}, + }, + }, + Stream: false, + }, + { + Name: "List generation", + Request: ClaudeRequest{ + Model: "claude-3-sonnet-20240229", + MaxTokens: 250, + Messages: []Message{ + {Role: "user", Content: "List 10 common programming paradigms with brief descriptions"}, + }, + }, + Stream: false, + }, + { + Name: "Streaming long response", + Request: ClaudeRequest{ + Model: "claude-3-sonnet-20240229", + MaxTokens: 500, + Messages: []Message{ + {Role: "user", Content: "Write a detailed explanation of how HTTP works, including methods, headers, and status codes"}, + }, + }, + Stream: true, + }, + } +} + +// generateLongText generates repetitive text for testing long inputs +func generateLongText(words int) string { + baseText := "This is a test sentence with multiple words for token counting purposes. " + result := "" + for len(result) < words*5 { // Approximate 5 chars per word + result += baseText + } + maxLen := words * 5 + if maxLen > len(result) { + maxLen = len(result) + } + return result[:maxLen] +} diff --git a/proxy/evaluation/zai_eval/__init__.py b/proxy/evaluation/zai_eval/__init__.py new file mode 100644 index 0000000..ec0e958 --- /dev/null +++ b/proxy/evaluation/zai_eval/__init__.py @@ -0,0 +1,6 @@ +"""Z.AI Proxy Evaluation Framework + +Compares token counts from z.ai proxy with Anthropic API responses. 
+""" + +__version__ = "0.1.0" diff --git a/proxy/evaluation/zai_eval/cli.py b/proxy/evaluation/zai_eval/cli.py new file mode 100644 index 0000000..c9f89fe --- /dev/null +++ b/proxy/evaluation/zai_eval/cli.py @@ -0,0 +1,222 @@ +"""CLI interface for evaluation framework.""" + +import os +import sys +from pathlib import Path +from typing import Optional + +import typer +from rich.console import Console +from dotenv import load_dotenv + +from zai_eval.client import DualClient +from zai_eval.test_cases import get_test_cases, get_test_case_by_name, TEST_CASES +from zai_eval.models import EvaluationResult +from zai_eval.metrics import calculate_metrics +from zai_eval.report import print_report, save_report_json, save_report_markdown + +app = typer.Typer(help="Z.AI Proxy Evaluation Framework") +console = Console() + + +def get_api_keys() -> tuple[str, str, str]: + """Get API keys from environment variables. + + Returns: + Tuple of (proxy_url, proxy_api_key, anthropic_api_key) + """ + load_dotenv() + + proxy_url = os.getenv("ZAI_PROXY_URL", "http://localhost:8080") + proxy_api_key = os.getenv("ZAI_API_KEY") + anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") + + if not proxy_api_key: + console.print("[red]Error: ZAI_API_KEY environment variable not set[/red]") + console.print("Set it with: export ZAI_API_KEY=your-key") + raise typer.Exit(1) + + if not anthropic_api_key: + console.print("[red]Error: ANTHROPIC_API_KEY environment variable not set[/red]") + console.print("Set it with: export ANTHROPIC_API_KEY=your-key") + raise typer.Exit(1) + + return proxy_url, proxy_api_key, anthropic_api_key + + +@app.command() +def list_tests(): + """List all available test cases.""" + console.print("\n[bold cyan]Available Test Cases[/bold cyan]\n") + + for i, test in enumerate(TEST_CASES, 1): + console.print(f"{i}. 
@app.command()
def run(
    test_name: Optional[str] = typer.Argument(None, help="Name of specific test to run"),
    output_dir: Optional[Path] = typer.Option(None, "--output", "-o", help="Output directory for reports"),
    json_output: bool = typer.Option(False, "--json", help="Save JSON report"),
    markdown_output: bool = typer.Option(False, "--markdown", help="Save Markdown report"),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"),
):
    """Run evaluation tests.

    If TEST_NAME is provided, run only that test. Otherwise run all tests.
    """
    proxy_url, proxy_api_key, anthropic_api_key = get_api_keys()

    # Select the test cases to run.
    if test_name:
        test_case = get_test_case_by_name(test_name)
        if not test_case:
            console.print(f"[red]Error: Test '{test_name}' not found[/red]")
            console.print("Use 'zai-eval list-tests' to see available tests")
            raise typer.Exit(1)
        tests_to_run = [test_case]
        console.print(f"[cyan]Running test: {test_name}[/cyan]\n")
    else:
        tests_to_run = get_test_cases()
        console.print(f"[cyan]Running {len(tests_to_run)} tests[/cyan]\n")

    client = DualClient(proxy_url, proxy_api_key, anthropic_api_key)

    results = []

    with console.status("[bold green]Running evaluation...") as status:
        for i, test in enumerate(tests_to_run, 1):
            status.update(f"[bold green]Running test {i}/{len(tests_to_run)}: {test.name}[/bold green]")

            if verbose:
                console.print(f"\n[yellow]Test: {test.name}[/yellow]")
                console.print(f" Description: {test.description}")

            proxy_response, anthropic_response = client.evaluate_request(
                model=test.model,
                messages=test.messages,
                max_tokens=test.max_tokens,
                stream=test.stream,
                temperature=test.temperature,
            )

            # The *_match fields are declared on the model without defaults, so
            # they must be supplied at construction time; calculate_metrics()
            # overwrites these placeholders with the real comparison.
            result = EvaluationResult(
                request_name=test.name,
                proxy_response=proxy_response,
                anthropic_response=anthropic_response,
                input_match=False,
                output_match=False,
                total_match=False,
            )
            result.calculate_metrics()
            results.append(result)

            if verbose:
                if proxy_response.error:
                    console.print(f" [red]Proxy error: {proxy_response.error}[/red]")
                if anthropic_response.error:
                    console.print(f" [red]Anthropic error: {anthropic_response.error}[/red]")
                console.print(f" Proxy: {proxy_response.input_tokens}/{proxy_response.output_tokens}")
                console.print(f" Anthropic: {anthropic_response.input_tokens}/{anthropic_response.output_tokens}")
                console.print(f" Diff: {result.total_diff:+d} ({result.total_pct_diff:.1f}%)")

    report = calculate_metrics(results)
    print_report(console, report)

    # A report format requested without --output used to be dropped silently;
    # fall back to the current directory so the flag always has an effect.
    if output_dir is None and (json_output or markdown_output):
        output_dir = Path(".")

    if output_dir:
        output_dir.mkdir(parents=True, exist_ok=True)

        if json_output:
            json_path = output_dir / "evaluation_report.json"
            save_report_json(report, str(json_path))
            console.print(f"\n[green]JSON report saved to: {json_path}[/green]")

        if markdown_output:
            md_path = output_dir / "evaluation_report.md"
            save_report_markdown(report, str(md_path))
            console.print(f"[green]Markdown report saved to: {md_path}[/green]")

    # Non-zero exit status when any request failed on either endpoint.
    failed_count = sum(1 for r in results if r.proxy_response.error or r.anthropic_response.error)
    if failed_count > 0:
        raise typer.Exit(1)
@app.command()
def quick(
    prompt: str = typer.Argument(..., help="Prompt text to test"),
    model: str = typer.Option("claude-3-sonnet-20240229", "--model", "-m", help="Model to use"),
    max_tokens: int = typer.Option(100, "--max-tokens", help="Max tokens"),
):
    """Run a quick single-test evaluation with custom prompt."""
    proxy_url, proxy_api_key, anthropic_api_key = get_api_keys()

    console.print(f"[cyan]Quick test with model: {model}[/cyan]\n")
    # Only the first 100 characters of the prompt are echoed.
    console.print(f"Prompt: {prompt[:100]}{'...' if len(prompt) > 100 else ''}\n")

    client = DualClient(proxy_url, proxy_api_key, anthropic_api_key)

    messages = [{"role": "user", "content": prompt}]

    proxy_response, anthropic_response = client.evaluate_request(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
    )

    console.print("\n[bold]Results:[/bold]")
    console.print(f"Proxy: In={proxy_response.input_tokens or 0}, Out={proxy_response.output_tokens or 0}")
    console.print(f"Anthropic: In={anthropic_response.input_tokens or 0}, Out={anthropic_response.output_tokens or 0}")

    if proxy_response.error:
        console.print(f"[red]Proxy error: {proxy_response.error}[/red]")
    if anthropic_response.error:
        console.print(f"[red]Anthropic error: {anthropic_response.error}[/red]")


@app.command()
def validate():
    """Validate that both endpoints are accessible."""
    proxy_url, proxy_api_key, anthropic_api_key = get_api_keys()

    console.print("[cyan]Validating endpoints...[/cyan]\n")

    client = DualClient(proxy_url, proxy_api_key, anthropic_api_key)

    # evaluate_request() already contacts both endpoints, so one call is enough.
    # Calling it once per endpoint and discarding half of each result doubled
    # the number of API requests.
    proxy_resp, anthropic_resp = client.evaluate_request(
        model="claude-3-sonnet-20240229",
        messages=[{"role": "user", "content": "test"}],
        max_tokens=10,
    )

    all_ok = True
    checks = (
        ("Testing Z.AI proxy...", proxy_resp),
        ("Testing Anthropic API...", anthropic_resp),
    )
    for label, resp in checks:
        console.print(label)
        # Treat an HTTP error status as a failure even if the client did not
        # fill in `error` (e.g. a 401 from a bad key).
        failure = resp.error or (f"HTTP {resp.status_code}" if resp.status_code >= 400 else None)
        if failure:
            all_ok = False
            console.print(f" [red]✗ Failed: {failure}[/red]")
        else:
            console.print(f" [green]✓ OK[/green] (status: {resp.status_code})")

    console.print()

    # Same convention as `run`: exit status 1 when anything failed.
    if not all_ok:
        raise typer.Exit(1)


if __name__ == "__main__":
    app()
def _to_int(value) -> Optional[int]:
    """Coerce a header string or JSON number to int.

    Returns None when the value is absent or not numeric. A literal 0 is
    preserved rather than being treated as "missing".
    """
    if value is None:
        return None
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


def _usage_from_body(response) -> Optional[tuple]:
    """Return (input_tokens, output_tokens) from the JSON body's "usage" object.

    Returns None when the body is not JSON or carries no usable "usage" object.
    """
    try:
        data = response.json()
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    usage = data.get("usage") if isinstance(data, dict) else None
    if not isinstance(usage, dict):
        return None
    return _to_int(usage.get("input_tokens")), _to_int(usage.get("output_tokens"))


def _http_error(response) -> Optional[str]:
    """Describe a 4xx/5xx response as an error string; None for other statuses."""
    if response.status_code < 400:
        return None
    detail = response.text.strip()[:200]
    if detail:
        return f"HTTP {response.status_code}: {detail}"
    return f"HTTP {response.status_code}"


class ProxyClient:
    """Client for z.ai proxy."""

    def __init__(self, base_url: str, api_key: str):
        self.base_url = base_url.rstrip("/")
        self.api_key = api_key
        self.client = httpx.Client(timeout=300.0)

    def make_request(
        self,
        model: str,
        messages: list[dict],
        max_tokens: int = 100,
        stream: bool = False,
        temperature: Optional[float] = None,
    ) -> "ProxyResponse":
        """POST one request to `{base_url}/v1/messages` and extract token usage.

        Token counts come from the `X-Token-Input` / `X-Token-Output` /
        `X-Token-Total` response headers. When the input header is missing or
        unusable, the JSON body's `usage` object is used instead.

        Returns:
            ProxyResponse. A transport failure is reported with status_code 0
            and `error` set; an HTTP 4xx/5xx reply also sets `error`.
        """
        from zai_eval.models import ProxyResponse

        start_time = time.time()

        headers = {
            "Content-Type": "application/json",
            "x-api-key": self.api_key,
            "anthropic-version": "2023-06-01",
        }

        payload = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "stream": stream,
        }

        if temperature is not None:
            payload["temperature"] = temperature

        try:
            response = self.client.post(
                f"{self.base_url}/v1/messages",
                headers=headers,
                json=payload,
            )
        except httpx.HTTPError as e:
            return ProxyResponse(
                status_code=0,
                error=str(e),
                latency_ms=(time.time() - start_time) * 1000,
            )

        latency_ms = (time.time() - start_time) * 1000

        # Headers first.
        input_tokens = _to_int(response.headers.get("X-Token-Input"))
        output_tokens = _to_int(response.headers.get("X-Token-Output"))
        total_tokens = _to_int(response.headers.get("X-Token-Total"))

        # Fall back to the response body when the input header gave nothing.
        if input_tokens is None:
            usage = _usage_from_body(response)
            if usage is not None:
                input_tokens, output_tokens = usage

        # Derive the total when the proxy did not send it but both parts are known.
        if total_tokens is None and input_tokens is not None and output_tokens is not None:
            total_tokens = input_tokens + output_tokens

        return ProxyResponse(
            status_code=response.status_code,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            total_tokens=total_tokens,
            usage_header=response.headers.get("X-Token-Usage"),
            error=_http_error(response),
            latency_ms=latency_ms,
        )


class AnthropicClient:
    """Client for Anthropic API."""

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.client = httpx.Client(
            base_url="https://api.anthropic.com",
            timeout=300.0,
        )

    def make_request(
        self,
        model: str,
        messages: list[dict],
        max_tokens: int = 100,
        stream: bool = False,
        temperature: Optional[float] = None,
    ) -> "AnthropicResponse":
        """POST one request to `/v1/messages` on the Anthropic API and extract token usage.

        Token counts are read from the JSON body's `usage` object, and only
        for a 200 reply.

        Returns:
            AnthropicResponse. A transport failure is reported with
            status_code 0 and `error` set; an HTTP 4xx/5xx reply also sets
            `error`.
        """
        from zai_eval.models import AnthropicResponse

        start_time = time.time()

        headers = {
            "Content-Type": "application/json",
            "x-api-key": self.api_key,
            "anthropic-version": "2023-06-01",
        }

        payload = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "stream": stream,
        }

        if temperature is not None:
            payload["temperature"] = temperature

        try:
            response = self.client.post(
                "/v1/messages",
                headers=headers,
                json=payload,
            )
        except httpx.HTTPError as e:
            return AnthropicResponse(
                status_code=0,
                error=str(e),
                latency_ms=(time.time() - start_time) * 1000,
            )

        latency_ms = (time.time() - start_time) * 1000

        input_tokens = None
        output_tokens = None

        if response.status_code == 200:
            usage = _usage_from_body(response)
            if usage is not None:
                input_tokens, output_tokens = usage

        return AnthropicResponse(
            status_code=response.status_code,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            # Missing counts contribute 0 to the total, as before.
            total_tokens=(input_tokens or 0) + (output_tokens or 0),
            error=_http_error(response),
            latency_ms=latency_ms,
        )
class DualClient:
    """Client that makes parallel requests to both proxy and Anthropic."""

    def __init__(self, proxy_url: str, proxy_api_key: str, anthropic_api_key: str):
        self.proxy = ProxyClient(proxy_url, proxy_api_key)
        self.anthropic = AnthropicClient(anthropic_api_key)

    def evaluate_request(
        self,
        model: str,
        messages: list[dict],
        max_tokens: int = 100,
        stream: bool = False,
        temperature: Optional[float] = None,
    ) -> tuple:
        """Send the same request to both endpoints at the same time.

        Each endpoint is called on its own worker thread, and the call blocks
        until both have returned.

        Returns:
            Tuple of (proxy_response, anthropic_response)
        """
        import concurrent.futures

        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
            proxy_future = executor.submit(
                self.proxy.make_request,
                model,
                messages,
                max_tokens,
                stream,
                temperature,
            )
            anthropic_future = executor.submit(
                self.anthropic.make_request,
                model,
                messages,
                max_tokens,
                stream,
                temperature,
            )

            proxy_response = proxy_future.result()
            anthropic_response = anthropic_future.result()

        return proxy_response, anthropic_response


def calculate_metrics(results: List[EvaluationResult]) -> EvaluationReport:
    """Calculate comprehensive metrics from evaluation results.

    Args:
        results: List of evaluation results

    Returns:
        EvaluationReport with calculated metrics
    """
    # successful_requests / failed_requests are declared on the model without
    # defaults, so they are passed explicitly here; calculate_summary_metrics()
    # replaces them with the real counts whenever `results` is non-empty.
    report = EvaluationReport(
        total_requests=len(results),
        successful_requests=0,
        failed_requests=0,
        results=results,
    )
    report.calculate_summary_metrics()
    return report
def calculate_advanced_metrics(results: "List[EvaluationResult]") -> dict:
    """Calculate advanced statistical metrics.

    Only results where neither endpoint reported an error are considered.
    Spread statistics (std, median, percentiles, max) are computed on each
    result's `input_diff` / `output_diff` / `total_diff` attributes. The
    t-tests are computed on signed proxy-minus-Anthropic token differences,
    because a test of "mean differs from zero" is meaningless on values that
    cannot be negative.

    Args:
        results: List of evaluation results

    Returns:
        Dictionary with advanced metrics, or {} when no result succeeded.
        `input_ttest` / `output_ttest` are None with fewer than two samples.
    """
    successful = [r for r in results if not r.proxy_response.error and not r.anthropic_response.error]

    if not successful:
        return {}

    input_diffs = [r.input_diff for r in successful]
    output_diffs = [r.output_diff for r in successful]
    total_diffs = [r.total_diff for r in successful]

    proxy_in = [r.proxy_response.input_tokens or 0 for r in successful]
    anthropic_in = [r.anthropic_response.input_tokens or 0 for r in successful]
    proxy_out = [r.proxy_response.output_tokens or 0 for r in successful]
    anthropic_out = [r.anthropic_response.output_tokens or 0 for r in successful]

    signed_input = [p - a for p, a in zip(proxy_in, anthropic_in)]
    signed_output = [p - a for p, a in zip(proxy_out, anthropic_out)]

    def ttest_pvalue(samples):
        # A one-sample t-test needs at least two observations.
        if len(samples) < 2:
            return None
        return float(stats.ttest_1samp(samples, 0).pvalue)

    return {
        # Standard deviation
        "input_diff_std": float(np.std(input_diffs)),
        "output_diff_std": float(np.std(output_diffs)),
        "total_diff_std": float(np.std(total_diffs)),
        # Median
        "input_diff_median": float(np.median(input_diffs)),
        "output_diff_median": float(np.median(output_diffs)),
        "total_diff_median": float(np.median(total_diffs)),
        # Percentiles (the output_* keys are read by the report printer)
        "input_diff_75th": float(np.percentile(input_diffs, 75)),
        "input_diff_95th": float(np.percentile(input_diffs, 95)),
        "input_diff_99th": float(np.percentile(input_diffs, 99)),
        "output_diff_75th": float(np.percentile(output_diffs, 75)),
        "output_diff_95th": float(np.percentile(output_diffs, 95)),
        "output_diff_99th": float(np.percentile(output_diffs, 99)),
        # Max errors
        "input_diff_max": max(input_diffs),
        "output_diff_max": max(output_diffs),
        # Correlation between proxy and Anthropic *input* token counts
        # (the key name is kept for compatibility).
        "input_output_correlation": (
            float(np.corrcoef(proxy_in, anthropic_in)[0, 1]) if len(successful) > 1 else 0
        ),
        # Statistical significance tests (p-values) on the signed differences
        "input_ttest": ttest_pvalue(signed_input),
        "output_ttest": ttest_pvalue(signed_output),
    }
def detect_systematic_bias(results: "List[EvaluationResult]") -> dict:
    """Detect systematic biases in token counting.

    Differences are signed, proxy minus Anthropic: a positive value means the
    proxy reported more tokens. Only results where neither endpoint reported
    an error are considered.

    Args:
        results: List of evaluation results

    Returns:
        Dictionary with bias analysis, or {} when no result succeeded.
    """
    successful = [r for r in results if not r.proxy_response.error and not r.anthropic_response.error]

    if not successful:
        return {}

    input_diffs = [
        (r.proxy_response.input_tokens or 0) - (r.anthropic_response.input_tokens or 0)
        for r in successful
    ]
    output_diffs = [
        (r.proxy_response.output_tokens or 0) - (r.anthropic_response.output_tokens or 0)
        for r in successful
    ]
    pairs = list(zip(input_diffs, output_diffs))

    return {
        # Input bias
        "input_bias_mean": float(np.mean(input_diffs)),
        "input_bias_std": float(np.std(input_diffs)),
        "input_consistently_high": sum(1 for d in input_diffs if d > 0),
        "input_consistently_low": sum(1 for d in input_diffs if d < 0),
        # Output bias
        "output_bias_mean": float(np.mean(output_diffs)),
        "output_bias_std": float(np.std(output_diffs)),
        "output_consistently_high": sum(1 for d in output_diffs if d > 0),
        "output_consistently_low": sum(1 for d in output_diffs if d < 0),
        # Bias patterns
        "both_high": sum(1 for i, o in pairs if i > 0 and o > 0),
        "both_low": sum(1 for i, o in pairs if i < 0 and o < 0),
        # Mixed = the two counts err in opposite directions. An exact match on
        # either side is not "mixed"; the earlier `(i > 0) != (o > 0)` test
        # counted (over, exact) but not (exact, under).
        "mixed_bias": sum(1 for i, o in pairs if (i > 0 and o < 0) or (i < 0 and o > 0)),
    }


def calculate_accuracy_by_token_range(results: "List[EvaluationResult]") -> dict:
    """Calculate accuracy metrics grouped by token count ranges.

    Each successful result is bucketed by Anthropic's input + output token
    count, and its `total_diff` is recorded in that bucket.

    Args:
        results: List of evaluation results

    Returns:
        Dictionary mapping each range label to `count`, `mae` (mean of the
        recorded diffs) and `max_error`; {} when no result succeeded.
    """
    successful = [r for r in results if not r.proxy_response.error and not r.anthropic_response.error]

    if not successful:
        return {}

    ranges = {
        "small (0-100)": [],
        "medium (101-500)": [],
        "large (501-1000)": [],
        "xlarge (1000+)": [],
    }

    for r in successful:
        total = (r.anthropic_response.input_tokens or 0) + (r.anthropic_response.output_tokens or 0)
        if total <= 100:
            ranges["small (0-100)"].append(r.total_diff)
        elif total <= 500:
            ranges["medium (101-500)"].append(r.total_diff)
        elif total <= 1000:
            ranges["large (501-1000)"].append(r.total_diff)
        else:
            ranges["xlarge (1000+)"].append(r.total_diff)

    return {
        range_name: {
            "count": len(diffs),
            "mae": float(np.mean(diffs)) if diffs else 0,
            "max_error": max(diffs) if diffs else 0,
        }
        for range_name, diffs in ranges.items()
    }
class TokenUsage(BaseModel):
    """Token usage from API response."""

    input_tokens: int
    output_tokens: int
    # 0 means "not supplied"; it is then filled in as input + output.
    total_tokens: int = 0

    def __init__(self, **data):
        """Validate the fields, then derive total_tokens when it was left at 0.

        This replaces a `__post_init__` method: that hook belongs to
        dataclasses and is never called for a `BaseModel` subclass, so the
        total was never computed.
        """
        super().__init__(**data)
        if self.total_tokens == 0:
            self.total_tokens = self.input_tokens + self.output_tokens


class EvaluationRequest(BaseModel):
    """A single evaluation request configuration."""

    name: str
    description: str
    model: str
    max_tokens: int
    messages: list[dict]
    stream: bool = False
    temperature: Optional[float] = None
    metadata: dict = Field(default_factory=dict)
class ProxyResponse(BaseModel):
    """Response from proxy endpoint."""

    status_code: int
    input_tokens: Optional[int] = None
    output_tokens: Optional[int] = None
    total_tokens: Optional[int] = None
    usage_header: Optional[str] = None
    error: Optional[str] = None
    latency_ms: float = 0


class AnthropicResponse(BaseModel):
    """Response from Anthropic API."""

    status_code: int
    input_tokens: Optional[int] = None
    output_tokens: Optional[int] = None
    total_tokens: Optional[int] = None
    error: Optional[str] = None
    latency_ms: float = 0


class EvaluationResult(BaseModel):
    """Result of comparing proxy vs Anthropic.

    Only `request_name` and the two responses are supplied by the caller;
    every other field is filled in by `calculate_metrics()`.
    """

    request_name: str
    proxy_response: ProxyResponse
    anthropic_response: AnthropicResponse

    # Token count comparisons. These default to False so the model can be
    # built from the three inputs alone; without defaults they are required
    # fields and construction fails validation.
    input_match: bool = False
    output_match: bool = False
    total_match: bool = False

    # Absolute differences (always >= 0)
    input_diff: int = 0
    output_diff: int = 0
    total_diff: int = 0

    # Differences as a percentage of the Anthropic count
    input_pct_diff: float = 0.0
    output_pct_diff: float = 0.0
    total_pct_diff: float = 0.0

    # Differences as a fraction of the Anthropic count (denominator floored at 1)
    input_error_rate: float = 0.0
    output_error_rate: float = 0.0

    timestamp: datetime = Field(default_factory=datetime.utcnow)

    def calculate_metrics(self) -> None:
        """Calculate comparison metrics.

        Missing token counts (None) are treated as 0.
        """
        p_in = self.proxy_response.input_tokens or 0
        p_out = self.proxy_response.output_tokens or 0
        a_in = self.anthropic_response.input_tokens or 0
        a_out = self.anthropic_response.output_tokens or 0

        # Absolute differences
        self.input_diff = abs(p_in - a_in)
        self.output_diff = abs(p_out - a_out)
        self.total_diff = abs((p_in + p_out) - (a_in + a_out))

        # Percentage differences; left at their previous value when the
        # Anthropic count is 0.
        if a_in > 0:
            self.input_pct_diff = (self.input_diff / a_in) * 100
        if a_out > 0:
            self.output_pct_diff = (self.output_diff / a_out) * 100

        total_a = a_in + a_out
        if total_a > 0:
            self.total_pct_diff = (self.total_diff / total_a) * 100

        # Error rates
        self.input_error_rate = abs(p_in - a_in) / max(a_in, 1)
        self.output_error_rate = abs(p_out - a_out) / max(a_out, 1)

        # Exact matches
        self.input_match = p_in == a_in
        self.output_match = p_out == a_out
        self.total_match = (p_in + p_out) == (a_in + a_out)


class EvaluationReport(BaseModel):
    """Summary report of all evaluation results."""

    total_requests: int
    # Default to 0 so a report can be created from `total_requests` and
    # `results` alone; calculate_summary_metrics() sets the real counts.
    successful_requests: int = 0
    failed_requests: int = 0

    # Share of successful requests with an exact match, in percent
    input_token_accuracy: float = 0.0
    output_token_accuracy: float = 0.0
    overall_accuracy: float = 0.0

    # Mean Absolute Error
    input_mae: float = 0.0
    output_mae: float = 0.0
    total_mae: float = 0.0

    # Mean of the per-result percentage differences
    input_mpe: float = 0.0
    output_mpe: float = 0.0
    total_mpe: float = 0.0

    # Statistics
    results: list[EvaluationResult] = Field(default_factory=list)

    # Systematic biases (signed, proxy minus Anthropic)
    input_bias_mean: float = 0.0
    output_bias_mean: float = 0.0

    # Latency comparison
    avg_proxy_latency_ms: float = 0.0
    avg_anthropic_latency_ms: float = 0.0

    timestamp: datetime = Field(default_factory=datetime.utcnow)

    def calculate_summary_metrics(self) -> None:
        """Calculate summary statistics from all results.

        A result counts as successful when neither response has `error` set.
        All averages are taken over successful results only. Nothing is
        changed when `results` is empty.
        """
        if not self.results:
            return

        successful = [r for r in self.results if not r.proxy_response.error and not r.anthropic_response.error]
        self.successful_requests = len(successful)
        self.failed_requests = len(self.results) - len(successful)

        if not successful:
            return

        count = len(successful)

        # Accuracy
        input_matches = sum(1 for r in successful if r.input_match)
        output_matches = sum(1 for r in successful if r.output_match)
        total_matches = sum(1 for r in successful if r.total_match)

        self.input_token_accuracy = (input_matches / count) * 100
        self.output_token_accuracy = (output_matches / count) * 100
        self.overall_accuracy = (total_matches / count) * 100

        # Mean Absolute Error
        self.input_mae = sum(r.input_diff for r in successful) / count
        self.output_mae = sum(r.output_diff for r in successful) / count
        self.total_mae = sum(r.total_diff for r in successful) / count

        # Mean Percentage Error
        self.input_mpe = sum(r.input_pct_diff for r in successful) / count
        self.output_mpe = sum(r.output_pct_diff for r in successful) / count
        self.total_mpe = sum(r.total_pct_diff for r in successful) / count

        # Systematic bias (positive = proxy overcounts, negative = proxy undercounts)
        input_diffs = [
            (r.proxy_response.input_tokens or 0) - (r.anthropic_response.input_tokens or 0)
            for r in successful
        ]
        output_diffs = [
            (r.proxy_response.output_tokens or 0) - (r.anthropic_response.output_tokens or 0)
            for r in successful
        ]

        self.input_bias_mean = sum(input_diffs) / count
        self.output_bias_mean = sum(output_diffs) / count

        # Latency
        self.avg_proxy_latency_ms = sum(r.proxy_response.latency_ms for r in successful) / count
        self.avg_anthropic_latency_ms = sum(r.anthropic_response.latency_ms for r in successful) / count
def print_report(console: Console, report: EvaluationReport) -> None:
    """Print comprehensive evaluation report to console.

    Prints a title panel, then each report section in a fixed order.

    Args:
        console: Rich console instance
        report: Evaluation report to print
    """
    console.print("\n")
    console.print(Panel.fit("Z.AI PROXY EVALUATION REPORT", style="bold cyan"))

    # Section printers, in display order.
    sections = (
        _print_summary,
        _print_accuracy,
        _print_error_metrics,
        _print_latency,
        _print_bias_analysis,
        _print_advanced_metrics,
        _print_detailed_results,
    )
    for render in sections:
        render(console, report)


def _print_summary(console: Console, report: EvaluationReport) -> None:
    """Print the request counts: total, successful and failed."""
    summary_lines = (
        "\n[bold cyan]Summary[/bold cyan]",
        f"Total Requests: {report.total_requests}",
        f"Successful: [green]{report.successful_requests}[/green]",
        f"Failed: [red]{report.failed_requests}[/red]",
    )
    for line in summary_lines:
        console.print(line)


def _print_accuracy(console: Console, report: EvaluationReport) -> None:
    """Print the exact-match accuracy table."""
    table = Table(title="Token Count Accuracy", show_header=True, header_style="bold magenta")
    table.add_column("Metric", style="cyan")
    table.add_column("Accuracy (%)")

    rows = (
        ("Input Token Accuracy", report.input_token_accuracy),
        ("Output Token Accuracy", report.output_token_accuracy),
        ("Overall Accuracy", report.overall_accuracy),
    )
    for label, accuracy in rows:
        table.add_row(label, f"{accuracy:.2f}%")

    console.print("\n")
    console.print(table)


def _print_error_metrics(console: Console, report: EvaluationReport) -> None:
    """Print the MAE / MPE table for input, output and total tokens."""
    table = Table(title="Mean Absolute Error (MAE)", show_header=True, header_style="bold magenta")
    for heading, options in (("Metric", {"style": "cyan"}), ("MAE (tokens)", {}), ("MPE (%)", {})):
        table.add_column(heading, **options)

    rows = (
        ("Input Tokens", report.input_mae, report.input_mpe),
        ("Output Tokens", report.output_mae, report.output_mpe),
        ("Total Tokens", report.total_mae, report.total_mpe),
    )
    for label, mae, mpe in rows:
        table.add_row(label, f"{mae:.2f}", f"{mpe:.2f}%")

    console.print("\n")
    console.print(table)
def _print_latency(console: Console, report: EvaluationReport) -> None:
    """Print latency comparison.

    The "Overhead" row is proxy latency minus Anthropic latency, shown in
    yellow when positive and green otherwise. The percentage is 0 when the
    Anthropic average is not positive.
    """
    table = Table(title="Latency Comparison", show_header=True, header_style="bold magenta")
    table.add_column("Endpoint", style="cyan")
    table.add_column("Avg Latency (ms)")

    table.add_row("Z.AI Proxy", f"{report.avg_proxy_latency_ms:.2f}")
    table.add_row("Anthropic API", f"{report.avg_anthropic_latency_ms:.2f}")

    overhead = report.avg_proxy_latency_ms - report.avg_anthropic_latency_ms
    overhead_pct = (overhead / report.avg_anthropic_latency_ms * 100) if report.avg_anthropic_latency_ms > 0 else 0

    table.add_row("Overhead", f"{overhead:.2f} ({overhead_pct:+.1f}%)", style="yellow" if overhead > 0 else "green")

    console.print("\n")
    console.print(table)


def _print_bias_analysis(console: Console, report: EvaluationReport) -> None:
    """Print systematic bias analysis.

    Prints nothing when `detect_systematic_bias` returns an empty dict.
    After the table, at most one warning is printed: mixed bias in more than
    half of the compared results, or consistent over/undercounting in more
    than 70% of them.
    """
    bias = detect_systematic_bias(report.results)

    if not bias:
        return

    table = Table(title="Systematic Bias Analysis", show_header=True, header_style="bold magenta")
    table.add_column("Metric", style="cyan")
    table.add_column("Value")

    input_status = "Overcounts" if bias["input_bias_mean"] > 0 else "Undercounts" if bias["input_bias_mean"] < 0 else "Accurate"
    output_status = "Overcounts" if bias["output_bias_mean"] > 0 else "Undercounts" if bias["output_bias_mean"] < 0 else "Accurate"

    table.add_row("Input Bias", f"{bias['input_bias_mean']:+.2f} tokens ({input_status})")
    table.add_row("Output Bias", f"{bias['output_bias_mean']:+.2f} tokens ({output_status})")

    console.print("\n")
    console.print(table)

    # The bias counts cover only results where neither endpoint errored, so
    # the thresholds use that same population. Dividing by all results would
    # suppress the warnings whenever some requests failed.
    compared = sum(
        1 for r in report.results if not r.proxy_response.error and not r.anthropic_response.error
    )

    # Bias patterns
    if bias.get("mixed_bias", 0) > compared / 2:
        console.print("\n[yellow]⚠ Mixed bias detected - token counting may be inconsistent[/yellow]")
    elif bias.get("both_high", 0) > compared * 0.7:
        console.print("\n[red]⚠ Consistent overcounting detected[/red]")
    elif bias.get("both_low", 0) > compared * 0.7:
        console.print("\n[red]⚠ Consistent undercounting detected[/red]")
def _print_advanced_metrics(console: Console, report: EvaluationReport) -> None:
    """Print advanced statistical metrics.

    Prints nothing when `calculate_advanced_metrics` returns an empty dict.
    A statistic that is absent from that dict is shown as "n/a" instead of
    raising KeyError.
    """
    advanced = calculate_advanced_metrics(report.results)

    if not advanced:
        return

    def cell(key: str) -> str:
        value = advanced.get(key)
        return "n/a" if value is None else f"{value:.2f}"

    table = Table(title="Advanced Statistics", show_header=True, header_style="bold magenta")
    table.add_column("Metric", style="cyan")
    table.add_column("Input")
    table.add_column("Output")

    table.add_row("Std Dev", cell("input_diff_std"), cell("output_diff_std"))
    table.add_row("Median", cell("input_diff_median"), cell("output_diff_median"))
    table.add_row("95th Percentile", cell("input_diff_95th"), cell("output_diff_95th"))
    table.add_row("Max Error", cell("input_diff_max"), cell("output_diff_max"))

    console.print("\n")
    console.print(table)


def _print_detailed_results(console: Console, report: EvaluationReport) -> None:
    """Print detailed results table.

    One row per result. Status is ERROR when either endpoint reported an
    error, a check mark for a zero total difference, "~" for a total
    difference under 5%, and a cross otherwise.
    """
    table = Table(title="Detailed Results", show_header=True, header_style="bold magenta")
    table.add_column("Test", style="cyan")
    table.add_column("Proxy In/Out")
    table.add_column("Anthropic In/Out")
    table.add_column("Diff")
    table.add_column("Status")

    for r in report.results:
        # Missing counts are displayed as 0.
        proxy_tokens = f"{r.proxy_response.input_tokens or 0}/{r.proxy_response.output_tokens or 0}"
        anthropic_tokens = f"{r.anthropic_response.input_tokens or 0}/{r.anthropic_response.output_tokens or 0}"
        diff = f"{r.total_diff:+d}"

        if r.proxy_response.error or r.anthropic_response.error:
            status = "[red]ERROR[/red]"
        elif r.total_diff == 0:
            status = "[green]✓[/green]"
        elif r.total_pct_diff < 5:
            status = "[yellow]~[/yellow]"
        else:
            status = "[red]✗[/red]"

        table.add_row(r.request_name, proxy_tokens, anthropic_tokens, diff, status)

    console.print("\n")
    console.print(table)
def _json_default(value):
    """Fallback for json.dump: unwrap objects exposing `.item()` (e.g. NumPy scalars), stringify the rest."""
    item = getattr(value, "item", None)
    return item() if callable(item) else str(value)


def save_report_json(report: "EvaluationReport", filepath: str) -> None:
    """Save report as JSON file.

    The file holds the summary figures, the advanced / bias / per-range
    metrics recomputed from `report.results`, and one entry per result.
    It is written as UTF-8.

    Args:
        report: Evaluation report to save
        filepath: Path to output JSON file
    """
    import json

    def convert_result(result) -> dict:
        return {
            "request_name": result.request_name,
            "proxy": {
                "status_code": result.proxy_response.status_code,
                "input_tokens": result.proxy_response.input_tokens,
                "output_tokens": result.proxy_response.output_tokens,
                "error": result.proxy_response.error,
                "latency_ms": result.proxy_response.latency_ms,
            },
            "anthropic": {
                "status_code": result.anthropic_response.status_code,
                "input_tokens": result.anthropic_response.input_tokens,
                "output_tokens": result.anthropic_response.output_tokens,
                "error": result.anthropic_response.error,
                "latency_ms": result.anthropic_response.latency_ms,
            },
            "metrics": {
                "input_match": result.input_match,
                "output_match": result.output_match,
                "total_match": result.total_match,
                "input_diff": result.input_diff,
                "output_diff": result.output_diff,
                "total_diff": result.total_diff,
                "input_pct_diff": result.input_pct_diff,
                "output_pct_diff": result.output_pct_diff,
                "total_pct_diff": result.total_pct_diff,
            },
            "timestamp": result.timestamp.isoformat(),
        }

    data = {
        "summary": {
            "total_requests": report.total_requests,
            "successful_requests": report.successful_requests,
            "failed_requests": report.failed_requests,
            "input_token_accuracy": report.input_token_accuracy,
            "output_token_accuracy": report.output_token_accuracy,
            "overall_accuracy": report.overall_accuracy,
            "input_mae": report.input_mae,
            "output_mae": report.output_mae,
            "total_mae": report.total_mae,
            "input_mpe": report.input_mpe,
            "output_mpe": report.output_mpe,
            "total_mpe": report.total_mpe,
            "avg_proxy_latency_ms": report.avg_proxy_latency_ms,
            "avg_anthropic_latency_ms": report.avg_anthropic_latency_ms,
        },
        "advanced_metrics": calculate_advanced_metrics(report.results),
        "bias_analysis": detect_systematic_bias(report.results),
        "accuracy_by_range": calculate_accuracy_by_token_range(report.results),
        "results": [convert_result(r) for r in report.results],
        "timestamp": report.timestamp.isoformat(),
    }

    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, default=_json_default)


def save_report_markdown(report: "EvaluationReport", filepath: str) -> None:
    """Save report as Markdown file.

    The file is written as UTF-8 because the status column contains emoji,
    which a platform-default encoding may be unable to encode.

    Args:
        report: Evaluation report to save
        filepath: Path to output Markdown file
    """
    lines = [
        "# Z.AI Proxy Evaluation Report",
        "",
        f"**Generated:** {report.timestamp.isoformat()}",
        "",
        "## Summary",
        "",
        f"- **Total Requests:** {report.total_requests}",
        f"- **Successful:** {report.successful_requests}",
        f"- **Failed:** {report.failed_requests}",
        "",
        "## Accuracy Metrics",
        "",
        "| Metric | Accuracy |",
        "|--------|----------|",
        f"| Input Token Accuracy | {report.input_token_accuracy:.2f}% |",
        f"| Output Token Accuracy | {report.output_token_accuracy:.2f}% |",
        f"| Overall Accuracy | {report.overall_accuracy:.2f}% |",
        "",
        "## Error Metrics",
        "",
        "| Metric | MAE (tokens) | MPE (%) |",
        "|--------|---------------|---------|",
        f"| Input Tokens | {report.input_mae:.2f} | {report.input_mpe:.2f}% |",
        f"| Output Tokens | {report.output_mae:.2f} | {report.output_mpe:.2f}% |",
        f"| Total Tokens | {report.total_mae:.2f} | {report.total_mpe:.2f}% |",
        "",
        "## Latency Comparison",
        "",
        "| Endpoint | Avg Latency (ms) |",
        "|----------|------------------|",
        f"| Z.AI Proxy | {report.avg_proxy_latency_ms:.2f} |",
        f"| Anthropic API | {report.avg_anthropic_latency_ms:.2f} |",
        "",
        "## Systematic Bias",
        "",
        f"- **Input Bias:** {report.input_bias_mean:+.2f} tokens",
        f"- **Output Bias:** {report.output_bias_mean:+.2f} tokens",
        "",
        "## Detailed Results",
        "",
        "| Test | Proxy (In/Out) | Anthropic (In/Out) | Diff | Status |",
        "|------|-----------------|-------------------|------|--------|",
    ]

    for r in report.results:
        # Missing counts are displayed as 0.
        proxy_tokens = f"{r.proxy_response.input_tokens or 0}/{r.proxy_response.output_tokens or 0}"
        anthropic_tokens = f"{r.anthropic_response.input_tokens or 0}/{r.anthropic_response.output_tokens or 0}"

        if r.proxy_response.error or r.anthropic_response.error:
            status = "❌ ERROR"
        elif r.total_diff == 0:
            status = "✅ MATCH"
        elif r.total_pct_diff < 5:
            status = "⚠️ CLOSE"
        else:
            status = "❌ MISMATCH"

        lines.append(f"| {r.request_name} | {proxy_tokens} | {anthropic_tokens} | {r.total_diff:+d} | {status} |")

    with open(filepath, "w", encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n")
Detailed Results", + "", + "| Test | Proxy (In/Out) | Anthropic (In/Out) | Diff | Status |", + "|------|-----------------|-------------------|------|--------|", + ] + + for r in report.results: + proxy_tokens = f"{r.proxy_response.input_tokens or 0}/{r.proxy_response.output_tokens or 0}" + anthropic_tokens = f"{r.anthropic_response.input_tokens or 0}/{r.anthropic_response.output_tokens or 0}" + + if r.proxy_response.error or r.anthropic_response.error: + status = "❌ ERROR" + elif r.total_diff == 0: + status = "✅ MATCH" + elif r.total_pct_diff < 5: + status = "⚠️ CLOSE" + else: + status = "❌ MISMATCH" + + lines.append(f"| {r.request_name} | {proxy_tokens} | {anthropic_tokens} | {r.total_diff:+d} | {status} |") + + with open(filepath, "w") as f: + f.write("\n".join(lines)) diff --git a/proxy/evaluation/zai_eval/test_cases.py b/proxy/evaluation/zai_eval/test_cases.py new file mode 100644 index 0000000..01fe49b --- /dev/null +++ b/proxy/evaluation/zai_eval/test_cases.py @@ -0,0 +1,213 @@ +"""Test case definitions for evaluation framework.""" + +from zai_eval.models import EvaluationRequest + + +# Diverse test cases covering different request types +TEST_CASES = [ + EvaluationRequest( + name="short_simple", + description="Short simple text", + model="claude-3-sonnet-20240229", + max_tokens=50, + messages=[{"role": "user", "content": "Hello, how are you?"}], + ), + EvaluationRequest( + name="medium_conversation", + description="Medium length conversation", + model="claude-3-sonnet-20240229", + max_tokens=100, + messages=[ + {"role": "user", "content": "What is the capital of France?"}, + {"role": "assistant", "content": "The capital of France is Paris."}, + {"role": "user", "content": "Tell me more about it."}, + ], + ), + EvaluationRequest( + name="long_context", + description="Long context with detailed information", + model="claude-3-sonnet-20240229", + max_tokens=150, + messages=[ + { + "role": "user", + "content": """The Industrial Revolution was a period of major 
# Evaluation prompts, one EvaluationRequest per scenario. Each entry varies
# the shape of the input (length, number of turns, code, JSON, non-Latin
# scripts, punctuation-heavy text). Every case targets the same model id and
# caps the response at 50, 100 or 150 tokens via max_tokens.
TEST_CASES = [
    # Minimal single-turn prompt.
    EvaluationRequest(
        name="short_simple",
        description="Short simple text",
        model="claude-3-sonnet-20240229",
        max_tokens=50,
        messages=[{"role": "user", "content": "Hello, how are you?"}],
    ),
    # Three messages: user, assistant, user.
    EvaluationRequest(
        name="medium_conversation",
        description="Medium length conversation",
        model="claude-3-sonnet-20240229",
        max_tokens=100,
        messages=[
            {"role": "user", "content": "What is the capital of France?"},
            {"role": "assistant", "content": "The capital of France is Paris."},
            {"role": "user", "content": "Tell me more about it."},
        ],
    ),
    # One long single-paragraph user message.
    EvaluationRequest(
        name="long_context",
        description="Long context with detailed information",
        model="claude-3-sonnet-20240229",
        max_tokens=150,
        messages=[
            {
                "role": "user",
                "content": """The Industrial Revolution was a period of major industrialization and innovation that took place during the late 1700s and early 1800s. The Industrial Revolution began in Great Britain and quickly spread throughout the world. The use of new basic materials, primarily iron and steel, was a key factor. The use of new energy sources, including both fuels and motive power, such as coal, the steam engine, electricity, petroleum, and the internal-combustion engine, was also important. The invention of new machines, including the spinning jenny and the power loom, allowed for increased production with fewer workers. The factory system was a new way of organizing labor, where many workers were brought together in one place to produce goods under the supervision of a manager. This system led to increased efficiency and productivity, but also to poor working conditions and child labor. The development of new transportation methods, such as canals, roads, and railways, allowed for the faster and cheaper movement of goods and people. The Industrial Revolution had a profound impact on society, economy, and culture, and laid the groundwork for many of the technological advancements we enjoy today.""",
            }
        ],
    ),
    # Prompt embedding a fenced Python code block.
    EvaluationRequest(
        name="code_snippet",
        description="Request involving code",
        model="claude-3-sonnet-20240229",
        max_tokens=100,
        messages=[
            {
                "role": "user",
                "content": """Write a function in Python to calculate the factorial of a number:

```python
def factorial(n):
    # Your code here
```""",
            }
        ],
    ),
    # Five alternating messages ending on a user turn.
    EvaluationRequest(
        name="multi_turn_conversation",
        description="Multiple turns of conversation",
        model="claude-3-sonnet-20240229",
        max_tokens=100,
        messages=[
            {"role": "user", "content": "I want to learn Python."},
            {"role": "assistant", "content": "That's great! Python is a versatile programming language. Where would you like to start?"},
            {"role": "user", "content": "Let's start with variables and data types."},
            {"role": "assistant", "content": "Python has several built-in data types including integers, floats, strings, booleans, lists, tuples, dictionaries, and sets. Variables are created by assignment, no need to declare types."},
            {"role": "user", "content": "Can you show me an example?"},
        ],
    ),
    # Prompt embedding a fenced JSON document.
    EvaluationRequest(
        name="structured_data",
        description="Request with structured data format",
        model="claude-3-sonnet-20240229",
        max_tokens=100,
        messages=[
            {
                "role": "user",
                "content": """Here is some data in JSON format:
```json
{
  "name": "Alice",
  "age": 30,
  "city": "New York",
  "hobbies": ["reading", "hiking", "photography"]
}
```
Extract the hobbies and create a summary.""",
            }
        ],
    ),
    # Prompt containing digits and arithmetic operators.
    EvaluationRequest(
        name="mathematical_content",
        description="Content with mathematical expressions",
        model="claude-3-sonnet-20240229",
        max_tokens=100,
        messages=[
            {
                "role": "user",
                "content": """Solve this equation step by step: 2x + 5 = 13. Show your work and explain each step.""",
            }
        ],
    ),
    # Prompt mixing Latin-script languages with Japanese and Chinese text.
    # NOTE(review): the Chinese phrase has no closing double quote - the
    # string's terminating triple quote follows the question mark directly.
    # Confirm whether the unbalanced quote in the prompt is intended.
    EvaluationRequest(
        name="multilingual_text",
        description="Text with multiple languages",
        model="claude-3-sonnet-20240229",
        max_tokens=100,
        messages=[
            {
                "role": "user",
                "content": """Translate and explain the meaning of these phrases:
1. Spanish: "Hola, ¿cómo estás?"
2. French: "Bonjour, comment allez-vous?"
3. German: "Guten Tag, wie geht es Ihnen?"
4. Japanese: "こんにちは、元気ですか?"
5. Chinese: "你好,你好吗?""",
            }
        ],
    ),
    # Prompt consisting mostly of a ten-item numbered list.
    EvaluationRequest(
        name="list_heavy_content",
        description="Content with many list items",
        model="claude-3-sonnet-20240229",
        max_tokens=100,
        messages=[
            {
                "role": "user",
                "content": """Here are 10 programming best practices:
1. Write clear and descriptive names
2. Keep functions small and focused
3. Don't repeat yourself (DRY)
4. Comment your code
5. Use version control
6. Test your code
7. Handle errors gracefully
8. Optimize for readability
9. Follow style guides
10. Keep learning

Explain why these are important.""",
            }
        ],
    ),
    # Prompt that asks for a JSON-only reply.
    EvaluationRequest(
        name="json_only_response",
        description="Request expecting JSON response",
        model="claude-3-sonnet-20240229",
        max_tokens=150,
        messages=[
            {
                "role": "user",
                "content": """Create a JSON object representing a book with these fields: title, author, publication_year, genres (array), and rating (1-5). Respond with only the JSON, no explanation.""",
            }
        ],
    ),
    # Open-ended prose generation prompt.
    EvaluationRequest(
        name="creative_writing",
        description="Creative writing prompt",
        model="claude-3-sonnet-20240229",
        max_tokens=100,
        messages=[
            {
                "role": "user",
                "content": """Write a short opening paragraph for a mystery novel set in a small coastal town. Include atmospheric details and a hint of something unusual.""",
            }
        ],
    ),
    # Prompt asking for a multi-part technical explanation.
    EvaluationRequest(
        name="technical_explanation",
        description="Technical concept explanation",
        model="claude-3-sonnet-20240229",
        max_tokens=150,
        messages=[
            {
                "role": "user",
                "content": """Explain the concept of microservices architecture, its advantages over monolithic architecture, and the challenges involved in implementing it. Include specific examples.""",
            }
        ],
    ),
    # NOTE(review): the name says "empty" but the system message below is
    # non-empty and the description says "with system message" - confirm
    # which one is intended.
    # NOTE(review): the system prompt is passed as a role="system" entry in
    # `messages`; Anthropic's Messages API documents the system prompt as a
    # top-level `system` parameter. Confirm the client translates this entry
    # before sending.
    EvaluationRequest(
        name="empty_system_message",
        description="Request with system message",
        model="claude-3-sonnet-20240229",
        max_tokens=100,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is 2+2?"},
        ],
    ),
    # Punctuation-heavy prompt. The `\\` below is an escaped backslash in a
    # non-raw string literal, so the prompt text contains a single backslash.
    EvaluationRequest(
        name="special_characters",
        description="Text with many special characters and symbols",
        model="claude-3-sonnet-20240229",
        max_tokens=100,
        messages=[
            {
                "role": "user",
                "content": """Explain what these special characters mean in programming: @, #, $, %, ^, &, *, _, +, =, {, }, [, ], |, \\, :, ;, ", ', <, >, ?, /, ~""",
            }
        ],
    ),
]
def get_test_cases() -> list[EvaluationRequest]:
    """Expose the full evaluation suite.

    Returns:
        The module-level ``TEST_CASES`` list itself (not a copy).
    """
    return TEST_CASES


def get_test_case_by_name(name: str) -> EvaluationRequest | None:
    """Look up a test case by its ``name`` attribute.

    Args:
        name: Exact name to compare against each case's ``name``.

    Returns:
        The first case in ``TEST_CASES`` whose name equals ``name``, or
        ``None`` when no case matches.
    """
    matching = (candidate for candidate in TEST_CASES if candidate.name == name)
    return next(matching, None)