zai-proxy/proxy/evaluation/evaluator.go
jedarden dee82a76a3 chore: update module paths and add evaluation package
- proxy/go.mod: github.com/ardenone/zai-proxy → git.ardenone.com/jedarden/zai-proxy
- dashboard/go.mod: github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard → git.ardenone.com/jedarden/zai-proxy/dashboard
- Update all Go import paths in proxy/ and dashboard/ to match new module paths
- Add proxy/evaluation/ package (was missing from initial commit)
- Add docs/plan/plan.md with architecture, security model, telemetry design, and migration checklist

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-16 16:03:50 -04:00

347 lines
9.6 KiB
Go

package evaluation
import (
"bytes"
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"sync"
"time"
)
// TestRequest describes one named evaluation case and the request payload
// that is sent for it.
type TestRequest struct {
	// Name labels the case in log output and in the resulting comparison.
	Name string `json:"name"`
	// Request is the payload sent to the endpoints under comparison.
	Request ClaudeRequest `json:"request"`
	// Stream is a test-level streaming flag.
	// NOTE(review): the code visible in this file never reads it; response
	// parsing follows Request.Stream instead. Confirm how callers use it.
	Stream bool `json:"stream"`
}
// ClaudeRequest is the JSON body of an Anthropic-style messages request.
type ClaudeRequest struct {
	// Model is the model identifier to run the request against.
	Model string `json:"model"`
	// MaxTokens is sent as "max_tokens".
	MaxTokens int `json:"max_tokens"`
	// Messages is the conversation, in order.
	Messages []Message `json:"messages"`
	// Stream asks for a streaming response; it is omitted from the JSON when false.
	Stream bool `json:"stream,omitempty"`
}
// Message is a single conversation turn inside a ClaudeRequest.
type Message struct {
	// Role names the speaker of the turn.
	Role string `json:"role"`
	// Content is the text of the turn.
	Content string `json:"content"`
}
// TokenUsage holds the token counts reported in a response's usage object.
type TokenUsage struct {
	// InputTokens is the reported "input_tokens" value.
	InputTokens int `json:"input_tokens"`
	// OutputTokens is the reported "output_tokens" value.
	OutputTokens int `json:"output_tokens"`
}
// ResponseData records the outcome of one request to one endpoint.
type ResponseData struct {
	// StatusCode is the HTTP status code of the response.
	StatusCode int
	// Body is the raw response body.
	Body []byte
	// Headers are the HTTP response headers.
	Headers http.Header
	// Duration is the elapsed time measured for the request.
	Duration time.Duration
	// TokenUsage is the usage parsed from Body, or nil when none was found.
	TokenUsage *TokenUsage
	// Error is non-nil when the request could not be completed.
	Error error
}
// ComparisonResult pairs the two endpoint responses obtained for one test
// case with the token-count and structure comparisons derived from them.
type ComparisonResult struct {
	// TestName is the name of the test case that produced this result.
	TestName string
	// ZaiResponse and AnthropicResponse hold what each endpoint returned.
	ZaiResponse, AnthropicResponse ResponseData
	// InputTokenMatch and OutputTokenMatch report whether the two endpoints
	// returned equal token counts.
	InputTokenMatch, OutputTokenMatch bool
	// InputTokenDiff and OutputTokenDiff are the Zai count minus the
	// Anthropic count.
	InputTokenDiff, OutputTokenDiff int
	// InputTokenPercentDiff and OutputTokenPercentDiff express the diff as a
	// percentage of the Anthropic count; they stay zero when that count is
	// not positive.
	InputTokenPercentDiff, OutputTokenPercentDiff float64
	// ResponseStructureMatch reports whether both bodies decoded as JSON
	// with the same set of top-level keys.
	ResponseStructureMatch bool
}
// EvaluationMetrics summarises a set of comparison results.
type EvaluationMetrics struct {
	// TotalTests is the number of results aggregated; SuccessfulTests counts
	// the results in which neither response carried an Error.
	TotalTests, SuccessfulTests int
	// InputTokenMAE and OutputTokenMAE are mean absolute token differences.
	InputTokenMAE, OutputTokenMAE float64
	// InputTokenAvgPercentDiff and OutputTokenAvgPercentDiff are mean
	// absolute percentage differences, taken over the results whose
	// Anthropic token count is positive.
	InputTokenAvgPercentDiff, OutputTokenAvgPercentDiff float64
	// StructureMatchCount counts results with ResponseStructureMatch set.
	StructureMatchCount int
}
// Evaluator sends the same request to a Zai endpoint and an Anthropic
// endpoint and compares what comes back.
type Evaluator struct {
	// ZaiEndpoint is the URL that Zai-side requests are POSTed to.
	ZaiEndpoint string
	// AnthropicEndpoint is the URL that Anthropic-side requests are POSTed to.
	AnthropicEndpoint string
	// ZaiAPIKey is sent as the x-api-key header on Zai-side requests.
	ZaiAPIKey string
	// AnthropicAPIKey is sent as the x-api-key header on Anthropic-side requests.
	AnthropicAPIKey string
	// Client is the HTTP client used for every request.
	Client *http.Client
}
// NewEvaluator returns an Evaluator for the given endpoints and API keys.
// The evaluator gets its own HTTP client with a two-minute timeout.
func NewEvaluator(zaiEndpoint, anthropicEndpoint, zaiKey, anthropicKey string) *Evaluator {
	httpClient := &http.Client{Timeout: 2 * time.Minute}

	ev := new(Evaluator)
	ev.ZaiEndpoint = zaiEndpoint
	ev.AnthropicEndpoint = anthropicEndpoint
	ev.ZaiAPIKey = zaiKey
	ev.AnthropicAPIKey = anthropicKey
	ev.Client = httpClient
	return ev
}
// sendRequest POSTs req as JSON to endpoint and captures the outcome.
//
// The request carries a JSON content type, apiKey in the x-api-key header and
// a fixed anthropic-version header. Failures are reported through the Error
// field of the returned ResponseData rather than a separate return value.
// Once the request has been issued, Duration is filled in on every path,
// including failures, so slow or timed-out calls remain visible.
//
// When e.Client is nil the request falls back to http.DefaultClient (which
// has no timeout) instead of panicking.
func (e *Evaluator) sendRequest(endpoint, apiKey string, req ClaudeRequest) ResponseData {
	start := time.Now()

	reqBody, err := json.Marshal(req)
	if err != nil {
		return ResponseData{Error: fmt.Errorf("failed to marshal request: %w", err)}
	}

	httpReq, err := http.NewRequest(http.MethodPost, endpoint, bytes.NewReader(reqBody))
	if err != nil {
		return ResponseData{Error: fmt.Errorf("failed to create request: %w", err)}
	}
	httpReq.Header.Set("Content-Type", "application/json")
	httpReq.Header.Set("x-api-key", apiKey)
	httpReq.Header.Set("anthropic-version", "2023-06-01")

	// A nil client would panic inside a goroutine of the caller, which no
	// caller can recover from; degrade to the default client instead.
	client := e.Client
	if client == nil {
		client = http.DefaultClient
	}

	resp, err := client.Do(httpReq)
	if err != nil {
		return ResponseData{
			Duration: time.Since(start),
			Error:    fmt.Errorf("request failed: %w", err),
		}
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		// Status and headers were already received; keep them for diagnosis.
		return ResponseData{
			StatusCode: resp.StatusCode,
			Headers:    resp.Header,
			Duration:   time.Since(start),
			Error:      fmt.Errorf("failed to read response body: %w", err),
		}
	}

	// Usage parsing follows the request's own stream flag: SSE when the
	// request asked for streaming, plain JSON otherwise.
	tokenUsage := e.extractTokenUsage(body, req.Stream)

	return ResponseData{
		StatusCode: resp.StatusCode,
		Body:       body,
		Headers:    resp.Header,
		Duration:   time.Since(start),
		TokenUsage: tokenUsage,
	}
}
// extractTokenUsage picks the usage parser that matches the response format:
// the SSE parser for streaming responses, the JSON parser otherwise. It
// returns whatever the chosen parser returns.
func (e *Evaluator) extractTokenUsage(body []byte, isStreaming bool) *TokenUsage {
	parse := e.extractJSONTokenUsage
	if isStreaming {
		parse = e.extractSSETokenUsage
	}
	return parse(body)
}
// extractJSONTokenUsage reads the top-level "usage" object of a non-streaming
// JSON response body.
//
// It returns nil when the body is not valid JSON (a warning is logged) or
// when there is no "usage" object. A counter that is missing or not a JSON
// number is reported as zero.
func (e *Evaluator) extractJSONTokenUsage(body []byte) *TokenUsage {
	var decoded map[string]interface{}
	if parseErr := json.Unmarshal(body, &decoded); parseErr != nil {
		log.Printf("Warning: failed to parse JSON response: %v", parseErr)
		return nil
	}

	usageField, found := decoded["usage"].(map[string]interface{})
	if !found {
		return nil
	}

	// encoding/json decodes every JSON number into float64 when the target
	// is an interface value, hence the assertion before truncating to int.
	count := func(key string) int {
		n, _ := usageField[key].(float64)
		return int(n)
	}

	return &TokenUsage{
		InputTokens:  count("input_tokens"),
		OutputTokens: count("output_tokens"),
	}
}
// extractSSETokenUsage reads token usage from a buffered server-sent-events
// response body.
//
// In an Anthropic-style stream the input token count is announced in the
// "message_start" event (under message.usage), while "message_delta" events
// carry the cumulative output token count. The two are combined here:
//
//   - OutputTokens comes from the last message_delta that has a usage object.
//   - InputTokens comes from that same message_delta when it reports a
//     non-zero value, otherwise from message_start.
//
// It returns nil when no message_delta with a usage object is present, so a
// truncated stream is never mistaken for a complete measurement. Lines that
// are not "data:" fields, the "[DONE]" sentinel and payloads that are not
// valid JSON are skipped.
func (e *Evaluator) extractSSETokenUsage(body []byte) *TokenUsage {
	// The space after the colon is optional in the SSE wire format.
	dataPrefix := []byte("data:")

	var (
		startInput  float64 // input_tokens announced by message_start
		deltaInput  float64 // input_tokens of the latest message_delta, if any
		deltaOutput float64 // output_tokens of the latest message_delta
		sawDelta    bool
	)

	for _, line := range bytes.Split(body, []byte("\n")) {
		if !bytes.HasPrefix(line, dataPrefix) {
			continue
		}
		// TrimSpace also removes the trailing "\r" of CRLF-terminated streams.
		payload := bytes.TrimSpace(bytes.TrimPrefix(line, dataPrefix))
		if len(payload) == 0 || bytes.Equal(payload, []byte("[DONE]")) {
			continue
		}

		var event map[string]interface{}
		if err := json.Unmarshal(payload, &event); err != nil {
			continue
		}

		eventType, _ := event["type"].(string)
		switch eventType {
		case "message_start":
			// Failed assertions yield nil maps, which are safe to index.
			message, _ := event["message"].(map[string]interface{})
			usage, _ := message["usage"].(map[string]interface{})
			if v, ok := usage["input_tokens"].(float64); ok {
				startInput = v
			}
		case "message_delta":
			usage, ok := event["usage"].(map[string]interface{})
			if !ok {
				continue
			}
			// Later deltas overwrite earlier ones because the counts are cumulative.
			deltaInput, _ = usage["input_tokens"].(float64)
			deltaOutput, _ = usage["output_tokens"].(float64)
			sawDelta = true
		}
	}

	if !sawDelta {
		return nil
	}

	input := deltaInput
	if input == 0 {
		input = startInput
	}
	return &TokenUsage{
		InputTokens:  int(input),
		OutputTokens: int(deltaOutput),
	}
}
// RunTest sends test.Request to the Zai and Anthropic endpoints in parallel
// and returns a comparison of the two responses.
//
// Token fields of the result are filled in only when usage could be
// extracted from both responses. Diffs are the Zai count minus the Anthropic
// count, and each percentage is relative to the Anthropic count, left at zero
// when that count is not positive. The structure check always runs.
//
// NOTE(review): test.Stream is not consulted here; only test.Request is
// forwarded, so streaming is governed by test.Request.Stream. Confirm that
// this is intended.
func (e *Evaluator) RunTest(test TestRequest) ComparisonResult {
	log.Printf("Running test: %s", test.Name)

	result := ComparisonResult{TestName: test.Name}

	// Each goroutine fills a different field of result, and Wait makes both
	// writes visible before anything below reads them.
	var pending sync.WaitGroup
	pending.Add(2)
	go func() {
		defer pending.Done()
		result.ZaiResponse = e.sendRequest(e.ZaiEndpoint, e.ZaiAPIKey, test.Request)
	}()
	go func() {
		defer pending.Done()
		result.AnthropicResponse = e.sendRequest(e.AnthropicEndpoint, e.AnthropicAPIKey, test.Request)
	}()
	pending.Wait()

	zaiUsage := result.ZaiResponse.TokenUsage
	refUsage := result.AnthropicResponse.TokenUsage
	if zaiUsage != nil && refUsage != nil {
		result.InputTokenDiff = zaiUsage.InputTokens - refUsage.InputTokens
		result.OutputTokenDiff = zaiUsage.OutputTokens - refUsage.OutputTokens
		result.InputTokenMatch = result.InputTokenDiff == 0
		result.OutputTokenMatch = result.OutputTokenDiff == 0

		if refUsage.InputTokens > 0 {
			result.InputTokenPercentDiff = float64(result.InputTokenDiff) / float64(refUsage.InputTokens) * 100
		}
		if refUsage.OutputTokens > 0 {
			result.OutputTokenPercentDiff = float64(result.OutputTokenDiff) / float64(refUsage.OutputTokens) * 100
		}
	}

	result.ResponseStructureMatch = e.compareResponseStructure(result.ZaiResponse.Body, result.AnthropicResponse.Body)
	return result
}
// compareResponseStructure reports whether both bodies decode into JSON
// maps that expose exactly the same set of top-level keys. Values and nested
// structure are not inspected. A body that fails to decode makes the result
// false.
func (e *Evaluator) compareResponseStructure(zaiBody, anthropicBody []byte) bool {
	decode := func(raw []byte) (map[string]interface{}, bool) {
		var fields map[string]interface{}
		err := json.Unmarshal(raw, &fields)
		return fields, err == nil
	}

	zaiFields, ok := decode(zaiBody)
	if !ok {
		return false
	}
	anthropicFields, ok := decode(anthropicBody)
	if !ok {
		return false
	}

	// Equal sizes plus "every Zai key exists on the Anthropic side" implies
	// that the two key sets are identical.
	if len(zaiFields) != len(anthropicFields) {
		return false
	}
	for name := range zaiFields {
		if _, present := anthropicFields[name]; !present {
			return false
		}
	}
	return true
}
// RunTests runs every test case in order, one after another, and returns the
// per-test results (same order as tests) together with the metrics
// aggregated over them.
func (e *Evaluator) RunTests(tests []TestRequest) ([]ComparisonResult, *EvaluationMetrics) {
	results := make([]ComparisonResult, 0, len(tests))
	for idx := range tests {
		results = append(results, e.RunTest(tests[idx]))
	}
	return results, e.calculateMetrics(results)
}
// calculateMetrics is the package-internal entry point for aggregation; it
// delegates to the exported CalculateMetricsFromResults unchanged.
func (e *Evaluator) calculateMetrics(results []ComparisonResult) *EvaluationMetrics {
	metrics := e.CalculateMetricsFromResults(results)
	return metrics
}
// CalculateMetricsFromResults aggregates comparison results into summary
// metrics.
//
//   - TotalTests is len(results).
//   - SuccessfulTests counts results where neither response has an Error.
//   - StructureMatchCount counts results with ResponseStructureMatch set.
//   - InputTokenMAE and OutputTokenMAE are the mean absolute token diffs over
//     every result that has usage from both endpoints.
//   - InputTokenAvgPercentDiff and OutputTokenAvgPercentDiff are the mean
//     absolute percentage diffs over the results whose Anthropic count is
//     positive, since a percentage is undefined otherwise.
//
// The MAE is divided by the number of results that contributed to its sum.
// Dividing by the percentage count instead would inflate the MAE, or report
// it as zero, whenever a reference count is zero. Metrics with no
// contributing results stay zero.
func (e *Evaluator) CalculateMetricsFromResults(results []ComparisonResult) *EvaluationMetrics {
	metrics := &EvaluationMetrics{TotalTests: len(results)}

	var (
		inputAbsSum, outputAbsSum             float64
		inputPercentSum, outputPercentSum     float64
		comparedCount                         int // results with usage from both endpoints
		inputPercentCount, outputPercentCount int // results with a positive reference count
	)

	for i := range results {
		// Index instead of ranging by value: each result embeds two full responses.
		result := &results[i]

		if result.ZaiResponse.Error == nil && result.AnthropicResponse.Error == nil {
			metrics.SuccessfulTests++
		}
		if result.ResponseStructureMatch {
			metrics.StructureMatchCount++
		}

		refUsage := result.AnthropicResponse.TokenUsage
		if refUsage == nil || result.ZaiResponse.TokenUsage == nil {
			continue
		}

		comparedCount++
		inputAbsSum += absFloat(float64(result.InputTokenDiff))
		outputAbsSum += absFloat(float64(result.OutputTokenDiff))

		if refUsage.InputTokens > 0 {
			inputPercentSum += absFloat(result.InputTokenPercentDiff)
			inputPercentCount++
		}
		if refUsage.OutputTokens > 0 {
			outputPercentSum += absFloat(result.OutputTokenPercentDiff)
			outputPercentCount++
		}
	}

	if comparedCount > 0 {
		metrics.InputTokenMAE = inputAbsSum / float64(comparedCount)
		metrics.OutputTokenMAE = outputAbsSum / float64(comparedCount)
	}
	if inputPercentCount > 0 {
		metrics.InputTokenAvgPercentDiff = inputPercentSum / float64(inputPercentCount)
	}
	if outputPercentCount > 0 {
		metrics.OutputTokenAvgPercentDiff = outputPercentSum / float64(outputPercentCount)
	}
	return metrics
}
// absFloat returns the magnitude of x: negative values are negated and
// every other value is returned unchanged.
func absFloat(x float64) float64 {
	switch {
	case x < 0:
		return -x
	default:
		return x
	}
}