zai-proxy/proxy/evaluation/evaluator_test.go
jedarden dee82a76a3 chore: update module paths and add evaluation package
- proxy/go.mod: github.com/ardenone/zai-proxy → git.ardenone.com/jedarden/zai-proxy
- dashboard/go.mod: github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard → git.ardenone.com/jedarden/zai-proxy/dashboard
- Update all Go import paths in proxy/ and dashboard/ to match new module paths
- Add proxy/evaluation/ package (was missing from initial commit)
- Add docs/plan/plan.md with architecture, security model, telemetry design, and migration checklist

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-16 16:03:50 -04:00

507 lines
13 KiB
Go

package evaluation
import (
	"encoding/json"
	"strings"
	"testing"
)
// TestGetTestCases checks the suite returned by GetTestCases: it must hold at
// least ten entries, and every entry needs a name, a model, at least one
// message and a positive max-token value.
func TestGetTestCases(t *testing.T) {
	cases := GetTestCases()
	if n := len(cases); n < 10 {
		t.Errorf("Expected at least 10 test cases, got %d", n)
	}

	for idx, c := range cases {
		req := c.Request

		// Every check is independent so a single run reports all problems
		// with a given case, not just the first one.
		if c.Name == "" {
			t.Errorf("Test case %d: missing name", idx)
		}
		if req.Model == "" {
			t.Errorf("Test case %d (%s): missing model", idx, c.Name)
		}
		if len(req.Messages) == 0 {
			t.Errorf("Test case %d (%s): no messages", idx, c.Name)
		}
		if req.MaxTokens <= 0 {
			t.Errorf("Test case %d (%s): invalid max_tokens", idx, c.Name)
		}
	}
}
// TestEvaluatorCreation builds an Evaluator through NewEvaluator and checks
// that the result is non-nil, that ZaiEndpoint and ZaiAPIKey carry the first
// and third constructor arguments, and that Client is set.
func TestEvaluatorCreation(t *testing.T) {
	ev := NewEvaluator(
		"http://localhost:8080",
		"https://api.anthropic.com",
		"test-key-1",
		"test-key-2",
	)
	// Nothing below can be checked without an evaluator, so stop here.
	if ev == nil {
		t.Fatal("NewEvaluator returned nil")
	}

	if got := ev.ZaiEndpoint; got != "http://localhost:8080" {
		t.Errorf("Expected ZaiEndpoint 'http://localhost:8080', got '%s'", got)
	}
	if got := ev.ZaiAPIKey; got != "test-key-1" {
		t.Errorf("Expected ZaiAPIKey 'test-key-1', got '%s'", got)
	}
	if ev.Client == nil {
		t.Error("Client is nil")
	}
}
// TestExtractJSONTokenUsage runs extractJSONTokenUsage over a table of JSON
// bodies. A body with a "usage" object must yield its input/output token
// counts (zero counts still produce a non-nil result); a body without usage,
// malformed JSON and an empty body must all yield nil.
func TestExtractJSONTokenUsage(t *testing.T) {
	ev := NewEvaluator("", "", "", "")

	cases := []struct {
		label   string
		payload string
		wantIn  int
		wantOut int
		wantNil bool
	}{
		{
			label:   "Valid response with usage",
			payload: `{"id":"msg_123","type":"message","usage":{"input_tokens":100,"output_tokens":50}}`,
			wantIn:  100,
			wantOut: 50,
			wantNil: false,
		},
		{
			label:   "Response with zero tokens",
			payload: `{"id":"msg_123","usage":{"input_tokens":0,"output_tokens":0}}`,
			wantIn:  0,
			wantOut: 0,
			wantNil: false,
		},
		{
			label:   "Response without usage",
			payload: `{"id":"msg_123","type":"message"}`,
			wantNil: true,
		},
		{
			label:   "Invalid JSON",
			payload: `{invalid json}`,
			wantNil: true,
		},
		{
			label:   "Empty body",
			payload: ``,
			wantNil: true,
		},
	}

	for _, c := range cases {
		t.Run(c.label, func(t *testing.T) {
			usage := ev.extractJSONTokenUsage([]byte(c.payload))

			if c.wantNil {
				if usage != nil {
					t.Errorf("Expected nil result, got %+v", usage)
				}
				return
			}

			// The field checks below dereference usage, so a nil is fatal.
			if usage == nil {
				t.Fatal("Expected non-nil result, got nil")
			}
			if got := usage.InputTokens; got != c.wantIn {
				t.Errorf("InputTokens: got %d, want %d", got, c.wantIn)
			}
			if got := usage.OutputTokens; got != c.wantOut {
				t.Errorf("OutputTokens: got %d, want %d", got, c.wantOut)
			}
		})
	}
}
// TestExtractSSETokenUsage runs extractSSETokenUsage over a table of
// server-sent-event bodies. A stream whose message_delta event carries a
// "usage" object must yield those counts; streams with no usage, an empty
// stream, and a stream containing a "[DONE]" line must yield nil.
//
// The multi-line bodies are raw strings, so their continuation lines start at
// column 0 on purpose: any leading whitespace would become part of the input.
func TestExtractSSETokenUsage(t *testing.T) {
	ev := NewEvaluator("", "", "", "")

	cases := []struct {
		label   string
		stream  string
		wantIn  int
		wantOut int
		wantNil bool
	}{
		{
			label: "Valid SSE with usage in message_delta",
			stream: `data: {"type":"message_start"}
data: {"type":"content_block_delta","delta":{"text":"Hello"}}
data: {"type":"message_delta","usage":{"input_tokens":10,"output_tokens":20}}
data: {"type":"message_stop"}`,
			wantIn:  10,
			wantOut: 20,
			wantNil: false,
		},
		{
			label: "SSE without usage",
			stream: `data: {"type":"message_start"}
data: {"type":"message_stop"}`,
			wantNil: true,
		},
		{
			label:   "Empty SSE",
			stream:  ``,
			wantNil: true,
		},
		{
			label: "SSE with [DONE]",
			stream: `data: {"type":"content_block_delta","delta":{"text":"Hi"}}
data: [DONE]
data: {"type":"message_stop"}`,
			wantNil: true,
		},
	}

	for _, c := range cases {
		t.Run(c.label, func(t *testing.T) {
			usage := ev.extractSSETokenUsage([]byte(c.stream))

			if c.wantNil {
				if usage != nil {
					t.Errorf("Expected nil result, got %+v", usage)
				}
				return
			}

			// The field checks below dereference usage, so a nil is fatal.
			if usage == nil {
				t.Fatal("Expected non-nil result, got nil")
			}
			if got := usage.InputTokens; got != c.wantIn {
				t.Errorf("InputTokens: got %d, want %d", got, c.wantIn)
			}
			if got := usage.OutputTokens; got != c.wantOut {
				t.Errorf("OutputTokens: got %d, want %d", got, c.wantOut)
			}
		})
	}
}
// TestCompareResponseStructure feeds pairs of response bodies to
// compareResponseStructure. Bodies sharing the same top-level keys are
// expected to match even when the values differ; a differing key count,
// differing key names, or invalid JSON on either side are expected not to.
func TestCompareResponseStructure(t *testing.T) {
	ev := NewEvaluator("", "", "", "")

	cases := []struct {
		label     string
		zai       string
		anthropic string
		want      bool
	}{
		{
			label:     "Identical structure",
			zai:       `{"id":"msg_123","type":"message","content":[],"role":"assistant"}`,
			anthropic: `{"id":"msg_456","type":"message","content":[],"role":"assistant"}`,
			want:      true,
		},
		{
			label:     "Different number of keys",
			zai:       `{"id":"msg_123","type":"message"}`,
			anthropic: `{"id":"msg_456","type":"message","extra":"field"}`,
			want:      false,
		},
		{
			label:     "Different key names",
			zai:       `{"id":"msg_123","type":"message"}`,
			anthropic: `{"id":"msg_456","content":"message"}`,
			want:      false,
		},
		{
			label:     "Invalid JSON in zai",
			zai:       `{invalid}`,
			anthropic: `{"id":"msg_456"}`,
			want:      false,
		},
		{
			label:     "Invalid JSON in anthropic",
			zai:       `{"id":"msg_123"}`,
			anthropic: `{invalid}`,
			want:      false,
		},
	}

	for _, c := range cases {
		t.Run(c.label, func(t *testing.T) {
			got := ev.compareResponseStructure([]byte(c.zai), []byte(c.anthropic))
			if got != c.want {
				t.Errorf("compareResponseStructure() = %v, want %v", got, c.want)
			}
		})
	}
}
// TestCalculateMetrics passes three comparison results to calculateMetrics:
// two carrying token usage on both sides and one carrying none. It then checks
// the total count, the mean absolute token errors, the average percentage
// differences and the structure-match count.
func TestCalculateMetrics(t *testing.T) {
	ev := NewEvaluator("", "", "", "")

	// usage keeps the fixture below compact.
	usage := func(in, out int) *TokenUsage {
		return &TokenUsage{InputTokens: in, OutputTokens: out}
	}

	fixture := []ComparisonResult{
		{
			TestName:               "Test 1",
			ZaiResponse:            ResponseData{TokenUsage: usage(100, 50)},
			AnthropicResponse:      ResponseData{TokenUsage: usage(105, 52)},
			InputTokenDiff:         -5,
			OutputTokenDiff:        -2,
			InputTokenPercentDiff:  -5.0,
			OutputTokenPercentDiff: -4.0,
			ResponseStructureMatch: true,
		},
		{
			TestName:               "Test 2",
			ZaiResponse:            ResponseData{TokenUsage: usage(200, 100)},
			AnthropicResponse:      ResponseData{TokenUsage: usage(190, 95)},
			InputTokenDiff:         10,
			OutputTokenDiff:        5,
			InputTokenPercentDiff:  5.0,
			OutputTokenPercentDiff: 5.0,
			ResponseStructureMatch: true,
		},
		{
			TestName:               "Test 3 (no token data)",
			ZaiResponse:            ResponseData{},
			AnthropicResponse:      ResponseData{},
			ResponseStructureMatch: false,
		},
	}

	m := ev.calculateMetrics(fixture)

	if got := m.TotalTests; got != 3 {
		t.Errorf("TotalTests: got %d, want 3", got)
	}

	// Only the first two results carry token data, so the averages below are
	// taken over two samples.
	if got := m.InputTokenMAE; got != 7.5 { // (|-5| + |10|) / 2
		t.Errorf("InputTokenMAE: got %.2f, want 7.5", got)
	}
	if got := m.OutputTokenMAE; got != 3.5 { // (|-2| + |5|) / 2
		t.Errorf("OutputTokenMAE: got %.2f, want 3.5", got)
	}
	if got := m.InputTokenAvgPercentDiff; got != 5.0 {
		t.Errorf("InputTokenAvgPercentDiff: got %.2f, want 5.0", got)
	}
	if got := m.OutputTokenAvgPercentDiff; got != 4.5 {
		t.Errorf("OutputTokenAvgPercentDiff: got %.2f, want 4.5", got)
	}
	if got := m.StructureMatchCount; got != 2 {
		t.Errorf("StructureMatchCount: got %d, want 2", got)
	}
}
// TestReportGeneration builds a ReportGenerator from a single comparison
// result plus hand-written metrics and checks both output formats:
//
//   - the text report must be non-empty, at least 100 bytes long, and contain
//     each of the four expected section headings;
//   - the JSON report must be produced without error, be non-empty, decode as
//     a JSON object, and contain the four required top-level fields.
//
// Failures that make the following checks meaningless (JSON generation error,
// empty output, undecodable output) stop the test immediately so a single
// root cause is not buried under follow-on "missing field" errors.
func TestReportGeneration(t *testing.T) {
	results := []ComparisonResult{
		{
			TestName: "Sample Test",
			ZaiResponse: ResponseData{
				TokenUsage: &TokenUsage{InputTokens: 100, OutputTokens: 50},
				Duration:   100000000, // 100ms
			},
			AnthropicResponse: ResponseData{
				TokenUsage: &TokenUsage{InputTokens: 100, OutputTokens: 50},
				Duration:   90000000, // 90ms
			},
			InputTokenMatch:        true,
			OutputTokenMatch:       true,
			InputTokenDiff:         0,
			OutputTokenDiff:        0,
			InputTokenPercentDiff:  0.0,
			OutputTokenPercentDiff: 0.0,
			ResponseStructureMatch: true,
		},
	}
	metrics := &EvaluationMetrics{
		TotalTests:                1,
		SuccessfulTests:           1,
		InputTokenMAE:             0.0,
		OutputTokenMAE:            0.0,
		InputTokenAvgPercentDiff:  0.0,
		OutputTokenAvgPercentDiff: 0.0,
		StructureMatchCount:       1,
	}
	reporter := NewReportGenerator(results, metrics)

	// Text report: size sanity checks followed by section headings.
	textReport := reporter.GenerateTextReport()
	if textReport == "" {
		t.Error("GenerateTextReport() returned empty string")
	}
	if len(textReport) < 100 {
		t.Errorf("Text report too short: %d characters", len(textReport))
	}
	expectedSections := []string{
		"EXECUTIVE SUMMARY",
		"TOKEN ACCURACY METRICS",
		"DETAILED TEST RESULTS",
		"ANALYSIS AND RECOMMENDATIONS",
	}
	for _, section := range expectedSections {
		if !contains(textReport, section) {
			t.Errorf("Text report missing section: %s", section)
		}
	}

	// JSON report: each step is a precondition for the next, so failures
	// here are fatal rather than merely recorded.
	jsonReport, err := reporter.GenerateJSONReport()
	if err != nil {
		t.Fatalf("GenerateJSONReport() error: %v", err)
	}
	if len(jsonReport) == 0 {
		t.Fatal("JSON report is empty")
	}
	var jsonData map[string]interface{}
	if err := json.Unmarshal(jsonReport, &jsonData); err != nil {
		t.Fatalf("JSON report is invalid: %v", err)
	}

	// Verify required top-level fields.
	requiredFields := []string{"generated_at", "metrics", "test_results", "interpretation"}
	for _, field := range requiredFields {
		if _, ok := jsonData[field]; !ok {
			t.Errorf("JSON report missing field: %s", field)
		}
	}
}
// TestPatternAnalysis gives the report generator two results in which the
// Z.AI token counts exceed the Anthropic counts for both input and output,
// then checks that identifyPatterns mentions the "consistently reports
// higher" pattern for each token kind.
func TestPatternAnalysis(t *testing.T) {
	fixture := []ComparisonResult{
		{
			TestName: "Test 1",
			ZaiResponse: ResponseData{
				TokenUsage: &TokenUsage{InputTokens: 105, OutputTokens: 52},
			},
			AnthropicResponse: ResponseData{
				TokenUsage: &TokenUsage{InputTokens: 100, OutputTokens: 50},
			},
			InputTokenDiff:  5,
			OutputTokenDiff: 2,
		},
		{
			TestName: "Test 2",
			ZaiResponse: ResponseData{
				TokenUsage: &TokenUsage{InputTokens: 110, OutputTokens: 55},
			},
			AnthropicResponse: ResponseData{
				TokenUsage: &TokenUsage{InputTokens: 100, OutputTokens: 50},
			},
			InputTokenDiff:  10,
			OutputTokenDiff: 5,
		},
	}
	summary := &EvaluationMetrics{
		TotalTests:          2,
		SuccessfulTests:     2,
		InputTokenMAE:       7.5,
		OutputTokenMAE:      3.5,
		StructureMatchCount: 2,
	}

	patterns := NewReportGenerator(fixture, summary).identifyPatterns()

	// Both diffs are positive in every result, so both "higher" patterns
	// should be reported.
	expectations := []struct {
		phrase  string
		failure string
	}{
		{
			phrase:  "Z.AI consistently reports higher input tokens",
			failure: "Pattern analysis should detect Z.AI consistently higher for input tokens",
		},
		{
			phrase:  "Z.AI consistently reports higher output tokens",
			failure: "Pattern analysis should detect Z.AI consistently higher for output tokens",
		},
	}
	for _, exp := range expectations {
		if !contains(patterns, exp.phrase) {
			t.Error(exp.failure)
		}
	}
}
// TestRecommendations checks getRecommendations against a table of metric
// snapshots. Each case states whether any recommendation is expected and, if
// so, a keyword that at least one recommendation must contain.
//
// The expectRecommendation flag is honoured: when it is false the case
// asserts that no recommendations are returned and skips the keyword search.
// When it is true an empty result is fatal for that subtest, so the keyword
// search does not add a second, redundant failure.
func TestRecommendations(t *testing.T) {
	tests := []struct {
		name                 string
		metrics              *EvaluationMetrics
		expectRecommendation bool
		expectedKeyword      string
	}{
		{
			name: "High MAE should recommend tokenizer review",
			metrics: &EvaluationMetrics{
				TotalTests:          10,
				SuccessfulTests:     10,
				InputTokenMAE:       15.0,
				OutputTokenMAE:      2.0,
				StructureMatchCount: 10,
			},
			expectRecommendation: true,
			expectedKeyword:      "tokenizer",
		},
		{
			name: "Low success rate should recommend error handling review",
			metrics: &EvaluationMetrics{
				TotalTests:          10,
				SuccessfulTests:     8,
				InputTokenMAE:       2.0,
				OutputTokenMAE:      2.0,
				StructureMatchCount: 10,
			},
			expectRecommendation: true,
			expectedKeyword:      "failed",
		},
		{
			name: "Perfect metrics should recommend no action",
			metrics: &EvaluationMetrics{
				TotalTests:          10,
				SuccessfulTests:     10,
				InputTokenMAE:       0.0,
				OutputTokenMAE:      0.0,
				StructureMatchCount: 10,
			},
			expectRecommendation: true,
			expectedKeyword:      "no immediate action",
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			reporter := NewReportGenerator([]ComparisonResult{}, tc.metrics)
			recommendations := reporter.getRecommendations()

			if !tc.expectRecommendation {
				if len(recommendations) != 0 {
					t.Errorf("Expected no recommendations, got: %v", recommendations)
				}
				return
			}

			// With nothing to search, the keyword check below could only
			// repeat this failure.
			if len(recommendations) == 0 {
				t.Fatal("Expected at least one recommendation")
			}

			for _, rec := range recommendations {
				if contains(rec, tc.expectedKeyword) {
					return
				}
			}
			t.Errorf("Expected recommendation to contain '%s', got: %v", tc.expectedKeyword, recommendations)
		})
	}
}
// contains reports whether substr occurs anywhere within s. An empty substr
// is always found, including when s is itself empty.
//
// It delegates to strings.Contains, which returns the same results as a
// byte-by-byte recursive search without needing one stack frame per byte of s.
func contains(s, substr string) bool {
	return strings.Contains(s, substr)
}