// zai-proxy/proxy/evaluation/evaluator_test.go

package evaluation

import (
	"encoding/json"
	"strings"
	"testing"
)

// TestGetTestCases checks the suite returned by GetTestCases: it must hold at
// least ten entries, and every entry needs a name, a model, at least one
// message, and a positive MaxTokens value.
func TestGetTestCases(t *testing.T) {
	cases := GetTestCases()

	if n := len(cases); n < 10 {
		t.Errorf("Expected at least 10 test cases, got %d", n)
	}

	for idx, c := range cases {
		// Each field is validated independently so a single run reports every
		// problem with a given case.
		if c.Name == "" {
			t.Errorf("Test case %d: missing name", idx)
		}

		req := c.Request
		if req.Model == "" {
			t.Errorf("Test case %d (%s): missing model", idx, c.Name)
		}
		if len(req.Messages) == 0 {
			t.Errorf("Test case %d (%s): no messages", idx, c.Name)
		}
		if req.MaxTokens <= 0 {
			t.Errorf("Test case %d (%s): invalid max_tokens", idx, c.Name)
		}
	}
}

// TestEvaluatorCreation confirms that NewEvaluator returns a non-nil evaluator
// whose ZaiEndpoint and ZaiAPIKey mirror the first and third constructor
// arguments and whose Client is set.
func TestEvaluatorCreation(t *testing.T) {
	const (
		zaiEndpoint = "http://localhost:8080"
		zaiKey      = "test-key-1"
	)

	// The second and fourth arguments are presumably the Anthropic endpoint and
	// key; this test does not assert on them.
	ev := NewEvaluator(zaiEndpoint, "https://api.anthropic.com", zaiKey, "test-key-2")
	if ev == nil {
		t.Fatal("NewEvaluator returned nil")
	}

	if got := ev.ZaiEndpoint; got != zaiEndpoint {
		t.Errorf("Expected ZaiEndpoint 'http://localhost:8080', got '%s'", got)
	}
	if got := ev.ZaiAPIKey; got != zaiKey {
		t.Errorf("Expected ZaiAPIKey 'test-key-1', got '%s'", got)
	}
	if ev.Client == nil {
		t.Error("Client is nil")
	}
}

// TestExtractJSONTokenUsage drives extractJSONTokenUsage with a table of JSON
// bodies. Bodies carrying a "usage" object must yield the listed token counts;
// bodies without usage, malformed JSON, and the empty body must yield nil.
func TestExtractJSONTokenUsage(t *testing.T) {
	e := NewEvaluator("", "", "", "")

	cases := []struct {
		name    string
		body    string
		wantIn  int
		wantOut int
		wantNil bool
	}{
		{
			name:    "Valid response with usage",
			body:    `{"id":"msg_123","type":"message","usage":{"input_tokens":100,"output_tokens":50}}`,
			wantIn:  100,
			wantOut: 50,
		},
		{
			name:    "Response with zero tokens",
			body:    `{"id":"msg_123","usage":{"input_tokens":0,"output_tokens":0}}`,
			wantIn:  0,
			wantOut: 0,
		},
		{
			name:    "Response without usage",
			body:    `{"id":"msg_123","type":"message"}`,
			wantNil: true,
		},
		{
			name:    "Invalid JSON",
			body:    `{invalid json}`,
			wantNil: true,
		},
		{
			name:    "Empty body",
			body:    ``,
			wantNil: true,
		},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			usage := e.extractJSONTokenUsage([]byte(c.body))

			if c.wantNil {
				if usage != nil {
					t.Errorf("Expected nil result, got %+v", usage)
				}
				return
			}

			// Stop here on nil so the field checks below cannot dereference it.
			if usage == nil {
				t.Fatal("Expected non-nil result, got nil")
			}
			if usage.InputTokens != c.wantIn {
				t.Errorf("InputTokens: got %d, want %d", usage.InputTokens, c.wantIn)
			}
			if usage.OutputTokens != c.wantOut {
				t.Errorf("OutputTokens: got %d, want %d", usage.OutputTokens, c.wantOut)
			}
		})
	}
}

// TestExtractSSETokenUsage drives extractSSETokenUsage with newline-separated
// "data:" event streams. A stream whose message_delta event carries a "usage"
// object must yield those counts; streams without usage, the empty stream, and
// a stream containing a [DONE] marker must yield nil.
func TestExtractSSETokenUsage(t *testing.T) {
	e := NewEvaluator("", "", "", "")

	cases := []struct {
		name    string
		body    string
		wantIn  int
		wantOut int
		wantNil bool
	}{
		{
			name: "Valid SSE with usage in message_delta",
			body: `data: {"type":"message_start"}` + "\n" +
				`data: {"type":"content_block_delta","delta":{"text":"Hello"}}` + "\n" +
				`data: {"type":"message_delta","usage":{"input_tokens":10,"output_tokens":20}}` + "\n" +
				`data: {"type":"message_stop"}`,
			wantIn:  10,
			wantOut: 20,
		},
		{
			name: "SSE without usage",
			body: `data: {"type":"message_start"}` + "\n" +
				`data: {"type":"message_stop"}`,
			wantNil: true,
		},
		{
			name:    "Empty SSE",
			body:    ``,
			wantNil: true,
		},
		{
			name: "SSE with [DONE]",
			body: `data: {"type":"content_block_delta","delta":{"text":"Hi"}}` + "\n" +
				`data: [DONE]` + "\n" +
				`data: {"type":"message_stop"}`,
			wantNil: true,
		},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			usage := e.extractSSETokenUsage([]byte(c.body))

			if c.wantNil {
				if usage != nil {
					t.Errorf("Expected nil result, got %+v", usage)
				}
				return
			}

			// Stop here on nil so the field checks below cannot dereference it.
			if usage == nil {
				t.Fatal("Expected non-nil result, got nil")
			}
			if usage.InputTokens != c.wantIn {
				t.Errorf("InputTokens: got %d, want %d", usage.InputTokens, c.wantIn)
			}
			if usage.OutputTokens != c.wantOut {
				t.Errorf("OutputTokens: got %d, want %d", usage.OutputTokens, c.wantOut)
			}
		})
	}
}

// TestCompareResponseStructure feeds pairs of JSON bodies to
// compareResponseStructure. Bodies with the same top-level keys are expected to
// match; a differing key count, differing key names, or invalid JSON on either
// side is expected not to match.
func TestCompareResponseStructure(t *testing.T) {
	e := NewEvaluator("", "", "", "")

	cases := []struct {
		name      string
		zai       string
		anthropic string
		want      bool
	}{
		{
			name:      "Identical structure",
			zai:       `{"id":"msg_123","type":"message","content":[],"role":"assistant"}`,
			anthropic: `{"id":"msg_456","type":"message","content":[],"role":"assistant"}`,
			want:      true,
		},
		{
			name:      "Different number of keys",
			zai:       `{"id":"msg_123","type":"message"}`,
			anthropic: `{"id":"msg_456","type":"message","extra":"field"}`,
			want:      false,
		},
		{
			name:      "Different key names",
			zai:       `{"id":"msg_123","type":"message"}`,
			anthropic: `{"id":"msg_456","content":"message"}`,
			want:      false,
		},
		{
			name:      "Invalid JSON in zai",
			zai:       `{invalid}`,
			anthropic: `{"id":"msg_456"}`,
			want:      false,
		},
		{
			name:      "Invalid JSON in anthropic",
			zai:       `{"id":"msg_123"}`,
			anthropic: `{invalid}`,
			want:      false,
		},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			got := e.compareResponseStructure([]byte(c.zai), []byte(c.anthropic))
			if got != c.want {
				t.Errorf("compareResponseStructure() = %v, want %v", got, c.want)
			}
		})
	}
}

// TestCalculateMetrics passes three comparison results to calculateMetrics (two
// with token usage on both sides, one with none) and checks the aggregate
// values: total count, mean absolute token error, average percent difference,
// and the number of structure matches.
//
// Floating-point aggregates are compared within a small tolerance rather than
// with ==, so the test does not depend on the exact order of arithmetic used
// by the implementation.
func TestCalculateMetrics(t *testing.T) {
	e := NewEvaluator("", "", "", "")

	// approxEqual reports whether got is within tolerance of want. A NaN input
	// makes every comparison false, so NaN is reported as a mismatch.
	const tolerance = 1e-9
	approxEqual := func(got, want float64) bool {
		diff := got - want
		if diff < 0 {
			diff = -diff
		}
		return diff <= tolerance
	}

	results := []ComparisonResult{
		{
			TestName: "Test 1",
			ZaiResponse: ResponseData{
				TokenUsage: &TokenUsage{InputTokens: 100, OutputTokens: 50},
			},
			AnthropicResponse: ResponseData{
				TokenUsage: &TokenUsage{InputTokens: 105, OutputTokens: 52},
			},
			InputTokenDiff:         -5,
			OutputTokenDiff:        -2,
			InputTokenPercentDiff:  -5.0,
			OutputTokenPercentDiff: -4.0,
			ResponseStructureMatch: true,
		},
		{
			TestName: "Test 2",
			ZaiResponse: ResponseData{
				TokenUsage: &TokenUsage{InputTokens: 200, OutputTokens: 100},
			},
			AnthropicResponse: ResponseData{
				TokenUsage: &TokenUsage{InputTokens: 190, OutputTokens: 95},
			},
			InputTokenDiff:         10,
			OutputTokenDiff:        5,
			InputTokenPercentDiff:  5.0,
			OutputTokenPercentDiff: 5.0,
			ResponseStructureMatch: true,
		},
		{
			TestName:               "Test 3 (no token data)",
			ZaiResponse:            ResponseData{},
			AnthropicResponse:      ResponseData{},
			ResponseStructureMatch: false,
		},
	}

	metrics := e.calculateMetrics(results)

	if metrics.TotalTests != 3 {
		t.Errorf("TotalTests: got %d, want 3", metrics.TotalTests)
	}

	// Only the first two results carry token data, so the averages below are
	// taken over two samples.
	if !approxEqual(metrics.InputTokenMAE, 7.5) { // (5 + 10) / 2
		t.Errorf("InputTokenMAE: got %.2f, want 7.5", metrics.InputTokenMAE)
	}

	if !approxEqual(metrics.OutputTokenMAE, 3.5) { // (2 + 5) / 2
		t.Errorf("OutputTokenMAE: got %.2f, want 3.5", metrics.OutputTokenMAE)
	}

	if !approxEqual(metrics.InputTokenAvgPercentDiff, 5.0) { // (5 + 5) / 2
		t.Errorf("InputTokenAvgPercentDiff: got %.2f, want 5.0", metrics.InputTokenAvgPercentDiff)
	}

	if !approxEqual(metrics.OutputTokenAvgPercentDiff, 4.5) { // (4 + 5) / 2
		t.Errorf("OutputTokenAvgPercentDiff: got %.2f, want 4.5", metrics.OutputTokenAvgPercentDiff)
	}

	if metrics.StructureMatchCount != 2 {
		t.Errorf("StructureMatchCount: got %d, want 2", metrics.StructureMatchCount)
	}
}

// TestReportGeneration builds a report generator from one fully matching
// comparison result and checks both output formats: the text report must be
// non-trivial and contain the four section headings, and the JSON report must
// parse and expose the required top-level fields.
func TestReportGeneration(t *testing.T) {
	// All diff and percent-diff fields are left at their zero values: both
	// sides report identical token counts.
	sample := ComparisonResult{
		TestName: "Sample Test",
		ZaiResponse: ResponseData{
			TokenUsage: &TokenUsage{InputTokens: 100, OutputTokens: 50},
			Duration:   100000000, // 100ms
		},
		AnthropicResponse: ResponseData{
			TokenUsage: &TokenUsage{InputTokens: 100, OutputTokens: 50},
			Duration:   90000000, // 90ms
		},
		InputTokenMatch:        true,
		OutputTokenMatch:       true,
		ResponseStructureMatch: true,
	}

	// MAE and average-percent-diff fields stay at zero for a perfect match.
	summary := &EvaluationMetrics{
		TotalTests:          1,
		SuccessfulTests:     1,
		StructureMatchCount: 1,
	}

	reporter := NewReportGenerator([]ComparisonResult{sample}, summary)

	// --- Text report ---
	text := reporter.GenerateTextReport()
	if text == "" {
		t.Error("GenerateTextReport() returned empty string")
	}
	if n := len(text); n < 100 {
		t.Errorf("Text report too short: %d characters", n)
	}

	for _, heading := range []string{
		"EXECUTIVE SUMMARY",
		"TOKEN ACCURACY METRICS",
		"DETAILED TEST RESULTS",
		"ANALYSIS AND RECOMMENDATIONS",
	} {
		if !contains(text, heading) {
			t.Errorf("Text report missing section: %s", heading)
		}
	}

	// --- JSON report ---
	raw, err := reporter.GenerateJSONReport()
	if err != nil {
		t.Errorf("GenerateJSONReport() error: %v", err)
	}
	if len(raw) == 0 {
		t.Error("JSON report is empty")
	}

	var decoded map[string]interface{}
	if err := json.Unmarshal(raw, &decoded); err != nil {
		t.Errorf("JSON report is invalid: %v", err)
	}

	// Lookups on a nil map are safe, so these checks still run (and fail)
	// when decoding did not succeed.
	for _, key := range []string{"generated_at", "metrics", "test_results", "interpretation"} {
		if _, present := decoded[key]; !present {
			t.Errorf("JSON report missing field: %s", key)
		}
	}
}

// TestPatternAnalysis tests pattern identification
func TestPatternAnalysis(t *testing.T) {
	results := []ComparisonResult{
		{
			TestName: "Test 1",
			ZaiResponse: ResponseData{
				TokenUsage: &TokenUsage{InputTokens: 105, OutputTokens: 52},
			},
			AnthropicResponse: ResponseData{
				TokenUsage: &TokenUsage{InputTokens: 100, OutputTokens: 50},
			},
			InputTokenDiff:  5,
			OutputTokenDiff: 2,
		},
		{
			TestName: "Test 2",
			ZaiResponse: ResponseData{
				TokenUsage: &TokenUsage{InputTokens: 110, OutputTokens: 55},
			},
			AnthropicResponse: ResponseData{
				TokenUsage: &TokenUsage{InputTokens: 100, OutputTokens: 50},
			},
			InputTokenDiff:  10,
			OutputTokenDiff: 5,
		},
	}

	metrics := &EvaluationMetrics{
		TotalTests:          2,
		SuccessfulTests:     2,
		InputTokenMAE:       7.5,
		OutputTokenMAE:      3.5,
		StructureMatchCount: 2,
	}

	reporter := NewReportGenerator(results, metrics)
	patterns := reporter.identifyPatterns()

	// Should detect Z.AI consistently higher
	if !contains(patterns, "Z.AI consistently reports higher input tokens") {
		t.Error("Pattern analysis should detect Z.AI consistently higher for input tokens")
	}

	if !contains(patterns, "Z.AI consistently reports higher output tokens") {
		t.Error("Pattern analysis should detect Z.AI consistently higher for output tokens")
	}
}

// TestRecommendations builds a report generator from each metrics fixture (with
// no individual results) and checks that getRecommendations returns at least
// one entry and that some entry contains the expected keyword.
func TestRecommendations(t *testing.T) {
	tests := []struct {
		name            string
		metrics         *EvaluationMetrics
		expectedKeyword string
	}{
		{
			name: "High MAE should recommend tokenizer review",
			metrics: &EvaluationMetrics{
				TotalTests:          10,
				SuccessfulTests:     10,
				InputTokenMAE:       15.0,
				OutputTokenMAE:      2.0,
				StructureMatchCount: 10,
			},
			expectedKeyword: "tokenizer",
		},
		{
			name: "Low success rate should recommend error handling review",
			metrics: &EvaluationMetrics{
				TotalTests:          10,
				SuccessfulTests:     8,
				InputTokenMAE:       2.0,
				OutputTokenMAE:      2.0,
				StructureMatchCount: 10,
			},
			expectedKeyword: "failed",
		},
		{
			name: "Perfect metrics should recommend no action",
			metrics: &EvaluationMetrics{
				TotalTests:          10,
				SuccessfulTests:     10,
				InputTokenMAE:       0.0,
				OutputTokenMAE:      0.0,
				StructureMatchCount: 10,
			},
			expectedKeyword: "no immediate action",
		},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			reporter := NewReportGenerator([]ComparisonResult{}, tc.metrics)
			recommendations := reporter.getRecommendations()

			// With nothing to search, the keyword check below could only add a
			// second, redundant failure, so stop this subtest here.
			if len(recommendations) == 0 {
				t.Fatal("Expected at least one recommendation")
			}

			found := false
			for _, rec := range recommendations {
				if contains(rec, tc.expectedKeyword) {
					found = true
					break
				}
			}

			if !found {
				t.Errorf("Expected recommendation to contain '%s', got: %v", tc.expectedKeyword, recommendations)
			}
		})
	}
}

// contains reports whether substr occurs anywhere within s. An empty substr
// matches every string, including the empty string.
//
// It delegates to strings.Contains instead of a hand-rolled recursive scan, so
// stack depth no longer grows with the length of s.
func contains(s, substr string) bool {
	return strings.Contains(s, substr)
}