// zai-proxy/proxy/evaluation/evaluator.go

package evaluation

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"sync"
	"time"
)

// TestRequest describes one named evaluation case: a label, the request
// payload to send, and a test-level streaming flag.
type TestRequest struct {
	// Name labels the case; it is serialized as "name".
	Name string `json:"name"`
	// Request is the payload sent to the endpoints under comparison.
	Request ClaudeRequest `json:"request"`
	// Stream is the test-level streaming flag, serialized as "stream".
	// NOTE(review): nothing in this file reads it; response parsing is driven
	// by Request.Stream instead — confirm whether the two are meant to agree.
	Stream bool `json:"stream"`
}

// ClaudeRequest is the JSON body posted to an Anthropic-style messages
// endpoint.
type ClaudeRequest struct {
	// Model is serialized as "model".
	Model string `json:"model"`
	// MaxTokens is serialized as "max_tokens".
	MaxTokens int `json:"max_tokens"`
	// Messages is the conversation, serialized as "messages".
	Messages []Message `json:"messages"`
	// Stream is serialized as "stream" and left out of the JSON when false.
	Stream bool `json:"stream,omitempty"`
}

// Message is a single conversation turn inside a ClaudeRequest.
type Message struct {
	// Role is serialized as "role".
	Role string `json:"role"`
	// Content is the turn's text, serialized as "content".
	Content string `json:"content"`
}

// TokenUsage holds the token counts reported in a response's usage object.
type TokenUsage struct {
	// InputTokens is read from / written to the "input_tokens" JSON field.
	InputTokens int `json:"input_tokens"`
	// OutputTokens is read from / written to the "output_tokens" JSON field.
	OutputTokens int `json:"output_tokens"`
}

// ResponseData records what a single endpoint call produced, whether it
// succeeded or failed.
type ResponseData struct {
	// StatusCode is the HTTP status of the response.
	StatusCode int
	// Body is the raw response body.
	Body []byte
	// Headers are the response headers.
	Headers http.Header
	// Duration is the elapsed time measured for the call.
	Duration time.Duration
	// TokenUsage is the usage parsed from Body; nil when none could be found.
	TokenUsage *TokenUsage
	// Error is non-nil when the call could not be completed.
	Error error
}

// ComparisonResult is the outcome of running one test case against both
// endpoints and comparing what they returned.
type ComparisonResult struct {
	// TestName is the name of the test case that produced this result.
	TestName string
	// ZaiResponse is what the Zai endpoint returned.
	ZaiResponse ResponseData
	// AnthropicResponse is what the Anthropic endpoint returned.
	AnthropicResponse ResponseData
	// InputTokenMatch is true when both endpoints reported the same input token count.
	InputTokenMatch bool
	// OutputTokenMatch is true when both endpoints reported the same output token count.
	OutputTokenMatch bool
	// InputTokenDiff is the Zai input token count minus the Anthropic one.
	InputTokenDiff int
	// OutputTokenDiff is the Zai output token count minus the Anthropic one.
	OutputTokenDiff int
	// InputTokenPercentDiff is InputTokenDiff as a percentage of the Anthropic input count.
	InputTokenPercentDiff float64
	// OutputTokenPercentDiff is OutputTokenDiff as a percentage of the Anthropic output count.
	OutputTokenPercentDiff float64
	// ResponseStructureMatch is true when both bodies passed the structural comparison.
	ResponseStructureMatch bool
}

// EvaluationMetrics aggregates a set of comparison results into summary
// numbers.
type EvaluationMetrics struct {
	// TotalTests is the number of results that were aggregated.
	TotalTests int
	// SuccessfulTests counts results where neither endpoint call recorded an error.
	SuccessfulTests int
	// InputTokenMAE is the mean absolute difference in input token counts.
	InputTokenMAE float64
	// OutputTokenMAE is the mean absolute difference in output token counts.
	OutputTokenMAE float64
	// InputTokenAvgPercentDiff is the mean absolute percentage difference in input tokens.
	InputTokenAvgPercentDiff float64
	// OutputTokenAvgPercentDiff is the mean absolute percentage difference in output tokens.
	OutputTokenAvgPercentDiff float64
	// StructureMatchCount counts results whose response structures matched.
	StructureMatchCount int
}

// Evaluator holds the endpoints, credentials and HTTP client used to run
// comparison tests.
type Evaluator struct {
	// ZaiEndpoint is the URL requests for the Zai side are posted to.
	ZaiEndpoint string
	// AnthropicEndpoint is the URL requests for the Anthropic side are posted to.
	AnthropicEndpoint string
	// ZaiAPIKey is the API key paired with ZaiEndpoint.
	ZaiAPIKey string
	// AnthropicAPIKey is the API key paired with AnthropicEndpoint.
	AnthropicAPIKey string
	// Client performs every outgoing request.
	Client *http.Client
}

// NewEvaluator builds an Evaluator for the given endpoint/key pairs. The
// returned evaluator owns a fresh HTTP client whose overall request timeout
// is two minutes.
func NewEvaluator(zaiEndpoint, anthropicEndpoint, zaiKey, anthropicKey string) *Evaluator {
	client := &http.Client{Timeout: 2 * time.Minute}

	ev := new(Evaluator)
	ev.ZaiEndpoint = zaiEndpoint
	ev.AnthropicEndpoint = anthropicEndpoint
	ev.ZaiAPIKey = zaiKey
	ev.AnthropicAPIKey = anthropicKey
	ev.Client = client
	return ev
}

// sendRequest POSTs req, JSON-encoded, to endpoint and captures the outcome.
//
// The request carries a JSON content type, apiKey in the "x-api-key" header
// and a fixed "anthropic-version" header. Failures are reported through the
// Error field of the returned ResponseData rather than a separate error
// value. The HTTP status is not interpreted here: a non-2xx response is
// returned like any other, with its status code, headers and body.
//
// Duration covers the time from entry until the body has been fully read (or
// until the request or read failed); it is left zero only when the request
// could not even be built. TokenUsage is nil when no usage could be parsed
// from the body.
func (e *Evaluator) sendRequest(endpoint, apiKey string, req ClaudeRequest) ResponseData {
	start := time.Now()

	reqBody, err := json.Marshal(req)
	if err != nil {
		return ResponseData{Error: fmt.Errorf("failed to marshal request: %w", err)}
	}

	httpReq, err := http.NewRequest(http.MethodPost, endpoint, bytes.NewReader(reqBody))
	if err != nil {
		return ResponseData{Error: fmt.Errorf("failed to create request: %w", err)}
	}

	httpReq.Header.Set("Content-Type", "application/json")
	httpReq.Header.Set("x-api-key", apiKey)
	httpReq.Header.Set("anthropic-version", "2023-06-01")

	resp, err := e.Client.Do(httpReq)
	if err != nil {
		// Keep the elapsed time: for timeouts and connection failures it is
		// often the most useful piece of diagnostic data.
		return ResponseData{
			Duration: time.Since(start),
			Error:    fmt.Errorf("request failed: %w", err),
		}
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		// Status and headers were already received; do not throw them away
		// just because the body could not be read to the end.
		return ResponseData{
			StatusCode: resp.StatusCode,
			Headers:    resp.Header,
			Duration:   time.Since(start),
			Error:      fmt.Errorf("failed to read response body: %w", err),
		}
	}

	// Stop the clock before parsing so Duration reflects the endpoint, not
	// local JSON/SSE decoding of the body.
	elapsed := time.Since(start)

	// Try to extract token usage from response
	tokenUsage := e.extractTokenUsage(body, req.Stream)

	return ResponseData{
		StatusCode: resp.StatusCode,
		Body:       body,
		Headers:    resp.Header,
		Duration:   elapsed,
		TokenUsage: tokenUsage,
	}
}

// extractTokenUsage picks the parser matching the response format: the SSE
// parser when isStreaming is true, the plain JSON parser otherwise. It
// returns whatever the chosen parser returns, including nil.
func (e *Evaluator) extractTokenUsage(body []byte, isStreaming bool) *TokenUsage {
	if !isStreaming {
		return e.extractJSONTokenUsage(body)
	}
	return e.extractSSETokenUsage(body)
}

// extractJSONTokenUsage reads the top-level "usage" object of a
// non-streaming JSON response.
//
// It returns nil when body is not valid JSON (a warning is logged) or when
// "usage" is missing or not an object. Within "usage", a count that is
// missing or not a JSON number is reported as 0.
func (e *Evaluator) extractJSONTokenUsage(body []byte) *TokenUsage {
	var payload map[string]interface{}
	err := json.Unmarshal(body, &payload)
	if err != nil {
		log.Printf("Warning: failed to parse JSON response: %v", err)
		return nil
	}

	usageFields, isObject := payload["usage"].(map[string]interface{})
	if !isObject {
		return nil
	}

	// JSON numbers decode as float64; anything else yields the zero value.
	tokenCount := func(key string) int {
		n, _ := usageFields[key].(float64)
		return int(n)
	}

	counts := new(TokenUsage)
	counts.InputTokens = tokenCount("input_tokens")
	counts.OutputTokens = tokenCount("output_tokens")
	return counts
}

// extractSSETokenUsage reconstructs token usage from a buffered streaming
// (server-sent events) response.
//
// In the Anthropic streaming format the input token count is reported in the
// "message_start" event (under message.usage), while "message_delta" events
// carry the cumulative output count. Both are therefore combined:
//
//   - input tokens come from message_start, unless a message_delta reports
//     "input_tokens" itself, in which case that later value is used;
//   - output tokens come from the last message_delta that reports them.
//
// It returns nil when the stream contains no message_delta event with a usage
// object, so a stream that never reached its final usage report is not
// mistaken for a complete one. "data:" lines are accepted with or without the
// optional space after the colon and with LF or CRLF line endings; lines that
// are empty, "[DONE]", or not valid JSON are skipped.
func (e *Evaluator) extractSSETokenUsage(body []byte) *TokenUsage {
	dataPrefix := []byte("data:")
	doneMarker := []byte("[DONE]")

	var inputTokens, outputTokens int
	sawFinalUsage := false

	for _, line := range bytes.Split(body, []byte("\n")) {
		if !bytes.HasPrefix(line, dataPrefix) {
			continue
		}

		// TrimSpace drops both the optional space after "data:" and a
		// trailing "\r" left over from CRLF line endings.
		payload := bytes.TrimSpace(bytes.TrimPrefix(line, dataPrefix))
		if len(payload) == 0 || bytes.Equal(payload, doneMarker) {
			continue
		}

		var event map[string]interface{}
		if err := json.Unmarshal(payload, &event); err != nil {
			continue
		}

		switch event["type"] {
		case "message_start":
			// Indexing a nil map is safe, so missing levels simply yield
			// no value.
			message, _ := event["message"].(map[string]interface{})
			usage, _ := message["usage"].(map[string]interface{})
			if n, ok := usage["input_tokens"].(float64); ok {
				inputTokens = int(n)
			}
		case "message_delta":
			usage, ok := event["usage"].(map[string]interface{})
			if !ok {
				continue
			}
			if n, ok := usage["input_tokens"].(float64); ok {
				inputTokens = int(n)
			}
			if n, ok := usage["output_tokens"].(float64); ok {
				outputTokens = int(n)
			}
			sawFinalUsage = true
		}
	}

	if !sawFinalUsage {
		return nil
	}
	return &TokenUsage{
		InputTokens:  inputTokens,
		OutputTokens: outputTokens,
	}
}

// RunTest sends test.Request to the Zai and Anthropic endpoints at the same
// time and compares what comes back.
//
// The returned result always carries both raw responses and the structural
// comparison of their bodies. Token match flags, differences (Zai minus
// Anthropic) and percentage differences are only filled in when both
// responses yielded token usage; a percentage is left at zero when the
// Anthropic count it would be relative to is not positive.
//
// NOTE(review): test.Stream is not consulted here; only test.Request (and
// therefore its own Stream field) reaches the endpoints — confirm intended.
func (e *Evaluator) RunTest(test TestRequest) ComparisonResult {
	log.Printf("Running test: %s", test.Name)

	var fromZai, fromAnthropic ResponseData
	var inFlight sync.WaitGroup

	// Each goroutine writes only its own destination, and Wait orders those
	// writes before the reads below.
	fetch := func(dst *ResponseData, endpoint, apiKey string) {
		defer inFlight.Done()
		*dst = e.sendRequest(endpoint, apiKey, test.Request)
	}

	inFlight.Add(2)
	go fetch(&fromZai, e.ZaiEndpoint, e.ZaiAPIKey)
	go fetch(&fromAnthropic, e.AnthropicEndpoint, e.AnthropicAPIKey)
	inFlight.Wait()

	outcome := ComparisonResult{
		TestName:          test.Name,
		ZaiResponse:       fromZai,
		AnthropicResponse: fromAnthropic,
	}

	// Top-level structure is compared regardless of whether usage was found.
	outcome.ResponseStructureMatch = e.compareResponseStructure(fromZai.Body, fromAnthropic.Body)

	zaiUsage, refUsage := fromZai.TokenUsage, fromAnthropic.TokenUsage
	if zaiUsage == nil || refUsage == nil {
		return outcome
	}

	outcome.InputTokenMatch = zaiUsage.InputTokens == refUsage.InputTokens
	outcome.OutputTokenMatch = zaiUsage.OutputTokens == refUsage.OutputTokens
	outcome.InputTokenDiff = zaiUsage.InputTokens - refUsage.InputTokens
	outcome.OutputTokenDiff = zaiUsage.OutputTokens - refUsage.OutputTokens

	// Percentages are relative to the Anthropic count, so they are only
	// defined when that count is positive.
	if refUsage.InputTokens > 0 {
		outcome.InputTokenPercentDiff = float64(outcome.InputTokenDiff) / float64(refUsage.InputTokens) * 100
	}
	if refUsage.OutputTokens > 0 {
		outcome.OutputTokenPercentDiff = float64(outcome.OutputTokenDiff) / float64(refUsage.OutputTokens) * 100
	}

	return outcome
}

// compareResponseStructure reports whether two response bodies are JSON
// objects with exactly the same set of top-level keys. Values and nested
// structure are not examined. A body that does not decode into a JSON object
// makes the comparison fail.
func (e *Evaluator) compareResponseStructure(zaiBody, anthropicBody []byte) bool {
	// topLevel decodes raw into its top-level fields; ok is false when raw
	// cannot be decoded that way.
	topLevel := func(raw []byte) (fields map[string]interface{}, ok bool) {
		if json.Unmarshal(raw, &fields) != nil {
			return nil, false
		}
		return fields, true
	}

	zaiFields, ok := topLevel(zaiBody)
	if !ok {
		return false
	}
	anthropicFields, ok := topLevel(anthropicBody)
	if !ok {
		return false
	}

	// Equal sizes plus "every Zai key exists on the Anthropic side" means the
	// two key sets are identical.
	if len(zaiFields) != len(anthropicFields) {
		return false
	}
	for name := range zaiFields {
		_, present := anthropicFields[name]
		if !present {
			return false
		}
	}
	return true
}

// RunTests runs every test case in order, one after another, and returns the
// per-test results (in the same order as tests) together with the metrics
// aggregated over them.
func (e *Evaluator) RunTests(tests []TestRequest) ([]ComparisonResult, *EvaluationMetrics) {
	results := make([]ComparisonResult, 0, len(tests))
	for _, tc := range tests {
		results = append(results, e.RunTest(tc))
	}
	return results, e.calculateMetrics(results)
}

// calculateMetrics is the unexported entry point for aggregation; it hands
// results straight to CalculateMetricsFromResults and returns its answer.
func (e *Evaluator) calculateMetrics(results []ComparisonResult) *EvaluationMetrics {
	metrics := e.CalculateMetricsFromResults(results)
	return metrics
}

// CalculateMetricsFromResults aggregates comparison results into summary
// metrics.
//
//   - TotalTests is len(results).
//   - SuccessfulTests counts results where neither response recorded an Error.
//   - StructureMatchCount counts results with ResponseStructureMatch set.
//   - InputTokenMAE / OutputTokenMAE are the mean of the absolute token
//     differences over every result where both responses carry token usage.
//   - InputTokenAvgPercentDiff / OutputTokenAvgPercentDiff are the mean of the
//     absolute percentage differences, taken only over results whose Anthropic
//     count is positive (the percentage is undefined otherwise).
//
// Averages with no contributing results are left at zero. The returned
// pointer is never nil.
func (e *Evaluator) CalculateMetricsFromResults(results []ComparisonResult) *EvaluationMetrics {
	metrics := &EvaluationMetrics{TotalTests: len(results)}

	var inputAbsSum, outputAbsSum float64
	var inputPctSum, outputPctSum float64
	var comparedPairs, inputPctCount, outputPctCount int

	for i := range results {
		result := &results[i]

		if result.ZaiResponse.Error == nil && result.AnthropicResponse.Error == nil {
			metrics.SuccessfulTests++
		}

		if result.ResponseStructureMatch {
			metrics.StructureMatchCount++
		}

		zaiUsage, anthropicUsage := result.ZaiResponse.TokenUsage, result.AnthropicResponse.TokenUsage
		if zaiUsage == nil || anthropicUsage == nil {
			continue
		}

		// Every compared pair contributes to the MAE, so every compared pair
		// must also be counted in its denominator — including pairs whose
		// Anthropic count is zero.
		comparedPairs++
		inputAbsSum += absFloat(float64(result.InputTokenDiff))
		outputAbsSum += absFloat(float64(result.OutputTokenDiff))

		// Percentages have their own denominators because they are only
		// defined for a positive reference count.
		if anthropicUsage.InputTokens > 0 {
			inputPctSum += absFloat(result.InputTokenPercentDiff)
			inputPctCount++
		}
		if anthropicUsage.OutputTokens > 0 {
			outputPctSum += absFloat(result.OutputTokenPercentDiff)
			outputPctCount++
		}
	}

	if comparedPairs > 0 {
		metrics.InputTokenMAE = inputAbsSum / float64(comparedPairs)
		metrics.OutputTokenMAE = outputAbsSum / float64(comparedPairs)
	}
	if inputPctCount > 0 {
		metrics.InputTokenAvgPercentDiff = inputPctSum / float64(inputPctCount)
	}
	if outputPctCount > 0 {
		metrics.OutputTokenAvgPercentDiff = outputPctSum / float64(outputPctCount)
	}

	return metrics
}

// absFloat returns the magnitude of x: x itself when it is not below zero,
// otherwise its negation.
func absFloat(x float64) float64 {
	if x >= 0 {
		return x
	}
	return -x
}