zai-proxy/proxy/cmd/demo-eval/main.go
jedarden dee82a76a3 chore: update module paths and add evaluation package
- proxy/go.mod: github.com/ardenone/zai-proxy → git.ardenone.com/jedarden/zai-proxy
- dashboard/go.mod: github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard → git.ardenone.com/jedarden/zai-proxy/dashboard
- Update all Go import paths in proxy/ and dashboard/ to match new module paths
- Add proxy/evaluation/ package (was missing from initial commit)
- Add docs/plan/plan.md with architecture, security model, telemetry design, and migration checklist

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-16 16:03:50 -04:00

125 lines
3.9 KiB
Go

package main
import (
"fmt"
"log"
"os"
"time"
"git.ardenone.com/jedarden/zai-proxy/evaluation"
)
// main runs the evaluation pipeline against simulated data: it builds fake
// comparison results, computes metrics from them, writes a text report and a
// JSON report to the current working directory, and prints the text report to
// standard output. Any failure to generate or write a report terminates the
// process via log.Fatalf.
func main() {
	const (
		textReportPath = "evaluation-report.txt"
		jsonReportPath = "evaluation-report.json"
	)

	// Banner.
	for _, line := range []string{
		"🔍 Z.AI Proxy Evaluation Framework - Demo Mode",
		"==============================================",
		"Running simulated evaluation without real API calls...",
	} {
		log.Println(line)
	}
	log.Println()

	// Build the simulated data set and derive metrics from it. The evaluator
	// is constructed with empty arguments because it is only used here to
	// compute metrics from the already-built results.
	simulated := generateSimulatedResults()
	evaluator := evaluation.NewEvaluator("", "", "", "")
	metrics := evaluator.CalculateMetricsFromResults(simulated)

	log.Println("Generating reports...")
	reporter := evaluation.NewReportGenerator(simulated, metrics)

	// Text report: render, persist, confirm.
	textReport := reporter.GenerateTextReport()
	writeErr := os.WriteFile(textReportPath, []byte(textReport), 0644)
	if writeErr != nil {
		log.Fatalf("Failed to save text report: %v", writeErr)
	}
	log.Println("✓ Text report saved to: evaluation-report.txt")

	// JSON report: render, persist, confirm.
	jsonReport, genErr := reporter.GenerateJSONReport()
	if genErr != nil {
		log.Fatalf("Failed to generate JSON report: %v", genErr)
	}
	writeErr = os.WriteFile(jsonReportPath, jsonReport, 0644)
	if writeErr != nil {
		log.Fatalf("Failed to save JSON report: %v", writeErr)
	}
	log.Println("✓ JSON report saved to: evaluation-report.json")

	// Echo the text report on stdout, preceded by a blank line.
	fmt.Printf("\n%s\n", textReport)

	// Closing hints on how to run against real endpoints.
	for _, line := range []string{
		"\n✓ Evaluation complete!",
		"To run with real endpoints:",
		" export ZAI_API_KEY=your-zai-key",
		" export ANTHROPIC_API_KEY=your-anthropic-key",
		" go run cmd/evaluate/main.go -zai-endpoint http://localhost:8080/v1/messages",
	} {
		log.Println(line)
	}
}
// generateSimulatedResults builds synthetic comparison results for the demo,
// one per test case returned by evaluation.GetTestCases, without making any
// API calls.
//
// Each test case is paired, by index, with a hard-coded discrepancy scenario.
// The Anthropic side of every result reports the base token counts (derived
// from the index) and the Z.AI side reports base + scenario diff, so a
// positive diff means the Z.AI count is higher and a negative diff means it is
// lower.
//
// The returned slice contains only fully populated results: its length is the
// smaller of the number of test cases and the number of scenarios. Test cases
// beyond the last scenario are omitted rather than returned as zero-valued
// placeholders (which would carry a nil TokenUsage and an empty TestName into
// metrics and report generation).
func generateSimulatedResults() []evaluation.ComparisonResult {
	tests := evaluation.GetTestCases()

	// Simulated scenarios:
	//   - perfect matches (both diffs zero)
	//   - small and medium discrepancies where Z.AI reports more tokens
	//   - one case where Z.AI reports fewer input tokens
	scenarios := []struct {
		inputDiff   int  // Z.AI input tokens minus Anthropic input tokens
		outputDiff  int  // Z.AI output tokens minus Anthropic output tokens
		structMatch bool // value copied into ResponseStructureMatch
	}{
		{0, 0, true},  // Perfect match
		{2, 1, true},  // Small discrepancy
		{-1, 0, true}, // Z.AI undercounts
		{5, 3, true},  // Medium discrepancy
		{0, 0, true},  // Perfect match
		{1, 2, true},  // Small discrepancy (streaming)
		{3, 1, true},  // Medium discrepancy
		{0, 1, true},  // Nearly perfect
		{0, 0, true},  // Perfect match
		{4, 2, true},  // Medium discrepancy
		{0, 0, true},  // Perfect match
		{2, 1, true},  // Small discrepancy
		{1, 3, true},  // Small discrepancy (streaming)
	}

	// Allocate by capacity and append, so that stopping early (more tests
	// than scenarios) never leaves zero-valued entries in the result.
	capacity := len(tests)
	if len(scenarios) < capacity {
		capacity = len(scenarios)
	}
	results := make([]evaluation.ComparisonResult, 0, capacity)

	for i, test := range tests {
		if i >= len(scenarios) {
			break
		}
		scenario := scenarios[i]

		// Base token counts grow with the test index; both are always
		// positive, so the percentage computations below never divide by zero.
		baseInput := (i+1)*10 + 20
		baseOutput := (i+1)*5 + 15

		results = append(results, evaluation.ComparisonResult{
			TestName: test.Name,
			ZaiResponse: evaluation.ResponseData{
				StatusCode: 200,
				Duration:   time.Second + time.Duration(i*100)*time.Millisecond,
				TokenUsage: &evaluation.TokenUsage{
					InputTokens:  baseInput + scenario.inputDiff,
					OutputTokens: baseOutput + scenario.outputDiff,
				},
			},
			AnthropicResponse: evaluation.ResponseData{
				StatusCode: 200,
				Duration:   time.Duration(900+i*100) * time.Millisecond,
				TokenUsage: &evaluation.TokenUsage{
					InputTokens:  baseInput,
					OutputTokens: baseOutput,
				},
			},
			InputTokenMatch:        scenario.inputDiff == 0,
			OutputTokenMatch:       scenario.outputDiff == 0,
			InputTokenDiff:         scenario.inputDiff,
			OutputTokenDiff:        scenario.outputDiff,
			InputTokenPercentDiff:  float64(scenario.inputDiff) / float64(baseInput) * 100,
			OutputTokenPercentDiff: float64(scenario.outputDiff) / float64(baseOutput) * 100,
			ResponseStructureMatch: scenario.structMatch,
		})
	}
	return results
}