- proxy/go.mod: github.com/ardenone/zai-proxy → git.ardenone.com/jedarden/zai-proxy - dashboard/go.mod: github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard → git.ardenone.com/jedarden/zai-proxy/dashboard - Update all Go import paths in proxy/ and dashboard/ to match new module paths - Add proxy/evaluation/ package (was missing from initial commit) - Add docs/plan/plan.md with architecture, security model, telemetry design, and migration checklist Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
500 lines
20 KiB
Go
500 lines
20 KiB
Go
package evaluation
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"text/template"
|
|
"time"
|
|
)
|
|
|
|
// ReportGenerator creates evaluation reports
|
|
type ReportGenerator struct {
|
|
results []ComparisonResult
|
|
metrics *EvaluationMetrics
|
|
}
|
|
|
|
// NewReportGenerator creates a new report generator
|
|
func NewReportGenerator(results []ComparisonResult, metrics *EvaluationMetrics) *ReportGenerator {
|
|
return &ReportGenerator{
|
|
results: results,
|
|
metrics: metrics,
|
|
}
|
|
}
|
|
|
|
// GenerateTextReport creates a text-based evaluation report
|
|
func (rg *ReportGenerator) GenerateTextReport() string {
|
|
var buf bytes.Buffer
|
|
|
|
buf.WriteString("╔══════════════════════════════════════════════════════════════════════════════╗\n")
|
|
buf.WriteString("║ Z.AI PROXY EVALUATION REPORT ║\n")
|
|
buf.WriteString(fmt.Sprintf("║ Generated: %s ║\n", time.Now().Format("2006-01-02 15:04:05")))
|
|
buf.WriteString("╚══════════════════════════════════════════════════════════════════════════════╝\n\n")
|
|
|
|
// Executive Summary
|
|
buf.WriteString("## EXECUTIVE SUMMARY\n\n")
|
|
buf.WriteString(fmt.Sprintf("Total Tests Run: %d\n", rg.metrics.TotalTests))
|
|
buf.WriteString(fmt.Sprintf("Successful Tests: %d (%.1f%%)\n", rg.metrics.SuccessfulTests, float64(rg.metrics.SuccessfulTests)/float64(rg.metrics.TotalTests)*100))
|
|
buf.WriteString(fmt.Sprintf("Structure Match Rate: %d (%.1f%%)\n", rg.metrics.StructureMatchCount, float64(rg.metrics.StructureMatchCount)/float64(rg.metrics.TotalTests)*100))
|
|
buf.WriteString("\n")
|
|
|
|
// Token Accuracy Metrics
|
|
buf.WriteString("## TOKEN ACCURACY METRICS\n\n")
|
|
buf.WriteString("┌────────────────────────┬──────────────┬──────────────┬──────────────────┐\n")
|
|
buf.WriteString("│ Metric │ Input Tokens │ Output Tokens │ Difference │\n")
|
|
buf.WriteString("├────────────────────────┼──────────────┼──────────────┼──────────────────┤\n")
|
|
|
|
if rg.metrics.InputTokenMAE > 0 {
|
|
buf.WriteString(fmt.Sprintf("│ Mean Absolute Error │ %12.2f │ %12.2f │ │\n", rg.metrics.InputTokenMAE, rg.metrics.OutputTokenMAE))
|
|
}
|
|
if rg.metrics.InputTokenAvgPercentDiff > 0 {
|
|
buf.WriteString(fmt.Sprintf("│ Avg Percent Diff │ %11.2f%% │ %11.2f%% │ │\n", rg.metrics.InputTokenAvgPercentDiff, rg.metrics.OutputTokenAvgPercentDiff))
|
|
}
|
|
|
|
buf.WriteString("└────────────────────────┴──────────────┴──────────────┴──────────────────┘\n\n")
|
|
|
|
// Detailed Test Results
|
|
buf.WriteString("## DETAILED TEST RESULTS\n\n")
|
|
|
|
for i, result := range rg.results {
|
|
buf.WriteString(fmt.Sprintf("### Test %d: %s\n\n", i+1, result.TestName))
|
|
|
|
// Status
|
|
zaiStatus := "✓ OK"
|
|
if result.ZaiResponse.Error != nil {
|
|
zaiStatus = fmt.Sprintf("✗ Error: %s", result.ZaiResponse.Error)
|
|
}
|
|
anthropicStatus := "✓ OK"
|
|
if result.AnthropicResponse.Error != nil {
|
|
anthropicStatus = fmt.Sprintf("✗ Error: %s", result.AnthropicResponse.Error)
|
|
}
|
|
|
|
buf.WriteString(fmt.Sprintf("Z.AI Status: %s\n", zaiStatus))
|
|
buf.WriteString(fmt.Sprintf("Anthropic Status: %s\n", anthropicStatus))
|
|
|
|
// Response times
|
|
buf.WriteString(fmt.Sprintf("Z.AI Response: %v\n", result.ZaiResponse.Duration))
|
|
buf.WriteString(fmt.Sprintf("Anthropic Response: %v\n", result.AnthropicResponse.Duration))
|
|
|
|
// Token comparison
|
|
if result.ZaiResponse.TokenUsage != nil && result.AnthropicResponse.TokenUsage != nil {
|
|
buf.WriteString("\nToken Comparison:\n")
|
|
buf.WriteString("┌─────────────────────┬──────────┬──────────┬──────────┬────────────┐\n")
|
|
buf.WriteString("│ Direction │ Z.AI │ Anthropic│ Diff │ %% Diff │\n")
|
|
buf.WriteString("├─────────────────────┼──────────┼──────────┼──────────┼────────────┤\n")
|
|
buf.WriteString(fmt.Sprintf("│ Input Tokens │ %8d │ %8d │ %8d │ %9.2f%% │\n",
|
|
result.ZaiResponse.TokenUsage.InputTokens,
|
|
result.AnthropicResponse.TokenUsage.InputTokens,
|
|
result.InputTokenDiff,
|
|
result.InputTokenPercentDiff))
|
|
buf.WriteString(fmt.Sprintf("│ Output Tokens │ %8d │ %8d │ %8d │ %9.2f%% │\n",
|
|
result.ZaiResponse.TokenUsage.OutputTokens,
|
|
result.AnthropicResponse.TokenUsage.OutputTokens,
|
|
result.OutputTokenDiff,
|
|
result.OutputTokenPercentDiff))
|
|
buf.WriteString("└─────────────────────┴──────────┴──────────┴──────────┴────────────┘\n")
|
|
|
|
// Match indicators
|
|
inputMatch := "✓"
|
|
if !result.InputTokenMatch {
|
|
inputMatch = "✗"
|
|
}
|
|
outputMatch := "✓"
|
|
if !result.OutputTokenMatch {
|
|
outputMatch = "✗"
|
|
}
|
|
buf.WriteString(fmt.Sprintf("\nInput Tokens Match: %s Output Tokens Match: %s\n", inputMatch, outputMatch))
|
|
} else {
|
|
buf.WriteString("\n⚠ Token usage data not available for comparison\n")
|
|
if result.ZaiResponse.TokenUsage == nil {
|
|
buf.WriteString(" - Z.AI token usage: Not available\n")
|
|
}
|
|
if result.AnthropicResponse.TokenUsage == nil {
|
|
buf.WriteString(" - Anthropic token usage: Not available\n")
|
|
}
|
|
}
|
|
|
|
// Structure match
|
|
structureMatch := "✓"
|
|
if !result.ResponseStructureMatch {
|
|
structureMatch = "✗"
|
|
}
|
|
buf.WriteString(fmt.Sprintf("Structure Match: %s\n\n", structureMatch))
|
|
|
|
// Response snippets (truncated)
|
|
if len(result.ZaiResponse.Body) > 0 && len(result.AnthropicResponse.Body) > 0 {
|
|
buf.WriteString("Response Preview:\n")
|
|
buf.WriteString("Z.AI Response:\n")
|
|
buf.WriteString(formatJSONPreview(result.ZaiResponse.Body, 200))
|
|
buf.WriteString("\nAnthropic Response:\n")
|
|
buf.WriteString(formatJSONPreview(result.AnthropicResponse.Body, 200))
|
|
buf.WriteString("\n")
|
|
}
|
|
|
|
buf.WriteString("---\n\n")
|
|
}
|
|
|
|
// Analysis and Recommendations
|
|
buf.WriteString("## ANALYSIS AND RECOMMENDATIONS\n\n")
|
|
buf.WriteString(rg.generateAnalysis())
|
|
|
|
return buf.String()
|
|
}
|
|
|
|
// generateAnalysis creates analysis based on metrics
|
|
func (rg *ReportGenerator) generateAnalysis() string {
|
|
var buf bytes.Buffer
|
|
|
|
// Token accuracy analysis
|
|
if rg.metrics.InputTokenMAE > 10 || rg.metrics.OutputTokenMAE > 10 {
|
|
buf.WriteString("### ⚠ Token Counting Accuracy Concerns\n\n")
|
|
if rg.metrics.InputTokenMAE > 10 {
|
|
buf.WriteString(fmt.Sprintf("- Input token MAE (%.2f) exceeds threshold of 10 tokens\n", rg.metrics.InputTokenMAE))
|
|
}
|
|
if rg.metrics.OutputTokenMAE > 10 {
|
|
buf.WriteString(fmt.Sprintf("- Output token MAE (%.2f) exceeds threshold of 10 tokens\n", rg.metrics.OutputTokenMAE))
|
|
}
|
|
buf.WriteString("- Recommendation: Review tokenizer configuration and consider model-specific encoding\n\n")
|
|
} else if rg.metrics.InputTokenMAE > 0 {
|
|
buf.WriteString("### ✓ Token Counting Accuracy\n\n")
|
|
buf.WriteString("Token counts are within acceptable tolerance levels.\n")
|
|
buf.WriteString(fmt.Sprintf("- Input MAE: %.2f tokens\n", rg.metrics.InputTokenMAE))
|
|
buf.WriteString(fmt.Sprintf("- Output MAE: %.2f tokens\n\n", rg.metrics.OutputTokenMAE))
|
|
}
|
|
|
|
// Percentage difference analysis
|
|
if rg.metrics.InputTokenAvgPercentDiff > 5 || rg.metrics.OutputTokenAvgPercentDiff > 5 {
|
|
buf.WriteString("### ⚠ Percentage Difference Analysis\n\n")
|
|
if rg.metrics.InputTokenAvgPercentDiff > 5 {
|
|
buf.WriteString(fmt.Sprintf("- Average input token difference (%.2f%%) exceeds 5%% threshold\n", rg.metrics.InputTokenAvgPercentDiff))
|
|
}
|
|
if rg.metrics.OutputTokenAvgPercentDiff > 5 {
|
|
buf.WriteString(fmt.Sprintf("- Average output token difference (%.2f%%) exceeds 5%% threshold\n", rg.metrics.OutputTokenAvgPercentDiff))
|
|
}
|
|
buf.WriteString("- Recommendation: Investigate systematic biases in token counting\n\n")
|
|
}
|
|
|
|
// Success rate analysis
|
|
successRate := float64(rg.metrics.SuccessfulTests) / float64(rg.metrics.TotalTests) * 100
|
|
if successRate < 100 {
|
|
buf.WriteString(fmt.Sprintf("### ⚠ Success Rate: %.1f%%\n\n", successRate))
|
|
buf.WriteString("Some tests failed. Review error logs above for details.\n\n")
|
|
}
|
|
|
|
// Structure match analysis
|
|
structureRate := float64(rg.metrics.StructureMatchCount) / float64(rg.metrics.TotalTests) * 100
|
|
if structureRate < 100 {
|
|
buf.WriteString(fmt.Sprintf("### ⚠ Structure Match Rate: %.1f%%\n\n", structureRate))
|
|
buf.WriteString("Some responses have different structures. This may indicate:\n")
|
|
buf.WriteString("- Different response formats between endpoints\n")
|
|
buf.WriteString("- Missing or extra fields in responses\n\n")
|
|
}
|
|
|
|
// Pattern analysis
|
|
buf.WriteString("### Pattern Analysis\n\n")
|
|
buf.WriteString(rg.identifyPatterns())
|
|
|
|
return buf.String()
|
|
}
|
|
|
|
// identifyPatterns identifies systematic patterns in discrepancies
|
|
func (rg *ReportGenerator) identifyPatterns() string {
|
|
var buf bytes.Buffer
|
|
|
|
inputConsistent := 0
|
|
inputZaiHigher := 0
|
|
inputZaiLower := 0
|
|
outputConsistent := 0
|
|
outputZaiHigher := 0
|
|
outputZaiLower := 0
|
|
streamingTests := 0
|
|
nonStreamingTests := 0
|
|
|
|
for _, result := range rg.results {
|
|
if result.ZaiResponse.TokenUsage != nil && result.AnthropicResponse.TokenUsage != nil {
|
|
if result.InputTokenDiff == 0 {
|
|
inputConsistent++
|
|
} else if result.InputTokenDiff > 0 {
|
|
inputZaiHigher++
|
|
} else {
|
|
inputZaiLower++
|
|
}
|
|
|
|
if result.OutputTokenDiff == 0 {
|
|
outputConsistent++
|
|
} else if result.OutputTokenDiff > 0 {
|
|
outputZaiHigher++
|
|
} else {
|
|
outputZaiLower++
|
|
}
|
|
}
|
|
|
|
if result.ZaiResponse.TokenUsage != nil {
|
|
// Check if streaming by looking at the test
|
|
for _, test := range GetTestCases() {
|
|
if test.Name == result.TestName {
|
|
if test.Stream {
|
|
streamingTests++
|
|
} else {
|
|
nonStreamingTests++
|
|
}
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
buf.WriteString("#### Input Token Patterns\n")
|
|
buf.WriteString(fmt.Sprintf("- Exact matches: %d\n", inputConsistent))
|
|
buf.WriteString(fmt.Sprintf("- Z.AI higher: %d\n", inputZaiHigher))
|
|
buf.WriteString(fmt.Sprintf("- Z.AI lower: %d\n", inputZaiLower))
|
|
|
|
if inputZaiHigher > inputZaiLower*2 {
|
|
buf.WriteString("→ Pattern: Z.AI consistently reports higher input tokens\n")
|
|
buf.WriteString(" Possible cause: Different tokenization algorithm or encoding\n")
|
|
} else if inputZaiLower > inputZaiHigher*2 {
|
|
buf.WriteString("→ Pattern: Z.AI consistently reports lower input tokens\n")
|
|
buf.WriteString(" Possible cause: Undercounting or missing tokens in calculation\n")
|
|
} else if inputConsistent == 0 {
|
|
buf.WriteString("→ Pattern: No exact matches found\n")
|
|
buf.WriteString(" Possible cause: Systematic difference in counting methodology\n")
|
|
}
|
|
|
|
buf.WriteString("\n#### Output Token Patterns\n")
|
|
buf.WriteString(fmt.Sprintf("- Exact matches: %d\n", outputConsistent))
|
|
buf.WriteString(fmt.Sprintf("- Z.AI higher: %d\n", outputZaiHigher))
|
|
buf.WriteString(fmt.Sprintf("- Z.AI lower: %d\n", outputZaiLower))
|
|
|
|
if outputZaiHigher > outputZaiLower*2 {
|
|
buf.WriteString("→ Pattern: Z.AI consistently reports higher output tokens\n")
|
|
buf.WriteString(" Possible cause: Counting control tokens or metadata\n")
|
|
} else if outputZaiLower > outputZaiHigher*2 {
|
|
buf.WriteString("→ Pattern: Z.AI consistently reports lower output tokens\n")
|
|
buf.WriteString(" Possible cause: Truncation or incomplete capture\n")
|
|
}
|
|
|
|
buf.WriteString(fmt.Sprintf("\n#### Test Type Distribution\n"))
|
|
buf.WriteString(fmt.Sprintf("- Streaming tests: %d\n", streamingTests))
|
|
buf.WriteString(fmt.Sprintf("- Non-streaming tests: %d\n", nonStreamingTests))
|
|
|
|
return buf.String()
|
|
}
|
|
|
|
// formatJSONPreview returns data pretty-printed as JSON and shortened to at
// most maxLen bytes, with "..." appended whenever something was cut off.
//
// Data that is not valid JSON is previewed verbatim and is subject to the same
// length limit. The cut never falls inside a multi-byte UTF-8 sequence, so a
// valid UTF-8 payload always yields a valid UTF-8 preview. A negative maxLen is
// treated as zero.
func formatJSONPreview(data []byte, maxLen int) string {
	preview := string(data)

	var pretty bytes.Buffer
	if err := json.Indent(&pretty, data, "", " "); err == nil {
		preview = pretty.String()
	}

	if maxLen < 0 {
		maxLen = 0
	}
	if len(preview) <= maxLen {
		return preview
	}

	// Bytes of the form 10xxxxxx are UTF-8 continuation bytes. If the byte
	// right after the cut is one, the cut would split a rune, so move it back
	// to the start of that rune.
	cut := maxLen
	for cut > 0 && preview[cut]&0xC0 == 0x80 {
		cut--
	}
	return preview[:cut] + "..."
}
|
|
|
|
// SaveToFile saves the report to a file
|
|
func (rg *ReportGenerator) SaveToFile(filename string) error {
|
|
report := rg.GenerateTextReport()
|
|
return os.WriteFile(filename, []byte(report), 0644)
|
|
}
|
|
|
|
// GenerateJSONReport creates a JSON report for programmatic consumption
|
|
func (rg *ReportGenerator) GenerateJSONReport() ([]byte, error) {
|
|
report := struct {
|
|
GeneratedAt string `json:"generated_at"`
|
|
Metrics *EvaluationMetrics `json:"metrics"`
|
|
TestResults []ComparisonResult `json:"test_results"`
|
|
Interpretation map[string]interface{} `json:"interpretation"`
|
|
}{
|
|
GeneratedAt: time.Now().Format(time.RFC3339),
|
|
Metrics: rg.metrics,
|
|
TestResults: rg.results,
|
|
Interpretation: map[string]interface{}{
|
|
"overall_accuracy": rg.calculateOverallAccuracy(),
|
|
"recommendations": rg.getRecommendations(),
|
|
"patterns": rg.identifyPatterns(),
|
|
},
|
|
}
|
|
|
|
return json.MarshalIndent(report, "", " ")
|
|
}
|
|
|
|
// calculateOverallAccuracy calculates an overall accuracy score
|
|
func (rg *ReportGenerator) calculateOverallAccuracy() map[string]float64 {
|
|
accuracy := make(map[string]float64)
|
|
|
|
if rg.metrics.TotalTests > 0 {
|
|
accuracy["success_rate"] = float64(rg.metrics.SuccessfulTests) / float64(rg.metrics.TotalTests) * 100
|
|
accuracy["structure_match_rate"] = float64(rg.metrics.StructureMatchCount) / float64(rg.metrics.TotalTests) * 100
|
|
}
|
|
|
|
// Token accuracy (inverse of MAE, scaled)
|
|
if rg.metrics.InputTokenMAE > 0 {
|
|
accuracy["input_token_accuracy"] = 100 - min(rg.metrics.InputTokenAvgPercentDiff, 100)
|
|
}
|
|
if rg.metrics.OutputTokenMAE > 0 {
|
|
accuracy["output_token_accuracy"] = 100 - min(rg.metrics.OutputTokenAvgPercentDiff, 100)
|
|
}
|
|
|
|
return accuracy
|
|
}
|
|
|
|
// getRecommendations returns actionable recommendations
|
|
func (rg *ReportGenerator) getRecommendations() []string {
|
|
var recommendations []string
|
|
|
|
if rg.metrics.InputTokenMAE > 10 {
|
|
recommendations = append(recommendations, "Input token counting has high variance - verify tokenizer model matches Anthropic's")
|
|
}
|
|
|
|
if rg.metrics.OutputTokenMAE > 10 {
|
|
recommendations = append(recommendations, "Output token counting has high variance - check SSE parsing logic")
|
|
}
|
|
|
|
successRate := float64(rg.metrics.SuccessfulTests) / float64(rg.metrics.TotalTests)
|
|
if successRate < 1.0 {
|
|
recommendations = append(recommendations, "Some requests failed - review error handling and retry logic")
|
|
}
|
|
|
|
structureRate := float64(rg.metrics.StructureMatchCount) / float64(rg.metrics.TotalTests)
|
|
if structureRate < 1.0 {
|
|
recommendations = append(recommendations, "Response structure mismatches detected - verify response forwarding")
|
|
}
|
|
|
|
if len(recommendations) == 0 {
|
|
recommendations = append(recommendations, "All metrics within acceptable ranges - no immediate action required")
|
|
}
|
|
|
|
return recommendations
|
|
}
|
|
|
|
// GenerateHTMLReport creates an HTML formatted report
|
|
func (rg *ReportGenerator) GenerateHTMLReport() (string, error) {
|
|
const htmlTemplate = `
|
|
<!DOCTYPE html>
|
|
<html>
|
|
<head>
|
|
<title>Z.AI Proxy Evaluation Report</title>
|
|
<style>
|
|
body { font-family: Arial, sans-serif; margin: 40px; background: #f5f5f5; }
|
|
.container { max-width: 1200px; margin: 0 auto; background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
|
|
h1 { color: #333; border-bottom: 3px solid #4CAF50; padding-bottom: 10px; }
|
|
h2 { color: #555; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }
|
|
.summary { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px; margin: 20px 0; }
|
|
.metric-card { background: #f9f9f9; padding: 20px; border-radius: 6px; border-left: 4px solid #4CAF50; }
|
|
.metric-label { font-size: 12px; color: #666; text-transform: uppercase; }
|
|
.metric-value { font-size: 32px; font-weight: bold; color: #333; }
|
|
.success { color: #4CAF50; }
|
|
.warning { color: #ff9800; }
|
|
.error { color: #f44336; }
|
|
table { width: 100%; border-collapse: collapse; margin: 20px 0; }
|
|
th, td { padding: 12px; text-align: left; border-bottom: 1px solid #ddd; }
|
|
th { background: #f5f5f5; font-weight: 600; }
|
|
.test-result { margin: 20px 0; padding: 15px; background: #f9f9f9; border-radius: 6px; }
|
|
.match { color: #4CAF50; font-weight: bold; }
|
|
.mismatch { color: #ff9800; font-weight: bold; }
|
|
.timestamp { color: #999; font-size: 12px; }
|
|
</style>
|
|
</head>
|
|
<body>
|
|
<div class="container">
|
|
<h1>🔍 Z.AI Proxy Evaluation Report</h1>
|
|
<p class="timestamp">Generated: {{.Timestamp}}</p>
|
|
|
|
<h2>Executive Summary</h2>
|
|
<div class="summary">
|
|
<div class="metric-card">
|
|
<div class="metric-label">Total Tests</div>
|
|
<div class="metric-value">{{.TotalTests}}</div>
|
|
</div>
|
|
<div class="metric-card">
|
|
<div class="metric-label">Success Rate</div>
|
|
<div class="metric-value {{if .HighSuccessRate}}success{{else}}warning{{end}}">{{.SuccessRate}}%</div>
|
|
</div>
|
|
<div class="metric-card">
|
|
<div class="metric-label">Input Token MAE</div>
|
|
<div class="metric-value {{if .LowInputMAE}}success{{else}}warning{{end}}">{{.InputMAE}}</div>
|
|
</div>
|
|
<div class="metric-card">
|
|
<div class="metric-label">Output Token MAE</div>
|
|
<div class="metric-value {{if .LowOutputMAE}}success{{else}}warning{{end}}">{{.OutputMAE}}</div>
|
|
</div>
|
|
</div>
|
|
|
|
<h2>Detailed Results</h2>
|
|
{{range .Results}}
|
|
<div class="test-result">
|
|
<h3>{{.TestName}}</h3>
|
|
<table>
|
|
<tr>
|
|
<th>Metric</th>
|
|
<th>Z.AI</th>
|
|
<th>Anthropic</th>
|
|
<th>Difference</th>
|
|
</tr>
|
|
{{if .ZaiResponse.TokenUsage}}
|
|
<tr>
|
|
<td>Input Tokens</td>
|
|
<td>{{.ZaiResponse.TokenUsage.InputTokens}}</td>
|
|
<td>{{.AnthropicResponse.TokenUsage.InputTokens}}</td>
|
|
<td class="{{if .InputTokenMatch}}match{{else}}mismatch{{end}}">{{.InputTokenDiff}} ({{.InputTokenPercentDiff}}%)</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Output Tokens</td>
|
|
<td>{{.ZaiResponse.TokenUsage.OutputTokens}}</td>
|
|
<td>{{.AnthropicResponse.TokenUsage.OutputTokens}}</td>
|
|
<td class="{{if .OutputTokenMatch}}match{{else}}mismatch{{end}}">{{.OutputTokenDiff}} ({{.OutputTokenPercentDiff}}%)</td>
|
|
</tr>
|
|
{{end}}
|
|
</table>
|
|
</div>
|
|
{{end}}
|
|
</div>
|
|
</body>
|
|
</html>
|
|
`
|
|
|
|
data := struct {
|
|
Timestamp string
|
|
TotalTests int
|
|
SuccessRate float64
|
|
HighSuccessRate bool
|
|
InputMAE float64
|
|
OutputMAE float64
|
|
LowInputMAE bool
|
|
LowOutputMAE bool
|
|
Results []ComparisonResult
|
|
}{
|
|
Timestamp: time.Now().Format("2006-01-02 15:04:05"),
|
|
TotalTests: rg.metrics.TotalTests,
|
|
SuccessRate: float64(rg.metrics.SuccessfulTests) / float64(rg.metrics.TotalTests) * 100,
|
|
HighSuccessRate: rg.metrics.SuccessfulTests == rg.metrics.TotalTests,
|
|
InputMAE: rg.metrics.InputTokenMAE,
|
|
OutputMAE: rg.metrics.OutputTokenMAE,
|
|
LowInputMAE: rg.metrics.InputTokenMAE < 10,
|
|
LowOutputMAE: rg.metrics.OutputTokenMAE < 10,
|
|
Results: rg.results,
|
|
}
|
|
|
|
tmpl, err := template.New("report").Parse(htmlTemplate)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
var buf bytes.Buffer
|
|
if err := tmpl.Execute(&buf, data); err != nil {
|
|
return "", err
|
|
}
|
|
|
|
return buf.String(), nil
|
|
}
|
|
|
|
// min returns a when a is strictly less than b, and b otherwise.
func min(a, b float64) float64 {
	smaller := b
	if a < b {
		smaller = a
	}
	return smaller
}
|