zai-proxy/proxy/evaluation/zai_eval/report.py
jedarden dee82a76a3 chore: update module paths and add evaluation package
- proxy/go.mod: github.com/ardenone/zai-proxy → git.ardenone.com/jedarden/zai-proxy
- dashboard/go.mod: github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard → git.ardenone.com/jedarden/zai-proxy/dashboard
- Update all Go import paths in proxy/ and dashboard/ to match new module paths
- Add proxy/evaluation/ package (was missing from initial commit)
- Add docs/plan/plan.md with architecture, security model, telemetry design, and migration checklist

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-16 16:03:50 -04:00

312 lines
12 KiB
Python

"""Report generation for evaluation framework."""
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn
from typing import List
from zai_eval.models import EvaluationResult, EvaluationReport
from zai_eval.metrics import calculate_advanced_metrics, detect_systematic_bias, calculate_accuracy_by_token_range
def print_report(console: Console, report: EvaluationReport) -> None:
    """Render the complete evaluation report on a console.

    A title banner is printed first, followed by every report section in a
    fixed order.

    Args:
        console: Rich console the report is written to.
        report: Evaluation report to render.
    """
    console.print("\n")
    console.print(Panel.fit("Z.AI PROXY EVALUATION REPORT", style="bold cyan"))

    # Sections appear on screen in exactly this order.
    section_printers = (
        _print_summary,           # request totals
        _print_accuracy,          # accuracy metrics
        _print_error_metrics,     # error metrics
        _print_latency,           # latency comparison
        _print_bias_analysis,     # systematic bias
        _print_advanced_metrics,  # advanced metrics
        _print_detailed_results,  # per-request results table
    )
    for render_section in section_printers:
        render_section(console, report)
def _print_summary(console: Console, report: EvaluationReport) -> None:
    """Write the request totals (all, successful, failed) to the console.

    Args:
        console: Rich console to print to.
        report: Evaluation report supplying the request counters.
    """
    summary_lines = (
        "\n[bold cyan]Summary[/bold cyan]",
        f"Total Requests: {report.total_requests}",
        f"Successful: [green]{report.successful_requests}[/green]",
        f"Failed: [red]{report.failed_requests}[/red]",
    )
    for text in summary_lines:
        console.print(text)
def _print_accuracy(console: Console, report: EvaluationReport) -> None:
    """Print the token-count accuracy table.

    Args:
        console: Rich console to print to.
        report: Evaluation report supplying the accuracy figures.
    """
    accuracy_rows = (
        ("Input Token Accuracy", report.input_token_accuracy),
        ("Output Token Accuracy", report.output_token_accuracy),
        ("Overall Accuracy", report.overall_accuracy),
    )

    accuracy_table = Table(
        title="Token Count Accuracy",
        show_header=True,
        header_style="bold magenta",
    )
    accuracy_table.add_column("Metric", style="cyan")
    accuracy_table.add_column("Accuracy (%)")
    for label, value in accuracy_rows:
        accuracy_table.add_row(label, f"{value:.2f}%")

    console.print("\n")
    console.print(accuracy_table)
def _print_error_metrics(console: Console, report: EvaluationReport) -> None:
    """Print the MAE / MPE table for input, output and total tokens.

    Args:
        console: Rich console to print to.
        report: Evaluation report supplying the ``*_mae`` and ``*_mpe`` values.
    """
    error_rows = (
        ("Input Tokens", report.input_mae, report.input_mpe),
        ("Output Tokens", report.output_mae, report.output_mpe),
        ("Total Tokens", report.total_mae, report.total_mpe),
    )

    error_table = Table(
        title="Mean Absolute Error (MAE)",
        show_header=True,
        header_style="bold magenta",
    )
    for heading, column_options in (
        ("Metric", {"style": "cyan"}),
        ("MAE (tokens)", {}),
        ("MPE (%)", {}),
    ):
        error_table.add_column(heading, **column_options)

    for label, mae, mpe in error_rows:
        error_table.add_row(label, f"{mae:.2f}", f"{mpe:.2f}%")

    console.print("\n")
    console.print(error_table)
def _print_latency(console: Console, report: EvaluationReport) -> None:
    """Print average latencies for both endpoints plus the proxy overhead.

    The overhead row shows the proxy-minus-Anthropic difference in
    milliseconds and as a percentage of the Anthropic latency.

    Args:
        console: Rich console to print to.
        report: Evaluation report supplying the average latencies.
    """
    proxy_ms = report.avg_proxy_latency_ms
    anthropic_ms = report.avg_anthropic_latency_ms

    delta_ms = proxy_ms - anthropic_ms
    # Avoid dividing by zero when no Anthropic latency was recorded.
    if anthropic_ms > 0:
        delta_pct = delta_ms / anthropic_ms * 100
    else:
        delta_pct = 0

    # Yellow highlights a slower proxy; green means equal or faster.
    if delta_ms > 0:
        overhead_style = "yellow"
    else:
        overhead_style = "green"

    latency_table = Table(
        title="Latency Comparison",
        show_header=True,
        header_style="bold magenta",
    )
    latency_table.add_column("Endpoint", style="cyan")
    latency_table.add_column("Avg Latency (ms)")
    latency_table.add_row("Z.AI Proxy", f"{proxy_ms:.2f}")
    latency_table.add_row("Anthropic API", f"{anthropic_ms:.2f}")
    latency_table.add_row(
        "Overhead",
        f"{delta_ms:.2f} ({delta_pct:+.1f}%)",
        style=overhead_style,
    )

    console.print("\n")
    console.print(latency_table)
def _print_bias_analysis(console: Console, report: EvaluationReport) -> None:
    """Print the systematic-bias table and, when applicable, a pattern warning.

    Nothing is printed when ``detect_systematic_bias`` returns a falsy value.

    Args:
        console: Rich console to print to.
        report: Evaluation report whose ``results`` are analysed.
    """
    bias = detect_systematic_bias(report.results)
    if not bias:
        return

    def direction(mean_bias) -> str:
        # Label a mean bias by its sign.
        if mean_bias > 0:
            return "Overcounts"
        if mean_bias < 0:
            return "Undercounts"
        return "Accurate"

    input_mean = bias["input_bias_mean"]
    output_mean = bias["output_bias_mean"]

    bias_table = Table(
        title="Systematic Bias Analysis",
        show_header=True,
        header_style="bold magenta",
    )
    bias_table.add_column("Metric", style="cyan")
    bias_table.add_column("Value")
    bias_table.add_row(
        "Input Bias", f"{input_mean:+.2f} tokens ({direction(input_mean)})"
    )
    bias_table.add_row(
        "Output Bias", f"{output_mean:+.2f} tokens ({direction(output_mean)})"
    )

    console.print("\n")
    console.print(bias_table)

    # Bias patterns: at most one warning is shown, checked in priority order.
    result_count = len(report.results)
    if bias.get("mixed_bias", 0) > result_count / 2:
        warning = "\n[yellow]⚠ Mixed bias detected - token counting may be inconsistent[/yellow]"
    elif bias.get("both_high", 0) > result_count * 0.7:
        warning = "\n[red]⚠ Consistent overcounting detected[/red]"
    elif bias.get("both_low", 0) > result_count * 0.7:
        warning = "\n[red]⚠ Consistent undercounting detected[/red]"
    else:
        return
    console.print(warning)
def _print_advanced_metrics(console: Console, report: EvaluationReport) -> None:
    """Print the advanced statistics table for input and output diffs.

    Nothing is printed when ``calculate_advanced_metrics`` returns a falsy
    value.

    Args:
        console: Rich console to print to.
        report: Evaluation report whose ``results`` are analysed.
    """
    stats = calculate_advanced_metrics(report.results)
    if not stats:
        return

    # (row label, key suffix) - each row reads ``input_diff_<suffix>`` and
    # ``output_diff_<suffix>`` from the metrics mapping.
    stat_rows = (
        ("Std Dev", "std"),
        ("Median", "median"),
        ("95th Percentile", "95th"),
        ("Max Error", "max"),
    )

    stats_table = Table(
        title="Advanced Statistics",
        show_header=True,
        header_style="bold magenta",
    )
    stats_table.add_column("Metric", style="cyan")
    stats_table.add_column("Input")
    stats_table.add_column("Output")
    for label, suffix in stat_rows:
        input_value = stats[f"input_diff_{suffix}"]
        output_value = stats[f"output_diff_{suffix}"]
        stats_table.add_row(label, f"{input_value:.2f}", f"{output_value:.2f}")

    console.print("\n")
    console.print(stats_table)
def _print_detailed_results(console: Console, report: EvaluationReport) -> None:
    """Print one table row per evaluated request.

    Each row shows the proxy and Anthropic token counts as ``input/output``
    (missing counts are shown as 0), the signed total difference and a status
    marker.

    Args:
        console: Rich console to print to.
        report: Evaluation report whose ``results`` are listed.
    """
    results_table = Table(
        title="Detailed Results",
        show_header=True,
        header_style="bold magenta",
    )
    results_table.add_column("Test", style="cyan")
    for heading in ("Proxy In/Out", "Anthropic In/Out", "Diff", "Status"):
        results_table.add_column(heading)

    for result in report.results:
        proxy = result.proxy_response
        anthropic = result.anthropic_response

        proxy_cell = f"{proxy.input_tokens or 0}/{proxy.output_tokens or 0}"
        anthropic_cell = f"{anthropic.input_tokens or 0}/{anthropic.output_tokens or 0}"
        diff_cell = f"{result.total_diff:+d}"

        # Status priority: error on either side, exact match, close, mismatch.
        # NOTE(review): the "< 5" check assumes total_pct_diff is a
        # non-negative percentage - confirm against the model definition.
        if proxy.error or anthropic.error:
            marker = "[red]ERROR[/red]"
        elif result.total_diff == 0:
            marker = "[green]✓[/green]"
        elif result.total_pct_diff < 5:
            marker = "[yellow]~[/yellow]"
        else:
            marker = "[red]✗[/red]"

        results_table.add_row(
            result.request_name, proxy_cell, anthropic_cell, diff_cell, marker
        )

    console.print("\n")
    console.print(results_table)
def save_report_json(report: EvaluationReport, filepath: str) -> None:
    """Save report as JSON file.

    The document contains the summary figures from ``report``, the outputs of
    ``calculate_advanced_metrics``, ``detect_systematic_bias`` and
    ``calculate_accuracy_by_token_range`` for ``report.results``, one entry
    per result, and the report timestamp in ISO format.

    Args:
        report: Evaluation report to save
        filepath: Path to output JSON file

    Raises:
        OSError: If ``filepath`` cannot be opened for writing.
        TypeError: If a value in the document is not JSON serializable.
    """
    import json

    def _endpoint_to_dict(response) -> dict:
        # Both endpoint responses are serialized with the same field set.
        return {
            "status_code": response.status_code,
            "input_tokens": response.input_tokens,
            "output_tokens": response.output_tokens,
            "error": response.error,
            "latency_ms": response.latency_ms,
        }

    def convert_result(result: EvaluationResult) -> dict:
        # Flatten one evaluation result into JSON-friendly primitives.
        return {
            "request_name": result.request_name,
            "proxy": _endpoint_to_dict(result.proxy_response),
            "anthropic": _endpoint_to_dict(result.anthropic_response),
            "metrics": {
                "input_match": result.input_match,
                "output_match": result.output_match,
                "input_diff": result.input_diff,
                "output_diff": result.output_diff,
                "input_pct_diff": result.input_pct_diff,
                "output_pct_diff": result.output_pct_diff,
            },
            "timestamp": result.timestamp.isoformat(),
        }

    data = {
        "summary": {
            "total_requests": report.total_requests,
            "successful_requests": report.successful_requests,
            "failed_requests": report.failed_requests,
            "input_token_accuracy": report.input_token_accuracy,
            "output_token_accuracy": report.output_token_accuracy,
            "overall_accuracy": report.overall_accuracy,
            "input_mae": report.input_mae,
            "output_mae": report.output_mae,
            "total_mae": report.total_mae,
            "input_mpe": report.input_mpe,
            "output_mpe": report.output_mpe,
            "total_mpe": report.total_mpe,
            "avg_proxy_latency_ms": report.avg_proxy_latency_ms,
            "avg_anthropic_latency_ms": report.avg_anthropic_latency_ms,
        },
        "advanced_metrics": calculate_advanced_metrics(report.results),
        "bias_analysis": detect_systematic_bias(report.results),
        "accuracy_by_range": calculate_accuracy_by_token_range(report.results),
        "results": [convert_result(r) for r in report.results],
        "timestamp": report.timestamp.isoformat(),
    }

    # Pin the encoding so the file does not depend on the platform locale.
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
def save_report_markdown(report: EvaluationReport, filepath: str) -> None:
    """Save report as Markdown file.

    The document has summary, accuracy, error, latency and bias sections
    followed by a table with one row per entry in ``report.results``.

    Args:
        report: Evaluation report to save
        filepath: Path to output Markdown file

    Raises:
        OSError: If ``filepath`` cannot be opened for writing.
    """
    # NOTE(review): the console report takes its bias figures from
    # detect_systematic_bias(report.results); confirm that EvaluationReport
    # really exposes input_bias_mean / output_bias_mean as read below.
    lines = [
        "# Z.AI Proxy Evaluation Report",
        "",
        f"**Generated:** {report.timestamp.isoformat()}",
        "",
        "## Summary",
        "",
        f"- **Total Requests:** {report.total_requests}",
        f"- **Successful:** {report.successful_requests}",
        f"- **Failed:** {report.failed_requests}",
        "",
        "## Accuracy Metrics",
        "",
        "| Metric | Accuracy |",
        "|--------|----------|",
        f"| Input Token Accuracy | {report.input_token_accuracy:.2f}% |",
        f"| Output Token Accuracy | {report.output_token_accuracy:.2f}% |",
        f"| Overall Accuracy | {report.overall_accuracy:.2f}% |",
        "",
        "## Error Metrics",
        "",
        "| Metric | MAE (tokens) | MPE (%) |",
        "|--------|---------------|---------|",
        f"| Input Tokens | {report.input_mae:.2f} | {report.input_mpe:.2f}% |",
        f"| Output Tokens | {report.output_mae:.2f} | {report.output_mpe:.2f}% |",
        f"| Total Tokens | {report.total_mae:.2f} | {report.total_mpe:.2f}% |",
        "",
        "## Latency Comparison",
        "",
        "| Endpoint | Avg Latency (ms) |",
        "|----------|------------------|",
        f"| Z.AI Proxy | {report.avg_proxy_latency_ms:.2f} |",
        f"| Anthropic API | {report.avg_anthropic_latency_ms:.2f} |",
        "",
        "## Systematic Bias",
        "",
        f"- **Input Bias:** {report.input_bias_mean:+.2f} tokens",
        f"- **Output Bias:** {report.output_bias_mean:+.2f} tokens",
        "",
        "## Detailed Results",
        "",
        "| Test | Proxy (In/Out) | Anthropic (In/Out) | Diff | Status |",
        "|------|-----------------|-------------------|------|--------|",
    ]

    for r in report.results:
        proxy = r.proxy_response
        anthropic = r.anthropic_response
        # Missing token counts are rendered as 0.
        proxy_tokens = f"{proxy.input_tokens or 0}/{proxy.output_tokens or 0}"
        anthropic_tokens = f"{anthropic.input_tokens or 0}/{anthropic.output_tokens or 0}"

        if proxy.error or anthropic.error:
            status = "❌ ERROR"
        elif r.total_diff == 0:
            status = "✅ MATCH"
        elif r.total_pct_diff < 5:
            status = "⚠️ CLOSE"
        else:
            status = "❌ MISMATCH"

        # A literal "|" in the test name would split the table cell.
        test_name = str(r.request_name).replace("|", "\\|")
        lines.append(
            f"| {test_name} | {proxy_tokens} | {anthropic_tokens} | {r.total_diff:+d} | {status} |"
        )

    # The status column contains emoji, so the encoding must not be left to
    # the platform locale (cp1252 and similar cannot encode them).
    with open(filepath, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))