zai-proxy/proxy/evaluation/run_evaluation.py
jedarden dee82a76a3 chore: update module paths and add evaluation package
- proxy/go.mod: github.com/ardenone/zai-proxy → git.ardenone.com/jedarden/zai-proxy
- dashboard/go.mod: github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard → git.ardenone.com/jedarden/zai-proxy/dashboard
- Update all Go import paths in proxy/ and dashboard/ to match new module paths
- Add proxy/evaluation/ package (was missing from initial commit)
- Add docs/plan/plan.md with architecture, security model, telemetry design, and migration checklist

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-16 16:03:50 -04:00

325 lines
12 KiB
Python
Executable file

#!/usr/bin/env python3
"""Z.AI Proxy Evaluation Framework - CLI Entry Point
Compares token counts from z.ai proxy with Anthropic API responses.
"""
import argparse
import json
import os
import sys
from datetime import datetime
from pathlib import Path
# Add evaluation package to path
sys.path.insert(0, str(Path(__file__).parent))
from zai_eval.client import DualClient
from zai_eval.test_cases import get_test_cases
from zai_eval.models import EvaluationResult, EvaluationReport
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options for an evaluation run.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is what a
            plain ``parse_args()`` call has always done.

    Returns:
        A namespace with ``proxy_url``, ``proxy_key``, ``anthropic_key``,
        ``output_dir``, ``test_name`` and ``verbose``.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate z.ai proxy token counting against Anthropic API"
    )

    # Connection settings: each falls back to an environment variable that is
    # read when this function is called, so explicit flags always win.
    parser.add_argument(
        "--proxy-url",
        default=os.getenv("ZAI_PROXY_URL", "http://localhost:8080"),
        help="Z.AI proxy URL (default: from ZAI_PROXY_URL or http://localhost:8080)",
    )
    parser.add_argument(
        "--proxy-key",
        default=os.getenv("ZAI_API_KEY"),
        help="Z.AI API key (default: from ZAI_API_KEY)",
    )
    parser.add_argument(
        "--anthropic-key",
        default=os.getenv("ANTHROPIC_API_KEY"),
        help="Anthropic API key (default: from ANTHROPIC_API_KEY)",
    )

    # Run settings.
    parser.add_argument(
        "--output-dir",
        default="evaluation/results",
        help="Output directory for reports (default: evaluation/results)",
    )
    parser.add_argument(
        "--test-name",
        help="Run only a specific test case by name",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose output",
    )

    return parser.parse_args(argv)
def _token_display(count):
    """Return ``count`` for display, substituting 'N/A' only when it is None."""
    # A reported count of 0 is a real measurement and must not be hidden.
    return "N/A" if count is None else count


def _select_tests(all_tests, test_name):
    """Return all tests, or only those whose name equals ``test_name``.

    Exits with status 1 (listing the available names on stderr) when
    ``test_name`` is given but matches no test case.
    """
    if not test_name:
        return all_tests
    selected = [t for t in all_tests if t.name == test_name]
    if not selected:
        print(f"Error: Test case '{test_name}' not found", file=sys.stderr)
        print(
            f"Available tests: {', '.join(t.name for t in all_tests)}",
            file=sys.stderr,
        )
        sys.exit(1)
    return selected


def _print_test_result(result):
    """Print the console summary for one evaluated test case."""
    proxy_resp = result.proxy_response
    anthropic_resp = result.anthropic_response
    both_ok = proxy_resp.status_code == 200 and anthropic_resp.status_code == 200

    print(f" Status: {'✓' if both_ok else '✗'}")
    print(f" Proxy: {proxy_resp.status_code} | "
          f"In: {_token_display(proxy_resp.input_tokens):>4} | "
          f"Out: {_token_display(proxy_resp.output_tokens):>4} | "
          f"Latency: {proxy_resp.latency_ms:.0f}ms")
    print(f" Anthropic: {anthropic_resp.status_code} | "
          f"In: {_token_display(anthropic_resp.input_tokens):>4} | "
          f"Out: {_token_display(anthropic_resp.output_tokens):>4} | "
          f"Latency: {anthropic_resp.latency_ms:.0f}ms")

    if both_ok:
        # Diff metrics are only formatted when both calls succeeded.
        input_mark = "✓" if result.input_match else "✗"
        print(f" Input match: {input_mark} "
              f"(diff: {result.input_diff}, {result.input_pct_diff:.1f}%)")
        output_mark = "✓" if result.output_match else "✗"
        print(f" Output match: {output_mark} "
              f"(diff: {result.output_diff}, {result.output_pct_diff:.1f}%)")
    else:
        # Report every failing side; both backends can fail on the same test.
        if proxy_resp.error:
            print(f" Proxy error: {proxy_resp.error}")
        if anthropic_resp.error:
            print(f" Anthropic error: {anthropic_resp.error}")
    print()


def _print_summary(report):
    """Print the aggregate metrics of ``report`` to stdout."""
    print()
    print("=" * 70)
    print("EVALUATION SUMMARY")
    print("=" * 70)
    print(f"Total tests: {report.total_requests}")
    print(f"Successful: {report.successful_requests}")
    print(f"Failed: {report.failed_requests}")
    print()
    print("Token Accuracy:")
    print(f" Input tokens: {report.input_token_accuracy:.1f}%")
    print(f" Output tokens: {report.output_token_accuracy:.1f}%")
    print(f" Overall: {report.overall_accuracy:.1f}%")
    print()
    print("Mean Absolute Error:")
    print(f" Input tokens: {report.input_mae:.2f}")
    print(f" Output tokens: {report.output_mae:.2f}")
    print(f" Total tokens: {report.total_mae:.2f}")
    print()
    print("Systematic Bias:")
    print(f" Input bias: {report.input_bias_mean:+.2f} (positive = proxy overcounts)")
    print(f" Output bias: {report.output_bias_mean:+.2f} (positive = proxy overcounts)")
    print()
    print("Latency:")
    print(f" Avg proxy: {report.avg_proxy_latency_ms:.0f}ms")
    print(f" Avg Anthropic: {report.avg_anthropic_latency_ms:.0f}ms")
    print()


def _save_reports(report, output_dir):
    """Write the JSON and text reports into ``output_dir`` under one timestamp."""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    json_file = output_dir / f"evaluation_report_{timestamp}.json"
    with open(json_file, "w", encoding="utf-8") as f:
        # NOTE(review): .dict() looks like the pydantic v1 serializer - confirm
        # against zai_eval.models. default=str stringifies anything json
        # cannot encode natively.
        json.dump(report.dict(), f, indent=2, default=str)
    print(f"✓ JSON report saved: {json_file}")

    text_file = output_dir / f"evaluation_report_{timestamp}.txt"
    # The text report contains non-ASCII symbols, so the encoding is pinned
    # instead of depending on the platform's locale default.
    with open(text_file, "w", encoding="utf-8") as f:
        f.write(generate_text_report(report))
    print(f"✓ Text report saved: {text_file}")


def run_evaluation(args):
    """Run the evaluation suite and write the reports.

    Sends every selected test case through ``DualClient.evaluate_request``,
    prints per-test and summary results to stdout, and saves a JSON and a
    text report into ``args.output_dir`` (created if missing).

    Args:
        args: Namespace as produced by ``parse_args``; reads ``proxy_url``,
            ``proxy_key``, ``anthropic_key``, ``output_dir``, ``test_name``
            and ``verbose``.

    Returns:
        The populated ``EvaluationReport``.

    Raises:
        SystemExit: With status 1 when an API key is missing or
            ``args.test_name`` matches no test case.
    """
    # Validate required parameters before doing any work.
    if not args.proxy_key:
        print("Error: Z.AI API key required. Set ZAI_API_KEY or use --proxy-key",
              file=sys.stderr)
        sys.exit(1)
    if not args.anthropic_key:
        print("Error: Anthropic API key required. "
              "Set ANTHROPIC_API_KEY or use --anthropic-key",
              file=sys.stderr)
        sys.exit(1)

    print("=" * 70)
    print("Z.AI Proxy Evaluation Framework")
    print("=" * 70)
    print(f"Proxy URL: {args.proxy_url}")
    print("Anthropic API: https://api.anthropic.com")
    print(f"Output Directory: {args.output_dir}")
    print()

    # Created up front so an unwritable location fails before any request.
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    tests = _select_tests(get_test_cases(), args.test_name)
    print(f"Running {len(tests)} test case(s)...")
    print()

    client = DualClient(args.proxy_url, args.proxy_key, args.anthropic_key)

    results = []
    for i, test in enumerate(tests, 1):
        print(f"[{i}/{len(tests)}] {test.name}: {test.description}")
        if args.verbose:
            print(f" Model: {test.model}")
            print(f" Max tokens: {test.max_tokens}")
            print(f" Stream: {test.stream}")

        # evaluate_request returns the (proxy, anthropic) response pair.
        proxy_resp, anthropic_resp = client.evaluate_request(
            model=test.model,
            messages=test.messages,
            max_tokens=test.max_tokens,
            stream=test.stream,
            temperature=test.temperature,
        )

        # The match flags start False; calculate_metrics() fills in the
        # comparison fields printed below.
        result = EvaluationResult(
            request_name=test.name,
            proxy_response=proxy_resp,
            anthropic_response=anthropic_resp,
            input_match=False,
            output_match=False,
            total_match=False,
        )
        result.calculate_metrics()
        results.append(result)

        _print_test_result(result)

    print("Generating report...")
    # The success/failure counters start at 0; calculate_summary_metrics()
    # is called to populate the aggregate fields read below.
    report = EvaluationReport(
        total_requests=len(tests),
        successful_requests=0,
        failed_requests=0,
        results=results,
    )
    report.calculate_summary_metrics()

    _print_summary(report)
    _save_reports(report, output_dir)

    print()
    print("=" * 70)
    print("ANALYSIS")
    print("=" * 70)
    print(generate_analysis(report))

    return report
def _report_count(count):
    """Return a token count for the text report, 'N/A' only when it is None."""
    # A count of 0 is a real value and is shown as such.
    return "N/A" if count is None else count


def _report_pct(value):
    """Format a percentage difference with one decimal, or 'N/A' when unset."""
    # NOTE(review): the console output only formats these fields when both
    # calls returned 200, which suggests they may be unset for failed
    # requests - confirm against zai_eval.models.
    return "N/A" if value is None else f"{value:.1f}%"


def generate_text_report(report: EvaluationReport) -> str:
    """Generate a detailed plain-text report.

    The report consists of the summary metrics, one entry per result in
    ``report.results``, and the output of ``generate_analysis``.

    Args:
        report: The evaluation report to render.

    Returns:
        The full report as one newline-joined string (no trailing newline).
    """
    rule = "-" * 70
    lines = [
        "Z.AI Proxy Evaluation Report",
        "=" * 70,
        f"Generated: {report.timestamp.isoformat()}",
        "",
        "EXECUTIVE SUMMARY",
        rule,
        f"Total Tests: {report.total_requests}",
        f"Successful: {report.successful_requests}",
        f"Failed: {report.failed_requests}",
        "",
        "TOKEN ACCURACY METRICS",
        rule,
        f"Input Token Accuracy: {report.input_token_accuracy:.1f}%",
        f"Output Token Accuracy: {report.output_token_accuracy:.1f}%",
        f"Overall Accuracy: {report.overall_accuracy:.1f}%",
        "",
        "MEAN ABSOLUTE ERROR",
        rule,
        f"Input MAE: {report.input_mae:.2f} tokens",
        f"Output MAE: {report.output_mae:.2f} tokens",
        f"Total MAE: {report.total_mae:.2f} tokens",
        "",
        "SYSTEMATIC BIAS",
        rule,
        f"Input Bias: {report.input_bias_mean:+.2f} (positive = proxy overcounts)",
        f"Output Bias: {report.output_bias_mean:+.2f} (positive = proxy overcounts)",
        "",
        "LATENCY",
        rule,
        f"Avg Proxy Latency: {report.avg_proxy_latency_ms:.0f}ms",
        f"Avg Anthropic Latency: {report.avg_anthropic_latency_ms:.0f}ms",
        "",
        "DETAILED RESULTS",
        rule,
    ]

    for result in report.results:
        proxy = result.proxy_response
        anthropic = result.anthropic_response
        lines.extend([
            "",
            f"Test: {result.request_name}",
            f" Proxy: Status={proxy.status_code} | "
            f"In={_report_count(proxy.input_tokens):>4} | "
            f"Out={_report_count(proxy.output_tokens):>4}",
            f" Anthropic: Status={anthropic.status_code} | "
            f"In={_report_count(anthropic.input_tokens):>4} | "
            f"Out={_report_count(anthropic.output_tokens):>4}",
            f" Match: Input={result.input_match} | Output={result.output_match}",
            f" Diff: Input={result.input_diff} ({_report_pct(result.input_pct_diff)}) | "
            f"Output={result.output_diff} ({_report_pct(result.output_pct_diff)})",
        ])

    lines.extend(["", "", "ANALYSIS", rule])
    lines.append(generate_analysis(report))
    return "\n".join(lines)
def generate_analysis(report: EvaluationReport) -> str:
    """Generate analysis and recommendations from the report metrics.

    Emits, in order: one accuracy verdict each for input and output tokens,
    warnings for a mean absolute error above 10, warnings for an absolute
    mean bias above 5, and warnings when no compared request matched exactly.

    Args:
        report: The evaluation report; reads the accuracy, MAE and bias
            fields and iterates ``report.results``.

    Returns:
        The findings as one newline-joined string.
    """
    lines = []

    # Token accuracy assessment: every report gets one verdict per direction.
    for label, accuracy in (
        ("Input", report.input_token_accuracy),
        ("Output", report.output_token_accuracy),
    ):
        if accuracy >= 95:
            lines.append(f"✓ {label} token counting is excellent (≥95% accuracy)")
        elif accuracy >= 80:
            lines.append(f"⚠ {label} token counting needs attention (80-95% accuracy)")
        else:
            lines.append(f"✗ {label} token counting has significant issues (<80% accuracy)")

    # MAE assessment.
    if report.input_mae > 10:
        lines.append(
            f"⚠ High input token MAE ({report.input_mae:.2f}) - review tokenizer configuration"
        )
    if report.output_mae > 10:
        lines.append(
            f"⚠ High output token MAE ({report.output_mae:.2f}) - check SSE parsing logic"
        )

    # Bias analysis: the sign convention is positive = proxy overcounts.
    for label, bias in (
        ("input", report.input_bias_mean),
        ("output", report.output_bias_mean),
    ):
        if abs(bias) > 5:
            direction = "overcounts" if bias > 0 else "undercounts"
            lines.append(
                f"⚠ Proxy consistently {direction} {label} tokens by {abs(bias):.2f} on average"
            )

    # Pattern analysis: only request pairs where both backends answered 200
    # were actually compared. Failed or absent requests are not evidence of a
    # systematic counting difference, so they are left out here.
    compared = [
        r for r in report.results
        if r.proxy_response.status_code == 200
        and r.anthropic_response.status_code == 200
    ]
    if compared:
        if not any(r.input_match for r in compared):
            lines.append(
                "⚠ No exact input token matches found - systematic difference detected"
            )
        if not any(r.output_match for r in compared):
            lines.append(
                "⚠ No exact output token matches found - systematic difference detected"
            )

    return "\n".join(lines)
def main():
    """Run the CLI and return the process exit status.

    Returns 1 when the report records at least one failed request,
    otherwise 0.
    """
    cli_args = parse_args()
    outcome = run_evaluation(cli_args)
    # A non-zero status signals that at least one test failed.
    return 1 if outcome.failed_requests > 0 else 0


if __name__ == "__main__":
    sys.exit(main())