# zai-proxy/proxy/evaluation/run_evaluation.py

#!/usr/bin/env python3
"""Z.AI Proxy Evaluation Framework - CLI Entry Point

Compares token counts from z.ai proxy with Anthropic API responses.
"""

import argparse
import json
import os
import sys
from datetime import datetime
from pathlib import Path

# Add evaluation package to path
sys.path.insert(0, str(Path(__file__).parent))

from zai_eval.client import DualClient
from zai_eval.test_cases import get_test_cases
from zai_eval.models import EvaluationResult, EvaluationReport


def parse_args():
    """Build the CLI parser and parse the process arguments.

    The proxy URL and both API keys fall back to the ``ZAI_PROXY_URL``,
    ``ZAI_API_KEY`` and ``ANTHROPIC_API_KEY`` environment variables, which
    are read each time this function is called.

    Returns:
        argparse.Namespace exposing ``proxy_url``, ``proxy_key``,
        ``anthropic_key``, ``output_dir``, ``test_name`` and ``verbose``.
    """
    # One (flags, keyword-arguments) entry per option, in --help display order.
    option_table = (
        (("--proxy-url",), {
            "default": os.getenv("ZAI_PROXY_URL", "http://localhost:8080"),
            "help": "Z.AI proxy URL (default: from ZAI_PROXY_URL or http://localhost:8080)",
        }),
        (("--proxy-key",), {
            "default": os.getenv("ZAI_API_KEY"),
            "help": "Z.AI API key (default: from ZAI_API_KEY)",
        }),
        (("--anthropic-key",), {
            "default": os.getenv("ANTHROPIC_API_KEY"),
            "help": "Anthropic API key (default: from ANTHROPIC_API_KEY)",
        }),
        (("--output-dir",), {
            "default": "evaluation/results",
            "help": "Output directory for reports (default: evaluation/results)",
        }),
        (("--test-name",), {
            "help": "Run only a specific test case by name",
        }),
        (("--verbose", "-v"), {
            "action": "store_true",
            "help": "Enable verbose output",
        }),
    )

    cli = argparse.ArgumentParser(
        description="Evaluate z.ai proxy token counting against Anthropic API"
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()


def _print_test_result(result, proxy_resp, anthropic_resp):
    """Print the console summary for one executed test case."""
    both_ok = proxy_resp.status_code == 200 and anthropic_resp.status_code == 200

    status = "✓" if both_ok else "✗"
    print(f"  Status: {status}")
    print(f"  Proxy:      {proxy_resp.status_code} | "
          f"In: {proxy_resp.input_tokens or 'N/A':>4} | "
          f"Out: {proxy_resp.output_tokens or 'N/A':>4} | "
          f"Latency: {proxy_resp.latency_ms:.0f}ms")
    print(f"  Anthropic:  {anthropic_resp.status_code} | "
          f"In: {anthropic_resp.input_tokens or 'N/A':>4} | "
          f"Out: {anthropic_resp.output_tokens or 'N/A':>4} | "
          f"Latency: {anthropic_resp.latency_ms:.0f}ms")

    if both_ok:
        match_indicator = "✓" if result.input_match else "✗"
        print(f"  Input match:  {match_indicator} (diff: {result.input_diff}, {result.input_pct_diff:.1f}%)")
        match_indicator = "✓" if result.output_match else "✗"
        print(f"  Output match: {match_indicator} (diff: {result.output_diff}, {result.output_pct_diff:.1f}%)")
    else:
        # Report each side independently: when both requests fail, the
        # Anthropic error must not be hidden behind the proxy error.
        if proxy_resp.error:
            print(f"  Proxy error: {proxy_resp.error}")
        if anthropic_resp.error:
            print(f"  Anthropic error: {anthropic_resp.error}")
    print()


def _print_summary(report):
    """Print the aggregate metrics of *report* to the console."""
    print()
    print("=" * 70)
    print("EVALUATION SUMMARY")
    print("=" * 70)
    print(f"Total tests:       {report.total_requests}")
    print(f"Successful:        {report.successful_requests}")
    print(f"Failed:            {report.failed_requests}")
    print()
    print("Token Accuracy:")
    print(f"  Input tokens:     {report.input_token_accuracy:.1f}%")
    print(f"  Output tokens:    {report.output_token_accuracy:.1f}%")
    print(f"  Overall:          {report.overall_accuracy:.1f}%")
    print()
    print("Mean Absolute Error:")
    print(f"  Input tokens:     {report.input_mae:.2f}")
    print(f"  Output tokens:    {report.output_mae:.2f}")
    print(f"  Total tokens:     {report.total_mae:.2f}")
    print()
    print("Systematic Bias:")
    print(f"  Input bias:       {report.input_bias_mean:+.2f} (positive = proxy overcounts)")
    print(f"  Output bias:      {report.output_bias_mean:+.2f} (positive = proxy overcounts)")
    print()
    print("Latency:")
    print(f"  Avg proxy:        {report.avg_proxy_latency_ms:.0f}ms")
    print(f"  Avg Anthropic:    {report.avg_anthropic_latency_ms:.0f}ms")
    print()


def _save_reports(report, output_dir):
    """Write the timestamped JSON and text reports into *output_dir*."""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # JSON report
    json_file = output_dir / f"evaluation_report_{timestamp}.json"
    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(report.dict(), f, indent=2, default=str)
    print(f"✓ JSON report saved: {json_file}")

    # Text report. It contains non-ASCII glyphs (✓, ⚠, ✗, ≥), so the encoding
    # is pinned to UTF-8; the locale default may be unable to encode them.
    text_file = output_dir / f"evaluation_report_{timestamp}.txt"
    with open(text_file, "w", encoding="utf-8") as f:
        f.write(generate_text_report(report))
    print(f"✓ Text report saved: {text_file}")


def run_evaluation(args):
    """Run the evaluation suite, print progress, and save the reports.

    Args:
        args: Parsed CLI namespace providing ``proxy_url``, ``proxy_key``,
            ``anthropic_key``, ``output_dir``, ``test_name`` and ``verbose``.

    Returns:
        The EvaluationReport built from every executed test case.

    Exits the process with status 1 when either API key is missing or when
    ``args.test_name`` does not match any available test case.
    """
    # Validate required parameters
    if not args.proxy_key:
        print("Error: Z.AI API key required. Set ZAI_API_KEY or use --proxy-key")
        sys.exit(1)
    if not args.anthropic_key:
        print("Error: Anthropic API key required. Set ANTHROPIC_API_KEY or use --anthropic-key")
        sys.exit(1)

    print("=" * 70)
    print("Z.AI Proxy Evaluation Framework")
    print("=" * 70)
    print(f"Proxy URL:        {args.proxy_url}")
    print("Anthropic API:    https://api.anthropic.com")
    print(f"Output Directory: {args.output_dir}")
    print()

    # Create output directory
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Get test cases, optionally narrowed to the single requested name
    all_tests = get_test_cases()
    if args.test_name:
        tests = [t for t in all_tests if t.name == args.test_name]
        if not tests:
            print(f"Error: Test case '{args.test_name}' not found")
            print(f"Available tests: {', '.join(t.name for t in all_tests)}")
            sys.exit(1)
    else:
        tests = all_tests

    print(f"Running {len(tests)} test case(s)...")
    print()

    # Create dual client
    client = DualClient(args.proxy_url, args.proxy_key, args.anthropic_key)

    # Run evaluation
    results = []
    for i, test in enumerate(tests, 1):
        print(f"[{i}/{len(tests)}] {test.name}: {test.description}")
        if args.verbose:
            print(f"  Model: {test.model}")
            print(f"  Max tokens: {test.max_tokens}")
            print(f"  Stream: {test.stream}")

        # Send the same request to the proxy and to Anthropic
        proxy_resp, anthropic_resp = client.evaluate_request(
            model=test.model,
            messages=test.messages,
            max_tokens=test.max_tokens,
            stream=test.stream,
            temperature=test.temperature,
        )

        # The match flags start as False; calculate_metrics() is then called
        # to fill in the comparison fields printed below.
        result = EvaluationResult(
            request_name=test.name,
            proxy_response=proxy_resp,
            anthropic_response=anthropic_resp,
            input_match=False,
            output_match=False,
            total_match=False,
        )
        result.calculate_metrics()
        results.append(result)

        _print_test_result(result, proxy_resp, anthropic_resp)

    # Generate report; the success/failure counters start at 0 and
    # calculate_summary_metrics() is called before they are printed.
    print("Generating report...")
    report = EvaluationReport(
        total_requests=len(tests),
        successful_requests=0,
        failed_requests=0,
        results=results,
    )
    report.calculate_summary_metrics()

    _print_summary(report)
    _save_reports(report, output_dir)

    # Analysis
    print()
    print("=" * 70)
    print("ANALYSIS")
    print("=" * 70)
    print(generate_analysis(report))

    return report


def _format_pct(value):
    """Render a percentage difference as ``"12.3%"``, or ``"N/A"`` when it is None."""
    return "N/A" if value is None else f"{value:.1f}%"


def generate_text_report(report: EvaluationReport) -> str:
    """Generate a detailed text report.

    The report has a header, the aggregate metric sections, one entry per
    result in ``report.results``, and the output of ``generate_analysis``.

    Args:
        report: Evaluation report whose summary metrics are already set.

    Returns:
        The full report as a single newline-joined string.
    """
    rule = "-" * 70
    lines = [
        "Z.AI Proxy Evaluation Report",
        "=" * 70,
        f"Generated: {report.timestamp.isoformat()}",
        "",
        "EXECUTIVE SUMMARY",
        rule,
        f"Total Tests:       {report.total_requests}",
        f"Successful:        {report.successful_requests}",
        f"Failed:            {report.failed_requests}",
        "",
        "TOKEN ACCURACY METRICS",
        rule,
        f"Input Token Accuracy:  {report.input_token_accuracy:.1f}%",
        f"Output Token Accuracy: {report.output_token_accuracy:.1f}%",
        f"Overall Accuracy:      {report.overall_accuracy:.1f}%",
        "",
        "MEAN ABSOLUTE ERROR",
        rule,
        f"Input MAE:       {report.input_mae:.2f} tokens",
        f"Output MAE:      {report.output_mae:.2f} tokens",
        f"Total MAE:       {report.total_mae:.2f} tokens",
        "",
        "SYSTEMATIC BIAS",
        rule,
        f"Input Bias:      {report.input_bias_mean:+.2f} (positive = proxy overcounts)",
        f"Output Bias:     {report.output_bias_mean:+.2f} (positive = proxy overcounts)",
        "",
        "LATENCY",
        rule,
        f"Avg Proxy Latency:     {report.avg_proxy_latency_ms:.0f}ms",
        f"Avg Anthropic Latency: {report.avg_anthropic_latency_ms:.0f}ms",
        "",
        "DETAILED RESULTS",
        rule,
    ]

    for result in report.results:
        proxy = result.proxy_response
        anthropic = result.anthropic_response
        # NOTE(review): the percentage diffs may be unset (None) for requests
        # that failed - the model defaults are not visible here. They are
        # formatted defensively so one failed request cannot abort the report.
        input_pct = _format_pct(result.input_pct_diff)
        output_pct = _format_pct(result.output_pct_diff)
        lines.extend([
            "",
            f"Test: {result.request_name}",
            f"  Proxy:      Status={proxy.status_code} | "
            f"In={proxy.input_tokens or 'N/A':>4} | "
            f"Out={proxy.output_tokens or 'N/A':>4}",
            f"  Anthropic:  Status={anthropic.status_code} | "
            f"In={anthropic.input_tokens or 'N/A':>4} | "
            f"Out={anthropic.output_tokens or 'N/A':>4}",
            f"  Match:      Input={result.input_match} | Output={result.output_match}",
            f"  Diff:       Input={result.input_diff} ({input_pct}) | "
            f"Output={result.output_diff} ({output_pct})",
        ])

    lines.extend(["", "", "ANALYSIS", rule])
    lines.append(generate_analysis(report))

    return "\n".join(lines)


def generate_analysis(report: EvaluationReport) -> str:
    """Generate analysis and recommendations.

    Produces one line per finding, in this order: input/output accuracy
    rating, high-MAE warnings (MAE > 10), bias warnings (|mean bias| > 5),
    and "no exact match" warnings.

    The "no exact match" warnings only consider results where both the proxy
    and Anthropic returned HTTP 200. With no such results (empty run, or
    every request failed) nothing was compared, so no systematic difference
    is reported.

    Args:
        report: Evaluation report whose summary metrics are already set.

    Returns:
        The findings joined with newlines.
    """
    lines = []

    # Token accuracy assessment
    for label, accuracy in (
        ("Input", report.input_token_accuracy),
        ("Output", report.output_token_accuracy),
    ):
        if accuracy >= 95:
            lines.append(f"✓ {label} token counting is excellent (≥95% accuracy)")
        elif accuracy >= 80:
            lines.append(f"⚠ {label} token counting needs attention (80-95% accuracy)")
        else:
            lines.append(f"✗ {label} token counting has significant issues (<80% accuracy)")

    # MAE assessment
    if report.input_mae > 10:
        lines.append(f"⚠ High input token MAE ({report.input_mae:.2f}) - review tokenizer configuration")
    if report.output_mae > 10:
        lines.append(f"⚠ High output token MAE ({report.output_mae:.2f}) - check SSE parsing logic")

    # Bias analysis (positive mean bias = proxy overcounts)
    for kind, bias in (
        ("input", report.input_bias_mean),
        ("output", report.output_bias_mean),
    ):
        if abs(bias) > 5:
            direction = "overcounts" if bias > 0 else "undercounts"
            lines.append(f"⚠ Proxy consistently {direction} {kind} tokens by {abs(bias):.2f} on average")

    # Pattern analysis: only pairs where both sides succeeded can match at all.
    comparable = [
        r for r in report.results
        if r.proxy_response.status_code == 200
        and r.anthropic_response.status_code == 200
    ]
    if comparable:
        if not any(r.input_match for r in comparable):
            lines.append("⚠ No exact input token matches found - systematic difference detected")
        if not any(r.output_match for r in comparable):
            lines.append("⚠ No exact output token matches found - systematic difference detected")

    return "\n".join(lines)


if __name__ == "__main__":
    # Parse the command line and run the whole suite in one step.
    evaluation_report = run_evaluation(parse_args())

    # Exit with status 1 when the report counts at least one failed request.
    if evaluation_report.failed_requests > 0:
        sys.exit(1)