- proxy/go.mod: github.com/ardenone/zai-proxy → git.ardenone.com/jedarden/zai-proxy - dashboard/go.mod: github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard → git.ardenone.com/jedarden/zai-proxy/dashboard - Update all Go import paths in proxy/ and dashboard/ to match new module paths - Add proxy/evaluation/ package (was missing from initial commit) - Add docs/plan/plan.md with architecture, security model, telemetry design, and migration checklist Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
222 lines
7.9 KiB
Python
222 lines
7.9 KiB
Python
"""CLI interface for evaluation framework."""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import typer
|
|
from rich.console import Console
|
|
from dotenv import load_dotenv
|
|
|
|
from zai_eval.client import DualClient
|
|
from zai_eval.test_cases import get_test_cases, get_test_case_by_name, TEST_CASES
|
|
from zai_eval.models import EvaluationResult
|
|
from zai_eval.metrics import calculate_metrics
|
|
from zai_eval.report import print_report, save_report_json, save_report_markdown
|
|
|
|
# Typer application object; the @app.command() functions below register on it.
app = typer.Typer(help="Z.AI Proxy Evaluation Framework")

# Single Rich console shared by every command for styled terminal output.
console = Console()
|
|
|
|
|
|
def get_api_keys() -> tuple[str, str, str]:
    """Read the proxy URL and both API keys from the environment.

    ``load_dotenv()`` is called first, then the variables are read with
    ``os.getenv``. ``ZAI_PROXY_URL`` falls back to ``http://localhost:8080``;
    ``ZAI_API_KEY`` and ``ANTHROPIC_API_KEY`` are mandatory.

    Returns:
        Tuple of (proxy_url, proxy_api_key, anthropic_api_key)

    Raises:
        typer.Exit: With code 1 when a mandatory key is unset or empty.
    """
    load_dotenv()

    url = os.getenv("ZAI_PROXY_URL", "http://localhost:8080")
    required = {
        name: os.getenv(name)
        for name in ("ZAI_API_KEY", "ANTHROPIC_API_KEY")
    }

    # Keys are checked in the order listed above and only the first missing
    # one is reported before exiting.
    for name, value in required.items():
        if value:
            continue
        console.print(f"[red]Error: {name} environment variable not set[/red]")
        console.print(f"Set it with: export {name}=your-key")
        raise typer.Exit(1)

    return url, required["ZAI_API_KEY"], required["ANTHROPIC_API_KEY"]
|
|
|
|
|
|
@app.command()
def list_tests():
    """List all available test cases."""
    # Heading, then one numbered entry per registered test case (1-based).
    console.print("\n[bold cyan]Available Test Cases[/bold cyan]\n")

    for position, case in enumerate(TEST_CASES, start=1):
        entry_lines = (
            f"{position}. [yellow]{case.name}[/yellow]",
            f" {case.description}",
            f" Model: {case.model} | Max tokens: {case.max_tokens}",
        )
        for line in entry_lines:
            console.print(line)
        # Blank separator line between entries.
        console.print()
|
|
|
|
|
|
@app.command()
def run(
    test_name: Optional[str] = typer.Argument(None, help="Name of specific test to run"),
    output_dir: Optional[Path] = typer.Option(None, "--output", "-o", help="Output directory for reports"),
    json_output: bool = typer.Option(False, "--json", help="Save JSON report"),
    markdown_output: bool = typer.Option(False, "--markdown", help="Save Markdown report"),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"),
):
    """Run evaluation tests.

    If TEST_NAME is provided, run only that test. Otherwise run all tests.

    Reports requested with --json / --markdown are written to --output, or to
    the current directory when --output is not given. Exits with status 1 if
    the named test does not exist or if any request returned an error.
    """
    proxy_url, proxy_api_key, anthropic_api_key = get_api_keys()

    # Get test cases to run
    if test_name:
        test_case = get_test_case_by_name(test_name)
        if not test_case:
            console.print(f"[red]Error: Test '{test_name}' not found[/red]")
            console.print("Use 'zai-eval list-tests' to see available tests")
            raise typer.Exit(1)
        tests_to_run = [test_case]
        console.print(f"[cyan]Running test: {test_name}[/cyan]\n")
    else:
        tests_to_run = get_test_cases()
        console.print(f"[cyan]Running {len(tests_to_run)} tests[/cyan]\n")

    # Initialize client
    client = DualClient(proxy_url, proxy_api_key, anthropic_api_key)

    # Run tests
    results = []
    total = len(tests_to_run)

    with console.status("[bold green]Running evaluation...") as status:
        for i, test in enumerate(tests_to_run, 1):
            status.update(f"[bold green]Running test {i}/{total}: {test.name}[/bold green]")

            if verbose:
                console.print(f"\n[yellow]Test: {test.name}[/yellow]")
                console.print(f" Description: {test.description}")

            # One call yields the responses from both endpoints.
            proxy_response, anthropic_response = client.evaluate_request(
                model=test.model,
                messages=test.messages,
                max_tokens=test.max_tokens,
                stream=test.stream,
                temperature=test.temperature,
            )

            result = EvaluationResult(
                request_name=test.name,
                proxy_response=proxy_response,
                anthropic_response=anthropic_response,
            )
            result.calculate_metrics()
            results.append(result)

            if verbose:
                if proxy_response.error:
                    console.print(f" [red]Proxy error: {proxy_response.error}[/red]")
                if anthropic_response.error:
                    console.print(f" [red]Anthropic error: {anthropic_response.error}[/red]")
                console.print(f" Proxy: {proxy_response.input_tokens}/{proxy_response.output_tokens}")
                console.print(f" Anthropic: {anthropic_response.input_tokens}/{anthropic_response.output_tokens}")
                # NOTE(review): the diff metrics are presumably unset (None)
                # when a request failed; the numeric format specs below would
                # then raise, so the line is skipped in that case. Confirm
                # against EvaluationResult.calculate_metrics.
                if result.total_diff is not None and result.total_pct_diff is not None:
                    console.print(f" Diff: {result.total_diff:+d} ({result.total_pct_diff:.1f}%)")

    # Calculate metrics
    report = calculate_metrics(results)

    # Print report
    print_report(console, report)

    # Save reports if requested. Without this fallback, --json / --markdown
    # were silently ignored whenever --output was omitted.
    if output_dir is None and (json_output or markdown_output):
        output_dir = Path(".")

    if output_dir is not None:
        output_dir.mkdir(parents=True, exist_ok=True)

        if json_output:
            json_path = output_dir / "evaluation_report.json"
            save_report_json(report, str(json_path))
            console.print(f"\n[green]JSON report saved to: {json_path}[/green]")

        if markdown_output:
            md_path = output_dir / "evaluation_report.md"
            save_report_markdown(report, str(md_path))
            console.print(f"[green]Markdown report saved to: {md_path}[/green]")

    # Exit with error code if any tests failed
    if any(r.proxy_response.error or r.anthropic_response.error for r in results):
        raise typer.Exit(1)
|
|
|
|
|
|
@app.command()
def quick(
    prompt: str = typer.Argument(..., help="Prompt text to test"),
    model: str = typer.Option("claude-3-sonnet-20240229", "--model", "-m", help="Model to use"),
    max_tokens: int = typer.Option(100, "--max-tokens", help="Max tokens"),
):
    """Run a quick single-test evaluation with custom prompt.

    Exits with status 1 if either endpoint reports an error.
    """
    proxy_url, proxy_api_key, anthropic_api_key = get_api_keys()

    console.print(f"[cyan]Quick test with model: {model}[/cyan]\n")
    # Echo at most the first 100 characters of the prompt, with an ellipsis
    # when it was truncated.
    console.print(f"Prompt: {prompt[:100]}{'...' if len(prompt) > 100 else ''}\n")

    client = DualClient(proxy_url, proxy_api_key, anthropic_api_key)

    messages = [{"role": "user", "content": prompt}]

    proxy_response, anthropic_response = client.evaluate_request(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
    )

    # Missing token counts are shown as 0.
    console.print("\n[bold]Results:[/bold]")
    console.print(f"Proxy: In={proxy_response.input_tokens or 0}, Out={proxy_response.output_tokens or 0}")
    console.print(f"Anthropic: In={anthropic_response.input_tokens or 0}, Out={anthropic_response.output_tokens or 0}")

    if proxy_response.error:
        console.print(f"[red]Proxy error: {proxy_response.error}[/red]")
    if anthropic_response.error:
        console.print(f"[red]Anthropic error: {anthropic_response.error}[/red]")

    # Signal failure to the calling shell, matching the `run` command, rather
    # than exiting 0 after printing an error.
    if proxy_response.error or anthropic_response.error:
        raise typer.Exit(1)
|
|
|
|
|
|
@app.command()
def validate():
    """Validate that both endpoints are accessible.

    Exits with status 1 if either endpoint reports an error.
    """
    proxy_url, proxy_api_key, anthropic_api_key = get_api_keys()

    console.print("[cyan]Validating endpoints...[/cyan]\n")

    client = DualClient(proxy_url, proxy_api_key, anthropic_api_key)

    # evaluate_request returns the proxy and Anthropic responses together, so
    # a single minimal probe covers both endpoints. Issuing it once per
    # endpoint and discarding half of each result doubled the requests sent.
    proxy_resp, anthropic_resp = client.evaluate_request(
        model="claude-3-sonnet-20240229",
        messages=[{"role": "user", "content": "test"}],
        max_tokens=10,
    )

    checks = (
        ("Z.AI proxy", proxy_resp),
        ("Anthropic API", anthropic_resp),
    )
    for label, resp in checks:
        console.print(f"Testing {label}...")
        if resp.error:
            console.print(f" [red]✗ Failed: {resp.error}[/red]")
        else:
            console.print(f" [green]✓ OK[/green] (status: {resp.status_code})")

    console.print()

    # A validation command must report failure through its exit status so it
    # can be used in scripts and CI, matching the `run` command.
    if proxy_resp.error or anthropic_resp.error:
        raise typer.Exit(1)
|
|
|
|
|
|
if __name__ == "__main__":
    # Executed as a script: hand control to the Typer application.
    app()
|