"""CLI interface for evaluation framework.""" import os import sys from pathlib import Path from typing import Optional import typer from rich.console import Console from dotenv import load_dotenv from zai_eval.client import DualClient from zai_eval.test_cases import get_test_cases, get_test_case_by_name, TEST_CASES from zai_eval.models import EvaluationResult from zai_eval.metrics import calculate_metrics from zai_eval.report import print_report, save_report_json, save_report_markdown app = typer.Typer(help="Z.AI Proxy Evaluation Framework") console = Console() def get_api_keys() -> tuple[str, str, str]: """Get API keys from environment variables. Returns: Tuple of (proxy_url, proxy_api_key, anthropic_api_key) """ load_dotenv() proxy_url = os.getenv("ZAI_PROXY_URL", "http://localhost:8080") proxy_api_key = os.getenv("ZAI_API_KEY") anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") if not proxy_api_key: console.print("[red]Error: ZAI_API_KEY environment variable not set[/red]") console.print("Set it with: export ZAI_API_KEY=your-key") raise typer.Exit(1) if not anthropic_api_key: console.print("[red]Error: ANTHROPIC_API_KEY environment variable not set[/red]") console.print("Set it with: export ANTHROPIC_API_KEY=your-key") raise typer.Exit(1) return proxy_url, proxy_api_key, anthropic_api_key @app.command() def list_tests(): """List all available test cases.""" console.print("\n[bold cyan]Available Test Cases[/bold cyan]\n") for i, test in enumerate(TEST_CASES, 1): console.print(f"{i}. [yellow]{test.name}[/yellow]") console.print(f" {test.description}") console.print(f" Model: {test.model} | Max tokens: {test.max_tokens}") console.print() @app.command() def run( test_name: Optional[str] = typer.Argument(None, help="Name of specific test to run"), output_dir: Optional[Path] = typer.Option(None, "--output", "-o", help="Output directory for reports"), json_output: bool = typer.Option(False, "--json", help="Save JSON report"), markdown_output: bool = typer.Option(False, "--markdown", help="Save Markdown report"), verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"), ): """Run evaluation tests. If TEST_NAME is provided, run only that test. Otherwise run all tests. """ proxy_url, proxy_api_key, anthropic_api_key = get_api_keys() # Get test cases to run if test_name: test_case = get_test_case_by_name(test_name) if not test_case: console.print(f"[red]Error: Test '{test_name}' not found[/red]") console.print("Use 'zai-eval list-tests' to see available tests") raise typer.Exit(1) tests_to_run = [test_case] console.print(f"[cyan]Running test: {test_name}[/cyan]\n") else: tests_to_run = get_test_cases() console.print(f"[cyan]Running {len(tests_to_run)} tests[/cyan]\n") # Initialize client client = DualClient(proxy_url, proxy_api_key, anthropic_api_key) # Run tests results = [] with console.status("[bold green]Running evaluation...") as status: for i, test in enumerate(tests_to_run, 1): status.update(f"[bold green]Running test {i}/{len(tests_to_run)}: {test.name}[/bold green]") if verbose: console.print(f"\n[yellow]Test: {test.name}[/yellow]") console.print(f" Description: {test.description}") proxy_response, anthropic_response = client.evaluate_request( model=test.model, messages=test.messages, max_tokens=test.max_tokens, stream=test.stream, temperature=test.temperature, ) result = EvaluationResult( request_name=test.name, proxy_response=proxy_response, anthropic_response=anthropic_response, ) result.calculate_metrics() results.append(result) if verbose: if proxy_response.error: console.print(f" [red]Proxy error: {proxy_response.error}[/red]") if anthropic_response.error: console.print(f" [red]Anthropic error: {anthropic_response.error}[/red]") console.print(f" Proxy: {proxy_response.input_tokens}/{proxy_response.output_tokens}") console.print(f" Anthropic: {anthropic_response.input_tokens}/{anthropic_response.output_tokens}") console.print(f" Diff: {result.total_diff:+d} ({result.total_pct_diff:.1f}%)") # Calculate metrics report = calculate_metrics(results) # Print report print_report(console, report) # Save reports if requested if output_dir: output_dir.mkdir(parents=True, exist_ok=True) if json_output: json_path = output_dir / "evaluation_report.json" save_report_json(report, str(json_path)) console.print(f"\n[green]JSON report saved to: {json_path}[/green]") if markdown_output: md_path = output_dir / "evaluation_report.md" save_report_markdown(report, str(md_path)) console.print(f"[green]Markdown report saved to: {md_path}[/green]") # Exit with error code if any tests failed failed_count = sum(1 for r in results if r.proxy_response.error or r.anthropic_response.error) if failed_count > 0: raise typer.Exit(1) @app.command() def quick( prompt: str = typer.Argument(..., help="Prompt text to test"), model: str = typer.Option("claude-3-sonnet-20240229", "--model", "-m", help="Model to use"), max_tokens: int = typer.Option(100, "--max-tokens", help="Max tokens"), ): """Run a quick single-test evaluation with custom prompt.""" proxy_url, proxy_api_key, anthropic_api_key = get_api_keys() console.print(f"[cyan]Quick test with model: {model}[/cyan]\n") console.print(f"Prompt: {prompt[:100]}{'...' if len(prompt) > 100 else ''}\n") client = DualClient(proxy_url, proxy_api_key, anthropic_api_key) messages = [{"role": "user", "content": prompt}] proxy_response, anthropic_response = client.evaluate_request( model=model, messages=messages, max_tokens=max_tokens, ) console.print("\n[bold]Results:[/bold]") console.print(f"Proxy: In={proxy_response.input_tokens or 0}, Out={proxy_response.output_tokens or 0}") console.print(f"Anthropic: In={anthropic_response.input_tokens or 0}, Out={anthropic_response.output_tokens or 0}") if proxy_response.error: console.print(f"[red]Proxy error: {proxy_response.error}[/red]") if anthropic_response.error: console.print(f"[red]Anthropic error: {anthropic_response.error}[/red]") @app.command() def validate(): """Validate that both endpoints are accessible.""" proxy_url, proxy_api_key, anthropic_api_key = get_api_keys() console.print("[cyan]Validating endpoints...[/cyan]\n") client = DualClient(proxy_url, proxy_api_key, anthropic_api_key) # Test proxy console.print("Testing Z.AI proxy...") proxy_resp, _ = client.evaluate_request( model="claude-3-sonnet-20240229", messages=[{"role": "user", "content": "test"}], max_tokens=10, ) if proxy_resp.error: console.print(f" [red]✗ Failed: {proxy_resp.error}[/red]") else: console.print(f" [green]✓ OK[/green] (status: {proxy_resp.status_code})") # Test Anthropic console.print("Testing Anthropic API...") _, anthropic_resp = client.evaluate_request( model="claude-3-sonnet-20240229", messages=[{"role": "user", "content": "test"}], max_tokens=10, ) if anthropic_resp.error: console.print(f" [red]✗ Failed: {anthropic_resp.error}[/red]") else: console.print(f" [green]✓ OK[/green] (status: {anthropic_resp.status_code})") console.print() if __name__ == "__main__": app()