- proxy/go.mod: github.com/ardenone/zai-proxy → git.ardenone.com/jedarden/zai-proxy
- dashboard/go.mod: github.com/ardenone/ardenone-cluster/containers/zai-proxy-dashboard → git.ardenone.com/jedarden/zai-proxy/dashboard
- Update all Go import paths in proxy/ and dashboard/ to match new module paths
- Add proxy/evaluation/ package (was missing from initial commit)
- Add docs/plan/plan.md with architecture, security model, telemetry design, and migration checklist

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
211 lines
6.3 KiB
Python
211 lines
6.3 KiB
Python
"""HTTP client for making requests to proxy and Anthropic APIs."""
|
|
|
|
import time
|
|
import json
|
|
from typing import Optional
|
|
import httpx
|
|
|
|
|
|
class ProxyClient:
    """Synchronous client for the z.ai proxy's ``/v1/messages`` endpoint.

    Each call to :meth:`make_request` posts one messages request to the proxy
    and reports the token usage found in the ``X-Token-*`` response headers,
    falling back to the ``usage`` object of the JSON body.

    The instance owns an ``httpx.Client``. Call :meth:`close`, or use the
    instance as a context manager, to release it when done.
    """

    def __init__(self, base_url: str, api_key: str):
        """Store connection settings and create the underlying HTTP client.

        Args:
            base_url: Root URL of the proxy. Trailing slashes are stripped so
                the request path can be appended directly.
            api_key: Value sent in the ``x-api-key`` request header.
        """
        self.base_url = base_url.rstrip("/")
        self.api_key = api_key
        self.client = httpx.Client(timeout=300.0)

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self.client.close()

    def __enter__(self) -> "ProxyClient":
        """Return the client itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the HTTP client on leaving the ``with`` block."""
        self.close()

    @staticmethod
    def _to_int(value) -> Optional[int]:
        """Coerce a header string or JSON number to ``int``.

        Returns ``None`` when the value is missing or not numeric. A real
        count of ``0`` is preserved rather than being treated as missing.
        """
        if value is None:
            return None
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return None

    @staticmethod
    def _usage_from_body(data) -> tuple[Optional[int], Optional[int]]:
        """Pull ``(input_tokens, output_tokens)`` out of a decoded JSON body.

        Any shape other than ``{"usage": {...}}`` yields ``(None, None)``
        instead of raising.
        """
        if not isinstance(data, dict):
            return None, None
        usage = data.get("usage")
        if not isinstance(usage, dict):
            return None, None
        return (
            ProxyClient._to_int(usage.get("input_tokens")),
            ProxyClient._to_int(usage.get("output_tokens")),
        )

    def make_request(
        self,
        model: str,
        messages: list[dict],
        max_tokens: int = 100,
        stream: bool = False,
        temperature: Optional[float] = None,
    ) -> "ProxyResponse":
        """Send one messages request through the proxy and extract token usage.

        Args:
            model: Sent as the ``model`` field of the payload.
            messages: Sent as the ``messages`` field of the payload.
            max_tokens: Sent as the ``max_tokens`` field of the payload.
            stream: Sent as the ``stream`` field of the payload.
                NOTE(review): the response is still read with a plain
                ``post``; a streamed body presumably will not decode as JSON,
                in which case body-based usage stays ``None`` — confirm.
            temperature: Added to the payload only when not ``None``.

        Returns:
            A ``ProxyResponse``. On success it carries the HTTP status code,
            the token counts (``None`` where unavailable), the raw
            ``X-Token-Usage`` header and the latency in milliseconds. When
            ``httpx.HTTPError`` is raised it carries ``status_code=0``, the
            error text and the latency.
        """
        from zai_eval.models import ProxyResponse

        # perf_counter is monotonic, so system clock adjustments cannot skew
        # (or make negative) the measured latency.
        start_time = time.perf_counter()

        headers = {
            "Content-Type": "application/json",
            "x-api-key": self.api_key,
            "anthropic-version": "2023-06-01",
        }

        payload = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "stream": stream,
        }
        if temperature is not None:
            payload["temperature"] = temperature

        try:
            response = self.client.post(
                f"{self.base_url}/v1/messages",
                headers=headers,
                json=payload,
            )
        except httpx.HTTPError as e:
            return ProxyResponse(
                status_code=0,
                error=str(e),
                latency_ms=(time.perf_counter() - start_time) * 1000,
            )

        latency_ms = (time.perf_counter() - start_time) * 1000

        # Token counts reported in response headers take precedence.
        response_headers = response.headers
        input_tokens = self._to_int(response_headers.get("X-Token-Input"))
        output_tokens = self._to_int(response_headers.get("X-Token-Output"))
        total_tokens = self._to_int(response_headers.get("X-Token-Total"))

        # Fill whichever count the headers did not provide from the body.
        if input_tokens is None or output_tokens is None:
            try:
                body = response.json()
            except ValueError:
                # json.JSONDecodeError is a ValueError subclass; a non-JSON
                # body simply means no usage is available from it.
                body = None
            body_input, body_output = self._usage_from_body(body)
            if input_tokens is None:
                input_tokens = body_input
            if output_tokens is None:
                output_tokens = body_output

        return ProxyResponse(
            status_code=response.status_code,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            total_tokens=total_tokens,
            usage_header=response_headers.get("X-Token-Usage"),
            latency_ms=latency_ms,
        )
|
|
|
|
|
|
class AnthropicClient:
    """Synchronous client for the Anthropic ``/v1/messages`` endpoint.

    Each call to :meth:`make_request` posts one messages request to
    ``https://api.anthropic.com`` and reports the token usage found in the
    ``usage`` object of a 200 response.

    The instance owns an ``httpx.Client``. Call :meth:`close`, or use the
    instance as a context manager, to release it when done.
    """

    def __init__(self, api_key: str):
        """Store the API key and create the underlying HTTP client.

        Args:
            api_key: Value sent in the ``x-api-key`` request header.
        """
        self.api_key = api_key
        self.client = httpx.Client(
            base_url="https://api.anthropic.com",
            timeout=300.0,
        )

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self.client.close()

    def __enter__(self) -> "AnthropicClient":
        """Return the client itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the HTTP client on leaving the ``with`` block."""
        self.close()

    @staticmethod
    def _usage_from_body(data) -> tuple:
        """Pull ``(input_tokens, output_tokens)`` out of a decoded JSON body.

        Values are returned exactly as they appear in the body. Any shape
        other than ``{"usage": {...}}`` yields ``(None, None)`` instead of
        raising.
        """
        if not isinstance(data, dict):
            return None, None
        usage = data.get("usage")
        if not isinstance(usage, dict):
            return None, None
        return usage.get("input_tokens"), usage.get("output_tokens")

    def make_request(
        self,
        model: str,
        messages: list[dict],
        max_tokens: int = 100,
        stream: bool = False,
        temperature: Optional[float] = None,
    ) -> "AnthropicResponse":
        """Send one messages request to Anthropic and extract token usage.

        Args:
            model: Sent as the ``model`` field of the payload.
            messages: Sent as the ``messages`` field of the payload.
            max_tokens: Sent as the ``max_tokens`` field of the payload.
            stream: Sent as the ``stream`` field of the payload.
            temperature: Added to the payload only when not ``None``.

        Returns:
            An ``AnthropicResponse``. On success it carries the HTTP status
            code, the input/output token counts (``None`` unless a 200
            response supplied them), their sum and the latency in
            milliseconds. When ``httpx.HTTPError`` is raised it carries
            ``status_code=0``, the error text and the latency.
        """
        from zai_eval.models import AnthropicResponse

        # perf_counter is monotonic, so system clock adjustments cannot skew
        # (or make negative) the measured latency.
        start_time = time.perf_counter()

        headers = {
            "Content-Type": "application/json",
            "x-api-key": self.api_key,
            "anthropic-version": "2023-06-01",
        }

        payload = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "stream": stream,
        }
        if temperature is not None:
            payload["temperature"] = temperature

        try:
            response = self.client.post(
                "/v1/messages",
                headers=headers,
                json=payload,
            )
        except httpx.HTTPError as e:
            return AnthropicResponse(
                status_code=0,
                error=str(e),
                latency_ms=(time.perf_counter() - start_time) * 1000,
            )

        latency_ms = (time.perf_counter() - start_time) * 1000

        input_tokens = None
        output_tokens = None

        # Usage is only read from successful responses.
        if response.status_code == 200:
            try:
                body = response.json()
            except ValueError:
                # json.JSONDecodeError is a ValueError subclass; a non-JSON
                # body simply means no usage is available.
                body = None
            input_tokens, output_tokens = self._usage_from_body(body)

        return AnthropicResponse(
            status_code=response.status_code,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            # NOTE: a missing count contributes 0, so the total is 0 (not
            # None) when no usage was available.
            total_tokens=(input_tokens or 0) + (output_tokens or 0),
            latency_ms=latency_ms,
        )
|
|
|
|
|
|
class DualClient:
    """Send the same request to the proxy and to Anthropic concurrently."""

    def __init__(self, proxy_url: str, proxy_api_key: str, anthropic_api_key: str):
        """Create one client per endpoint.

        Args:
            proxy_url: Base URL handed to ``ProxyClient``.
            proxy_api_key: API key handed to ``ProxyClient``.
            anthropic_api_key: API key handed to ``AnthropicClient``.
        """
        self.proxy = ProxyClient(proxy_url, proxy_api_key)
        self.anthropic = AnthropicClient(anthropic_api_key)

    def evaluate_request(
        self,
        model: str,
        messages: list[dict],
        max_tokens: int = 100,
        stream: bool = False,
        temperature: Optional[float] = None,
    ) -> tuple:
        """Issue one request to each endpoint on its own worker thread.

        All arguments are forwarded positionally, in order, to each
        endpoint's ``make_request``.

        Returns:
            ``(proxy_response, anthropic_response)``.
        """
        from concurrent.futures import ThreadPoolExecutor

        call_args = (model, messages, max_tokens, stream, temperature)

        with ThreadPoolExecutor(max_workers=2) as pool:
            # Submit proxy first, then Anthropic; collect in the same order.
            pending = [
                pool.submit(endpoint.make_request, *call_args)
                for endpoint in (self.proxy, self.anthropic)
            ]
            proxy_response, anthropic_response = [job.result() for job in pending]

        return proxy_response, anthropic_response
|