zai-proxy/proxy/scripts/canary_integration_tests.sh

#!/bin/bash
# Canary Integration Tests for zai-proxy
# Tests production and canary deployments to verify token counting behavior
#
# Usage:
#   ./scripts/canary_integration_tests.sh [--production-url URL] [--canary-url URL] [--api-key KEY]
#
# Environment Variables:
#   PRODUCTION_URL - Production proxy URL (default: http://zai-proxy.devpod.svc.cluster.local:8080)
#   CANARY_URL - Canary proxy URL (default: http://zai-proxy-canary.devpod.svc.cluster.local:8080)
#   ZAI_API_KEY - API key for authentication (required)
#
# Exit Codes:
#   0 - All tests passed
#   1 - Test failures
#   2 - Configuration errors

# Strict mode: abort on unhandled errors, on unset variables, and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# ANSI colour escape sequences, rendered later via `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Running tallies, updated as the tests execute.
TESTS_TOTAL=0 TESTS_PASSED=0 TESTS_FAILED=0 TESTS_SKIPPED=0

# Directory that receives the result artifacts; the caller may override it
# through the RESULTS_DIR environment variable.
: "${RESULTS_DIR:=/tmp/zai-proxy-canary-tests}"
mkdir -p "${RESULTS_DIR}"

# One UTC timestamp per run, shared by both artifact file names.
TIMESTAMP="$(date -u '+%Y%m%d_%H%M%S')"
RESULTS_FILE="${RESULTS_DIR}/canary_test_results_${TIMESTAMP}.txt"
SUMMARY_FILE="${RESULTS_DIR}/canary_test_summary_${TIMESTAMP}.json"

# Endpoint defaults; the environment takes precedence when it provides a
# non-empty value.
: "${PRODUCTION_URL:=http://zai-proxy.devpod.svc.cluster.local:8080}"
: "${CANARY_URL:=http://zai-proxy-canary.devpod.svc.cluster.local:8080}"

# Parse arguments
#
# parse_args ARG...
#   Applies command-line overrides to PRODUCTION_URL, CANARY_URL and
#   ZAI_API_KEY (all globals).
#   Exits 0 after printing usage for -h/--help.
#   Exits 2 (configuration error) for an unknown option or for a value-taking
#   option that is the last argument. Without the explicit check, reading the
#   missing "$2" under `set -u` ends the script with an "unbound variable"
#   message and status 1 instead of the documented status 2.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --production-url|--canary-url|--api-key)
                if [[ $# -lt 2 ]]; then
                    echo "Error: option $1 requires a value" >&2
                    exit 2
                fi
                case "$1" in
                    --production-url) PRODUCTION_URL="$2" ;;
                    --canary-url)     CANARY_URL="$2" ;;
                    --api-key)        ZAI_API_KEY="$2" ;;
                esac
                shift 2
                ;;
            -h|--help)
                echo "Usage: $0 [--production-url URL] [--canary-url URL] [--api-key KEY]"
                echo ""
                echo "Environment Variables:"
                echo "  PRODUCTION_URL - Production proxy URL"
                echo "  CANARY_URL - Canary proxy URL"
                echo "  ZAI_API_KEY - API key for authentication"
                exit 0
                ;;
            *)
                echo "Unknown option: $1"
                exit 2
                ;;
        esac
    done
}

parse_args "$@"

# The API key is mandatory: without one (from the environment or --api-key)
# report the problem and stop with the configuration-error status.
[[ -n "${ZAI_API_KEY:-}" ]] || {
    echo -e "${RED}Error: ZAI_API_KEY is required${NC}"
    echo "Set via environment variable or --api-key argument"
    exit 2
}

# Helper functions
# log_header TITLE
#   Prints a section banner to stdout (coloured with BLUE) and appends the
#   same banner, without colour codes, to RESULTS_FILE.
log_header() {
    local banner="=== $1 ==="
    echo -e "${BLUE}${banner}${NC}"
    echo "${banner}" >> "$RESULTS_FILE"
}

# log_test NAME
#   Announces a test: prints "[TEST] NAME" to stdout with a YELLOW tag and
#   appends the uncoloured line to RESULTS_FILE.
log_test() {
    local label="$1"
    echo -e "${YELLOW}[TEST]${NC} ${label}"
    echo "[TEST] ${label}" >> "$RESULTS_FILE"
}

# log_pass MESSAGE
#   Prints "[PASS] MESSAGE" to stdout (GREEN tag), appends the uncoloured line
#   to RESULTS_FILE and increments the global TESTS_PASSED.
#
#   The counter uses a plain assignment rather than `((TESTS_PASSED++))`: the
#   post-increment evaluates to the old value, so the first call (old value 0)
#   would return status 1 and terminate the script under `set -e`.
#   Only the colour variables are expanded with %b; MESSAGE is printed
#   verbatim so backslashes in it (e.g. from a JSON response) are not
#   interpreted as escape sequences.
log_pass() {
    printf '%b[PASS]%b %s\n' "$GREEN" "$NC" "$1"
    printf '[PASS] %s\n' "$1" >> "$RESULTS_FILE"
    TESTS_PASSED=$((TESTS_PASSED + 1))
}

# log_fail MESSAGE
#   Prints "[FAIL] MESSAGE" to stdout (RED tag), appends the uncoloured line to
#   RESULTS_FILE and increments the global TESTS_FAILED.
#
#   The counter uses a plain assignment rather than `((TESTS_FAILED++))`: the
#   post-increment evaluates to the old value, so the first call (old value 0)
#   would return status 1 and terminate the script under `set -e`.
#   Only the colour variables are expanded with %b; MESSAGE is printed
#   verbatim so backslashes in it (e.g. from a JSON response) are not
#   interpreted as escape sequences.
log_fail() {
    printf '%b[FAIL]%b %s\n' "$RED" "$NC" "$1"
    printf '[FAIL] %s\n' "$1" >> "$RESULTS_FILE"
    TESTS_FAILED=$((TESTS_FAILED + 1))
}

# log_skip MESSAGE
#   Prints "[SKIP] MESSAGE" to stdout (YELLOW tag), appends the uncoloured line
#   to RESULTS_FILE and increments the global TESTS_SKIPPED.
#
#   The counter uses a plain assignment rather than `((TESTS_SKIPPED++))`: the
#   post-increment evaluates to the old value, so the first call (old value 0)
#   would return status 1 and terminate the script under `set -e`.
#   Only the colour variables are expanded with %b; MESSAGE is printed
#   verbatim.
log_skip() {
    printf '%b[SKIP]%b %s\n' "$YELLOW" "$NC" "$1"
    printf '[SKIP] %s\n' "$1" >> "$RESULTS_FILE"
    TESTS_SKIPPED=$((TESTS_SKIPPED + 1))
}

# log_info MESSAGE
#   Writes MESSAGE unchanged to stdout and appends it to RESULTS_FILE.
#   Does not touch any test counter.
log_info() {
    local message="$1"
    echo "${message}"
    echo "${message}" >> "$RESULTS_FILE"
}

# Record test start
# A plain assignment is used instead of `((TESTS_TOTAL++))`: with TESTS_TOTAL
# still 0 the post-increment evaluates to 0, which yields exit status 1 and
# terminates the whole script under `set -e` before any test has run.
TESTS_TOTAL=$((TESTS_TOTAL + 1))

# Initialize results file (truncates any existing file of the same name)
{
    echo "Z.AI Proxy Canary Integration Tests"
    echo "Started: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
    echo "Production URL: $PRODUCTION_URL"
    echo "Canary URL: $CANARY_URL"
    echo ""
} > "$RESULTS_FILE"

# Echo the run configuration to the console; the log_* helpers also append
# these lines to the results file.
log_header "Z.AI Proxy Canary Integration Tests"
log_info "Production URL: $PRODUCTION_URL"
log_info "Canary URL: $CANARY_URL"
log_info "Results: $RESULTS_FILE"
echo ""

# Test 1: Health Check
# Production must answer on /health; a non-2xx/3xx status or a connection
# error counts as a failure (curl --fail).
log_test "Health Check - Production"
if ! curl --silent --fail "$PRODUCTION_URL/health" &> /dev/null; then
    log_fail "Production health endpoint is not responding"
else
    log_pass "Production health endpoint is responding"
fi
TESTS_TOTAL=$((TESTS_TOTAL + 1))

# The canary is optional: when it does not answer, the check is recorded as
# skipped and CANARY_AVAILABLE tells later sections to leave it out.
log_test "Health Check - Canary"
if ! curl --silent --fail "$CANARY_URL/health" &> /dev/null; then
    log_skip "Canary deployment not available (expected for initial testing)"
    CANARY_AVAILABLE=false
else
    log_pass "Canary health endpoint is responding"
    CANARY_AVAILABLE=true
fi
echo ""

# Test 2: Metrics Endpoint
#
# Both flags start as "false" so that later sections (the canary comparison
# and the JSON summary) can read them under `set -u` even when a metrics
# endpoint is unreachable and the branches below never assign them.
PRODUCTION_TOKEN_COUNTING=false
CANARY_TOKEN_COUNTING=false

# Each endpoint is fetched once and the body is searched through a
# here-string. Piping curl straight into `grep -q` is unreliable under
# `pipefail`: grep exits at the first match, curl can then fail writing to the
# closed pipe, and the pipeline reports failure even though the metric exists.
log_test "Metrics Endpoint - Production"
if METRICS_BODY=$(curl -sf "$PRODUCTION_URL/metrics" 2>/dev/null); then
    log_pass "Production metrics endpoint is responding"

    # Check for token counting metrics
    if grep -q "zai_proxy_tokens_total" <<<"$METRICS_BODY"; then
        log_pass "Production has token counting metrics (token counting ENABLED)"
        PRODUCTION_TOKEN_COUNTING=true
    else
        log_info "Production missing token counting metrics (token counting DISABLED)"
    fi
else
    log_fail "Production metrics endpoint is not responding"
fi
TESTS_TOTAL=$((TESTS_TOTAL + 1))

if [[ "$CANARY_AVAILABLE" == "true" ]]; then
    log_test "Metrics Endpoint - Canary"
    if METRICS_BODY=$(curl -sf "$CANARY_URL/metrics" 2>/dev/null); then
        log_pass "Canary metrics endpoint is responding"

        # Check for token counting metrics
        if grep -q "zai_proxy_tokens_total" <<<"$METRICS_BODY"; then
            log_pass "Canary has token counting metrics (token counting ENABLED)"
            CANARY_TOKEN_COUNTING=true
        else
            log_fail "Canary missing token counting metrics (token counting DISABLED)"
        fi
    else
        log_fail "Canary metrics endpoint is not responding"
    fi
    TESTS_TOTAL=$((TESTS_TOTAL + 1))
fi
echo ""

# Test 3: Token Counting Validation
log_header "Token Counting Validation Tests"

# Test 3.1: Basic Request Token Counting
log_test "Basic Request - Token Counting"
# The `|| RESPONSE=""` fallback matters under `set -e`: a bare assignment from
# a failing command substitution (e.g. connection refused) would terminate the
# script here, before any summary is written.
RESPONSE=$(curl -s -X POST "$PRODUCTION_URL/v1/messages" \
    -H "Content-Type: application/json" \
    -H "x-api-key: $ZAI_API_KEY" \
    -H "anthropic-version: 2023-06-01" \
    -d '{
        "model": "glm-4",
        "max_tokens": 50,
        "messages": [{"role": "user", "content": "Hello, how are you?"}]
    }') || RESPONSE=""

if [[ -z "$RESPONSE" ]]; then
    log_fail "Production returned no response for basic request"
elif jq -e '.usage' <<<"$RESPONSE" > /dev/null 2>&1; then
    # `// "null"` turns a missing/null counter into the literal string "null".
    INPUT_TOKENS=$(jq -r '.usage.input_tokens // "null"' <<<"$RESPONSE")
    OUTPUT_TOKENS=$(jq -r '.usage.output_tokens // "null"' <<<"$RESPONSE")

    if [[ "$INPUT_TOKENS" != "null" && "$OUTPUT_TOKENS" != "null" ]]; then
        log_pass "Production returns token usage: input=$INPUT_TOKENS, output=$OUTPUT_TOKENS"
    else
        log_fail "Production usage field missing token counts: $RESPONSE"
    fi
else
    log_info "Production does not return usage field (token counting may be disabled)"
fi
TESTS_TOTAL=$((TESTS_TOTAL + 1))

# Test 3.2: Streaming Request Token Counting
log_test "Streaming Request - Token Counting"
STREAM_RESPONSE=$(curl -s -X POST "$PRODUCTION_URL/v1/messages" \
    -H "Content-Type: application/json" \
    -H "x-api-key: $ZAI_API_KEY" \
    -H "anthropic-version: 2023-06-01" \
    -d '{
        "model": "glm-4",
        "max_tokens": 50,
        "stream": true,
        "messages": [{"role": "user", "content": "Say hello"}]
    }') || STREAM_RESPONSE=""

# Check for usage in streaming response.
# Here-strings and `grep -m 1` replace `echo | grep` and `grep | head -1`:
# under `pipefail` an early-exiting reader can make the pipeline report
# failure even though a match was found.
if [[ -z "$STREAM_RESPONSE" ]]; then
    log_fail "Production returned no response for streaming request"
elif grep -q '"usage"' <<<"$STREAM_RESPONSE"; then
    # First line of the stream that mentions "usage"
    USAGE_LINE=$(grep -m 1 '"usage"' <<<"$STREAM_RESPONSE")
    log_pass "Streaming response includes token usage: $USAGE_LINE"
else
    log_info "Streaming response does not include token usage"
fi
TESTS_TOTAL=$((TESTS_TOTAL + 1))
echo ""

# Test 4: Format Comparison
log_header "Format Comparison Tests"

# Test 4.1: Response Format Validation
log_test "Response Format Validation"
# On a curl failure the response is treated as empty, so every field check
# below reports "false" and the test is logged as failed. Without the
# fallback, `set -e` would terminate the script on the failed assignment.
FORMAT_RESPONSE=$(curl -s -X POST "$PRODUCTION_URL/v1/messages" \
    -H "Content-Type: application/json" \
    -H "x-api-key: $ZAI_API_KEY" \
    -H "anthropic-version: 2023-06-01" \
    -d '{
        "model": "glm-4",
        "max_tokens": 50,
        "messages": [{"role": "user", "content": "Hi"}]
    }') || FORMAT_RESPONSE=""

# Validate response structure.
# `jq -e` exits non-zero when the selected value is null/false, when the input
# is not valid JSON, or when there is no input at all.
HAS_ID=false
HAS_TYPE=false
HAS_ROLE=false
HAS_CONTENT=false
if jq -e '.id' <<<"$FORMAT_RESPONSE" > /dev/null 2>&1; then HAS_ID=true; fi
if jq -e '.type' <<<"$FORMAT_RESPONSE" > /dev/null 2>&1; then HAS_TYPE=true; fi
if jq -e '.role' <<<"$FORMAT_RESPONSE" > /dev/null 2>&1; then HAS_ROLE=true; fi
if jq -e '.content' <<<"$FORMAT_RESPONSE" > /dev/null 2>&1; then HAS_CONTENT=true; fi

if [[ "$HAS_ID" == "true" && "$HAS_TYPE" == "true" && "$HAS_ROLE" == "true" && "$HAS_CONTENT" == "true" ]]; then
    log_pass "Response has valid structure: id, type, role, content"
else
    log_fail "Response structure incomplete: id=$HAS_ID, type=$HAS_TYPE, role=$HAS_ROLE, content=$HAS_CONTENT"
fi
TESTS_TOTAL=$((TESTS_TOTAL + 1))
echo ""

# Test 5: Performance Benchmarks
log_header "Performance Benchmark Tests"

log_test "Latency Benchmark (10 requests)"
TOTAL_TIME=0
NUM_REQUESTS=10
FAILED_REQUESTS=0

# Sequential requests, each timed in milliseconds.
# NOTE: `date +%s%3N` relies on GNU date's %N extension.
for ((i = 1; i <= NUM_REQUESTS; i++)); do
    START=$(date +%s%3N)
    # A failing curl is counted instead of being left to `set -e`, which
    # would terminate the script in the middle of the benchmark.
    if ! curl -s -X POST "$PRODUCTION_URL/v1/messages" \
        -H "Content-Type: application/json" \
        -H "x-api-key: $ZAI_API_KEY" \
        -H "anthropic-version: 2023-06-01" \
        -d '{
            "model": "glm-4",
            "max_tokens": 20,
            "messages": [{"role": "user", "content": "Hi"}]
        }' > /dev/null; then
        FAILED_REQUESTS=$((FAILED_REQUESTS + 1))
    fi
    END=$(date +%s%3N)
    ELAPSED=$((END - START))
    TOTAL_TIME=$((TOTAL_TIME + ELAPSED))
done

AVG_LATENCY=$((TOTAL_TIME / NUM_REQUESTS))
log_info "Average latency: ${AVG_LATENCY}ms over $NUM_REQUESTS requests"

# Failed requests tend to return quickly, so their timings must not be
# allowed to produce a "latency excellent" verdict.
if [[ $FAILED_REQUESTS -gt 0 ]]; then
    log_fail "Latency benchmark: $FAILED_REQUESTS/$NUM_REQUESTS requests failed"
elif [[ $AVG_LATENCY -lt 100 ]]; then
    log_pass "Latency excellent (<100ms average)"
elif [[ $AVG_LATENCY -lt 500 ]]; then
    log_pass "Latency acceptable (<500ms average)"
else
    log_fail "Latency high (>=500ms average): ${AVG_LATENCY}ms"
fi
TESTS_TOTAL=$((TESTS_TOTAL + 1))
echo ""

# Test 6: Streaming Response Tests
log_header "Streaming Response Tests"

log_test "Streaming Response Format"
# On a curl failure the stream is treated as empty, so all three event checks
# report "false" and the test is logged as failed. Without the fallback,
# `set -e` would terminate the script on the failed assignment.
STREAM_TEST=$(curl -s -X POST "$PRODUCTION_URL/v1/messages" \
    -H "Content-Type: application/json" \
    -H "x-api-key: $ZAI_API_KEY" \
    -H "anthropic-version: 2023-06-01" \
    -d '{
        "model": "glm-4",
        "max_tokens": 50,
        "stream": true,
        "messages": [{"role": "user", "content": "Count to 5"}]
    }') || STREAM_TEST=""

# Check for required SSE events.
# Here-strings are used instead of `echo | grep -q`: under `pipefail`, grep
# exiting at the first match can make the pipeline report failure for a large
# response even though the event is present.
HAS_MESSAGE_START=false
HAS_CONTENT_DELTA=false
HAS_MESSAGE_STOP=false
if grep -q "message_start" <<<"$STREAM_TEST"; then HAS_MESSAGE_START=true; fi
if grep -q "content_block_delta" <<<"$STREAM_TEST"; then HAS_CONTENT_DELTA=true; fi
if grep -q "message_stop" <<<"$STREAM_TEST"; then HAS_MESSAGE_STOP=true; fi

if [[ "$HAS_MESSAGE_START" == "true" && "$HAS_CONTENT_DELTA" == "true" && "$HAS_MESSAGE_STOP" == "true" ]]; then
    log_pass "Streaming response has valid SSE format"
else
    log_fail "Streaming response missing SSE events: start=$HAS_MESSAGE_START, delta=$HAS_CONTENT_DELTA, stop=$HAS_MESSAGE_STOP"
fi
TESTS_TOTAL=$((TESTS_TOTAL + 1))
echo ""

# Test 7: Load Testing
log_header "Load Testing - Concurrent Requests"

log_test "Concurrent Request Test (5 parallel)"
TEMP_DIR=$(mktemp -d)
# Remove the scratch directory even if the script aborts inside this section;
# the trap is cleared again once the directory has been removed normally.
trap 'rm -rf -- "$TEMP_DIR"' EXIT
PIDS=()

# Launch five requests in background subshells. Each writes its response body
# and its elapsed time (ms) into TEMP_DIR.
# NOTE: `date +%s%3N` relies on GNU date's %N extension.
for i in {1..5}; do
    (
        START=$(date +%s%3N)
        curl -s -X POST "$PRODUCTION_URL/v1/messages" \
            -H "Content-Type: application/json" \
            -H "x-api-key: $ZAI_API_KEY" \
            -H "anthropic-version: 2023-06-01" \
            -d '{
                "model": "glm-4",
                "max_tokens": 20,
                "messages": [{"role": "user", "content": "Test"}]
            }' > "$TEMP_DIR/response_$i.json" 2>&1
        END=$(date +%s%3N)
        echo $((END - START)) > "$TEMP_DIR/latency_$i.txt"
    ) &
    PIDS+=($!)
done

# Wait for all requests to complete; a failed job is detected below through
# its missing or invalid response file, so its exit status is ignored here.
for pid in "${PIDS[@]}"; do
    wait "$pid" || true
done

# Check results
SUCCESS_COUNT=0
TOTAL_LATENCY=0
LATENCY_SAMPLES=0
for i in {1..5}; do
    if [[ -f "$TEMP_DIR/response_$i.json" ]]; then
        if jq -e '.content' "$TEMP_DIR/response_$i.json" > /dev/null 2>&1; then
            # Plain assignment: `((SUCCESS_COUNT++))` evaluates to 0 on the
            # first success, returns status 1 and would terminate the script
            # under `set -e`.
            SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
        fi
    fi
    if [[ -f "$TEMP_DIR/latency_$i.txt" ]]; then
        LATENCY=$(cat "$TEMP_DIR/latency_$i.txt")
        # Only accumulate well-formed integer samples.
        if [[ "$LATENCY" =~ ^[0-9]+$ ]]; then
            TOTAL_LATENCY=$((TOTAL_LATENCY + LATENCY))
            LATENCY_SAMPLES=$((LATENCY_SAMPLES + 1))
        fi
    fi
done

rm -rf -- "$TEMP_DIR"
trap - EXIT

log_info "Concurrent test: $SUCCESS_COUNT/5 requests succeeded"
if [[ $LATENCY_SAMPLES -gt 0 ]]; then
    log_info "Concurrent test: average latency $((TOTAL_LATENCY / LATENCY_SAMPLES))ms over $LATENCY_SAMPLES requests"
fi
if [[ $SUCCESS_COUNT -eq 5 ]]; then
    log_pass "All concurrent requests succeeded"
else
    log_fail "Some concurrent requests failed: $SUCCESS_COUNT/5"
fi
TESTS_TOTAL=$((TESTS_TOTAL + 1))
echo ""

# Test 8: Production Metrics Monitoring
log_header "Production Metrics Monitoring"

log_test "Production Token Counting Metrics"
# An unreachable endpoint yields an empty string rather than aborting.
PROD_METRICS=$(curl -s "$PRODUCTION_URL/metrics" 2>/dev/null || echo "")

if [[ -n "$PROD_METRICS" ]]; then
    # Extract token counts from metrics.
    # awk takes the last field of the first matching sample line and prints it
    # as a whole number, or prints 0 when no line matches. This always yields
    # exactly one integer, including for values written as floats or in
    # exponent notation.
    INPUT_TOTAL=$(awk '
        /zai_proxy_tokens_total\{direction="input"/ { printf "%.0f\n", $NF; found = 1; exit }
        END { if (!found) print 0 }
    ' <<<"$PROD_METRICS")
    OUTPUT_TOTAL=$(awk '
        /zai_proxy_tokens_total\{direction="output"/ { printf "%.0f\n", $NF; found = 1; exit }
        END { if (!found) print 0 }
    ' <<<"$PROD_METRICS")

    log_info "Production metrics - Input tokens: $INPUT_TOTAL, Output tokens: $OUTPUT_TOTAL"

    if [[ "$INPUT_TOTAL" -gt 0 || "$OUTPUT_TOTAL" -gt 0 ]]; then
        log_pass "Production has recorded token usage"
    else
        log_info "Production has not recorded token usage yet (may be new deployment)"
    fi

    # Check request counts: sum the sample value of every non-comment
    # zai_proxy_requests_total line. The sum is printed even when nothing
    # matched (as 0), so the log line never contains an empty value.
    REQUEST_TOTAL=$(awk '
        /zai_proxy_requests_total/ && !/^#/ { sum += $NF }
        END { printf "%.0f\n", sum }
    ' <<<"$PROD_METRICS")
    log_info "Total requests processed: $REQUEST_TOTAL"
else
    log_fail "Could not fetch production metrics"
fi
TESTS_TOTAL=$((TESTS_TOTAL + 1))
echo ""

# Canary vs Production Comparison (if canary is available)
if [[ "$CANARY_AVAILABLE" == "true" ]]; then
    log_header "Canary vs Production Comparison"

    log_test "Token Counting Feature Comparison"
    # The token-counting flags are only assigned when the corresponding
    # metrics endpoint responded; default them to "false" so that reading them
    # cannot abort the script under `set -u`.
    PROD_TC="${PRODUCTION_TOKEN_COUNTING:-false}"
    CANARY_TC="${CANARY_TOKEN_COUNTING:-false}"
    if [[ "$PROD_TC" == "true" && "$CANARY_TC" == "true" ]]; then
        log_pass "Both production and canary have token counting enabled"
    elif [[ "$PROD_TC" == "false" && "$CANARY_TC" == "true" ]]; then
        log_pass "Canary has token counting, production does not (EXPECTED for canary testing)"
    elif [[ "$PROD_TC" == "true" && "$CANARY_TC" == "false" ]]; then
        log_fail "Production has token counting but canary does not (UNEXPECTED)"
    else
        log_info "Both production and canary have token counting disabled"
    fi
    TESTS_TOTAL=$((TESTS_TOTAL + 1))

    # Test identical request on both
    log_test "Identical Request Comparison"

    # Production request (a curl failure yields an empty response instead of
    # terminating the script under `set -e`)
    PROD_RESP=$(curl -s -X POST "$PRODUCTION_URL/v1/messages" \
        -H "Content-Type: application/json" \
        -H "x-api-key: $ZAI_API_KEY" \
        -H "anthropic-version: 2023-06-01" \
        -d '{
            "model": "glm-4",
            "max_tokens": 50,
            "messages": [{"role": "user", "content": "What is 2+2?"}]
        }') || PROD_RESP=""

    # Canary request
    CANARY_RESP=$(curl -s -X POST "$CANARY_URL/v1/messages" \
        -H "Content-Type: application/json" \
        -H "x-api-key: $ZAI_API_KEY" \
        -H "anthropic-version: 2023-06-01" \
        -d '{
            "model": "glm-4",
            "max_tokens": 50,
            "messages": [{"role": "user", "content": "What is 2+2?"}]
        }') || CANARY_RESP=""

    # jq fails on a non-JSON body; map that to "null" rather than aborting.
    PROD_INPUT=$(jq -r '.usage.input_tokens // "null"' <<<"$PROD_RESP" 2>/dev/null) || PROD_INPUT="null"
    CANARY_INPUT=$(jq -r '.usage.input_tokens // "null"' <<<"$CANARY_RESP" 2>/dev/null) || CANARY_INPUT="null"

    # Compare only when both values are plain integers: an empty response
    # produces an empty string, and two empty strings would otherwise compare
    # as equal under -eq.
    if [[ "$PROD_INPUT" =~ ^[0-9]+$ && "$CANARY_INPUT" =~ ^[0-9]+$ ]]; then
        log_info "Production input tokens: $PROD_INPUT, Canary input tokens: $CANARY_INPUT"
        if [[ "$PROD_INPUT" -eq "$CANARY_INPUT" ]]; then
            log_pass "Input token counts match between production and canary"
        else
            log_info "Input token counts differ (expected if implementations differ)"
        fi
    else
        log_info "Cannot compare token counts (one or both missing usage field)"
    fi
    TESTS_TOTAL=$((TESTS_TOTAL + 1))
    echo ""
fi

# Generate Summary
log_header "Test Summary"

log_info "Total Tests: $TESTS_TOTAL"
log_info "Passed: $TESTS_PASSED"
log_info "Failed: $TESTS_FAILED"
log_info "Skipped: $TESTS_SKIPPED"

# Calculate pass rate (integer percentage). PASS_RATE is initialised first so
# the summary below can always read it under `set -u`.
PASS_RATE=0
if [[ $TESTS_TOTAL -gt 0 ]]; then
    PASS_RATE=$((TESTS_PASSED * 100 / TESTS_TOTAL))
    log_info "Pass Rate: ${PASS_RATE}%"
fi

# Write JSON summary.
# jq builds the document so that string values (the URLs) are escaped
# correctly. The token-counting/availability flags fall back to "false" when
# an earlier section never assigned them (e.g. metrics endpoint unreachable),
# which would otherwise abort the script under `set -u`.
jq -n \
    --arg timestamp "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
    --arg production_url "$PRODUCTION_URL" \
    --arg canary_url "$CANARY_URL" \
    --argjson total "$TESTS_TOTAL" \
    --argjson passed "$TESTS_PASSED" \
    --argjson failed "$TESTS_FAILED" \
    --argjson skipped "$TESTS_SKIPPED" \
    --argjson pass_rate "$PASS_RATE" \
    --argjson production_token_counting "${PRODUCTION_TOKEN_COUNTING:-false}" \
    --argjson canary_available "${CANARY_AVAILABLE:-false}" \
    --argjson canary_token_counting "${CANARY_TOKEN_COUNTING:-false}" \
    '{
        timestamp: $timestamp,
        production_url: $production_url,
        canary_url: $canary_url,
        tests: {
            total: $total,
            passed: $passed,
            failed: $failed,
            skipped: $skipped,
            pass_rate: $pass_rate
        },
        production: {
            token_counting_enabled: $production_token_counting
        },
        canary: {
            available: $canary_available,
            token_counting_enabled: $canary_token_counting
        }
    }' > "$SUMMARY_FILE"

log_info "Results saved to: $RESULTS_FILE"
log_info "Summary saved to: $SUMMARY_FILE"

# Final verdict: exit 0 only when nothing failed; any failure exits 1, with
# the message distinguishing a minority from a majority of failed tests.
echo ""
if [[ $TESTS_FAILED -eq 0 ]]; then
    log_pass "All tests passed!"
    exit 0
elif [[ $TESTS_FAILED -lt $((TESTS_TOTAL / 2)) ]]; then
    log_info "Some tests failed but majority passed"
    exit 1
else
    log_fail "Majority of tests failed"
    exit 1
fi