// zai-proxy/proxy/performance_benchmark_test.go

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/http/httptest"
	"os"
	"runtime"
	"strings"
	"sync"
	"sync/atomic"
	"testing"
	"time"
)

// BenchmarkConfig groups the tunable parameters of a set of benchmark runs.
type BenchmarkConfig struct {
	Enabled bool

	// InputSizes is expressed in characters.
	ConcurrentUsers, InputSizes []int

	IterationsPerSize int
}

// BenchmarkResult is the record produced by one benchmark run: request
// counts, the latency distribution, and derived throughput figures.
type BenchmarkResult struct {
	Name          string
	TotalRequests int

	// Wall-clock total followed by the latency distribution.
	TotalDuration, AvgLatency                           time.Duration
	P50Latency, P95Latency, P99Latency                  time.Duration
	MinLatency, MaxLatency                              time.Duration
	RequestsPerSecond, ThroughputMBps                   float64
	TokenCountTime                                      time.Duration
	SuccessCount, ErrorCount                            int
}

// BenchmarkSuite runs comprehensive performance benchmarks
// and collects one BenchmarkResult per run.
type BenchmarkSuite struct {
	// results holds the BenchmarkResult values gathered so far.
	results []BenchmarkResult
	// NOTE(review): mu presumably guards results; no method using it is
	// visible in this part of the file — confirm against the callers.
	mu      sync.Mutex
}

// Fixture texts of increasing size. Every text except shortText is the same
// sentence repeated a fixed number of times.
var (
	shortText    = "Hello, how are you today?"
	mediumText   = strings.Repeat("The quick brown fox jumps over the lazy dog. ", 10)
	longText     = strings.Repeat("The quick brown fox jumps over the lazy dog. ", 100)
	veryLongText = strings.Repeat("The quick brown fox jumps over the lazy dog. ", 1000)
)

// Fixture conversations.
var (
	// multiMessageShort is a five-turn exchange made of short messages.
	multiMessageShort = []RequestMessage{
		{
			Role:    "user",
			Content: json.RawMessage(`"What is the capital of France?"`),
		},
		{
			Role:    "assistant",
			Content: json.RawMessage(`"The capital of France is Paris."`),
		},
		{
			Role:    "user",
			Content: json.RawMessage(`"What is its population?"`),
		},
		{
			Role:    "assistant",
			Content: json.RawMessage(`"Paris has a population of approximately 2.1 million people within the city limits."`),
		},
		{
			Role:    "user",
			Content: json.RawMessage(`"Tell me more about its history."`),
		},
	}

	// multiMessageLong is declared empty here and filled in by init.
	multiMessageLong []RequestMessage
)

// init builds multiMessageLong from the fixture texts: a five-turn
// conversation whose message bodies shrink from long to short.
func init() {
	// quote wraps a fixture text in double quotes so it forms a JSON string.
	// A fresh byte slice is produced on every call, so no two messages share
	// backing storage.
	quote := func(text string) json.RawMessage {
		return json.RawMessage(`"` + text + `"`)
	}

	multiMessageLong = []RequestMessage{
		{Role: "user", Content: quote(longText)},
		{Role: "assistant", Content: quote(longText)},
		{Role: "user", Content: quote(mediumText)},
		{Role: "assistant", Content: quote(mediumText)},
		{Role: "user", Content: quote(shortText)},
	}
}

// mockUpstream starts an httptest server that stands in for the upstream
// API. Every request, whatever its method or path, sleeps for responseDelay
// and is then answered with status 200, a JSON content type and responseBody.
// The caller is responsible for closing the returned server.
func mockUpstream(responseDelay time.Duration, responseBody string) *httptest.Server {
	payload := []byte(responseBody)

	handler := func(w http.ResponseWriter, _ *http.Request) {
		time.Sleep(responseDelay)

		w.Header().Set("Content-Type", "application/json")
		w.WriteHeader(http.StatusOK)
		w.Write(payload)
	}

	return httptest.NewServer(http.HandlerFunc(handler))
}

// createMockServer starts an httptest server that imitates the proxy's
// request handling and always answers with a canned assistant message.
//
// tokenCountingEnabled is mirrored into the TOKEN_COUNTING_ENABLED
// environment variable and decides whether a token counter is installed: the
// tiktoken-based counter when it can be constructed, the simple counter
// otherwise. upstreamURL is accepted but not used by this function.
//
// The handler records request/response metrics, and when a counter is
// installed it counts the request's input tokens, reports them in the
// response's "usage" object and, if positive, in the X-Token-Input header.
// The caller is responsible for closing the returned server.
func createMockServer(tokenCountingEnabled bool, upstreamURL string) *httptest.Server {
	envValue := "true"
	if !tokenCountingEnabled {
		envValue = "false"
	}
	os.Setenv("TOKEN_COUNTING_ENABLED", envValue)

	// tc stays nil when counting is disabled; the handler keys off that.
	var tc TokenCounter
	if tokenCountingEnabled {
		if tikToken, err := NewTikTokenCounter(); err == nil {
			tc = tikToken
		} else {
			tc = NewSimpleTokenCounter()
		}
	}

	// Number of requests currently inside the handler.
	var inFlight int64

	handler := func(w http.ResponseWriter, r *http.Request) {
		began := time.Now()

		atomicAddInt64(&inFlight, 1)
		defer atomicAddInt64(&inFlight, -1)

		if r.ContentLength > 0 {
			requestSize.WithLabelValues(r.Method, r.URL.Path).Observe(float64(r.ContentLength))
		}

		var (
			inputTokens    int
			tokenCountTime time.Duration
		)
		counting := tc != nil

		switch {
		case r.Body == nil:
			// Nothing to read.
		case counting:
			// Read the body while copying it into replay, then put the copy
			// back so the body can still be read after counting.
			var replay bytes.Buffer
			payload, _ := io.ReadAll(io.TeeReader(r.Body, &replay))
			r.Body = io.NopCloser(&replay)

			countBegan := time.Now()
			inputTokens, _ = CountRequestTokens(payload, tc)
			tokenCountTime = time.Since(countBegan)

			if inputTokens > 0 {
				tokensTotal.WithLabelValues("input", tokenizerModel).Add(float64(inputTokens))
			}
		default:
			// Counting disabled: just consume the body.
			_, _ = io.ReadAll(r.Body)
		}

		reply := map[string]interface{}{
			"id":   "msg_123",
			"type": "message",
			"role": "assistant",
			"content": []map[string]interface{}{
				{
					"type": "text",
					"text": "This is a mock response for testing purposes.",
				},
			},
		}
		if counting {
			reply["usage"] = map[string]int{
				"input_tokens":  inputTokens,
				"output_tokens": 10,
			}
		}
		encoded, _ := json.Marshal(reply)

		hdr := w.Header()
		hdr.Set("Content-Type", "application/json")
		if counting && inputTokens > 0 {
			hdr.Set("X-Token-Input", fmt.Sprintf("%d", inputTokens))
		}

		w.WriteHeader(http.StatusOK)
		w.Write(encoded)

		// Metrics are recorded after the response has been written.
		elapsed := time.Since(began)
		method, path := r.Method, r.URL.Path
		requestsTotal.WithLabelValues(method, path, "200").Inc()
		requestDuration.WithLabelValues(method, path, "200").Observe(elapsed.Seconds())
		responseSize.WithLabelValues(method, path, "200").Observe(float64(len(encoded)))

		if counting && tokenCountTime > 0 {
			tokenCountDuration.WithLabelValues(deploymentVariant).Observe(tokenCountTime.Seconds())
		}
	}

	return httptest.NewServer(http.HandlerFunc(handler))
}

// atomicAddInt64 atomically adds delta to the value at addr and returns the
// updated value. It is a thin wrapper around atomic.AddInt64.
func atomicAddInt64(addr *int64, delta int64) int64 {
	updated := atomic.AddInt64(addr, delta)
	return updated
}

// BenchmarkTokenCountingBaseline measures performance WITHOUT token counting:
// each iteration only marshals a short single-message request. It is the
// reference point for BenchmarkTokenCountingEnabled, which does the same
// marshal and then counts tokens.
func BenchmarkTokenCountingBaseline(b *testing.B) {
	req := createTestRequest(shortText, false)

	b.ResetTimer()
	b.ReportAllocs()

	for i := 0; i < b.N; i++ {
		body, err := json.Marshal(req)
		if err != nil {
			// A failed marshal would make the measurement meaningless.
			b.Fatalf("marshaling request: %v", err)
		}
		_ = len(body)
	}
}

// BenchmarkTokenCountingEnabled measures performance WITH token counting:
// each iteration marshals a short single-message request and then counts its
// tokens. The benchmark is skipped when the tokenizer cannot be initialized.
func BenchmarkTokenCountingEnabled(b *testing.B) {
	counter, err := NewTikTokenCounter()
	if err != nil {
		b.Skipf("Failed to initialize tokenizer: %v", err)
	}

	req := createTestRequest(shortText, false)

	b.ResetTimer()
	b.ReportAllocs()

	for i := 0; i < b.N; i++ {
		body, err := json.Marshal(req)
		if err != nil {
			// Without a valid body the counter would be timed on garbage.
			b.Fatalf("marshaling request: %v", err)
		}
		_, _ = CountRequestTokens(body, counter)
	}
}

// BenchmarkTokenCountingShortText benchmarks token counting for a request
// carrying shortText. The request is marshaled once, outside the timed
// region. Skipped when the tokenizer cannot be initialized.
func BenchmarkTokenCountingShortText(b *testing.B) {
	counter, err := NewTikTokenCounter()
	if err != nil {
		b.Skipf("Failed to initialize tokenizer: %v", err)
	}

	body, err := json.Marshal(createTestRequest(shortText, false))
	if err != nil {
		// Fail fast instead of benchmarking an empty body.
		b.Fatalf("marshaling request: %v", err)
	}

	b.ResetTimer()
	b.ReportAllocs()

	for i := 0; i < b.N; i++ {
		_, _ = CountRequestTokens(body, counter)
	}
}

// BenchmarkTokenCountingMediumText benchmarks token counting for a request
// carrying mediumText. The request is marshaled once, outside the timed
// region. Skipped when the tokenizer cannot be initialized.
func BenchmarkTokenCountingMediumText(b *testing.B) {
	counter, err := NewTikTokenCounter()
	if err != nil {
		b.Skipf("Failed to initialize tokenizer: %v", err)
	}

	body, err := json.Marshal(createTestRequest(mediumText, false))
	if err != nil {
		// Fail fast instead of benchmarking an empty body.
		b.Fatalf("marshaling request: %v", err)
	}

	b.ResetTimer()
	b.ReportAllocs()

	for i := 0; i < b.N; i++ {
		_, _ = CountRequestTokens(body, counter)
	}
}

// BenchmarkTokenCountingLongTextPerformance benchmarks token counting for a
// request carrying longText. The request is marshaled once, outside the timed
// region. Skipped when the tokenizer cannot be initialized.
func BenchmarkTokenCountingLongTextPerformance(b *testing.B) {
	counter, err := NewTikTokenCounter()
	if err != nil {
		b.Skipf("Failed to initialize tokenizer: %v", err)
	}

	body, err := json.Marshal(createTestRequest(longText, false))
	if err != nil {
		// Fail fast instead of benchmarking an empty body.
		b.Fatalf("marshaling request: %v", err)
	}

	b.ResetTimer()
	b.ReportAllocs()

	for i := 0; i < b.N; i++ {
		_, _ = CountRequestTokens(body, counter)
	}
}

// BenchmarkTokenCountingVeryLongText benchmarks token counting for a request
// carrying veryLongText. The request is marshaled once, outside the timed
// region. Skipped when the tokenizer cannot be initialized.
func BenchmarkTokenCountingVeryLongText(b *testing.B) {
	counter, err := NewTikTokenCounter()
	if err != nil {
		b.Skipf("Failed to initialize tokenizer: %v", err)
	}

	body, err := json.Marshal(createTestRequest(veryLongText, false))
	if err != nil {
		// Fail fast instead of benchmarking an empty body.
		b.Fatalf("marshaling request: %v", err)
	}

	b.ResetTimer()
	b.ReportAllocs()

	for i := 0; i < b.N; i++ {
		_, _ = CountRequestTokens(body, counter)
	}
}

// BenchmarkTokenCountingMultiMessage benchmarks token counting for the short
// multi-message conversation fixture (multiMessageShort). The request is
// marshaled once, outside the timed region. Skipped when the tokenizer cannot
// be initialized.
func BenchmarkTokenCountingMultiMessage(b *testing.B) {
	counter, err := NewTikTokenCounter()
	if err != nil {
		b.Skipf("Failed to initialize tokenizer: %v", err)
	}

	req := RequestBody{
		Model:    "glm-4",
		Messages: multiMessageShort,
		Stream:   false,
	}
	body, err := json.Marshal(req)
	if err != nil {
		// Fail fast instead of benchmarking an empty body.
		b.Fatalf("marshaling request: %v", err)
	}

	b.ResetTimer()
	b.ReportAllocs()

	for i := 0; i < b.N; i++ {
		_, _ = CountRequestTokens(body, counter)
	}
}

// BenchmarkTokenCountingMultiMessageLong benchmarks token counting for the
// long multi-message conversation fixture (multiMessageLong). The request is
// marshaled once, outside the timed region. Skipped when the tokenizer cannot
// be initialized.
func BenchmarkTokenCountingMultiMessageLong(b *testing.B) {
	counter, err := NewTikTokenCounter()
	if err != nil {
		b.Skipf("Failed to initialize tokenizer: %v", err)
	}

	req := RequestBody{
		Model:    "glm-4",
		Messages: multiMessageLong,
		Stream:   false,
	}
	body, err := json.Marshal(req)
	if err != nil {
		// Fail fast instead of benchmarking an empty body.
		b.Fatalf("marshaling request: %v", err)
	}

	b.ResetTimer()
	b.ReportAllocs()

	for i := 0; i < b.N; i++ {
		_, _ = CountRequestTokens(body, counter)
	}
}

// BenchmarkConcurrentRequests10 runs the shared concurrent token-counting
// benchmark with a requested concurrency of 10.
func BenchmarkConcurrentRequests10(b *testing.B) {
	const concurrency = 10
	benchmarkConcurrentRequests(b, concurrency)
}

// BenchmarkConcurrentRequests50 runs the shared concurrent token-counting
// benchmark with a requested concurrency of 50.
func BenchmarkConcurrentRequests50(b *testing.B) {
	const concurrency = 50
	benchmarkConcurrentRequests(b, concurrency)
}

// BenchmarkConcurrentRequests100 runs the shared concurrent token-counting
// benchmark with a requested concurrency of 100.
func BenchmarkConcurrentRequests100(b *testing.B) {
	const concurrency = 100
	benchmarkConcurrentRequests(b, concurrency)
}

// benchmarkConcurrentRequests is the shared body of the
// BenchmarkConcurrentRequestsN benchmarks. It counts the tokens of one
// pre-marshaled medium-sized request from many goroutines at once.
//
// concurrency is the number of goroutines asked for. RunParallel can only
// start a multiple of GOMAXPROCS goroutines, so the actual number is the
// smallest such multiple that is at least concurrency. Skipped when the
// tokenizer cannot be initialized.
func benchmarkConcurrentRequests(b *testing.B, concurrency int) {
	counter, err := NewTikTokenCounter()
	if err != nil {
		b.Skipf("Failed to initialize tokenizer: %v", err)
	}

	body, err := json.Marshal(createTestRequest(mediumText, false))
	if err != nil {
		// Fail fast instead of benchmarking an empty body.
		b.Fatalf("marshaling request: %v", err)
	}

	// Without this call every variant runs with exactly GOMAXPROCS goroutines
	// and the concurrency argument has no effect. Round up so at least
	// `concurrency` goroutines are used; values below 1 are ignored by
	// SetParallelism.
	procs := runtime.GOMAXPROCS(0)
	b.SetParallelism((concurrency + procs - 1) / procs)

	b.ResetTimer()

	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			_, _ = CountRequestTokens(body, counter)
		}
	})
}

// BenchmarkEndToEndNoTokenCounting benchmarks the full request flow without
// token counting: marshal a short request, POST it to a mock proxy server
// over HTTP, and read the response.
func BenchmarkEndToEndNoTokenCounting(b *testing.B) {
	server := createMockServer(false, "")
	defer server.Close()

	req := createTestRequest(shortText, false)

	// One client for the whole run; a new client per request is wasted work.
	client := &http.Client{Timeout: 5 * time.Second}

	b.ResetTimer()
	b.ReportAllocs()

	for i := 0; i < b.N; i++ {
		body, err := json.Marshal(req)
		if err != nil {
			b.Fatalf("marshaling request: %v", err)
		}
		httpReq, err := http.NewRequest("POST", server.URL+"/v1/messages", bytes.NewReader(body))
		if err != nil {
			b.Fatalf("building request: %v", err)
		}
		httpReq.Header.Set("Content-Type", "application/json")

		resp, err := client.Do(httpReq)
		if err != nil {
			b.Fatalf("Request failed: %v", err)
		}
		// Drain before closing: an unread body prevents the transport from
		// reusing the keep-alive connection, so every iteration would pay for
		// a new TCP connection.
		_, drainErr := io.Copy(io.Discard, resp.Body)
		resp.Body.Close()
		if drainErr != nil {
			b.Fatalf("reading response: %v", drainErr)
		}
		if resp.StatusCode != http.StatusOK {
			b.Fatalf("unexpected status: %s", resp.Status)
		}
	}
}

// BenchmarkEndToEndWithTokenCounting benchmarks the full request flow with
// token counting enabled on the mock proxy server: marshal a short request,
// POST it over HTTP, and read the response.
func BenchmarkEndToEndWithTokenCounting(b *testing.B) {
	server := createMockServer(true, "")
	defer server.Close()

	req := createTestRequest(shortText, false)

	// One client for the whole run; a new client per request is wasted work.
	client := &http.Client{Timeout: 5 * time.Second}

	b.ResetTimer()
	b.ReportAllocs()

	for i := 0; i < b.N; i++ {
		body, err := json.Marshal(req)
		if err != nil {
			b.Fatalf("marshaling request: %v", err)
		}
		httpReq, err := http.NewRequest("POST", server.URL+"/v1/messages", bytes.NewReader(body))
		if err != nil {
			b.Fatalf("building request: %v", err)
		}
		httpReq.Header.Set("Content-Type", "application/json")

		resp, err := client.Do(httpReq)
		if err != nil {
			b.Fatalf("Request failed: %v", err)
		}
		// Drain before closing: an unread body prevents the transport from
		// reusing the keep-alive connection, so every iteration would pay for
		// a new TCP connection.
		_, drainErr := io.Copy(io.Discard, resp.Body)
		resp.Body.Close()
		if drainErr != nil {
			b.Fatalf("reading response: %v", drainErr)
		}
		if resp.StatusCode != http.StatusOK {
			b.Fatalf("unexpected status: %s", resp.Status)
		}
	}
}

// TestTokenCountingOverhead measures the average time CountRequestTokens
// takes for short, medium and long single-message requests and fails when the
// average exceeds the per-case limit. Skipped when the tokenizer cannot be
// initialized.
func TestTokenCountingOverhead(t *testing.T) {
	counter, err := NewTikTokenCounter()
	if err != nil {
		t.Skipf("Failed to initialize tokenizer: %v", err)
	}

	testCases := []struct {
		name     string
		text     string
		maxDelay time.Duration
	}{
		{"Short text", shortText, 5 * time.Millisecond},
		{"Medium text", mediumText, 10 * time.Millisecond},
		{"Long text", longText, 50 * time.Millisecond},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			req := createTestRequest(tc.text, false)
			body, err := json.Marshal(req)
			if err != nil {
				t.Fatalf("marshaling request: %v", err)
			}

			// Warm up. Errors are checked here so that the timed loop below
			// cannot pass by measuring a fast failure path.
			for i := 0; i < 10; i++ {
				if _, err := CountRequestTokens(body, counter); err != nil {
					t.Fatalf("counting tokens: %v", err)
				}
			}

			// Measure
			iterations := 100
			start := time.Now()
			for i := 0; i < iterations; i++ {
				CountRequestTokens(body, counter)
			}
			elapsed := time.Since(start)
			avgTime := elapsed / time.Duration(iterations)

			t.Logf("Average token counting time: %v", avgTime)

			if avgTime > tc.maxDelay {
				t.Errorf("Token counting too slow: %v (max: %v)", avgTime, tc.maxDelay)
			}
		})
	}
}

// TestConcurrentLoad tests token counting under concurrent load. For each
// concurrency level it issues requestsPerLevel CountRequestTokens calls spread
// across that many goroutines, logs throughput and latency statistics, and
// fails when the average latency exceeds 5ms or any call returns an error.
// Skipped when the tokenizer cannot be initialized.
func TestConcurrentLoad(t *testing.T) {
	counter, err := NewTikTokenCounter()
	if err != nil {
		t.Skipf("Failed to initialize tokenizer: %v", err)
	}

	concurrencyLevels := []int{10, 50, 100}
	requestsPerLevel := 100

	for _, concurrency := range concurrencyLevels {
		t.Run(fmt.Sprintf("Concurrent_%d", concurrency), func(t *testing.T) {
			req := createTestRequest(mediumText, false)
			body, err := json.Marshal(req)
			if err != nil {
				t.Fatalf("marshaling request: %v", err)
			}

			var wg sync.WaitGroup
			var mu sync.Mutex
			latencies := make([]time.Duration, 0, requestsPerLevel)
			errCount := 0
			startTime := time.Now()

			// Hand out the remainder one request at a time so that exactly
			// requestsPerLevel calls are made even when the level does not
			// divide the total (plain integer division would drop them, or
			// issue none at all when concurrency > requestsPerLevel).
			perWorker := requestsPerLevel / concurrency
			extra := requestsPerLevel % concurrency

			for i := 0; i < concurrency; i++ {
				quota := perWorker
				if i < extra {
					quota++
				}

				wg.Add(1)
				go func(quota int) {
					defer wg.Done()

					for j := 0; j < quota; j++ {
						reqStart := time.Now()
						_, err := CountRequestTokens(body, counter)
						reqDuration := time.Since(reqStart)

						// The lock is taken after the measurement, so
						// contention on it does not inflate the latencies.
						mu.Lock()
						latencies = append(latencies, reqDuration)
						if err != nil {
							errCount++
						}
						mu.Unlock()
					}
				}(quota)
			}

			wg.Wait()
			totalDuration := time.Since(startTime)

			// Calculate statistics
			if len(latencies) == 0 {
				t.Fatal("No latencies recorded")
			}

			// Sort latencies ascending for the percentiles. The sample is
			// small, so a simple exchange sort is sufficient.
			for i := 0; i < len(latencies); i++ {
				for j := i + 1; j < len(latencies); j++ {
					if latencies[i] > latencies[j] {
						latencies[i], latencies[j] = latencies[j], latencies[i]
					}
				}
			}

			totalRequests := len(latencies)
			p50 := latencies[totalRequests*50/100]
			p95 := latencies[totalRequests*95/100]
			p99 := latencies[totalRequests*99/100]
			minLat := latencies[0]
			maxLat := latencies[totalRequests-1]

			var sum time.Duration
			for _, l := range latencies {
				sum += l
			}
			avgLat := sum / time.Duration(totalRequests)

			rps := float64(totalRequests) / totalDuration.Seconds()

			t.Logf("Concurrency: %d", concurrency)
			t.Logf("Total requests: %d", totalRequests)
			t.Logf("Total duration: %v", totalDuration)
			t.Logf("Requests per second: %.2f", rps)
			t.Logf("Errors: %d", errCount)
			t.Logf("Avg latency: %v", avgLat)
			t.Logf("Min latency: %v", minLat)
			t.Logf("Max latency: %v", maxLat)
			t.Logf("P50 latency: %v", p50)
			t.Logf("P95 latency: %v", p95)
			t.Logf("P99 latency: %v", p99)

			// Verify targets
			if avgLat > 5*time.Millisecond {
				t.Errorf("Average latency exceeds 5ms target: %v", avgLat)
			}

			if errCount > 0 {
				t.Errorf("Encountered %d errors", errCount)
			}
		})
	}
}

// TestMemoryProfile reports how many bytes CountRequestTokens allocates per
// call for short, medium and long requests, using the difference in
// runtime.MemStats.TotalAlloc across 1000 calls. It fails when the average
// exceeds 10KB. Skipped in short mode and when the tokenizer cannot be
// initialized.
func TestMemoryProfile(t *testing.T) {
	if testing.Short() {
		t.Skip("Skipping memory profile in short mode")
	}

	counter, err := NewTikTokenCounter()
	if err != nil {
		t.Skipf("Failed to initialize tokenizer: %v", err)
	}

	const (
		iterations = 1000
		// Allowed average allocation per request, even for long texts.
		maxAlloc = uint64(10 * 1024)
	)

	cases := []struct{ name, text string }{
		{name: "Short", text: shortText},
		{name: "Medium", text: mediumText},
		{name: "Long", text: longText},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			body, _ := json.Marshal(createTestRequest(tc.text, false))

			// Collect garbage, then snapshot the allocator before the calls.
			runtime.GC()
			var before runtime.MemStats
			runtime.ReadMemStats(&before)

			for n := 0; n < iterations; n++ {
				CountRequestTokens(body, counter)
			}

			// Collect again and take the closing snapshot.
			runtime.GC()
			var after runtime.MemStats
			runtime.ReadMemStats(&after)

			allocDiff := after.TotalAlloc - before.TotalAlloc
			avgAlloc := allocDiff / uint64(iterations)

			t.Logf("Text length: %d bytes", len(tc.text))
			t.Logf("Total allocations: %d bytes", allocDiff)
			t.Logf("Avg allocation per request: %d bytes", avgAlloc)

			if avgAlloc > maxAlloc {
				t.Errorf("Memory allocation too high: %d bytes (max: %d)", avgAlloc, maxAlloc)
			}
		})
	}
}

// TestComparisonTokenCounting compares a no-op baseline loop with a loop that
// calls CountRequestTokens, for short, medium and long requests. It logs the
// absolute and relative overhead and fails when the average overhead per
// request exceeds 5ms. Skipped when the tokenizer cannot be initialized.
func TestComparisonTokenCounting(t *testing.T) {
	counter, err := NewTikTokenCounter()
	if err != nil {
		t.Skipf("Failed to initialize tokenizer: %v", err)
	}

	testCases := []struct {
		name string
		text string
	}{
		{"Short", shortText},
		{"Medium", mediumText},
		{"Long", longText},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			req := createTestRequest(tc.text, false)
			body, err := json.Marshal(req)
			if err != nil {
				t.Fatalf("marshaling request: %v", err)
			}

			iterations := 100

			// Measure baseline (no token counting)
			startBaseline := time.Now()
			for i := 0; i < iterations; i++ {
				_ = len(body)
			}
			baselineDuration := time.Since(startBaseline)

			// Measure with token counting
			startCounting := time.Now()
			for i := 0; i < iterations; i++ {
				CountRequestTokens(body, counter)
			}
			countingDuration := time.Since(startCounting)

			// Calculate overhead
			overhead := countingDuration - baselineDuration
			avgOverheadPerRequest := overhead / time.Duration(iterations)

			t.Logf("Baseline duration: %v", baselineDuration)
			t.Logf("Counting duration: %v", countingDuration)
			if baselineDuration > 0 {
				overheadPercent := float64(overhead) / float64(baselineDuration) * 100
				t.Logf("Overhead: %v (%.2f%%)", overhead, overheadPercent)
			} else {
				// The baseline loop is trivial and can measure as zero on a
				// coarse clock; dividing by it would print Inf or NaN.
				t.Logf("Overhead: %v (baseline too fast to measure; percentage omitted)", overhead)
			}
			t.Logf("Avg overhead per request: %v", avgOverheadPerRequest)

			// Verify overhead is acceptable
			// We expect < 5ms overhead per request
			if avgOverheadPerRequest > 5*time.Millisecond {
				t.Errorf("Overhead too high: %v per request (max: 5ms)", avgOverheadPerRequest)
			}
		})
	}
}

// Helper functions

// createTestRequest builds a single-message "glm-4" request whose only
// message is a user message carrying content as a JSON string. stream is
// copied into the request's Stream field.
//
// content is encoded with json.Marshal rather than wrapped in quotes by hand,
// so text containing quotes, backslashes or control characters still yields
// valid JSON. For plain text without such characters the encoded bytes are
// the same as simple quoting.
func createTestRequest(content string, stream bool) RequestBody {
	// Marshaling a Go string cannot fail, so the error is safely ignored.
	encoded, _ := json.Marshal(content)

	return RequestBody{
		Model: "glm-4",
		Messages: []RequestMessage{
			{
				Role:    "user",
				Content: json.RawMessage(encoded),
			},
		},
		Stream: stream,
	}
}

// printBenchmarkSummary prints a summary of benchmark results to standard
// output as a fixed-width table with one row per result: name, request count,
// average / P95 / P99 latency in milliseconds, and requests per second.
func printBenchmarkSummary(results []BenchmarkResult) {
	// ms converts a duration to fractional milliseconds. Duration.Milliseconds
	// truncates to a whole number, which would print every sub-millisecond
	// latency as 0.00.
	ms := func(d time.Duration) float64 {
		return float64(d) / float64(time.Millisecond)
	}

	fmt.Println("\n=== BENCHMARK SUMMARY ===")
	fmt.Printf("%-30s %10s %10s %10s %10s %10s\n",
		"Test Name", "Requests", "Avg (ms)", "P95 (ms)", "P99 (ms)", "RPS")
	fmt.Println(strings.Repeat("-", 90))

	for _, r := range results {
		fmt.Printf("%-30s %10d %10.2f %10.2f %10.2f %10.0f\n",
			r.Name,
			r.TotalRequests,
			ms(r.AvgLatency),
			ms(r.P95Latency),
			ms(r.P99Latency),
			r.RequestsPerSecond)
	}
	fmt.Println(strings.Repeat("-", 90))
}

// BenchmarkSimpleTokenCounter benchmarks request token counting with the
// simple fallback counter on a medium-sized request, for comparison with the
// tiktoken-based benchmarks. The request is marshaled once, outside the timed
// region.
func BenchmarkSimpleTokenCounter(b *testing.B) {
	counter := NewSimpleTokenCounter()

	body, err := json.Marshal(createTestRequest(mediumText, false))
	if err != nil {
		// Fail fast instead of benchmarking an empty body.
		b.Fatalf("marshaling request: %v", err)
	}

	b.ResetTimer()
	b.ReportAllocs()

	for i := 0; i < b.N; i++ {
		_, _ = CountRequestTokens(body, counter)
	}
}

// BenchmarkTikTokenCounterParallel calls CountTokens on one shared counter
// from the goroutines started by RunParallel, always with mediumText as the
// input. Skipped when the tokenizer cannot be initialized.
func BenchmarkTikTokenCounterParallel(b *testing.B) {
	counter, err := NewTikTokenCounter()
	if err != nil {
		b.Skipf("Failed to initialize tokenizer: %v", err)
	}

	// Body executed by each parallel goroutine until the iterations run out.
	worker := func(pb *testing.PB) {
		for pb.Next() {
			_, _ = counter.CountTokens(mediumText)
		}
	}

	b.ResetTimer()
	b.RunParallel(worker)
}

// BenchmarkCountSSE benchmarks CountOutputTokens on a captured Server-Sent
// Events response. The SSE payload is generated once from mediumText; every
// iteration builds a fresh ResponseBodyCapture, fills its buffer with the
// payload and counts the output tokens. Skipped when the tokenizer cannot be
// initialized.
func BenchmarkCountSSE(b *testing.B) {
	counter, err := NewTikTokenCounter()
	if err != nil {
		b.Skipf("Failed to initialize tokenizer: %v", err)
	}

	sseContent := generateSSEContent(mediumText)

	b.ResetTimer()
	b.ReportAllocs()

	for iter := 0; iter < b.N; iter++ {
		// The capture is rebuilt on every iteration, so its construction is
		// part of what is measured.
		capture := &ResponseBodyCapture{
			originalBody: io.NopCloser(bytes.NewReader([]byte(sseContent))),
			buffer:       &bytes.Buffer{},
			teeReader:    io.TeeReader(bytes.NewReader([]byte(sseContent)), &bytes.Buffer{}),
			counter:      counter,
		}

		// Fill the buffer directly instead of reading through the tee.
		capture.buffer.WriteString(sseContent)

		_, _ = capture.CountOutputTokens()
	}
}

// BenchmarkTokenCountingLatency measures CountTokens on mediumText. When the
// benchmark is invoked with b.N == 1 it times a single call, logs it, and
// reports an error if the call fails or takes longer than 5ms. For any other
// b.N it runs the usual timed loop. Skipped when the tokenizer cannot be
// initialized.
func BenchmarkTokenCountingLatency(b *testing.B) {
	counter, err := NewTikTokenCounter()
	if err != nil {
		b.Skipf("Failed to initialize tokenizer: %v", err)
	}

	if b.N != 1 {
		// Regular throughput measurement.
		b.ResetTimer()
		for iter := 0; iter < b.N; iter++ {
			_, _ = counter.CountTokens(mediumText)
		}
		return
	}

	// Single operation for detailed timing.
	began := time.Now()
	tokens, countErr := counter.CountTokens(mediumText)
	took := time.Since(began)

	b.Logf("Single CountTokens operation: %v for %d tokens", took, tokens)
	if countErr != nil {
		b.Errorf("CountTokens error: %v", countErr)
	}

	// Check the <5ms target.
	if took > 5*time.Millisecond {
		b.Errorf("Latency exceeds 5ms target: %v", took)
	}
}

// generateSSEContent creates SSE-formatted content for benchmarking. It
// returns a sequence of "data: ..." events, each terminated by a blank line:
// message_start, content_block_start, one content_block_delta per chunk of
// text, content_block_stop, message_delta and message_stop.
//
// text is split into chunks of at most 50 bytes. Chunk boundaries are moved
// back to the start of a UTF-8 sequence so a multi-byte character is never
// split across two deltas; a split character would be replaced by U+FFFD when
// the chunk is JSON-encoded. For ASCII text every chunk except the last is
// exactly 50 bytes. An empty text produces no delta events.
func generateSSEContent(text string) string {
	var buf bytes.Buffer

	// Message start
	buf.WriteString("data: {\"type\":\"message_start\",\"message\":{\"id\":\"msg_123\",\"type\":\"message\",\"role\":\"assistant\"}}\n\n")

	// Content block start
	buf.WriteString("data: {\"type\":\"content_block_start\",\"index\":0,\"content_block\":{\"type\":\"text\",\"text\":\"\"}}\n\n")

	// Content block delta (split text into chunks)
	const chunkSize = 50
	for start := 0; start < len(text); {
		end := start + chunkSize
		if end >= len(text) {
			end = len(text)
		} else {
			// Back off while text[cut] is a UTF-8 continuation byte
			// (10xxxxxx), i.e. while the cut would land inside a character.
			cut := end
			for cut > start && text[cut]&0xC0 == 0x80 {
				cut--
			}
			// cut == start only happens for invalid UTF-8 made entirely of
			// continuation bytes; keep the plain byte split so the loop
			// still advances.
			if cut > start {
				end = cut
			}
		}

		// Marshaling a map of strings cannot fail, so the error is ignored.
		deltaJSON, _ := json.Marshal(map[string]interface{}{
			"type": "text_delta",
			"text": text[start:end],
		})
		fmt.Fprintf(&buf, "data: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":%s}\n\n", deltaJSON)

		start = end
	}

	// Content block stop
	buf.WriteString("data: {\"type\":\"content_block_stop\",\"index\":0}\n\n")

	// Message delta
	buf.WriteString("data: {\"type\":\"message_delta\",\"delta\":{\"stop_reason\":\"end_turn\"},\"usage\":{\"output_tokens\":0}}\n\n")

	// Message stop
	buf.WriteString("data: {\"type\":\"message_stop\"}\n\n")

	return buf.String()
}

// TestTokenizerLatencyTarget checks the average CountTokens latency against a
// per-size target: 1ms for short, 2ms for medium and 5ms for long text. Each
// case does 100 warm-up calls, then averages 1000 timed calls. Skipped when
// the tokenizer cannot be initialized.
func TestTokenizerLatencyTarget(t *testing.T) {
	counter, err := NewTikTokenCounter()
	if err != nil {
		t.Skipf("Failed to initialize tokenizer: %v", err)
	}

	const (
		warmups    = 100
		iterations = 1000
	)

	cases := []struct {
		name     string
		text     string
		targetMs int64
	}{
		{name: "Small", text: shortText, targetMs: 1},
		{name: "Medium", text: mediumText, targetMs: 2},
		{name: "Long", text: longText, targetMs: 5},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// Warm-up calls are not timed.
			for n := 0; n < warmups; n++ {
				_, _ = counter.CountTokens(tc.text)
			}

			began := time.Now()
			for n := 0; n < iterations; n++ {
				_, _ = counter.CountTokens(tc.text)
			}
			elapsed := time.Since(began)

			// Average in whole nanoseconds first, then convert to fractional
			// milliseconds for reporting.
			avgNs := elapsed.Nanoseconds() / int64(iterations)
			avgMs := float64(avgNs) / 1_000_000

			t.Logf("%s text: Average token counting time: %.2f ms (target: %dms)", tc.name, avgMs, tc.targetMs)

			if avgMs > float64(tc.targetMs) {
				t.Errorf("Overhead %.2f ms exceeds target %d ms for %s text", avgMs, tc.targetMs, tc.name)
			}
		})
	}
}