#!/bin/bash
# Automated Test-Fix-Iterate Loop
# Purpose: Continuous testing and automated fix iteration
# Bead: bd-3eb
#
# Provenance (text of the commit that introduced this file):
#   Extracted from ardenone-cluster/containers/zai-proxy and
#   ardenone-cluster/containers/zai-proxy-dashboard.
#   - proxy/: OpenAI-compatible ZAI reverse proxy (Go, v1.10.0)
#       - Token counting, rate limiting, Prometheus metrics, canary support
#   - dashboard/: Metrics dashboard backend + React frontend (Go, v1.0.0)
#       - Prometheus collector, SQLite storage, SSE live updates
#   - docs/: Operational notes, research, and plan subdirs
#   Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
#
# This script implements a closed-loop system that:
# 1. Runs test harness
# 2. Detects failures and captures error details
# 3. Categorizes failure types
# 4. Logs failures with reproduction steps
# 5. Generates fix suggestions
# 6. Tracks iteration progress
# 7. Stops when conditions are met (95% pass rate, <3% token variance)

# Abort on the first unhandled command failure.
set -e

# Script version
VERSION="1.0.0"

# Configuration: every path is derived from the location of this script,
# with the project root taken to be the script's parent directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
WORKSPACE_DIR="$PROJECT_ROOT"
ITERATIONS_DIR="$PROJECT_ROOT/.iterations"
LOGS_DIR="$PROJECT_ROOT/.test-logs"
REPORTS_DIR="$PROJECT_ROOT/.test-reports"

# Thresholds (from bead requirements); the first three can be overridden
# by the command-line options parsed at the bottom of the file.
TARGET_PASS_RATE=95      # 95% test pass rate
TARGET_TOKEN_VARIANCE=3  # <3% token count variance
MAX_ITERATIONS=50        # Maximum iterations to prevent infinite loops
COOLDOWN_SECONDS=5       # Wait between iterations

# Colors (ANSI escape sequences stored literally; they are interpreted
# when printed with `echo -e` / `printf %b`)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
MAGENTA='\033[0;35m'
NC='\033[0m'

# State tracking (persisted)
STATE_FILE="$ITERATIONS_DIR/state.json"

# ============================================
# UTILITY FUNCTIONS
# ============================================

# Print an informational message with a blue [INFO] tag and a local
# timestamp.
# Arguments: $1 - message text (printed verbatim, no escape processing)
# Outputs:   one line on stderr, so that functions whose stdout is captured
#            with $(...) (e.g. run_test_harness) can log without corrupting
#            the value they return.
log_info() {
  printf '%b[INFO]%b %s %s\n' \
    "$BLUE" "$NC" "$(date '+%Y-%m-%d %H:%M:%S')" "$1" >&2
}

# Print a success message with a green [SUCCESS] tag and a local timestamp.
# Arguments: $1 - message text (printed verbatim, no escape processing)
# Outputs:   one line on stderr, keeping stdout free for data.
log_success() {
  printf '%b[SUCCESS]%b %s %s\n' \
    "$GREEN" "$NC" "$(date '+%Y-%m-%d %H:%M:%S')" "$1" >&2
}

# Print a warning message with a yellow [WARNING] tag and a local timestamp.
# Arguments: $1 - message text (printed verbatim, no escape processing)
# Outputs:   one line on stderr, keeping stdout free for data.
log_warning() {
  printf '%b[WARNING]%b %s %s\n' \
    "$YELLOW" "$NC" "$(date '+%Y-%m-%d %H:%M:%S')" "$1" >&2
}

# Print an error message with a red [ERROR] tag and a local timestamp.
# Arguments: $1 - message text (printed verbatim, no escape processing)
# Outputs:   one line on stderr.
log_error() {
  printf '%b[ERROR]%b %s %s\n' \
    "$RED" "$NC" "$(date '+%Y-%m-%d %H:%M:%S')" "$1" >&2
}

# Print a debug message with a cyan [DEBUG] tag and a local timestamp, but
# only when the DEBUG variable is exactly "true" (set by --debug).
# Arguments: $1 - message text (printed verbatim, no escape processing)
# Outputs:   one line on stderr when enabled, nothing otherwise.
# Returns:   0 in both cases; an `if` is used rather than `[[ ]] && ...` so
#            that a disabled call never yields a non-zero status under set -e.
log_debug() {
  if [[ "${DEBUG:-false}" == "true" ]]; then
    printf '%b[DEBUG]%b %s %s\n' \
      "$CYAN" "$NC" "$(date '+%Y-%m-%d %H:%M:%S')" "$1" >&2
  fi
}

# Print a cyan banner on stdout: a blank line, an 80-character '=' rule, the
# text, the rule again, and a closing blank line.
# Arguments: $1 - banner text (passed through `echo -e`, so backslash
#                 escapes in it are interpreted)
print_banner() {
  local text="$1"
  local rule

  # printf re-applies the format once per argument; '%.0s' prints none of
  # the argument itself, so this yields exactly 80 '=' characters.
  rule=$(printf '=%.0s' {1..80})

  echo ""
  echo -e "${CYAN}${rule}${NC}"
  echo -e "${CYAN}$text${NC}"
  echo -e "${CYAN}${rule}${NC}"
  echo ""
}

# ============================================
# INITIALIZATION
# ============================================

# Create the working directories used by the loop (idempotent).
# Globals: ITERATIONS_DIR, LOGS_DIR, REPORTS_DIR (read)
# Creates: the three directories above plus REPORTS_DIR/failures and
#          REPORTS_DIR/patterns.
init_directories() {
  log_info "Initializing workspace directories..."
  mkdir -p \
    "$ITERATIONS_DIR" \
    "$LOGS_DIR" \
    "$REPORTS_DIR" \
    "$REPORTS_DIR/failures" \
    "$REPORTS_DIR/patterns"
}

# Create the persisted state file with zeroed counters if it does not exist.
# An existing state file is left untouched so that a run can resume.
# Globals: STATE_FILE (read), VERSION (read)
init_state() {
  if [[ -f "$STATE_FILE" ]]; then
    return 0
  fi

  log_info "Creating new state file..."
  # Unquoted heredoc delimiter: $VERSION and $(date ...) are expanded.
  cat > "$STATE_FILE" << EOF
{
  "version": "$VERSION",
  "started_at": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
  "iteration": 0,
  "total_tests_run": 0,
  "total_passes": 0,
  "total_failures": 0,
  "best_pass_rate": 0.0,
  "best_token_variance": 100.0,
  "failure_history": [],
  "fix_attempts": [],
  "stop_reason": null
}
EOF
}

# Load the persisted counters from STATE_FILE into global variables.
# Globals written: ITERATION, TOTAL_TESTS_RUN, TOTAL_PASSES, TOTAL_FAILURES,
#                  BEST_PASS_RATE, BEST_TOKEN_VARIANCE
# Defaults (0 / 0 / 0 / 0 / 0.0 / 100.0) are used when the file is missing,
# unreadable, or when a field is absent or not a JSON number.
#
# The values are read with `read`, never `eval`: evaluating jq output would
# run any shell syntax that ended up inside the state file.
load_state() {
  ITERATION=0
  TOTAL_TESTS_RUN=0
  TOTAL_PASSES=0
  TOTAL_FAILURES=0
  BEST_PASS_RATE=0.0
  BEST_TOKEN_VARIANCE=100.0

  if [[ ! -f "$STATE_FILE" ]]; then
    return 0
  fi

  local values
  # num(f; d): the value of f when it is a JSON number, otherwise d.
  if ! values=$(jq -r '
        def num(f; d): (f | numbers) // d;
        [ num(.iteration; 0),
          num(.total_tests_run; 0),
          num(.total_passes; 0),
          num(.total_failures; 0),
          num(.best_pass_rate; 0.0),
          num(.best_token_variance; 100.0)
        ] | map(tostring) | join(" ")
      ' "$STATE_FILE" 2>/dev/null) || [[ -z "$values" ]]; then
    printf 'load_state: cannot parse %s; using default counters\n' \
      "$STATE_FILE" >&2
    return 0
  fi

  read -r ITERATION TOTAL_TESTS_RUN TOTAL_PASSES TOTAL_FAILURES \
    BEST_PASS_RATE BEST_TOKEN_VARIANCE <<< "$values"
}

# Persist the loop counters to STATE_FILE.
# Arguments:
#   $1 - iteration number
#   $2 - cumulative tests run
#   $3 - cumulative passes
#   $4 - cumulative failures
#   $5 - pass rate of the current iteration (percent)
#   $6 - token variance of the current iteration (percent)
# Globals: STATE_FILE, VERSION, BEST_PASS_RATE, BEST_TOKEN_VARIANCE (read)
#
# started_at, failure_history, fix_attempts and stop_reason are carried over
# from the existing state file when it can be parsed. The new state is
# written to a temporary file and renamed into place so that readers never
# observe a half-written file.
save_state() {
  local iteration="$1"
  local tests_run="$2"
  local passes="$3"
  local failures="$4"
  local pass_rate="$5"
  local token_variance="$6"

  # Previous state as a single JSON value; '{}' when missing or unparsable.
  local prev='{}'
  if [[ -s "$STATE_FILE" ]]; then
    prev=$(jq -c '.' "$STATE_FILE" 2>/dev/null) || prev='{}'
    [[ -n "$prev" ]] || prev='{}'
  fi

  local best_pass_rate best_token_variance
  best_pass_rate=$(max_float "$BEST_PASS_RATE" "$pass_rate")
  best_token_variance=$(min_float "$BEST_TOKEN_VARIANCE" "$token_variance")

  # The version is handed to jq with --arg; a bare $VERSION inside the
  # filter would be an undefined jq variable and abort compilation.
  jq -n \
    --arg version "$VERSION" \
    --arg now_iso "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
    --arg iteration "$iteration" \
    --arg tests_run "$tests_run" \
    --arg passes "$passes" \
    --arg failures "$failures" \
    --arg best_pass_rate "$best_pass_rate" \
    --arg best_token_variance "$best_token_variance" \
    --argjson prev "$prev" \
    '{
      version: $version,
      started_at: ($prev.started_at // $now_iso),
      iteration: ($iteration | tonumber),
      total_tests_run: ($tests_run | tonumber),
      total_passes: ($passes | tonumber),
      total_failures: ($failures | tonumber),
      best_pass_rate: ($best_pass_rate | tonumber),
      best_token_variance: ($best_token_variance | tonumber),
      failure_history: ($prev.failure_history // []),
      fix_attempts: ($prev.fix_attempts // []),
      stop_reason: ($prev.stop_reason // null),
      last_updated: now
    }' > "$STATE_FILE.tmp" && mv -- "$STATE_FILE.tmp" "$STATE_FILE"
}

# Print the larger of two decimal numbers.
# Arguments: $1, $2 - numbers (integers or decimals)
# Outputs:   the winning argument exactly as it was passed in; $2 on a tie.
#            When one argument is empty the other one is printed.
# The operands reach awk through -v rather than as whitespace-separated
# fields, so an empty argument cannot shift the field positions.
max_float() {
  awk -v a="$1" -v b="$2" 'BEGIN {
    if (a == "") { print b; exit }
    if (b == "") { print a; exit }
    print ((a + 0 > b + 0) ? a : b)
  }'
}

# Print the smaller of two decimal numbers.
# Arguments: $1, $2 - numbers (integers or decimals)
# Outputs:   the winning argument exactly as it was passed in; $2 on a tie.
#            When one argument is empty the other one is printed.
# The operands reach awk through -v rather than as whitespace-separated
# fields, so an empty argument cannot shift the field positions.
min_float() {
  awk -v a="$1" -v b="$2" 'BEGIN {
    if (a == "") { print b; exit }
    if (b == "") { print a; exit }
    print ((a + 0 < b + 0) ? a : b)
  }'
}

# ============================================
# TEST HARNESS
# ============================================

# Run the Go regression tests once and print the parsed result.
# Arguments: $1 - iteration number (used to name the log file)
# Globals:   PROJECT_ROOT, LOGS_DIR (read)
# Outputs:   stdout carries ONLY what parse_test_results prints, because
#            callers capture it with $(...). The raw `go test` output is
#            saved to LOGS_DIR/iteration-<n>.log.
# Returns:   non-zero if PROJECT_ROOT cannot be entered; otherwise the
#            status of parse_test_results.
run_test_harness() {
  local iteration_num="$1"
  local log_file="$LOGS_DIR/iteration-$iteration_num.log"

  # Redirected explicitly so the progress line can never end up in the
  # captured result, whichever stream log_info writes to.
  log_info "Running test harness for iteration $iteration_num..." >&2

  cd "$PROJECT_ROOT" || return 1
  mkdir -p "$LOGS_DIR"

  # Run all regression tests and capture output. A failing test run is an
  # expected outcome here, so its status is recorded instead of aborting.
  local test_output
  local exit_code=0
  test_output=$(go test -v -run TestRegression 2>&1) || exit_code=$?

  # Save raw output
  printf '%s\n' "$test_output" > "$log_file"

  # Parse results
  parse_test_results "$test_output" "$exit_code"
}

# Turn raw `go test -v` output into a JSON summary.
# Arguments:
#   $1 - captured test output
#   $2 - exit code of the test run (optional; empty or missing means 0)
# Outputs: a JSON object on stdout with the numeric fields passed, failed,
#          total, pass_rate, token_variance and exit_code.
#
# pass_rate is passed/total*100 with two decimals (0.0 when no result lines
# were found). token_variance is the population standard deviation of every
# "<N> tokens" figure in the output, expressed as a percentage of their mean,
# with two decimals. It stays at 100.0 when fewer than two figures are
# present or their mean is zero.
parse_test_results() {
  local output="$1"
  local exit_code="${2:-0}"

  local passed=0
  local failed=0
  local total=0

  # Count result lines. Plain assignments are used for the counters because
  # ((n++)) returns status 1 when n is 0, which is fatal under set -e.
  local result_re='--- (PASS|FAIL): ([^ ]+)'
  local line
  while IFS= read -r line; do
    if [[ $line =~ $result_re ]]; then
      total=$((total + 1))
      if [[ "${BASH_REMATCH[1]}" == "PASS" ]]; then
        passed=$((passed + 1))
      else
        failed=$((failed + 1))
      fi
    fi
  done <<< "$output"

  # Extract token counts if available
  local token_variance=100.0
  local token_counts
  token_counts=$(grep -oE '[0-9]+ tokens' <<< "$output" \
    | grep -oE '[0-9]+' || true)

  if [[ -n "$token_counts" ]]; then
    # Bash arithmetic is integer-only (no square root, no fractions), so
    # the statistics are computed in awk. Nothing is printed when there is
    # not enough data, which keeps the 100.0 default.
    local computed
    computed=$(awk '
      { n++; sum += $1; sum_sq += $1 * $1 }
      END {
        if (n < 2) exit
        mean = sum / n
        if (mean <= 0) exit
        variance = sum_sq / n - mean * mean
        if (variance < 0) variance = 0   # guard against rounding error
        printf "%.2f", sqrt(variance) * 100 / mean
      }' <<< "$token_counts")
    if [[ -n "$computed" ]]; then
      token_variance="$computed"
    fi
  fi

  # Calculate pass rate
  local pass_rate=0.0
  if [[ $total -gt 0 ]]; then
    pass_rate=$(awk -v p="$passed" -v t="$total" \
      'BEGIN { printf "%.2f", (p / t) * 100 }')
  fi

  # Return results as JSON
  jq -n \
    --arg passed "$passed" \
    --arg failed "$failed" \
    --arg total "$total" \
    --arg pass_rate "$pass_rate" \
    --arg token_variance "$token_variance" \
    --arg exit_code "$exit_code" \
    '{
      passed: ($passed | tonumber),
      failed: ($failed | tonumber),
      total: ($total | tonumber),
      pass_rate: ($pass_rate | tonumber),
      token_variance: ($token_variance | tonumber),
      exit_code: ($exit_code | tonumber)
    }'
}

# ============================================
# FAILURE CATEGORIZATION
# ============================================

# Classify a test failure by keyword-matching its error message.
# Arguments:
#   $1 - test name
#   $2 - error message text
#   $3 - path of the test log (accepted for interface parity; not read here)
# Outputs: a JSON object on stdout with test_name, category, severity,
#          suggested_fix and error_message.
#
# The keyword groups are checked in a fixed order and each match OVERWRITES
# the previous result, so when several groups match the last one wins:
#   accuracy < format < streaming < concurrency < edge_case < performance
# Matching is case-sensitive. With no match the result is
# unknown / medium / generic.
categorize_failure() {
  local test_name="$1"
  local error_message="$2"
  local test_log="$3"

  local category="unknown"
  local severity="medium"
  local suggested_fix="generic"

  # Accuracy failures - token count mismatches
  if [[ $error_message =~ (expected|Got|tokens|count) ]]; then
    category="accuracy"
    severity="high"

    # Determine specific accuracy issue
    if [[ $error_message =~ (empty|zero) ]]; then
      suggested_fix="check_tokenizer_initialization"
    elif [[ $error_message =~ (range|min|max) ]]; then
      suggested_fix="adjust_token_ranges"
    else
      suggested_fix="verify_tokenization_algorithm"
    fi
  fi

  # Format failures - JSON parsing, structure issues
  if [[ $error_message =~ (JSON|marshal|unmarshal|parse|format) ]]; then
    category="format"
    severity="medium"

    if [[ $error_message =~ (invalid|malformed) ]]; then
      suggested_fix="add_input_validation"
    else
      suggested_fix="fix_json_parsing"
    fi
  fi

  # Streaming failures - SSE, chunking issues
  if [[ $error_message =~ (stream|SSE|chunk|flush|delta) ]]; then
    category="streaming"
    severity="high"
    suggested_fix="verify_streaming_buffer_handling"
  fi

  # Concurrency failures - race conditions, locks
  if [[ $error_message =~ (race|concurrent|lock|mutex|goroutine) ]]; then
    category="concurrency"
    severity="critical"
    suggested_fix="add_synchronization_or_improve_locking"
  fi

  # Edge case failures - empty input, special characters
  if [[ $error_message =~ (empty|nil|panic|crash|special|unicode) ]]; then
    category="edge_case"
    severity="medium"
    suggested_fix="add_defensive_programming"
  fi

  # Performance failures - timeout, slow operations
  if [[ $error_message =~ (timeout|slow|deadline|exceeded) ]]; then
    category="performance"
    severity="low"
    suggested_fix="optimize_algorithm_or_add_caching"
  fi

  jq -n \
    --arg test_name "$test_name" \
    --arg category "$category" \
    --arg severity "$severity" \
    --arg suggested_fix "$suggested_fix" \
    --arg error_message "$error_message" \
    '{
      test_name: $test_name,
      category: $category,
      severity: $severity,
      suggested_fix: $suggested_fix,
      error_message: $error_message
    }'
}

# ============================================
# FAILURE LOGGING
# ============================================

# Write a detailed JSON report for one failed test.
# Arguments:
#   $1 - iteration number
#   $2 - test name
#   $3 - category JSON as produced by categorize_failure
#   $4 - path of the test log (forwarded to extract_reproduction_steps)
# Globals: REPORTS_DIR (read)
# Outputs: the path of the report file on stdout.
#
# The failure id contains a sanitized form of the test name in addition to
# the iteration and epoch second. Without it, every failure logged within
# the same second of an iteration would share one id and overwrite the
# previous report.
log_failure_with_reproduction() {
  local iteration="$1"
  local test_name="$2"
  local category_info="$3"
  local test_log="$4"

  # Test names may contain '/' (subtests) or spaces; keep the file name safe.
  local safe_name="${test_name//[^A-Za-z0-9_.-]/_}"
  local epoch
  epoch=$(date +%s)

  local failure_id="fail-${iteration}-${epoch}-${safe_name}"
  local failures_dir="$REPORTS_DIR/failures"
  local failure_file="$failures_dir/${failure_id}.json"
  mkdir -p "$failures_dir"

  # Extract relevant test context
  local reproduction_steps
  reproduction_steps=$(extract_reproduction_steps "$test_name" "$test_log")

  # Create detailed failure report
  jq -n \
    --arg failure_id "$failure_id" \
    --arg iteration "$iteration" \
    --arg timestamp "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
    --arg test_name "$test_name" \
    --argjson category_info "$category_info" \
    --argjson reproduction_steps "$reproduction_steps" \
    '{
      failure_id: $failure_id,
      iteration: ($iteration | tonumber),
      timestamp: $timestamp,
      test_name: $test_name,
      category: $category_info.category,
      severity: $category_info.severity,
      suggested_fix: $category_info.suggested_fix,
      error_message: $category_info.error_message,
      reproduction_steps: $reproduction_steps
    }' > "$failure_file"

  echo "$failure_file"
}

# Build a step-by-step reproduction guide for a failed test.
# Arguments:
#   $1 - test name, inserted into the `go test -run` step
#   $2 - path of the test log (accepted for interface parity; not read here)
# Globals: PROJECT_ROOT (read; "." is used when unset)
# Outputs: a compact JSON array of seven strings on stdout.
#
# The steps are assembled by jq with --arg so that the test name and the
# project directory are really substituted and correctly JSON-escaped. A
# quoted heredoc would emit the TEST_NAME placeholder and a fixed path as-is.
extract_reproduction_steps() {
  local test_name="$1"
  local test_log="$2"

  jq -n -c \
    --arg dir "${PROJECT_ROOT:-.}" \
    --arg name "$test_name" \
    '[
      "1. Navigate to project directory: cd \($dir)",
      "2. Run specific test: go test -v -run \($name)",
      "3. Observe error message",
      "4. Review code at: tokenizer.go or tokenizer_regression_test.go",
      "5. Check token counting logic for the specific input",
      "6. Verify tokenizer initialization",
      "7. Test with various input formats"
    ]'
}

# ============================================
# FIX SUGGESTION GENERATION
# ============================================

# Derive fix suggestions from the categories of the recorded failures.
# Arguments: $1 - JSON array of failure objects, each with a .category field
# Outputs:   a JSON array of suggestion strings on stdout; `[]` when no
#            pattern threshold is reached.
#
# Thresholds: more than two accuracy failures, more than two format
# failures, at least one streaming failure, at least one concurrency failure.
generate_fix_suggestions() {
  local failures_json="$1"

  local -a suggestions=()

  # Analyze failure patterns. All categories are listed by a single jq
  # call; entries without a category are skipped.
  local accuracy_count=0
  local format_count=0
  local streaming_count=0
  local concurrency_count=0
  local category

  while IFS= read -r category; do
    # Plain assignments: ((n++)) returns status 1 when n is 0, which would
    # abort a caller running under set -e.
    case "$category" in
      accuracy) accuracy_count=$((accuracy_count + 1)) ;;
      format) format_count=$((format_count + 1)) ;;
      streaming) streaming_count=$((streaming_count + 1)) ;;
      concurrency) concurrency_count=$((concurrency_count + 1)) ;;
    esac
  done < <(jq -r '.[].category // empty' <<< "$failures_json")

  # Generate suggestions based on patterns
  if [[ $accuracy_count -gt 2 ]]; then
    suggestions+=("PATTERN: Multiple accuracy failures detected. SUGGESTION: Review tokenizer encoding selection (cl100k_base vs model-specific). Consider adjusting expected token ranges in golden tests.")
  fi

  if [[ $format_count -gt 2 ]]; then
    suggestions+=("PATTERN: Multiple format failures. SUGGESTION: JSON parsing may be inconsistent. Add validation middleware for request/response formats.")
  fi

  if [[ $streaming_count -gt 0 ]]; then
    suggestions+=("PATTERN: Streaming failures detected. SUGGESTION: Verify io.TeeReader buffer handling in ResponseBodyCapture. Check for race conditions in concurrent reads.")
  fi

  if [[ $concurrency_count -gt 0 ]]; then
    suggestions+=("PATTERN: Concurrency issues. SUGGESTION: Review mutex usage in TikTokenCounter. Consider adding more granular locking or using sync/atomic.")
  fi

  # Output as JSON array. The empty case is handled separately because
  # printf '%s\n' with no arguments still prints one empty line, which
  # would be encoded as [""] instead of [].
  if [[ ${#suggestions[@]} -eq 0 ]]; then
    printf '[]\n'
  else
    printf '%s\n' "${suggestions[@]}" | jq -R . | jq -s .
  fi
}

# ============================================
# ITERATION TRACKING
# ============================================

# Fold one iteration's results into the running totals, persist the state
# and write the per-iteration report.
# Arguments:
#   $1 - iteration number
#   $2 - result JSON with passed, failed, total, pass_rate, token_variance
#   $3 - JSON array of failure reports
# Globals written: TOTAL_TESTS_RUN, TOTAL_PASSES, TOTAL_FAILURES,
#                  BEST_PASS_RATE, BEST_TOKEN_VARIANCE
update_iteration_metrics() {
  local iteration="$1"
  local test_results="$2"
  local failures="$3"

  local passed failed total pass_rate token_variance

  # One jq invocation yields all five fields as a space-separated line.
  read -r passed failed total pass_rate token_variance < <(
    jq -r '[.passed, .failed, .total, .pass_rate, .token_variance]
           | map(tostring) | join(" ")' <<< "$test_results"
  )

  # Update running totals
  TOTAL_TESTS_RUN=$((TOTAL_TESTS_RUN + total))
  TOTAL_PASSES=$((TOTAL_PASSES + passed))
  TOTAL_FAILURES=$((TOTAL_FAILURES + failed))

  # Update bests
  BEST_PASS_RATE=$(max_float "$BEST_PASS_RATE" "$pass_rate")
  BEST_TOKEN_VARIANCE=$(min_float "$BEST_TOKEN_VARIANCE" "$token_variance")

  # Save state
  save_state "$iteration" "$TOTAL_TESTS_RUN" "$TOTAL_PASSES" \
    "$TOTAL_FAILURES" "$pass_rate" "$token_variance"

  # Generate iteration report
  generate_iteration_report "$iteration" "$test_results" "$failures"
}

# Write REPORTS_DIR/iteration-<n>.json describing one iteration.
# Arguments:
#   $1 - iteration number
#   $2 - result JSON of the iteration (embedded unchanged)
#   $3 - JSON array of failure reports; an empty or missing value is
#        treated as [] so that --argjson always receives valid JSON
# Globals: REPORTS_DIR, TOTAL_TESTS_RUN, TOTAL_PASSES, TOTAL_FAILURES,
#          BEST_PASS_RATE, BEST_TOKEN_VARIANCE (read)
generate_iteration_report() {
  local iteration="$1"
  local test_results="$2"
  local failures="${3:-[]}"

  local report_file="$REPORTS_DIR/iteration-$iteration.json"

  jq -n \
    --arg iteration "$iteration" \
    --arg timestamp "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
    --argjson test_results "$test_results" \
    --argjson failures "$failures" \
    --arg total_tests_run "$TOTAL_TESTS_RUN" \
    --arg total_passes "$TOTAL_PASSES" \
    --arg total_failures "$TOTAL_FAILURES" \
    --arg best_pass_rate "$BEST_PASS_RATE" \
    --arg best_token_variance "$BEST_TOKEN_VARIANCE" \
    '{
      iteration: ($iteration | tonumber),
      timestamp: $timestamp,
      test_results: $test_results,
      failures: ($failures | length),
      failure_details: $failures,
      cumulative: {
        total_tests_run: ($total_tests_run | tonumber),
        total_passes: ($total_passes | tonumber),
        total_failures: ($total_failures | tonumber)
      },
      best_metrics: {
        pass_rate: ($best_pass_rate | tonumber),
        token_variance: ($best_token_variance | tonumber)
      }
    }' > "$report_file"

  log_info "Iteration report saved to $report_file"
}

# ============================================
# STOP CONDITION CHECKER
# ============================================

# Decide whether the loop may stop after an iteration.
# Arguments: $1 - result JSON with pass_rate and token_variance
# Globals:   TARGET_PASS_RATE, TARGET_TOKEN_VARIANCE (read; 95 / 3 if unset)
# Outputs:   a JSON object on stdout: should_stop (boolean), reason,
#            current_metrics {pass_rate, token_variance} and
#            targets {pass_rate, token_variance}.
#
# should_stop becomes true as soon as EITHER target is met; when both are
# met the reason mentions both.
# NOTE(review): the file header reads as if both targets were required -
# confirm which behaviour is intended before tightening this.
#
# Decimal comparisons are done in awk (already required by this script)
# rather than bc. `targets` reports the thresholds actually in effect,
# including values overridden on the command line.
check_stop_conditions() {
  local test_results="$1"

  local target_pass_rate="${TARGET_PASS_RATE:-95}"
  local target_variance="${TARGET_TOKEN_VARIANCE:-3}"
  local pass_rate
  local token_variance
  local should_stop=false
  local stop_reason=""

  pass_rate=$(jq -r '.pass_rate' <<< "$test_results")
  token_variance=$(jq -r '.token_variance' <<< "$test_results")

  # Check pass rate threshold
  if awk -v a="$pass_rate" -v b="$target_pass_rate" \
      'BEGIN { exit !(a + 0 >= b + 0) }'; then
    should_stop=true
    stop_reason="Target pass rate achieved: ${pass_rate}% >= ${target_pass_rate}%"
  fi

  # Check token variance threshold
  if awk -v a="$token_variance" -v b="$target_variance" \
      'BEGIN { exit !(a + 0 < b + 0) }'; then
    if [[ "$should_stop" == "true" ]]; then
      stop_reason="$stop_reason AND target token variance achieved: ${token_variance}% < ${target_variance}%"
    else
      should_stop=true
      stop_reason="Target token variance achieved: ${token_variance}% < ${target_variance}%"
    fi
  fi

  # Check for perfect score
  if awk -v p="$pass_rate" -v v="$token_variance" \
      'BEGIN { exit !(p + 0 == 100 && v + 0 == 0) }'; then
    should_stop=true
    stop_reason="Perfect score achieved: 100% pass rate, 0% token variance"
  fi

  jq -n \
    --arg should_stop "$should_stop" \
    --arg stop_reason "$stop_reason" \
    --arg pass_rate "$pass_rate" \
    --arg token_variance "$token_variance" \
    --arg target_pass_rate "$target_pass_rate" \
    --arg target_variance "$target_variance" \
    '{
      should_stop: ($should_stop == "true"),
      reason: $stop_reason,
      current_metrics: {
        pass_rate: ($pass_rate | tonumber),
        token_variance: ($token_variance | tonumber)
      },
      targets: {
        pass_rate: ($target_pass_rate | tonumber),
        token_variance: ($target_variance | tonumber)
      }
    }'
}

# ============================================
# PROGRESS DISPLAY
# ============================================

# Print a human-readable summary of one iteration on stdout.
# Arguments:
#   $1 - iteration number
#   $2 - result JSON with pass_rate, token_variance and failed
#   $3 - JSON array of failure reports, each with a .category field
# Globals: TARGET_PASS_RATE, TARGET_TOKEN_VARIANCE, BEST_PASS_RATE,
#          BEST_TOKEN_VARIANCE and the colour variables (read)
#
# The per-category counters use plain assignments: ((n++)) returns status 1
# when n is 0, and because this function is called directly (not inside
# $(...)) that would terminate the whole script under `set -e`.
display_progress() {
  local iteration="$1"
  local test_results="$2"
  local failures="$3"

  local pass_rate token_variance failed
  read -r pass_rate token_variance failed < <(
    jq -r '"\(.pass_rate) \(.token_variance) \(.failed)"' <<< "$test_results"
  )

  print_banner "Iteration $iteration Summary"

  # Metrics display
  echo -e "${CYAN}Current Metrics:${NC}"
  echo " Pass Rate: ${pass_rate}% (target: ${TARGET_PASS_RATE}%)"
  echo " Token Variance: ${token_variance}% (target: <${TARGET_TOKEN_VARIANCE}%)"
  echo ""

  # Best metrics
  echo -e "${CYAN}Best Metrics (all time):${NC}"
  echo " Pass Rate: ${BEST_PASS_RATE}%"
  echo " Token Variance: ${BEST_TOKEN_VARIANCE}%"
  echo ""

  # Failures summary
  if [[ $failed -gt 0 ]]; then
    echo -e "${RED}Failures: $failed${NC}"

    # Group by category; anything not listed explicitly (including
    # "unknown") is counted as "other" so that no failure goes unreported.
    local accuracy=0 format=0 streaming=0 concurrency=0 edge_case=0
    local performance=0 other=0
    local category
    while IFS= read -r category; do
      case "$category" in
        accuracy) accuracy=$((accuracy + 1)) ;;
        format) format=$((format + 1)) ;;
        streaming) streaming=$((streaming + 1)) ;;
        concurrency) concurrency=$((concurrency + 1)) ;;
        edge_case) edge_case=$((edge_case + 1)) ;;
        performance) performance=$((performance + 1)) ;;
        *) other=$((other + 1)) ;;
      esac
    done < <(jq -r '.[].category // "unknown"' <<< "$failures")

    echo -e " ${YELLOW}Breakdown by category:${NC}"
    local entry label count
    for entry in \
      "Accuracy:$accuracy" \
      "Format:$format" \
      "Streaming:$streaming" \
      "Concurrency:$concurrency" \
      "Edge Case:$edge_case" \
      "Performance:$performance" \
      "Other:$other"; do
      label="${entry%%:*}"
      count="${entry##*:}"
      if [[ $count -gt 0 ]]; then
        echo " ${label}: $count"
      fi
    done
  else
    echo -e "${GREEN}No failures!${NC}"
  fi

  echo ""
}

# ============================================
# MAIN LOOP
# ============================================

# Drive the test-fix-iterate loop.
# Globals: VERSION, MAX_ITERATIONS, TARGET_PASS_RATE, TARGET_TOKEN_VARIANCE,
#          COOLDOWN_SECONDS, LOGS_DIR, REPORTS_DIR, ITERATION (read)
#
# Each iteration runs the test harness, categorizes and logs every failed
# test, prints and stores fix suggestions, updates the persisted metrics,
# shows a summary and evaluates the stop conditions. The loop resumes from
# the iteration number stored in the state file and ends when a stop
# condition is reported or MAX_ITERATIONS is reached; a final report is
# printed in either case.
main() {
  print_banner "🔄 Automated Test-Fix-Iterate Loop v$VERSION"

  # Initialize
  init_directories
  init_state
  load_state

  log_info "Starting test-fix-iterate loop..."
  log_info "Stop conditions: pass rate >= ${TARGET_PASS_RATE}%, token variance < ${TARGET_TOKEN_VARIANCE}%"
  log_info "Maximum iterations: $MAX_ITERATIONS"

  local iteration=$ITERATION
  local final_reason=""

  local test_results failed test_log line test_name error_msg
  local category_info failure_file failures_json suggestions suggestion
  local stop_check should_stop
  local -a failures_array
  local fail_re='--- FAIL: ([^ ]+)'

  # Main iteration loop
  while [[ $iteration -lt $MAX_ITERATIONS ]]; do
    # Not ((iteration++)): that returns status 1 when iteration is 0 and
    # would end the script on the very first pass under `set -e`.
    iteration=$((iteration + 1))

    print_banner "🧪 Iteration $iteration/$MAX_ITERATIONS"

    # Run test harness
    test_results=$(run_test_harness "$iteration")

    # Parse and categorize failures
    failures_array=()
    failed=$(jq -r '.failed' <<< "$test_results")

    if [[ $failed -gt 0 ]]; then
      log_warning "Detected $failed test failures. Analyzing..."

      # Read test log for error details
      test_log="$LOGS_DIR/iteration-$iteration.log"

      # Categorize each failure
      while IFS= read -r line; do
        if [[ $line =~ $fail_re ]]; then
          test_name="${BASH_REMATCH[1]}"
          # -F: the test name is matched literally, not as a regex.
          error_msg=$(grep -F -A 5 -- "$test_name" "$test_log" | head -6)

          category_info=$(categorize_failure "$test_name" "$error_msg" "$test_log")

          # Log with reproduction steps
          failure_file=$(log_failure_with_reproduction "$iteration" "$test_name" "$category_info" "$test_log")

          failures_array+=("$(< "$failure_file")")
        fi
      done < "$test_log"
    fi

    # Convert failures array to JSON
    if [[ ${#failures_array[@]} -gt 0 ]]; then
      failures_json=$(printf '%s\n' "${failures_array[@]}" | jq -s .)
    else
      failures_json='[]'
    fi

    # Generate fix suggestions if there are failures
    if [[ $failed -gt 0 ]]; then
      suggestions=$(generate_fix_suggestions "$failures_json")

      log_info "Fix suggestions generated:"
      jq -r '.[]' <<< "$suggestions" | while IFS= read -r suggestion; do
        echo -e " ${YELLOW}•${NC} $suggestion"
      done

      # Save suggestions
      printf '%s\n' "$suggestions" \
        > "$REPORTS_DIR/patterns/iteration-$iteration-suggestions.json"
    fi

    # Update iteration metrics
    update_iteration_metrics "$iteration" "$test_results" "$failures_json"

    # Display progress
    display_progress "$iteration" "$test_results" "$failures_json"

    # Check stop conditions
    stop_check=$(check_stop_conditions "$test_results")
    should_stop=$(jq -r '.should_stop' <<< "$stop_check")

    if [[ "$should_stop" == "true" ]]; then
      final_reason=$(jq -r '.reason' <<< "$stop_check")
      log_success "$final_reason"
      break
    fi

    # Cooldown before next iteration
    if [[ $iteration -lt $MAX_ITERATIONS ]]; then
      log_info "Waiting ${COOLDOWN_SECONDS}s before next iteration..."
      sleep "$COOLDOWN_SECONDS"
    fi
  done

  # The loop ended without a stop condition: record why, so that the final
  # report does not carry an empty stop reason.
  if [[ -z "$final_reason" ]]; then
    final_reason="Maximum iterations reached ($MAX_ITERATIONS)"
    log_warning "$final_reason"
  fi

  # Final report
  print_final_report "$iteration" "$final_reason"
}

# Write REPORTS_DIR/final-report.json and print the closing summary.
# Arguments:
#   $1 - number of the last iteration
#   $2 - reason the loop stopped
# Globals: VERSION, REPORTS_DIR, TOTAL_TESTS_RUN, TOTAL_PASSES,
#          TOTAL_FAILURES, BEST_PASS_RATE, BEST_TOKEN_VARIANCE,
#          TARGET_PASS_RATE, TARGET_TOKEN_VARIANCE, colour variables (read)
#
# The target checks compare decimals in awk (already required by this
# script) rather than bc: with bc missing, the comparison would silently
# evaluate to false.
print_final_report() {
  local final_iteration="$1"
  local stop_reason="$2"

  local final_report="$REPORTS_DIR/final-report.json"

  jq -n \
    --arg version "$VERSION" \
    --arg final_iteration "$final_iteration" \
    --arg stop_reason "$stop_reason" \
    --arg total_tests_run "$TOTAL_TESTS_RUN" \
    --arg total_passes "$TOTAL_PASSES" \
    --arg total_failures "$TOTAL_FAILURES" \
    --arg best_pass_rate "$BEST_PASS_RATE" \
    --arg best_token_variance "$BEST_TOKEN_VARIANCE" \
    --arg completed_at "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
    '{
      version: $version,
      final_iteration: ($final_iteration | tonumber),
      stop_reason: $stop_reason,
      completed_at: $completed_at,
      summary: {
        total_tests_run: ($total_tests_run | tonumber),
        total_passes: ($total_passes | tonumber),
        total_failures: ($total_failures | tonumber),
        best_pass_rate: ($best_pass_rate | tonumber),
        best_token_variance: ($best_token_variance | tonumber)
      }
    }' > "$final_report"

  print_banner "📊 Final Report"

  echo -e "${GREEN}Test-Fix-Iterate Loop Completed${NC}"
  echo ""
  echo "Total iterations: $final_iteration"
  echo "Stop reason: $stop_reason"
  echo ""
  echo "Summary:"
  echo " Total tests run: $TOTAL_TESTS_RUN"
  echo " Total passes: $TOTAL_PASSES"
  echo " Total failures: $TOTAL_FAILURES"
  echo " Best pass rate: ${BEST_PASS_RATE}%"
  echo " Best token variance: ${BEST_TOKEN_VARIANCE}%"
  echo ""
  echo "Reports saved to: $REPORTS_DIR"
  echo "Final report: $final_report"
  echo ""

  # Check if targets were met
  if awk -v a="$BEST_PASS_RATE" -v b="$TARGET_PASS_RATE" \
      'BEGIN { exit !(a + 0 >= b + 0) }'; then
    echo -e "${GREEN}✅ PASS RATE TARGET ACHIEVED${NC}"
  else
    echo -e "${YELLOW}⚠️ Pass rate target not met: ${BEST_PASS_RATE}% < ${TARGET_PASS_RATE}%${NC}"
  fi

  if awk -v a="$BEST_TOKEN_VARIANCE" -v b="$TARGET_TOKEN_VARIANCE" \
      'BEGIN { exit !(a + 0 < b + 0) }'; then
    echo -e "${GREEN}✅ TOKEN VARIANCE TARGET ACHIEVED${NC}"
  else
    echo -e "${YELLOW}⚠️ Token variance target not met: ${BEST_TOKEN_VARIANCE}% >= ${TARGET_TOKEN_VARIANCE}%${NC}"
  fi

  echo ""
}

# ============================================
# SCRIPT ENTRY POINT
# ============================================

# Print the command-line help on stdout.
usage() {
  echo "Usage: $0 [OPTIONS]"
  echo ""
  echo "Automated Test-Fix-Iterate Loop for continuous testing"
  echo ""
  echo "Options:"
  echo " --debug Enable debug output"
  echo " --max-iterations N Maximum iterations (default: 50)"
  echo " --target-pass-rate N Target pass rate % (default: 95)"
  echo " --target-variance N Target token variance % (default: 3)"
  echo " -h, --help Show this help"
  echo ""
  echo "Stop Conditions:"
  echo " - Pass rate >= 95%"
  echo " - Token variance < 3%"
  echo " - Maximum iterations reached"
}

# Parse command line arguments
# Arguments: the script's command-line arguments ("$@")
# Globals written: DEBUG, MAX_ITERATIONS, TARGET_PASS_RATE,
#                  TARGET_TOKEN_VARIANCE
# Exits: 0 after printing help; 1 on an unknown option, on an option whose
#        value is missing, or on a value that is not a number. Values are
#        validated here because they are later used in arithmetic and
#        numeric comparisons.
parse_args() {
  local int_re='^[0-9]+$'
  local num_re='^[0-9]+([.][0-9]+)?$'

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --debug)
        DEBUG=true
        shift
        ;;
      --max-iterations)
        if [[ $# -lt 2 || ! $2 =~ $int_re ]]; then
          echo "Error: $1 requires a non-negative integer value" >&2
          exit 1
        fi
        MAX_ITERATIONS="$2"
        shift 2
        ;;
      --target-pass-rate)
        if [[ $# -lt 2 || ! $2 =~ $num_re ]]; then
          echo "Error: $1 requires a numeric value" >&2
          exit 1
        fi
        TARGET_PASS_RATE="$2"
        shift 2
        ;;
      --target-variance)
        if [[ $# -lt 2 || ! $2 =~ $num_re ]]; then
          echo "Error: $1 requires a numeric value" >&2
          exit 1
        fi
        TARGET_TOKEN_VARIANCE="$2"
        shift 2
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        echo "Use -h or --help for usage information" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"

# Run main loop
main