feat(worker): add rating recovery CLI mode (-mode=recalc-ratings)

Implements the rating recovery procedure specified in plan §12.3.
Running 'go run ./cmd/acb-worker -mode=recalc-ratings' will:
1. Reset all bot ratings to Glicko-2 defaults (mu=1500, phi=350, sigma=0.06)
2. Fetch all completed matches from the database in chronological order
3. Replay each match to recompute Glicko-2 ratings from scratch
4. Update the bots table with the recalculated ratings

This is needed for disaster recovery when ratings are corrupted or lost.

Database functions added:
- ResetAllRatings: resets all bot ratings to defaults
- GetAllCompletedMatches: fetches completed matches chronologically with participants
- UpdateAllRatings: bulk updates all bot ratings in a single transaction

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
jedarden 2026-05-04 00:37:10 -04:00
parent aeef954590
commit 467b7b67ea
2 changed files with 239 additions and 0 deletions

View file

@ -689,3 +689,132 @@ func (c *DBClient) UpdateMapEngagement(ctx context.Context, mapID string, engage
return tx.Commit()
}
// CompletedMatchForRecalc is one completed match, together with its
// participants, in the shape consumed by the rating recalculation replay.
// Values are produced by GetAllCompletedMatches.
type CompletedMatchForRecalc struct {
// ID is the match_id column of the matches row.
ID string
// CompletedAt is the completed_at column; matches are replayed in this order.
CompletedAt time.Time
// Winner is the player_slot recorded in the matches.winner column.
// It is nil when that column is NULL, which the replay treats as a draw.
Winner *int // player_slot of winner, nil for draw
// WinnerBotID is the bot_id of the participant whose PlayerSlot equals
// *Winner. It stays nil when Winner is nil or when no participant occupies
// the winning slot.
WinnerBotID *string // bot_id of winner (derived from winner player_slot)
// Participants lists the match_participants rows for this match, ordered
// by player_slot.
Participants []MatchParticipantForRecalc
}
// MatchParticipantForRecalc is one row of match_participants for a completed
// match: which bot played, and in which slot.
type MatchParticipantForRecalc struct {
// BotID is the bot_id column; it keys the in-memory rating table during replay.
BotID string
// PlayerSlot is the player_slot column; it is compared against
// CompletedMatchForRecalc.Winner to identify the winning bot.
PlayerSlot int
}
// ResetAllRatings overwrites rating_mu, rating_phi and rating_sigma on every
// row of the bots table with the package's Glicko-2 starting values. The
// UPDATE has no WHERE clause, so all bots are affected.
//
// It returns a wrapped error if the statement fails and nil otherwise.
func (c *DBClient) ResetAllRatings(ctx context.Context) error {
	const resetQuery = `
UPDATE bots
SET rating_mu = $1, rating_phi = $2, rating_sigma = $3
`
	// NOTE(review): rating_sigma is seeded from glicko2Tau. In Glicko-2 the
	// system constant tau is normally distinct from the initial volatility
	// sigma — confirm this constant really holds the intended default sigma.
	if _, execErr := c.db.ExecContext(ctx, resetQuery, glicko2DefaultMu, glicko2DefaultRD, glicko2Tau); execErr != nil {
		return fmt.Errorf("failed to reset ratings: %w", execErr)
	}
	return nil
}
// GetAllCompletedMatches returns every match with status 'completed' and a
// non-NULL completed_at, each populated with its participants. Used for
// rating recovery.
//
// Matches are ordered by completed_at, with match_id as a tiebreaker so that
// matches sharing a timestamp are always replayed in the same order.
// Participants within a match are ordered by player_slot.
//
// Everything is loaded with a single LEFT JOIN query rather than one
// participant query per match. The LEFT JOIN keeps matches that have no
// participant rows; those are returned with an empty Participants slice.
//
// WinnerBotID is set to the bot_id of the first participant whose player_slot
// equals the match's winner column. It stays nil when winner is NULL or when
// no participant occupies that slot.
func (c *DBClient) GetAllCompletedMatches(ctx context.Context) ([]CompletedMatchForRecalc, error) {
	rows, err := c.db.QueryContext(ctx, `
		SELECT m.match_id, m.winner, m.completed_at, mp.bot_id, mp.player_slot
		FROM matches m
		LEFT JOIN match_participants mp ON mp.match_id = m.match_id
		WHERE m.status = 'completed' AND m.completed_at IS NOT NULL
		ORDER BY m.completed_at ASC, m.match_id ASC, mp.player_slot ASC
	`)
	if err != nil {
		return nil, fmt.Errorf("failed to query completed matches: %w", err)
	}
	defer rows.Close()

	var matches []CompletedMatchForRecalc
	for rows.Next() {
		var (
			id          string
			winner      *int
			completedAt time.Time
			// Pointer targets receive nil for the NULLs a LEFT JOIN yields
			// when a match has no participant rows.
			botID *string
			slot  *int
		)
		if err := rows.Scan(&id, &winner, &completedAt, &botID, &slot); err != nil {
			return nil, fmt.Errorf("failed to scan match: %w", err)
		}

		// Rows of one match are contiguous because of the ORDER BY, so a
		// change of match_id marks the start of the next match.
		if n := len(matches); n == 0 || matches[n-1].ID != id {
			matches = append(matches, CompletedMatchForRecalc{
				ID:          id,
				CompletedAt: completedAt,
				Winner:      winner,
			})
		}
		if botID == nil || slot == nil {
			continue // match without participants
		}

		cur := &matches[len(matches)-1]
		cur.Participants = append(cur.Participants, MatchParticipantForRecalc{
			BotID:      *botID,
			PlayerSlot: *slot,
		})

		// Derive WinnerBotID from Winner (player_slot); the first match wins.
		if cur.WinnerBotID == nil && cur.Winner != nil && *cur.Winner == *slot {
			winnerID := *botID
			cur.WinnerBotID = &winnerID
		}
	}
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("error iterating matches: %w", err)
	}
	return matches, nil
}
// UpdateAllRatings writes the supplied ratings to the bots table inside one
// transaction: either every bot in the map is updated or none is.
//
// ratings maps bot_id to the rating to store. For each entry the row's
// rating_mu, rating_phi and rating_sigma are set from the rating, and
// last_active is set to a single UTC timestamp taken once for the whole batch.
//
// NOTE(review): last_active is overwritten for every bot in the map. Confirm
// that is intended when this is called from the rating recovery path.
func (c *DBClient) UpdateAllRatings(ctx context.Context, ratings map[string]Glicko2Rating) error {
	const updateQuery = `
UPDATE bots
SET rating_mu = $1, rating_phi = $2, rating_sigma = $3, last_active = $4
WHERE bot_id = $5
`
	tx, beginErr := c.db.BeginTx(ctx, nil)
	if beginErr != nil {
		return fmt.Errorf("failed to begin transaction: %w", beginErr)
	}
	// Rollback is a no-op once Commit has succeeded; it only takes effect on
	// the early-return error paths below.
	defer tx.Rollback()

	stamp := time.Now().UTC()
	for id, r := range ratings {
		if _, execErr := tx.ExecContext(ctx, updateQuery, r.Mu, r.Phi, r.Sigma, stamp, id); execErr != nil {
			return fmt.Errorf("failed to update rating for bot %s: %w", id, execErr)
		}
	}

	commitErr := tx.Commit()
	if commitErr == nil {
		return nil
	}
	return fmt.Errorf("failed to commit transaction: %w", commitErr)
}

View file

@ -23,6 +23,7 @@ import (
"github.com/aicodebattle/acb/metrics"
"image/png"
)
// Config holds worker configuration.
type Config struct {
DatabaseURL string // PostgreSQL connection URL
@ -63,6 +64,7 @@ func main() {
turnTimeout := flag.Duration("timeout", 3*time.Second, "Per-turn bot timeout")
maxRetries := flag.Int("retries", 3, "Max retries for transient errors")
verbose := flag.Bool("verbose", getEnv("ACB_VERBOSE", "false") == "true", "Enable verbose logging")
mode := flag.String("mode", "worker", "Operation mode: 'worker' (normal polling) or 'recalc-ratings' (disaster recovery)")
flag.Parse()
// Validate required config
@ -97,6 +99,20 @@ func main() {
}
defer dbClient.Close()
// Handle different operation modes
switch *mode {
case "recalc-ratings":
// Disaster recovery: recompute all ratings from match history
logger := log.New(os.Stdout, "[recalc-ratings] ", log.LstdFlags)
if err := recalcRatings(context.Background(), dbClient, logger, *verbose); err != nil {
log.Fatalf("Rating recalculation failed: %v", err)
}
logger.Println("Rating recalculation completed successfully")
return
}
// Normal worker mode (default)
// Create B2 client (optional - if not configured, replays won't be uploaded to cold archive)
var b2Client *B2Client
if cfg.B2Endpoint != "" && cfg.B2AccessKey != "" && cfg.B2SecretKey != "" {
@ -236,6 +252,7 @@ func (w *Worker) pollAndExecute(ctx context.Context) error {
metrics.MatchThroughput.Inc()
metrics.WorkerMatchesTotal.Inc()
metrics.WorkerMatchDuration.Observe(time.Since(matchStart).Seconds())
// Upload replay to B2
replayURL := ""
if w.b2 != nil {
@ -578,3 +595,96 @@ func (w *Worker) computeRatingUpdates(claimData *JobClaimData, result *MatchResu
// Compute rating updates
return ComputeRatingUpdates(botIDs, ratings, scores)
}
// recalcRatings recalculates all Glicko-2 ratings from scratch by replaying
// all completed matches in chronological order. Used for disaster recovery
// when ratings are corrupted or lost.
//
// The procedure is:
//  1. reset every bot's stored rating to the defaults,
//  2. load all completed matches in order,
//  3. replay them against an in-memory rating table,
//  4. write the resulting ratings back in a single transaction.
//
// Matches that cannot be rated are skipped and logged instead of being fed to
// the rating update: those with fewer than two participants, and those whose
// recorded winner slot matches no participant (which would otherwise score
// every participant as a loss).
//
// When verbose is true a progress line is logged for every match; otherwise
// one is logged every 1000 matches. The function returns a wrapped error if a
// database step fails or ctx is cancelled during the replay.
//
// NOTE(review): steps 1 and 4 are separate database operations, so a failure
// in between leaves all bots at default ratings until the command is re-run.
func recalcRatings(ctx context.Context, db *DBClient, logger *log.Logger, verbose bool) error {
	logger.Println("Starting rating recalculation...")

	// Step 1: Reset all bot ratings to defaults
	logger.Println("Step 1: Resetting all bot ratings to defaults")
	if err := db.ResetAllRatings(ctx); err != nil {
		return fmt.Errorf("failed to reset ratings: %w", err)
	}
	logger.Println("  All ratings reset to defaults (mu=1500, phi=350, sigma=0.06)")

	// Step 2: Fetch all completed matches in chronological order
	logger.Println("Step 2: Fetching completed matches in chronological order")
	matches, err := db.GetAllCompletedMatches(ctx)
	if err != nil {
		return fmt.Errorf("failed to fetch matches: %w", err)
	}
	logger.Printf("  Found %d completed matches to process", len(matches))
	if len(matches) == 0 {
		logger.Println("No matches to process, ratings remain at defaults")
		return nil
	}

	// Step 3: Replay each match in order against an in-memory rating table
	logger.Println("Step 3: Replaying matches to recompute ratings")
	currentRatings := make(map[string]Glicko2Rating)
	processed, skipped := 0, 0
	for _, match := range matches {
		// Stop promptly on cancellation instead of replaying the full history.
		if err := ctx.Err(); err != nil {
			return fmt.Errorf("rating recalculation interrupted after %d matches: %w", processed, err)
		}

		if reason := applyMatchForRecalc(currentRatings, match); reason != "" {
			skipped++
			logger.Printf("  Skipping match_id=%s: %s", match.ID, reason)
		}

		processed++
		if processed%1000 == 0 || verbose {
			logger.Printf("  Processed %d/%d matches (match_id=%s)", processed, len(matches), match.ID)
		}
	}
	logger.Printf("  Processed all %d matches", processed)
	if skipped > 0 {
		logger.Printf("  Skipped %d matches with unusable participant or winner data", skipped)
	}

	// Step 4: Write final ratings back to database
	logger.Println("Step 4: Writing recalculated ratings to database")
	if err := db.UpdateAllRatings(ctx, currentRatings); err != nil {
		return fmt.Errorf("failed to write ratings: %w", err)
	}
	logger.Printf("  Updated ratings for %d bots", len(currentRatings))
	logger.Println("Rating recalculation complete")
	return nil
}

// applyMatchForRecalc replays one match against the in-memory rating table.
// It returns "" when current was updated, or a short reason when the match
// was skipped and current was left untouched.
func applyMatchForRecalc(current map[string]Glicko2Rating, match CompletedMatchForRecalc) string {
	n := len(match.Participants)
	if n < 2 {
		return fmt.Sprintf("has %d participant(s), need at least 2", n)
	}
	if match.Winner != nil && match.WinnerBotID == nil {
		return fmt.Sprintf("winner slot %d has no matching participant", *match.Winner)
	}

	ratings := make([]Glicko2Rating, n)
	scores := make([]float64, n)
	for i, p := range match.Participants {
		rating, seen := current[p.BotID]
		if !seen {
			// First appearance of this bot: start from the default rating.
			// NOTE(review): Sigma is seeded from glicko2Tau, matching
			// ResetAllRatings — confirm that constant is the default sigma.
			rating = Glicko2Rating{
				Mu:    glicko2DefaultMu,
				Phi:   glicko2DefaultRD,
				Sigma: glicko2Tau,
			}
		}
		ratings[i] = rating

		switch {
		case match.Winner == nil:
			scores[i] = 0.5 // draw or no winner
		case *match.WinnerBotID == p.BotID:
			scores[i] = 1.0
		default:
			scores[i] = 0.0
		}
	}

	// Compute new ratings using Glicko-2 and store them by bot.
	updated := UpdateRatings(ratings, scores)
	for i, p := range match.Participants {
		current[p.BotID] = updated[i]
	}
	return ""
}