ai-code-battle/cmd/acb-evolver/internal/arena/psro.go

// Package arena — PSRO Nash equilibrium computation.
//
// LLM-PSRO (Policy Space Response Oracles) uses Nash equilibrium over the
// current bot population as the promotion criterion.  A candidate is promoted
// only if it is a best response to the Nash mixture, i.e. its expected payoff
// against the Nash mixture exceeds the threshold (default 0.50).
//
// For the mini-tournament setting (one candidate, K opponents), the payoff
// matrix has a single row.  The Nash-optimal strategy for the minimising
// column player (opponents) is to concentrate weight on the opponent that
// minimises the candidate's expected win rate.  The resulting Nash value is
// therefore min(winRates), which is the tightest possible test.
//
// The full fictitious-play algorithm is retained so it generalises cleanly
// to K×K payoff matrices when the population grows.
package arena

// NashResult holds the outcome of a Nash equilibrium computation for the
// candidate-versus-opponents meta-game.
//
// When ComputeNash or FictitiousPlayNash is called with no win rates, only
// NashValue is set (to 0.5) and both slices are left nil.
type NashResult struct {
	// OpponentMix[i] = probability of opponent i in the Nash mixture, with
	// one entry per input win rate.
	// Sums to 1.0 for non-empty input (assuming the win rates are NaN-free).
	OpponentMix []float64

	// NashValue is the candidate's expected win rate under the Nash mixture.
	// This is the quantity compared against the promotion threshold.
	NashValue float64

	// WinRatePerOpponent mirrors the input payoff row for convenience.
	// The functions in this file store the caller's slice itself rather than
	// a copy, so it shares its backing array with the input.
	WinRatePerOpponent []float64
}

// ComputeNash solves the 1×K meta-game exactly.  winRates[i] is the
// candidate's win rate against opponent i, and the column player (opponent)
// chooses the mixture that minimises the candidate's expected win rate.
//
// With a single payoff row the minimiser puts all of its weight on the lowest
// win rate, splitting it evenly when several opponents tie for the minimum.
//
// Nash value = min(winRates)  (hardest-opponent test).
//
// An empty winRates yields NashValue 0.5 with nil slices.  The returned
// WinRatePerOpponent is the winRates slice itself, not a copy.
func ComputeNash(winRates []float64) NashResult {
	if len(winRates) == 0 {
		return NashResult{NashValue: 0.5}
	}

	// A single pass tracks the smallest rate seen so far together with how
	// many entries equal it; the tally restarts whenever a strictly smaller
	// rate shows up.
	lowest, ties := winRates[0], 0
	for _, rate := range winRates {
		switch {
		case rate < lowest:
			lowest, ties = rate, 1
		case rate == lowest:
			ties++
		}
	}

	// Opponents sitting at the minimum share the weight equally; everyone
	// else keeps the zero weight that make already gave them.
	weights := make([]float64, len(winRates))
	for idx, rate := range winRates {
		if rate == lowest {
			weights[idx] = 1.0 / float64(ties)
		}
	}

	return NashResult{
		OpponentMix:        weights,
		NashValue:          lowest,
		WinRatePerOpponent: winRates,
	}
}

// FictitiousPlayNash computes the Nash equilibrium of the 1×K meta-game via
// fictitious play: over iterations rounds the column player (opponent)
// best-responds to the row player, and the empirical frequency of those
// responses becomes the opponent mixture.
//
// winRates[i] is the candidate's win rate against opponent i.  A
// non-positive iterations falls back to 1000 rounds.  An empty winRates
// yields NashValue 0.5 with nil slices.  The returned WinRatePerOpponent is
// the winRates slice itself, not a copy.
//
// With a single payoff row the row strategy is fixed ("always play the
// candidate"), so the column best response is the same in every round.  It is
// therefore computed once and credited with all rounds rather than being
// rescanned per round; a K×K extension would have to recompute it each round
// against the evolving empirical row mixture.
//
// For finite win rates NashValue matches ComputeNash (the minimum win rate).
// OpponentMix differs when several opponents tie for the minimum: the strict
// comparison below puts all weight on the first of them, whereas ComputeNash
// spreads it uniformly over the tied opponents.
func FictitiousPlayNash(winRates []float64, iterations int) NashResult {
	if len(winRates) == 0 {
		return NashResult{NashValue: 0.5}
	}
	if iterations <= 0 {
		iterations = 1000
	}

	K := len(winRates)

	// Column player best response: the opponent minimising the candidate's
	// win rate.  Strict "<" keeps the earliest index on ties.
	best := 0
	for i := 1; i < K; i++ {
		if winRates[i] < winRates[best] {
			best = i
		}
	}

	// Every round selects the same opponent, so it collects all the plays.
	rounds := float64(iterations)
	counts := make([]float64, K)
	counts[best] = rounds

	// Empirical play frequencies form the mixture; the Nash value is the
	// candidate's expected win rate under that mixture.
	mix := make([]float64, K)
	expected := 0.0
	for i, c := range counts {
		mix[i] = c / rounds
		expected += mix[i] * winRates[i]
	}

	return NashResult{
		OpponentMix:        mix,
		NashValue:          expected,
		WinRatePerOpponent: winRates,
	}
}