Three-stage fail-fast validator for LLM-generated bot candidates:

- syntax.go: language-aware parse (go/parser for Go; py_compile, rustfmt, tsc, javac, php -l for others; brace-balance fallback)
- schema.go: regex detection of /health + /turn endpoints and "moves" field
- sandbox.go: nsjail-isolated smoke test — builds bot, polls /health, sends 5 signed /turn requests, verifies JSON moves responses
- validator.go: orchestrates stages with fail-fast short-circuit

DB layer:

- programs table + CRUD (create, get, list, updateFitness, setPromoted)
- validation_log table with RecordValidation, IslandPassRates, IslandValidationStats for per-island pass-rate tracking
- seed.go: 6 generation-0 bots across alpha/beta/gamma/delta islands

MAP-Elites grid (mapelites/grid.go): 2-D behavior grid on aggression×economy axes; TryPlace keeps the fittest occupant per niche.

acb-evolver CLI gains two new subcommands:

  validate <file> -lang <lang> [-island <island>] [-nsjail] [-nolog]
  validation-stats (tabular per-island pass-rate breakdown)

cmd/acb-api/db.go: add programs table to API schema so the API can query promoted evolved bots.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
139 lines
3.8 KiB
Go
139 lines
3.8 KiB
Go
// Package validator implements a three-stage validation pipeline for
// LLM-generated bot candidates:
//
//  1. Syntax  — parse the generated code for the target language
//  2. Schema  — verify the bot exposes the required HTTP endpoints
//  3. Sandbox — run the bot in an nsjail container, send test /turn
//     requests (5 by default), and verify valid JSON responses
//
// The pipeline is fail-fast: if any stage fails, subsequent stages are
// skipped. The raw LLM output is preserved in Report so the evolution
// loop can embed it in retry prompts.
package validator
|
|
|
|
import (
|
|
"context"
|
|
"time"
|
|
)
|
|
|
|
// Stage identifies a validation pipeline stage by name.
type Stage string

// Pipeline stages, listed in the order Validate executes them.
const (
	// StageSyntax is the stage that parses the generated code.
	StageSyntax Stage = "syntax"
	// StageSchema is the stage that checks for the required HTTP endpoints.
	StageSchema Stage = "schema"
	// StageSandbox is the stage that smoke-tests the running bot.
	StageSandbox Stage = "sandbox"
)
|
|
|
|
// StageResult holds the outcome of one pipeline stage.
|
|
type StageResult struct {
|
|
Stage Stage
|
|
Passed bool
|
|
Error string // non-empty on failure
|
|
Duration time.Duration // wall-clock time for the stage
|
|
}
|
|
|
|
// Report is the complete outcome of a validation run. It is returned
|
|
// even when a stage fails so callers can log the partial results.
|
|
type Report struct {
|
|
Language string
|
|
Stages []StageResult
|
|
Passed bool // true only when all three stages pass
|
|
LLMOutput string // raw LLM response; preserved for retry / learning
|
|
}
|
|
|
|
// LastStage returns the name of the last stage that was executed (whether
|
|
// it passed or not). Returns "" when no stages ran.
|
|
func (r *Report) LastStage() Stage {
|
|
if len(r.Stages) == 0 {
|
|
return ""
|
|
}
|
|
return r.Stages[len(r.Stages)-1].Stage
|
|
}
|
|
|
|
// Config controls pipeline behaviour.
type Config struct {
	// SyntaxTimeout caps the external process used for syntax checking.
	SyntaxTimeout time.Duration

	// SandboxTimeout caps the entire sandbox smoke test (build + run + requests).
	SandboxTimeout time.Duration

	// SmokeRequests is the number of /turn requests sent during the smoke test.
	SmokeRequests int

	// UseNsjail enables nsjail-based process isolation during the smoke test.
	// Falls back to plain exec when nsjail is not found in PATH.
	UseNsjail bool

	// NsjailPath overrides the nsjail binary name / path (default "nsjail").
	NsjailPath string
}
|
|
|
|
// DefaultConfig returns a Config with production-ready defaults.
|
|
func DefaultConfig() Config {
|
|
return Config{
|
|
SyntaxTimeout: 15 * time.Second,
|
|
SandboxTimeout: 60 * time.Second,
|
|
SmokeRequests: 5,
|
|
UseNsjail: true,
|
|
NsjailPath: "nsjail",
|
|
}
|
|
}
|
|
|
|
// Validate runs the full three-stage pipeline for the given bot code.
|
|
// llmOutput is the raw text from which code was extracted; it is stored
|
|
// in the report for retry or learning.
|
|
//
|
|
// The returned error is only non-nil for unexpected infrastructure failures
|
|
// (e.g. temp-dir creation). Validation failures are encoded in Report.Passed
|
|
// and the individual StageResult.Error fields.
|
|
func Validate(ctx context.Context, code, language, llmOutput string, cfg Config) (*Report, error) {
|
|
r := &Report{
|
|
Language: language,
|
|
LLMOutput: llmOutput,
|
|
}
|
|
|
|
type step struct {
|
|
name Stage
|
|
fn func(context.Context) error
|
|
}
|
|
|
|
steps := []step{
|
|
{
|
|
StageSyntax,
|
|
func(ctx context.Context) error {
|
|
return CheckSyntax(ctx, code, language, cfg.SyntaxTimeout)
|
|
},
|
|
},
|
|
{
|
|
StageSchema,
|
|
func(_ context.Context) error {
|
|
return CheckSchema(code, language)
|
|
},
|
|
},
|
|
{
|
|
StageSandbox,
|
|
func(ctx context.Context) error {
|
|
return RunSmokeTest(ctx, code, language, cfg)
|
|
},
|
|
},
|
|
}
|
|
|
|
allPassed := true
|
|
for _, s := range steps {
|
|
t0 := time.Now()
|
|
err := s.fn(ctx)
|
|
sr := StageResult{
|
|
Stage: s.name,
|
|
Passed: err == nil,
|
|
Duration: time.Since(t0),
|
|
}
|
|
if err != nil {
|
|
sr.Error = err.Error()
|
|
allPassed = false
|
|
}
|
|
r.Stages = append(r.Stages, sr)
|
|
if err != nil {
|
|
break // fail-fast: skip remaining stages
|
|
}
|
|
}
|
|
r.Passed = allPassed
|
|
return r, nil
|
|
}
|