feat(bd-j1t): worker state machine — gap-based stuck detection + web WorkerDetail NeedleState

- Add state_gap stuck detection using lastStateTransition — fires when
  a worker in an active NeedleState hasn't transitioned within the
  configurable threshold (default 5min, critical at 2x).
- Update web WorkerDetail.tsx to display needleState with icons and
  colors instead of coarse active/idle/error status.
- Add stuckDetection.test.ts with 16 tests covering gap-based detection,
  legacy patterns, and edge cases.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
jedarden 2026-04-21 13:32:46 -04:00
parent e0c16e19af
commit 8f7d7cf72d
3 changed files with 319 additions and 28 deletions

View file

@ -0,0 +1,233 @@
/**
* Tests for Stuck Worker Detection
*/
import { describe, it, expect } from 'vitest';
import { isWorkerStuck, getStuckReason, getStuckIndicator, StuckPattern } from './stuckDetection.js';
import { LogEvent, WorkerInfo } from '../../types.js';
/**
 * Builds a WorkerInfo fixture for the tests in this file.
 *
 * The baseline is an 'active' worker first seen five minutes ago whose last
 * activity is "now"; any field supplied in `overrides` replaces the baseline
 * value.
 */
const makeWorker = (overrides: Partial<WorkerInfo> = {}): WorkerInfo => {
  const baseline: WorkerInfo = {
    id: 'w-test',
    status: 'active',
    beadsCompleted: 3,
    firstSeen: Date.now() - 5 * 60 * 1000,
    lastActivity: Date.now(),
    activeFiles: [],
    hasCollision: false,
    activeDirectories: [],
    collisionTypes: [],
    eventCount: 10,
  };
  // Later spread wins, so caller-supplied fields take precedence.
  return { ...baseline, ...overrides };
};
/**
 * Builds a LogEvent fixture for the tests in this file.
 *
 * The baseline is an 'info' event from worker 'w-test' stamped with the
 * current time; any field supplied in `overrides` replaces the baseline value.
 */
const makeEvent = (overrides: Partial<LogEvent> = {}): LogEvent => {
  const baseline: LogEvent = {
    ts: Date.now(),
    worker: 'w-test',
    level: 'info',
    msg: 'test event',
  };
  // Later spread wins, so caller-supplied fields take precedence.
  return { ...baseline, ...overrides };
};
describe('Stuck Detection', () => {
  const MINUTE_MS = 60 * 1000;
  // Gap threshold passed explicitly by the state-transition tests below.
  const GAP_THRESHOLD_MS = 5 * MINUTE_MS;

  describe('isWorkerStuck', () => {
    it('returns null for a healthy worker with recent events', () => {
      const worker = makeWorker();
      const events = [makeEvent()];
      expect(isWorkerStuck(worker, events)).toBeNull();
    });

    it('returns null when no events exist', () => {
      const worker = makeWorker();
      expect(isWorkerStuck(worker, [])).toBeNull();
    });
  });

  describe('state-transition gap detection', () => {
    it('detects worker stuck in WORKING with no state transition for too long', () => {
      // 8 minutes sits strictly between the threshold (5m) and the critical
      // cut-off (2x = 10m). Using exactly 10m here would be flaky: the
      // detector re-reads Date.now(), so a single elapsed millisecond pushes
      // the measured gap past 2x and flips the severity to 'critical'.
      const gapMs = 8 * MINUTE_MS;
      const worker = makeWorker({
        needleState: 'WORKING',
        lastStateTransition: Date.now() - gapMs,
      });
      const events = [makeEvent()];
      const pattern = isWorkerStuck(worker, events, {
        stateTransitionGapMs: GAP_THRESHOLD_MS,
      });
      expect(pattern).not.toBeNull();
      expect(pattern?.type).toBe('state_gap');
      expect(pattern?.severity).toBe('warning');
      expect(pattern?.reason).toContain('WORKING');
      expect(pattern?.reason).toContain('8m');
    });

    it('escalates to critical at 2x the gap threshold', () => {
      const gapMs = 15 * MINUTE_MS; // 15 minutes (> 2×5min threshold)
      const worker = makeWorker({
        needleState: 'WORKING',
        lastStateTransition: Date.now() - gapMs,
      });
      const events = [makeEvent()];
      const pattern = isWorkerStuck(worker, events, {
        stateTransitionGapMs: GAP_THRESHOLD_MS,
      });
      expect(pattern).not.toBeNull();
      expect(pattern?.severity).toBe('critical');
    });

    it('does not fire for STOPPED workers', () => {
      // The gap exceeds the threshold, but STOPPED is not an active state.
      const worker = makeWorker({
        needleState: 'STOPPED',
        lastStateTransition: Date.now() - 10 * MINUTE_MS,
      });
      const events = [makeEvent()];
      const pattern = isWorkerStuck(worker, events, {
        stateTransitionGapMs: GAP_THRESHOLD_MS,
      });
      expect(pattern).toBeNull();
    });

    it('does not fire when gap is under threshold', () => {
      const worker = makeWorker({
        needleState: 'WORKING',
        lastStateTransition: Date.now() - 2 * MINUTE_MS, // 2 min
      });
      const events = [makeEvent()];
      const pattern = isWorkerStuck(worker, events, {
        stateTransitionGapMs: GAP_THRESHOLD_MS,
      });
      expect(pattern).toBeNull();
    });

    it('does not fire when needleState is not set', () => {
      // The default fixture carries neither needleState nor lastStateTransition.
      const worker = makeWorker();
      const events = [makeEvent()];
      const pattern = isWorkerStuck(worker, events, {
        stateTransitionGapMs: GAP_THRESHOLD_MS,
      });
      expect(pattern).toBeNull();
    });

    it('detects gap for SELECTING state', () => {
      const worker = makeWorker({
        needleState: 'SELECTING',
        lastStateTransition: Date.now() - 8 * MINUTE_MS,
      });
      const events = [makeEvent()];
      const pattern = isWorkerStuck(worker, events, {
        stateTransitionGapMs: GAP_THRESHOLD_MS,
      });
      expect(pattern).not.toBeNull();
      expect(pattern?.type).toBe('state_gap');
      expect(pattern?.reason).toContain('SELECTING');
    });

    it('detects gap for CLAIMING state', () => {
      const worker = makeWorker({
        needleState: 'CLAIMING',
        lastStateTransition: Date.now() - 7 * MINUTE_MS,
      });
      const events = [makeEvent()];
      const pattern = isWorkerStuck(worker, events, {
        stateTransitionGapMs: GAP_THRESHOLD_MS,
      });
      expect(pattern).not.toBeNull();
      expect(pattern?.reason).toContain('CLAIMING');
    });
  });

  describe('getStuckReason', () => {
    it('returns the reason string when stuck', () => {
      const worker = makeWorker({
        needleState: 'WORKING',
        lastStateTransition: Date.now() - 10 * MINUTE_MS,
      });
      const events = [makeEvent()];
      const reason = getStuckReason(worker, events, {
        stateTransitionGapMs: GAP_THRESHOLD_MS,
      });
      expect(reason).toContain('WORKING');
    });

    it('returns null when not stuck', () => {
      const worker = makeWorker();
      const events = [makeEvent()];
      expect(getStuckReason(worker, events)).toBeNull();
    });
  });

  describe('getStuckIndicator', () => {
    it('returns ⚠ for critical', () => {
      const pattern: StuckPattern = {
        type: 'state_gap',
        reason: 'test',
        severity: 'critical',
        evidence: [],
        suggestion: 'test',
      };
      expect(getStuckIndicator(pattern)).toBe('⚠');
    });

    it('returns ⚡ for warning', () => {
      const pattern: StuckPattern = {
        type: 'state_gap',
        reason: 'test',
        severity: 'warning',
        evidence: [],
        suggestion: 'test',
      };
      expect(getStuckIndicator(pattern)).toBe('⚡');
    });

    it('returns empty string for null', () => {
      expect(getStuckIndicator(null)).toBe('');
    });
  });

  describe('legacy detection (non-state-transition)', () => {
    it('still detects repeated tool calls', () => {
      const worker = makeWorker();
      const events: LogEvent[] = [];
      // Six identical Read calls on the same path, spaced 10 seconds apart.
      for (let i = 0; i < 6; i++) {
        events.push(makeEvent({ tool: 'Read', path: '/src/index.ts', ts: Date.now() - i * 10000 }));
      }
      const pattern = isWorkerStuck(worker, events);
      expect(pattern).not.toBeNull();
      expect(pattern?.type).toBe('repeated_tool');
    });

    it('still detects no progress', () => {
      // Last activity and the only event are both 3 minutes old, against a
      // 2 minute no-progress threshold.
      const worker = makeWorker({
        lastActivity: Date.now() - 3 * MINUTE_MS,
      });
      const events = [makeEvent({ ts: Date.now() - 3 * MINUTE_MS })];
      const pattern = isWorkerStuck(worker, events, {
        noProgressThresholdMs: 2 * MINUTE_MS,
      });
      expect(pattern).not.toBeNull();
      expect(pattern?.type).toBe('no_progress');
    });
  });
});

View file

@ -9,7 +9,7 @@ import { LogEvent, WorkerInfo } from '../../types.js';
export interface StuckPattern {
/** Type of stuck pattern detected */
type: 'repeated_tool' | 'no_progress' | 'circular_edit' | 'long_running';
type: 'repeated_tool' | 'no_progress' | 'circular_edit' | 'long_running' | 'state_gap';
/** Human-readable description */
reason: string;
@ -36,6 +36,11 @@ export interface StuckDetectionOptions {
/** Threshold for long-running tasks (ms), default 10 minutes */
longRunningThresholdMs?: number;
/** Threshold for state-transition gap (ms), default 5 minutes.
* Fires when a worker in an active needleState (BOOTING, SELECTING,
* CLAIMING, WORKING, or CLOSING) has not transitioned within this window. */
stateTransitionGapMs?: number;
}
const DEFAULT_OPTIONS: Required<StuckDetectionOptions> = {
@ -43,6 +48,7 @@ const DEFAULT_OPTIONS: Required<StuckDetectionOptions> = {
repeatedToolThreshold: 5,
noProgressThresholdMs: 2 * 60 * 1000, // 2 minutes
longRunningThresholdMs: 10 * 60 * 1000, // 10 minutes
stateTransitionGapMs: 5 * 60 * 1000, // 5 minutes
};
/**
@ -68,6 +74,7 @@ export function isWorkerStuck(
// Check patterns in order of severity
const patterns = [
detectStateTransitionGap(worker, opts),
detectRepeatedToolCalls(recentEvents, opts),
detectNoProgress(worker, recentEvents, opts),
detectCircularEdits(recentEvents, opts),
@ -96,6 +103,47 @@ export function getStuckReason(
return pattern?.reason ?? null;
}
/**
 * NeedleStates in which a worker is expected to keep transitioning.
 * STOPPED is deliberately absent — a stopped worker is never reported.
 */
const ACTIVE_NEEDLE_STATES: ReadonlySet<string> = new Set([
  'BOOTING',
  'SELECTING',
  'CLAIMING',
  'WORKING',
  'CLOSING',
]);

/**
 * Render a millisecond duration for stuck-pattern messages: whole minutes
 * ("10m") from one minute upward, whole seconds ("45s") below that, so that
 * sub-minute gaps and thresholds do not all collapse to "0m".
 */
function formatGapDuration(ms: number): string {
  return ms < 60000
    ? `${Math.floor(ms / 1000)}s`
    : `${Math.floor(ms / 60000)}m`;
}

/**
 * Detect state-transition gap — worker stuck in an active state without
 * transitioning. Uses lastStateTransition from WorkerInfo (set by the store
 * when processing worker.state_transition events).
 *
 * @param worker Worker to inspect; skipped when it has no needleState or no
 *   lastStateTransition timestamp.
 * @param opts Resolved options; only `stateTransitionGapMs` is read.
 * @returns A 'state_gap' pattern when the time since the last transition
 *   exceeds the threshold ('critical' once it exceeds twice the threshold,
 *   otherwise 'warning'), or null when the worker is not considered stuck.
 */
function detectStateTransitionGap(
  worker: WorkerInfo,
  opts: Required<StuckDetectionOptions>
): StuckPattern | null {
  const { needleState, lastStateTransition } = worker;
  if (!needleState || !lastStateTransition) {
    return null;
  }

  // Only active states are relevant — STOPPED workers are fine.
  if (!ACTIVE_NEEDLE_STATES.has(needleState)) {
    return null;
  }

  const thresholdMs = opts.stateTransitionGapMs;
  const gapMs = Date.now() - lastStateTransition;

  // Written as a positive comparison so a NaN gap or threshold yields null.
  if (gapMs > thresholdMs) {
    const gapLabel = formatGapDuration(gapMs);
    const isCritical = gapMs > thresholdMs * 2;
    return {
      type: 'state_gap',
      reason: `No state transition for ${gapLabel} while in ${needleState}`,
      severity: isCritical ? 'critical' : 'warning',
      evidence: [
        `State: ${needleState} since ${new Date(lastStateTransition).toISOString()}`,
        `Gap: ${gapLabel} (threshold: ${formatGapDuration(thresholdMs)})`,
      ],
      suggestion: 'Worker may be stuck — check if the agent is waiting on an external resource or deadlocked',
    };
  }

  return null;
}
/**
* Detect repeated tool calls with same parameters
*/

View file

@ -1,5 +1,23 @@
import React from 'react';
import { WorkerInfo, LogEvent } from '../types';
import { WorkerInfo, LogEvent, NeedleState } from '../types';
/**
 * Glyph shown for each NeedleState.
 *
 * WorkerDetail looks the icon up by `worker.needleState` when that field is
 * set; when it is not, the component falls back to an icon derived from the
 * coarse `worker.status` instead of this table.
 */
const NEEDLE_STATE_ICONS: Record<NeedleState, string> = {
BOOTING: '⏳',
SELECTING: '🔍',
CLAIMING: '🎯',
WORKING: '●',
CLOSING: '⏹',
STOPPED: '○',
};
/**
 * CSS color (hex) for each NeedleState.
 *
 * WorkerDetail applies the value as an inline `color` style on the status
 * icon and the State row when `worker.needleState` is set; without a
 * needleState no inline color is applied and the status CSS class is used.
 */
const NEEDLE_STATE_COLORS: Record<NeedleState, string> = {
BOOTING: '#5bc0de',
// SELECTING and CLOSING share the same color value.
SELECTING: '#f0ad4e',
CLAIMING: '#9b59b6',
WORKING: '#5cb85c',
CLOSING: '#f0ad4e',
STOPPED: '#777',
};
interface WorkerDetailProps {
/** The worker to display details for */
@ -12,16 +30,6 @@ interface WorkerDetailProps {
allWorkerEvents?: LogEvent[];
}
/**
* WorkerDetail Component
*
* Displays detailed information about a selected worker including:
* - Worker ID and status
* - Activity statistics (event count, current tool)
* - Timing information (last seen, uptime)
* - Recent events list
* - Collision information if applicable
*/
const WorkerDetail: React.FC<WorkerDetailProps> = ({
worker,
onClose,
@ -42,18 +50,14 @@ const WorkerDetail: React.FC<WorkerDetailProps> = ({
return new Date(timestamp).toLocaleTimeString();
};
const getStatusIcon = (): string => {
switch (worker.status) {
case 'active':
return '●';
case 'idle':
return '○';
case 'error':
return '✗';
default:
return '?';
}
};
const stateIcon = worker.needleState
? NEEDLE_STATE_ICONS[worker.needleState]
: worker.status === 'active' ? '●' : worker.status === 'idle' ? '○' : '✗';
const stateLabel = worker.needleState ?? worker.status.toUpperCase();
const stateColor = worker.needleState
? NEEDLE_STATE_COLORS[worker.needleState]
: undefined;
const stateCssClass = worker.needleState ? undefined : worker.status;
const eventsToShow = allWorkerEvents || worker.recentEvents || [];
@ -62,8 +66,11 @@ const WorkerDetail: React.FC<WorkerDetailProps> = ({
{/* Header with close button */}
<div className="worker-detail-header">
<h2>
<span className={`worker-status-icon ${worker.status}`}>
{getStatusIcon()}
<span
className={`worker-status-icon ${stateCssClass ?? ''}`}
style={stateColor ? { color: stateColor } : undefined}
>
{stateIcon}
</span>
{worker.id}
</h2>
@ -103,8 +110,11 @@ const WorkerDetail: React.FC<WorkerDetailProps> = ({
<h3>Status</h3>
<div className="detail-row">
<span className="detail-label">State</span>
<span className={`detail-value worker-status ${worker.status}`}>
{worker.status}
<span
className={`detail-value worker-status ${stateCssClass ?? ''}`}
style={stateColor ? { color: stateColor } : undefined}
>
{stateLabel}
</span>
</div>
<div className="detail-row">