feat(bd-j1t): worker state machine — gap-based stuck detection + web WorkerDetail NeedleState
- Add state_gap stuck detection using lastStateTransition: it fires when a worker in an active NeedleState has not transitioned within the configurable threshold (default 5 minutes; escalates to critical at 2x the threshold).
- Update web WorkerDetail.tsx to display needleState with icons and colors instead of the coarse active/idle/error status.
- Add stuckDetection.test.ts with 16 tests covering gap-based detection, legacy patterns, and edge cases.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
e0c16e19af
commit
8f7d7cf72d
3 changed files with 319 additions and 28 deletions
233
src/tui/utils/stuckDetection.test.ts
Normal file
233
src/tui/utils/stuckDetection.test.ts
Normal file
|
|
@ -0,0 +1,233 @@
|
|||
/**
|
||||
* Tests for Stuck Worker Detection
|
||||
*/
|
||||
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { isWorkerStuck, getStuckReason, getStuckIndicator, StuckPattern } from './stuckDetection.js';
|
||||
import { LogEvent, WorkerInfo } from '../../types.js';
|
||||
|
||||
/**
 * Build a WorkerInfo fixture for the stuck-detection tests.
 *
 * The defaults describe an active worker that was first seen five minutes
 * ago and was active just now; any field can be replaced via `overrides`.
 */
const makeWorker = (overrides: Partial<WorkerInfo> = {}): WorkerInfo => {
  const defaults = {
    id: 'w-test',
    status: 'active' as const,
    beadsCompleted: 3,
    firstSeen: Date.now() - 5 * 60 * 1000,
    lastActivity: Date.now(),
    activeFiles: [],
    hasCollision: false,
    activeDirectories: [],
    collisionTypes: [],
    eventCount: 10,
  };

  // Overrides are spread last so they win over every default.
  return { ...defaults, ...overrides };
};
|
||||
|
||||
/**
 * Build a LogEvent fixture: an info-level event from worker `w-test`
 * timestamped at call time. Fields in `overrides` replace the defaults.
 */
const makeEvent = (overrides: Partial<LogEvent> = {}): LogEvent => {
  const defaults = {
    ts: Date.now(),
    worker: 'w-test',
    level: 'info' as const,
    msg: 'test event',
  };

  return { ...defaults, ...overrides };
};
|
||||
|
||||
describe('Stuck Detection', () => {
|
||||
describe('isWorkerStuck', () => {
  // Baseline: a default fixture worker with one fresh event trips no detector.
  it('returns null for a healthy worker with recent events', () => {
    const result = isWorkerStuck(makeWorker(), [makeEvent()]);

    expect(result).toBeNull();
  });

  // An empty event list must not be reported as stuck.
  it('returns null when no events exist', () => {
    const noEvents: LogEvent[] = [];

    expect(isWorkerStuck(makeWorker(), noEvents)).toBeNull();
  });
});
|
||||
|
||||
describe('state-transition gap detection', () => {
  const MINUTE_MS = 60 * 1000;
  /** Gap threshold used by every test in this group. */
  const THRESHOLD_MS = 5 * MINUTE_MS;

  /**
   * Run stuck detection for a worker that entered `needleState` `gapMs` ago,
   * with a single fresh event so the event-based detectors stay quiet.
   */
  const detect = (needleState: WorkerInfo['needleState'], gapMs: number) =>
    isWorkerStuck(
      makeWorker({ needleState, lastStateTransition: Date.now() - gapMs }),
      [makeEvent()],
      { stateTransitionGapMs: THRESHOLD_MS },
    );

  it('detects worker stuck in WORKING with no state transition for too long', () => {
    // 8 minutes sits strictly between 1x and 2x the threshold. A gap of exactly
    // 2x would flip to `critical` as soon as one millisecond elapses between
    // building the fixture and the detector reading the clock.
    const pattern = detect('WORKING', 8 * MINUTE_MS);

    expect(pattern).toMatchObject({ type: 'state_gap', severity: 'warning' });
    expect(pattern?.reason).toContain('WORKING');
    expect(pattern?.reason).toContain('8m');
  });

  it('escalates to critical at 2x the gap threshold', () => {
    // 15 minutes is comfortably past 2 x 5 minutes.
    const pattern = detect('WORKING', 15 * MINUTE_MS);

    expect(pattern).toMatchObject({ type: 'state_gap', severity: 'critical' });
  });

  it('does not fire for STOPPED workers', () => {
    expect(detect('STOPPED', 10 * MINUTE_MS)).toBeNull();
  });

  it('does not fire when gap is under threshold', () => {
    expect(detect('WORKING', 2 * MINUTE_MS)).toBeNull();
  });

  it('does not fire when needleState is not set', () => {
    const pattern = isWorkerStuck(makeWorker(), [makeEvent()], {
      stateTransitionGapMs: THRESHOLD_MS,
    });

    expect(pattern).toBeNull();
  });

  // Every non-WORKING state the detector treats as active, not just a sample.
  it.each(['BOOTING', 'SELECTING', 'CLAIMING', 'CLOSING'] as const)(
    'detects gap for %s state',
    (state) => {
      const pattern = detect(state, 7 * MINUTE_MS);

      expect(pattern).toMatchObject({ type: 'state_gap' });
      expect(pattern?.reason).toContain(state);
    },
  );
});
|
||||
|
||||
describe('getStuckReason', () => {
  // A WORKING worker whose last transition is well past the gap threshold.
  it('returns the reason string when stuck', () => {
    const tenMinutesMs = 10 * 60 * 1000;
    const stalledWorker = makeWorker({
      needleState: 'WORKING',
      lastStateTransition: Date.now() - tenMinutesMs,
    });

    const reason = getStuckReason(stalledWorker, [makeEvent()], {
      stateTransitionGapMs: 5 * 60 * 1000,
    });

    expect(reason).toContain('WORKING');
  });

  // Default fixtures trip no detector, so there is no reason to report.
  it('returns null when not stuck', () => {
    const reason = getStuckReason(makeWorker(), [makeEvent()]);

    expect(reason).toBeNull();
  });
});
|
||||
|
||||
describe('getStuckIndicator', () => {
  /** Minimal state_gap pattern that differs only in severity. */
  const patternWith = (severity: StuckPattern['severity']): StuckPattern => ({
    type: 'state_gap',
    reason: 'test',
    severity,
    evidence: [],
    suggestion: 'test',
  });

  it('returns ⚠ for critical', () => {
    const indicator = getStuckIndicator(patternWith('critical'));

    expect(indicator).toBe('⚠');
  });

  it('returns ⚡ for warning', () => {
    const indicator = getStuckIndicator(patternWith('warning'));

    expect(indicator).toBe('⚡');
  });

  it('returns empty string for null', () => {
    const indicator = getStuckIndicator(null);

    expect(indicator).toBe('');
  });
});
|
||||
|
||||
describe('legacy detection (non-state-transition)', () => {
  // Six identical Read calls, spaced ten seconds apart going back in time.
  it('still detects repeated tool calls', () => {
    const worker = makeWorker();
    const events: LogEvent[] = Array.from({ length: 6 }, (_, index) =>
      makeEvent({ tool: 'Read', path: '/src/index.ts', ts: Date.now() - index * 10000 }),
    );

    const pattern = isWorkerStuck(worker, events);

    expect(pattern).not.toBeNull();
    expect(pattern?.type).toBe('repeated_tool');
  });

  // Worker and its only event are both three minutes stale; threshold is two.
  it('still detects no progress', () => {
    const threeMinutesAgo = (): number => Date.now() - 3 * 60 * 1000;
    const worker = makeWorker({ lastActivity: threeMinutesAgo() });
    const events = [makeEvent({ ts: threeMinutesAgo() })];

    const pattern = isWorkerStuck(worker, events, {
      noProgressThresholdMs: 2 * 60 * 1000,
    });

    expect(pattern).not.toBeNull();
    expect(pattern?.type).toBe('no_progress');
  });
});
|
||||
});
|
||||
|
|
@ -9,7 +9,7 @@ import { LogEvent, WorkerInfo } from '../../types.js';
|
|||
|
||||
export interface StuckPattern {
|
||||
/** Type of stuck pattern detected */
|
||||
type: 'repeated_tool' | 'no_progress' | 'circular_edit' | 'long_running';
|
||||
type: 'repeated_tool' | 'no_progress' | 'circular_edit' | 'long_running' | 'state_gap';
|
||||
|
||||
/** Human-readable description */
|
||||
reason: string;
|
||||
|
|
@ -36,6 +36,11 @@ export interface StuckDetectionOptions {
|
|||
|
||||
/** Threshold for long-running tasks (ms), default 10 minutes */
|
||||
longRunningThresholdMs?: number;
|
||||
|
||||
/** Threshold for state-transition gap (ms), default 5 minutes.
|
||||
* Fires when a worker with a needleState has not transitioned
|
||||
* within this window while in an active state (WORKING, etc.). */
|
||||
stateTransitionGapMs?: number;
|
||||
}
|
||||
|
||||
const DEFAULT_OPTIONS: Required<StuckDetectionOptions> = {
|
||||
|
|
@ -43,6 +48,7 @@ const DEFAULT_OPTIONS: Required<StuckDetectionOptions> = {
|
|||
repeatedToolThreshold: 5,
|
||||
noProgressThresholdMs: 2 * 60 * 1000, // 2 minutes
|
||||
longRunningThresholdMs: 10 * 60 * 1000, // 10 minutes
|
||||
stateTransitionGapMs: 5 * 60 * 1000, // 5 minutes
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
@ -68,6 +74,7 @@ export function isWorkerStuck(
|
|||
|
||||
// Check patterns in order of severity
|
||||
const patterns = [
|
||||
detectStateTransitionGap(worker, opts),
|
||||
detectRepeatedToolCalls(recentEvents, opts),
|
||||
detectNoProgress(worker, recentEvents, opts),
|
||||
detectCircularEdits(recentEvents, opts),
|
||||
|
|
@ -96,6 +103,47 @@ export function getStuckReason(
|
|||
return pattern?.reason ?? null;
|
||||
}
|
||||
|
||||
/**
 * NeedleStates in which a worker is expected to keep transitioning.
 * STOPPED is deliberately absent: a stopped worker is not stuck.
 */
const STATE_GAP_ACTIVE_STATES: readonly string[] = [
  'BOOTING',
  'SELECTING',
  'CLAIMING',
  'WORKING',
  'CLOSING',
];

/**
 * Render a millisecond duration for stuck-pattern messages: whole minutes
 * (`10m`) from one minute upward, whole seconds (`45s`) below that, so
 * sub-minute thresholds and gaps are not reported as `0m`.
 */
function formatStateGapDuration(ms: number): string {
  return ms >= 60000 ? `${Math.floor(ms / 60000)}m` : `${Math.floor(ms / 1000)}s`;
}

/**
 * Detect state-transition gap — worker stuck in an active state without
 * transitioning. Uses lastStateTransition from WorkerInfo (set by the store
 * when processing worker.state_transition events).
 *
 * @param worker Worker to inspect; only `needleState` and
 *   `lastStateTransition` are read.
 * @param opts Resolved options; only `stateTransitionGapMs` is read.
 * @returns A `state_gap` pattern when the time since the last transition
 *   exceeds the threshold (`critical` once it exceeds twice the threshold,
 *   otherwise `warning`), or `null` when the worker has no state data, is not
 *   in an active state, has an unrepresentable timestamp, or is within the
 *   threshold.
 */
function detectStateTransitionGap(
  worker: WorkerInfo,
  opts: Required<StuckDetectionOptions>
): StuckPattern | null {
  const { needleState, lastStateTransition } = worker;
  if (!needleState || !lastStateTransition) {
    return null;
  }

  // Only active states are relevant — STOPPED workers are fine.
  if (!STATE_GAP_ACTIVE_STATES.includes(needleState)) {
    return null;
  }

  // A timestamp outside the representable Date range would make
  // toISOString() throw; treat it as "no usable transition data".
  const enteredAt = new Date(lastStateTransition);
  if (Number.isNaN(enteredAt.getTime())) {
    return null;
  }

  const gapMs = Date.now() - lastStateTransition;
  // Written as a negated `>` so a NaN threshold never fires.
  if (!(gapMs > opts.stateTransitionGapMs)) {
    return null;
  }

  const gapLabel = formatStateGapDuration(gapMs);
  const thresholdLabel = formatStateGapDuration(opts.stateTransitionGapMs);
  const isCritical = gapMs > opts.stateTransitionGapMs * 2;

  return {
    type: 'state_gap',
    reason: `No state transition for ${gapLabel} while in ${needleState}`,
    severity: isCritical ? 'critical' : 'warning',
    evidence: [
      `State: ${needleState} since ${enteredAt.toISOString()}`,
      `Gap: ${gapLabel} (threshold: ${thresholdLabel})`,
    ],
    suggestion: 'Worker may be stuck — check if the agent is waiting on an external resource or deadlocked',
  };
}
|
||||
|
||||
/**
|
||||
* Detect repeated tool calls with same parameters
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -1,5 +1,23 @@
|
|||
import React from 'react';
|
||||
import { WorkerInfo, LogEvent } from '../types';
|
||||
import { WorkerInfo, LogEvent, NeedleState } from '../types';
|
||||
|
||||
/**
 * Glyph shown for each NeedleState. WorkerDetail looks a worker's
 * `needleState` up in this table to pick the status icon; the
 * `Record<NeedleState, string>` type requires an entry for every state.
 */
const NEEDLE_STATE_ICONS: Record<NeedleState, string> = {
|
||||
BOOTING: '⏳',
|
||||
SELECTING: '🔍',
|
||||
CLAIMING: '🎯',
|
||||
WORKING: '●',
|
||||
CLOSING: '⏹',
|
||||
STOPPED: '○',
|
||||
};
|
||||
|
||||
/**
 * CSS color for each NeedleState. WorkerDetail applies the looked-up value
 * as an inline `color` style on the status icon and the state label.
 * SELECTING and CLOSING share the same color value.
 */
const NEEDLE_STATE_COLORS: Record<NeedleState, string> = {
|
||||
BOOTING: '#5bc0de',
|
||||
SELECTING: '#f0ad4e',
|
||||
CLAIMING: '#9b59b6',
|
||||
WORKING: '#5cb85c',
|
||||
CLOSING: '#f0ad4e',
|
||||
STOPPED: '#777',
|
||||
};
|
||||
|
||||
interface WorkerDetailProps {
|
||||
/** The worker to display details for */
|
||||
|
|
@ -12,16 +30,6 @@ interface WorkerDetailProps {
|
|||
allWorkerEvents?: LogEvent[];
|
||||
}
|
||||
|
||||
/**
|
||||
* WorkerDetail Component
|
||||
*
|
||||
* Displays detailed information about a selected worker including:
|
||||
* - Worker ID and status
|
||||
* - Activity statistics (event count, current tool)
|
||||
* - Timing information (last seen, uptime)
|
||||
* - Recent events list
|
||||
* - Collision information if applicable
|
||||
*/
|
||||
const WorkerDetail: React.FC<WorkerDetailProps> = ({
|
||||
worker,
|
||||
onClose,
|
||||
|
|
@ -42,18 +50,14 @@ const WorkerDetail: React.FC<WorkerDetailProps> = ({
|
|||
return new Date(timestamp).toLocaleTimeString();
|
||||
};
|
||||
|
||||
const getStatusIcon = (): string => {
|
||||
switch (worker.status) {
|
||||
case 'active':
|
||||
return '●';
|
||||
case 'idle':
|
||||
return '○';
|
||||
case 'error':
|
||||
return '✗';
|
||||
default:
|
||||
return '?';
|
||||
}
|
||||
};
|
||||
const stateIcon = worker.needleState
|
||||
? NEEDLE_STATE_ICONS[worker.needleState]
|
||||
: worker.status === 'active' ? '●' : worker.status === 'idle' ? '○' : '✗';
|
||||
const stateLabel = worker.needleState ?? worker.status.toUpperCase();
|
||||
const stateColor = worker.needleState
|
||||
? NEEDLE_STATE_COLORS[worker.needleState]
|
||||
: undefined;
|
||||
const stateCssClass = worker.needleState ? undefined : worker.status;
|
||||
|
||||
const eventsToShow = allWorkerEvents || worker.recentEvents || [];
|
||||
|
||||
|
|
@ -62,8 +66,11 @@ const WorkerDetail: React.FC<WorkerDetailProps> = ({
|
|||
{/* Header with close button */}
|
||||
<div className="worker-detail-header">
|
||||
<h2>
|
||||
<span className={`worker-status-icon ${worker.status}`}>
|
||||
{getStatusIcon()}
|
||||
<span
|
||||
className={`worker-status-icon ${stateCssClass ?? ''}`}
|
||||
style={stateColor ? { color: stateColor } : undefined}
|
||||
>
|
||||
{stateIcon}
|
||||
</span>
|
||||
{worker.id}
|
||||
</h2>
|
||||
|
|
@ -103,8 +110,11 @@ const WorkerDetail: React.FC<WorkerDetailProps> = ({
|
|||
<h3>Status</h3>
|
||||
<div className="detail-row">
|
||||
<span className="detail-label">State</span>
|
||||
<span className={`detail-value worker-status ${worker.status}`}>
|
||||
{worker.status}
|
||||
<span
|
||||
className={`detail-value worker-status ${stateCssClass ?? ''}`}
|
||||
style={stateColor ? { color: stateColor } : undefined}
|
||||
>
|
||||
{stateLabel}
|
||||
</span>
|
||||
</div>
|
||||
<div className="detail-row">
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue