feat(bf-dm8v): implement OOM event detection and alert banner
Some checks are pending
CI / test (18.x) (push) Waiting to run
CI / test (20.x) (push) Waiting to run
CI / test (22.x) (push) Waiting to run

Backend changes:
- Add getOomState() to systemCgroupMonitor.ts for lightweight OOM polling
- Track oomKillCount, lastOomAt, oomDetected, memoryCurrentAtOom
- Add GET /api/system/oom-state endpoint in server.ts

Frontend changes:
- Create OomAlertBanner component that polls /api/system/oom-state every 30s
- Show persistent red alert banner when oomDetected=true
- Display oomKillCount and memory.current at time of detection
- Banner dismissable via X button; auto-clears after 1 hour (localStorage)
- Add CSS styling for the banner (red background, icon, text)
- Integrate banner into App.tsx at top of dashboard

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
jedarden 2026-06-07 10:37:29 -04:00
parent 933f66cbfc
commit ea1406ac2d
5 changed files with 263 additions and 0 deletions

View file

@ -24,6 +24,13 @@ export interface MemoryHistorySample {
swapUsage: number | null;
}
/**
 * OOM-kill tracking snapshot: the shape returned by getOomState() and
 * embedded in SystemMemoryStatus under `oomState`.
 */
export interface OomState {
oomKillCount: number; // oom_kill counter value parsed from the cgroup memory.events file on this read
lastOomAt: string | null; // ISO timestamp of the most recent detected increase, or null if none was detected
oomDetected: boolean; // True if oom_kill increased since last check (computed per call, not stored)
memoryCurrentAtOom: number | null; // memory usage recorded at the most recent detection, or null
}
export interface SystemMemoryStatus {
totalMemory: number | null;
availableMemory: number | null;
@ -39,11 +46,17 @@ export interface SystemMemoryStatus {
oomRisk: 'none' | 'low' | 'medium' | 'high' | 'critical';
oomKill: number;
oom: number;
oomState: OomState;
}
// In-memory history store
const memoryHistory: MemoryHistorySample[] = [];
// OOM state tracking.
// These module-level values are shared: both getSystemMemoryStatus() and
// getOomState() compare against them and overwrite them on every call, so a
// counter increase is reported by whichever of the two runs first.
//
// Last oom_kill counter value observed. The detection checks only fire when
// this is > 0, i.e. the initial 0 doubles as "no baseline yet".
let lastOomKillCount = 0;
// Date.now() epoch milliseconds of the most recent detected increase; null until one is detected.
let lastOomAt: number | null = null;
// Memory usage captured at the most recent detected increase; null until one is detected.
let memoryCurrentAtOom: number | null = null;
/**
* Read a file and return its trimmed content, or null if file doesn't exist.
*/
@ -128,6 +141,16 @@ export function getSystemMemoryStatus(): SystemMemoryStatus {
const memoryEvents = readCgroupFile('memory.events');
const oomKill = parseOomKill(memoryEvents);
// Track OOM state changes
let oomDetected = false;
if (oomKill > lastOomKillCount && lastOomKillCount > 0) {
// OOM kill detected!
oomDetected = true;
lastOomAt = Date.now();
memoryCurrentAtOom = cgroupUsage;
}
lastOomKillCount = oomKill;
// Read memory.stat for additional stats
const memoryStatContent = readCgroupFile('memory.stat');
const memoryStat = parseMemoryStat(memoryStatContent);
@ -228,6 +251,12 @@ export function getSystemMemoryStatus(): SystemMemoryStatus {
oomRisk,
oomKill,
oom: oomKill, // Alias for compatibility
oomState: {
oomKillCount: oomKill,
lastOomAt: lastOomAt ? new Date(lastOomAt).toISOString() : null,
oomDetected,
memoryCurrentAtOom,
},
};
}
@ -246,6 +275,43 @@ export function getMemorySummary(): string {
return formatBytes(status.cgroupUsage) + ' / ' + formatBytes(status.cgroupLimit);
}
// Set once getOomState() has stored a counter reading. Lets a genuine count of
// 0 act as a baseline, so the very first kill (0 -> 1) can be detected.
let oomBaselineRecorded = false;

/**
 * Get current OOM state for polling.
 *
 * Reads the oom_kill counter from the cgroup `memory.events` file and compares
 * it to the last value stored in module state. When the counter has increased
 * relative to an established baseline, the detection time and the value of
 * `memory.current` are recorded.
 *
 * Side effects: overwrites the module-level `lastOomKillCount` on every call,
 * and `lastOomAt` / `memoryCurrentAtOom` when an increase is detected.
 *
 * @returns The current counter, the last detection time (ISO string or null),
 *          whether this call detected an increase, and the memory usage
 *          recorded at the last detection (or null).
 */
export function getOomState(): OomState {
  const memoryEvents = readCgroupFile('memory.events');
  const oomKill = parseOomKill(memoryEvents);

  // A baseline exists if this function has run before, or if any caller has
  // already stored a non-zero count. Without one, a non-zero counter may just
  // be kills that happened before monitoring started, so it is not reported.
  const hasBaseline = oomBaselineRecorded || lastOomKillCount > 0;
  const oomDetected = hasBaseline && oomKill > lastOomKillCount;

  if (oomDetected) {
    // memory.current is only needed when a kill is detected; skip the read otherwise.
    const memoryCurrentStr = readCgroupFile('memory.current');
    const parsed = memoryCurrentStr ? Number.parseInt(memoryCurrentStr, 10) : Number.NaN;
    lastOomAt = Date.now();
    // Never expose NaN to API consumers; an unparsable value is reported as null.
    memoryCurrentAtOom = Number.isNaN(parsed) ? null : parsed;
  }

  lastOomKillCount = oomKill;
  oomBaselineRecorded = true;

  return {
    oomKillCount: oomKill,
    lastOomAt: lastOomAt !== null ? new Date(lastOomAt).toISOString() : null,
    oomDetected,
    memoryCurrentAtOom,
  };
}
/**
 * Placeholder for explicitly resetting OOM detection state (e.g., after the
 * user dismisses an alert).
 *
 * Currently a no-op: the body contains no statements. The `oomDetected` value
 * is a local computed on each getOomState() call rather than stored state, so
 * there is nothing to clear here, and the stored counter and timestamps are
 * left untouched.
 */
export function resetOomDetected(): void {
// Intentionally empty: oomDetected is transient and is recomputed on the next poll.
// This function exists as a hook for an explicit reset if one is needed later.
}
/**
* Start the background memory sampler.
* This should be called once when the server starts.

View file

@ -17,6 +17,7 @@ import AnalyticsDashboard from './components/AnalyticsDashboard';
import ErrorGroupPanel from './components/ErrorGroupPanel';
import SemanticNarrativePanel from './components/SemanticNarrativePanel';
import BudgetAlertPanel, { BudgetBanner } from './components/BudgetAlertPanel';
import OomAlertBanner from './components/OomAlertBanner';
import SessionDigestPanel from './components/SessionDigestPanel';
import GitIntegrationPanel from './components/GitIntegrationPanel';
import ProductivityPanel from './components/ProductivityPanel';
@ -269,6 +270,7 @@ const App: React.FC = () => {
const [showWorkerAnalytics, setShowWorkerAnalytics] = useState(false);
const [showSystemMemory, setShowSystemMemory] = useState(false);
const [budgetBannerDismissed, setBudgetBannerDismissed] = useState(false);
const [oomBannerDismissed, setOomBannerDismissed] = useState(false);
const [hideTestWorkers, setHideTestWorkers] = useState(true);
// Budget alert state polled from /api/cost/summary
@ -635,6 +637,7 @@ const App: React.FC = () => {
return (
<div className="app">
<OomAlertBanner onDismiss={() => setOomBannerDismissed(true)} />
{budgetSummary && !budgetBannerDismissed && budgetSummary.budget.warningLevel !== 'none' && (
<BudgetBanner
budget={budgetSummary.budget}

View file

@ -0,0 +1,115 @@
import React, { useState, useEffect } from 'react';
/**
 * Shape of the JSON payload the banner stores from GET /api/system/oom-state.
 */
interface OomState {
// oom_kill counter value; rendered as "Kill count" in the banner.
oomKillCount: number;
// Timestamp string of the last detected OOM kill, or null; parsed with `new Date(...)` for display.
lastOomAt: string | null;
// Whether the server reported a newly detected OOM kill; gates rendering of the banner.
oomDetected: boolean;
// Raw memory usage recorded at detection, or null. Not rendered directly.
memoryCurrentAtOom: number | null;
// Pre-formatted memory string, shown in the banner detail when present.
formattedMemoryCurrent?: string | null;
}
/**
 * Props for OomAlertBanner.
 */
interface OomAlertBannerProps {
// Invoked after the user clicks the dismiss button and the dismissal has been recorded.
onDismiss: () => void;
}
// Local storage key for tracking dismissed alerts.
// The stored value is a JSON object of the form { timestamp: <Date.now() at dismissal> }.
const OOM_DISMISS_KEY = 'fabric-oom-dismissed';
/** How long a dismissal stored in localStorage remains in effect. */
const OOM_DISMISS_TTL_MS = 60 * 60 * 1000;

/** Interval between polls of the OOM state endpoint. */
const OOM_POLL_INTERVAL_MS = 30000;

/**
 * Report whether a dismissal younger than OOM_DISMISS_TTL_MS is stored.
 * An expired or malformed record is removed. localStorage and JSON errors
 * are deliberately ignored (best effort) and treated as "not dismissed".
 */
function hasActiveOomDismissal(): boolean {
  try {
    const raw = localStorage.getItem(OOM_DISMISS_KEY);
    if (!raw) {
      return false;
    }
    const parsed: unknown = JSON.parse(raw);
    const timestamp =
      typeof parsed === 'object' && parsed !== null
        ? (parsed as { timestamp?: unknown }).timestamp
        : undefined;
    if (typeof timestamp === 'number' && Date.now() - timestamp < OOM_DISMISS_TTL_MS) {
      return true;
    }
    // Expired or unusable record, clear it
    localStorage.removeItem(OOM_DISMISS_KEY);
  } catch {
    // Ignore localStorage errors
  }
  return false;
}

/**
 * OOM Alert Banner
 *
 * Shows a red alert banner at the top of the FABRIC dashboard when an OOM kill
 * is detected. Includes the oom_kill count, the detection time and, when the
 * server provides it, the formatted memory usage at time of detection.
 *
 * The server reports `oomDetected` only on the poll that observes the counter
 * increase, so the banner latches the first detected state and keeps showing
 * it across later polls until the user dismisses it. A dismissal is saved to
 * localStorage and honoured for one hour on subsequent mounts.
 *
 * @param onDismiss Called after the user dismisses the banner.
 */
export const OomAlertBanner: React.FC<OomAlertBannerProps> = ({ onDismiss }) => {
  const [oomState, setOomState] = useState<OomState | null>(null);
  const [dismissed, setDismissed] = useState(false);

  // Check for previously dismissed alert (dismissal expires after 1 hour)
  useEffect(() => {
    if (hasActiveOomDismissal()) {
      setDismissed(true);
    }
  }, []);

  // Poll OOM state every 30 seconds
  useEffect(() => {
    // Guards against updating state from a fetch that resolves after unmount.
    let cancelled = false;

    const pollOomState = async () => {
      try {
        const res = await fetch('/api/system/oom-state');
        if (!res.ok) {
          return;
        }
        const data: OomState = await res.json();
        if (cancelled) {
          return;
        }
        // Latch a detected state: a later "nothing new" response must not hide
        // an alert the user has not dismissed yet. A newer detection replaces it.
        setOomState((prev) =>
          !data.oomDetected && prev !== null && prev.oomDetected ? prev : data
        );
      } catch (err) {
        console.error('Failed to fetch OOM state:', err);
      }
    };

    // Initial poll
    pollOomState();

    const interval = setInterval(pollOomState, OOM_POLL_INTERVAL_MS);
    return () => {
      cancelled = true;
      clearInterval(interval);
    };
  }, []);

  // Hide the banner, remember the dismissal, then notify the parent.
  const handleDismiss = () => {
    setDismissed(true);
    try {
      localStorage.setItem(OOM_DISMISS_KEY, JSON.stringify({ timestamp: Date.now() }));
    } catch {
      // Ignore localStorage errors
    }
    onDismiss();
  };

  // Don't show if dismissed or no OOM detected
  if (dismissed || !oomState || !oomState.oomDetected) {
    return null;
  }

  // Wall-clock time of the detection in the viewer's locale (not an elapsed duration).
  const detectedAtLabel = oomState.lastOomAt
    ? new Date(oomState.lastOomAt).toLocaleTimeString()
    : 'Unknown';

  return (
    <div className="oom-alert-banner" role="alert">
      <div className="oom-alert-banner-content">
        <span className="oom-alert-banner-icon"></span>
        <span className="oom-alert-banner-text">
          <strong>OOM kill detected</strong> check system memory
          {' '}
          <span className="oom-alert-banner-detail">
            (Kill count: {oomState.oomKillCount} at {detectedAtLabel}
            {oomState.formattedMemoryCurrent && `, memory: ${oomState.formattedMemoryCurrent}`})
          </span>
        </span>
        <button
          type="button"
          className="oom-alert-banner-dismiss"
          onClick={handleDismiss}
          title="Dismiss (will auto-clear after 1 hour)"
        >
          ×
        </button>
      </div>
    </div>
  );
};

export default OomAlertBanner;

View file

@ -8938,3 +8938,71 @@ body {
color: var(--text-secondary);
text-align: right;
}
/* ============================================
   OOM Alert Banner (persistent, top of page)
   ============================================ */

/* Banner container: sticks to the top of the scroll area above page content. */
.oom-alert-banner {
  position: sticky;
  top: 0;
  z-index: 100;
  background: linear-gradient(135deg, #4a0000, #660000);
  border-bottom: 2px solid var(--error);
  /* NOTE(review): relies on a `slideDown` keyframes rule defined elsewhere in this stylesheet. */
  animation: slideDown 0.3s ease-out;
  box-shadow: 0 2px 8px rgba(255, 0, 0, 0.3);
}

/* Single-row layout: icon, message text (flexible), dismiss button. */
.oom-alert-banner-content {
  display: flex;
  align-items: center;
  gap: 0.75rem;
  padding: 0.75rem 1rem;
  font-size: 0.9rem;
  color: var(--text-primary);
}

.oom-alert-banner-icon {
  font-size: 1.2rem;
  min-width: 1.5em;
  text-align: center;
  animation: oom-alert-banner-pulse 2s infinite;
}

/* Scoped name: @keyframes identifiers are global, and a generic name such as
   `pulse` would override (or be overridden by) any other rule of that name. */
@keyframes oom-alert-banner-pulse {
  0%, 100% { opacity: 1; }
  50% { opacity: 0.6; }
}

/* Message text takes the remaining width so the dismiss button sits at the far edge. */
.oom-alert-banner-text {
  flex: 1;
  font-weight: 500;
}

/* De-emphasised detail (kill count, time, memory). */
.oom-alert-banner-detail {
  color: var(--text-secondary);
  font-size: 0.85rem;
  font-weight: 400;
}

.oom-alert-banner-dismiss {
  background: transparent;
  border: none;
  color: var(--text-secondary);
  font-size: 1.2rem;
  line-height: 1;
  padding: 0.25rem 0.5rem;
  cursor: pointer;
  transition: color 0.15s ease;
  min-width: 2rem;
  height: 2rem;
  display: flex;
  align-items: center;
  justify-content: center;
  border-radius: 4px;
}

.oom-alert-banner-dismiss:hover {
  color: var(--text-primary);
  background: rgba(255, 255, 255, 0.1);
}

View file

@ -1687,6 +1687,17 @@ export function createWebServer(options: WebServerOptions): WebServer {
res.json(alert);
});
// Get OOM state for polling (oom_kill count and detection state).
// Responds with the fields returned by getOomState() plus
// `formattedMemoryCurrent`, a display string for memoryCurrentAtOom (or null).
app.get('/api/system/oom-state', async (_req: Request, res: Response) => {
  // An async handler's rejection is not routed to Express 4's error handling
  // automatically, so failures are caught here to avoid leaving the request hanging.
  try {
    const { getOomState, formatBytes } = await import('../systemCgroupMonitor.js');
    const oomState = getOomState();
    res.json({
      ...oomState,
      formattedMemoryCurrent: oomState.memoryCurrentAtOom
        ? formatBytes(oomState.memoryCurrentAtOom)
        : null,
    });
  } catch (err: unknown) {
    res.status(500).json({
      error: err instanceof Error ? err.message : 'Failed to read OOM state',
    });
  }
});
// Serve static frontend files
const staticPath = join(__dirname, 'public');
app.use(express.static(staticPath));