diff --git a/src/systemCgroupMonitor.ts b/src/systemCgroupMonitor.ts
index 78073f0..b963239 100644
--- a/src/systemCgroupMonitor.ts
+++ b/src/systemCgroupMonitor.ts
@@ -24,6 +24,13 @@ export interface MemoryHistorySample {
   swapUsage: number | null;
 }
 
+export interface OomState {
+  oomKillCount: number;
+  lastOomAt: string | null; // ISO timestamp string or null
+  oomDetected: boolean; // True from an observed oom_kill increase until resetOomDetected()
+  memoryCurrentAtOom: number | null; // memory.current at time of OOM detection
+}
+
 export interface SystemMemoryStatus {
   totalMemory: number | null;
   availableMemory: number | null;
@@ -39,11 +46,45 @@ export interface SystemMemoryStatus {
   oomRisk: 'none' | 'low' | 'medium' | 'high' | 'critical';
   oomKill: number;
   oom: number;
+  oomState: OomState;
 }
 
 // In-memory history store
 const memoryHistory: MemoryHistorySample[] = [];
 
+// OOM state tracking.
+// Baseline oom_kill count from the previous read. It is null until the first
+// read, so kills that happened before startup are not reported as new, while
+// a later 0 -> 1 transition still is.
+let lastOomKillCount: number | null = null;
+let lastOomAt: number | null = null;
+let memoryCurrentAtOom: number | null = null;
+// Sticky flag: set on an observed increase, cleared only by resetOomDetected().
+let oomDetectedPending = false;
+
+/**
+ * Fold a fresh oom_kill reading into the shared OOM state.
+ *
+ * @param oomKill - current oom_kill counter parsed from memory.events
+ * @param memoryCurrent - memory usage to record if this reading reveals a new kill
+ * @returns snapshot of the OOM state after applying the reading
+ */
+function recordOomKillCount(oomKill: number, memoryCurrent: number | null): OomState {
+  if (lastOomKillCount !== null && oomKill > lastOomKillCount) {
+    oomDetectedPending = true;
+    lastOomAt = Date.now();
+    memoryCurrentAtOom = memoryCurrent;
+  }
+  lastOomKillCount = oomKill;
+
+  return {
+    oomKillCount: oomKill,
+    lastOomAt: lastOomAt !== null ? new Date(lastOomAt).toISOString() : null,
+    oomDetected: oomDetectedPending,
+    memoryCurrentAtOom,
+  };
+}
+
 /**
  * Read a file and return its trimmed content, or null if file doesn't exist.
  */
@@ -128,6 +169,10 @@ export function getSystemMemoryStatus(): SystemMemoryStatus {
   const memoryEvents = readCgroupFile('memory.events');
   const oomKill = parseOomKill(memoryEvents);
 
+  // Track OOM state changes through the shared recorder so this function and
+  // getOomState() report the same state instead of consuming each other's events.
+  const oomStateSnapshot = recordOomKillCount(oomKill, cgroupUsage);
+
   // Read memory.stat for additional stats
   const memoryStatContent = readCgroupFile('memory.stat');
   const memoryStat = parseMemoryStat(memoryStatContent);
@@ -228,6 +273,7 @@ export function getSystemMemoryStatus(): SystemMemoryStatus {
     oomRisk,
     oomKill,
     oom: oomKill, // Alias for compatibility
+    oomState: oomStateSnapshot,
   };
 }
 
@@ -246,6 +292,31 @@ export function getMemorySummary(): string {
   return formatBytes(status.cgroupUsage) + ' / ' + formatBytes(status.cgroupLimit);
 }
 
+/**
+ * Get current OOM state for polling.
+ * This is a lightweight call that reads the oom_kill counter and memory.current
+ * and folds them into the shared OOM state.
+ */
+export function getOomState(): OomState {
+  const memoryEvents = readCgroupFile('memory.events');
+  const oomKill = parseOomKill(memoryEvents);
+  const memoryCurrentStr = readCgroupFile('memory.current');
+  // Treat a missing or unparsable memory.current as unknown rather than NaN.
+  const parsedMemoryCurrent = memoryCurrentStr ? Number.parseInt(memoryCurrentStr, 10) : NaN;
+  const memoryCurrent = Number.isNaN(parsedMemoryCurrent) ? null : parsedMemoryCurrent;
+
+  return recordOomKillCount(oomKill, memoryCurrent);
+}
+
+/**
+ * Reset OOM detection state (e.g., after user dismisses alert).
+ * This clears the oomDetected flag but keeps the historical oomKillCount,
+ * lastOomAt and memoryCurrentAtOom.
+ */
+export function resetOomDetected(): void {
+  oomDetectedPending = false;
+}
+
 /**
  * Start the background memory sampler.
  * This should be called once when the server starts.
diff --git a/src/web/frontend/src/App.tsx b/src/web/frontend/src/App.tsx index 78dfa40..20d9994 100644 --- a/src/web/frontend/src/App.tsx +++ b/src/web/frontend/src/App.tsx @@ -17,6 +17,7 @@ import AnalyticsDashboard from './components/AnalyticsDashboard'; import ErrorGroupPanel from './components/ErrorGroupPanel'; import SemanticNarrativePanel from './components/SemanticNarrativePanel'; import BudgetAlertPanel, { BudgetBanner } from './components/BudgetAlertPanel'; +import OomAlertBanner from './components/OomAlertBanner'; import SessionDigestPanel from './components/SessionDigestPanel'; import GitIntegrationPanel from './components/GitIntegrationPanel'; import ProductivityPanel from './components/ProductivityPanel'; @@ -269,6 +270,7 @@ const App: React.FC = () => { const [showWorkerAnalytics, setShowWorkerAnalytics] = useState(false); const [showSystemMemory, setShowSystemMemory] = useState(false); const [budgetBannerDismissed, setBudgetBannerDismissed] = useState(false); + const [oomBannerDismissed, setOomBannerDismissed] = useState(false); const [hideTestWorkers, setHideTestWorkers] = useState(true); // Budget alert state polled from /api/cost/summary @@ -635,6 +637,7 @@ const App: React.FC = () => { return (
+ setOomBannerDismissed(true)} /> {budgetSummary && !budgetBannerDismissed && budgetSummary.budget.warningLevel !== 'none' && ( void; +} + +// Local storage key for tracking dismissed alerts +const OOM_DISMISS_KEY = 'fabric-oom-dismissed'; + +/** + * OOM Alert Banner + * + * Shows a persistent red alert banner at the top of the FABRIC dashboard + * when an OOM kill is detected. Includes the oom_kill count and memory.current + * at time of detection. Dismissible via X button; auto-clears after 1 hour. + */ +export const OomAlertBanner: React.FC = ({ onDismiss }) => { + const [oomState, setOomState] = useState(null); + const [dismissed, setDismissed] = useState(false); + + // Check for previously dismissed alert (auto-clear after 1 hour) + useEffect(() => { + try { + const dismissedData = localStorage.getItem(OOM_DISMISS_KEY); + if (dismissedData) { + const { timestamp } = JSON.parse(dismissedData); + const oneHour = 60 * 60 * 1000; + if (Date.now() - timestamp < oneHour) { + setDismissed(true); + } else { + // Expired, clear it + localStorage.removeItem(OOM_DISMISS_KEY); + } + } + } catch { + // Ignore localStorage errors + } + }, []); + + // Poll OOM state every 30 seconds + useEffect(() => { + const pollOomState = async () => { + try { + const res = await fetch('/api/system/oom-state'); + if (res.ok) { + const data = await res.json(); + setOomState(data); + } + } catch (err) { + console.error('Failed to fetch OOM state:', err); + } + }; + + // Initial poll + pollOomState(); + + // Poll every 30 seconds + const interval = setInterval(pollOomState, 30000); + return () => clearInterval(interval); + }, []); + + // Handle dismiss + const handleDismiss = () => { + setDismissed(true); + // Save dismissal timestamp to localStorage + try { + localStorage.setItem(OOM_DISMISS_KEY, JSON.stringify({ timestamp: Date.now() })); + } catch { + // Ignore localStorage errors + } + onDismiss(); + }; + + // Don't show if dismissed or no OOM detected + if (dismissed || !oomState || 
!oomState.oomDetected) { + return null; + } + + const timeSinceOom = oomState.lastOomAt + ? new Date(oomState.lastOomAt).toLocaleTimeString() + : 'Unknown'; + + return ( +
+
+ ⚠️ + + OOM kill detected — check system memory + {' '} + + (Kill count: {oomState.oomKillCount} at {timeSinceOom} + {oomState.formattedMemoryCurrent && `, memory: ${oomState.formattedMemoryCurrent}`}) + + + +
+
+  );
+};
+
+export default OomAlertBanner;
diff --git a/src/web/frontend/src/index.css b/src/web/frontend/src/index.css
index 5af35a4..06bcd59 100644
--- a/src/web/frontend/src/index.css
+++ b/src/web/frontend/src/index.css
@@ -8938,3 +8938,72 @@ body {
   color: var(--text-secondary);
   text-align: right;
 }
+
+/* ============================================
+   OOM Alert Banner (persistent, top of page)
+   ============================================ */
+
+.oom-alert-banner {
+  position: sticky;
+  top: 0;
+  z-index: 100;
+  background: linear-gradient(135deg, #4a0000, #660000);
+  border-bottom: 2px solid var(--error);
+  animation: slideDown 0.3s ease-out;
+  box-shadow: 0 2px 8px rgba(255, 0, 0, 0.3);
+}
+
+.oom-alert-banner-content {
+  display: flex;
+  align-items: center;
+  gap: 0.75rem;
+  padding: 0.75rem 1rem;
+  font-size: 0.9rem;
+  color: var(--text-primary);
+}
+
+.oom-alert-banner-icon {
+  font-size: 1.2rem;
+  min-width: 1.5em;
+  text-align: center;
+  animation: oom-alert-pulse 2s infinite;
+}
+
+/* Namespaced so it cannot clash with any other global "pulse" keyframes. */
+@keyframes oom-alert-pulse {
+  0%, 100% { opacity: 1; }
+  50% { opacity: 0.6; }
+}
+
+.oom-alert-banner-text {
+  flex: 1;
+  font-weight: 500;
+}
+
+.oom-alert-banner-detail {
+  color: var(--text-secondary);
+  font-size: 0.85rem;
+  font-weight: 400;
+}
+
+.oom-alert-banner-dismiss {
+  background: transparent;
+  border: none;
+  color: var(--text-secondary);
+  font-size: 1.2rem;
+  line-height: 1;
+  padding: 0.25rem 0.5rem;
+  cursor: pointer;
+  transition: color 0.15s ease;
+  min-width: 2rem;
+  height: 2rem;
+  display: flex;
+  align-items: center;
+  justify-content: center;
+  border-radius: 4px;
+}
+
+.oom-alert-banner-dismiss:hover {
+  color: var(--text-primary);
+  background: rgba(255, 255, 255, 0.1);
+}
diff --git a/src/web/server.ts b/src/web/server.ts
index 65f70bd..6714b38 100644
--- a/src/web/server.ts
+++ b/src/web/server.ts
@@ -1687,6 +1687,24 @@ export function createWebServer(options: WebServerOptions): WebServer {
     res.json(alert);
   });
 
+  // Get OOM state for polling (oom_kill count and detection state)
+  app.get('/api/system/oom-state', async (_req: Request, res: Response) => {
+    try {
+      const { getOomState, formatBytes } = await import('../systemCgroupMonitor.js');
+      const oomState = getOomState();
+
+      res.json({
+        ...oomState,
+        formattedMemoryCurrent: oomState.memoryCurrentAtOom ? formatBytes(oomState.memoryCurrentAtOom) : null,
+      });
+    } catch (err) {
+      // Express versions before 5 do not forward a rejected async handler to the
+      // error middleware, so fail the request explicitly instead of leaving it hanging.
+      console.error('Failed to read OOM state:', err);
+      res.status(500).json({ error: 'Failed to read OOM state' });
+    }
+  });
+
   // Serve static frontend files
   const staticPath = join(__dirname, 'public');
   app.use(express.static(staticPath));