feat(bf-dm8v): implement OOM event detection and alert banner
Backend changes: - Add getOomState() to systemCgroupMonitor.ts for lightweight OOM polling - Track oomKillCount, lastOomAt, oomDetected, memoryCurrentAtOom - Add GET /api/system/oom-state endpoint in server.ts Frontend changes: - Create OomAlertBanner component that polls /api/system/oom-state every 30s - Show persistent red alert banner when oomDetected=true - Display oomKillCount and memory.current at time of detection - Banner dismissable via X button; auto-clears after 1 hour (localStorage) - Add CSS styling for the banner (red background, icon, text) - Integrate banner into App.tsx at top of dashboard Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
933f66cbfc
commit
ea1406ac2d
5 changed files with 263 additions and 0 deletions
|
|
@ -24,6 +24,13 @@ export interface MemoryHistorySample {
|
|||
swapUsage: number | null;
|
||||
}
|
||||
|
||||
export interface OomState {
|
||||
oomKillCount: number;
|
||||
lastOomAt: string | null; // ISO timestamp string or null
|
||||
oomDetected: boolean; // True if oom_kill increased since last check
|
||||
memoryCurrentAtOom: number | null; // memory.current at time of OOM detection
|
||||
}
|
||||
|
||||
export interface SystemMemoryStatus {
|
||||
totalMemory: number | null;
|
||||
availableMemory: number | null;
|
||||
|
|
@ -39,11 +46,17 @@ export interface SystemMemoryStatus {
|
|||
oomRisk: 'none' | 'low' | 'medium' | 'high' | 'critical';
|
||||
oomKill: number;
|
||||
oom: number;
|
||||
oomState: OomState;
|
||||
}
|
||||
|
||||
// In-memory history store
|
||||
const memoryHistory: MemoryHistorySample[] = [];
|
||||
|
||||
// OOM state tracking
|
||||
let lastOomKillCount = 0;
|
||||
let lastOomAt: number | null = null;
|
||||
let memoryCurrentAtOom: number | null = null;
|
||||
|
||||
/**
|
||||
* Read a file and return its trimmed content, or null if file doesn't exist.
|
||||
*/
|
||||
|
|
@ -128,6 +141,16 @@ export function getSystemMemoryStatus(): SystemMemoryStatus {
|
|||
const memoryEvents = readCgroupFile('memory.events');
|
||||
const oomKill = parseOomKill(memoryEvents);
|
||||
|
||||
// Track OOM state changes
|
||||
let oomDetected = false;
|
||||
if (oomKill > lastOomKillCount && lastOomKillCount > 0) {
|
||||
// OOM kill detected!
|
||||
oomDetected = true;
|
||||
lastOomAt = Date.now();
|
||||
memoryCurrentAtOom = cgroupUsage;
|
||||
}
|
||||
lastOomKillCount = oomKill;
|
||||
|
||||
// Read memory.stat for additional stats
|
||||
const memoryStatContent = readCgroupFile('memory.stat');
|
||||
const memoryStat = parseMemoryStat(memoryStatContent);
|
||||
|
|
@ -228,6 +251,12 @@ export function getSystemMemoryStatus(): SystemMemoryStatus {
|
|||
oomRisk,
|
||||
oomKill,
|
||||
oom: oomKill, // Alias for compatibility
|
||||
oomState: {
|
||||
oomKillCount: oomKill,
|
||||
lastOomAt: lastOomAt ? new Date(lastOomAt).toISOString() : null,
|
||||
oomDetected,
|
||||
memoryCurrentAtOom,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -246,6 +275,43 @@ export function getMemorySummary(): string {
|
|||
return formatBytes(status.cgroupUsage) + ' / ' + formatBytes(status.cgroupLimit);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get current OOM state for polling.
|
||||
* This is a lightweight call that just reads the oom_kill counter
|
||||
* and compares to the last known value.
|
||||
*/
|
||||
export function getOomState(): OomState {
|
||||
const memoryEvents = readCgroupFile('memory.events');
|
||||
const oomKill = parseOomKill(memoryEvents);
|
||||
const memoryCurrentStr = readCgroupFile('memory.current');
|
||||
const memoryCurrent = memoryCurrentStr ? parseInt(memoryCurrentStr, 10) : null;
|
||||
|
||||
// Check if OOM kill increased
|
||||
let oomDetected = false;
|
||||
if (oomKill > lastOomKillCount && lastOomKillCount > 0) {
|
||||
oomDetected = true;
|
||||
lastOomAt = Date.now();
|
||||
memoryCurrentAtOom = memoryCurrent;
|
||||
}
|
||||
lastOomKillCount = oomKill;
|
||||
|
||||
return {
|
||||
oomKillCount: oomKill,
|
||||
lastOomAt: lastOomAt ? new Date(lastOomAt).toISOString() : null,
|
||||
oomDetected,
|
||||
memoryCurrentAtOom,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset OOM detection state (e.g., after user dismisses alert).
|
||||
* This clears the oomDetected flag but keeps the historical oomKillCount.
|
||||
*/
|
||||
export function resetOomDetected(): void {
|
||||
// The oomDetected flag is transient and will be cleared on the next poll
|
||||
// This function exists for explicit reset if needed
|
||||
}
|
||||
|
||||
/**
|
||||
* Start the background memory sampler.
|
||||
* This should be called once when the server starts.
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ import AnalyticsDashboard from './components/AnalyticsDashboard';
|
|||
import ErrorGroupPanel from './components/ErrorGroupPanel';
|
||||
import SemanticNarrativePanel from './components/SemanticNarrativePanel';
|
||||
import BudgetAlertPanel, { BudgetBanner } from './components/BudgetAlertPanel';
|
||||
import OomAlertBanner from './components/OomAlertBanner';
|
||||
import SessionDigestPanel from './components/SessionDigestPanel';
|
||||
import GitIntegrationPanel from './components/GitIntegrationPanel';
|
||||
import ProductivityPanel from './components/ProductivityPanel';
|
||||
|
|
@ -269,6 +270,7 @@ const App: React.FC = () => {
|
|||
const [showWorkerAnalytics, setShowWorkerAnalytics] = useState(false);
|
||||
const [showSystemMemory, setShowSystemMemory] = useState(false);
|
||||
const [budgetBannerDismissed, setBudgetBannerDismissed] = useState(false);
|
||||
const [oomBannerDismissed, setOomBannerDismissed] = useState(false);
|
||||
const [hideTestWorkers, setHideTestWorkers] = useState(true);
|
||||
|
||||
// Budget alert state polled from /api/cost/summary
|
||||
|
|
@ -635,6 +637,7 @@ const App: React.FC = () => {
|
|||
|
||||
return (
|
||||
<div className="app">
|
||||
<OomAlertBanner onDismiss={() => setOomBannerDismissed(true)} />
|
||||
{budgetSummary && !budgetBannerDismissed && budgetSummary.budget.warningLevel !== 'none' && (
|
||||
<BudgetBanner
|
||||
budget={budgetSummary.budget}
|
||||
|
|
|
|||
115
src/web/frontend/src/components/OomAlertBanner.tsx
Normal file
115
src/web/frontend/src/components/OomAlertBanner.tsx
Normal file
|
|
@ -0,0 +1,115 @@
|
|||
import React, { useState, useEffect } from 'react';
|
||||
|
||||
interface OomState {
|
||||
oomKillCount: number;
|
||||
lastOomAt: string | null;
|
||||
oomDetected: boolean;
|
||||
memoryCurrentAtOom: number | null;
|
||||
formattedMemoryCurrent?: string | null;
|
||||
}
|
||||
|
||||
interface OomAlertBannerProps {
|
||||
onDismiss: () => void;
|
||||
}
|
||||
|
||||
// Local storage key for tracking dismissed alerts
|
||||
const OOM_DISMISS_KEY = 'fabric-oom-dismissed';
|
||||
|
||||
/**
|
||||
* OOM Alert Banner
|
||||
*
|
||||
* Shows a persistent red alert banner at the top of the FABRIC dashboard
|
||||
* when an OOM kill is detected. Includes the oom_kill count and memory.current
|
||||
* at time of detection. Dismissible via X button; auto-clears after 1 hour.
|
||||
*/
|
||||
export const OomAlertBanner: React.FC<OomAlertBannerProps> = ({ onDismiss }) => {
|
||||
const [oomState, setOomState] = useState<OomState | null>(null);
|
||||
const [dismissed, setDismissed] = useState(false);
|
||||
|
||||
// Check for previously dismissed alert (auto-clear after 1 hour)
|
||||
useEffect(() => {
|
||||
try {
|
||||
const dismissedData = localStorage.getItem(OOM_DISMISS_KEY);
|
||||
if (dismissedData) {
|
||||
const { timestamp } = JSON.parse(dismissedData);
|
||||
const oneHour = 60 * 60 * 1000;
|
||||
if (Date.now() - timestamp < oneHour) {
|
||||
setDismissed(true);
|
||||
} else {
|
||||
// Expired, clear it
|
||||
localStorage.removeItem(OOM_DISMISS_KEY);
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
// Ignore localStorage errors
|
||||
}
|
||||
}, []);
|
||||
|
||||
// Poll OOM state every 30 seconds
|
||||
useEffect(() => {
|
||||
const pollOomState = async () => {
|
||||
try {
|
||||
const res = await fetch('/api/system/oom-state');
|
||||
if (res.ok) {
|
||||
const data = await res.json();
|
||||
setOomState(data);
|
||||
}
|
||||
} catch (err) {
|
||||
console.error('Failed to fetch OOM state:', err);
|
||||
}
|
||||
};
|
||||
|
||||
// Initial poll
|
||||
pollOomState();
|
||||
|
||||
// Poll every 30 seconds
|
||||
const interval = setInterval(pollOomState, 30000);
|
||||
return () => clearInterval(interval);
|
||||
}, []);
|
||||
|
||||
// Handle dismiss
|
||||
const handleDismiss = () => {
|
||||
setDismissed(true);
|
||||
// Save dismissal timestamp to localStorage
|
||||
try {
|
||||
localStorage.setItem(OOM_DISMISS_KEY, JSON.stringify({ timestamp: Date.now() }));
|
||||
} catch {
|
||||
// Ignore localStorage errors
|
||||
}
|
||||
onDismiss();
|
||||
};
|
||||
|
||||
// Don't show if dismissed or no OOM detected
|
||||
if (dismissed || !oomState || !oomState.oomDetected) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const timeSinceOom = oomState.lastOomAt
|
||||
? new Date(oomState.lastOomAt).toLocaleTimeString()
|
||||
: 'Unknown';
|
||||
|
||||
return (
|
||||
<div className="oom-alert-banner">
|
||||
<div className="oom-alert-banner-content">
|
||||
<span className="oom-alert-banner-icon">⚠️</span>
|
||||
<span className="oom-alert-banner-text">
|
||||
<strong>OOM kill detected</strong> — check system memory
|
||||
{' '}
|
||||
<span className="oom-alert-banner-detail">
|
||||
(Kill count: {oomState.oomKillCount} at {timeSinceOom}
|
||||
{oomState.formattedMemoryCurrent && `, memory: ${oomState.formattedMemoryCurrent}`})
|
||||
</span>
|
||||
</span>
|
||||
<button
|
||||
className="oom-alert-banner-dismiss"
|
||||
onClick={handleDismiss}
|
||||
title="Dismiss (will auto-clear after 1 hour)"
|
||||
>
|
||||
×
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
|
||||
export default OomAlertBanner;
|
||||
|
|
@ -8938,3 +8938,71 @@ body {
|
|||
color: var(--text-secondary);
|
||||
text-align: right;
|
||||
}
|
||||
|
||||
/* ============================================
|
||||
OOM Alert Banner (persistent, top of page)
|
||||
============================================ */
|
||||
|
||||
.oom-alert-banner {
|
||||
position: sticky;
|
||||
top: 0;
|
||||
z-index: 100;
|
||||
background: linear-gradient(135deg, #4a0000, #660000);
|
||||
border-bottom: 2px solid var(--error);
|
||||
animation: slideDown 0.3s ease-out;
|
||||
box-shadow: 0 2px 8px rgba(255, 0, 0, 0.3);
|
||||
}
|
||||
|
||||
.oom-alert-banner-content {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.75rem;
|
||||
padding: 0.75rem 1rem;
|
||||
font-size: 0.9rem;
|
||||
color: var(--text-primary);
|
||||
}
|
||||
|
||||
.oom-alert-banner-icon {
|
||||
font-size: 1.2rem;
|
||||
min-width: 1.5em;
|
||||
text-align: center;
|
||||
animation: pulse 2s infinite;
|
||||
}
|
||||
|
||||
@keyframes pulse {
|
||||
0%, 100% { opacity: 1; }
|
||||
50% { opacity: 0.6; }
|
||||
}
|
||||
|
||||
.oom-alert-banner-text {
|
||||
flex: 1;
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
.oom-alert-banner-detail {
|
||||
color: var(--text-secondary);
|
||||
font-size: 0.85rem;
|
||||
font-weight: 400;
|
||||
}
|
||||
|
||||
.oom-alert-banner-dismiss {
|
||||
background: transparent;
|
||||
border: none;
|
||||
color: var(--text-secondary);
|
||||
font-size: 1.2rem;
|
||||
line-height: 1;
|
||||
padding: 0.25rem 0.5rem;
|
||||
cursor: pointer;
|
||||
transition: color 0.15s ease;
|
||||
min-width: 2rem;
|
||||
height: 2rem;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
border-radius: 4px;
|
||||
}
|
||||
|
||||
.oom-alert-banner-dismiss:hover {
|
||||
color: var(--text-primary);
|
||||
background: rgba(255, 255, 255, 0.1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1687,6 +1687,17 @@ export function createWebServer(options: WebServerOptions): WebServer {
|
|||
res.json(alert);
|
||||
});
|
||||
|
||||
// Get OOM state for polling (oom_kill count and detection state)
|
||||
app.get('/api/system/oom-state', async (_req: Request, res: Response) => {
|
||||
const { getOomState, formatBytes } = await import('../systemCgroupMonitor.js');
|
||||
const oomState = getOomState();
|
||||
|
||||
res.json({
|
||||
...oomState,
|
||||
formattedMemoryCurrent: oomState.memoryCurrentAtOom ? formatBytes(oomState.memoryCurrentAtOom) : null,
|
||||
});
|
||||
});
|
||||
|
||||
// Serve static frontend files
|
||||
const staticPath = join(__dirname, 'public');
|
||||
app.use(express.static(staticPath));
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue