feat(bf-dm8v): implement OOM event detection and alert banner
Backend changes:
- Add getOomState() to systemCgroupMonitor.ts for lightweight OOM polling
- Track oomKillCount, lastOomAt, oomDetected, memoryCurrentAtOom
- Add GET /api/system/oom-state endpoint in server.ts

Frontend changes:
- Create OomAlertBanner component that polls /api/system/oom-state every 30s
- Show persistent red alert banner when oomDetected=true
- Display oomKillCount and memory.current at time of detection
- Banner dismissible via X button; the dismissal auto-clears after 1 hour (stored in localStorage)
- Add CSS styling for the banner (red background, icon, text)
- Integrate banner into App.tsx at top of dashboard

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
933f66cbfc
commit
ea1406ac2d
5 changed files with 263 additions and 0 deletions
|
|
@ -24,6 +24,13 @@ export interface MemoryHistorySample {
|
||||||
swapUsage: number | null;
|
swapUsage: number | null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * OOM-kill tracking snapshot, returned by getOomState() and embedded in
 * SystemMemoryStatus as `oomState`.
 */
export interface OomState {
|
||||||
|
oomKillCount: number;
|
||||||
|
lastOomAt: string | null; // ISO timestamp string or null
|
||||||
|
oomDetected: boolean; // True if oom_kill increased since last check
|
||||||
|
memoryCurrentAtOom: number | null; // memory.current at time of OOM detection
|
||||||
|
}
|
||||||
|
|
||||||
export interface SystemMemoryStatus {
|
export interface SystemMemoryStatus {
|
||||||
totalMemory: number | null;
|
totalMemory: number | null;
|
||||||
availableMemory: number | null;
|
availableMemory: number | null;
|
||||||
|
|
@ -39,11 +46,17 @@ export interface SystemMemoryStatus {
|
||||||
oomRisk: 'none' | 'low' | 'medium' | 'high' | 'critical';
|
oomRisk: 'none' | 'low' | 'medium' | 'high' | 'critical';
|
||||||
oomKill: number;
|
oomKill: number;
|
||||||
oom: number;
|
oom: number;
|
||||||
|
oomState: OomState;
|
||||||
}
|
}
|
||||||
|
|
||||||
// In-memory history store
|
// In-memory history store
|
||||||
const memoryHistory: MemoryHistorySample[] = [];
|
const memoryHistory: MemoryHistorySample[] = [];
|
||||||
|
|
||||||
|
// OOM state tracking (module-level; read and written by both
// getSystemMemoryStatus() and getOomState())
|
||||||
|
// Last oom_kill counter value observed; overwritten on every read by either function.
// NOTE(review): the detection checks require this to be > 0, so a first-ever
// kill (0 -> 1) is never reported, and whichever function reads the counter
// first after a kill consumes the detection for the other — confirm intended.
let lastOomKillCount = 0;
|
||||||
|
// Date.now() value (epoch milliseconds) recorded when an increase was last detected.
let lastOomAt: number | null = null;
|
||||||
|
// Memory usage captured at the moment an increase was last detected.
let memoryCurrentAtOom: number | null = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Read a file and return its trimmed content, or null if file doesn't exist.
|
* Read a file and return its trimmed content, or null if file doesn't exist.
|
||||||
*/
|
*/
|
||||||
|
|
@ -128,6 +141,16 @@ export function getSystemMemoryStatus(): SystemMemoryStatus {
|
||||||
const memoryEvents = readCgroupFile('memory.events');
|
const memoryEvents = readCgroupFile('memory.events');
|
||||||
const oomKill = parseOomKill(memoryEvents);
|
const oomKill = parseOomKill(memoryEvents);
|
||||||
|
|
||||||
|
// Track OOM state changes
|
||||||
|
let oomDetected = false;
|
||||||
|
if (oomKill > lastOomKillCount && lastOomKillCount > 0) {
|
||||||
|
// OOM kill detected!
|
||||||
|
oomDetected = true;
|
||||||
|
lastOomAt = Date.now();
|
||||||
|
memoryCurrentAtOom = cgroupUsage;
|
||||||
|
}
|
||||||
|
lastOomKillCount = oomKill;
|
||||||
|
|
||||||
// Read memory.stat for additional stats
|
// Read memory.stat for additional stats
|
||||||
const memoryStatContent = readCgroupFile('memory.stat');
|
const memoryStatContent = readCgroupFile('memory.stat');
|
||||||
const memoryStat = parseMemoryStat(memoryStatContent);
|
const memoryStat = parseMemoryStat(memoryStatContent);
|
||||||
|
|
@ -228,6 +251,12 @@ export function getSystemMemoryStatus(): SystemMemoryStatus {
|
||||||
oomRisk,
|
oomRisk,
|
||||||
oomKill,
|
oomKill,
|
||||||
oom: oomKill, // Alias for compatibility
|
oom: oomKill, // Alias for compatibility
|
||||||
|
oomState: {
|
||||||
|
oomKillCount: oomKill,
|
||||||
|
lastOomAt: lastOomAt ? new Date(lastOomAt).toISOString() : null,
|
||||||
|
oomDetected,
|
||||||
|
memoryCurrentAtOom,
|
||||||
|
},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -246,6 +275,43 @@ export function getMemorySummary(): string {
|
||||||
return formatBytes(status.cgroupUsage) + ' / ' + formatBytes(status.cgroupLimit);
|
return formatBytes(status.cgroupUsage) + ' / ' + formatBytes(status.cgroupLimit);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Get current OOM state for polling.
 *
 * Reads the `oom_kill` counter from `memory.events` and compares it with the
 * last value stored in module state. `memory.current` is read only when an
 * increase is detected, so the common "nothing happened" poll costs a single
 * file read.
 *
 * Side effects: updates the module-level `lastOomKillCount` on every call and,
 * on detection, `lastOomAt` and `memoryCurrentAtOom`.
 *
 * NOTE(review): `oomDetected` is true only for the one call that observes the
 * increase, and the `lastOomKillCount > 0` guard means an increase from a
 * stored value of 0 is never reported. The counter is shared with
 * getSystemMemoryStatus(), so whichever runs first after a kill consumes the
 * detection. Consumers that need a durable signal should rely on `lastOomAt`.
 *
 * @returns The current counter value plus the most recent detection details.
 */
export function getOomState(): OomState {
  const oomKill = parseOomKill(readCgroupFile('memory.events'));

  // Check if OOM kill increased
  const oomDetected = oomKill > lastOomKillCount && lastOomKillCount > 0;
  if (oomDetected) {
    const memoryCurrentStr = readCgroupFile('memory.current');
    const memoryCurrent = memoryCurrentStr ? parseInt(memoryCurrentStr, 10) : NaN;

    lastOomAt = Date.now();
    // Store null rather than NaN when the file is missing or not numeric.
    memoryCurrentAtOom = Number.isFinite(memoryCurrent) ? memoryCurrent : null;
  }
  lastOomKillCount = oomKill;

  return {
    oomKillCount: oomKill,
    lastOomAt: lastOomAt !== null ? new Date(lastOomAt).toISOString() : null,
    oomDetected,
    memoryCurrentAtOom,
  };
}
|
||||||
|
|
||||||
|
/**
 * Reset OOM detection state (e.g., after user dismisses alert).
 *
 * Currently a no-op. `oomDetected` is recomputed on every call to
 * getOomState() / getSystemMemoryStatus() and is not stored anywhere, so there
 * is no flag to clear. The recorded `lastOomAt`, `memoryCurrentAtOom` and
 * `lastOomKillCount` values are left untouched.
 *
 * The function exists as an explicit hook should a stored flag be added later.
 */
export function resetOomDetected(): void {
  // Intentionally empty: nothing is stored for the transient oomDetected flag.
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Start the background memory sampler.
|
* Start the background memory sampler.
|
||||||
* This should be called once when the server starts.
|
* This should be called once when the server starts.
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,7 @@ import AnalyticsDashboard from './components/AnalyticsDashboard';
|
||||||
import ErrorGroupPanel from './components/ErrorGroupPanel';
|
import ErrorGroupPanel from './components/ErrorGroupPanel';
|
||||||
import SemanticNarrativePanel from './components/SemanticNarrativePanel';
|
import SemanticNarrativePanel from './components/SemanticNarrativePanel';
|
||||||
import BudgetAlertPanel, { BudgetBanner } from './components/BudgetAlertPanel';
|
import BudgetAlertPanel, { BudgetBanner } from './components/BudgetAlertPanel';
|
||||||
|
import OomAlertBanner from './components/OomAlertBanner';
|
||||||
import SessionDigestPanel from './components/SessionDigestPanel';
|
import SessionDigestPanel from './components/SessionDigestPanel';
|
||||||
import GitIntegrationPanel from './components/GitIntegrationPanel';
|
import GitIntegrationPanel from './components/GitIntegrationPanel';
|
||||||
import ProductivityPanel from './components/ProductivityPanel';
|
import ProductivityPanel from './components/ProductivityPanel';
|
||||||
|
|
@ -269,6 +270,7 @@ const App: React.FC = () => {
|
||||||
const [showWorkerAnalytics, setShowWorkerAnalytics] = useState(false);
|
const [showWorkerAnalytics, setShowWorkerAnalytics] = useState(false);
|
||||||
const [showSystemMemory, setShowSystemMemory] = useState(false);
|
const [showSystemMemory, setShowSystemMemory] = useState(false);
|
||||||
const [budgetBannerDismissed, setBudgetBannerDismissed] = useState(false);
|
const [budgetBannerDismissed, setBudgetBannerDismissed] = useState(false);
|
||||||
|
const [oomBannerDismissed, setOomBannerDismissed] = useState(false);
|
||||||
const [hideTestWorkers, setHideTestWorkers] = useState(true);
|
const [hideTestWorkers, setHideTestWorkers] = useState(true);
|
||||||
|
|
||||||
// Budget alert state polled from /api/cost/summary
|
// Budget alert state polled from /api/cost/summary
|
||||||
|
|
@ -635,6 +637,7 @@ const App: React.FC = () => {
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div className="app">
|
<div className="app">
|
||||||
|
<OomAlertBanner onDismiss={() => setOomBannerDismissed(true)} />
|
||||||
{budgetSummary && !budgetBannerDismissed && budgetSummary.budget.warningLevel !== 'none' && (
|
{budgetSummary && !budgetBannerDismissed && budgetSummary.budget.warningLevel !== 'none' && (
|
||||||
<BudgetBanner
|
<BudgetBanner
|
||||||
budget={budgetSummary.budget}
|
budget={budgetSummary.budget}
|
||||||
|
|
|
||||||
115
src/web/frontend/src/components/OomAlertBanner.tsx
Normal file
115
src/web/frontend/src/components/OomAlertBanner.tsx
Normal file
|
|
@ -0,0 +1,115 @@
|
||||||
|
import React, { useState, useEffect } from 'react';
|
||||||
|
|
||||||
|
/** Shape of the JSON payload returned by GET /api/system/oom-state. */
interface OomState {
  oomKillCount: number;
  lastOomAt: string | null;
  oomDetected: boolean;
  memoryCurrentAtOom: number | null;
  formattedMemoryCurrent?: string | null;
}

interface OomAlertBannerProps {
  /** Invoked after the user clicks the dismiss button. */
  onDismiss: () => void;
}

// Local storage key for tracking dismissed alerts
const OOM_DISMISS_KEY = 'fabric-oom-dismissed';

// A stored dismissal is honoured for one hour, then discarded.
const OOM_DISMISS_TTL_MS = 60 * 60 * 1000;

// Interval between polls of the OOM state endpoint.
const OOM_POLL_INTERVAL_MS = 30000;

/** Runtime check that a parsed JSON value has the fields the banner reads. */
function isOomState(value: unknown): value is OomState {
  if (typeof value !== 'object' || value === null) {
    return false;
  }
  const candidate = value as Record<string, unknown>;
  return (
    typeof candidate.oomKillCount === 'number' &&
    typeof candidate.oomDetected === 'boolean' &&
    (candidate.lastOomAt === null || typeof candidate.lastOomAt === 'string') &&
    (candidate.memoryCurrentAtOom === null || typeof candidate.memoryCurrentAtOom === 'number')
  );
}

/**
 * OOM Alert Banner
 *
 * Shows a red alert banner at the top of the FABRIC dashboard when an OOM kill
 * is detected. Includes the oom_kill count and memory.current at time of
 * detection. Dismissible via X button; a stored dismissal expires after 1 hour.
 *
 * The backend reports `oomDetected: true` only on the single poll that observes
 * the counter increase, so the component latches that flag: once a poll has
 * reported a detection the banner stays visible until the user dismisses it,
 * while the other fields keep refreshing from later polls.
 */
export const OomAlertBanner: React.FC<OomAlertBannerProps> = ({ onDismiss }) => {
  const [oomState, setOomState] = useState<OomState | null>(null);
  const [dismissed, setDismissed] = useState(false);

  // Check for previously dismissed alert (auto-clear after 1 hour)
  useEffect(() => {
    try {
      const dismissedData = localStorage.getItem(OOM_DISMISS_KEY);
      if (!dismissedData) {
        return;
      }
      const parsed: unknown = JSON.parse(dismissedData);
      const timestamp =
        typeof parsed === 'object' && parsed !== null
          ? (parsed as { timestamp?: unknown }).timestamp
          : undefined;
      if (typeof timestamp === 'number' && Date.now() - timestamp < OOM_DISMISS_TTL_MS) {
        setDismissed(true);
      } else {
        // Expired or malformed, clear it
        localStorage.removeItem(OOM_DISMISS_KEY);
      }
    } catch {
      // Ignore localStorage errors
    }
  }, []);

  // Poll OOM state every 30 seconds
  useEffect(() => {
    // Prevents a state update from a request that resolves after unmount.
    let cancelled = false;

    const pollOomState = async (): Promise<void> => {
      try {
        const res = await fetch('/api/system/oom-state');
        if (!res.ok) {
          return;
        }
        const data: unknown = await res.json();
        if (cancelled || !isOomState(data)) {
          return;
        }
        // Latch the transient flag so a later poll reporting `false` does not
        // hide a banner that is already showing.
        setOomState((prev) => ({
          ...data,
          oomDetected: data.oomDetected || (prev !== null && prev.oomDetected),
        }));
      } catch (err) {
        console.error('Failed to fetch OOM state:', err);
      }
    };

    // Initial poll
    void pollOomState();

    const interval = setInterval(() => {
      void pollOomState();
    }, OOM_POLL_INTERVAL_MS);

    return () => {
      cancelled = true;
      clearInterval(interval);
    };
  }, []);

  // Handle dismiss
  const handleDismiss = () => {
    setDismissed(true);
    // Save dismissal timestamp to localStorage
    try {
      localStorage.setItem(OOM_DISMISS_KEY, JSON.stringify({ timestamp: Date.now() }));
    } catch {
      // Ignore localStorage errors
    }
    onDismiss();
  };

  // Don't show if dismissed or no OOM detected
  if (dismissed || !oomState || !oomState.oomDetected) {
    return null;
  }

  // Local time of day at which the backend recorded the OOM kill.
  const detectedAtLabel = oomState.lastOomAt
    ? new Date(oomState.lastOomAt).toLocaleTimeString()
    : 'Unknown';

  return (
    <div className="oom-alert-banner" role="alert">
      <div className="oom-alert-banner-content">
        <span className="oom-alert-banner-icon" aria-hidden="true">⚠️</span>
        <span className="oom-alert-banner-text">
          <strong>OOM kill detected</strong> — check system memory
          {' '}
          <span className="oom-alert-banner-detail">
            (Kill count: {oomState.oomKillCount} at {detectedAtLabel}
            {oomState.formattedMemoryCurrent && `, memory: ${oomState.formattedMemoryCurrent}`})
          </span>
        </span>
        <button
          type="button"
          className="oom-alert-banner-dismiss"
          onClick={handleDismiss}
          title="Dismiss (will auto-clear after 1 hour)"
          aria-label="Dismiss OOM alert"
        >
          ×
        </button>
      </div>
    </div>
  );
};
|
||||||
|
|
||||||
|
export default OomAlertBanner;
|
||||||
|
|
@ -8938,3 +8938,71 @@ body {
|
||||||
color: var(--text-secondary);
|
color: var(--text-secondary);
|
||||||
text-align: right;
|
text-align: right;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ============================================
|
||||||
|
OOM Alert Banner (persistent, top of page)
|
||||||
|
============================================ */
|
||||||
|
|
||||||
|
.oom-alert-banner {
|
||||||
|
position: sticky;
|
||||||
|
top: 0;
|
||||||
|
z-index: 100;
|
||||||
|
background: linear-gradient(135deg, #4a0000, #660000);
|
||||||
|
border-bottom: 2px solid var(--error);
|
||||||
|
animation: slideDown 0.3s ease-out;
|
||||||
|
box-shadow: 0 2px 8px rgba(255, 0, 0, 0.3);
|
||||||
|
}
|
||||||
|
|
||||||
|
.oom-alert-banner-content {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 0.75rem;
|
||||||
|
padding: 0.75rem 1rem;
|
||||||
|
font-size: 0.9rem;
|
||||||
|
color: var(--text-primary);
|
||||||
|
}
|
||||||
|
|
||||||
|
.oom-alert-banner-icon {
  font-size: 1.2rem;
  min-width: 1.5em;
  text-align: center;
  animation: oom-alert-pulse 2s infinite;
}

/* Named per-component: @keyframes names are global, and a generic name such as
   "pulse" would override (or be overridden by) any other rule with that name. */
@keyframes oom-alert-pulse {
  0%, 100% { opacity: 1; }
  50% { opacity: 0.6; }
}
|
||||||
|
|
||||||
|
.oom-alert-banner-text {
|
||||||
|
flex: 1;
|
||||||
|
font-weight: 500;
|
||||||
|
}
|
||||||
|
|
||||||
|
.oom-alert-banner-detail {
|
||||||
|
color: var(--text-secondary);
|
||||||
|
font-size: 0.85rem;
|
||||||
|
font-weight: 400;
|
||||||
|
}
|
||||||
|
|
||||||
|
.oom-alert-banner-dismiss {
|
||||||
|
background: transparent;
|
||||||
|
border: none;
|
||||||
|
color: var(--text-secondary);
|
||||||
|
font-size: 1.2rem;
|
||||||
|
line-height: 1;
|
||||||
|
padding: 0.25rem 0.5rem;
|
||||||
|
cursor: pointer;
|
||||||
|
transition: color 0.15s ease;
|
||||||
|
min-width: 2rem;
|
||||||
|
height: 2rem;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
border-radius: 4px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.oom-alert-banner-dismiss:hover {
|
||||||
|
color: var(--text-primary);
|
||||||
|
background: rgba(255, 255, 255, 0.1);
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -1687,6 +1687,17 @@ export function createWebServer(options: WebServerOptions): WebServer {
|
||||||
res.json(alert);
|
res.json(alert);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Get OOM state for polling (oom_kill count and detection state).
// Responds with the OomState fields plus `formattedMemoryCurrent`, a
// human-readable rendering of memoryCurrentAtOom (null when unavailable).
app.get('/api/system/oom-state', async (_req: Request, res: Response) => {
  // The handler is async: without this try/catch a failed import or cgroup read
  // would surface as an unhandled rejection and leave the request hanging.
  try {
    const { getOomState, formatBytes } = await import('../systemCgroupMonitor.js');
    const oomState = getOomState();
    const atOom = oomState.memoryCurrentAtOom;

    res.json({
      ...oomState,
      formattedMemoryCurrent:
        atOom !== null && Number.isFinite(atOom) ? formatBytes(atOom) : null,
    });
  } catch (err) {
    console.error('Failed to read OOM state:', err);
    res.status(500).json({ error: 'Failed to read OOM state' });
  }
});
|
||||||
|
|
||||||
// Serve static frontend files
|
// Serve static frontend files
|
||||||
const staticPath = join(__dirname, 'public');
|
const staticPath = join(__dirname, 'public');
|
||||||
app.use(express.static(staticPath));
|
app.use(express.static(staticPath));
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue