Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions src/health/monitor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@ import type { ISdk } from "iii-sdk";
import type { HealthSnapshot } from "../types.js";
import type { StateKV } from "../state/kv.js";
import { KV } from "../state/schema.js";
import { evaluateHealth } from "./thresholds.js";
import { evaluateHealth, resolveThresholdConfig } from "./thresholds.js";

export function registerHealthMonitor(
sdk: ISdk,
kv: StateKV,
): { stop: () => void } {
// Env overrides are read once at startup; thresholds don't change at runtime.
const thresholdConfig = resolveThresholdConfig();
let connectionState = "connected";
let prevCpuUsage = process.cpuUsage();
let prevCpuTime = Date.now();
Expand Down Expand Up @@ -84,7 +86,7 @@ export function registerHealthMonitor(
alerts: [],
};

const evaluated = evaluateHealth(snapshot);
const evaluated = evaluateHealth(snapshot, thresholdConfig);
snapshot.status = evaluated.status;
snapshot.alerts = evaluated.alerts;
snapshot.notes = evaluated.notes;
Expand Down
92 changes: 91 additions & 1 deletion src/health/thresholds.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os from "node:os";
import type { HealthSnapshot } from "../types.js";

interface ThresholdConfig {
Expand All @@ -8,6 +9,9 @@ interface ThresholdConfig {
memoryWarnPercent: number;
memoryCriticalPercent: number;
memoryRssFloorBytes: number;
memoryCriticalRssBytes: number;
// 0 disables the host-free-RAM escape hatch.
memorySystemFreeFloorRatio: number;
}

const DEFAULTS: ThresholdConfig = {
Expand All @@ -18,8 +22,89 @@ const DEFAULTS: ThresholdConfig = {
memoryWarnPercent: 80,
memoryCriticalPercent: 95,
memoryRssFloorBytes: 512 * 1024 * 1024,
memoryCriticalRssBytes: 4096 * 1024 * 1024,
memorySystemFreeFloorRatio: 0.1,
};

/**
 * Read an integer-valued override from an environment variable.
 *
 * The override is rejected, and `fallback` returned, when the variable is
 * unset, empty or whitespace-only, does not convert to an integer, or lies
 * outside `bounds`.
 *
 * @param name - Name of the variable to look up in `process.env`.
 * @param fallback - Value returned when the override is absent or rejected.
 * @param bounds - Inclusive accepted range; `max` may be omitted for no upper limit.
 * @returns The parsed integer, or `fallback`.
 */
function parseIntEnv(
  name: string,
  fallback: number,
  bounds: { min: number; max?: number },
): number {
  // `Number("   ")` evaluates to 0, so blank values must be rejected before
  // conversion; otherwise a whitespace-only variable is accepted as an
  // explicit 0 whenever `bounds.min` is 0.
  const raw = process.env[name]?.trim();
  if (raw === undefined || raw === "") return fallback;
  const parsed = Number(raw);
  if (!Number.isInteger(parsed) || parsed < bounds.min) return fallback;
  if (bounds.max !== undefined && parsed > bounds.max) return fallback;
  return parsed;
}

/**
 * Read a floating-point override from an environment variable.
 *
 * The override is rejected, and `fallback` returned, when the variable is
 * unset, empty or whitespace-only, does not convert to a finite number, or
 * lies outside `bounds`.
 *
 * @param name - Name of the variable to look up in `process.env`.
 * @param fallback - Value returned when the override is absent or rejected.
 * @param bounds - Inclusive accepted range.
 * @returns The parsed number, or `fallback`.
 */
function parseFloatEnv(
  name: string,
  fallback: number,
  bounds: { min: number; max: number },
): number {
  // `Number("   ")` evaluates to 0, so blank values must be rejected before
  // conversion; otherwise a whitespace-only variable is accepted as an
  // explicit 0 whenever `bounds.min` is 0.
  const raw = process.env[name]?.trim();
  if (raw === undefined || raw === "") return fallback;
  const parsed = Number(raw);
  return Number.isFinite(parsed) && parsed >= bounds.min && parsed <= bounds.max
    ? parsed
    : fallback;
}

/**
 * Build the memory-threshold overrides that can be tuned through
 * `AGENTMEMORY_MEMORY_*` environment variables.
 *
 * Each field is always populated: an override that is unset, empty,
 * non-numeric, or outside its allowed range resolves to the matching entry in
 * `DEFAULTS`, so a mistyped value does not silently switch a gate off.
 *
 * @returns The env-overridable subset of the threshold configuration.
 */
export function resolveThresholdConfig(): Partial<ThresholdConfig> {
  const BYTES_PER_MB = 1024 * 1024;
  const percentBounds = { min: 0, max: 100 };

  // Byte-valued thresholds are configured in whole megabytes and converted
  // back to bytes after validation.
  const megabytesEnv = (name: string, defaultBytes: number): number =>
    parseIntEnv(name, defaultBytes / BYTES_PER_MB, { min: 0 }) * BYTES_PER_MB;

  const memoryCriticalPercent = parseIntEnv(
    "AGENTMEMORY_MEMORY_CRITICAL_PERCENT",
    DEFAULTS.memoryCriticalPercent,
    percentBounds,
  );
  const memoryWarnPercent = parseIntEnv(
    "AGENTMEMORY_MEMORY_WARN_PERCENT",
    DEFAULTS.memoryWarnPercent,
    percentBounds,
  );
  const memoryRssFloorBytes = megabytesEnv(
    "AGENTMEMORY_MEMORY_RSS_FLOOR_MB",
    DEFAULTS.memoryRssFloorBytes,
  );
  const memoryCriticalRssBytes = megabytesEnv(
    "AGENTMEMORY_MEMORY_CRITICAL_RSS_MB",
    DEFAULTS.memoryCriticalRssBytes,
  );
  const memorySystemFreeFloorRatio = parseFloatEnv(
    "AGENTMEMORY_MEMORY_SYSTEM_FREE_FLOOR_RATIO",
    DEFAULTS.memorySystemFreeFloorRatio,
    { min: 0, max: 1 },
  );

  return {
    memoryCriticalPercent,
    memoryWarnPercent,
    memoryRssFloorBytes,
    memoryCriticalRssBytes,
    memorySystemFreeFloorRatio,
  };
}

/**
 * Decide whether there is a genuine memory-pressure signal.
 *
 * A busy Node process keeps its heap close to full by design, so heap fullness
 * on its own is not treated as pressure. Pressure is reported only when the
 * process RSS has reached the configured absolute ceiling, or when the host's
 * free-RAM fraction has dropped below the configured floor.
 *
 * @param rssBytes - Resident set size of the process, in bytes.
 * @param cfg - Threshold configuration; reads `memoryCriticalRssBytes` and
 *   `memorySystemFreeFloorRatio`.
 * @returns `true` when either signal is present.
 */
function isUnderRealMemoryPressure(
  rssBytes: number,
  cfg: ThresholdConfig,
): boolean {
  const hostTotalBytes = os.totalmem();
  // A non-positive total gives no usable ratio; treat the host as fully free.
  let hostFreeFraction = 1;
  if (hostTotalBytes > 0) {
    hostFreeFraction = os.freemem() / hostTotalBytes;
  }

  if (rssBytes >= cfg.memoryCriticalRssBytes) {
    return true;
  }

  // A floor of 0 switches the host-free-RAM check off entirely.
  const freeFloor = cfg.memorySystemFreeFloorRatio;
  return freeFloor > 0 && hostFreeFraction < freeFloor;
}

export function evaluateHealth(
snapshot: HealthSnapshot,
config: Partial<ThresholdConfig> = {},
Expand Down Expand Up @@ -66,7 +151,12 @@ export function evaluateHealth(
const rss = snapshot.memory.rss ?? 0;
const rssAboveFloor = rss >= cfg.memoryRssFloorBytes;
const memMb = Math.round(rss / (1024 * 1024));
if (memPercent > cfg.memoryCriticalPercent && rssAboveFloor) {

if (
memPercent > cfg.memoryCriticalPercent &&
rssAboveFloor &&
isUnderRealMemoryPressure(rss, cfg)
) {
alerts.push(`memory_critical_${Math.round(memPercent)}%_rss${memMb}mb`);
critical = true;
} else if (memPercent > cfg.memoryWarnPercent && rssAboveFloor) {
Expand Down
77 changes: 73 additions & 4 deletions test/health-thresholds.test.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { describe, expect, it } from "vitest";
import { evaluateHealth } from "../src/health/thresholds.js";
import { afterEach, describe, expect, it, vi } from "vitest";
import os from "node:os";
import { evaluateHealth, resolveThresholdConfig } from "../src/health/thresholds.js";
import type { HealthSnapshot } from "../src/types.js";

function snap(over: Partial<HealthSnapshot> = {}): HealthSnapshot {
Expand All @@ -18,6 +19,8 @@ function snap(over: Partial<HealthSnapshot> = {}): HealthSnapshot {
}

describe("evaluateHealth memory severity", () => {
afterEach(() => vi.restoreAllMocks());

it("stays healthy when heap fills a tiny steady-state process (issue #158)", () => {
const s = snap({
memory: {
Expand All @@ -44,7 +47,12 @@ describe("evaluateHealth memory severity", () => {
external: 0,
},
});
const { status, alerts } = evaluateHealth(s);
// memory_critical now also requires a real-pressure signal (absolute RSS
// ceiling or low system-free RAM), not heap fullness alone. Supply a low
// absolute-RSS ceiling so the 1100MB RSS trips it deterministically.
const { status, alerts } = evaluateHealth(s, {
memoryCriticalRssBytes: 1024 * 1024 * 1024,
});
expect(status).toBe("critical");
expect(alerts.some((a) => a.startsWith("memory_critical_"))).toBe(true);
});
Expand Down Expand Up @@ -89,9 +97,70 @@ describe("evaluateHealth memory severity", () => {
external: 0,
},
});
const loose = evaluateHealth(s, { memoryRssFloorBytes: 10 * 1024 * 1024 });
// A low RSS floor makes RSS "above floor"; pair it with a low absolute-RSS
// ceiling so the real-pressure gate also trips and the result is critical.
const loose = evaluateHealth(s, {
memoryRssFloorBytes: 10 * 1024 * 1024,
memoryCriticalRssBytes: 10 * 1024 * 1024,
});
expect(loose.status).toBe("critical");
// A high RSS floor keeps RSS below the floor, so it never reaches critical.
const strict = evaluateHealth(s, { memoryRssFloorBytes: 1024 * 1024 * 1024 });
expect(strict.status).toBe("healthy");
});

it("goes critical via the host-memory gate when free RAM is low (RSS below ceiling)", () => {
// ~3% host free RAM with the absolute-RSS ceiling unreachable, so the only
// path to critical is the system-free gate.
vi.spyOn(os, "totalmem").mockReturnValue(16 * 1024 * 1024 * 1024);
vi.spyOn(os, "freemem").mockReturnValue(512 * 1024 * 1024);
const s = snap({
memory: {
heapUsed: 970 * 1024 * 1024,
heapTotal: 1000 * 1024 * 1024,
rss: 700 * 1024 * 1024,
external: 0,
},
});
const { status, alerts } = evaluateHealth(s, {
memoryRssFloorBytes: 512 * 1024 * 1024,
memoryCriticalRssBytes: 64 * 1024 * 1024 * 1024,
});
expect(status).toBe("critical");
expect(alerts.some((a) => a.startsWith("memory_critical_"))).toBe(true);
});

it("memorySystemFreeFloorRatio=0 disables the host-memory gate", () => {
vi.spyOn(os, "totalmem").mockReturnValue(16 * 1024 * 1024 * 1024);
vi.spyOn(os, "freemem").mockReturnValue(512 * 1024 * 1024);
const s = snap({
memory: {
heapUsed: 970 * 1024 * 1024,
heapTotal: 1000 * 1024 * 1024,
rss: 700 * 1024 * 1024,
external: 0,
},
});
const { status, alerts } = evaluateHealth(s, {
memoryRssFloorBytes: 512 * 1024 * 1024,
memoryCriticalRssBytes: 64 * 1024 * 1024 * 1024,
memorySystemFreeFloorRatio: 0,
});
expect(status).not.toBe("critical");
expect(alerts.some((a) => a.startsWith("memory_critical_"))).toBe(false);
expect(alerts.some((a) => a.startsWith("memory_warn_"))).toBe(true);
});

it("rejects out-of-range env overrides so a typo cannot disable the gate", () => {
process.env.AGENTMEMORY_MEMORY_SYSTEM_FREE_FLOOR_RATIO = "2";
process.env.AGENTMEMORY_MEMORY_CRITICAL_RSS_MB = "-1";
try {
const cfg = resolveThresholdConfig();
expect(cfg.memorySystemFreeFloorRatio).toBe(0.1);
expect(cfg.memoryCriticalRssBytes).toBe(4096 * 1024 * 1024);
} finally {
delete process.env.AGENTMEMORY_MEMORY_SYSTEM_FREE_FLOOR_RATIO;
delete process.env.AGENTMEMORY_MEMORY_CRITICAL_RSS_MB;
}
});
});