Skip to content

Commit 41f913c

Browse files
hkirat and claude committed
fix: detect crashed agent immediately instead of waiting 10min timeout
Previously, if the agent process died before writing any log entries (e.g. startup crash), the dead-process check was skipped because it required entries.length > 0. This left sessions stuck for the full 10-minute safety valve timeout. Also, agent stderr was piped to /dev/null, making crash debugging impossible.

- Capture agent stderr to agent-stderr.log instead of /dev/null
- Remove entries.length > 0 guard from dead-process detection
- Read stderr in both crash detection and safety valve paths for logging

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 9fab33d commit 41f913c

1 file changed

Lines changed: 26 additions & 7 deletions

File tree

apps/server/src/services/agent.service.ts

Lines changed: 26 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@ const VENDI_DIR = "/workspace/.vendi";
99
const CONFIG_PATH = `${VENDI_DIR}/agent-config.json`;
1010
const SCRIPT_PATH = `${VENDI_DIR}/agent.mjs`;
1111
const LOG_PATH = `${VENDI_DIR}/agent-log.jsonl`;
12+
const STDERR_PATH = `${VENDI_DIR}/agent-stderr.log`;
1213

1314
const AGENT_STALE_MS = 10 * 60 * 1000; // 10 minute safety valve
1415

@@ -108,8 +109,8 @@ export async function startAgentTurn(config: StartAgentConfig): Promise<void> {
108109
},
109110
});
110111

111-
// Start agent as background process in sandbox
112-
await sandbox.commands.run(`node ${SCRIPT_PATH} > /dev/null 2>&1 &`, {
112+
// Start agent as background process in sandbox (capture stderr for crash debugging)
113+
await sandbox.commands.run(`node ${SCRIPT_PATH} > /dev/null 2>${STDERR_PATH} &`, {
113114
background: true,
114115
requestTimeoutMs: 5_000,
115116
});
@@ -156,11 +157,19 @@ export async function syncAgentProgress(sessionId: string): Promise<void> {
156157

157158
if (!session?.agentRunId || !session.sandboxId) return;
158159

159-
// Safety valve: if agent has been running > 1 hour, clean up
160+
// Safety valve: if agent has been running too long, clean up
160161
if (session.agentRunStartedAt) {
161162
const age = Date.now() - session.agentRunStartedAt.getTime();
162163
if (age > AGENT_STALE_MS) {
163-
console.log(`[Agent] Stale agent run for session ${sessionId}, cleaning up`);
164+
// Try to read stderr for debugging context
165+
let stderrHint = "";
166+
try {
167+
const sbx = await Sandbox.connect(session.sandboxId);
168+
const stderr = String(await sbx.files.read(STDERR_PATH)).trim();
169+
if (stderr) stderrHint = ` stderr: ${stderr.slice(0, 500)}`;
170+
} catch {}
171+
172+
console.log(`[Agent] Stale agent run for session ${sessionId} (${Math.round(age / 1000)}s), cleaning up.${stderrHint}`);
164173
if (session.agentWorkingMsgId) {
165174
await prisma.chatMessage.delete({ where: { id: session.agentWorkingMsgId } }).catch(() => {});
166175
}
@@ -203,8 +212,18 @@ export async function syncAgentProgress(sessionId: string): Promise<void> {
203212
"pgrep -f 'node.*agent\\.mjs' > /dev/null 2>&1 && echo ALIVE || echo DEAD",
204213
{ requestTimeoutMs: 5_000 }
205214
);
206-
if (psResult.stdout.trim() === "DEAD" && entries.length > 0) {
207-
console.log(`[Agent] Agent process died without writing done/error for session ${sessionId}`);
215+
if (psResult.stdout.trim() === "DEAD") {
216+
// Read stderr log for crash debugging
217+
let stderrContent = "";
218+
try {
219+
stderrContent = String(await sandbox.files.read(STDERR_PATH)).trim();
220+
} catch {}
221+
222+
console.log(
223+
`[Agent] Agent process died for session ${sessionId} (${entries.length} log entries).` +
224+
(stderrContent ? ` stderr: ${stderrContent.slice(0, 500)}` : "")
225+
);
226+
208227
// Treat as a crashed agent — synthesize an error entry
209228
const toolCalls: ToolCallEntry[] = entries
210229
.filter((e) => e.type === "tool_call")
@@ -223,7 +242,7 @@ export async function syncAgentProgress(sessionId: string): Promise<void> {
223242
data: {
224243
sessionId,
225244
role: "SYSTEM",
226-
content: "The agent encountered an unexpected error. You can send a new message to try again.",
245+
content: "The agent crashed unexpectedly. You can send a new message to try again.",
227246
metadata: toolCalls.length > 0 ? JSON.parse(JSON.stringify({ toolCalls })) : undefined,
228247
},
229248
});

0 commit comments

Comments
 (0)