Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .changeset/t9962-doctor-audit-budget.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
id: t9962-doctor-audit-budget
tasks: [T9962]
kind: fix
summary: Add width-budget + timeout to doctor worktree-orphan audit to prevent 60s+ hang on large corpora
---

`cleo doctor --audit-worktree-orphans` hung indefinitely on a 194-orphan corpus. Scan depth was already bounded (MAX_SCAN_DEPTH=3), but the width — the number of entries processed per level, each incurring its own IO — was not.

Two tactical fixes (strategic Rust rewrite deferred to T9977/T9986):

- Width budget: soft-warn at 100 entries per level, hard-stop at 500 (override the hard-stop with `--max-entries-per-level <n>`) with `isPartial: true, partialReason: 'overflow'` on the result envelope.
- Timeout: `--timeout <seconds>` flag (default 30s) on `cleo doctor --audit-worktree-orphans` and `--prune-worktree-orphans`; on expiry scan returns `isPartial: true, partialReason: 'timeout'`.

New `scanWorktreeOrphansBudgeted()` function wraps the existing bare-array scanner and surfaces the `OrphanScanResult` envelope. Existing `scanWorktreeOrphans` callers are unaffected.
139 changes: 129 additions & 10 deletions packages/cleo/src/cli/commands/doctor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import { mkdirSync, writeFileSync } from 'node:fs';
import { join } from 'node:path';
import type { HookMatrixResult } from '@cleocode/core';
import { getProjectRoot } from '@cleocode/core';
import { getProjectRoot, pushWarning } from '@cleocode/core';
import {
quarantineRogueCleoDir,
scanRogueCleoDirs,
Expand Down Expand Up @@ -263,6 +263,27 @@ export const doctorCommand = defineCommand({
description:
'List orphan .cleo/ directories under .claude/worktrees/ (T9790, fallout from T9550/T9580)',
},
/**
* T9962: timeout in seconds for --audit-worktree-orphans and
* --prune-worktree-orphans. On expiry the scan returns a partial result.
* Default: 30 seconds.
*/
timeout: {
type: 'string',
description:
'Timeout in seconds for worktree-orphan audit/prune scan (default: 30). ' +
'Partial results are returned on expiry.',
},
/**
* T9962: per-level fan-out cap for --audit-worktree-orphans and
* --prune-worktree-orphans. When exceeded the scan returns a partial result.
* Default: 500.
*/
'max-entries-per-level': {
type: 'string',
description:
'Per-level entry hard-stop for worktree-orphan scan (default: 500). ' +
'Scan aborts with partial result when exceeded.',
},
/**
* T9790: archive then remove every orphan reported by
* `--audit-worktree-orphans`. Combine with `--dry-run` to preview
Expand Down Expand Up @@ -561,29 +582,75 @@ export const doctorCommand = defineCommand({
// 2. worktrees outside the canonical XDG location
// 3. rogue .cleo/worktrees/ DIRECTORY (council D009)
// Also runs the legacy .claude/worktrees/ orphan scan for full coverage.
// T9962: budgeted scan with configurable timeout + per-level fan-out cap.
progress.step(0, 'Comprehensive worktree anomaly audit (T9808 / council D009)');
const { auditWorktreeOrphansComprehensive, scanWorktreeOrphans } = await import(
const { auditWorktreeOrphansComprehensive, scanWorktreeOrphansBudgeted } = await import(
'@cleocode/core/doctor/worktree-orphans.js'
);
const projectRoot = getProjectRoot();

// Parse optional budget overrides from CLI flags (T9962).
const timeoutSecs =
args['timeout'] !== undefined ? Number.parseInt(String(args['timeout']), 10) : 30;
const timeoutMs =
Number.isFinite(timeoutSecs) && timeoutSecs > 0 ? timeoutSecs * 1000 : 30_000;
const maxEntriesPerLevel =
args['max-entries-per-level'] !== undefined
? Number.parseInt(String(args['max-entries-per-level']), 10)
: 500;

// Run both scans in parallel.
const [comprehensive, legacyOrphans] = await Promise.all([
const [comprehensive, legacyScanResult] = await Promise.all([
auditWorktreeOrphansComprehensive(projectRoot),
scanWorktreeOrphans(projectRoot),
scanWorktreeOrphansBudgeted(projectRoot, {
timeoutMs,
maxEntriesPerLevel: Number.isFinite(maxEntriesPerLevel) ? maxEntriesPerLevel : 500,
}),
]);

const legacyOrphans = legacyScanResult.orphans;
const totalAnomalies = comprehensive.count;

// Queue soft-warn through pushWarning so it lands in envelope.meta.warnings (T9763/T9772).
if (legacyScanResult.softWarnMessage) {
pushWarning({
code: 'W_DOCTOR_SCAN_SOFT_WARN',
message: legacyScanResult.softWarnMessage,
severity: 'warn',
});
}

// Queue partial-result warning if the scan was aborted.
if (legacyScanResult.isPartial) {
const reason =
legacyScanResult.partialReason === 'timeout'
? `timed out after ${timeoutSecs}s (use --timeout <seconds> to adjust)`
: `per-level entry cap of ${maxEntriesPerLevel} exceeded (use --max-entries-per-level <n> to adjust)`;
pushWarning({
code: 'W_DOCTOR_SCAN_PARTIAL',
message: `legacy orphan scan is PARTIAL — ${reason}. Results may be incomplete.`,
severity: 'warn',
context: {
partialReason: legacyScanResult.partialReason,
timeoutSecs,
maxEntriesPerLevel,
},
});
}

progress.complete(
`Found ${totalAnomalies} anomal${totalAnomalies === 1 ? 'y' : 'ies'}` +
(legacyOrphans.length > 0 ? ` (${legacyOrphans.length} legacy orphan(s))` : ''),
(legacyOrphans.length > 0 ? ` (${legacyOrphans.length} legacy orphan(s))` : '') +
(legacyScanResult.isPartial ? ' [PARTIAL]' : ''),
);

cliOutput(
{
projectRoot,
comprehensive,
legacyOrphans,
legacyScanPartial: legacyScanResult.isPartial,
legacyScanPartialReason: legacyScanResult.partialReason,
count: totalAnomalies,
},
{ command: 'doctor', operation: 'doctor.audit-worktree-orphans' },
Expand All @@ -594,19 +661,63 @@ export const doctorCommand = defineCommand({
} else if (args['prune-worktree-orphans']) {
// T9790: archive + remove orphan .cleo/ directories. Always writes
// a tarball + audit-log line BEFORE removing anything.
// T9962: budgeted scan with configurable timeout + per-level fan-out cap.
const isDryRun = args['dry-run'] === true;
progress.step(
0,
`${isDryRun ? '[DRY RUN] ' : ''}Scanning + pruning worktree-orphan .cleo/ directories`,
);
const { pruneWorktreeOrphans, scanWorktreeOrphans } = await import(
const { pruneWorktreeOrphans, scanWorktreeOrphansBudgeted } = await import(
'@cleocode/core/doctor/worktree-orphans.js'
);
const projectRoot = getProjectRoot();
const orphans = await scanWorktreeOrphans(projectRoot);

// Parse optional budget overrides from CLI flags (T9962).
const timeoutSecs =
args['timeout'] !== undefined ? Number.parseInt(String(args['timeout']), 10) : 30;
const timeoutMs =
Number.isFinite(timeoutSecs) && timeoutSecs > 0 ? timeoutSecs * 1000 : 30_000;
const maxEntriesPerLevel =
args['max-entries-per-level'] !== undefined
? Number.parseInt(String(args['max-entries-per-level']), 10)
: 500;

const scanResult = await scanWorktreeOrphansBudgeted(projectRoot, {
timeoutMs,
maxEntriesPerLevel: Number.isFinite(maxEntriesPerLevel) ? maxEntriesPerLevel : 500,
});

// Queue soft-warn and partial-result warnings through pushWarning (T9962).
if (scanResult.softWarnMessage) {
pushWarning({
code: 'W_DOCTOR_SCAN_SOFT_WARN',
message: scanResult.softWarnMessage,
severity: 'warn',
});
}
if (scanResult.isPartial) {
const reason =
scanResult.partialReason === 'timeout'
? `timed out after ${timeoutSecs}s (use --timeout <seconds> to adjust)`
: `per-level entry cap of ${maxEntriesPerLevel} exceeded (use --max-entries-per-level <n> to adjust)`;
pushWarning({
code: 'W_DOCTOR_SCAN_PARTIAL',
message: `orphan scan is PARTIAL — ${reason}. Only orphans found before abort will be pruned.`,
severity: 'warn',
context: {
partialReason: scanResult.partialReason,
timeoutSecs,
maxEntriesPerLevel,
},
});
}

const orphans = scanResult.orphans;

if (orphans.length === 0) {
progress.complete('No worktree orphans found — nothing to prune');
progress.complete(
`No worktree orphans found — nothing to prune${scanResult.isPartial ? ' [PARTIAL SCAN]' : ''}`,
);
cliOutput(
{
projectRoot,
Expand All @@ -615,6 +726,8 @@ export const doctorCommand = defineCommand({
pruned: [],
rejected: [],
totalSizeBytes: 0,
scanPartial: scanResult.isPartial,
scanPartialReason: scanResult.partialReason,
},
{ command: 'doctor', operation: 'doctor.prune-worktree-orphans' },
);
Expand All @@ -634,11 +747,17 @@ export const doctorCommand = defineCommand({
const verb = isDryRun ? 'Would prune' : 'Pruned';
progress.complete(
`${verb} ${result.pruned.length} orphan${result.pruned.length === 1 ? '' : 's'}` +
`${result.rejected.length > 0 ? `, ${result.rejected.length} rejected` : ''}`,
`${result.rejected.length > 0 ? `, ${result.rejected.length} rejected` : ''}` +
`${scanResult.isPartial ? ' [PARTIAL SCAN]' : ''}`,
);

cliOutput(
{ projectRoot, ...result },
{
projectRoot,
...result,
scanPartial: scanResult.isPartial,
scanPartialReason: scanResult.partialReason,
},
{ command: 'doctor', operation: 'doctor.prune-worktree-orphans' },
);
if (result.rejected.length > 0) {
Expand Down
44 changes: 44 additions & 0 deletions packages/contracts/src/doctor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,50 @@ export interface ComprehensiveAuditResult {
anomalies: WorktreeAnomaly[];
/** Total anomaly count. Non-zero triggers exit code 2. */
count: number;
/**
* `true` when the audit was aborted early due to a width budget overflow
* or a timeout. The `anomalies` list reflects only the entries scanned
* before the abort — results may be incomplete.
*/
isPartial?: boolean;
/**
* Machine-readable reason the scan was cut short.
* - `'timeout'`: the configured `timeoutMs` was exceeded.
* - `'overflow'`: a per-level entry count exceeded `maxEntriesPerLevel`.
*/
partialReason?: 'timeout' | 'overflow';
}

/**
* Result envelope returned by the budgeted orphan scanner
* ({@link scanWorktreeOrphansBudgeted}).
*
* Wraps the bare `OrphanEntry[]` from `scanWorktreeOrphans` with optional
* partial-result metadata so callers can surface incomplete scans to the
* operator without changing the existing `OrphanEntry[]` return type.
*
* Callers should check `isPartial` before treating `orphans` as exhaustive.
*/
export interface OrphanScanResult {
/** Discovered orphan entries (may be incomplete when `isPartial` is `true`). */
orphans: OrphanEntry[];
/**
* `true` when the scan was aborted before completion due to a budget
* overflow or timeout. The `orphans` list then reflects only the entries
* found before the abort; see `partialReason` for which limit was hit.
*/
isPartial: boolean;
/**
* Machine-readable reason the scan was cut short.
* - `'timeout'`: the configured `timeoutMs` was exceeded.
* - `'overflow'`: a per-level entry count exceeded `maxEntriesPerLevel`.
*
* `undefined` when `isPartial` is `false`.
*/
partialReason?: 'timeout' | 'overflow';
/**
* Human-readable warning message produced when the soft-warn threshold
* was crossed (entries per level exceeded `softWarnEntriesPerLevel` but
* stayed under the hard stop). `undefined` when no warning was triggered.
*
* Intended to be shown to the operator as a warning; it does not by itself
* mean the result is partial.
*/
softWarnMessage?: string;
}

/**
Expand Down
3 changes: 2 additions & 1 deletion packages/contracts/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -299,10 +299,11 @@ export {
DocKindConfigError,
DocKindRegistry,
} from './docs-taxonomy.js';
// === Doctor: Worktree-Orphan Audit + Prune Types (T9790, T9808) ===
// === Doctor: Worktree-Orphan Audit + Prune Types (T9790, T9808, T9962) ===
export type {
ComprehensiveAuditResult,
OrphanEntry,
OrphanScanResult,
PruneAuditEntry,
PruneResult,
WorktreeAnomaly,
Expand Down
Loading
Loading