Skip to content

Commit daae21c

Browse files
feat: add OpenFE monitor for periodic job status checks and auto-restart
Adds monitor.sbatch and monitor.sh that self-resubmit via SLURM to periodically check OpenFE quickrun jobs, restart failed replicas, and send email status reports. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent b9b7887 commit daae21c

2 files changed

Lines changed: 357 additions & 0 deletions

File tree

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/bin/bash
2+
#SBATCH --job-name=openfe_monitor
3+
#SBATCH --partition=normal,owners
4+
#SBATCH --nodes=1
5+
#SBATCH --ntasks-per-node=1
6+
#SBATCH --cpus-per-task=1
7+
#SBATCH --mem=4G
8+
#SBATCH --time=00:30:00
9+
#SBATCH --mail-type=FAIL
10+
#SBATCH --mail-user=zhaoyangli@stanford.edu
11+
#SBATCH --output=logs/monitor_%j.out
12+
#SBATCH --error=logs/monitor_%j.err
13+
14+
set -euo pipefail
15+
16+
# SLURM copies batch scripts to a spool directory, so BASH_SOURCE won't point
17+
# to the original location. Extract the original script path from the submit
18+
# line recorded by SLURM.
19+
_submit_line="$(scontrol show job "$SLURM_JOB_ID" 2>/dev/null | sed -n 's/^[[:space:]]*SubmitLine=//p')"
20+
SCRIPTS_DIR=""
21+
for _word in $_submit_line; do
22+
if [[ "$_word" == *monitor.sbatch ]]; then
23+
SCRIPTS_DIR="$(cd "${SLURM_SUBMIT_DIR:-.}" && cd "$(dirname "$_word")" && pwd -P)"
24+
break
25+
fi
26+
done
27+
28+
if [[ -z "$SCRIPTS_DIR" || ! -f "${SCRIPTS_DIR}/monitor.sh" ]]; then
29+
echo "Error: could not resolve scripts directory from submit line: $_submit_line" >&2
30+
exit 1
31+
fi
32+
33+
exec bash "${SCRIPTS_DIR}/monitor.sh" "$@"

scripts/openfe/runtime/monitor.sh

Lines changed: 324 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,324 @@
1+
#!/usr/bin/env bash
2+
3+
set -euo pipefail
4+
5+
# monitor.sh
6+
#
7+
# Monitor OpenFE quickrun jobs across project directories, restart failed jobs,
8+
# and send email reports. Designed to self-resubmit via SLURM for periodic
9+
# monitoring (default: hourly for 48 iterations / 2 days).
10+
#
11+
# Status handling:
12+
# completed : no action
13+
# active : no action (job still running)
14+
# failed : restart via sbatch
15+
# error : report only (multiple matching jobs, needs manual attention)
16+
#
17+
# Options:
18+
# -d DIR Project directory to monitor (repeatable, required)
19+
# -e EMAIL Notification email (default: zhaoyangli@stanford.edu)
20+
# -m MAX_ITER Maximum iterations before stopping (default: 48)
21+
# -i HOURS Hours between checks (default: 1)
22+
# -s STATE_FILE Iteration state file (default: ~/.openfe_monitor_state)
23+
# -n Dry run: parse and report but do not restart or resubmit
24+
# -h Show help
25+
26+
SCRIPTS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)"
27+
CHECK_STATUS="${SCRIPTS_DIR}/check_status.sh"
28+
29+
DIRS=()
30+
EMAIL="zhaoyangli@stanford.edu"
31+
MAX_ITER=48
32+
INTERVAL=1
33+
STATE_FILE="${HOME}/.openfe_monitor_state"
34+
DRY_RUN=false
35+
36+
usage() {
37+
cat <<'EOF'
38+
Usage: monitor.sh -d DIR [-d DIR ...] [OPTIONS]
39+
40+
Options:
41+
-d DIR Project directory to monitor (repeatable, required)
42+
-e EMAIL Notification email (default: zhaoyangli@stanford.edu)
43+
-m MAX_ITER Maximum iterations (default: 48)
44+
-i HOURS Interval between checks in hours (default: 1)
45+
-s STATE_FILE Iteration state file (default: ~/.openfe_monitor_state)
46+
-n Dry run: report only, no restarts or resubmissions
47+
-h Show this help
48+
EOF
49+
}
50+
51+
while getopts ":d:e:m:i:s:nh" opt; do
52+
case "$opt" in
53+
d) DIRS+=("$OPTARG") ;;
54+
e) EMAIL="$OPTARG" ;;
55+
m) MAX_ITER="$OPTARG" ;;
56+
i) INTERVAL="$OPTARG" ;;
57+
s) STATE_FILE="$OPTARG" ;;
58+
n) DRY_RUN=true ;;
59+
h) usage; exit 0 ;;
60+
\?)
61+
echo "Error: invalid option -$OPTARG" >&2
62+
usage
63+
exit 2
64+
;;
65+
:)
66+
echo "Error: option -$OPTARG requires an argument" >&2
67+
usage
68+
exit 2
69+
;;
70+
esac
71+
done
72+
73+
if [[ ${#DIRS[@]} -eq 0 ]]; then
74+
echo "Error: at least one -d DIR is required" >&2
75+
usage
76+
exit 2
77+
fi
78+
79+
# ---- Helper functions ----
80+
81+
strip_ansi() {
82+
sed 's/\x1b\[[0-9;]*m//g'
83+
}
84+
85+
send_email() {
86+
local subject="$1"
87+
local body="$2"
88+
local recipient="$3"
89+
90+
if command -v mail >/dev/null 2>&1; then
91+
echo "$body" | mail -s "$subject" "$recipient"
92+
elif command -v sendmail >/dev/null 2>&1; then
93+
{
94+
printf 'To: %s\n' "$recipient"
95+
printf 'Subject: %s\n' "$subject"
96+
printf 'Content-Type: text/plain; charset=UTF-8\n'
97+
printf '\n'
98+
printf '%s\n' "$body"
99+
} | sendmail "$recipient"
100+
else
101+
echo "WARNING: No mail command available. Email report:" >&2
102+
echo "Subject: $subject" >&2
103+
echo "$body" >&2
104+
fi
105+
}
106+
107+
# ---- Iteration tracking ----
108+
109+
ITERATION=1
110+
if [[ -f "$STATE_FILE" ]]; then
111+
ITERATION=$(( $(cat "$STATE_FILE") + 1 ))
112+
fi
113+
114+
if (( ITERATION > MAX_ITER )); then
115+
echo "Maximum iterations ($MAX_ITER) reached. Exiting."
116+
rm -f "$STATE_FILE"
117+
exit 0
118+
fi
119+
120+
echo "$ITERATION" > "$STATE_FILE"
121+
122+
NL=$'\n'
123+
124+
echo "=== OpenFE Monitor: Iteration ${ITERATION}/${MAX_ITER} ==="
125+
echo "Timestamp: $(date)"
126+
echo ""
127+
128+
# ---- Main loop over directories ----
129+
130+
TOTAL_RESTARTS=0
131+
TOTAL_COMPLETED=0
132+
TOTAL_ACTIVE=0
133+
TOTAL_FAILED=0
134+
TOTAL_ERROR=0
135+
REPORT=""
136+
137+
for DIR in "${DIRS[@]}"; do
138+
if [[ ! -d "$DIR" ]]; then
139+
echo " Warning: directory does not exist: $DIR" >&2
140+
REPORT+="${NL}--- $(basename "$DIR") ---${NL} Directory not found: ${DIR}${NL}"
141+
continue
142+
fi
143+
144+
DIR_ABS="$(cd "$DIR" && pwd -P)"
145+
DIR_NAME="$(basename "$DIR_ABS")"
146+
147+
echo "--- Checking: ${DIR_NAME} ---"
148+
149+
# Run check_status.sh; stdout has TSV data, stderr goes to monitor log.
150+
if ! STATUS_OUTPUT="$(bash "$CHECK_STATUS" -r "$DIR_ABS")"; then
151+
echo " Warning: check_status.sh returned non-zero for ${DIR_NAME}" >&2
152+
REPORT+="${NL}--- ${DIR_NAME} ---${NL} check_status.sh failed${NL}"
153+
continue
154+
fi
155+
156+
# Strip ANSI codes for parsing.
157+
CLEAN_OUTPUT="$(echo "$STATUS_OUTPUT" | strip_ansi)"
158+
159+
# Count statuses (skip header line).
160+
COMPLETED=0
161+
ACTIVE=0
162+
FAILED=0
163+
ERROR=0
164+
TOTAL=0
165+
166+
FAILED_ENTRIES=()
167+
ERROR_ENTRIES=()
168+
169+
while IFS=$'\t' read -r directory status replica info; do
170+
# Skip header and blank lines.
171+
[[ "$status" == "status" ]] && continue
172+
[[ -z "$status" ]] && continue
173+
174+
TOTAL=$((TOTAL + 1))
175+
176+
case "$status" in
177+
completed)
178+
COMPLETED=$((COMPLETED + 1))
179+
;;
180+
active)
181+
ACTIVE=$((ACTIVE + 1))
182+
;;
183+
failed)
184+
FAILED=$((FAILED + 1))
185+
# Extract tname from directory: .../results/<tname>/replica_<N>
186+
tname="$(basename "$(dirname "$directory")")"
187+
# Extract replica_id from replica column: replica_<N>
188+
replica_id="${replica#replica_}"
189+
FAILED_ENTRIES+=("${tname}|${replica_id}|${info}")
190+
;;
191+
error)
192+
ERROR=$((ERROR + 1))
193+
tname="$(basename "$(dirname "$directory")")"
194+
replica_id="${replica#replica_}"
195+
ERROR_ENTRIES+=("${tname}|${replica_id}|${info}")
196+
;;
197+
esac
198+
done <<< "$CLEAN_OUTPUT"
199+
200+
TOTAL_COMPLETED=$((TOTAL_COMPLETED + COMPLETED))
201+
TOTAL_ACTIVE=$((TOTAL_ACTIVE + ACTIVE))
202+
TOTAL_FAILED=$((TOTAL_FAILED + FAILED))
203+
TOTAL_ERROR=$((TOTAL_ERROR + ERROR))
204+
205+
# Restart failed jobs.
206+
DIR_RESTARTS=""
207+
RESTART_COUNT=0
208+
209+
if (( FAILED > 0 )); then
210+
for entry in "${FAILED_ENTRIES[@]}"; do
211+
IFS='|' read -r tname replica_id info <<< "$entry"
212+
213+
tfile="${DIR_ABS}/transformations/${tname}.json"
214+
if [[ ! -f "$tfile" ]]; then
215+
DIR_RESTARTS+=" ${tname} replica_${replica_id}: transformation file not found${NL}"
216+
continue
217+
fi
218+
219+
if [[ "$DRY_RUN" == true ]]; then
220+
DIR_RESTARTS+=" ${tname} replica_${replica_id} -> [DRY RUN] would restart${NL}"
221+
RESTART_COUNT=$((RESTART_COUNT + 1))
222+
else
223+
# Submit restart from the project directory where quickrun.sbatch is symlinked.
224+
RESTART_OUTPUT="$(cd "$DIR_ABS" && sbatch --array="$replica_id" quickrun.sbatch "transformations/${tname}.json" -o results/ 2>&1)" || true
225+
JOB_ID="$(echo "$RESTART_OUTPUT" | grep -oP 'Submitted batch job \K[0-9]+' || echo "unknown")"
226+
DIR_RESTARTS+=" ${tname} replica_${replica_id} -> Job ${JOB_ID}${NL}"
227+
RESTART_COUNT=$((RESTART_COUNT + 1))
228+
fi
229+
done
230+
fi
231+
232+
TOTAL_RESTARTS=$((TOTAL_RESTARTS + RESTART_COUNT))
233+
234+
# Build directory report section.
235+
DIR_REPORT="--- ${DIR_NAME} ---${NL}"
236+
DIR_REPORT+=" Completed: ${COMPLETED}/${TOTAL} Active: ${ACTIVE} Failed: ${FAILED}"
237+
if (( RESTART_COUNT > 0 )); then
238+
DIR_REPORT+=" (restarted)"
239+
fi
240+
DIR_REPORT+=" Error: ${ERROR}${NL}"
241+
242+
if [[ -n "$DIR_RESTARTS" ]]; then
243+
DIR_REPORT+="${NL} Restarts:${NL}${DIR_RESTARTS}"
244+
fi
245+
246+
if (( ERROR > 0 )); then
247+
DIR_REPORT+="${NL} Errors (not restarted):${NL}"
248+
for entry in "${ERROR_ENTRIES[@]}"; do
249+
IFS='|' read -r tname replica_id info <<< "$entry"
250+
DIR_REPORT+=" ${tname} replica_${replica_id}: ${info}${NL}"
251+
done
252+
fi
253+
254+
REPORT+="${NL}${DIR_REPORT}"
255+
echo "$DIR_REPORT"
256+
done
257+
258+
# ---- Check if all done ----
259+
260+
ALL_DONE=false
261+
if (( TOTAL_ACTIVE == 0 && TOTAL_FAILED == 0 && TOTAL_ERROR == 0 && TOTAL_COMPLETED > 0 )); then
262+
ALL_DONE=true
263+
fi
264+
265+
# ---- Build and send email ----
266+
267+
if [[ "$ALL_DONE" == true ]]; then
268+
SUBJECT="[OpenFE Monitor] All jobs completed!"
269+
BODY="All jobs across ${#DIRS[@]} directories have completed.${NL}${NL}"
270+
BODY+="Final status (iteration ${ITERATION}/${MAX_ITER}):${NL}"
271+
BODY+="${REPORT}"
272+
else
273+
SUBJECT="[OpenFE Monitor] Iteration ${ITERATION}/${MAX_ITER}"
274+
if (( TOTAL_RESTARTS > 0 )); then
275+
SUBJECT+=" - ${TOTAL_RESTARTS} restart(s)"
276+
fi
277+
BODY="Monitoring report for iteration ${ITERATION}/${MAX_ITER}${NL}"
278+
BODY+="Timestamp: $(date)${NL}"
279+
BODY+="${REPORT}"
280+
fi
281+
282+
echo ""
283+
echo "Sending email to ${EMAIL}..."
284+
if [[ "$DRY_RUN" == true ]]; then
285+
echo "[DRY RUN] Would send email:"
286+
echo "Subject: ${SUBJECT}"
287+
echo "$BODY"
288+
else
289+
send_email "$SUBJECT" "$BODY" "$EMAIL"
290+
fi
291+
292+
# ---- Self-resubmit or exit ----
293+
294+
if [[ "$ALL_DONE" == true ]]; then
295+
echo "All jobs completed. No resubmission needed."
296+
rm -f "$STATE_FILE"
297+
exit 0
298+
fi
299+
300+
if (( ITERATION >= MAX_ITER )); then
301+
echo "Maximum iterations (${MAX_ITER}) reached. No resubmission."
302+
rm -f "$STATE_FILE"
303+
exit 0
304+
fi
305+
306+
# Reconstruct original arguments for resubmission.
307+
MONITOR_SBATCH="${SCRIPTS_DIR}/monitor.sbatch"
308+
ARGS=()
309+
for d in "${DIRS[@]}"; do
310+
ARGS+=(-d "$d")
311+
done
312+
ARGS+=(-e "$EMAIL" -m "$MAX_ITER" -i "$INTERVAL" -s "$STATE_FILE")
313+
if [[ "$DRY_RUN" == true ]]; then
314+
ARGS+=(-n)
315+
fi
316+
317+
if [[ "$DRY_RUN" == true ]]; then
318+
echo "[DRY RUN] Would resubmit: sbatch --begin=now+${INTERVAL}hour --dependency=singleton ${MONITOR_SBATCH} ${ARGS[*]}"
319+
else
320+
echo "Resubmitting for next check in ${INTERVAL} hour(s)..."
321+
sbatch --begin="now+${INTERVAL}hour" --dependency=singleton "$MONITOR_SBATCH" "${ARGS[@]}"
322+
fi
323+
324+
echo "Done."

0 commit comments

Comments
 (0)