|
| 1 | +#!/usr/bin/env bash |
| 2 | + |
| 3 | +set -euo pipefail |
| 4 | + |
| 5 | +# monitor.sh |
| 6 | +# |
| 7 | +# Monitor OpenFE quickrun jobs across project directories, restart failed jobs, |
| 8 | +# and send email reports. Designed to self-resubmit via SLURM for periodic |
| 9 | +# monitoring (default: hourly for 48 iterations / 2 days). |
| 10 | +# |
| 11 | +# Status handling: |
| 12 | +# completed : no action |
| 13 | +# active : no action (job still running) |
| 14 | +# failed : restart via sbatch |
| 15 | +# error : report only (multiple matching jobs, needs manual attention) |
| 16 | +# |
| 17 | +# Options: |
| 18 | +# -d DIR Project directory to monitor (repeatable, required) |
| 19 | +# -e EMAIL Notification email (default: zhaoyangli@stanford.edu) |
| 20 | +# -m MAX_ITER Maximum iterations before stopping (default: 48) |
| 21 | +# -i HOURS Hours between checks (default: 1) |
| 22 | +# -s STATE_FILE Iteration state file (default: ~/.openfe_monitor_state) |
| 23 | +# -n Dry run: parse and report but do not restart or resubmit |
| 24 | +# -h Show help |
| 25 | + |
| 26 | +SCRIPTS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)" |
| 27 | +CHECK_STATUS="${SCRIPTS_DIR}/check_status.sh" |
| 28 | + |
| 29 | +DIRS=() |
| 30 | +EMAIL="zhaoyangli@stanford.edu" |
| 31 | +MAX_ITER=48 |
| 32 | +INTERVAL=1 |
| 33 | +STATE_FILE="${HOME}/.openfe_monitor_state" |
| 34 | +DRY_RUN=false |
| 35 | + |
| 36 | +usage() { |
| 37 | + cat <<'EOF' |
| 38 | +Usage: monitor.sh -d DIR [-d DIR ...] [OPTIONS] |
| 39 | +
|
| 40 | +Options: |
| 41 | + -d DIR Project directory to monitor (repeatable, required) |
| 42 | + -e EMAIL Notification email (default: zhaoyangli@stanford.edu) |
| 43 | + -m MAX_ITER Maximum iterations (default: 48) |
| 44 | + -i HOURS Interval between checks in hours (default: 1) |
| 45 | + -s STATE_FILE Iteration state file (default: ~/.openfe_monitor_state) |
| 46 | + -n Dry run: report only, no restarts or resubmissions |
| 47 | + -h Show this help |
| 48 | +EOF |
| 49 | +} |
| 50 | + |
| 51 | +while getopts ":d:e:m:i:s:nh" opt; do |
| 52 | + case "$opt" in |
| 53 | + d) DIRS+=("$OPTARG") ;; |
| 54 | + e) EMAIL="$OPTARG" ;; |
| 55 | + m) MAX_ITER="$OPTARG" ;; |
| 56 | + i) INTERVAL="$OPTARG" ;; |
| 57 | + s) STATE_FILE="$OPTARG" ;; |
| 58 | + n) DRY_RUN=true ;; |
| 59 | + h) usage; exit 0 ;; |
| 60 | + \?) |
| 61 | + echo "Error: invalid option -$OPTARG" >&2 |
| 62 | + usage |
| 63 | + exit 2 |
| 64 | + ;; |
| 65 | + :) |
| 66 | + echo "Error: option -$OPTARG requires an argument" >&2 |
| 67 | + usage |
| 68 | + exit 2 |
| 69 | + ;; |
| 70 | + esac |
| 71 | +done |
| 72 | + |
| 73 | +if [[ ${#DIRS[@]} -eq 0 ]]; then |
| 74 | + echo "Error: at least one -d DIR is required" >&2 |
| 75 | + usage |
| 76 | + exit 2 |
| 77 | +fi |
| 78 | + |
| 79 | +# ---- Helper functions ---- |
| 80 | + |
| 81 | +strip_ansi() { |
| 82 | + sed 's/\x1b\[[0-9;]*m//g' |
| 83 | +} |
| 84 | + |
| 85 | +send_email() { |
| 86 | + local subject="$1" |
| 87 | + local body="$2" |
| 88 | + local recipient="$3" |
| 89 | + |
| 90 | + if command -v mail >/dev/null 2>&1; then |
| 91 | + echo "$body" | mail -s "$subject" "$recipient" |
| 92 | + elif command -v sendmail >/dev/null 2>&1; then |
| 93 | + { |
| 94 | + printf 'To: %s\n' "$recipient" |
| 95 | + printf 'Subject: %s\n' "$subject" |
| 96 | + printf 'Content-Type: text/plain; charset=UTF-8\n' |
| 97 | + printf '\n' |
| 98 | + printf '%s\n' "$body" |
| 99 | + } | sendmail "$recipient" |
| 100 | + else |
| 101 | + echo "WARNING: No mail command available. Email report:" >&2 |
| 102 | + echo "Subject: $subject" >&2 |
| 103 | + echo "$body" >&2 |
| 104 | + fi |
| 105 | +} |
| 106 | + |
| 107 | +# ---- Iteration tracking ---- |
| 108 | + |
| 109 | +ITERATION=1 |
| 110 | +if [[ -f "$STATE_FILE" ]]; then |
| 111 | + ITERATION=$(( $(cat "$STATE_FILE") + 1 )) |
| 112 | +fi |
| 113 | + |
| 114 | +if (( ITERATION > MAX_ITER )); then |
| 115 | + echo "Maximum iterations ($MAX_ITER) reached. Exiting." |
| 116 | + rm -f "$STATE_FILE" |
| 117 | + exit 0 |
| 118 | +fi |
| 119 | + |
| 120 | +echo "$ITERATION" > "$STATE_FILE" |
| 121 | + |
| 122 | +NL=$'\n' |
| 123 | + |
| 124 | +echo "=== OpenFE Monitor: Iteration ${ITERATION}/${MAX_ITER} ===" |
| 125 | +echo "Timestamp: $(date)" |
| 126 | +echo "" |
| 127 | + |
| 128 | +# ---- Main loop over directories ---- |
| 129 | + |
| 130 | +TOTAL_RESTARTS=0 |
| 131 | +TOTAL_COMPLETED=0 |
| 132 | +TOTAL_ACTIVE=0 |
| 133 | +TOTAL_FAILED=0 |
| 134 | +TOTAL_ERROR=0 |
| 135 | +REPORT="" |
| 136 | + |
| 137 | +for DIR in "${DIRS[@]}"; do |
| 138 | + if [[ ! -d "$DIR" ]]; then |
| 139 | + echo " Warning: directory does not exist: $DIR" >&2 |
| 140 | + REPORT+="${NL}--- $(basename "$DIR") ---${NL} Directory not found: ${DIR}${NL}" |
| 141 | + continue |
| 142 | + fi |
| 143 | + |
| 144 | + DIR_ABS="$(cd "$DIR" && pwd -P)" |
| 145 | + DIR_NAME="$(basename "$DIR_ABS")" |
| 146 | + |
| 147 | + echo "--- Checking: ${DIR_NAME} ---" |
| 148 | + |
| 149 | + # Run check_status.sh; stdout has TSV data, stderr goes to monitor log. |
| 150 | + if ! STATUS_OUTPUT="$(bash "$CHECK_STATUS" -r "$DIR_ABS")"; then |
| 151 | + echo " Warning: check_status.sh returned non-zero for ${DIR_NAME}" >&2 |
| 152 | + REPORT+="${NL}--- ${DIR_NAME} ---${NL} check_status.sh failed${NL}" |
| 153 | + continue |
| 154 | + fi |
| 155 | + |
| 156 | + # Strip ANSI codes for parsing. |
| 157 | + CLEAN_OUTPUT="$(echo "$STATUS_OUTPUT" | strip_ansi)" |
| 158 | + |
| 159 | + # Count statuses (skip header line). |
| 160 | + COMPLETED=0 |
| 161 | + ACTIVE=0 |
| 162 | + FAILED=0 |
| 163 | + ERROR=0 |
| 164 | + TOTAL=0 |
| 165 | + |
| 166 | + FAILED_ENTRIES=() |
| 167 | + ERROR_ENTRIES=() |
| 168 | + |
| 169 | + while IFS=$'\t' read -r directory status replica info; do |
| 170 | + # Skip header and blank lines. |
| 171 | + [[ "$status" == "status" ]] && continue |
| 172 | + [[ -z "$status" ]] && continue |
| 173 | + |
| 174 | + TOTAL=$((TOTAL + 1)) |
| 175 | + |
| 176 | + case "$status" in |
| 177 | + completed) |
| 178 | + COMPLETED=$((COMPLETED + 1)) |
| 179 | + ;; |
| 180 | + active) |
| 181 | + ACTIVE=$((ACTIVE + 1)) |
| 182 | + ;; |
| 183 | + failed) |
| 184 | + FAILED=$((FAILED + 1)) |
| 185 | + # Extract tname from directory: .../results/<tname>/replica_<N> |
| 186 | + tname="$(basename "$(dirname "$directory")")" |
| 187 | + # Extract replica_id from replica column: replica_<N> |
| 188 | + replica_id="${replica#replica_}" |
| 189 | + FAILED_ENTRIES+=("${tname}|${replica_id}|${info}") |
| 190 | + ;; |
| 191 | + error) |
| 192 | + ERROR=$((ERROR + 1)) |
| 193 | + tname="$(basename "$(dirname "$directory")")" |
| 194 | + replica_id="${replica#replica_}" |
| 195 | + ERROR_ENTRIES+=("${tname}|${replica_id}|${info}") |
| 196 | + ;; |
| 197 | + esac |
| 198 | + done <<< "$CLEAN_OUTPUT" |
| 199 | + |
| 200 | + TOTAL_COMPLETED=$((TOTAL_COMPLETED + COMPLETED)) |
| 201 | + TOTAL_ACTIVE=$((TOTAL_ACTIVE + ACTIVE)) |
| 202 | + TOTAL_FAILED=$((TOTAL_FAILED + FAILED)) |
| 203 | + TOTAL_ERROR=$((TOTAL_ERROR + ERROR)) |
| 204 | + |
| 205 | + # Restart failed jobs. |
| 206 | + DIR_RESTARTS="" |
| 207 | + RESTART_COUNT=0 |
| 208 | + |
| 209 | + if (( FAILED > 0 )); then |
| 210 | + for entry in "${FAILED_ENTRIES[@]}"; do |
| 211 | + IFS='|' read -r tname replica_id info <<< "$entry" |
| 212 | + |
| 213 | + tfile="${DIR_ABS}/transformations/${tname}.json" |
| 214 | + if [[ ! -f "$tfile" ]]; then |
| 215 | + DIR_RESTARTS+=" ${tname} replica_${replica_id}: transformation file not found${NL}" |
| 216 | + continue |
| 217 | + fi |
| 218 | + |
| 219 | + if [[ "$DRY_RUN" == true ]]; then |
| 220 | + DIR_RESTARTS+=" ${tname} replica_${replica_id} -> [DRY RUN] would restart${NL}" |
| 221 | + RESTART_COUNT=$((RESTART_COUNT + 1)) |
| 222 | + else |
| 223 | + # Submit restart from the project directory where quickrun.sbatch is symlinked. |
| 224 | + RESTART_OUTPUT="$(cd "$DIR_ABS" && sbatch --array="$replica_id" quickrun.sbatch "transformations/${tname}.json" -o results/ 2>&1)" || true |
| 225 | + JOB_ID="$(echo "$RESTART_OUTPUT" | grep -oP 'Submitted batch job \K[0-9]+' || echo "unknown")" |
| 226 | + DIR_RESTARTS+=" ${tname} replica_${replica_id} -> Job ${JOB_ID}${NL}" |
| 227 | + RESTART_COUNT=$((RESTART_COUNT + 1)) |
| 228 | + fi |
| 229 | + done |
| 230 | + fi |
| 231 | + |
| 232 | + TOTAL_RESTARTS=$((TOTAL_RESTARTS + RESTART_COUNT)) |
| 233 | + |
| 234 | + # Build directory report section. |
| 235 | + DIR_REPORT="--- ${DIR_NAME} ---${NL}" |
| 236 | + DIR_REPORT+=" Completed: ${COMPLETED}/${TOTAL} Active: ${ACTIVE} Failed: ${FAILED}" |
| 237 | + if (( RESTART_COUNT > 0 )); then |
| 238 | + DIR_REPORT+=" (restarted)" |
| 239 | + fi |
| 240 | + DIR_REPORT+=" Error: ${ERROR}${NL}" |
| 241 | + |
| 242 | + if [[ -n "$DIR_RESTARTS" ]]; then |
| 243 | + DIR_REPORT+="${NL} Restarts:${NL}${DIR_RESTARTS}" |
| 244 | + fi |
| 245 | + |
| 246 | + if (( ERROR > 0 )); then |
| 247 | + DIR_REPORT+="${NL} Errors (not restarted):${NL}" |
| 248 | + for entry in "${ERROR_ENTRIES[@]}"; do |
| 249 | + IFS='|' read -r tname replica_id info <<< "$entry" |
| 250 | + DIR_REPORT+=" ${tname} replica_${replica_id}: ${info}${NL}" |
| 251 | + done |
| 252 | + fi |
| 253 | + |
| 254 | + REPORT+="${NL}${DIR_REPORT}" |
| 255 | + echo "$DIR_REPORT" |
| 256 | +done |
| 257 | + |
| 258 | +# ---- Check if all done ---- |
| 259 | + |
| 260 | +ALL_DONE=false |
| 261 | +if (( TOTAL_ACTIVE == 0 && TOTAL_FAILED == 0 && TOTAL_ERROR == 0 && TOTAL_COMPLETED > 0 )); then |
| 262 | + ALL_DONE=true |
| 263 | +fi |
| 264 | + |
| 265 | +# ---- Build and send email ---- |
| 266 | + |
| 267 | +if [[ "$ALL_DONE" == true ]]; then |
| 268 | + SUBJECT="[OpenFE Monitor] All jobs completed!" |
| 269 | + BODY="All jobs across ${#DIRS[@]} directories have completed.${NL}${NL}" |
| 270 | + BODY+="Final status (iteration ${ITERATION}/${MAX_ITER}):${NL}" |
| 271 | + BODY+="${REPORT}" |
| 272 | +else |
| 273 | + SUBJECT="[OpenFE Monitor] Iteration ${ITERATION}/${MAX_ITER}" |
| 274 | + if (( TOTAL_RESTARTS > 0 )); then |
| 275 | + SUBJECT+=" - ${TOTAL_RESTARTS} restart(s)" |
| 276 | + fi |
| 277 | + BODY="Monitoring report for iteration ${ITERATION}/${MAX_ITER}${NL}" |
| 278 | + BODY+="Timestamp: $(date)${NL}" |
| 279 | + BODY+="${REPORT}" |
| 280 | +fi |
| 281 | + |
| 282 | +echo "" |
| 283 | +echo "Sending email to ${EMAIL}..." |
| 284 | +if [[ "$DRY_RUN" == true ]]; then |
| 285 | + echo "[DRY RUN] Would send email:" |
| 286 | + echo "Subject: ${SUBJECT}" |
| 287 | + echo "$BODY" |
| 288 | +else |
| 289 | + send_email "$SUBJECT" "$BODY" "$EMAIL" |
| 290 | +fi |
| 291 | + |
| 292 | +# ---- Self-resubmit or exit ---- |
| 293 | + |
| 294 | +if [[ "$ALL_DONE" == true ]]; then |
| 295 | + echo "All jobs completed. No resubmission needed." |
| 296 | + rm -f "$STATE_FILE" |
| 297 | + exit 0 |
| 298 | +fi |
| 299 | + |
| 300 | +if (( ITERATION >= MAX_ITER )); then |
| 301 | + echo "Maximum iterations (${MAX_ITER}) reached. No resubmission." |
| 302 | + rm -f "$STATE_FILE" |
| 303 | + exit 0 |
| 304 | +fi |
| 305 | + |
| 306 | +# Reconstruct original arguments for resubmission. |
| 307 | +MONITOR_SBATCH="${SCRIPTS_DIR}/monitor.sbatch" |
| 308 | +ARGS=() |
| 309 | +for d in "${DIRS[@]}"; do |
| 310 | + ARGS+=(-d "$d") |
| 311 | +done |
| 312 | +ARGS+=(-e "$EMAIL" -m "$MAX_ITER" -i "$INTERVAL" -s "$STATE_FILE") |
| 313 | +if [[ "$DRY_RUN" == true ]]; then |
| 314 | + ARGS+=(-n) |
| 315 | +fi |
| 316 | + |
| 317 | +if [[ "$DRY_RUN" == true ]]; then |
| 318 | + echo "[DRY RUN] Would resubmit: sbatch --begin=now+${INTERVAL}hour --dependency=singleton ${MONITOR_SBATCH} ${ARGS[*]}" |
| 319 | +else |
| 320 | + echo "Resubmitting for next check in ${INTERVAL} hour(s)..." |
| 321 | + sbatch --begin="now+${INTERVAL}hour" --dependency=singleton "$MONITOR_SBATCH" "${ARGS[@]}" |
| 322 | +fi |
| 323 | + |
| 324 | +echo "Done." |
0 commit comments