Skip to content

Commit 125983e

Browse files
refactor: remove replica count option from check_status.sh and update related tests
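Removed the -n option for specifying the number of replicas from check_status.sh, simplifying the script's usage. The script now always auto-detects the replica count per transformation: it takes the highest replica ID found among the existing replica_* directories and the active SLURM array task IDs, and checks replica_0 through that ID. Updated the corresponding tests so they no longer pass a replica count argument; tests that previously relied on -n to report a replica with no result now create an empty replica directory so that auto-detection finds it. Because -n is gone, a transformation with neither replica directories nor active jobs produces no rows in the output.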
1 parent 6135bf8 commit 125983e

2 files changed

Lines changed: 51 additions & 49 deletions

File tree

scripts/openfe/runtime/check_status.sh

Lines changed: 26 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ set -euo pipefail
1414
#
1515
# Options:
1616
# -j N number of parallel workers (default: 8)
17-
# -n N number of replicas per transformation (default: auto-detect)
1817
# -r DIR root directory to search (default: .)
1918
# -R restart failed replicas via sbatch
2019
# -h show help
@@ -23,26 +22,23 @@ set -euo pipefail
2322

2423
JOBS=8
2524
ROOT="."
26-
REPLICAS=""
2725
RESTART=false
2826

2927
usage() {
3028
cat <<'EOF'
31-
Usage: check_status.sh [-j N] [-n REPLICAS] [-r ROOT] [-R]
29+
Usage: check_status.sh [-j N] [-r ROOT] [-R]
3230
3331
Options:
3432
-j N Number of parallel workers (default: 8)
35-
-n REPLICAS Number of replicas per transformation (default: auto-detect)
3633
-r ROOT Root directory (default: .)
3734
-R Restart failed replicas via sbatch
3835
-h Show this help
3936
EOF
4037
}
4138

42-
while getopts ":j:n:r:Rh" opt; do
39+
while getopts ":j:r:Rh" opt; do
4340
case "$opt" in
4441
j) JOBS="$OPTARG" ;;
45-
n) REPLICAS="$OPTARG" ;;
4642
r) ROOT="$OPTARG" ;;
4743
R) RESTART=true ;;
4844
h)
@@ -71,11 +67,6 @@ for cmd in parallel squeue scontrol; do
7167
}
7268
done
7369

74-
if [[ -n "$REPLICAS" ]] && ! [[ "$REPLICAS" =~ ^[1-9][0-9]*$ ]]; then
75-
echo "Error: -n REPLICAS must be a positive integer" >&2
76-
exit 2
77-
fi
78-
7970
# ---- Setup ----
8071

8172
SCRIPTS_DIR="$(cd "$(dirname "$(readlink -f "$0")")" && pwd -P)"
@@ -137,34 +128,36 @@ build_active_jobs() {
137128
# ---- Functions: work enumeration ----
138129

139130
# List all (transform_name, replica_id) pairs that need checking.
131+
# Replica count is auto-detected per transformation by taking the maximum
132+
# replica ID from two sources:
133+
# 1. Existing replica_* directories under results/<tname>/
134+
# 2. Active SLURM array task IDs for that transformation in squeue
135+
# The final count is max_id + 1, checking replica_0 through replica_{max_id}.
140136
enumerate_work() {
141-
local transforms_dir="$1" results_dir="$2" active_tsv="$3" num_replicas="$4"
137+
local transforms_dir="$1" results_dir="$2" active_tsv="$3"
142138

143139
find "$transforms_dir" -maxdepth 1 -name '*.json' -type f | sort |
144140
while IFS= read -r tfile; do
145141
tname="$(basename "$tfile" .json)"
146142

147-
if [[ -n "$num_replicas" ]]; then
148-
count="$num_replicas"
149-
else
150-
# Auto-detect replica count from results dir + active jobs.
151-
max_id=-1
152-
153-
if [[ -d "${results_dir}/${tname}" ]]; then
154-
for rdir in "${results_dir}/${tname}"/replica_*; do
155-
[[ -d "$rdir" ]] || continue
156-
rid="${rdir##*replica_}"
157-
[[ "$rid" =~ ^[0-9]+$ ]] && ((rid > max_id)) && max_id=$rid
158-
done
159-
fi
160-
161-
while IFS=$'\t' read -r tn tid _ _; do
162-
[[ "$tn" == "$tname" && "$tid" =~ ^[0-9]+$ ]] && ((tid > max_id)) && max_id=$tid
163-
done <"$active_tsv"
164-
165-
count=$((max_id + 1))
143+
max_id=-1
144+
145+
# Source 1: replica directories on disk.
146+
if [[ -d "${results_dir}/${tname}" ]]; then
147+
for rdir in "${results_dir}/${tname}"/replica_*; do
148+
[[ -d "$rdir" ]] || continue
149+
rid="${rdir##*replica_}"
150+
[[ "$rid" =~ ^[0-9]+$ ]] && ((rid > max_id)) && max_id=$rid
151+
done
166152
fi
167153

154+
# Source 2: active SLURM array task IDs.
155+
while IFS=$'\t' read -r tn tid _ _; do
156+
[[ "$tn" == "$tname" && "$tid" =~ ^[0-9]+$ ]] && ((tid > max_id)) && max_id=$tid
157+
done <"$active_tsv"
158+
159+
count=$((max_id + 1))
160+
168161
for ((i = 0; i < count; i++)); do
169162
printf '%s\t%s\n' "$tname" "$i"
170163
done
@@ -304,11 +297,11 @@ restart_failed() {
304297
build_active_jobs "$ROOT_ABS" >"$ACTIVE_FILE"
305298

306299
shopt -s nullglob
307-
enumerate_work "$TRANSFORMS_DIR" "$RESULTS_DIR" "$ACTIVE_FILE" "$REPLICAS" \
300+
enumerate_work "$TRANSFORMS_DIR" "$RESULTS_DIR" "$ACTIVE_FILE" \
308301
>"${TMPDIR_MAIN}/work.tsv"
309302

310303
if [[ ! -s "${TMPDIR_MAIN}/work.tsv" ]]; then
311-
echo "No replicas found. Use -n to specify replica count." >&2
304+
echo "No replicas found." >&2
312305
exit 0
313306
fi
314307

tests/scripts/openfe/runtime/test_check_status.py

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -73,15 +73,19 @@ def _place_result(
7373
return result_json
7474

7575

76+
def _ensure_replica_dir(results_dir: Path, tname: str, replica_id: int) -> Path:
77+
"""Create an empty replica directory (no result JSON) for auto-detect."""
78+
replica_dir = results_dir / tname / f"replica_{replica_id}"
79+
replica_dir.mkdir(parents=True, exist_ok=True)
80+
return replica_dir
81+
82+
7683
def _run(
7784
slurm_env: dict[str, Path],
7885
*extra_args: str,
7986
root: Path,
80-
replicas: int | None = None,
8187
) -> subprocess.CompletedProcess[str]:
8288
cmd = ["bash", str(CHECK_STATUS), "-r", str(root)]
83-
if replicas is not None:
84-
cmd += ["-n", str(replicas)]
8589
cmd += list(extra_args)
8690
return subprocess.run(
8791
cmd,
@@ -118,7 +122,7 @@ def test_valid_result_shows_ddg(
118122
) -> None:
119123
_place_result(openfe_workspace["results_dir"], "rbfe_A_complex_B_complex", 0, valid=True)
120124

121-
result = _run(slurm_env, root=openfe_workspace["root"], replicas=1)
125+
result = _run(slurm_env, root=openfe_workspace["root"])
122126
assert result.returncode == 0
123127

124128
rows = _parse_rows(result.stdout)
@@ -137,7 +141,7 @@ def test_null_estimate_is_failed(
137141
) -> None:
138142
_place_result(openfe_workspace["results_dir"], "rbfe_A_complex_B_complex", 0, valid=False)
139143

140-
result = _run(slurm_env, root=openfe_workspace["root"], replicas=1)
144+
result = _run(slurm_env, root=openfe_workspace["root"])
141145
assert result.returncode == 0
142146

143147
rows = _parse_rows(result.stdout)
@@ -152,7 +156,8 @@ class TestNoResultNoJob:
152156
def test_missing_result_is_failed(
153157
self, slurm_env: dict[str, Path], openfe_workspace: dict[str, Path]
154158
) -> None:
155-
result = _run(slurm_env, root=openfe_workspace["root"], replicas=1)
159+
_ensure_replica_dir(openfe_workspace["results_dir"], "rbfe_A_complex_B_complex", 0)
160+
result = _run(slurm_env, root=openfe_workspace["root"])
156161
assert result.returncode == 0
157162

158163
rows = _parse_rows(result.stdout)
@@ -178,7 +183,8 @@ def test_active_job_in_squeue(
178183
self, slurm_env: dict[str, Path], openfe_workspace: dict[str, Path]
179184
) -> None:
180185
self._setup_active(slurm_env, openfe_workspace)
181-
result = _run(slurm_env, root=openfe_workspace["root"], replicas=1)
186+
_ensure_replica_dir(openfe_workspace["results_dir"], "rbfe_A_complex_B_complex", 0)
187+
result = _run(slurm_env, root=openfe_workspace["root"])
182188
assert result.returncode == 0
183189

184190
rows = _parse_rows(result.stdout)
@@ -191,7 +197,8 @@ def test_no_yaml_shows_zero_percent(
191197
self, slurm_env: dict[str, Path], openfe_workspace: dict[str, Path]
192198
) -> None:
193199
self._setup_active(slurm_env, openfe_workspace)
194-
result = _run(slurm_env, root=openfe_workspace["root"], replicas=1)
200+
_ensure_replica_dir(openfe_workspace["results_dir"], "rbfe_A_complex_B_complex", 0)
201+
result = _run(slurm_env, root=openfe_workspace["root"])
195202

196203
rows = _parse_rows(result.stdout)
197204
assert "0%" in rows[0]["info"]
@@ -222,7 +229,7 @@ def test_yaml_progress(
222229
" estimated_time_remaining: 1 day, 12:00:00\n"
223230
)
224231

225-
result = _run(slurm_env, root=openfe_workspace["root"], replicas=1)
232+
result = _run(slurm_env, root=openfe_workspace["root"])
226233
rows = _parse_rows(result.stdout)
227234
assert "25.0%" in rows[0]["info"]
228235
assert "ETA:" in rows[0]["info"]
@@ -246,7 +253,8 @@ def test_error_on_duplicate_jobs(
246253
f"-o {root_abs}/results\n"
247254
)
248255

249-
result = _run(slurm_env, root=openfe_workspace["root"], replicas=1)
256+
_ensure_replica_dir(openfe_workspace["results_dir"], "rbfe_A_complex_B_complex", 0)
257+
result = _run(slurm_env, root=openfe_workspace["root"])
250258
assert result.returncode == 0
251259

252260
rows = _parse_rows(result.stdout)
@@ -264,9 +272,9 @@ def test_mixed_statuses(
264272
tname = "rbfe_A_complex_B_complex"
265273
_place_result(openfe_workspace["results_dir"], tname, 0, valid=True)
266274
_place_result(openfe_workspace["results_dir"], tname, 1, valid=False)
267-
# replica 2: no result, no job
275+
_ensure_replica_dir(openfe_workspace["results_dir"], tname, 2)
268276

269-
result = _run(slurm_env, root=openfe_workspace["root"], replicas=3)
277+
result = _run(slurm_env, root=openfe_workspace["root"])
270278
assert result.returncode == 0
271279

272280
rows = _parse_rows(result.stdout)
@@ -304,9 +312,9 @@ def test_restart_submits_failed(
304312
tname = "rbfe_A_complex_B_complex"
305313
_place_result(openfe_workspace["results_dir"], tname, 0, valid=True)
306314
_place_result(openfe_workspace["results_dir"], tname, 1, valid=False)
307-
# replica 2: missing
315+
_ensure_replica_dir(openfe_workspace["results_dir"], tname, 2)
308316

309-
result = _run(slurm_env, "-R", root=openfe_workspace["root"], replicas=3)
317+
result = _run(slurm_env, "-R", root=openfe_workspace["root"])
310318
assert result.returncode == 0
311319

312320
sbatch_log = slurm_env["sbatch"].read_text().strip()
@@ -322,7 +330,8 @@ def test_restart_submits_failed(
322330
def test_no_restart_without_flag(
323331
self, slurm_env: dict[str, Path], openfe_workspace: dict[str, Path]
324332
) -> None:
325-
result = _run(slurm_env, root=openfe_workspace["root"], replicas=1)
333+
_ensure_replica_dir(openfe_workspace["results_dir"], "rbfe_A_complex_B_complex", 0)
334+
result = _run(slurm_env, root=openfe_workspace["root"])
326335
assert result.returncode == 0
327336

328337
sbatch_log = slurm_env["sbatch"].read_text().strip()
@@ -336,7 +345,7 @@ def test_missing_transforms_dir(self, slurm_env: dict[str, Path], tmp_path: Path
336345
empty = tmp_path / "empty"
337346
empty.mkdir()
338347

339-
result = _run(slurm_env, root=empty, replicas=1)
348+
result = _run(slurm_env, root=empty)
340349
assert result.returncode != 0
341350
assert "transformations directory not found" in result.stderr
342351

0 commit comments

Comments
 (0)