Skip to content

Commit f355002

Browse files
ChSonnabend authored and
davidrohr committed
FST checks to avoid node crashes for SLURM, MI100 and FMQ segments
1 parent 821e1a9 commit f355002

File tree

1 file changed

+34
-0
lines changed

1 file changed

+34
-0
lines changed

prodtests/full-system-test/start_tmux.sh

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,39 @@
11
#!/bin/bash
22

3+
### --- Early safety checks ----------------------------------------------------
4+
5+
# Skip checks if FST_RUN_WITHOUT_CHECKS=1
6+
if [[ "${FST_RUN_WITHOUT_CHECKS:-0}" != "1" ]]; then
7+
8+
# 1. Abort if running inside a Slurm shell
9+
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
10+
echo "ERROR: This script must not be run inside a Slurm job (SLURM_JOB_ID=${SLURM_JOB_ID})." >&2
11+
echo "Please run it from a normal ssh shell." >&2
12+
exit 1
13+
fi
14+
15+
# 2. Abort if FMQ shared-memory files exist in /dev/shm
16+
if compgen -G "/dev/shm/fmq*" > /dev/null; then
17+
echo "ERROR: Found existing /dev/shm/fmq* files." >&2
18+
echo "Please clean them manually before running the FST." >&2
19+
exit 1
20+
fi
21+
22+
# 3. MI100 check: detect MI100 GPU but EPN_NODE_MI100 not set or set to 0
23+
if lspci | grep -qi "MI100"; then
24+
if [[ -z "${EPN_NODE_MI100:-}" || "${EPN_NODE_MI100}" == "0" ]]; then
25+
echo "ERROR: MI100 GPU detected on this node, but EPN_NODE_MI100 is not set to 1." >&2
26+
echo "Please export EPN_NODE_MI100=1 before running this script." >&2
27+
echo "See installation instructions here:" >&2
28+
echo " https://alice-pdp-operations.docs.cern.ch/o2install/#install-and-validate-the-new-o2pdpsuite-on-one-production-epn-using-the-fst" >&2
29+
exit 1
30+
fi
31+
fi
32+
33+
fi
34+
35+
### ---------------------------------------------------------------------------
36+
337
if [ "0$1" != "0dd" ] && [ "0$1" != "0rr" ] && [ "0$1" != "0tf" ]; then
438
echo Please indicate whether to start with raw-reader [rr] or with DataDistribution [dd] or TfReader [tf] 1>&2
539
exit 1

0 commit comments

Comments
 (0)