Skip to content

Commit 4a88c93

Browse files
committed
jobutils/taskwrapper: use fairmq-shmmonitor
* Parse the DPL session ID and use fairmq-shmmonitor to clean up shared memory (in case it leaked). The lsof-based solution is deprecated. * This also makes it possible to monitor shared memory consumption (and report the maximum value).
1 parent 922ff46 commit 4a88c93

File tree

1 file changed

+47
-22
lines changed

1 file changed

+47
-22
lines changed

Utilities/Tools/jobutils.sh

Lines changed: 47 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -20,28 +20,9 @@
2020
# -harmonize coding style for variables
2121

2222
o2_cleanup_shm_files() {
23-
# check if we have lsof (otherwise we do nothing)
24-
which lsof &> /dev/null
25-
if [ "$?" = "0" ]; then
26-
# find shared memory files **CURRENTLY IN USE** by FairMQ
27-
USEDFILES=`lsof -u $(whoami) 2> /dev/null | grep -e \"/dev/shm/.*fmq\" | sed 's/.*\/dev/\/dev/g' | sort | uniq | tr '\n' ' '`
28-
29-
echo "${USEDFILES}"
30-
if [ ! "${USEDFILES}" ]; then
31-
# in this case we can remove everything
32-
COMMAND="find /dev/shm/ -user $(whoami) -name \"*fmq_*\" -delete 2> /dev/null"
33-
else
34-
# build exclusion list
35-
for f in ${USEDFILES}; do
36-
LOGICALOP=""
37-
[ "${EXCLPATTERN}" ] && LOGICALOP="-o"
38-
EXCLPATTERN="${EXCLPATTERN} ${LOGICALOP} -wholename ${f}"
39-
done
40-
COMMAND="find /dev/shm/ -user $(whoami) -type f -not \( ${EXCLPATTERN} \) -delete 2> /dev/null"
41-
fi
42-
eval "${COMMAND}"
43-
else
44-
echo "Can't do shared mem cleanup: lsof not found"
23+
if [ "${JOBUTILS_INTERNAL_DPL_SESSION}" ]; then
24+
# echo "cleaning up session ${JOBUTILS_INTERNAL_DPL_SESSION}"
25+
fairmq-shmmonitor -s ${JOBUTILS_INTERNAL_DPL_SESSION} -c &> /dev/null
4526
fi
4627
}
4728

@@ -74,6 +55,7 @@ taskwrapper_cleanup() {
7455
sleep 2
7556
# remove leftover shm files
7657
o2_cleanup_shm_files
58+
unset JOBUTILS_INTERNAL_DPL_SESSION
7759
}
7860

7961
taskwrapper_cleanup_handler() {
@@ -99,6 +81,25 @@ taskwrapper_cleanup_handler() {
9981
# - possibility to define timeout
10082
# - possibility to control/limit the CPU load
10183
taskwrapper() {
84+
unset JOBUTILS_INTERNAL_DPL_SESSION
85+
# Nested helper: prints the DPL session ID found on the command line of one of
# the child processes of a given PID.
# Arguments: $1 - PID whose children (as listed by childprocs) are inspected
# Outputs:   the value of the first `--session` option found among the
#            children, or an empty line when no child carries one
_parse_DPL_session ()
{
  # keep all scratch variables local; in particular start from an empty
  # session so that a stale value from an outer scope is never reported
  local childpids p cmdline session=""
  childpids=$(childprocs ${1})
  for p in ${childpids}; do
    # require a real `--session <id>` / `--session=<id>` option: matching the
    # bare word "session" would also hit unrelated commands and hand back
    # their first word as a bogus session ID
    if cmdline=$(ps -o command "${p}" | grep -v "COMMAND" | grep -e '--session[ =]'); then
      # drop everything up to and including the option, keep the next token
      session=$(printf '%s\n' "${cmdline}" | sed 's/.*--session[ =]\{1,\}//' | awk '{print $1; exit}')
      if [ "${session}" ]; then
        break
      fi
    fi
  done
  echo "${session}"
}
102+
102103
local logfile=$1
103104
shift 1
104105
local command="$*"
@@ -215,6 +216,17 @@ taskwrapper() {
215216
ps -p $PID > /dev/null
216217
[ $? == 1 ] && break
217218

219+
if [ "${JOBUTILS_MONITORMEM}" ]; then
220+
if [ "${JOBUTILS_INTERNAL_DPL_SESSION}" ]; then
221+
MAX_FMQ_SHM=${MAX_FMQ_SHM:-0}
222+
text=$(timeout 1 fairmq-shmmonitor --interval 100 -v -s ${JOBUTILS_INTERNAL_DPL_SESSION})
223+
line=$(echo ${text} | tr '[' '\n[' | grep "^0" | tail -n1)
224+
CURRENT_FMQ_SHM=$(echo ${line} | sed 's/.*used://g')
225+
# echo "current shm ${CURRENT_FMQ_SHM}"
226+
MAX_FMQ_SHM=$(awk -v "t=${CURRENT_FMQ_SHM}" -v "s=${MAX_FMQ_SHM}" 'BEGIN { if(t>=s) { print t; } else { print s; } }')
227+
fi
228+
fi
229+
218230
if [ "${JOBUTILS_MONITORCPU}" ] || [ "${JOBUTILS_LIMITLOAD}" ]; then
219231
# NOTE: The following section is "a bit" compute intensive and currently not optimized
220232
# A careful evaluation of awk vs bc or other tools might be needed -- or a move to a more
@@ -330,6 +342,12 @@ taskwrapper() {
330342
fi
331343
fi
332344

345+
# Try to find out DPL session ID
346+
# if [ -z "${JOBUTILS_INTERNAL_DPL_SESSION}" ]; then
347+
JOBUTILS_INTERNAL_DPL_SESSION=$(_parse_DPL_session ${PID})
348+
# echo "got session ${JOBUTILS_INTERNAL_DPL_SESSION}"
349+
# fi
350+
333351
# sleep for some time (can be customized for power user)
334352
sleep ${JOBUTILS_WRAPPER_SLEEP:-1}
335353

@@ -387,6 +405,13 @@ taskwrapper() {
387405
[[ ! $- == *i* ]] && exit ${RC}
388406
fi
389407
fi
408+
if [ "${JOBUTILS_MONITORMEM}" ]; then
409+
# convert bytes in MB
410+
MAX_FMQ_SHM=${MAX_FMQ_SHM:-0}
411+
MAX_FMQ_SHM=$(awk -v "s=${MAX_FMQ_SHM}" 'BEGIN { print s/(1024.*1024) }')
412+
echo "PROCESS MAX FMQ_SHM = ${MAX_FMQ_SHM}" >> ${logfile}
413+
fi
414+
unset JOBUTILS_INTERNAL_DPL_SESSION
390415
return ${RC}
391416
}
392417

0 commit comments

Comments
 (0)