Skip to content

Commit 02dfd45

Browse files
committed
Improvements for grid_submit
* ability to just wait for any successfully finished job in a split
* some cleanup/renaming
1 parent 1502a4d commit 02dfd45

File tree

1 file changed

+21
-15
lines changed

1 file changed

+21
-15
lines changed

GRID/utils/grid_submit.sh

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ while [ $# -gt 0 ] ; do
216216
--prodsplit) PRODSPLIT=$2; shift 2 ;; # allows to set JDL production split level (useful to easily replicate workflows)
217217
--singularity) SINGULARITY=ON; shift 1 ;; # run everything inside singularity
218218
--wait) WAITFORALIEN=ON; shift 1 ;; #wait for alien jobs to finish
219+
--wait-any) WAITFORALIENANY=ON; WAITFORALIEN=ON; shift 1 ;; #wait for any good==done alien jobs to return
219220
--outputspec) OUTPUTSPEC=$2; shift 2 ;; #provide comma-separated list of JDL file specs to be put as part of JDL Output field (example '"*.log@disk=1","*.root@disk=2"')
220221
-h) Usage ; exit ;;
221222
--help) Usage ; exit ;;
@@ -227,6 +228,7 @@ export JOBTTL
227228
export JOBLABEL
228229
export MATTERMOSTHOOK
229230
export CONTROLSERVER
231+
230232
[[ $PRODSPLIT -gt 100 ]] && echo "Production split needs to be smaller than 100 for the moment" && exit 1
231233

232234
# check for presence of jq (needed in code path to fetch output files)
@@ -270,9 +272,10 @@ pok "Set the job name by running $0 <scriptname> <jobname>"
270272
# Generate local workdir
271273
#
272274
if [[ "${ONGRID}" == "0" ]]; then
273-
WORKDIR=${WORKDIR:-/tmp/alien_work/$(basename "$MY_JOBWORKDIR")}
274-
[ ! -d "${WORKDIR}" ] && mkdir -p ${WORKDIR}
275-
[ ! "${CONTINUE_WORKDIR}" ] && cp "${MY_JOBSCRIPT}" "${WORKDIR}/alien_jobscript.sh"
275+
GRID_SUBMIT_WORKDIR=${GRID_SUBMIT_WORKDIR:-/tmp/alien_work/$(basename "$MY_JOBWORKDIR")}
276+
echo "WORKDIR FOR THIS JOB IS ${GRID_SUBMIT_WORKDIR}"
277+
[ ! -d "${GRID_SUBMIT_WORKDIR}" ] && mkdir -p ${GRID_SUBMIT_WORKDIR}
278+
[ ! "${CONTINUE_WORKDIR}" ] && cp "${MY_JOBSCRIPT}" "${GRID_SUBMIT_WORKDIR}/alien_jobscript.sh"
276279
fi
277280

278281
#
@@ -349,7 +352,7 @@ if [[ "${IS_ALIEN_JOB_SUBMITTER}" ]]; then
349352
cd "$(dirname "$0")"
350353
THIS_SCRIPT="$PWD/$(basename "$0")"
351354

352-
cd "${WORKDIR}"
355+
cd "${GRID_SUBMIT_WORKDIR}"
353356

354357
QUOT='"'
355358
# ---- Generate JDL ----------------
@@ -436,11 +439,18 @@ EOF
436439
continue
437440
fi
438441
let counter=0 # reset counter
439-
JOBSTATUS=$(alien.py ps -j ${MY_JOBID} | awk '//{print $3}')
440-
# echo -ne "Waiting for jobs to return; Last status ${JOBSTATUS}"
442+
443+
# this is the global job status (a D here means the production is done)
444+
JOBSTATUS=$(alien.py ps -j ${MY_JOBID} | awk '//{print $3}') # this is the global job status
445+
# in addition we may query individual splits
446+
if [ "${WAITFORALIENANY}" ]; then # flag set by the --wait-any option; when set, a single finished split is enough
447+
if ALIENPY_JSON=true alien.py ps -a -m "${MY_JOBID}" | grep "status" | grep -q "DONE"; then # true when any "status" line of the listing for this masterjob contains DONE
448+
JOBSTATUS="D" # a D here means == some job finished successfully (overrides the global status queried above)
449+
fi
450+
fi
441451

442452
if [ "${JOBSTATUS}" == "D" ]; then
443-
echo "Job done"
453+
echo "${WAITFORALIENANY:+At least one }Job(s) done"
444454
WAITFORALIEN="" # guarantees to go out of outer while loop
445455

446456
if [ "${FETCHOUTPUT}" ]; then
@@ -473,10 +483,6 @@ EOF
473483
done
474484
fi
475485
fi
476-
if [[ "${FOO:0:1}" == [EK] ]]; then
477-
echo "Job error occured"
478-
exit 1
479-
fi
480486
done
481487
# get the job data products locally if requested
482488

@@ -490,7 +496,7 @@ if [[ ${SINGULARITY} ]]; then
490496
# if singularity was asked we restart this script within a container
491497
# it's actually much like the GRID mode --> which is why we set JALIEN_TOKEN_CERT
492498
set -x
493-
cp $0 ${WORKDIR}
499+
cp $0 ${GRID_SUBMIT_WORKDIR}
494500

495501
# detect architecture (ARM or X86)
496502
ARCH=$(uname -i)
@@ -508,15 +514,15 @@ if [[ ${SINGULARITY} ]]; then
508514
APPTAINER_EXEC="/cvmfs/alice.cern.ch/containers/bin/apptainer/${ARCH}/current/bin/apptainer"
509515

510516
# we can actually analyse the local JDL to find the package and set it up for the container
511-
${APPTAINER_EXEC} exec -C -B /cvmfs:/cvmfs,${WORKDIR}:/workdir --pwd /workdir -C ${CONTAINER} /workdir/grid_submit.sh \
517+
${APPTAINER_EXEC} exec -C -B /cvmfs:/cvmfs,${GRID_SUBMIT_WORKDIR}:/workdir --pwd /workdir -C ${CONTAINER} /workdir/grid_submit.sh \
512518
${CONTINUE_WORKDIR:+"-c ${CONTINUE_WORKDIR}"} --local ${O2TAG:+--o2tag ${O2TAG}} --ttl ${JOBTTL} --label ${JOBLABEL:-label} ${MATTERMOSTHOOK:+--mattermost ${MATTERMOSTHOOK}} ${CONTROLSERVER:+--controlserver ${CONTROLSERVER}}
513519
set +x
514520
exit $?
515521
fi
516522

517523
if [[ "${ONGRID}" == 0 ]]; then
518-
banner "Executing job in directory ${WORKDIR}"
519-
cd "${WORKDIR}" 2> /dev/null
524+
banner "Executing job in directory ${GRID_SUBMIT_WORKDIR}"
525+
cd "${GRID_SUBMIT_WORKDIR}" 2> /dev/null
520526
fi
521527

522528
exec &> >(tee -a alien_log_${ALIEN_PROC_ID:-0}.txt)

0 commit comments

Comments
 (0)