Skip to content

Commit 5e58122

Browse files
committed
Several tooling improvements
* cleanup, fixes and adjustments in grid_submit
* a new script getReproducerScript.sh, which downloads a GRID reproducer script and modifies it to ensure automatic execution in apptainer:
  - the script needs an ALIEN_PROC_ID
  - it creates a script reproducer_script_${ALIEN_PROC_ID}.sh which executes the job of ALIEN_PROC_ID within an apptainer environment (just as on the GRID)
* anchorMC now creates a reproducer script automatically
1 parent 691d4d5 commit 5e58122

File tree

3 files changed

+110
-5
lines changed

3 files changed

+110
-5
lines changed

GRID/utils/getReproducerScript.sh

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
#!/bin/bash
#
# getReproducerScript.sh
#
# Downloads the job-environment ("reproducer") script of a GRID job from
# MonaLisa and patches it so that, when executed locally, it re-launches
# itself inside an apptainer container.
#
# Usage:   getReproducerScript.sh <ALIEN_PROC_ID>
# Output:  reproducer_script_<ALIEN_PROC_ID>.sh in the current directory
# Needs:   a token cert/key pair, taken from JALIEN_TOKEN_CERT and
#          JALIEN_TOKEN_KEY, or from
#          ${TMPDIR:-/tmp}/tokencert_<uid>.pem and tokenkey_<uid>.pem
# Returns: 0 on success, non-zero if the argument is invalid, the tokens are
#          missing, or the download/patching fails

ALIEN_PID=$1

# The id is interpolated into a URL and into sed programs below, so only
# accept a plain number.
if [[ ! "${ALIEN_PID}" =~ ^[0-9]+$ ]]; then
  echo "Usage: $0 <ALIEN_PROC_ID> (numeric job id)" >&2
  exit 1
fi

# Locate the token certificate and key
if [[ -n "${JALIEN_TOKEN_CERT}" ]]; then
  TOKENCERT=${JALIEN_TOKEN_CERT}
  TOKENKEY=${JALIEN_TOKEN_KEY}
else
  token_dir=${TMPDIR:-/tmp}
  user_id=$(id -u)
  if [[ -f "${token_dir}/tokencert_${user_id}.pem" ]]; then
    TOKENCERT="${token_dir}/tokencert_${user_id}.pem"
  fi
  if [[ -f "${token_dir}/tokenkey_${user_id}.pem" ]]; then
    TOKENKEY="${token_dir}/tokenkey_${user_id}.pem"
  fi
fi

# Both halves of the token are handed to curl, so both must be present
if [[ -z "${TOKENCERT}" || -z "${TOKENKEY}" ]]; then
  echo "This needs a tokencert and tokenkey file in the tmp folder" >&2
  exit 1
fi

SCRIPT=reproducer_script_${ALIEN_PID}.sh

# talk to MonaLisa to fetch the script provided by Costin
# --fail: an HTTP error must not leave an error page behind that would then be
#         patched and marked executable as if it were a job script
# NOTE(review): --insecure disables TLS peer verification; kept from the
#               original invocation.
if ! curl "https://alimonitor.cern.ch/users/jobenv.jsp?pid=${ALIEN_PID}" \
  -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36' \
  --fail --insecure --cert "${TOKENCERT}" --key "${TOKENKEY}" -o "${SCRIPT}"; then
  echo "Failed to download the reproducer script for job ${ALIEN_PID}" >&2
  exit 1
fi

if [[ ! -s "${SCRIPT}" ]]; then
  echo "Downloaded reproducer script ${SCRIPT} is empty" >&2
  exit 1
fi

# Define the Apptainer injection block which makes sure
# that the job script is automatically executed in apptainer.
# The block is single-quoted: everything in it is evaluated when the
# reproducer runs, not now. It must not contain single quotes.
# #ALIEN_PID# is a placeholder replaced by sed further below.
INJECTION='
export ALIEN_PID=#ALIEN_PID#
# Check if the script is running inside an Apptainer (Singularity) container
if [ -z "$APPTAINER_NAME" ] && [ -z "$SINGULARITY_NAME" ]; then
  # Relaunch this script inside the container

  export WORKDIR=/tmp/foo-${ALIEN_PID}
  # -p: do not fail when the reproducer is re-run with an existing workdir
  mkdir -p "${WORKDIR}/tmp" || exit 1

  # - copy the certificate token into /tmp/ inside the container
  cp "${TMPDIR:-/tmp}"/token*pem "${WORKDIR}/tmp"

  # - copy the job script into workdir
  cp "$0" "${WORKDIR}" || exit 1

  # detect architecture (ARM or X86)
  # uname -m instead of -i: -i is non-portable and may print "unknown"
  ARCH=$(uname -m)
  if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "x86_64" ]; then
    echo "Detected hardware architecture : $ARCH"
  else
    echo "Invalid architecture ${ARCH} detected. Exiting"
    exit 1
  fi

  CONTAINER="/cvmfs/alice.cern.ch/containers/fs/apptainer/compat_el9-${ARCH}"
  APPTAINER_EXEC="/cvmfs/alice.cern.ch/containers/bin/apptainer/${ARCH}/current/bin/apptainer"

  # we can actually analyse the local JDL to find the package and set it up for the container
  # The copy in the workdir is addressed through its in-container path so that
  # the relaunch works no matter how this script was invoked on the host.
  "${APPTAINER_EXEC}" exec -C -B "/cvmfs:/cvmfs,${WORKDIR}:/workdir,${WORKDIR}/tmp:/tmp" --pwd /workdir "${CONTAINER}" "/workdir/$(basename "$0")"
  exit $?
fi
'

# Inject the block after the first line (shebang).
# head/printf/tail is used so the block is copied verbatim.
PATCHED=$(mktemp) || { echo "Could not create a temporary file" >&2; exit 1; }
trap 'rm -f -- "${PATCHED}"' EXIT
if ! {
  head -n 1 "${SCRIPT}" \
    && printf '%s\n' "${INJECTION}" \
    && tail -n +2 "${SCRIPT}"
} > "${PATCHED}"; then
  echo "Could not patch ${SCRIPT}" >&2
  exit 1
fi
# write back in place so that the permissions of ${SCRIPT} are preserved
cat "${PATCHED}" > "${SCRIPT}" || exit 1

# take out sandboxing structure and replace the PID placeholder
sed -i \
  -e "/echo \"Create a fresh sandbox at every attempt of running the job: alien-job-${ALIEN_PID}\"/d" \
  -e "/rm -rf alien-job-${ALIEN_PID}/d" \
  -e "/mkdir -p alien-job-${ALIEN_PID}\/tmp/d" \
  -e "/cd alien-job-${ALIEN_PID}/d" \
  -e "s/#ALIEN_PID#/${ALIEN_PID}/g" \
  "${SCRIPT}" || exit 1

chmod +x "${SCRIPT}"

GRID/utils/grid_submit.sh

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -370,7 +370,7 @@ EOF
370370
[ $ERROROUTPUTSPEC ] && echo "OutputErrorE = {"${ERROROUTPUTSPEC}"};" >> "${MY_JOBNAMEDATE}.jdl" # add error output files
371371
[ $IMAGESPEC ] && echo "DebugTag = {\"${IMAGESPEC}\"};" >> "${MY_JOBNAMEDATE}.jdl" # use special singularity image to run job
372372
# echo "Requirements = {"${REQUIREMENTSSPEC}"} >> "${MY_JOBNAMEDATE}.jdl"
373-
[ $REQUIRESPEC ] && echo "Requirements = ${REQUIRESPEC}" >> "${MY_JOBNAMEDATE}.jdl"
373+
[ "$REQUIRESPEC" ] && echo "Requirements = ${REQUIRESPEC}" >> "${MY_JOBNAMEDATE}.jdl"
374374

375375
# "output_arch.zip:output/*@disk=2",
376376
# "checkpoint*.tar@disk=2"
@@ -383,6 +383,8 @@ EOF
383383
(
384384
# assemble all GRID interaction in a single script / transaction
385385
[ -f "${command_file}" ] && rm ${command_file}
386+
echo "user ${MY_USER}" >> ${command_file}
387+
echo "whoami" >> ${command_file}
386388
[ ! "${CONTINUE_WORKDIR}" ] && echo "rmdir ${MY_JOBWORKDIR}" >> ${command_file} # remove existing job dir
387389
# echo "mkdir ${MY_BINDIR}" >> ${command_file} # create bindir
388390
echo "mkdir ${MY_JOBPREFIX}" >> ${command_file} # create job output prefix
@@ -434,7 +436,7 @@ EOF
434436
continue
435437
fi
436438
let counter=0 # reset counter
437-
JOBSTATUS=$(alien.py ps -j ${MY_JOBID} | awk '//{print $4}')
439+
JOBSTATUS=$(alien.py ps -j ${MY_JOBID} | awk '//{print $3}')
438440
# echo -ne "Waiting for jobs to return; Last status ${JOBSTATUS}"
439441

440442
if [ "${JOBSTATUS}" == "D" ]; then
@@ -489,7 +491,24 @@ if [[ ${SINGULARITY} ]]; then
489491
# it's actually much like the GRID mode --> which is why we set JALIEN_TOKEN_CERT
490492
set -x
491493
cp $0 ${WORKDIR}
492-
singularity exec -C -B /cvmfs:/cvmfs,${WORKDIR}:/workdir --env JALIEN_TOKEN_CERT="foo" --pwd /workdir /cvmfs/alice.cern.ch/containers/fs/singularity/centos7 $0 \
494+
495+
# detect architecture (ARM or X86)
496+
ARCH=$(uname -i)
497+
if [ "$ARCH" == "aarch64" ] || [ "$ARCH" == "x86_64" ]; then
498+
echo "Detected hardware architecture : $ARCH"
499+
else
500+
echo "Invalid architecture ${ARCH} detected. Exiting"
501+
exit 1
502+
fi
503+
if [ "$ARCH" == "aarch64" ]; then
504+
ISAARCH64="1"
505+
fi
506+
507+
CONTAINER="/cvmfs/alice.cern.ch/containers/fs/apptainer/compat_el9-${ARCH}"
508+
APPTAINER_EXEC="/cvmfs/alice.cern.ch/containers/bin/apptainer/${ARCH}/current/bin/apptainer"
509+
510+
# we can actually analyse the local JDL to find the package and set it up for the container
511+
${APPTAINER_EXEC} exec -C -B /cvmfs:/cvmfs,${WORKDIR}:/workdir --pwd /workdir -C ${CONTAINER} /workdir/grid_submit.sh \
493512
${CONTINUE_WORKDIR:+"-c ${CONTINUE_WORKDIR}"} --local ${O2TAG:+--o2tag ${O2TAG}} --ttl ${JOBTTL} --label ${JOBLABEL:-label} ${MATTERMOSTHOOK:+--mattermost ${MATTERMOSTHOOK}} ${CONTROLSERVER:+--controlserver ${CONTROLSERVER}}
494513
set +x
495514
exit $?
@@ -515,7 +534,6 @@ banner "Limits"
515534
ulimit -a
516535

517536
banner "OS detection"
518-
lsb_release -a || true
519537
cat /etc/os-release || true
520538
cat /etc/redhat-release || true
521539

MC/run/ANCHOR/anchorMC.sh

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,13 @@ fi
144144
[ -z "${CYCLE}" ] && { echo_error "Set CYCLE" ; exit 1 ; }
145145
[ -z "${PRODSPLIT}" ] && { echo_error "Set PRODSPLIT" ; exit 1 ; }
146146

147+
148+
# this generates an exact reproducer script for this job
149+
# that can be used locally for debugging etc.
150+
if [[ -n "${ALIEN_PROC_ID}" && -n "${JALIEN_WSPORT}" ]]; then
151+
${O2DPG_ROOT}/GRID/utils/getReproducerScript.sh ${ALIEN_PROC_ID}
152+
fi
153+
147154
# also for this keep a real default
148155
NWORKERS=${NWORKERS:-8}
149156
# set a default seed if not given
@@ -370,7 +377,7 @@ fi
370377
# full logs tar-ed for output, regardless the error code or validation - to catch also QC logs...
371378
#
372379
if [[ -n "$ALIEN_PROC_ID" ]]; then
373-
find ./ \( -name "*.log*" -o -name "*mergerlog*" -o -name "*serverlog*" -o -name "*workerlog*" -o -name "pythia8.cfg" \) | tar -czvf debug_log_archive.tgz -T -
380+
find ./ \( -name "*.log*" -o -name "*mergerlog*" -o -name "*serverlog*" -o -name "*workerlog*" -o -name "pythia8.cfg" -o -name "reproducer*.sh" \) | tar -czvf debug_log_archive.tgz -T -
374381
if [[ "$ALIEN_JDL_CREATE_TAR_IN_MC" == "1" ]]; then
375382
find ./ \( -name "*.log*" -o -name "*mergerlog*" -o -name "*serverlog*" -o -name "*workerlog*" -o -name "*.root" \) | tar -czvf debug_full_archive.tgz -T -
376383
fi

0 commit comments

Comments
 (0)