Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions GRID/utils/getReproducerScript.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/bin/bash
#
# Fetch the job script of a GRID job from MonALISA and turn it into a
# reproducer script that relaunches itself inside an Apptainer container.
#
# Usage:   getReproducerScript.sh <ALIEN_PID>
# Output:  reproducer_script_<ALIEN_PID>.sh (executable) in the current directory
# Needs:   a token certificate/key pair, taken from JALIEN_TOKEN_CERT and
#          JALIEN_TOKEN_KEY when both name existing files, otherwise from
#          ${TMPDIR:-/tmp}/tokencert_<uid>.pem and tokenkey_<uid>.pem
# Exit:    0 on success; 1 on a bad argument, a missing token, a failed
#          download or a failed patching step
ALIEN_PID=$1

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# The PID is interpolated into a URL, file names and sed expressions below,
# so only a plain number is accepted.
[[ "${ALIEN_PID}" =~ ^[0-9]+$ ]] \
  || die "Usage: ${0##*/} <ALIEN_PID>  (numeric job id)"

# Locate the token pair. The environment variables are only trusted when they
# point at existing files; any other value falls back to the per-user token
# files in the temporary directory.
TOKENCERT=""
TOKENKEY=""
if [[ -f "${JALIEN_TOKEN_CERT:-}" && -f "${JALIEN_TOKEN_KEY:-}" ]]; then
  TOKENCERT=${JALIEN_TOKEN_CERT}
  TOKENKEY=${JALIEN_TOKEN_KEY}
else
  token_dir=${TMPDIR:-/tmp}
  user_id=$(id -u)
  if [[ -f "${token_dir}/tokencert_${user_id}.pem" ]]; then
    TOKENCERT="${token_dir}/tokencert_${user_id}.pem"
  fi
  if [[ -f "${token_dir}/tokenkey_${user_id}.pem" ]]; then
    TOKENKEY="${token_dir}/tokenkey_${user_id}.pem"
  fi
fi

[[ -n "${TOKENCERT}" && -n "${TOKENKEY}" ]] \
  || die "This needs a tokencert and tokenkey file in the tmp folder"

SCRIPT=reproducer_script_${ALIEN_PID}.sh

# talk to MonaLisa to fetch the script provided by Costin
# NOTE(review): --insecure disables TLS server verification; kept because the
# original relies on it - confirm whether the proper CA bundle can be used.
# --fail makes HTTP errors fatal instead of saving the error page as a script.
curl "https://alimonitor.cern.ch/users/jobenv.jsp?pid=${ALIEN_PID}" \
  -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36' \
  --fail --insecure --cert "${TOKENCERT}" --key "${TOKENKEY}" -o "${SCRIPT}" \
  || die "Download of the job script for PID ${ALIEN_PID} failed"

[[ -s "${SCRIPT}" ]] || die "Downloaded job script ${SCRIPT} is empty"

# Define the Apptainer injection block which makes sure
# that the job script is automatically executed in apptainer.
# The text is single-quoted on purpose: every expansion in it is evaluated
# when the reproducer runs, not here. #ALIEN_PID# is a placeholder that is
# substituted further below.
INJECTION='
export ALIEN_PID=#ALIEN_PID#
# Check if the script is running inside an Apptainer (Singularity) container
if [ -z "$APPTAINER_NAME" ] && [ -z "$SINGULARITY_NAME" ]; then
  # Relaunch this script inside the container

  export WORKDIR=/tmp/foo-${ALIEN_PID}
  # - create the workdir and the directory that is mounted as /tmp in the container
  #   (-p keeps repeated runs from failing on already existing directories)
  mkdir -p "${WORKDIR}/tmp" || exit 1

  # - copy the certificate token into /tmp/ inside the container
  cp "${TMPDIR:-/tmp}"/token*pem "${WORKDIR}/tmp"

  # - copy the job script into workdir
  cp "$0" "${WORKDIR}"

  # detect architecture (ARM or X86); uname -m is used because uname -i
  # reports "unknown" on many distributions
  ARCH=$(uname -m)
  if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "x86_64" ]; then
    echo "Detected hardware architecture : $ARCH"
  else
    echo "Invalid architecture ${ARCH} detected. Exiting"
    exit 1
  fi
  if [ "$ARCH" = "aarch64" ]; then
    ISAARCH64="1"
  fi

  CONTAINER="/cvmfs/alice.cern.ch/containers/fs/apptainer/compat_el9-${ARCH}"
  APPTAINER_EXEC="/cvmfs/alice.cern.ch/containers/bin/apptainer/${ARCH}/current/bin/apptainer"

  # we can actually analyse the local JDL to find the package and set it up for the container
  # The copy made above is addressed through its in-container path, so the
  # relaunch works no matter how this script was invoked on the host.
  ${APPTAINER_EXEC} exec -C -B /cvmfs:/cvmfs,${WORKDIR}:/workdir,${WORKDIR}/tmp:/tmp --pwd /workdir ${CONTAINER} "/workdir/${0##*/}"
  exit $?
fi
'

# Inject the block after the first line (shebang).
# The text is handed to awk through the environment (not -v) so that awk does
# not apply escape-sequence processing to it; the intermediate file gets a
# unique name next to the target and is removed on every exit path.
tmp_script=$(mktemp "${SCRIPT}.XXXXXX") || die "Cannot create a temporary file"
trap 'rm -f -- "${tmp_script}"' EXIT

INJECTION="${INJECTION}" awk 'NR==1 {print; print ENVIRON["INJECTION"]; next} 1' "${SCRIPT}" > "${tmp_script}" \
  && mv -- "${tmp_script}" "${SCRIPT}" \
  || die "Cannot inject the container block into ${SCRIPT}"

# take out sandboxing structure, then replace the PID placeholder
sed -i \
  -e "/echo \"Create a fresh sandbox at every attempt of running the job: alien-job-$ALIEN_PID\"/d" \
  -e "/rm -rf alien-job-$ALIEN_PID/d" \
  -e "/mkdir -p alien-job-$ALIEN_PID\/tmp/d" \
  -e "/cd alien-job-$ALIEN_PID/d" \
  -e "s/#ALIEN_PID#/${ALIEN_PID}/g" \
  "${SCRIPT}" \
  || die "Cannot patch ${SCRIPT}"

chmod +x "${SCRIPT}"
26 changes: 22 additions & 4 deletions GRID/utils/grid_submit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ EOF
[ $ERROROUTPUTSPEC ] && echo "OutputErrorE = {"${ERROROUTPUTSPEC}"};" >> "${MY_JOBNAMEDATE}.jdl" # add error output files
[ $IMAGESPEC ] && echo "DebugTag = {\"${IMAGESPEC}\"};" >> "${MY_JOBNAMEDATE}.jdl" # use special singularity image to run job
# echo "Requirements = {"${REQUIREMENTSSPEC}"} >> "${MY_JOBNAMEDATE}.jdl"
[ $REQUIRESPEC ] && echo "Requirements = ${REQUIRESPEC}" >> "${MY_JOBNAMEDATE}.jdl"
[ "$REQUIRESPEC" ] && echo "Requirements = ${REQUIRESPEC}" >> "${MY_JOBNAMEDATE}.jdl"

# "output_arch.zip:output/*@disk=2",
# "checkpoint*.tar@disk=2"
Expand All @@ -383,6 +383,8 @@ EOF
(
# assemble all GRID interaction in a single script / transaction
[ -f "${command_file}" ] && rm ${command_file}
echo "user ${MY_USER}" >> ${command_file}
echo "whoami" >> ${command_file}
[ ! "${CONTINUE_WORKDIR}" ] && echo "rmdir ${MY_JOBWORKDIR}" >> ${command_file} # remove existing job dir
# echo "mkdir ${MY_BINDIR}" >> ${command_file} # create bindir
echo "mkdir ${MY_JOBPREFIX}" >> ${command_file} # create job output prefix
Expand Down Expand Up @@ -434,7 +436,7 @@ EOF
continue
fi
let counter=0 # reset counter
JOBSTATUS=$(alien.py ps -j ${MY_JOBID} | awk '//{print $4}')
JOBSTATUS=$(alien.py ps -j ${MY_JOBID} | awk '//{print $3}')
# echo -ne "Waiting for jobs to return; Last status ${JOBSTATUS}"

if [ "${JOBSTATUS}" == "D" ]; then
Expand Down Expand Up @@ -489,7 +491,24 @@ if [[ ${SINGULARITY} ]]; then
# it's actually much like the GRID mode --> which is why we set JALIEN_TOKEN_CERT
set -x
cp $0 ${WORKDIR}
singularity exec -C -B /cvmfs:/cvmfs,${WORKDIR}:/workdir --env JALIEN_TOKEN_CERT="foo" --pwd /workdir /cvmfs/alice.cern.ch/containers/fs/singularity/centos7 $0 \

# detect architecture (ARM or X86)
ARCH=$(uname -i)
if [ "$ARCH" == "aarch64" ] || [ "$ARCH" == "x86_64" ]; then
echo "Detected hardware architecture : $ARCH"
else
echo "Invalid architecture ${ARCH} detected. Exiting"
exit 1
fi
if [ "$ARCH" == "aarch64" ]; then
ISAARCH64="1"
fi

CONTAINER="/cvmfs/alice.cern.ch/containers/fs/apptainer/compat_el9-${ARCH}"
APPTAINER_EXEC="/cvmfs/alice.cern.ch/containers/bin/apptainer/${ARCH}/current/bin/apptainer"

# we can actually analyse the local JDL to find the package and set it up for the container
${APPTAINER_EXEC} exec -C -B /cvmfs:/cvmfs,${WORKDIR}:/workdir --pwd /workdir -C ${CONTAINER} /workdir/grid_submit.sh \
${CONTINUE_WORKDIR:+"-c ${CONTINUE_WORKDIR}"} --local ${O2TAG:+--o2tag ${O2TAG}} --ttl ${JOBTTL} --label ${JOBLABEL:-label} ${MATTERMOSTHOOK:+--mattermost ${MATTERMOSTHOOK}} ${CONTROLSERVER:+--controlserver ${CONTROLSERVER}}
set +x
exit $?
Expand All @@ -515,7 +534,6 @@ banner "Limits"
ulimit -a

banner "OS detection"
lsb_release -a || true
cat /etc/os-release || true
cat /etc/redhat-release || true

Expand Down
9 changes: 8 additions & 1 deletion MC/run/ANCHOR/anchorMC.sh
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,13 @@ fi
[ -z "${CYCLE}" ] && { echo_error "Set CYCLE" ; exit 1 ; }
[ -z "${PRODSPLIT}" ] && { echo_error "Set PRODSPLIT" ; exit 1 ; }


# this generates an exact reproducer script for this job
# that can be used locally for debugging etc.
if [[ -n "${ALIEN_PROC_ID}" && -n "${JALIEN_WSPORT}" ]]; then
${O2DPG_ROOT}/GRID/utils/getReproducerScript.sh ${ALIEN_PROC_ID}
fi

# also for this keep a real default
NWORKERS=${NWORKERS:-8}
# set a default seed if not given
Expand Down Expand Up @@ -370,7 +377,7 @@ fi
# full logs tar-ed for output, regardless the error code or validation - to catch also QC logs...
#
if [[ -n "$ALIEN_PROC_ID" ]]; then
find ./ \( -name "*.log*" -o -name "*mergerlog*" -o -name "*serverlog*" -o -name "*workerlog*" -o -name "pythia8.cfg" \) | tar -czvf debug_log_archive.tgz -T -
find ./ \( -name "*.log*" -o -name "*mergerlog*" -o -name "*serverlog*" -o -name "*workerlog*" -o -name "pythia8.cfg" -o -name "reproducer*.sh" \) | tar -czvf debug_log_archive.tgz -T -
if [[ "$ALIEN_JDL_CREATE_TAR_IN_MC" == "1" ]]; then
find ./ \( -name "*.log*" -o -name "*mergerlog*" -o -name "*serverlog*" -o -name "*workerlog*" -o -name "*.root" \) | tar -czvf debug_full_archive.tgz -T -
fi
Expand Down