Skip to content

Commit bfbfa30

Browse files
Add qws support for TSUBAME4 Pegasus and Sirius
Signed-off-by: Yoshifumi Nakamura <nakamura@riken.jp>
1 parent ca8277c commit bfbfa30

8 files changed

Lines changed: 115 additions & 20 deletions

File tree

config/queue.csv

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@ NQSV_AOBA_B,qsub,"-Z -v http_proxy,https_proxy,HTTP_PROXY,HTTPS_PROXY -q ${queue
1111
PJM_WISTERIA_O,pjsub,"-g jh260034o -L rscgrp=${queue_group},elapse=${elapse},node=${nodes} --mpi proc=${proc} --omp thread=${nthreads}"
1212
PJM_WISTERIA_A,pjsub,"-g jh260034a -L rscgrp=${queue_group},elapse=${elapse},node=${nodes} --mpi proc=${proc} --omp thread=${nthreads}"
1313
PBS_TSUKUBA,qsub,"-q ${queue_group} -l select=${nodes}:mpiprocs=${numproc_node}:ompthreads=${nthreads} -l walltime=${elapse}"
14-
AGE_TSUBAME4,qsub,"-l ${queue_group}=${nodes} -l h_rt=${elapse}"
14+
PBS_PEGASUS,qsub,"-q ${queue_group} -A CNTBENCH -l elapstim_req=${elapse} -v OMP_NUM_THREADS=${nthreads}"
15+
PBS_SIRIUS,qsub,"-q ${queue_group} -A CNTBENCH -W group_list=CNTBENCH -l select=${nodes}:ncpus=24:mem=124gb:ngpus=1 -l walltime=${elapse}"
16+
AGE_TSUBAME4,qsub,"-g jh260034 -l ${queue_group}=${nodes} -l h_rt=${elapse}"
1517
SLURM_CAMPHOR3,sbatch,"-p ${queue_group} -t ${elapse} --rsc p=${proc}:t=${nthreads}:c=${nthreads}:m=1G"
1618
NQSV_OSAKA_CPU,qsub,"-q ${queue_group} -b ${nodes} -l elapstim_req=${elapse},cpunum_job=${nthreads}"
1719
NQSV_OSAKA_GPU,qsub,"-q ${queue_group} -b ${nodes} -l elapstim_req=${elapse},cpunum_job=${nthreads},gpunum_job=${gpu_per_node}"

config/system.csv

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ AOBA_B,cross,aoba_ab_login,aoba_ab_jacamar,NQSV_AOBA_B,lx
1919
AOBA_S,cross,aoba_s_login,aoba_s_jacamar,NQSV_AOBA_VE,sxs
2020
Odyssey,cross,wisteria_login,wisteria-o_jacamar,PJM_WISTERIA_O,short-o
2121
Aquarius,cross,wisteria_login,wisteria-a_jacamar,PJM_WISTERIA_A,short-a
22-
Pegasus,cross,pegasus_login,pegasus_jacamar,PBS_TSUKUBA,regular
23-
Sirius,cross,sirius_login,sirius_jacamar,PBS_TSUKUBA,regular
22+
Pegasus,cross,pegasus_login,pegasus_jacamar,PBS_PEGASUS,gpu
23+
Sirius,cross,sirius_login,sirius_jacamar,PBS_SIRIUS,mcrp
2424
TSUBAME4,cross,tsubame4_login,tsubame4_jacamar,AGE_TSUBAME4,node_f
2525
Camphor3,cross,camphor3_login,camphor3_jacamar,SLURM_CAMPHOR3,jha
2626
SQUID_CPU,cross,squid_login,squid_jacamar,NQSV_OSAKA_CPU,SQUID

docs/guides/add-site.md

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,8 @@ ARM64 ログインノードでは `--arch arm64` を指定します。
109109
- `--no-systemd` / `--no-start`
110110
- systemd user service を作らない、または作るだけで起動しない場合に使います
111111

112+
Jacamar-CI のビルドは、ログインノードのプロセス数・メモリ制限に当たりにくいよう、既定で `make -j1`、`GOMAXPROCS=1`、`GOFLAGS="-p=1 -gcflags=all=-dwarf=false"` を使います。余裕のある環境では `JACAMAR_BUILD_MAKE_JOBS`、`JACAMAR_BUILD_GOMAXPROCS`、`JACAMAR_BUILD_GOFLAGS` で上書きできます。
113+
112114
このスクリプトは `config.toml` の `environment` に `PATH=$BASE_DIR/bin:...` を登録時点で入れるため、アーティファクト保存時に `gitlab-runner` が見つからない問題も避けられます。以下の手動手順は、スクリプトが失敗した場合の切り分けや、サイト固有に調整したい場合の参照として使ってください。
113115

114116
---
@@ -144,6 +146,7 @@ $BASE_DIR/
144146
├── custom-config.toml # Jacamar 設定ファイル
145147
├── config.sh # カスタムランナー: config
146148
├── prepare.sh # カスタムランナー: prepare
149+
├── runner-env.sh # カスタムランナー: 共通環境初期化
147150
├── run.sh # カスタムランナー: run
148151
└── cleanup.sh # カスタムランナー: cleanup
149152
```
@@ -202,8 +205,10 @@ cd jacamar-ci
202205
export CC=gcc
203206
export CXX=g++
204207
export CGO_ENABLED=1
208+
export GOMAXPROCS=1
209+
export GOFLAGS="-p=1 -gcflags=all=-dwarf=false"
205210

206-
make build
211+
make -j1 build
207212
make install PREFIX="$BASE_DIR"
208213

209214
# 後片付け
@@ -233,7 +238,9 @@ git clone https://gitlab.com/ecp-ci/jacamar-ci.git
233238
cp tools.go jacamar-ci/internal/executors/pbs/
234239

235240
cd jacamar-ci
236-
make build
241+
export GOMAXPROCS=1
242+
export GOFLAGS="-p=1 -gcflags=all=-dwarf=false"
243+
make -j1 build
237244
make install PREFIX="$BASE_DIR"
238245
```
239246

@@ -290,7 +297,9 @@ export CPATH="${SEC_PREFIX}/include:${CPATH:-}"
290297
git clone https://gitlab.com/ecp-ci/jacamar-ci.git
291298
cd jacamar-ci
292299
export CC=gcc CXX=g++ CGO_ENABLED=1
293-
make build
300+
export GOMAXPROCS=1
301+
export GOFLAGS="-p=1 -gcflags=all=-dwarf=false"
302+
make -j1 build
294303
make install PREFIX="${WORKDIR}"
295304

296305
# --- 5. 後片付け ---
@@ -347,10 +356,19 @@ set -euo pipefail
347356
exit 0
348357
```
349358

359+
### `runner-env.sh` - 共通環境初期化
360+
361+
`run.sh` から source される共通環境初期化ファイルです。非対話 shell でも site の module catalog やユーザの基本環境が見えるように、`/etc/profile`、`/etc/bashrc`、module 初期化ファイル、`~/.bashrc` を順に読みます。アプリごとの `build.sh` / `run.sh` は、原則として site の shell 初期化そのものではなく、必要な `module load` と実行コマンドだけを持ちます。
362+
350363
### `run.sh` - ジョブ実行
351364
```bash
352365
#!/usr/bin/env bash
353-
source ~/.bashrc
366+
RUNNER_ENV="${CUSTOM_DIR:-/path/to/gitlab-runner_jacamar-ci_amd}/runner-env.sh"
367+
if [[ -r "${RUNNER_ENV}" ]]; then
368+
source "${RUNNER_ENV}"
369+
elif [[ -r "${HOME}/.bashrc" ]]; then
370+
source "${HOME}/.bashrc"
371+
fi
354372
set -eo pipefail
355373
exec "$@"
356374
```

programs/qws/build.sh

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,20 @@ case "$system" in
8989
source /work/opt/local/x86_64/cores/intel/2023.0.0/mpi/latest/env/vars.sh
9090
make compiler=intel arch=skylake rdma= -j8
9191
;;
92+
Pegasus)
93+
module load intel/2025.3.1 intmpi/2025.3.1
94+
make compiler=intel arch=skylake mpi=1 omp=1 rdma=
95+
;;
96+
Sirius)
97+
module load aocc/5.0.0 openmpi/5.0.10/aocc5.0.0
98+
make -j4 compiler=aocc arch=zen4 rdma= mpi=1 omp=1 profiler=timing \
99+
AMD_MARCH=-march=znver4 cppflags="-DARCH_AVX512" main
100+
;;
92101
TSUBAME4)
93-
make -j 8 fugaku_benchmark= omp=1 compiler=openmpi-gnu arch=skylake rdma= mpi=1 powerapi= CC=mpicc CXX=mpic++
102+
module load openmpi/5.0.10-gcc aocc/4.1.0
103+
export OMPI_CC=clang OMPI_CXX=clang++ OMPI_FC=flang
104+
make -j4 compiler=aocc arch=zen4 rdma= mpi=1 omp=1 profiler=timing \
105+
AMD_MARCH=-march=znver4 cppflags="-DARCH_AVX512" main
94106
;;
95107
Camphor3)
96108
camphor3_modulepath="${MODULEPATH:-}"

programs/qws/list.csv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ AOBA_S,yes,1,1,8,0:10:00
2020
AOBA_B,yes,1,1,128,0:10:00
2121
Odyssey,yes,1,1,12,0:10:00
2222
Aquarius,yes,1,1,8,0:10:00
23+
Pegasus,yes,1,1,96,00:10:00
24+
Sirius,yes,1,1,24,0:10:00
2325
TSUBAME4,yes,1,1,192,0:10:00
2426
Camphor3,yes,1,1,112,0:10:00
2527
FNCX,yes,1,1,1,0:10:00

programs/qws/run.sh

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,8 +171,22 @@ case "$system" in
171171
mpiexec -n 1 ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0
172172
print_results CASE0 CASE0 1 >> ../results/result
173173
;;
174+
Pegasus)
175+
qws_numproc=$((nodes * numproc_node))
176+
module load intel/2025.3.1 intmpi/2025.3.1
177+
mpirun -n ${qws_numproc} ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0
178+
print_results CASE0 CASE0 ${numproc_node} >> ../results/result
179+
;;
180+
Sirius)
181+
qws_numproc=$((nodes * numproc_node))
182+
module load aocc/5.0.0 openmpi/5.0.10/aocc5.0.0
183+
mpirun -n ${qws_numproc} ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0
184+
print_results CASE0 CASE0 ${numproc_node} >> ../results/result
185+
;;
174186
TSUBAME4)
175187
qws_numproc=$((nodes * numproc_node))
188+
module load openmpi/5.0.10-gcc aocc/4.1.0
189+
export OMPI_CC=clang OMPI_CXX=clang++ OMPI_FC=flang
176190
mpirun -n ${qws_numproc} ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0
177191
print_results CASE0 CASE0 ${numproc_node} >> ../results/result
178192
;;

scripts/setup_site_runner.sh

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ install_systemd=1
2020
start_service=1
2121
libseccomp_mode="auto"
2222
jacamar_pbs_tools=""
23+
jacamar_make_jobs="${JACAMAR_BUILD_MAKE_JOBS:-1}"
24+
jacamar_gomaxprocs="${JACAMAR_BUILD_GOMAXPROCS:-1}"
25+
jacamar_goflags="${JACAMAR_BUILD_GOFLAGS:--p=1 -gcflags=all=-dwarf=false}"
2326
unrestricted_cmd_line=false
2427
runner_proxy=""
2528
runner_no_proxy=""
@@ -65,6 +68,12 @@ Options:
6568
--no-start Create and enable service, but do not start it.
6669
-h, --help Show this help.
6770
71+
Environment overrides:
72+
JACAMAR_BUILD_MAKE_JOBS Jacamar build make parallelism. Default: 1.
73+
JACAMAR_BUILD_GOMAXPROCS Jacamar build Go scheduler threads. Default: 1.
74+
JACAMAR_BUILD_GOFLAGS Jacamar build Go flags.
75+
Default: -p=1 -gcflags=all=-dwarf=false.
76+
6877
Example:
6978
curl -fsSL https://raw.githubusercontent.com/RIKEN-RCCS/benchkit/main/scripts/setup_site_runner.sh \
7079
| bash -s -- --arch amd64 --site genkai \
@@ -297,7 +306,10 @@ if [[ ! -x "$jacamar_bin" ]]; then
297306
(
298307
cd "${work_dir}/jacamar-ci"
299308
export CC=gcc CXX=g++ CGO_ENABLED=1
300-
make build
309+
export GOMAXPROCS="${GOMAXPROCS:-$jacamar_gomaxprocs}"
310+
export GOFLAGS="${GOFLAGS:-$jacamar_goflags}"
311+
info "Using Jacamar build limits: make -j${jacamar_make_jobs}, GOMAXPROCS=${GOMAXPROCS}, GOFLAGS=${GOFLAGS}"
312+
make -j"$jacamar_make_jobs" build
301313
make install PREFIX="$base_dir"
302314
)
303315
else
@@ -349,11 +361,40 @@ set -euo pipefail
349361
exit 0
350362
EOF
351363

352-
cat > "${base_dir}/run.sh" <<'EOF'
364+
cat > "${base_dir}/runner-env.sh" <<'EOF'
353365
#!/usr/bin/env bash
354-
source ~/.bashrc
366+
367+
# Source the file named by $1 into the current shell when it is readable;
# do nothing when it is missing or unreadable.
source_if_readable() {
368+
local file="$1"
369+
if [[ -r "$file" ]]; then
370+
# Best-effort: a non-zero status from the sourced file is ignored.
# shellcheck disable=SC1090
371+
source "$file" || true
372+
fi
373+
}
374+
375+
source_if_readable /etc/profile
376+
source_if_readable /etc/bashrc
377+
378+
if ! type module >/dev/null 2>&1; then
379+
source_if_readable /etc/profile.d/modules.sh
380+
source_if_readable /etc/profile.d/z00_lmod.sh
381+
fi
382+
383+
source_if_readable "${HOME}/.bashrc"
384+
385+
unset -f source_if_readable
386+
EOF
387+
388+
cat > "${base_dir}/run.sh" <<EOF
389+
#!/usr/bin/env bash
390+
RUNNER_ENV="\${CUSTOM_DIR:-${base_dir}}/runner-env.sh"
391+
if [[ -r "\${RUNNER_ENV}" ]]; then
392+
source "\${RUNNER_ENV}"
393+
elif [[ -r "\${HOME}/.bashrc" ]]; then
394+
source "\${HOME}/.bashrc"
395+
fi
355396
set -eo pipefail
356-
exec "$@"
397+
exec "\$@"
357398
EOF
358399

359400
cat > "${base_dir}/cleanup.sh" <<EOF
@@ -378,7 +419,7 @@ esac
378419
echo "CLEANUP DONE at \$(date)" >> "\$LOGFILE"
379420
EOF
380421

381-
chmod +x "${base_dir}/config.sh" "${base_dir}/prepare.sh" "${base_dir}/run.sh" "${base_dir}/cleanup.sh"
422+
chmod +x "${base_dir}/config.sh" "${base_dir}/prepare.sh" "${base_dir}/runner-env.sh" "${base_dir}/run.sh" "${base_dir}/cleanup.sh"
382423

383424
info "Writing Jacamar config"
384425
cat > "${base_dir}/custom-config.toml" <<EOF

scripts/test_submit.sh

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -142,17 +142,23 @@ case "$system" in
142142
--mpi proc=$proc --omp thread=$nthreads \
143143
script.sh
144144
;;
145-
Pegasus|Sirius)
146-
echo qsub -q $queue_group \
147-
-l select=${nodes}:mpiprocs=${numproc_node}:ompthreads=${nthreads} \
145+
Pegasus)
146+
echo qsub -q $queue_group -A CNTBENCH \
147+
-l elapstim_req=${elapse} -v OMP_NUM_THREADS=${nthreads} script.sh
148+
qsub -q $queue_group -A CNTBENCH \
149+
-l elapstim_req=${elapse} -v OMP_NUM_THREADS=${nthreads} script.sh
150+
;;
151+
Sirius)
152+
echo qsub -q $queue_group -A CNTBENCH -W group_list=CNTBENCH \
153+
-l select=${nodes}:ncpus=24:mem=124gb:ngpus=1 \
148154
-l walltime=${elapse} script.sh
149-
qsub -q $queue_group \
150-
-l select=${nodes}:mpiprocs=${numproc_node}:ompthreads=${nthreads} \
155+
qsub -q $queue_group -A CNTBENCH -W group_list=CNTBENCH \
156+
-l select=${nodes}:ncpus=24:mem=124gb:ngpus=1 \
151157
-l walltime=${elapse} script.sh
152158
;;
153159
TSUBAME4)
154-
echo qsub -l ${queue_group}=${nodes} -l h_rt=${elapse} script.sh
155-
qsub -l ${queue_group}=${nodes} -l h_rt=${elapse} script.sh
160+
echo qsub -g jh260034 -l ${queue_group}=${nodes} -l h_rt=${elapse} script.sh
161+
qsub -g jh260034 -l ${queue_group}=${nodes} -l h_rt=${elapse} script.sh
156162
;;
157163
Camphor3)
158164
proc=$((nodes * numproc_node))

0 commit comments

Comments
 (0)