Skip to content

Commit e3fa976

Browse files
authored
Merge pull request #33 from converged-computing/add-debug-traces-compute-engine-gpu
debug: traces for compute engine
2 parents 93cb53b + 7401973 commit e3fa976

38 files changed

+176313
-358
lines changed

README.md

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,6 @@ This study will test HPC application performance across three clouds. The reposi
1111
- [Amazon Web Services](experiments/aws) includes Parallel Cluster (EC2), and EKS (Kubernetes) for each of CPU and GPU
1212
- [Microsoft Azure](experiments/azure) includes CycleCloud (VMs), and AKS (Kubernetes) for each of CPU and GPU.
1313

14-
## Timing
15-
16-
This is a checklist for the setups we have tested and timed:
17-
1814
## Experiments
1915

2016
### "Bare Metal"
@@ -44,14 +40,14 @@ This is a checklist for the setups we have tested and timed:
4440
- [x] size 64 (vsoch done 8/26/2024)
4541
- [x] size 128 (vsoch done 8/27/2024)
4642
- [x] size 256 (vsoch done 8/27/2024)
47-
- [ ] Google Compute Engine GPU
43+
- [x] Google Compute Engine GPU
4844
- done on llnl-flux
4945
- [x] New VM and automation needed with Terraform (vsoch, early 9/2024)
5046
- [x] size 4 (vsoch 9/6/2024)
5147
- [x] size 8 (vsoch 9/7/2024)
5248
- [x] size 16 (vsoch 9/8/2024)
5349
- [x] size 32 (vsoch 9/8/2024)
54-
- [ ] quicksilver and osu all reduce need runs at all sizes.
50+
- [x] quicksilver and osu all reduce need runs at all sizes (vsoch 9/9/2024)
5551

5652
### Kubernetes
5753

experiments/google/compute-engine/gpu/build-images/startup-script.sh

Lines changed: 63 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -104,20 +104,30 @@ cd flux-sched
104104
./configure --prefix=/usr --sysconfdir=/etc
105105
make -j 8 && sudo make install && sudo ldconfig
106106

107-
# install openmpi with cuda
107+
# Note that UCX (and a rebuild of open mpi) was done after to get OSU/quicksilver working
108+
109+
cd /opt
110+
sudo git clone https://github.com/openucx/ucx && \
111+
sudo chown -R $USER ./ucx && cd ucx/ && \
112+
git clean -xfd && \
113+
./autogen.sh && mkdir build && cd build && \
114+
../configure --prefix=/usr --enable-debug --with-cuda=/usr/local/cuda --enable-mt --disable-cma && \
115+
make -j && sudo make install
116+
117+
# If already existed - remove
118+
sudo rm -rf /usr/local/pancakes/
119+
120+
# install openmpi with cuda and ucx
108121
cd /opt
109122
sudo mkdir -p /usr/local/pancakes && \
110-
sudo wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.2.tar.gz && \
123+
sudo wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.2.tar.gz || true && \
111124
sudo tar -xzvf openmpi-4.1.2.tar.gz && \
112125
cd openmpi-4.1.2 && \
113126
sudo chown -R $USER $(pwd) && \
114-
./configure --with-cuda --prefix=/usr/local/pancakes && \
115-
make -j 20 && sudo make install
116-
117-
# TODO check these, should be provided in flux environment later
118-
# ENV CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
119-
# ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/pancakes/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
120-
# ENV LD_LIBRARY_PATH=/usr/local/pancakes/lib:/opt/miniconda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
127+
make distclean || true && \
128+
mkdir -p build && cd build  # -p: a leftover build/ from an earlier run must not prevent entering it
129+
../configure --with-cuda=/usr/local/cuda --with-ucx=/usr/ --prefix=/usr/local/pancakes
130+
make -j && sudo make install
121131

122132
cd /opt
123133

@@ -384,3 +394,47 @@ sudo apt-get install -y --no-install-recommends --allow-change-held-packages apt
384394
# Elevate tee, not echo: tee is the process that opens the file under /etc/apt for writing.
echo "deb https://packages.cloud.google.com/apt google-fast-socket main" | sudo tee /etc/apt/sources.list.d/google-fast-socket.list
385395
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add
386396
sudo apt update && sudo apt install -y --no-install-recommends google-fast-socket=0.0.5
397+
398+
399+
# Install additional apps for bare metal, osu and quicksilver and multi-gpu-models
400+
# NOTE(review): the numbered entries below look like pasted `history` output. As written,
# the shell would try to run commands named "54", "55", ... and fail on each one, so they
# are kept only as a commented-out record of the manual steps. The scripted OSU build
# follows further down.
# 54 make -j
401+
# 55 make install
402+
# 56 cd /opt/osu-benchmark/
403+
# 57 rm -rf build.openmpi/
404+
# 58 export OSU_VERSION=5.8
405+
# 59 mkdir -p build.openmpi && cd build.openmpi && ../src/osu-micro-benchmarks-${OSU_VERSION}/configure CC=mpicc CXX=mpicxx CFLAGS=-I$(pwd)/../src/osu-micro-benchmarks-${OSU_VERSION}/util --prefix=$(pwd) --enable-cuda --with-cuda=/usr/local/cuda && make && make install
406+
# 60 export PATH=/usr/local/pancakes/bin:$PATH
407+
# 61 make && make install
408+
# 62 cd /opt/containers/
409+
# 63 cd /root
410+
411+
# OSU benchmarks
412+
sudo git clone --depth 1 https://github.com/ULHPC/tutorials /opt/tutorials && \
413+
sudo mkdir -p /opt/osu-benchmark && \
414+
sudo chown -R $USER /opt/tutorials /opt/osu-benchmark && \
415+
cd /opt/osu-benchmark && \
416+
ln -s /opt/tutorials/parallel/mpi/OSU_MicroBenchmarks ref.d && \
417+
ln -s ref.d/Makefile . && \
418+
ln -s ref.d/scripts . && \
419+
mkdir src && \
420+
cd src && \
421+
export OSU_VERSION=5.8 && \
422+
wget --no-check-certificate http://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-${OSU_VERSION}.tgz && \
423+
tar xf osu-micro-benchmarks-${OSU_VERSION}.tgz && \
424+
cd /opt/osu-benchmark && \
425+
# Compile based on openmpi with cuda/ucx
426+
mkdir -p build.openmpi && cd build.openmpi && \
427+
../src/osu-micro-benchmarks-${OSU_VERSION}/configure CC=mpicc CXX=mpicxx CFLAGS=-I$(pwd)/../src/osu-micro-benchmarks-${OSU_VERSION}/util --prefix=$(pwd) --enable-cuda --with-cuda=/usr/local/cuda && \
428+
make && make install
429+
430+
# Quicksilver
431+
# Clone to an absolute path: the following chown and cd both expect /opt/quicksilver,
# and the working directory here is whatever the OSU build step left behind.
sudo git clone https://github.com/LLNL/Quicksilver /opt/quicksilver
432+
sudo chown -R $USER /opt/quicksilver
433+
# Save the Makefile into the directory where make is run below; -O overwrites an existing
# file instead of writing a numbered copy beside it.
# NOTE(review): assumes this Makefile is meant to replace /opt/quicksilver/src/Makefile - confirm.
wget -O /opt/quicksilver/src/Makefile https://raw.githubusercontent.com/converged-computing/performance-study/main/docker/google/gpu/quicksilver/Makefile
434+
cd /opt/quicksilver/src
435+
make || nvcc -DHAVE_CUDA -std=c++11 -O2 -Xptxas -v -gencode=arch=compute_70,code=\"sm_70,compute_70\" --compiler-bindir=/usr/local/pancakes/bin/mpicxx -L/usr/local/cuda/lib64/ -lcuda -lcudart -lm -o qs CollisionEvent.o CoralBenchmark.o CycleTracking.o DecompositionObject.o DirectionCosine.o EnergySpectrum.o GlobalFccGrid.o GridAssignmentObject.o InputBlock.o MCT.o MC_Adjacent_Facet.o MC_Base_Particle.o MC_Domain.o MC_Facet_Crossing_Event.o MC_Fast_Timer.o MC_Load_Particle.o MC_Location.o MC_Particle_Buffer.o MC_RNG_State.o MC_Segment_Outcome.o MC_SourceNow.o MacroscopicCrossSection.o MeshPartition.o MonteCarlo.o MpiCommObject.o NuclearData.o Parameters.o ParticleVault.o ParticleVaultContainer.o PopulationControl.o SendQueue.o SharedMemoryCommObject.o Tallies.o cmdLineParser.o cudaFunctions.o initMC.o main.o parseUtils.o utils.o utilsMpi.o && sudo cp qs /usr/bin/qs
436+
437+
# Multi-gpu-models
438+
sudo git clone https://github.com/NVIDIA/multi-gpu-programming-models /opt/multi-gpu-programming-models && \
439+
# Use the absolute path the repository was cloned to; the current directory is not /opt here.
cd /opt/multi-gpu-programming-models/mpi && \
440+
make && sudo mv jacobi /usr/local/bin
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Debugging
2+
3+
This looks at running the OSU all_reduce benchmark with flux (as the flux user) and with Singularity.
4+
I did this in two ways:
5+
6+
- `strace -f`
7+
- `strace -f -s 128`
8+
9+
Each was run both from outside and from within the container.
10+
11+
## Traces
12+
13+
### [flux-singularity-trace-f.txt](flux-singularity-trace-f.txt)
14+
15+
```bash
16+
strace -f flux run -opmi=pmix --env OMPI_COMM_WORLD_LOCAL_RANK=0 -N 1 -n 8 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif /bin/bash -c "/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda H H" 2> flux-singularity-trace-f.txt
17+
```
18+
19+
### [flux-singularity-ltrace-f.txt](flux-singularity-ltrace-f.txt)
20+
21+
```bash
22+
ltrace -f flux run -opmi=pmix --env OMPI_COMM_WORLD_LOCAL_RANK=0 -N 1 -n 8 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif /bin/bash -c "/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda H H" 2> flux-singularity-ltrace-f.txt
23+
```
24+
25+
### [flux-singularity-trace-s-f.txt](flux-singularity-trace-s-f.txt)
26+
27+
```bash
28+
strace -f -s 128 flux run -opmi=pmix --env OMPI_COMM_WORLD_LOCAL_RANK=0 -N 1 -n 8 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif /bin/bash -c "/opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda H H" 2> flux-singularity-trace-s-f.txt
29+
```
30+
31+
### [flux-singularity-inside-container-trace-f.txt](flux-singularity-inside-container-trace-f.txt)
32+
33+
```bash
34+
flux run -opmi=pmix --env OMPI_COMM_WORLD_LOCAL_RANK=0 -N 1 -n 8 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif /bin/bash -c "strace -f /opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda H H" 2> flux-singularity-inside-container-trace-f.txt
35+
```
36+
37+
### [flux-singularity-inside-container-trace-s-f.txt](flux-singularity-inside-container-trace-s-f.txt)
38+
39+
```bash
40+
flux run -opmi=pmix --env OMPI_COMM_WORLD_LOCAL_RANK=0 -N 1 -n 8 -g 1 -o cpu-affinity=per-task -o gpu-affinity=per-task singularity exec --nv --bind /usr/local/cuda /opt/containers/metric-osu-gpu_google-gpu.sif /bin/bash -c "strace -f -s 128 /opt/osu-benchmark/build.openmpi/mpi/collective/osu_allreduce -d cuda H H" 2> flux-singularity-inside-container-trace-s-f.txt
41+
```

0 commit comments

Comments
 (0)