Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions superbench/benchmarks/model_benchmarks/pytorch_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -591,8 +591,8 @@ def _benchmark(self):
Run the benchmark then handle post-run model log save/compare.
Set SB_ENABLE_PYTORCH_PROFILER='1' to enable profiling.
"""
# Check if this is a Nvidia GPU
if not (torch.cuda.is_available() and torch.version.cuda is not None):
# Without an NVIDIA (CUDA) or AMD (ROCm/HIP) GPU, just run the benchmark and the post-run model log step
if not (torch.cuda.is_available() and (torch.version.cuda is not None or torch.version.hip is not None)):
Comment thread
shcho marked this conversation as resolved.
ok = super()._benchmark()
self._post_run_model_log()
return ok
Expand Down
58 changes: 45 additions & 13 deletions superbench/runner/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import sys
import json
import random
import shlex
import signal
from pathlib import Path
from pprint import pformat
Expand Down Expand Up @@ -135,12 +136,25 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
enable_nsys = os.environ.get('SB_ENABLE_NSYS', '') == '1'
trace_dir = os.environ.get('SB_NSYS_TRACE_DIR', self._sb_output_dir)

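# Enable rocprofv2 profiling when SB_ENABLE_ROCPROF='1'; traces are written to
# SB_ROCPROF_TRACE_DIR (defaults to the output directory). nsys takes precedence if both are enabled.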
enable_rocprof = os.environ.get('SB_ENABLE_ROCPROF', '') == '1'
rocprof_trace_dir = os.environ.get('SB_ROCPROF_TRACE_DIR', self._sb_output_dir)
Comment thread
shcho marked this conversation as resolved.

mode_command = exec_command
if mode.name == 'local':
trace_command = (
f'nsys profile --output {trace_dir}/{benchmark_name}_{mode.proc_rank}_traces '
f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx '
) if enable_nsys and mode.proc_rank == 0 else ''
trace_command = ''
if enable_nsys and mode.proc_rank == 0:
trace_output = shlex.quote(f'{trace_dir}/{benchmark_name}_{mode.proc_rank}_traces')
trace_command = (
f'nsys profile --output {trace_output} '
f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx '
)
elif enable_rocprof and mode.proc_rank == 0:
trace_output = shlex.quote(f'{rocprof_trace_dir}/{benchmark_name}_{mode.proc_rank}_traces')
trace_command = (
f'rocprofv2 --hip-trace --kernel-trace --plugin json '
f'-d {trace_output} '
)
Comment thread
shcho marked this conversation as resolved.
Comment on lines +152 to +157
Comment on lines +145 to +157
# Build the command parts, only including trace if it's not empty
command_parts = []
prefix = mode.prefix.format(proc_rank=mode.proc_rank, proc_num=mode.proc_num)
Expand All @@ -159,23 +173,41 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
'--nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
)

nsys_prefix = (
f'nsys profile --output {trace_dir}/{benchmark_name}_traces '
f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx '
) if enable_nsys else ''
trace_prefix = ''
if enable_nsys:
trace_output = shlex.quote(f'{trace_dir}/{benchmark_name}_traces')
trace_prefix = (
f'nsys profile --output {trace_output} '
f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx '
)
elif enable_rocprof:
trace_output = shlex.quote(f'{rocprof_trace_dir}/{benchmark_name}_traces')
trace_prefix = (
f'rocprofv2 --hip-trace --kernel-trace --plugin json '
f'-d {trace_output} '
)
Comment on lines +176 to +188

mode_command = (
f'{nsys_prefix}'
f'{trace_prefix}'
f'torchrun'
f' --no_python --nproc_per_node={mode.proc_num} {torch_dist_params}{exec_command}'
f' superbench.benchmarks.{benchmark_name}.parameters.distributed_impl=ddp'
f' superbench.benchmarks.{benchmark_name}.parameters.distributed_backend=nccl'
)
elif mode.name == 'mpi':
trace_command = (
f'nsys profile --output {trace_dir}/{benchmark_name}_{mode.proc_rank}_traces '
f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx '
) if enable_nsys else ''
trace_command = ''
if enable_nsys:
trace_output = shlex.quote(f'{trace_dir}/{benchmark_name}_{mode.proc_rank}_traces')
trace_command = (
f'nsys profile --output {trace_output} '
f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx '
)
elif enable_rocprof:
trace_output = shlex.quote(f'{rocprof_trace_dir}/{benchmark_name}_{mode.proc_rank}_traces')
trace_command = (
f'rocprofv2 --hip-trace --kernel-trace --plugin json '
f'-d {trace_output} '
)
mode_command = (
'{trace} '
'mpirun ' # use default OpenMPI in image
Expand Down
Loading