-
Notifications
You must be signed in to change notification settings - Fork 26
Expand file tree
/
Copy pathLaunchTransferBench.sh
More file actions
executable file
·237 lines (200 loc) · 7.2 KB
/
LaunchTransferBench.sh
File metadata and controls
executable file
·237 lines (200 loc) · 7.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
#!/bin/bash
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# LaunchTransferBench - Multi-rank TransferBench Socket Execution Script
#
# This script simplifies the execution of socket-based multi-rank TransferBench
# by automatically setting up SSH connections to specified hosts and setting
# the appropriate environment variables.
#
# Usage:
# ./LaunchTransferBench.sh <hosts> [env_vars...] [-- <transferbench_args>]
#
# Arguments:
# hosts: Comma-separated list of hostnames/IPs to run on
# env_vars: Optional environment variables (e.g., NUM_ITERATIONS=10 NUM_SUBITERATIONS=100)
# transferbench_args: Arguments to pass to TransferBench (after --)
#
# Examples:
# ./LaunchTransferBench.sh node1,node2,node3,node4 NUM_ITERATIONS=10 NUM_SUBITERATIONS=100 -- a2a
# ./LaunchTransferBench.sh host1,host2 -- cmdline 1G "1 1 R0G0 R0D0 R1G0"
# ./LaunchTransferBench.sh server1,server2,server3 TB_MASTER_PORT=30000 -- example.cfg
#
set -e
# Function to display usage information
#
# Writes the help text to stdout: synopsis, argument descriptions, a short
# list of TransferBench environment variables, examples and notes.
# Arguments: none (the program name in the text is taken from $0)
# Returns:   exit status of cat
show_usage() {
  local launcher="$0"

  cat << USAGE_TEXT
Usage: ${launcher} <hosts> [env_vars...] [-- <transferbench_args>]
Arguments:
hosts Comma-separated list of hostnames/IPs to run on
env_vars Optional environment variables (KEY=VALUE format)
transferbench_args Arguments to pass to TransferBench (after --)
Environment Variables for TransferBench:
NUM_ITERATIONS Number of timed iterations to perform (default: 10)
NUM_SUBITERATIONS Number of subiterations to perform (default: 1)
NUM_WARMUPS Number of warmup iterations (default: 3)
TB_MASTER_PORT Port for rank 0 communication (default: 29500)
... and many others (see TransferBench documentation)
Examples:
${launcher} node1,node2,node3,node4 NUM_ITERATIONS=10 NUM_SUBITERATIONS=100 -- a2a
${launcher} host1,host2 -- cmdline 1G "1 1 R0G0 R0D0 R1G0"
${launcher} server1,server2,server3 TB_MASTER_PORT=30000 -- example.cfg
Notes:
- The first host in the list becomes rank 0 (master)
- TransferBench must be built in the same directory as this script on all hosts
- SSH access must be configured for all hosts
USAGE_TEXT
}
# Parse command line arguments
# The host list is mandatory; without any argument only the usage text is shown.
if [[ $# -lt 1 ]]; then
  show_usage
  exit 1
fi

# Parse hosts
# The first positional argument is the comma-separated host list; it is
# consumed here so that the remaining arguments are env vars / TransferBench args.
hosts_input="$1"
shift
if [[ -z "$hosts_input" ]]; then
  echo "ERROR: No hosts specified" >&2
  show_usage
  exit 1
fi

# Convert comma-separated hosts to array and trim whitespace
IFS=',' read -ra hosts_raw <<< "$hosts_input"
hosts=()
for host in "${hosts_raw[@]}"; do
  # Trim leading and trailing whitespace with parameter expansion.
  # (Piping through echo would swallow entries such as "-n" or "-e" as echo
  # options and misreport them as empty hostnames.)
  host="${host#"${host%%[![:space:]]*}"}"
  host="${host%"${host##*[![:space:]]}"}"
  if [[ -z "$host" ]]; then
    echo "ERROR: Empty hostname found in host list" >&2
    exit 1
  fi
  # Check for remaining whitespace in hostname
  if [[ "$host" =~ [[:space:]] ]]; then
    echo "ERROR: Hostname '$host' contains whitespace" >&2
    exit 1
  fi
  # The hostname is later passed to ssh as a positional argument; a leading
  # dash would make ssh parse it as an option instead of a destination.
  if [[ "$host" == -* ]]; then
    echo "ERROR: Hostname '$host' must not start with '-'" >&2
    exit 1
  fi
  hosts+=("$host")
done

# Multi-rank execution needs a master plus at least one worker
num_ranks=${#hosts[@]}
if [[ $num_ranks -lt 2 ]]; then
  echo "ERROR: At least 2 hosts are required for multi-rank execution" >&2
  echo "For single-node execution, run TransferBench directly without this script" >&2
  exit 1
fi
echo "Hosts : ${hosts[*]}"
echo "Ranks : $num_ranks"
# Parse environment variables and TransferBench arguments
# Everything before "--" must look like KEY=VALUE and is collected in env_vars;
# everything after it is collected verbatim in tb_args. The "--" marker itself
# is never stored.
env_vars=()
tb_args=()
parsing_tb_args=false
while (( $# > 0 )); do
  case "$1" in
    --)
      # Switch to collecting TransferBench arguments
      parsing_tb_args=true
      ;;
    *)
      if [[ $parsing_tb_args == true ]]; then
        tb_args+=("$1")
      elif [[ "$1" =~ ^[A-Za-z_][A-Za-z0-9_]*=.*$ ]]; then
        env_vars+=("$1")
      else
        echo "ERROR: Invalid environment variable format: $1" >&2
        echo "Environment variables should be in KEY=VALUE format" >&2
        exit 1
      fi
      ;;
  esac
  shift
done

# Summarize what was parsed
echo "EnvVars : ${env_vars[*]:-none}"
if (( ${#tb_args[@]} > 0 )); then
  echo "Args : ${tb_args[*]}"
else
  echo "Args : none (will show topology)"
fi
# Get the absolute directory where this script is located; the TransferBench
# binary is looked up under the same path.
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
transferbench_path="${script_dir}/TransferBench"
printf '\n'

# Build properly escaped environment variable string
# Each KEY=VALUE entry (already validated during parsing) is appended as
# " KEY=<shell-quoted VALUE>" so the string can be embedded in a command line.
env_string=""
for assignment in "${env_vars[@]}"; do
  var_name="${assignment%%=*}"
  printf -v quoted_value '%q' "${assignment#*=}"
  env_string+=" ${var_name}=${quoted_value}"
done
# Helper for cleanup: succeeds while at least one PID in worker_pids still
# answers to signal 0.
_cleanup_workers_alive() {
  local pid
  for pid in "${worker_pids[@]}"; do
    if kill -0 "$pid" 2>/dev/null; then
      return 0
    fi
  done
  return 1
}

# Helper for cleanup: polls every 0.1s until no worker PID is alive or the
# given number of polls ($1) is used up, so the wait is bounded.
_cleanup_await_workers() {
  local polls_left=$1
  while (( polls_left > 0 )) && _cleanup_workers_alive; do
    # Guarded: an interrupted sleep must not abort the cleanup under set -e
    sleep 0.1 || true
    polls_left=$(( polls_left - 1 ))
  done
  return 0
}

# Cleanup function for interruption
#
# Stops the local SSH processes recorded in worker_pids (SIGTERM first, then
# SIGKILL for any that remain), then starts a best-effort pkill of
# TransferBench on every host in worker_hosts, and ends the script.
# Globals:   worker_pids (read), worker_hosts (read)
# Arguments: none
# Outputs:   progress messages on stderr
# Returns:   does not return; exits with status 130
cleanup() {
  local pid host

  echo >&2
  echo "Interrupted! Cleaning up worker processes..." >&2

  # First kill local SSH processes to stop remote TransferBench
  if [[ ${#worker_pids[@]} -gt 0 ]]; then
    for pid in "${worker_pids[@]}"; do
      if kill -0 "$pid" 2>/dev/null; then
        kill -TERM "$pid" 2>/dev/null || true
      fi
    done

    # Give SSH processes up to ~2 seconds to terminate, returning as soon as
    # all of them are gone
    _cleanup_await_workers 20

    # Force kill any remaining local SSH processes
    for pid in "${worker_pids[@]}"; do
      if kill -0 "$pid" 2>/dev/null; then
        echo "Force killing SSH PID $pid..." >&2
        kill -KILL "$pid" 2>/dev/null || true
      fi
    done

    # Brief bounded wait (~1 second) for the killed processes to disappear.
    # Polling is used because a separate "bash -c wait" process cannot wait
    # on PIDs that are not its own children.
    _cleanup_await_workers 10
  fi

  # Final cleanup: kill any remaining TransferBench processes on all hosts
  if [[ ${#worker_hosts[@]} -gt 0 ]]; then
    for host in "${worker_hosts[@]}"; do
      # Best effort by design: errors are ignored and the command runs in the background
      ssh -q -o LogLevel=ERROR -o ConnectTimeout=1 "$host" "pkill -u \$(whoami) -f TransferBench 2>/dev/null || true" 2>/dev/null || true &
    done
    # Don't wait for these - let them complete in background
  fi

  echo "Cleanup complete" >&2
  exit 130
}
# Set up signal handlers for Ctrl-C and termination
trap cleanup INT TERM

# Start worker ranks in the background
# hosts[0] runs rank 0 (master); every other host runs one worker rank.
master_host="${hosts[0]}"
worker_pids=()
worker_hosts=()

# Build properly escaped arguments string
tb_args_escaped=""
for arg in "${tb_args[@]}"; do
  tb_args_escaped+=" $(printf '%q' "$arg")"
done

# The master address and the binary path are embedded in the remote command
# line as well, so they get the same shell quoting as the arguments.
printf -v master_host_escaped '%q' "$master_host"
printf -v transferbench_path_escaped '%q' "$transferbench_path"

for ((rank=1; rank<num_ranks; rank++)); do
  worker_host="${hosts[$rank]}"
  worker_cmd="TB_NUM_RANKS=$num_ranks TB_RANK=$rank TB_SINGLE_LOG=1 TB_MASTER_ADDR=$master_host_escaped $env_string $transferbench_path_escaped$tb_args_escaped"
  # Worker output is discarded; only the master's output is shown
  ssh -q -o LogLevel=ERROR "$worker_host" "$worker_cmd" >/dev/null 2>&1 &
  worker_pids+=("$!")
  worker_hosts+=("$worker_host")
done

# Start master rank (TransferBench will wait for all workers to connect)
master_cmd="TB_NUM_RANKS=$num_ranks TB_RANK=0 TB_SINGLE_LOG=1 $env_string $transferbench_path_escaped$tb_args_escaped"
if ! ssh -q -o LogLevel=ERROR "$master_host" "$master_cmd"; then
  echo "ERROR: Master rank failed on $master_host" >&2
  # Clean up worker processes before exiting.
  # cleanup finishes with its own "exit 130"; running it in a subshell keeps
  # that exit from replacing the failure status reported below.
  (cleanup) || true
  exit 1
fi

# Check worker exit codes
any_worker_failed=false
for ((i=0; i<${#worker_pids[@]}; i++)); do
  if ! wait "${worker_pids[$i]}"; then
    rank=$((i+1))
    echo "ERROR: Worker rank $rank failed on ${worker_hosts[$i]}" >&2
    any_worker_failed=true
  fi
done
if [[ "$any_worker_failed" == "true" ]]; then
  exit 1
fi