Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ Documentation for TransferBench is available at

## v1.67.00
### Added
- Added NIC_TRAFFIC_CLASS to set the DSCP/traffic class byte in the RoCE GRH for QPs (RoCE only)
- Added NIC_SERVICE_LEVEL to set the IB service level (sl) for QPs (IB and RoCE)
- Initial support for pod communication. Requires a compatible hardware / ROCm version and is subject to further testing
- This potentially enables GFX/DMA executors to access SRC/DST memory locations on GPUs within the same pod
- Pod membership requires amd-smi; however, this requirement can be skipped by setting TB_FORCE_SINGLE_POD=1
Expand Down
22 changes: 22 additions & 0 deletions src/client/EnvVars.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ class EnvVars
int nicChunkBytes; // Number of bytes to send per chunk for RDMA operations
int nicCqPollBatch; // Number of CQ entries to poll per ibv_poll_cq call
int nicRelaxedOrder; // Use relaxed ordering for RDMA
int nicServiceLevel; // IB service level (sl) applied to QPs (IB and RoCE)
int nicTrafficClass; // DSCP/traffic class byte for RoCE GRH
int roceVersion; // RoCE version number

// Developer features
Expand Down Expand Up @@ -185,6 +187,18 @@ class EnvVars
nicChunkBytes = GetEnvVar("NIC_CHUNK_BYTES" , 1073741824);
nicCqPollBatch = GetEnvVar("NIC_CQ_POLL_BATCH" , 4);
nicRelaxedOrder = GetEnvVar("NIC_RELAX_ORDER" , 1);
nicServiceLevel = GetEnvVar("NIC_SERVICE_LEVEL" , 0);
nicTrafficClass = GetEnvVar("NIC_TRAFFIC_CLASS" , 0);

// Check that NIC service level and traffic class are in valid ranges
if (nicServiceLevel < 0 || nicServiceLevel > 15) {
printf("[ERROR] NIC_SERVICE_LEVEL must be in range 0..15 (got %d)\n", nicServiceLevel);
exit(1);
}
if (nicTrafficClass < 0 || nicTrafficClass > 255) {
printf("[ERROR] NIC_TRAFFIC_CLASS must be in range 0..255 (got %d)\n", nicTrafficClass);
exit(1);
}
Comment thread
paklui marked this conversation as resolved.

gpuMaxHwQueues = GetEnvVar("GPU_MAX_HW_QUEUES" , 4);

Expand Down Expand Up @@ -366,6 +380,8 @@ class EnvVars
printf(" NIC_CHUNK_BYTES - Number of bytes to send at a time using NIC (default = 1GB)\n");
printf(" NIC_CQ_POLL_BATCH - Number of CQ entries to poll per ibv_poll_cq call (default = 4)\n");
printf(" NIC_RELAX_ORDER - Set to non-zero to use relaxed ordering\n");
printf(" NIC_SERVICE_LEVEL - IB service level (sl) for InfiniBand QPs (default=0)\n");
printf(" NIC_TRAFFIC_CLASS - DSCP/traffic class byte for RoCE GRH (default=0)\n");
Comment thread
paklui marked this conversation as resolved.
#endif
printf(" NUM_ITERATIONS - # of timed iterations per test. If negative, run for this many seconds instead\n");
printf(" NUM_SUBITERATIONS - # of sub-iterations to run per iteration. Must be non-negative\n");
Expand Down Expand Up @@ -504,6 +520,10 @@ class EnvVars
"Polling %d CQ entries per ibv_poll_cq call", nicCqPollBatch);
Print("NIC_RELAX_ORDER", nicRelaxedOrder,
"Using %s ordering for NIC RDMA", nicRelaxedOrder ? "relaxed" : "strict");
Print("NIC_SERVICE_LEVEL", nicServiceLevel,
"IB service level (sl) set to %d", nicServiceLevel);
Print("NIC_TRAFFIC_CLASS", nicTrafficClass,
"RoCE traffic class (DSCP) set to %d", nicTrafficClass);
#endif
Print("NUM_ITERATIONS", numIterations,
(numIterations == 0) ? "Running infinitely" :
Expand Down Expand Up @@ -725,6 +745,8 @@ class EnvVars
cfg.nic.ibPort = ibPort;
cfg.nic.ipAddressFamily = ipAddressFamily;
cfg.nic.useRelaxedOrder = nicRelaxedOrder;
cfg.nic.serviceLevel = nicServiceLevel;
cfg.nic.trafficClass = nicTrafficClass;
Comment thread
paklui marked this conversation as resolved.
cfg.nic.roceVersion = roceVersion;

return cfg;
Expand Down
15 changes: 11 additions & 4 deletions src/header/TransferBench.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,8 @@ namespace TransferBench
int queueSize = 100; ///< Completion queue size
int roceVersion = 2; ///< RoCE version (used for auto GID detection)
int useRelaxedOrder = 1; ///< Use relaxed ordering
uint8_t serviceLevel = 0; ///< IB service level (sl) applied to QPs (IB and RoCE)
Comment thread
paklui marked this conversation as resolved.
uint8_t trafficClass = 0; ///< DSCP/traffic class byte for RoCE GRH
int useNuma = 0; ///< Switch to closest numa thread for execution
};

Expand Down Expand Up @@ -2000,6 +2002,8 @@ namespace {
if (nic.maxSendWorkReq != cfg.nic.maxSendWorkReq) ADD_ERROR("cfg.nic.maxSendWorkReq");
// nic.queueSize is permitted to be different across ranks
if (nic.roceVersion != cfg.nic.roceVersion) ADD_ERROR("cfg.nic.roceVersion");
if (nic.serviceLevel != cfg.nic.serviceLevel) ADD_ERROR("cfg.nic.serviceLevel");
if (nic.trafficClass != cfg.nic.trafficClass) ADD_ERROR("cfg.nic.trafficClass");
if (nic.useRelaxedOrder != cfg.nic.useRelaxedOrder) ADD_ERROR("cfg.nic.useRelaxedOrder");
Comment thread
paklui marked this conversation as resolved.
if (nic.useNuma != cfg.nic.useNuma) ADD_ERROR("cfg.nic.useNuma");
}
Expand Down Expand Up @@ -3284,7 +3288,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
ConnInfo const& connInfo,
uint8_t const& port,
bool const& isRoCE,
ibv_mtu const& mtu)
ibv_mtu const& mtu,
uint8_t const& trafficClass,
uint8_t const& serviceLevel)
{
// Prepare QP attributes
struct ibv_qp_attr attr = {};
Expand All @@ -3300,11 +3306,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
attr.ah_attr.grh.flow_label = 0;
attr.ah_attr.grh.sgid_index = connInfo.gidIdx;
attr.ah_attr.grh.hop_limit = 255;
attr.ah_attr.grh.traffic_class = trafficClass;
} else {
attr.ah_attr.is_global = 0;
attr.ah_attr.dlid = connInfo.lid;
}
attr.ah_attr.sl = 0;
attr.ah_attr.sl = serviceLevel;
attr.ah_attr.src_path_bits = 0;
attr.ah_attr.port_num = port;
attr.dest_qp_num = connInfo.qpn;
Expand Down Expand Up @@ -3588,7 +3595,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
static_assert(std::is_trivially_copyable<QpTransitionResult>::value, "QpTransitionResult must be trivially copyable for MPI broadcast");
QpTransitionResult srcQpResult = {ERR_NONE, false};
if (GetRank() == srcMemRank) {
ErrResult err = TransitionQpToRtr(rss.srcQueuePairs[i], dstConnInfo, port, srcIsRoCE, rss.srcPortAttr.active_mtu);
ErrResult err = TransitionQpToRtr(rss.srcQueuePairs[i], dstConnInfo, port, srcIsRoCE, rss.srcPortAttr.active_mtu, cfg.nic.trafficClass, cfg.nic.serviceLevel);
srcQpResult.rtrFailed = (err.errType != ERR_NONE);
if (err.errType == ERR_NONE) {
err = TransitionQpToRts(rss.srcQueuePairs[i]);
Expand All @@ -3603,7 +3610,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)

QpTransitionResult dstQpResult = {ERR_NONE, false};
if (GetRank() == dstMemRank) {
ErrResult err = TransitionQpToRtr(rss.dstQueuePairs[i], srcConnInfo, port, dstIsRoCE, rss.dstPortAttr.active_mtu);
ErrResult err = TransitionQpToRtr(rss.dstQueuePairs[i], srcConnInfo, port, dstIsRoCE, rss.dstPortAttr.active_mtu, cfg.nic.trafficClass, cfg.nic.serviceLevel);
dstQpResult.rtrFailed = (err.errType != ERR_NONE);
if (err.errType == ERR_NONE) {
err = TransitionQpToRts(rss.dstQueuePairs[i]);
Expand Down
Loading