Files
Sajina PK 09b8342e22 [Rocprofiler-systems] : Add XGMI and PCIe metrics to the profiling data (#1628)
* Add XGMI and PCIe metrics to the profiling data

Add support for AMD XGMI (GPU-to-GPU interconnect) and PCIe
metrics:
  * XGMI link width in bits
  * XGMI link speed in GT/s
  * Per-link read bandwidth (KB)
  * Per-link write bandwidth (KB)

- Add new categories for PCIe metrics:
  * PCIe link width
  * PCIe link speed in GT/s
  * Accumulated bandwidth (MB)
  * Instantaneous bandwidth (MB/s)

* Fix VCN/JPEG insert logic

* Modify the gpu_metrics struct to accommodate XCP structure

* Add ctest automation for gpu interconnect metrics

* Refactor to move gpu_metrics struct and serialization to another file

* Possible fix for timeout in CI

Fix redundant skip check in ctest
Add xgmi and pcie option in rocprof-sys-avail.

* Change2: Address review comments

Change ctest sampling to avoid timeout
Change variable name and code structuring

* Add option in ctest to run rocprof-sys-run without rewrite

Run transferbench with rocprof-sys-run without sampling

* Change3: Fix sample insert bug and address review comments

xgmi and pci support check
renaming variables
additional hip_api validation in rocpd

* Reduce the load from the TransferBench sample

The CI builds were timing out when flushing a big temporary file to the
DB: (2720824.23 KB / 2720.82 MB / 2.72 GB)...
2025-11-14 19:42:33 -05:00

811 lines
32 KiB
C++

/*
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <algorithm>
#include <cmath>
#include <cstdarg>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <limits>
#include <map>
#include <random>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
// Include necessary headers
#include "TransferBench.hpp"
using namespace TransferBench;
// Helper macro for catching HIP errors.
//
// Evaluates `cmd` exactly once and stores the result in a local hipError_t. If the
// result is not hipSuccess, the HIP error string together with the line and file of
// the call site is written to std::cerr and the process terminates via exit(-1).
// The do { ... } while(0) wrapper makes the expansion a single statement, so the
// macro can be followed by a semicolon and used inside unbraced if/else bodies.
#define HIP_CALL(cmd) \
do \
{ \
hipError_t error = (cmd); \
if(error != hipSuccess) \
{ \
std::cerr << "Encountered HIP error (" << hipGetErrorString(error) \
<< ") at line " << __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while(0)
// Number of bytes given to each Transfer when no size is passed on the command
// line: 16 KiB (1 << 14), kept small so that profiling runs capture little data.
size_t const DEFAULT_BYTES_PER_TRANSFER = static_cast<size_t>(1) << 14;
// Three-letter executor names, indexed by executor type when results are printed.
char const ExeTypeName[5][4] = {
    "CPU",
    "GPU",
    "DMA",
    "NIC",
    "NIC",
};
// Simplified EnvVars class for standalone use.
//
// Holds the benchmark settings as public data members. Every member carries an
// in-class default; the constructor then overwrites most of them from the process
// environment, and ToConfigOptions() copies them into a TransferBench::ConfigOptions.
class EnvVars
{
public:
    // Environment variables (using minimal defaults for profiling)
    int                           numIterations    = 1;
    int                           numSubIterations = 1;
    int                           numWarmups       = 0;
    int                           showIterations   = 0;
    int                           useInteractive   = 0;
    int                           alwaysValidate   = 0;
    int                           blockBytes       = 256;
    int                           byteOffset       = 0;
    std::vector<float>            fillPattern;
    std::vector<int>              fillCompress;
    int                           validateDirect = 0;
    int                           validateSource = 0;
    int                           useHsaDma      = 0;
    int                           gfxBlockOrder  = 0;
    int                           gfxBlockSize   = 256;
    std::vector<uint32_t>         cuMask;
    std::vector<std::vector<int>> prefXccTable;
    int                           gfxTemporal      = 0;
    int                           gfxUnroll        = 4;
    int                           useHipEvents     = 1;
    int                           useSingleStream  = 1;
    int                           gfxSingleTeam    = 1;
    int                           gfxWaveOrder     = 0;
    int                           gfxWordSize      = 4;
    int                           hideEnv          = 0;
    int                           minNumVarSubExec = 1;
    int                           maxNumVarSubExec = 0;
    int                           outputToCsv      = 0;
    int                           samplingFactor   = 1;
    int                           ibGidIndex       = -1;
    int                           roceVersion      = 2;
    int                           ipAddressFamily  = 4;
    uint8_t                       ibPort           = 1;
    int                           nicRelaxedOrder  = 1;
    std::string                   closestNicStr    = "";
    int                           gpuMaxHwQueues   = 4;

    // Constructor that collects values from the environment.
    //
    // Queries the properties of HIP device 0 to choose the default GFX unroll factor;
    // a failing HIP call terminates the process (see HIP_CALL).
    EnvVars()
    {
        int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
        (void) numDetectedGpus;  // May be unused

        // Get architecture-specific defaults. gcnArchName may carry a ':'-separated
        // suffix, so only the part before the first ':' is compared.
        hipDeviceProp_t prop;
        HIP_CALL(hipGetDeviceProperties(&prop, 0));
        std::string const fullName = prop.gcnArchName;
        std::string const archName = fullName.substr(0, fullName.find(':'));

        // gfx906 and gfx90a default to 8; gfx942, gfx950 and anything else use 4
        int defaultGfxUnroll = 4;
        if(archName == "gfx906" || archName == "gfx90a") defaultGfxUnroll = 8;

        // Read environment variables
        alwaysValidate   = GetEnvVar("ALWAYS_VALIDATE", 0);
        blockBytes       = GetEnvVar("BLOCK_BYTES", 256);
        byteOffset       = GetEnvVar("BYTE_OFFSET", 0);
        gfxBlockOrder    = GetEnvVar("GFX_BLOCK_ORDER", 0);
        gfxBlockSize     = GetEnvVar("GFX_BLOCK_SIZE", 256);
        gfxSingleTeam    = GetEnvVar("GFX_SINGLE_TEAM", 1);
        gfxTemporal      = GetEnvVar("GFX_TEMPORAL", 0);
        gfxUnroll        = GetEnvVar("GFX_UNROLL", defaultGfxUnroll);
        gfxWaveOrder     = GetEnvVar("GFX_WAVE_ORDER", 0);
        gfxWordSize      = GetEnvVar("GFX_WORD_SIZE", 4);
        hideEnv          = GetEnvVar("HIDE_ENV", 0);
        minNumVarSubExec = GetEnvVar("MIN_VAR_SUBEXEC", 1);
        maxNumVarSubExec = GetEnvVar("MAX_VAR_SUBEXEC", 0);
        numIterations    = GetEnvVar("NUM_ITERATIONS", 1);
        numSubIterations = GetEnvVar("NUM_SUBITERATIONS", 1);
        numWarmups       = GetEnvVar("NUM_WARMUPS", 0);
        outputToCsv      = GetEnvVar("OUTPUT_TO_CSV", 0);
        samplingFactor   = GetEnvVar("SAMPLING_FACTOR", 1);
        showIterations   = GetEnvVar("SHOW_ITERATIONS", 0);
        useHipEvents     = GetEnvVar("USE_HIP_EVENTS", 1);
        useHsaDma        = GetEnvVar("USE_HSA_DMA", 0);
        useInteractive   = GetEnvVar("USE_INTERACTIVE", 0);
        useSingleStream  = GetEnvVar("USE_SINGLE_STREAM", 1);
        validateDirect   = GetEnvVar("VALIDATE_DIRECT", 0);
        validateSource   = GetEnvVar("VALIDATE_SOURCE", 0);
        ibGidIndex       = GetEnvVar("IB_GID_INDEX", -1);
        // ibPort is a uint8_t; make the narrowing from int explicit
        ibPort          = static_cast<uint8_t>(GetEnvVar("IB_PORT_NUMBER", 1));
        roceVersion     = GetEnvVar("ROCE_VERSION", 2);
        ipAddressFamily = GetEnvVar("IP_ADDRESS_FAMILY", 4);
        nicRelaxedOrder = GetEnvVar("NIC_RELAX_ORDER", 1);
        closestNicStr   = GetEnvVar("CLOSEST_NIC", "");
        gpuMaxHwQueues  = GetEnvVar("GPU_MAX_HW_QUEUES", 4);
    }

    // Returns the environment variable `varname` converted with atoi, or
    // `defaultValue` when the variable is not set.
    static int GetEnvVar(std::string const& varname, int defaultValue)
    {
        char const* value = getenv(varname.c_str());
        return value ? atoi(value) : defaultValue;
    }

    // Returns the environment variable `varname` as a string, or `defaultValue`
    // when the variable is not set.
    static std::string GetEnvVar(std::string const& varname,
                                 std::string const& defaultValue)
    {
        char const* value = getenv(varname.c_str());
        return value ? std::string(value) : defaultValue;
    }

    // Prints one settings row: the name (left-justified, 20 columns), the integer
    // value (12 columns) and a printf-style description built from `format` and the
    // trailing arguments. Separators are commas when outputToCsv is set.
    void Print(std::string const& name, int32_t const value, const char* format,
               ...) const
    {
        printf("%-20s%s%12d%s", name.c_str(), outputToCsv ? "," : " = ", value,
               outputToCsv ? "," : " : ");
        va_list args;
        va_start(args, format);
        vprintf(format, args);
        va_end(args);
        printf("\n");
    }

    // Same as the integer overload, for a value that is already a string.
    void Print(std::string const& name, std::string const& value, const char* format,
               ...) const
    {
        printf("%-20s%s%12s%s", name.c_str(), outputToCsv ? "," : " = ", value.c_str(),
               outputToCsv ? "," : " : ");
        va_list args;
        va_start(args, format);
        vprintf(format, args);
        va_end(args);
        printf("\n");
    }

    // Display env var settings (simplified).
    // Prints a banner (or a CSV header row) followed by a subset of the settings;
    // the settings rows are suppressed when hideEnv is set.
    void DisplayEnvVars() const
    {
        std::string nicSupport = "";
#if NIC_EXEC_ENABLED
        nicSupport = " (with NIC support)";
#endif
        if(!outputToCsv)
        {
            printf("Standalone AllToAll v%s%s\n", TransferBench::VERSION,
                   nicSupport.c_str());
            printf("===============================================================\n");
            if(!hideEnv)
                printf("[Common] (Suppress by setting "
                       "HIDE_ENV=1)\n");
        }
        else if(!hideEnv)
            printf("EnvVar,Value,Description,(Standalone AllToAll v%s)\n",
                   TransferBench::VERSION);
        if(hideEnv) return;
        Print("NUM_ITERATIONS", numIterations, "Running %d timed iteration(s)",
              numIterations);
        Print("NUM_WARMUPS", numWarmups, "Running %d warmup iteration(s) per Test",
              numWarmups);
        Print("USE_SINGLE_STREAM", useSingleStream, "Using single stream per GFX %s",
              useSingleStream ? "device" : "Transfer");
        Print("GFX_UNROLL", gfxUnroll, "Using GFX unroll factor of %d", gfxUnroll);
        printf("\n");
    }

    // Display usage instructions: the list of supported environment variables
    static void DisplayUsage()
    {
        printf("Environment variables:\n");
        printf("======================\n");
        printf(" NUM_ITERATIONS - # of timed iterations per test (default=1)\n");
        printf(
            " NUM_WARMUPS - # of untimed warmup iterations per test (default=0)\n");
        printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor "
               "(default=1)\n");
        printf(" GFX_UNROLL - Unroll factor for GFX kernel (default=4)\n");
        printf(
            " HIDE_ENV - Hide environment variable value listing (default=0)\n");
        printf(" OUTPUT_TO_CSV - Outputs to CSV format if set (default=0)\n");
        printf(" SHOW_ITERATIONS - Show per-iteration timing info (default=0)\n");
        printf("\n");
        printf("AllToAll specific variables:\n");
        printf(" A2A_DIRECT - Only using direct links (default=1)\n");
        printf(" A2A_LOCAL - Include local transfers (default=0)\n");
        printf(" A2A_MODE - Transfer mode: 0=Copy, 1=Read-Only, 2=Write-Only "
               "(default=0)\n");
        printf(" NUM_GPU_DEVICES - Number of GPUs to use (default=4 detected)\n");
        printf(
            " NUM_SUB_EXEC - Number of subexecutors/CUs per Transfer (default=1)\n");
        printf(" USE_DMA_EXEC - Use DMA executor instead of GFX (default=0)\n");
        printf(" USE_FINE_GRAIN - Use fine-grained memory (default=1)\n");
        printf(" USE_REMOTE_READ - Use DST as executor instead of SRC (default=0)\n");
    }

    // Copies the collected settings into a TransferBench::ConfigOptions.
    //
    // closestNicStr is parsed as a comma-separated list of NIC indices. An entry that
    // std::stoi cannot convert (not a number, or outside the range of int) prints an
    // error and exits the process with status 1.
    TransferBench::ConfigOptions ToConfigOptions() const
    {
        TransferBench::ConfigOptions cfg;
        cfg.general.numIterations      = numIterations;
        cfg.general.numSubIterations   = numSubIterations;
        cfg.general.numWarmups         = numWarmups;
        cfg.general.recordPerIteration = showIterations;
        cfg.general.useInteractive     = useInteractive;
        cfg.data.alwaysValidate        = alwaysValidate;
        cfg.data.blockBytes            = blockBytes;
        cfg.data.byteOffset            = byteOffset;
        cfg.data.fillCompress          = fillCompress;
        cfg.data.fillPattern           = fillPattern;
        cfg.data.validateDirect        = validateDirect;
        cfg.data.validateSource        = validateSource;
        cfg.dma.useHipEvents           = useHipEvents;
        cfg.dma.useHsaCopy             = useHsaDma;
        cfg.gfx.blockOrder             = gfxBlockOrder;
        cfg.gfx.blockSize              = gfxBlockSize;
        cfg.gfx.cuMask                 = cuMask;
        cfg.gfx.prefXccTable           = prefXccTable;
        cfg.gfx.unrollFactor           = gfxUnroll;
        cfg.gfx.temporalMode           = gfxTemporal;
        cfg.gfx.useHipEvents           = useHipEvents;
        cfg.gfx.useMultiStream         = !useSingleStream;
        cfg.gfx.useSingleTeam          = gfxSingleTeam;
        cfg.gfx.waveOrder              = gfxWaveOrder;
        cfg.gfx.wordSize               = gfxWordSize;
        cfg.nic.ibGidIndex             = ibGidIndex;
        cfg.nic.ibPort                 = ibPort;
        cfg.nic.ipAddressFamily        = ipAddressFamily;
        cfg.nic.useRelaxedOrder        = nicRelaxedOrder;
        cfg.nic.roceVersion            = roceVersion;

        std::vector<int> closestNics;
        if(closestNicStr != "")
        {
            std::stringstream ss(closestNicStr);
            std::string       item;
            while(std::getline(ss, item, ','))
            {
                try
                {
                    closestNics.push_back(std::stoi(item));
                } catch(const std::logic_error&)
                {
                    // std::stoi throws std::invalid_argument or std::out_of_range;
                    // both derive from std::logic_error, so neither can escape.
                    printf("[ERROR] Invalid NIC index (%s) by user in %s\n",
                           item.c_str(), closestNicStr.c_str());
                    exit(1);
                }
            }
            cfg.nic.closestNics = closestNics;
        }
        return cfg;
    }
};
// Forward declarations of the helpers defined further down in this file, so that
// they can be referenced regardless of definition order.

// Prints per-executor and per-transfer results of one test
void
PrintResults(EnvVars const& ev, int const testNum, std::vector<Transfer> const& transfers,
TransferBench::TestResults const& results);
// Prints a list of errors; exits if any of them is fatal
void
PrintErrors(std::vector<ErrResult> const& errors);
// Prints a single warning, or prints a fatal error and exits
void
CheckForError(ErrResult const& error);
// Converts a list of memory devices to a printable string
std::string
MemDevicesToStr(std::vector<MemDevice> const& memDevices);
// Builds the printable form of a list of memory devices: for every device its
// memory-type entry from TransferBench::MemTypeStr immediately followed by its index.
// An empty list is rendered as "N".
std::string
MemDevicesToStr(std::vector<MemDevice> const& memDevices)
{
    std::stringstream text;
    for(auto const& device : memDevices)
    {
        text << TransferBench::MemTypeStr[device.memType] << device.memIndex;
    }
    return memDevices.empty() ? std::string("N") : text.str();
}
// Reports a single TransferBench result: a warning is printed and execution
// continues, a fatal error is printed and the process exits with status 1.
// Any other error type (including ERR_NONE) produces no output.
void
CheckForError(ErrResult const& error)
{
    if(error.errType == ERR_WARN)
    {
        printf("[WARN] %s\n", error.errMsg.c_str());
    }
    else if(error.errType == ERR_FATAL)
    {
        printf("[ERROR] %s\n", error.errMsg.c_str());
        exit(1);
    }
}
// Prints every entry of `errors`, labelled "ERROR" when it is fatal and "WARN"
// otherwise. All entries are printed before the process exits with status 1,
// which happens only if at least one of them was fatal.
void
PrintErrors(std::vector<ErrResult> const& errors)
{
    bool sawFatal = false;
    for(auto const& entry : errors)
    {
        bool const  fatal = (entry.errType == ERR_FATAL);
        char const* label = fatal ? "ERROR" : "WARN";
        printf("[%s] %s\n", label, entry.errMsg.c_str());
        if(fatal) sawFatal = true;
    }
    if(sawFatal) exit(1);
}
// Print TransferBench test results.
//
// For every executor in `results` prints one summary line followed by one line per
// Transfer it ran; when ev.showIterations is set, also prints each timed iteration
// (fastest first) and the standard deviation of duration and bandwidth. A final line
// reports the aggregate figures of the whole test.
//
// ev        - supplies outputToCsv (column separator) and showIterations
// testNum   - number shown in the "Test N:" heading (non-CSV output only)
// transfers - the Transfers that were executed, indexed by transfer index
// results   - results returned for those Transfers
//
// Exits with status 1 if per-iteration output is requested but the number of
// recorded iterations does not match results.numTimedIterations.
void
PrintResults(EnvVars const& ev, int const testNum, std::vector<Transfer> const& transfers,
             TransferBench::TestResults const& results)
{
    char const   sep                = ev.outputToCsv ? ',' : '|';
    size_t const numTimedIterations = results.numTimedIterations;
    if(!ev.outputToCsv) printf("Test %d:\n", testNum);

    // Loop over each executor (by reference: avoids copying each entry and its
    // list of transfer indices)
    for(auto const& exeInfoPair : results.exeResults)
    {
        ExeDevice const& exeDevice = exeInfoPair.first;
        ExeResult const& exeResult = exeInfoPair.second;
        ExeType const    exeType   = exeDevice.exeType;
        int32_t const    exeIndex  = exeDevice.exeIndex;
        printf(" Executor: %3s %02d %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f "
               "GB/s (sum)\n",
               ExeTypeName[exeType], exeIndex, sep, exeResult.avgBandwidthGbPerSec, sep,
               exeResult.avgDurationMsec, sep, exeResult.numBytes, sep,
               exeResult.sumBandwidthGbPerSec);

        // Loop over each transfer
        for(int idx : exeResult.transferIdx)
        {
            Transfer const&       t = transfers[idx];
            TransferResult const& r = results.tfrResults[idx];

            // Optional ".<subindex>" suffix; snprintf keeps the write inside the buffer
            char exeSubIndexStr[32] = "";
            if(t.exeSubIndex != -1)
                snprintf(exeSubIndexStr, sizeof(exeSubIndexStr), ".%d", t.exeSubIndex);

            printf(" Transfer %02d %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c %s "
                   "-> %c%03d%s:%03d -> %s\n",
                   idx, sep, r.avgBandwidthGbPerSec, sep, r.avgDurationMsec, sep,
                   r.numBytes, sep, MemDevicesToStr(t.srcs).c_str(),
                   TransferBench::ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
                   exeSubIndexStr, t.numSubExecs, MemDevicesToStr(t.dsts).c_str());

            // Show per-iteration timing information
            if(!ev.showIterations) continue;

            // Check that per-iteration information exists
            if(r.perIterMsec.size() != numTimedIterations)
            {
                printf("[ERROR] Per iteration timing data unavailable: Expected %zu "
                       "data points, but have %zu\n",
                       numTimedIterations, r.perIterMsec.size());
                exit(1);
            }

            // Compute standard deviation and track iterations by speed. The set
            // orders (duration, 1-based iteration number) pairs by duration first.
            std::set<std::pair<double, int>> times;
            double                           stdDevTime = 0;
            double                           stdDevBw   = 0;
            for(size_t i = 0; i < numTimedIterations; i++)
            {
                times.insert(std::make_pair(r.perIterMsec[i], static_cast<int>(i + 1)));
                double const varTime = r.avgDurationMsec - r.perIterMsec[i];
                stdDevTime += varTime * varTime;
                double const iterBandwidthGbs =
                    (t.numBytes / 1.0E9) / r.perIterMsec[i] * 1000.0f;
                double const varBw = iterBandwidthGbs - r.avgBandwidthGbPerSec;
                stdDevBw += varBw * varBw;
            }
            stdDevTime = std::sqrt(stdDevTime / numTimedIterations);
            stdDevBw   = std::sqrt(stdDevBw / numTimedIterations);

            // Loop over iterations (fastest to slowest)
            for(auto const& time : times)
            {
                double const iterDurationMsec = time.first;
                double const iterBandwidthGbs =
                    (t.numBytes / 1.0E9) / iterDurationMsec * 1000.0f;
                printf(" Iter %03d %c %8.3f GB/s %c %8.3f ms %c", time.second, sep,
                       iterBandwidthGbs, sep, iterDurationMsec, sep);

                // CU details are printed only when recorded for this iteration
                std::set<int> usedXccs;
                if(static_cast<size_t>(time.second - 1) < r.perIterCUs.size())
                {
                    printf(" CUs:");
                    for(auto const& x : r.perIterCUs[time.second - 1])
                    {
                        printf(" %02d:%02d", x.first, x.second);
                        usedXccs.insert(x.first);
                    }
                }
                printf(" XCCs:");
                for(auto x : usedXccs)
                    printf(" %02d", x);
                printf("\n");
            }
            printf(" StandardDev %c %8.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw, sep,
                   stdDevTime, sep);
        }
    }
    printf(" Aggregate (CPU) %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f "
           "ms\n",
           sep, results.avgTotalBandwidthGbPerSec, sep, results.avgTotalDurationMsec, sep,
           results.totalBytesTransferred, sep, results.overheadMsec);
}
// AllToAll Preset Implementation.
//
// Builds one Transfer for every selected (source GPU, destination GPU) pair,
// optionally appends a ring of NIC Transfers (NUM_QUEUE_PAIRS > 0), runs them all as
// a single TransferBench test and prints the detailed results plus a SRC x DST
// bandwidth table.
//
// ev                  - benchmark settings; useSingleStream and gfxUnroll are
//                       overwritten here before the settings are used
// numBytesPerTransfer - number of bytes assigned to every Transfer
// presetName          - unused
//
// Exits with status 1 on invalid settings and when RunTransfers reports failure.
// Returns without running anything if no Transfer could be created.
void
AllToAllPreset(EnvVars& ev, size_t const numBytesPerTransfer,
               std::string const presetName)
{
    (void) presetName;  // May be unused
    enum
    {
        A2A_COPY       = 0,
        A2A_READ_ONLY  = 1,
        A2A_WRITE_ONLY = 2,
        A2A_CUSTOM     = 3,
    };
    char const a2aModeStr[4][20] = { "Copy", "Read-Only", "Write-Only", "Custom" };

    // Force single-stream mode for all-to-all benchmark
    ev.useSingleStream = 1;
    // Force to gfx unroll 2 unless explicitly set
    ev.gfxUnroll = EnvVars::GetEnvVar("GFX_UNROLL", 2);

    int const numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);

    // Collect env vars for this preset
    int const a2aDirect = EnvVars::GetEnvVar("A2A_DIRECT", 1);
    int const a2aLocal  = EnvVars::GetEnvVar("A2A_LOCAL", 0);
    int const numGpus =
        EnvVars::GetEnvVar("NUM_GPU_DEVICES", std::min(4, numDetectedGpus));
    int const numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0);
    int const numSubExecs   = EnvVars::GetEnvVar("NUM_SUB_EXEC", 1);
    int const useDmaExec    = EnvVars::GetEnvVar("USE_DMA_EXEC", 0);
    int const useFineGrain  = EnvVars::GetEnvVar("USE_FINE_GRAIN", 1);
    int const useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);

    // A2A_MODE may be 0,1,2 or else custom numSrcs:numDsts
    int numSrcs, numDsts;
    int a2aMode = 0;
    if(getenv("A2A_MODE") && sscanf(getenv("A2A_MODE"), "%d:%d", &numSrcs, &numDsts) == 2)
    {
        a2aMode = A2A_CUSTOM;
    }
    else
    {
        a2aMode = EnvVars::GetEnvVar("A2A_MODE", 0);
        if(a2aMode < 0 || a2aMode > 2)
        {
            printf("[ERROR] a2aMode must be between 0 and 2, or else numSrcs:numDsts\n");
            exit(1);
        }
        numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1);
        numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1);
    }

    // Print off environment variables
    ev.DisplayEnvVars();
    if(!ev.hideEnv)
    {
        if(!ev.outputToCsv) printf("[AllToAll Related]\n");

        // Descriptions that are selected or built at runtime are passed as "%s"
        // arguments rather than as the format string itself.
        ev.Print("A2A_DIRECT", a2aDirect, "%s",
                 a2aDirect ? "Only using direct links" : "Full all-to-all");
        ev.Print("A2A_LOCAL", a2aLocal, "%s local transfers",
                 a2aLocal ? "Include" : "Exclude");

        bool const        isCustom = (a2aMode == A2A_CUSTOM);
        std::string const modeValue =
            isCustom ? std::to_string(numSrcs) + ":" + std::to_string(numDsts)
                     : std::to_string(a2aMode);
        std::string const modeDesc = isCustom
                                         ? std::to_string(numSrcs) + " read(s) " +
                                               std::to_string(numDsts) + " write(s)"
                                         : std::string(a2aModeStr[a2aMode]);
        ev.Print("A2A_MODE", modeValue, "%s", modeDesc.c_str());

        ev.Print("NUM_GPU_DEVICES", numGpus, "Using %d GPUs", numGpus);
        ev.Print("NUM_QUEUE_PAIRS", numQueuePairs,
                 "Using %d queue pairs for NIC transfers", numQueuePairs);
        ev.Print("NUM_SUB_EXEC", numSubExecs, "Using %d subexecutors/CUs per Transfer",
                 numSubExecs);
        ev.Print("USE_DMA_EXEC", useDmaExec, "Using %s executor",
                 useDmaExec ? "DMA" : "GFX");
        ev.Print("USE_FINE_GRAIN", useFineGrain, "Using %s-grained memory",
                 useFineGrain ? "fine" : "coarse");
        ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor",
                 useRemoteRead ? "DST" : "SRC");
        printf("\n");
    }

    // Validate env vars
    if(numGpus < 0 || numGpus > numDetectedGpus)
    {
        printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus,
               numDetectedGpus);
        exit(1);
    }
    if(useDmaExec && (numSrcs != 1 || numDsts != 1))
    {
        printf("[ERROR] DMA execution can only be used for copies (A2A_MODE=0)\n");
        exit(1);
    }

    // Collect the number of GPU devices to use
    MemType const memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
    ExeType const exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX;

    // Maps a (src GPU, dst GPU) pair to the index of its Transfer in `transfers`
    std::map<std::pair<int, int>, int> reIndex;
    std::vector<Transfer>              transfers;
    for(int i = 0; i < numGpus; i++)
    {
        for(int j = 0; j < numGpus; j++)
        {
            // Check whether or not to execute this pair
            if(i == j)
            {
                if(!a2aLocal) continue;
            }
            else if(a2aDirect)
            {
#if !defined(__NVCC__)
                // Direct mode: keep only pairs that are exactly one hop apart
                uint32_t linkType, hopCount;
                HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
                if(hopCount != 1) continue;
#endif
            }

            // Build Transfer and add it to list
            TransferBench::Transfer transfer;
            transfer.numBytes = numBytesPerTransfer;
            for(int x = 0; x < numSrcs; x++)
                transfer.srcs.push_back({ memType, i });
            // When using multiple destinations, the additional destinations are "local"
            if(numDsts) transfer.dsts.push_back({ memType, j });
            for(int x = 1; x < numDsts; x++)
                transfer.dsts.push_back({ memType, i });
            transfer.exeDevice   = { exeType, (useRemoteRead ? j : i) };
            transfer.exeSubIndex = -1;
            transfer.numSubExecs = numSubExecs;

            reIndex[std::make_pair(i, j)] = static_cast<int>(transfers.size());
            transfers.push_back(transfer);
        }
    }

    // Create a ring using NICs: GPU i sends to GPU (i + 1) % numGpus
    std::vector<int> nicTransferIdx(numGpus);
    if(numQueuePairs > 0)
    {
        int numNics = TransferBench::GetNumExecutors(EXE_NIC);
        (void) numNics;  // May be unused
        for(int i = 0; i < numGpus; i++)
        {
            TransferBench::Transfer transfer;
            transfer.numBytes = numBytesPerTransfer;
            transfer.srcs.push_back({ memType, i });
            transfer.dsts.push_back({ memType, (i + 1) % numGpus });
            transfer.exeDevice   = { TransferBench::EXE_NIC_NEAREST, i };
            transfer.exeSubIndex = (i + 1) % numGpus;
            transfer.numSubExecs = numQueuePairs;
            nicTransferIdx[i]    = static_cast<int>(transfers.size());
            transfers.push_back(transfer);
        }
    }

    printf("GPU-GFX All-To-All benchmark:\n");
    printf("==========================\n");
    printf("- Copying %zu bytes between %s pairs of GPUs using %d CUs (%zu Transfers)\n",
           numBytesPerTransfer, a2aDirect ? "directly connected" : "all", numSubExecs,
           transfers.size());
    if(transfers.empty())
    {
        printf("Error: No valid transfers created. Check GPU count, a2aLocal=%d, "
               "a2aDirect=%d settings, and GPU topology/connectivity.\n",
               a2aLocal, a2aDirect);
        return;
    }

    // Execute Transfers
    TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
    TransferBench::TestResults   results;
    if(!TransferBench::RunTransfers(cfg, transfers, results))
    {
        for(auto const& err : results.errResults)
            printf("%s\n", err.errMsg.c_str());
        // A failed run must not be reported as success to the caller / test harness
        exit(1);
    }
    PrintResults(ev, 1, transfers, results);

    // Print results
    char const separator = (ev.outputToCsv ? ',' : ' ');
    printf("\nSummary: [%zu bytes per Transfer] [%s:%d] [%d Read(s) %d Write(s)]\n",
           numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs, numSrcs,
           numDsts);
    printf(
        "===========================================================================\n");
    printf("SRC\\DST ");
    for(int dst = 0; dst < numGpus; dst++)
        printf("%cGPU %02d ", separator, dst);
    if(numQueuePairs > 0) printf("%cNIC(%02d QP)", separator, numQueuePairs);
    printf(" %cSTotal %cActual\n", separator, separator);

    double totalBandwidthGpu  = 0.0;
    double minActualBandwidth = std::numeric_limits<double>::max();
    double maxActualBandwidth = 0.0;
    // One column total per destination GPU, then the NIC column, then the grand total
    std::vector<double> colTotalBandwidth(numGpus + 2, 0.0);
    for(int src = 0; src < numGpus; src++)
    {
        double rowTotalBandwidth = 0;
        int    transferCount     = 0;
        double minBandwidth      = std::numeric_limits<double>::max();
        printf("GPU %02d", src);
        for(int dst = 0; dst < numGpus; dst++)
        {
            auto const entry = reIndex.find(std::make_pair(src, dst));
            if(entry == reIndex.end())
            {
                // No Transfer was created for this pair
                printf("%c%8s ", separator, "N/A");
                continue;
            }
            TransferBench::TransferResult const& r = results.tfrResults[entry->second];
            colTotalBandwidth[dst] += r.avgBandwidthGbPerSec;
            rowTotalBandwidth += r.avgBandwidthGbPerSec;
            totalBandwidthGpu += r.avgBandwidthGbPerSec;
            minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
            transferCount++;
            printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
        }
        if(numQueuePairs > 0)
        {
            TransferBench::TransferResult const& r =
                results.tfrResults[nicTransferIdx[src]];
            colTotalBandwidth[numGpus] += r.avgBandwidthGbPerSec;
            rowTotalBandwidth += r.avgBandwidthGbPerSec;
            totalBandwidthGpu += r.avgBandwidthGbPerSec;
            minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
            transferCount++;
            printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
        }
        // "Actual" is the slowest Transfer of the row times the number of Transfers
        double const actualBandwidth = minBandwidth * transferCount;
        printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator,
               actualBandwidth);
        minActualBandwidth = std::min(minActualBandwidth, actualBandwidth);
        maxActualBandwidth = std::max(maxActualBandwidth, actualBandwidth);
        colTotalBandwidth[numGpus + 1] += rowTotalBandwidth;
    }

    printf("\nRTotal");
    for(int dst = 0; dst < numGpus; dst++)
    {
        printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
    }
    if(numQueuePairs > 0)
    {
        printf("%c%8.3f ", separator, colTotalBandwidth[numGpus]);
    }
    printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus + 1],
           separator, minActualBandwidth, separator, maxActualBandwidth);
    printf("\n");
    // transfers is non-empty here (checked above), so the division is safe
    printf("Average bandwidth (GPU Timed): %8.3f GB/s\n",
           totalBandwidthGpu / transfers.size());
    printf("Aggregate bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu);
    printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n",
           results.avgTotalBandwidthGbPerSec);
    PrintErrors(results.errResults);
}
// Prints the command-line help: a version banner, the usage line for `cmdName`,
// the description of the optional byte-count argument, and finally the list of
// supported environment variables (EnvVars::DisplayUsage).
void
DisplayUsage(char const* cmdName)
{
#if NIC_EXEC_ENABLED
    char const* nicSuffix = " (with NIC support)";
#else
    char const* nicSuffix = "";
#endif
    printf("Standalone AllToAll v%s%s\n", TransferBench::VERSION, nicSuffix);
    printf("========================================\n");
    printf("Usage: %s [N]\n", cmdName);
    printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
    printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 "
           "bytes\n",
           DEFAULT_BYTES_PER_TRANSFER);
    printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / "
           "gigabytes\n");
    printf("\n");
    EnvVars::DisplayUsage();
}
// Main function.
//
// Usage: <exe> [N]   where N is the number of bytes per Transfer, optionally followed
// by a K/M/G suffix (powers of 1024). "-h" / "--help" prints the usage text and exits.
// Runs the AllToAll preset with the resulting byte count and returns 0.
int
main(int argc, char** argv)
{
    // Display usage instructions if requested. This is handled before EnvVars is
    // constructed because its constructor queries HIP device 0 and terminates the
    // process if that call fails; help should not depend on it.
    if(argc > 1 && (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0))
    {
        DisplayUsage(argv[0]);
        exit(0);
    }

    // Collect environment variables
    EnvVars ev;

    // Determine number of bytes to run per Transfer
    size_t numBytesPerTransfer = DEFAULT_BYTES_PER_TRANSFER;
    if(argc > 1)
    {
        char const*     arg    = argv[1];
        size_t const    argLen = strlen(arg);
        long long const parsed = atoll(arg);
        // An empty argument has no last character to inspect, and a negative count
        // would wrap around to a huge unsigned value.
        if(argLen == 0 || parsed < 0)
        {
            printf("[ERROR] Invalid number of bytes per Transfer (%s)\n", arg);
            exit(1);
        }
        numBytesPerTransfer = static_cast<size_t>(parsed);

        // Adjust bytes if unit specified. The cases intentionally fall through so
        // that G multiplies by 1024 three times, M twice and K once.
        switch(arg[argLen - 1])
        {
            case 'G':
            case 'g': numBytesPerTransfer *= 1024;
            // fall through
            case 'M':
            case 'm': numBytesPerTransfer *= 1024;
            // fall through
            case 'K':
            case 'k': numBytesPerTransfer *= 1024; break;
            default: break;
        }
    }
    if(numBytesPerTransfer % 4)
    {
        printf("[ERROR] numBytesPerTransfer (%zu) must be a multiple of 4\n",
               numBytesPerTransfer);
        exit(1);
    }
    printf("Running AllToAll benchmark with %zu bytes per transfer\n\n",
           numBytesPerTransfer);

    // Run AllToAll preset
    AllToAllPreset(ev, numBytesPerTransfer, "AllToAll");
    return 0;
}