[Rocprofiler-systems] : Add XGMI and PCIe metrics to the profiling data (#1628)
* Add XGMI and PCIe metrics to the profiling data Add support for AMD XGMI (GPU-to-GPU interconnect) and PCIe metrics: * XGMI link width in bits * XGMI link speed in GT/s * Per-link read bandwidth (KB) * Per-link write bandwidth (KB) - Add new categories for PCIe metrics: * PCIe link width * PCIe link speed in GT/s * Accumulated bandwidth (MB) * Instantaneous bandwidth (MB/s) * Fix VCN/JPEG insert logic * Modify the gpu_metrics struct to accommodate XCP structure * Add ctest automation for gpu interconnect metrics * Refactor to move gpu_metrics struct and serialization to another file * Possible fix for timeout in CI Fix redundant skip check in ctest Add xgmi and pcie option in rocprof-sys-avail. * Change2: Address review comments Change ctest sampling to avoid timeout Change variable name and code structuring * Add option in ctest to run rocprof-sys-run without rewrite Run transferbench with rocprof-sys-run without sampling * Change3: Fix sample insert bug and address review comments xgmi and pci support check renaming variables additional hip_api validation in rocpd * Reduce the load from the transferBench sample The CI builds were timing out when flushing a big temporary file to the DB: (2720824.23 KB / 2720.82 MB / 2.72 GB)...
此提交包含在:
@@ -79,3 +79,4 @@ add_subdirectory(videodecode)
|
||||
add_subdirectory(jpegdecode)
|
||||
add_subdirectory(roctx)
|
||||
add_subdirectory(thread-limit)
|
||||
add_subdirectory(transferBench)
|
||||
|
||||
@@ -0,0 +1,810 @@
|
||||
/*
|
||||
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <algorithm>
#include <cmath>
#include <cstdarg>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <limits>
#include <map>
#include <random>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
|
||||
|
||||
// Include necessary headers
|
||||
#include "TransferBench.hpp"
|
||||
|
||||
using namespace TransferBench;
|
||||
|
||||
// Helper macro for catching HIP errors.
// Evaluates `cmd` exactly once. When the returned status is not hipSuccess, the
// error string, source line and source file are written to std::cerr and the
// process terminates via exit(-1).
#define HIP_CALL(cmd)                                                                    \
    do                                                                                   \
    {                                                                                    \
        hipError_t const hipCallStatus_ = (cmd);                                         \
        if(hipCallStatus_ == hipSuccess) break;                                          \
        std::cerr << "Encountered HIP error (" << hipGetErrorString(hipCallStatus_)      \
                  << ") at line " << __LINE__ << " in file " << __FILE__ << "\n";        \
        exit(-1);                                                                        \
    } while(0)
|
||||
|
||||
// Default configuration values.
// The default payload is 16KB (1 << 14 bytes), reduced for minimal data capture
// during profiling.
size_t const DEFAULT_BYTES_PER_TRANSFER = static_cast<size_t>(1) << 14;

// Three-letter display names, indexed by executor type value when printing results.
char const ExeTypeName[5][4] = {
    "CPU",
    "GPU",
    "DMA",
    "NIC",
    "NIC",
};
|
||||
|
||||
// Simplified EnvVars class for standalone use
|
||||
class EnvVars
|
||||
{
|
||||
public:
|
||||
// Environment variables (using minimal defaults for profiling)
|
||||
int numIterations = 1;
|
||||
int numSubIterations = 1;
|
||||
int numWarmups = 0;
|
||||
int showIterations = 0;
|
||||
int useInteractive = 0;
|
||||
int alwaysValidate = 0;
|
||||
int blockBytes = 256;
|
||||
int byteOffset = 0;
|
||||
std::vector<float> fillPattern;
|
||||
std::vector<int> fillCompress;
|
||||
int validateDirect = 0;
|
||||
int validateSource = 0;
|
||||
int useHsaDma = 0;
|
||||
int gfxBlockOrder = 0;
|
||||
int gfxBlockSize = 256;
|
||||
std::vector<uint32_t> cuMask;
|
||||
std::vector<std::vector<int>> prefXccTable;
|
||||
int gfxTemporal = 0;
|
||||
int gfxUnroll = 4;
|
||||
int useHipEvents = 1;
|
||||
int useSingleStream = 1;
|
||||
int gfxSingleTeam = 1;
|
||||
int gfxWaveOrder = 0;
|
||||
int gfxWordSize = 4;
|
||||
int hideEnv = 0;
|
||||
int minNumVarSubExec = 1;
|
||||
int maxNumVarSubExec = 0;
|
||||
int outputToCsv = 0;
|
||||
int samplingFactor = 1;
|
||||
int ibGidIndex = -1;
|
||||
int roceVersion = 2;
|
||||
int ipAddressFamily = 4;
|
||||
uint8_t ibPort = 1;
|
||||
int nicRelaxedOrder = 1;
|
||||
std::string closestNicStr = "";
|
||||
int gpuMaxHwQueues = 4;
|
||||
|
||||
// Constructor that collects values from environment
|
||||
EnvVars()
|
||||
{
|
||||
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
|
||||
(void) numDetectedGpus; // May be unused
|
||||
|
||||
// Get architecture-specific defaults
|
||||
hipDeviceProp_t prop;
|
||||
HIP_CALL(hipGetDeviceProperties(&prop, 0));
|
||||
std::string fullName = prop.gcnArchName;
|
||||
std::string archName = fullName.substr(0, fullName.find(':'));
|
||||
|
||||
int defaultGfxUnroll = 4;
|
||||
if(archName == "gfx906")
|
||||
defaultGfxUnroll = 8;
|
||||
else if(archName == "gfx90a")
|
||||
defaultGfxUnroll = 8;
|
||||
else if(archName == "gfx942")
|
||||
defaultGfxUnroll = 4;
|
||||
else if(archName == "gfx950")
|
||||
defaultGfxUnroll = 4;
|
||||
|
||||
// Read environment variables
|
||||
alwaysValidate = GetEnvVar("ALWAYS_VALIDATE", 0);
|
||||
blockBytes = GetEnvVar("BLOCK_BYTES", 256);
|
||||
byteOffset = GetEnvVar("BYTE_OFFSET", 0);
|
||||
gfxBlockOrder = GetEnvVar("GFX_BLOCK_ORDER", 0);
|
||||
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE", 256);
|
||||
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM", 1);
|
||||
gfxTemporal = GetEnvVar("GFX_TEMPORAL", 0);
|
||||
gfxUnroll = GetEnvVar("GFX_UNROLL", defaultGfxUnroll);
|
||||
gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER", 0);
|
||||
gfxWordSize = GetEnvVar("GFX_WORD_SIZE", 4);
|
||||
hideEnv = GetEnvVar("HIDE_ENV", 0);
|
||||
minNumVarSubExec = GetEnvVar("MIN_VAR_SUBEXEC", 1);
|
||||
maxNumVarSubExec = GetEnvVar("MAX_VAR_SUBEXEC", 0);
|
||||
numIterations = GetEnvVar("NUM_ITERATIONS", 1);
|
||||
numSubIterations = GetEnvVar("NUM_SUBITERATIONS", 1);
|
||||
numWarmups = GetEnvVar("NUM_WARMUPS", 0);
|
||||
outputToCsv = GetEnvVar("OUTPUT_TO_CSV", 0);
|
||||
samplingFactor = GetEnvVar("SAMPLING_FACTOR", 1);
|
||||
showIterations = GetEnvVar("SHOW_ITERATIONS", 0);
|
||||
useHipEvents = GetEnvVar("USE_HIP_EVENTS", 1);
|
||||
useHsaDma = GetEnvVar("USE_HSA_DMA", 0);
|
||||
useInteractive = GetEnvVar("USE_INTERACTIVE", 0);
|
||||
useSingleStream = GetEnvVar("USE_SINGLE_STREAM", 1);
|
||||
validateDirect = GetEnvVar("VALIDATE_DIRECT", 0);
|
||||
validateSource = GetEnvVar("VALIDATE_SOURCE", 0);
|
||||
ibGidIndex = GetEnvVar("IB_GID_INDEX", -1);
|
||||
ibPort = GetEnvVar("IB_PORT_NUMBER", 1);
|
||||
roceVersion = GetEnvVar("ROCE_VERSION", 2);
|
||||
ipAddressFamily = GetEnvVar("IP_ADDRESS_FAMILY", 4);
|
||||
nicRelaxedOrder = GetEnvVar("NIC_RELAX_ORDER", 1);
|
||||
closestNicStr = GetEnvVar("CLOSEST_NIC", "");
|
||||
gpuMaxHwQueues = GetEnvVar("GPU_MAX_HW_QUEUES", 4);
|
||||
}
|
||||
|
||||
// Helper function that gets environment variable or sets to default value
|
||||
static int GetEnvVar(std::string const& varname, int defaultValue)
|
||||
{
|
||||
if(getenv(varname.c_str())) return atoi(getenv(varname.c_str()));
|
||||
return defaultValue;
|
||||
}
|
||||
|
||||
static std::string GetEnvVar(std::string const& varname,
|
||||
std::string const& defaultValue)
|
||||
{
|
||||
if(getenv(varname.c_str())) return getenv(varname.c_str());
|
||||
return defaultValue;
|
||||
}
|
||||
|
||||
void Print(std::string const& name, int32_t const value, const char* format,
|
||||
...) const
|
||||
{
|
||||
printf("%-20s%s%12d%s", name.c_str(), outputToCsv ? "," : " = ", value,
|
||||
outputToCsv ? "," : " : ");
|
||||
va_list args;
|
||||
va_start(args, format);
|
||||
vprintf(format, args);
|
||||
va_end(args);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
void Print(std::string const& name, std::string const& value, const char* format,
|
||||
...) const
|
||||
{
|
||||
printf("%-20s%s%12s%s", name.c_str(), outputToCsv ? "," : " = ", value.c_str(),
|
||||
outputToCsv ? "," : " : ");
|
||||
va_list args;
|
||||
va_start(args, format);
|
||||
vprintf(format, args);
|
||||
va_end(args);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
// Display env var settings (simplified)
|
||||
void DisplayEnvVars() const
|
||||
{
|
||||
std::string nicSupport = "";
|
||||
#if NIC_EXEC_ENABLED
|
||||
nicSupport = " (with NIC support)";
|
||||
#endif
|
||||
if(!outputToCsv)
|
||||
{
|
||||
printf("Standalone AllToAll v%s%s\n", TransferBench::VERSION,
|
||||
nicSupport.c_str());
|
||||
printf("===============================================================\n");
|
||||
if(!hideEnv)
|
||||
printf("[Common] (Suppress by setting "
|
||||
"HIDE_ENV=1)\n");
|
||||
}
|
||||
else if(!hideEnv)
|
||||
printf("EnvVar,Value,Description,(Standalone AllToAll v%s)\n",
|
||||
TransferBench::VERSION);
|
||||
if(hideEnv) return;
|
||||
|
||||
Print("NUM_ITERATIONS", numIterations, "Running %d timed iteration(s)",
|
||||
numIterations);
|
||||
Print("NUM_WARMUPS", numWarmups, "Running %d warmup iteration(s) per Test",
|
||||
numWarmups);
|
||||
Print("USE_SINGLE_STREAM", useSingleStream, "Using single stream per GFX %s",
|
||||
useSingleStream ? "device" : "Transfer");
|
||||
Print("GFX_UNROLL", gfxUnroll, "Using GFX unroll factor of %d", gfxUnroll);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
// Display usage instructions
|
||||
static void DisplayUsage()
|
||||
{
|
||||
printf("Environment variables:\n");
|
||||
printf("======================\n");
|
||||
printf(" NUM_ITERATIONS - # of timed iterations per test (default=1)\n");
|
||||
printf(
|
||||
" NUM_WARMUPS - # of untimed warmup iterations per test (default=0)\n");
|
||||
printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor "
|
||||
"(default=1)\n");
|
||||
printf(" GFX_UNROLL - Unroll factor for GFX kernel (default=4)\n");
|
||||
printf(
|
||||
" HIDE_ENV - Hide environment variable value listing (default=0)\n");
|
||||
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set (default=0)\n");
|
||||
printf(" SHOW_ITERATIONS - Show per-iteration timing info (default=0)\n");
|
||||
printf("\n");
|
||||
printf("AllToAll specific variables:\n");
|
||||
printf(" A2A_DIRECT - Only using direct links (default=1)\n");
|
||||
printf(" A2A_LOCAL - Include local transfers (default=0)\n");
|
||||
printf(" A2A_MODE - Transfer mode: 0=Copy, 1=Read-Only, 2=Write-Only "
|
||||
"(default=0)\n");
|
||||
printf(" NUM_GPU_DEVICES - Number of GPUs to use (default=4 detected)\n");
|
||||
printf(
|
||||
" NUM_SUB_EXEC - Number of subexecutors/CUs per Transfer (default=1)\n");
|
||||
printf(" USE_DMA_EXEC - Use DMA executor instead of GFX (default=0)\n");
|
||||
printf(" USE_FINE_GRAIN - Use fine-grained memory (default=1)\n");
|
||||
printf(" USE_REMOTE_READ - Use DST as executor instead of SRC (default=0)\n");
|
||||
}
|
||||
|
||||
TransferBench::ConfigOptions ToConfigOptions()
|
||||
{
|
||||
TransferBench::ConfigOptions cfg;
|
||||
|
||||
cfg.general.numIterations = numIterations;
|
||||
cfg.general.numSubIterations = numSubIterations;
|
||||
cfg.general.numWarmups = numWarmups;
|
||||
cfg.general.recordPerIteration = showIterations;
|
||||
cfg.general.useInteractive = useInteractive;
|
||||
|
||||
cfg.data.alwaysValidate = alwaysValidate;
|
||||
cfg.data.blockBytes = blockBytes;
|
||||
cfg.data.byteOffset = byteOffset;
|
||||
cfg.data.fillCompress = fillCompress;
|
||||
cfg.data.fillPattern = fillPattern;
|
||||
cfg.data.validateDirect = validateDirect;
|
||||
cfg.data.validateSource = validateSource;
|
||||
|
||||
cfg.dma.useHipEvents = useHipEvents;
|
||||
cfg.dma.useHsaCopy = useHsaDma;
|
||||
|
||||
cfg.gfx.blockOrder = gfxBlockOrder;
|
||||
cfg.gfx.blockSize = gfxBlockSize;
|
||||
cfg.gfx.cuMask = cuMask;
|
||||
cfg.gfx.prefXccTable = prefXccTable;
|
||||
cfg.gfx.unrollFactor = gfxUnroll;
|
||||
cfg.gfx.temporalMode = gfxTemporal;
|
||||
cfg.gfx.useHipEvents = useHipEvents;
|
||||
cfg.gfx.useMultiStream = !useSingleStream;
|
||||
cfg.gfx.useSingleTeam = gfxSingleTeam;
|
||||
cfg.gfx.waveOrder = gfxWaveOrder;
|
||||
cfg.gfx.wordSize = gfxWordSize;
|
||||
|
||||
cfg.nic.ibGidIndex = ibGidIndex;
|
||||
cfg.nic.ibPort = ibPort;
|
||||
cfg.nic.ipAddressFamily = ipAddressFamily;
|
||||
cfg.nic.useRelaxedOrder = nicRelaxedOrder;
|
||||
cfg.nic.roceVersion = roceVersion;
|
||||
|
||||
std::vector<int> closestNics;
|
||||
if(closestNicStr != "")
|
||||
{
|
||||
std::stringstream ss(closestNicStr);
|
||||
std::string item;
|
||||
while(std::getline(ss, item, ','))
|
||||
{
|
||||
try
|
||||
{
|
||||
int nic = std::stoi(item);
|
||||
closestNics.push_back(nic);
|
||||
} catch(const std::invalid_argument& e)
|
||||
{
|
||||
printf("[ERROR] Invalid NIC index (%s) by user in %s\n", item.c_str(),
|
||||
closestNicStr.c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
cfg.nic.closestNics = closestNics;
|
||||
}
|
||||
return cfg;
|
||||
}
|
||||
};
|
||||
|
||||
// Forward declarations
|
||||
void
|
||||
PrintResults(EnvVars const& ev, int const testNum, std::vector<Transfer> const& transfers,
|
||||
TransferBench::TestResults const& results);
|
||||
void
|
||||
PrintErrors(std::vector<ErrResult> const& errors);
|
||||
void
|
||||
CheckForError(ErrResult const& error);
|
||||
std::string
|
||||
MemDevicesToStr(std::vector<MemDevice> const& memDevices);
|
||||
|
||||
// Helper function that converts MemDevices to a string
|
||||
std::string
|
||||
MemDevicesToStr(std::vector<MemDevice> const& memDevices)
|
||||
{
|
||||
if(memDevices.empty()) return "N";
|
||||
std::stringstream ss;
|
||||
for(auto const& m : memDevices)
|
||||
ss << TransferBench::MemTypeStr[m.memType] << m.memIndex;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
// Helper function to print warning / exit on fatal error
|
||||
void
|
||||
CheckForError(ErrResult const& error)
|
||||
{
|
||||
switch(error.errType)
|
||||
{
|
||||
case ERR_NONE: return;
|
||||
case ERR_WARN: printf("[WARN] %s\n", error.errMsg.c_str()); return;
|
||||
case ERR_FATAL: printf("[ERROR] %s\n", error.errMsg.c_str()); exit(1);
|
||||
default: break;
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to print list of errors
|
||||
void
|
||||
PrintErrors(std::vector<ErrResult> const& errors)
|
||||
{
|
||||
bool isFatal = false;
|
||||
for(auto const& err : errors)
|
||||
{
|
||||
printf("[%s] %s\n", err.errType == ERR_FATAL ? "ERROR" : "WARN",
|
||||
err.errMsg.c_str());
|
||||
isFatal |= (err.errType == ERR_FATAL);
|
||||
}
|
||||
if(isFatal) exit(1);
|
||||
}
|
||||
|
||||
// Print TransferBench test results
|
||||
void
|
||||
PrintResults(EnvVars const& ev, int const testNum, std::vector<Transfer> const& transfers,
|
||||
TransferBench::TestResults const& results)
|
||||
{
|
||||
char sep = ev.outputToCsv ? ',' : '|';
|
||||
size_t numTimedIterations = results.numTimedIterations;
|
||||
|
||||
if(!ev.outputToCsv) printf("Test %d:\n", testNum);
|
||||
|
||||
// Loop over each executor
|
||||
for(auto exeInfoPair : results.exeResults)
|
||||
{
|
||||
ExeDevice const& exeDevice = exeInfoPair.first;
|
||||
ExeResult const& exeResult = exeInfoPair.second;
|
||||
ExeType const exeType = exeDevice.exeType;
|
||||
int32_t const exeIndex = exeDevice.exeIndex;
|
||||
|
||||
printf(" Executor: %3s %02d %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f "
|
||||
"GB/s (sum)\n",
|
||||
ExeTypeName[exeType], exeIndex, sep, exeResult.avgBandwidthGbPerSec, sep,
|
||||
exeResult.avgDurationMsec, sep, exeResult.numBytes, sep,
|
||||
exeResult.sumBandwidthGbPerSec);
|
||||
|
||||
// Loop over each transfer
|
||||
for(int idx : exeResult.transferIdx)
|
||||
{
|
||||
Transfer const& t = transfers[idx];
|
||||
TransferResult const& r = results.tfrResults[idx];
|
||||
|
||||
char exeSubIndexStr[32] = "";
|
||||
if(t.exeSubIndex != -1) sprintf(exeSubIndexStr, ".%d", t.exeSubIndex);
|
||||
printf(" Transfer %02d %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c %s "
|
||||
"-> %c%03d%s:%03d -> %s\n",
|
||||
idx, sep, r.avgBandwidthGbPerSec, sep, r.avgDurationMsec, sep,
|
||||
r.numBytes, sep, MemDevicesToStr(t.srcs).c_str(),
|
||||
TransferBench::ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
|
||||
exeSubIndexStr, t.numSubExecs, MemDevicesToStr(t.dsts).c_str());
|
||||
|
||||
// Show per-iteration timing information
|
||||
if(ev.showIterations)
|
||||
{
|
||||
// Check that per-iteration information exists
|
||||
if(r.perIterMsec.size() != numTimedIterations)
|
||||
{
|
||||
printf("[ERROR] Per iteration timing data unavailable: Expected %lu "
|
||||
"data points, but have %lu\n",
|
||||
numTimedIterations, r.perIterMsec.size());
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Compute standard deviation and track iterations by speed
|
||||
std::set<std::pair<double, int>> times;
|
||||
double stdDevTime = 0;
|
||||
double stdDevBw = 0;
|
||||
for(size_t i = 0; i < numTimedIterations; i++)
|
||||
{
|
||||
times.insert(
|
||||
std::make_pair(r.perIterMsec[i], static_cast<int>(i + 1)));
|
||||
double const varTime = fabs(r.avgDurationMsec - r.perIterMsec[i]);
|
||||
stdDevTime += varTime * varTime;
|
||||
|
||||
double iterBandwidthGbs =
|
||||
(t.numBytes / 1.0E9) / r.perIterMsec[i] * 1000.0f;
|
||||
double const varBw = fabs(iterBandwidthGbs - r.avgBandwidthGbPerSec);
|
||||
stdDevBw += varBw * varBw;
|
||||
}
|
||||
stdDevTime = sqrt(stdDevTime / numTimedIterations);
|
||||
stdDevBw = sqrt(stdDevBw / numTimedIterations);
|
||||
|
||||
// Loop over iterations (fastest to slowest)
|
||||
for(auto& time : times)
|
||||
{
|
||||
double iterDurationMsec = time.first;
|
||||
double iterBandwidthGbs =
|
||||
(t.numBytes / 1.0E9) / iterDurationMsec * 1000.0f;
|
||||
printf(" Iter %03d %c %8.3f GB/s %c %8.3f ms %c", time.second,
|
||||
sep, iterBandwidthGbs, sep, iterDurationMsec, sep);
|
||||
|
||||
std::set<int> usedXccs;
|
||||
if(static_cast<size_t>(time.second - 1) < r.perIterCUs.size())
|
||||
{
|
||||
printf(" CUs:");
|
||||
for(auto x : r.perIterCUs[time.second - 1])
|
||||
{
|
||||
printf(" %02d:%02d", x.first, x.second);
|
||||
usedXccs.insert(x.first);
|
||||
}
|
||||
}
|
||||
|
||||
printf(" XCCs:");
|
||||
for(auto x : usedXccs)
|
||||
printf(" %02d", x);
|
||||
printf("\n");
|
||||
}
|
||||
printf(" StandardDev %c %8.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw,
|
||||
sep, stdDevTime, sep);
|
||||
}
|
||||
}
|
||||
}
|
||||
printf(" Aggregate (CPU) %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f "
|
||||
"ms\n",
|
||||
sep, results.avgTotalBandwidthGbPerSec, sep, results.avgTotalDurationMsec, sep,
|
||||
results.totalBytesTransferred, sep, results.overheadMsec);
|
||||
}
|
||||
|
||||
// AllToAll Preset Implementation
|
||||
void
|
||||
AllToAllPreset(EnvVars& ev, size_t const numBytesPerTransfer,
|
||||
std::string const presetName)
|
||||
{
|
||||
(void) presetName; // May be unused
|
||||
enum
|
||||
{
|
||||
A2A_COPY = 0,
|
||||
A2A_READ_ONLY = 1,
|
||||
A2A_WRITE_ONLY = 2,
|
||||
A2A_CUSTOM = 3,
|
||||
};
|
||||
char a2aModeStr[4][20] = { "Copy", "Read-Only", "Write-Only", "Custom" };
|
||||
|
||||
// Force single-stream mode for all-to-all benchmark
|
||||
ev.useSingleStream = 1;
|
||||
|
||||
// Force to gfx unroll 2 unless explicitly set
|
||||
ev.gfxUnroll = EnvVars::GetEnvVar("GFX_UNROLL", 2);
|
||||
|
||||
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
|
||||
|
||||
// Collect env vars for this preset
|
||||
int a2aDirect = EnvVars::GetEnvVar("A2A_DIRECT", 1);
|
||||
int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL", 0);
|
||||
int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", std::min(4, numDetectedGpus));
|
||||
int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0);
|
||||
int numSubExecs = EnvVars::GetEnvVar("NUM_SUB_EXEC", 1);
|
||||
int useDmaExec = EnvVars::GetEnvVar("USE_DMA_EXEC", 0);
|
||||
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN", 1);
|
||||
int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
|
||||
|
||||
// A2A_MODE may be 0,1,2 or else custom numSrcs:numDsts
|
||||
int numSrcs, numDsts;
|
||||
int a2aMode = 0;
|
||||
if(getenv("A2A_MODE") && sscanf(getenv("A2A_MODE"), "%d:%d", &numSrcs, &numDsts) == 2)
|
||||
{
|
||||
a2aMode = A2A_CUSTOM;
|
||||
}
|
||||
else
|
||||
{
|
||||
a2aMode = EnvVars::GetEnvVar("A2A_MODE", 0);
|
||||
if(a2aMode < 0 || a2aMode > 2)
|
||||
{
|
||||
printf("[ERROR] a2aMode must be between 0 and 2, or else numSrcs:numDsts\n");
|
||||
exit(1);
|
||||
}
|
||||
numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1);
|
||||
numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1);
|
||||
}
|
||||
|
||||
// Print off environment variables
|
||||
ev.DisplayEnvVars();
|
||||
if(!ev.hideEnv)
|
||||
{
|
||||
if(!ev.outputToCsv) printf("[AllToAll Related]\n");
|
||||
ev.Print("A2A_DIRECT", a2aDirect,
|
||||
a2aDirect ? "Only using direct links" : "Full all-to-all");
|
||||
ev.Print("A2A_LOCAL", a2aLocal, "%s local transfers",
|
||||
a2aLocal ? "Include" : "Exclude");
|
||||
ev.Print("A2A_MODE",
|
||||
(a2aMode == A2A_CUSTOM)
|
||||
? std::to_string(numSrcs) + ":" + std::to_string(numDsts)
|
||||
: std::to_string(a2aMode),
|
||||
(a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
|
||||
std::to_string(numDsts) + " write(s)")
|
||||
.c_str()
|
||||
: a2aModeStr[a2aMode]);
|
||||
ev.Print("NUM_GPU_DEVICES", numGpus, "Using %d GPUs", numGpus);
|
||||
ev.Print("NUM_QUEUE_PAIRS", numQueuePairs,
|
||||
"Using %d queue pairs for NIC transfers", numQueuePairs);
|
||||
ev.Print("NUM_SUB_EXEC", numSubExecs, "Using %d subexecutors/CUs per Transfer",
|
||||
numSubExecs);
|
||||
ev.Print("USE_DMA_EXEC", useDmaExec, "Using %s executor",
|
||||
useDmaExec ? "DMA" : "GFX");
|
||||
ev.Print("USE_FINE_GRAIN", useFineGrain, "Using %s-grained memory",
|
||||
useFineGrain ? "fine" : "coarse");
|
||||
ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor",
|
||||
useRemoteRead ? "DST" : "SRC");
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
// Validate env vars
|
||||
if(numGpus < 0 || numGpus > numDetectedGpus)
|
||||
{
|
||||
printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus,
|
||||
numDetectedGpus);
|
||||
exit(1);
|
||||
}
|
||||
if(useDmaExec && (numSrcs != 1 || numDsts != 1))
|
||||
{
|
||||
printf("[ERROR] DMA execution can only be used for copies (A2A_MODE=0)\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Collect the number of GPU devices to use
|
||||
MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
|
||||
ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX;
|
||||
|
||||
std::map<std::pair<int, int>, int> reIndex;
|
||||
std::vector<Transfer> transfers;
|
||||
for(int i = 0; i < numGpus; i++)
|
||||
{
|
||||
for(int j = 0; j < numGpus; j++)
|
||||
{
|
||||
// Check whether or not to execute this pair
|
||||
if(i == j)
|
||||
{
|
||||
if(!a2aLocal) continue;
|
||||
}
|
||||
else if(a2aDirect)
|
||||
{
|
||||
#if !defined(__NVCC__)
|
||||
uint32_t linkType, hopCount;
|
||||
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
|
||||
if(hopCount != 1) continue;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Build Transfer and add it to list
|
||||
TransferBench::Transfer transfer;
|
||||
transfer.numBytes = numBytesPerTransfer;
|
||||
for(int x = 0; x < numSrcs; x++)
|
||||
transfer.srcs.push_back({ memType, i });
|
||||
|
||||
// When using multiple destinations, the additional destinations are "local"
|
||||
if(numDsts) transfer.dsts.push_back({ memType, j });
|
||||
for(int x = 1; x < numDsts; x++)
|
||||
transfer.dsts.push_back({ memType, i });
|
||||
transfer.exeDevice = { exeType, (useRemoteRead ? j : i) };
|
||||
transfer.exeSubIndex = -1;
|
||||
transfer.numSubExecs = numSubExecs;
|
||||
|
||||
reIndex[std::make_pair(i, j)] = transfers.size();
|
||||
transfers.push_back(transfer);
|
||||
}
|
||||
}
|
||||
|
||||
// Create a ring using NICs
|
||||
std::vector<int> nicTransferIdx(numGpus);
|
||||
if(numQueuePairs > 0)
|
||||
{
|
||||
int numNics = TransferBench::GetNumExecutors(EXE_NIC);
|
||||
(void) numNics; // May be unused
|
||||
for(int i = 0; i < numGpus; i++)
|
||||
{
|
||||
TransferBench::Transfer transfer;
|
||||
transfer.numBytes = numBytesPerTransfer;
|
||||
transfer.srcs.push_back({ memType, i });
|
||||
transfer.dsts.push_back({ memType, (i + 1) % numGpus });
|
||||
transfer.exeDevice = { TransferBench::EXE_NIC_NEAREST, i };
|
||||
transfer.exeSubIndex = (i + 1) % numGpus;
|
||||
transfer.numSubExecs = numQueuePairs;
|
||||
nicTransferIdx[i] = transfers.size();
|
||||
transfers.push_back(transfer);
|
||||
}
|
||||
}
|
||||
|
||||
printf("GPU-GFX All-To-All benchmark:\n");
|
||||
printf("==========================\n");
|
||||
printf("- Copying %lu bytes between %s pairs of GPUs using %d CUs (%lu Transfers)\n",
|
||||
numBytesPerTransfer, a2aDirect ? "directly connected" : "all", numSubExecs,
|
||||
transfers.size());
|
||||
if(transfers.size() == 0)
|
||||
{
|
||||
printf("Error: No valid transfers created. Check GPU count, a2aLocal=%d, "
|
||||
"a2aDirect=%d settings, and GPU topology/connectivity.\n",
|
||||
a2aLocal, a2aDirect);
|
||||
return;
|
||||
}
|
||||
|
||||
// Execute Transfers
|
||||
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
|
||||
TransferBench::TestResults results;
|
||||
if(!TransferBench::RunTransfers(cfg, transfers, results))
|
||||
{
|
||||
for(auto const& err : results.errResults)
|
||||
printf("%s\n", err.errMsg.c_str());
|
||||
exit(0);
|
||||
}
|
||||
else
|
||||
{
|
||||
PrintResults(ev, 1, transfers, results);
|
||||
}
|
||||
|
||||
// Print results
|
||||
char separator = (ev.outputToCsv ? ',' : ' ');
|
||||
printf("\nSummary: [%lu bytes per Transfer] [%s:%d] [%d Read(s) %d Write(s)]\n",
|
||||
numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs, numSrcs,
|
||||
numDsts);
|
||||
printf(
|
||||
"===========================================================================\n");
|
||||
printf("SRC\\DST ");
|
||||
for(int dst = 0; dst < numGpus; dst++)
|
||||
printf("%cGPU %02d ", separator, dst);
|
||||
if(numQueuePairs > 0) printf("%cNIC(%02d QP)", separator, numQueuePairs);
|
||||
printf(" %cSTotal %cActual\n", separator, separator);
|
||||
|
||||
double totalBandwidthGpu = 0.0;
|
||||
double minActualBandwidth = std::numeric_limits<double>::max();
|
||||
double maxActualBandwidth = 0.0;
|
||||
std::vector<double> colTotalBandwidth(numGpus + 2, 0.0);
|
||||
for(int src = 0; src < numGpus; src++)
|
||||
{
|
||||
double rowTotalBandwidth = 0;
|
||||
int transferCount = 0;
|
||||
double minBandwidth = std::numeric_limits<double>::max();
|
||||
printf("GPU %02d", src);
|
||||
for(int dst = 0; dst < numGpus; dst++)
|
||||
{
|
||||
if(reIndex.count(std::make_pair(src, dst)))
|
||||
{
|
||||
int const transferIdx = reIndex[std::make_pair(src, dst)];
|
||||
TransferBench::TransferResult const& r = results.tfrResults[transferIdx];
|
||||
colTotalBandwidth[dst] += r.avgBandwidthGbPerSec;
|
||||
rowTotalBandwidth += r.avgBandwidthGbPerSec;
|
||||
totalBandwidthGpu += r.avgBandwidthGbPerSec;
|
||||
minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
|
||||
transferCount++;
|
||||
printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("%c%8s ", separator, "N/A");
|
||||
}
|
||||
}
|
||||
|
||||
if(numQueuePairs > 0)
|
||||
{
|
||||
TransferBench::TransferResult const& r =
|
||||
results.tfrResults[nicTransferIdx[src]];
|
||||
colTotalBandwidth[numGpus] += r.avgBandwidthGbPerSec;
|
||||
rowTotalBandwidth += r.avgBandwidthGbPerSec;
|
||||
totalBandwidthGpu += r.avgBandwidthGbPerSec;
|
||||
minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
|
||||
transferCount++;
|
||||
printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
|
||||
}
|
||||
double actualBandwidth = minBandwidth * transferCount;
|
||||
printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator,
|
||||
actualBandwidth);
|
||||
minActualBandwidth = std::min(minActualBandwidth, actualBandwidth);
|
||||
maxActualBandwidth = std::max(maxActualBandwidth, actualBandwidth);
|
||||
colTotalBandwidth[numGpus + 1] += rowTotalBandwidth;
|
||||
}
|
||||
printf("\nRTotal");
|
||||
for(int dst = 0; dst < numGpus; dst++)
|
||||
{
|
||||
printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
|
||||
}
|
||||
if(numQueuePairs > 0)
|
||||
{
|
||||
printf("%c%8.3f ", separator, colTotalBandwidth[numGpus]);
|
||||
}
|
||||
printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus + 1],
|
||||
separator, minActualBandwidth, separator, maxActualBandwidth);
|
||||
printf("\n");
|
||||
|
||||
printf("Average bandwidth (GPU Timed): %8.3f GB/s\n",
|
||||
totalBandwidthGpu / transfers.size());
|
||||
printf("Aggregate bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu);
|
||||
printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n",
|
||||
results.avgTotalBandwidthGbPerSec);
|
||||
|
||||
PrintErrors(results.errResults);
|
||||
}
|
||||
|
||||
// Display usage instructions
|
||||
void
|
||||
DisplayUsage(char const* cmdName)
|
||||
{
|
||||
std::string nicSupport = "";
|
||||
#if NIC_EXEC_ENABLED
|
||||
nicSupport = " (with NIC support)";
|
||||
#endif
|
||||
printf("Standalone AllToAll v%s%s\n", TransferBench::VERSION, nicSupport.c_str());
|
||||
printf("========================================\n");
|
||||
|
||||
printf("Usage: %s [N]\n", cmdName);
|
||||
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
|
||||
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 "
|
||||
"bytes\n",
|
||||
DEFAULT_BYTES_PER_TRANSFER);
|
||||
printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / "
|
||||
"gigabytes\n");
|
||||
printf("\n");
|
||||
|
||||
EnvVars::DisplayUsage();
|
||||
}
|
||||
|
||||
// Main function
|
||||
int
|
||||
main(int argc, char** argv)
|
||||
{
|
||||
// Collect environment variables
|
||||
EnvVars ev;
|
||||
|
||||
// Display usage instructions if requested
|
||||
if(argc > 1 && (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0))
|
||||
{
|
||||
DisplayUsage(argv[0]);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// Determine number of bytes to run per Transfer
|
||||
size_t numBytesPerTransfer = argc > 1 ? atoll(argv[1]) : DEFAULT_BYTES_PER_TRANSFER;
|
||||
if(argc > 1)
|
||||
{
|
||||
// Adjust bytes if unit specified
|
||||
char units = argv[1][strlen(argv[1]) - 1];
|
||||
switch(units)
|
||||
{
|
||||
case 'G':
|
||||
case 'g': numBytesPerTransfer *= 1024;
|
||||
case 'M':
|
||||
case 'm': numBytesPerTransfer *= 1024;
|
||||
case 'K':
|
||||
case 'k': numBytesPerTransfer *= 1024;
|
||||
}
|
||||
}
|
||||
if(numBytesPerTransfer % 4)
|
||||
{
|
||||
printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n",
|
||||
numBytesPerTransfer);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
printf("Running AllToAll benchmark with %lu bytes per transfer\n\n",
|
||||
numBytesPerTransfer);
|
||||
|
||||
// Run AllToAll preset
|
||||
AllToAllPreset(ev, numBytesPerTransfer, "AllToAll");
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,125 @@
|
||||
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)

project(rocprofiler-systems-transferBench-example LANGUAGES CXX)

# Allow this example to be switched off either by project name or by directory name.
if(ROCPROFSYS_DISABLE_EXAMPLES)
    get_filename_component(_DIR ${CMAKE_CURRENT_LIST_DIR} NAME)

    if(
        ${PROJECT_NAME} IN_LIST ROCPROFSYS_DISABLE_EXAMPLES
        OR ${_DIR} IN_LIST ROCPROFSYS_DISABLE_EXAMPLES
    )
        return()
    endif()
endif()

find_package(hip QUIET HINTS ${ROCmVersion_DIR} PATHS ${ROCmVersion_DIR})

# HINTS/PATHS name ROCm installation roots. find_program() only looks inside the
# listed directories themselves unless PATH_SUFFIXES is given, so "bin" is required
# for <root>/bin/hipcc to be found when hipcc is not already on PATH.
find_program(
    HIPCC_EXECUTABLE
    NAMES hipcc
    HINTS ${ROCmVersion_DIR} ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
    PATHS ${ROCmVersion_DIR} ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
    PATH_SUFFIXES bin
    NO_CACHE
)
mark_as_advanced(HIPCC_EXECUTABLE)

if(NOT HIPCC_EXECUTABLE)
    message(AUTHOR_WARNING "hipcc could not be found. Cannot build transferBench target")
    return()
endif()

# Record whether the configured C++ compiler already is hipcc.
if(NOT CMAKE_CXX_COMPILER_IS_HIPCC AND HIPCC_EXECUTABLE)
    if(
        CMAKE_CXX_COMPILER STREQUAL HIPCC_EXECUTABLE
        OR "${CMAKE_CXX_COMPILER}" MATCHES "hipcc"
    )
        set(CMAKE_CXX_COMPILER_IS_HIPCC 1 CACHE BOOL "HIP compiler")
    endif()
endif()

# NOTE(review): HIPCC_EXECUTABLE is always set at this point (see the early return
# above), so the second operand is always false and this branch cannot be taken.
if(
    (
        NOT CMAKE_CXX_COMPILER_IS_HIPCC
        OR (NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NOT hip_FOUND)
    )
    AND (NOT COMMAND rocprofiler_systems_custom_compilation AND NOT HIPCC_EXECUTABLE)
)
    message(AUTHOR_WARNING "transferBench target could not be built")
    return()
endif()

find_package(Threads REQUIRED)

# Find HSA runtime library
find_library(
    HSA_RUNTIME_LIBRARY
    NAMES hsa-runtime64
    HINTS ${ROCmVersion_DIR} ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
    PATHS ${ROCmVersion_DIR} ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
    PATH_SUFFIXES lib lib64
)

find_path(
    HSA_RUNTIME_INCLUDE_DIR
    NAMES hsa/hsa.h
    HINTS ${ROCmVersion_DIR} ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
    PATHS ${ROCmVersion_DIR} ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
    PATH_SUFFIXES include
)

if(NOT HSA_RUNTIME_LIBRARY OR NOT HSA_RUNTIME_INCLUDE_DIR)
    message(
        AUTHOR_WARNING
        "HSA runtime library not found. Cannot build transferBench target"
    )
    return()
endif()

add_executable(transferBench AllToAll.cpp)
target_link_libraries(transferBench PRIVATE Threads::Threads ${HSA_RUNTIME_LIBRARY})
target_include_directories(
    transferBench
    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${HSA_RUNTIME_INCLUDE_DIR}
)

if(
    CMAKE_CXX_COMPILER_ID MATCHES "Clang"
    AND NOT CMAKE_CXX_COMPILER_IS_HIPCC
    AND NOT HIPCC_EXECUTABLE
)
    target_link_libraries(
        transferBench
        PRIVATE
            $<TARGET_NAME_IF_EXISTS:rocprofiler-systems::rocprofiler-systems-compile-options>
            $<TARGET_NAME_IF_EXISTS:hip::host>
            $<TARGET_NAME_IF_EXISTS:hip::device>
    )
else()
    target_compile_options(transferBench PRIVATE -W -Wall)
endif()

# Keep minimal debug info in Release builds.
if("${CMAKE_BUILD_TYPE}" MATCHES "Release")
    target_compile_options(transferBench PRIVATE -g1)
endif()

# When the main compiler is not hipcc, build this target with hipcc instead.
if(NOT CMAKE_CXX_COMPILER_IS_HIPCC AND HIPCC_EXECUTABLE)
    # defined in MacroUtilities.cmake
    rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET transferBench)
endif()

if(ROCPROFSYS_INSTALL_EXAMPLES)
    install(TARGETS transferBench DESTINATION bin COMPONENT rocprofiler-systems-examples)
endif()
|
||||
檔案差異因為檔案過大而無法顯示
載入差異
@@ -39,6 +39,7 @@ set(core_sources
|
||||
${CMAKE_CURRENT_LIST_DIR}/dynamic_library.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/exception.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/gpu.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/gpu_metrics.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/mproc.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/node_info.cpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/perf.cpp
|
||||
@@ -66,6 +67,7 @@ set(core_headers
|
||||
${CMAKE_CURRENT_LIST_DIR}/dynamic_library.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/exception.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/gpu.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/gpu_metrics.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/locking.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/mpi.hpp
|
||||
${CMAKE_CURRENT_LIST_DIR}/mproc.hpp
|
||||
|
||||
@@ -70,11 +70,13 @@ config_settings(const std::shared_ptr<settings>& _config)
|
||||
// No distinction between busy and activity shown in description
|
||||
std::string jpeg_activity_support = "";
|
||||
std::string vcn_activity_support = "";
|
||||
std::string xgmi_support = "";
|
||||
std::string pcie_support = "";
|
||||
|
||||
size_t device_count = gpu::get_processor_count();
|
||||
for(size_t i = 0; i < device_count; i++)
|
||||
{
|
||||
if(gpu::is_vcn_activity_supported(i) || gpu::is_vcn_busy_supported(i))
|
||||
if(gpu::vcn_is_device_level_only(i) || gpu::is_vcn_busy_supported(i))
|
||||
{
|
||||
vcn_activity_support += ", vcn_activity";
|
||||
break;
|
||||
@@ -82,17 +84,33 @@ config_settings(const std::shared_ptr<settings>& _config)
|
||||
}
|
||||
for(size_t i = 0; i < device_count; i++)
|
||||
{
|
||||
if(gpu::is_jpeg_activity_supported(i) || gpu::is_jpeg_busy_supported(i))
|
||||
if(gpu::jpeg_is_device_level_only(i) || gpu::is_jpeg_busy_supported(i))
|
||||
{
|
||||
jpeg_activity_support += ", jpeg_activity";
|
||||
break;
|
||||
}
|
||||
}
|
||||
for(size_t i = 0; i < device_count; i++)
|
||||
{
|
||||
if(gpu::is_xgmi_supported(i))
|
||||
{
|
||||
xgmi_support += ", xgmi";
|
||||
break;
|
||||
}
|
||||
}
|
||||
for(size_t i = 0; i < device_count; i++)
|
||||
{
|
||||
if(gpu::is_pcie_supported(i))
|
||||
{
|
||||
pcie_support += ", pcie";
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(
|
||||
std::string, "ROCPROFSYS_AMD_SMI_METRICS",
|
||||
"amd-smi metrics to collect: " + default_metrics + jpeg_activity_support +
|
||||
vcn_activity_support + ". " +
|
||||
vcn_activity_support + xgmi_support + pcie_support + ". " +
|
||||
"An empty value implies 'all' and 'none' suppresses all.",
|
||||
"busy, temp, power, mem_usage", "backend", "amd_smi", "rocm", "process_sampling");
|
||||
}
|
||||
|
||||
@@ -115,6 +115,14 @@ ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_power, ROCPROFSYS_CATEGORY_AMD_SMI_
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_memory_usage, ROCPROFSYS_CATEGORY_AMD_SMI_MEMORY_USAGE, "device_memory_usage", "Memory usage of a GPU device")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_vcn_activity, ROCPROFSYS_CATEGORY_AMD_SMI_VCN_ACTIVITY, "device_vcn_activity", "VCN Activity of a GPU device")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_jpeg_activity, ROCPROFSYS_CATEGORY_AMD_SMI_JPEG_ACTIVITY, "device_jpeg_activity", "JPEG Activity of a GPU device")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_xgmi_link_width, ROCPROFSYS_CATEGORY_AMD_SMI_XGMI_LINK_WIDTH, "device_xgmi_link_width", "XGMI Link Width")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_xgmi_link_speed, ROCPROFSYS_CATEGORY_AMD_SMI_XGMI_LINK_SPEED, "device_xgmi_link_speed", "XGMI Link Speed")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_xgmi_read_data, ROCPROFSYS_CATEGORY_AMD_SMI_XGMI_READ_DATA, "device_xgmi_read_data", "XGMI Read Data Accumulator")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_xgmi_write_data, ROCPROFSYS_CATEGORY_AMD_SMI_XGMI_WRITE_DATA, "device_xgmi_write_data", "XGMI Write Data Accumulator")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_pcie_link_width, ROCPROFSYS_CATEGORY_AMD_SMI_PCIE_LINK_WIDTH, "device_pcie_link_width", "PCIe Link Width")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_pcie_link_speed, ROCPROFSYS_CATEGORY_AMD_SMI_PCIE_LINK_SPEED, "device_pcie_link_speed", "PCIe Link Speed")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_pcie_bandwidth_acc, ROCPROFSYS_CATEGORY_AMD_SMI_PCIE_BANDWIDTH_ACC, "device_pcie_bandwidth_acc", "PCIe Bandwidth Accumulated")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_pcie_bandwidth_inst, ROCPROFSYS_CATEGORY_AMD_SMI_PCIE_BANDWIDTH_INST, "device_pcie_bandwidth_inst", "PCIe Bandwidth Instantaneous")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rccl, ROCPROFSYS_CATEGORY_ROCM_RCCL, "rccl", "ROCm Communication Collectives Library (RCCL) regions")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, pthread, ROCPROFSYS_CATEGORY_PTHREAD, "pthread", "POSIX threading functions")
|
||||
ROCPROFSYS_DEFINE_CATEGORY(category, kokkos, ROCPROFSYS_CATEGORY_KOKKOS, "kokkos", "KokkosTools regions")
|
||||
@@ -187,6 +195,14 @@ using name = perfetto_category<Tp...>;
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_memory_usage), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_vcn_activity), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_jpeg_activity), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_xgmi_link_width), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_xgmi_link_speed), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_xgmi_read_data), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_xgmi_write_data), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_pcie_link_width), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_pcie_link_speed), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_pcie_bandwidth_acc), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_pcie_bandwidth_inst), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rccl), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::pthread), \
|
||||
ROCPROFSYS_PERFETTO_CATEGORY(category::kokkos), \
|
||||
|
||||
@@ -245,12 +245,14 @@ add_device_metadata()
|
||||
* Required amdsmi methods to get processors and handles
|
||||
*/
|
||||
|
||||
uint32_t processors::total_processor_count = 0;
|
||||
std::vector<amdsmi_processor_handle> processors::processors_list = {};
|
||||
std::vector<bool> processors::vcn_activity_supported = {};
|
||||
std::vector<bool> processors::jpeg_activity_supported = {};
|
||||
std::vector<bool> processors::vcn_busy_supported = {};
|
||||
std::vector<bool> processors::jpeg_busy_supported = {};
|
||||
uint32_t processors::total_processor_count = 0;
|
||||
std::vector<amdsmi_processor_handle> processors::processors_list = {};
|
||||
std::vector<bool> processors::vcn_device_level_only = {};
|
||||
std::vector<bool> processors::jpeg_device_level_only = {};
|
||||
std::vector<bool> processors::vcn_busy_supported = {};
|
||||
std::vector<bool> processors::jpeg_busy_supported = {};
|
||||
std::vector<bool> processors::xgmi_supported = {};
|
||||
std::vector<bool> processors::pcie_supported = {};
|
||||
|
||||
void
|
||||
get_processor_handles()
|
||||
@@ -299,49 +301,74 @@ get_processor_handles()
|
||||
amdsmi_gpu_metrics_t gpu_metrics;
|
||||
bool vcn_supported = false, jpeg_supported = false;
|
||||
bool v_busy_supported = false, j_busy_supported = false;
|
||||
bool xgmi_supported = false, pcie_supported = false;
|
||||
// AMD SMI will not report VCN_activity and JPEG_activity, if VCN_busy or
|
||||
// JPEG_busy fields are available.
|
||||
if(amdsmi_get_gpu_metrics_info(processor, &gpu_metrics) ==
|
||||
AMDSMI_STATUS_SUCCESS)
|
||||
{
|
||||
// Helper lambda to check if any value in the array is valid
|
||||
auto has_valid = [](const auto& arr) {
|
||||
// Helper lambda to check if any value in the array is valid (not
|
||||
// UINT16_MAX)
|
||||
auto has_valid_u16 = [](const auto& arr) {
|
||||
return std::any_of(std::begin(arr), std::end(arr),
|
||||
[](auto val) { return val != UINT16_MAX; });
|
||||
};
|
||||
vcn_supported = has_valid(gpu_metrics.vcn_activity);
|
||||
jpeg_supported = has_valid(gpu_metrics.jpeg_activity);
|
||||
|
||||
// Helper lambda to check if any value in the array is valid (not
|
||||
// UINT64_MAX)
|
||||
auto has_valid_u64 = [](const auto& arr) {
|
||||
return std::any_of(std::begin(arr), std::end(arr),
|
||||
[](auto val) { return val != UINT64_MAX; });
|
||||
};
|
||||
|
||||
vcn_supported = has_valid_u16(gpu_metrics.vcn_activity);
|
||||
jpeg_supported = has_valid_u16(gpu_metrics.jpeg_activity);
|
||||
|
||||
// Check if VCN and JPEG busy metrics are available
|
||||
for(const auto& xcp : gpu_metrics.xcp_stats)
|
||||
{
|
||||
if(!v_busy_supported && has_valid(xcp.vcn_busy))
|
||||
if(!v_busy_supported && has_valid_u16(xcp.vcn_busy))
|
||||
v_busy_supported = true;
|
||||
if(!j_busy_supported && has_valid(xcp.jpeg_busy))
|
||||
if(!j_busy_supported && has_valid_u16(xcp.jpeg_busy))
|
||||
j_busy_supported = true;
|
||||
if(v_busy_supported && j_busy_supported) break;
|
||||
}
|
||||
|
||||
// Check if XGMI metrics are supported (any value not at max)
|
||||
xgmi_supported = (gpu_metrics.xgmi_link_width != UINT16_MAX) ||
|
||||
(gpu_metrics.xgmi_link_speed != UINT16_MAX) ||
|
||||
has_valid_u64(gpu_metrics.xgmi_read_data_acc) ||
|
||||
has_valid_u64(gpu_metrics.xgmi_write_data_acc);
|
||||
|
||||
// Check if PCIe metrics are supported (any value not at max)
|
||||
pcie_supported = (gpu_metrics.pcie_link_width != UINT16_MAX) ||
|
||||
(gpu_metrics.pcie_link_speed != UINT16_MAX) ||
|
||||
(gpu_metrics.pcie_bandwidth_acc != UINT64_MAX) ||
|
||||
(gpu_metrics.pcie_bandwidth_inst != UINT64_MAX);
|
||||
}
|
||||
processors::vcn_activity_supported.push_back(vcn_supported);
|
||||
processors::jpeg_activity_supported.push_back(jpeg_supported);
|
||||
processors::vcn_device_level_only.push_back(vcn_supported);
|
||||
processors::jpeg_device_level_only.push_back(jpeg_supported);
|
||||
processors::vcn_busy_supported.push_back(v_busy_supported);
|
||||
processors::jpeg_busy_supported.push_back(j_busy_supported);
|
||||
processors::xgmi_supported.push_back(xgmi_supported);
|
||||
processors::pcie_supported.push_back(pcie_supported);
|
||||
}
|
||||
}
|
||||
processors::total_processor_count = processors::processors_list.size();
|
||||
}
|
||||
|
||||
bool
|
||||
is_vcn_activity_supported(uint32_t dev_id)
|
||||
vcn_is_device_level_only(uint32_t dev_id)
|
||||
{
|
||||
if(dev_id >= processors::vcn_activity_supported.size()) return false;
|
||||
return processors::vcn_activity_supported[dev_id];
|
||||
if(dev_id >= processors::vcn_device_level_only.size()) return false;
|
||||
return processors::vcn_device_level_only[dev_id];
|
||||
}
|
||||
|
||||
bool
|
||||
is_jpeg_activity_supported(uint32_t dev_id)
|
||||
jpeg_is_device_level_only(uint32_t dev_id)
|
||||
{
|
||||
if(dev_id >= processors::jpeg_activity_supported.size()) return false;
|
||||
return processors::jpeg_activity_supported[dev_id];
|
||||
if(dev_id >= processors::jpeg_device_level_only.size()) return false;
|
||||
return processors::jpeg_device_level_only[dev_id];
|
||||
}
|
||||
|
||||
bool
|
||||
@@ -358,6 +385,20 @@ is_jpeg_busy_supported(uint32_t dev_id)
|
||||
return processors::jpeg_busy_supported[dev_id];
|
||||
}
|
||||
|
||||
bool
|
||||
is_xgmi_supported(uint32_t dev_id)
|
||||
{
|
||||
if(dev_id >= processors::xgmi_supported.size()) return false;
|
||||
return processors::xgmi_supported[dev_id];
|
||||
}
|
||||
|
||||
bool
|
||||
is_pcie_supported(uint32_t dev_id)
|
||||
{
|
||||
if(dev_id >= processors::pcie_supported.size()) return false;
|
||||
return processors::pcie_supported[dev_id];
|
||||
}
|
||||
|
||||
uint32_t
|
||||
get_processor_count()
|
||||
{
|
||||
|
||||
@@ -41,10 +41,10 @@ amdsmi_processor_handle
|
||||
get_handle_from_id(uint32_t dev_id);
|
||||
|
||||
bool
|
||||
is_vcn_activity_supported(uint32_t dev_id);
|
||||
vcn_is_device_level_only(uint32_t dev_id);
|
||||
|
||||
bool
|
||||
is_jpeg_activity_supported(uint32_t dev_id);
|
||||
jpeg_is_device_level_only(uint32_t dev_id);
|
||||
|
||||
bool
|
||||
is_vcn_busy_supported(uint32_t dev_id);
|
||||
@@ -52,23 +52,33 @@ is_vcn_busy_supported(uint32_t dev_id);
|
||||
bool
|
||||
is_jpeg_busy_supported(uint32_t dev_id);
|
||||
|
||||
bool
|
||||
is_xgmi_supported(uint32_t dev_id);
|
||||
|
||||
bool
|
||||
is_pcie_supported(uint32_t dev_id);
|
||||
|
||||
struct processors
|
||||
{
|
||||
static uint32_t total_processor_count;
|
||||
static std::vector<amdsmi_processor_handle> processors_list;
|
||||
static std::vector<bool> vcn_activity_supported;
|
||||
static std::vector<bool> jpeg_activity_supported;
|
||||
static std::vector<bool> vcn_device_level_only;
|
||||
static std::vector<bool> jpeg_device_level_only;
|
||||
static std::vector<bool> vcn_busy_supported;
|
||||
static std::vector<bool> jpeg_busy_supported;
|
||||
static std::vector<bool> xgmi_supported;
|
||||
static std::vector<bool> pcie_supported;
|
||||
|
||||
private:
|
||||
friend void rocprofsys::gpu::get_processor_handles();
|
||||
friend uint32_t rocprofsys::gpu::get_processor_count();
|
||||
friend amdsmi_processor_handle rocprofsys::gpu::get_handle_from_id(uint32_t dev_id);
|
||||
friend bool rocprofsys::gpu::is_vcn_activity_supported(uint32_t dev_id);
|
||||
friend bool rocprofsys::gpu::is_jpeg_activity_supported(uint32_t dev_id);
|
||||
friend bool rocprofsys::gpu::vcn_is_device_level_only(uint32_t dev_id);
|
||||
friend bool rocprofsys::gpu::jpeg_is_device_level_only(uint32_t dev_id);
|
||||
friend bool rocprofsys::gpu::is_vcn_busy_supported(uint32_t dev_id);
|
||||
friend bool rocprofsys::gpu::is_jpeg_busy_supported(uint32_t dev_id);
|
||||
friend bool rocprofsys::gpu::is_xgmi_supported(uint32_t dev_id);
|
||||
friend bool rocprofsys::gpu::is_pcie_supported(uint32_t dev_id);
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
@@ -0,0 +1,332 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "gpu_metrics.hpp"
|
||||
|
||||
#include <stdexcept>
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace gpu
|
||||
{
|
||||
namespace
{
// ---------------------------------------------------------------------------
// Writers: append fixed-width unsigned integers to a byte buffer,
// least-significant byte first.
// ---------------------------------------------------------------------------

/// Appends a single byte.
void
serialize_uint8(std::vector<uint8_t>& data, uint8_t val)
{
    data.push_back(val);
}

/// Appends a 16-bit value as two bytes (low byte, then high byte).
void
serialize_uint16(std::vector<uint8_t>& data, uint16_t val)
{
    const auto low_byte  = static_cast<uint8_t>(val & 0xFF);
    const auto high_byte = static_cast<uint8_t>((val >> 8) & 0xFF);
    data.push_back(low_byte);
    data.push_back(high_byte);
}

/// Appends the first @p count elements of @p vec, each as a 16-bit value.
/// The caller must ensure @p vec holds at least @p count elements.
void
serialize_uint16_vector(std::vector<uint8_t>& data, const std::vector<uint16_t>& vec,
                        uint8_t count)
{
    for(size_t idx = 0; idx < count; ++idx)
        serialize_uint16(data, vec[idx]);
}

/// Appends a 64-bit value as eight bytes, lowest byte first.
void
serialize_uint64(std::vector<uint8_t>& data, uint64_t val)
{
    for(unsigned shift = 0; shift < 64; shift += 8)
        data.push_back(static_cast<uint8_t>((val >> shift) & 0xFF));
}

/// Appends the first @p count elements of @p vec, each as a 64-bit value.
/// The caller must ensure @p vec holds at least @p count elements.
void
serialize_uint64_vector(std::vector<uint8_t>& data, const std::vector<uint64_t>& vec,
                        uint8_t count)
{
    for(size_t idx = 0; idx < count; ++idx)
        serialize_uint64(data, vec[idx]);
}

// ---------------------------------------------------------------------------
// Readers: decode values written by the functions above, starting at `offset`
// and advancing it past the consumed bytes. Each scalar reader throws
// std::runtime_error (leaving `offset` untouched) when too few bytes remain.
// ---------------------------------------------------------------------------

/// Reads one byte.
uint8_t
deserialize_uint8(const std::vector<uint8_t>& data, size_t& offset)
{
    if(!(offset < data.size()))
        throw std::runtime_error("Invalid serialized data: unexpected end");
    const uint8_t byte = data[offset];
    ++offset;
    return byte;
}

/// Reads a 16-bit value stored low byte first.
uint16_t
deserialize_uint16(const std::vector<uint8_t>& data, size_t& offset)
{
    if(!(offset + 1 < data.size()))
        throw std::runtime_error("Invalid serialized data: unexpected end");
    const uint16_t low_part  = data[offset];
    const uint16_t high_part = data[offset + 1];
    offset += 2;
    return static_cast<uint16_t>(low_part | (high_part << 8));
}

/// Reads a 64-bit value stored lowest byte first.
uint64_t
deserialize_uint64(const std::vector<uint8_t>& data, size_t& offset)
{
    if(!(offset + 7 < data.size()))
        throw std::runtime_error("Invalid serialized data: unexpected end");
    uint64_t decoded = 0;
    for(unsigned byte_idx = 0; byte_idx < 8; ++byte_idx)
        decoded |= static_cast<uint64_t>(data[offset + byte_idx]) << (byte_idx * 8);
    offset += 8;
    return decoded;
}

/// Reads @p count consecutive 16-bit values.
std::vector<uint16_t>
deserialize_uint16_vector(const std::vector<uint8_t>& data, size_t& offset, uint8_t count)
{
    std::vector<uint16_t> decoded(count);
    for(auto& element : decoded)
        element = deserialize_uint16(data, offset);
    return decoded;
}

/// Reads @p count consecutive 64-bit values.
std::vector<uint64_t>
deserialize_uint64_vector(const std::vector<uint8_t>& data, size_t& offset, uint8_t count)
{
    std::vector<uint64_t> decoded(count);
    for(auto& element : decoded)
        element = deserialize_uint64(data, offset);
    return decoded;
}
}  // namespace
|
||||
|
||||
std::vector<uint8_t>
|
||||
serialize_gpu_metrics(const gpu_metrics_t& metrics,
|
||||
const gpu_metrics_capabilities_t& capabilities,
|
||||
const gpu_metrics_settings_t& settings)
|
||||
{
|
||||
// Flatten XCP data if needed and pre-calculate counts
|
||||
// Example:
|
||||
// XCP 0: [10, 20, 30] (3 values)
|
||||
// XCP 1: [15, 25] (2 values)
|
||||
// XCP 2: [5, 10, 15, 20] (4 values)
|
||||
// vcn_xcp_count: 3
|
||||
// vcn_xcp_sizes: [3, 2, 4]
|
||||
// vcn_data_flat: [10, 20, 30, 15, 25, 5, 10, 15, 20]
|
||||
std::vector<uint16_t> vcn_data_flat;
|
||||
std::vector<uint16_t> jpeg_data_flat;
|
||||
std::vector<uint8_t> vcn_xcp_sizes; // Size of each XCP's VCN data
|
||||
std::vector<uint8_t> jpeg_xcp_sizes; // Size of each XCP's JPEG data
|
||||
|
||||
if(capabilities.flags.vcn_is_device_level_only)
|
||||
{
|
||||
vcn_data_flat = metrics.vcn_activity;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Flatten per-XCP VCN data and record sizes
|
||||
for(const auto& xcp_data : metrics.vcn_busy)
|
||||
{
|
||||
vcn_xcp_sizes.push_back(static_cast<uint8_t>(xcp_data.size()));
|
||||
vcn_data_flat.insert(vcn_data_flat.end(), xcp_data.begin(), xcp_data.end());
|
||||
}
|
||||
}
|
||||
|
||||
if(capabilities.flags.jpeg_is_device_level_only)
|
||||
{
|
||||
jpeg_data_flat = metrics.jpeg_activity;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Flatten per-XCP JPEG data and record sizes
|
||||
for(const auto& xcp_data : metrics.jpeg_busy)
|
||||
{
|
||||
jpeg_xcp_sizes.push_back(static_cast<uint8_t>(xcp_data.size()));
|
||||
jpeg_data_flat.insert(jpeg_data_flat.end(), xcp_data.begin(), xcp_data.end());
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t vcn_count = static_cast<uint8_t>(vcn_data_flat.size());
|
||||
uint8_t jpeg_count = static_cast<uint8_t>(jpeg_data_flat.size());
|
||||
uint8_t vcn_xcp_count = static_cast<uint8_t>(vcn_xcp_sizes.size());
|
||||
uint8_t jpeg_xcp_count = static_cast<uint8_t>(jpeg_xcp_sizes.size());
|
||||
uint8_t xgmi_read_count = static_cast<uint8_t>(metrics.xgmi_read_data_acc.size());
|
||||
uint8_t xgmi_write_count = static_cast<uint8_t>(metrics.xgmi_write_data_acc.size());
|
||||
|
||||
std::vector<uint8_t> result;
|
||||
|
||||
// Serialize capability flags (1 byte)
|
||||
// These flags determine how the activity information is provided in the data
|
||||
// Current flags:
|
||||
// - bit 0 (0x01): vcn_is_device_level_only (device-level vs per-XCP)
|
||||
// - bit 1 (0x02): jpeg_is_device_level_only (device-level vs per-XCP)
|
||||
// - bits 2-7: Reserved for future use
|
||||
//
|
||||
serialize_uint8(result, capabilities.value);
|
||||
|
||||
// Serialize counts
|
||||
serialize_uint8(result, vcn_count);
|
||||
serialize_uint8(result, jpeg_count);
|
||||
serialize_uint8(result, vcn_xcp_count);
|
||||
serialize_uint8(result, jpeg_xcp_count);
|
||||
serialize_uint8(result, xgmi_read_count);
|
||||
serialize_uint8(result, xgmi_write_count);
|
||||
|
||||
// Serialize per-XCP sizes
|
||||
for(uint8_t size : vcn_xcp_sizes)
|
||||
serialize_uint8(result, size);
|
||||
|
||||
for(uint8_t size : jpeg_xcp_sizes)
|
||||
serialize_uint8(result, size);
|
||||
|
||||
// Serialize the flattened data
|
||||
if(settings.vcn_activity && vcn_count > 0)
|
||||
serialize_uint16_vector(result, vcn_data_flat, vcn_count);
|
||||
if(settings.jpeg_activity && jpeg_count > 0)
|
||||
serialize_uint16_vector(result, jpeg_data_flat, jpeg_count);
|
||||
if(settings.xgmi)
|
||||
{
|
||||
serialize_uint16(result, metrics.xgmi_link_width);
|
||||
serialize_uint16(result, metrics.xgmi_link_speed);
|
||||
serialize_uint64_vector(result, metrics.xgmi_read_data_acc, xgmi_read_count);
|
||||
serialize_uint64_vector(result, metrics.xgmi_write_data_acc, xgmi_write_count);
|
||||
}
|
||||
if(settings.pcie)
|
||||
{
|
||||
serialize_uint16(result, metrics.pcie_link_width);
|
||||
serialize_uint16(result, metrics.pcie_link_speed);
|
||||
serialize_uint64(result, metrics.pcie_bandwidth_acc);
|
||||
serialize_uint64(result, metrics.pcie_bandwidth_inst);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void
|
||||
deserialize_gpu_metrics(const std::vector<uint8_t>& serialized_data,
|
||||
gpu_metrics_t& result, bool is_vcn_enabled, bool is_jpeg_enabled,
|
||||
bool is_xgmi_enabled, bool is_pcie_enabled,
|
||||
gpu_metrics_capabilities_t& capabilities)
|
||||
{
|
||||
if(serialized_data.empty())
|
||||
{
|
||||
throw std::runtime_error("Invalid serialized data: insufficient header size");
|
||||
}
|
||||
size_t offset = 0;
|
||||
|
||||
// Deserialize capability flags (1 byte)
|
||||
// Extract capability flags from packed byte.
|
||||
// See serialize_gpu_metrics() for flag definitions.
|
||||
capabilities.value = deserialize_uint8(serialized_data, offset);
|
||||
|
||||
// Deserialize counts
|
||||
uint8_t vcn_count = deserialize_uint8(serialized_data, offset);
|
||||
uint8_t jpeg_count = deserialize_uint8(serialized_data, offset);
|
||||
uint8_t vcn_xcp_count = deserialize_uint8(serialized_data, offset);
|
||||
uint8_t jpeg_xcp_count = deserialize_uint8(serialized_data, offset);
|
||||
uint8_t xgmi_read_count = deserialize_uint8(serialized_data, offset);
|
||||
uint8_t xgmi_write_count = deserialize_uint8(serialized_data, offset);
|
||||
|
||||
// Deserialize per-XCP sizes
|
||||
std::vector<uint8_t> vcn_xcp_sizes;
|
||||
std::vector<uint8_t> jpeg_xcp_sizes;
|
||||
for(uint8_t i = 0; i < vcn_xcp_count; ++i)
|
||||
vcn_xcp_sizes.push_back(deserialize_uint8(serialized_data, offset));
|
||||
for(uint8_t i = 0; i < jpeg_xcp_count; ++i)
|
||||
jpeg_xcp_sizes.push_back(deserialize_uint8(serialized_data, offset));
|
||||
|
||||
// Deserialize VCN data and reconstruct structure
|
||||
if(is_vcn_enabled && vcn_count > 0)
|
||||
{
|
||||
auto flat_data = deserialize_uint16_vector(serialized_data, offset, vcn_count);
|
||||
if(capabilities.flags.vcn_is_device_level_only)
|
||||
{
|
||||
result.vcn_activity = flat_data;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Per-XCP: split flat data according to XCP sizes into vcn_busy
|
||||
size_t flat_offset = 0;
|
||||
for(uint8_t xcp_size : vcn_xcp_sizes)
|
||||
{
|
||||
std::vector<uint16_t> xcp_data(flat_data.begin() + flat_offset,
|
||||
flat_data.begin() + flat_offset +
|
||||
xcp_size);
|
||||
result.vcn_busy.push_back(xcp_data);
|
||||
flat_offset += xcp_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Deserialize JPEG data and reconstruct structure
|
||||
if(is_jpeg_enabled && jpeg_count > 0)
|
||||
{
|
||||
auto flat_data = deserialize_uint16_vector(serialized_data, offset, jpeg_count);
|
||||
if(capabilities.flags.jpeg_is_device_level_only)
|
||||
{
|
||||
result.jpeg_activity = flat_data;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Per-XCP: split flat data according to XCP sizes into jpeg_busy
|
||||
size_t flat_offset = 0;
|
||||
for(uint8_t xcp_size : jpeg_xcp_sizes)
|
||||
{
|
||||
std::vector<uint16_t> xcp_data(flat_data.begin() + flat_offset,
|
||||
flat_data.begin() + flat_offset +
|
||||
xcp_size);
|
||||
result.jpeg_busy.push_back(xcp_data);
|
||||
flat_offset += xcp_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Deserialize XGMI data
|
||||
if(is_xgmi_enabled)
|
||||
{
|
||||
result.xgmi_link_width = deserialize_uint16(serialized_data, offset);
|
||||
result.xgmi_link_speed = deserialize_uint16(serialized_data, offset);
|
||||
result.xgmi_read_data_acc =
|
||||
deserialize_uint64_vector(serialized_data, offset, xgmi_read_count);
|
||||
result.xgmi_write_data_acc =
|
||||
deserialize_uint64_vector(serialized_data, offset, xgmi_write_count);
|
||||
}
|
||||
|
||||
// Deserialize PCIe data
|
||||
if(is_pcie_enabled)
|
||||
{
|
||||
result.pcie_link_width = deserialize_uint16(serialized_data, offset);
|
||||
result.pcie_link_speed = deserialize_uint16(serialized_data, offset);
|
||||
result.pcie_bandwidth_acc = deserialize_uint64(serialized_data, offset);
|
||||
result.pcie_bandwidth_inst = deserialize_uint64(serialized_data, offset);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace gpu
|
||||
} // namespace rocprofsys
|
||||
@@ -0,0 +1,144 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
namespace gpu
|
||||
{
|
||||
/// Container for one sample of the VCN, JPEG, XGMI and PCIe metrics of a GPU.
/// Scalar fields start at zero; all vectors start empty.
struct gpu_metrics_t
{
    // ---- VCN ----
    /// Device-level VCN values (when supported)
    std::vector<uint16_t> vcn_activity;
    /// XCP-level VCN values: one inner vector per XCP
    std::vector<std::vector<uint16_t>> vcn_busy;

    // ---- JPEG ----
    /// Device-level JPEG values (when supported)
    std::vector<uint16_t> jpeg_activity;
    /// XCP-level JPEG values: one inner vector per XCP
    std::vector<std::vector<uint16_t>> jpeg_busy;

    // ---- XGMI ----
    uint16_t              xgmi_link_width{ 0 };
    uint16_t              xgmi_link_speed{ 0 };
    std::vector<uint64_t> xgmi_read_data_acc;
    std::vector<uint64_t> xgmi_write_data_acc;

    // ---- PCIe ----
    uint16_t pcie_link_width{ 0 };
    uint16_t pcie_link_speed{ 0 };
    uint64_t pcie_bandwidth_acc{ 0 };
    uint64_t pcie_bandwidth_inst{ 0 };
};
|
||||
|
||||
/// Settings structure for controlling which metrics are serialized.
///
/// Each flag enables one group of data in serialize_gpu_metrics(); every group is
/// enabled by default.
struct gpu_metrics_settings_t
{
    bool vcn_activity  = true;  // include VCN data
    bool jpeg_activity = true;  // include JPEG data
    bool xgmi          = true;  // include XGMI link width/speed and read/write data
    bool pcie          = true;  // include PCIe link width/speed and bandwidth
};
|
||||
|
||||
/// GPU metrics capabilities structure with bitfield flags.
///
/// The flags share storage with the single byte `value`, which is what gets written
/// to / read from the serialized stream.
/// NOTE(review): the mapping of the bit-fields onto bits 0 and 1 of `value` depends
/// on the compiler's bit-field allocation order, and reading `flags` after writing
/// `value` (or vice versa) relies on union type punning as supported by GCC/Clang.
struct gpu_metrics_capabilities_t
{
    /// Flag layout. Declared as a named nested type because declaring a type inside
    /// an anonymous union is only accepted as a compiler extension.
    struct flags_t
    {
        uint8_t vcn_is_device_level_only : 1;   ///< VCN is device-level (vs per-XCP)
        uint8_t jpeg_is_device_level_only : 1;  ///< JPEG is device-level (vs per-XCP)
        uint8_t reserved : 6;                   ///< Reserved for future use
    };

    union
    {
        flags_t flags;  ///< Named access to the individual capability bits
        uint8_t value;  ///< Raw byte value for easy serialization
    };

    /// Default constructor - initializes all flags to zero
    gpu_metrics_capabilities_t() noexcept
    : value(0)
    {}
};

// The serialized format stores the capabilities in exactly one byte.
static_assert(sizeof(gpu_metrics_capabilities_t) == sizeof(uint8_t),
              "gpu_metrics_capabilities_t must occupy exactly one byte");
|
||||
|
||||
/**
|
||||
* @brief Serializes GPU metrics into a compact binary format
|
||||
*
|
||||
* Serialization format:
|
||||
* 1. Support flags byte (1 byte):
|
||||
* - bit 0: vcn_is_device_level_only (device-level vs per-XCP)
|
||||
* - bit 1: jpeg_is_device_level_only (device-level vs per-XCP)
|
||||
* - bits 2-7: reserved
|
||||
* 2. Data element counts (6 bytes):
|
||||
* - vcn_count (1 byte): total VCN values (flattened across all XCPs)
|
||||
* - jpeg_count (1 byte): total JPEG values (flattened across all XCPs)
|
||||
* - vcn_xcp_count (1 byte): number of XCPs with VCN data
|
||||
* - jpeg_xcp_count (1 byte): number of XCPs with JPEG data
|
||||
* - xgmi_read_count (1 byte): number of XGMI read data values
|
||||
* - xgmi_write_count (1 byte): number of XGMI write data values
|
||||
* 3. Per-XCP size arrays (variable):
|
||||
* - vcn_xcp_sizes[0..vcn_xcp_count-1]: size of each XCP's VCN data (1 byte each)
|
||||
* - jpeg_xcp_sizes[0..jpeg_xcp_count-1]: size of each XCP's JPEG data (1 byte each)
|
||||
* 4. Flattened data arrays (conditionally serialized based on settings):
|
||||
* - VCN data (if vcn_activity setting enabled): flattened uint16 values
|
||||
* - JPEG data (if jpeg_activity setting enabled): flattened uint16 values
|
||||
* - XGMI data (if xgmi setting enabled):
|
||||
* link_width (uint16), link_speed (uint16)
|
||||
* xgmi_read_data array (uint64[xgmi_read_count])
|
||||
* xgmi_write_data array (uint64[xgmi_write_count])
|
||||
* - PCIe data (if pcie setting enabled):
|
||||
* link_width (uint16), link_speed (uint16)
|
||||
* bandwidth_acc (uint64), bandwidth_inst (uint64)
|
||||
*
|
||||
* @param metrics GPU metrics to serialize
|
||||
* @param capabilities Capability flags (vcn/jpeg device-level status)
|
||||
* @param settings Controls which metrics to include in serialization
|
||||
* @return Binary serialized data
|
||||
*/
|
||||
std::vector<uint8_t>
|
||||
serialize_gpu_metrics(const gpu_metrics_t& metrics,
|
||||
const gpu_metrics_capabilities_t& capabilities,
|
||||
const gpu_metrics_settings_t& settings);
|
||||
|
||||
/**
|
||||
* @brief Deserializes GPU metrics from binary format
|
||||
*
|
||||
* @param serialized_data Binary data to deserialize
|
||||
* @param result Output GPU metrics structure
|
||||
* @param is_vcn_enabled Whether to deserialize VCN data
|
||||
* @param is_jpeg_enabled Whether to deserialize JPEG data
|
||||
* @param is_xgmi_enabled Whether to deserialize XGMI data
|
||||
* @param is_pcie_enabled Whether to deserialize PCIe data
|
||||
* @param capabilities Output: capability flags (vcn/jpeg device-level status)
|
||||
* @throws std::runtime_error if serialized data is invalid
|
||||
*/
|
||||
void
|
||||
deserialize_gpu_metrics(const std::vector<uint8_t>& serialized_data,
|
||||
gpu_metrics_t& result, bool is_vcn_enabled, bool is_jpeg_enabled,
|
||||
bool is_xgmi_enabled, bool is_pcie_enabled,
|
||||
gpu_metrics_capabilities_t& capabilities);
|
||||
|
||||
} // namespace gpu
|
||||
} // namespace rocprofsys
|
||||
@@ -24,6 +24,7 @@
|
||||
#include "agent_manager.hpp"
|
||||
#include "config.hpp"
|
||||
#include "debug.hpp"
|
||||
#include "gpu_metrics.hpp"
|
||||
#include "library/thread_info.hpp"
|
||||
#include "node_info.hpp"
|
||||
#include "rocpd/data_processor.hpp"
|
||||
@@ -50,7 +51,6 @@ namespace trace_cache
|
||||
{
|
||||
namespace
|
||||
{
|
||||
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
auto
|
||||
get_handle_from_code_object(
|
||||
@@ -405,73 +405,8 @@ rocpd_post_processing::get_pmc_event_with_sample_callback() const
|
||||
postprocessing_callback
|
||||
rocpd_post_processing::get_amd_smi_sample_callback() const
|
||||
{
|
||||
struct xcp_metrics_t
|
||||
{
|
||||
std::vector<uint16_t> vcn_busy;
|
||||
std::vector<uint16_t> jpeg_busy;
|
||||
};
|
||||
|
||||
auto deserialize_xcp_metrics = [](const std::vector<uint8_t>& serialized_data,
|
||||
bool& _is_vcn_supported, bool& _is_jpeg_supported,
|
||||
std::vector<xcp_metrics_t>& result) {
|
||||
if(serialized_data.size() < 5)
|
||||
{
|
||||
throw std::runtime_error("Invalid serialized data: insufficient header size");
|
||||
}
|
||||
|
||||
size_t offset = 0;
|
||||
|
||||
// Read header
|
||||
_is_vcn_supported = static_cast<bool>(serialized_data[offset++]);
|
||||
_is_jpeg_supported = static_cast<bool>(serialized_data[offset++]);
|
||||
uint8_t chunk_count = serialized_data[offset++];
|
||||
uint8_t vcn_count = serialized_data[offset++];
|
||||
uint8_t jpeg_count = serialized_data[offset++];
|
||||
|
||||
constexpr size_t elem_size = sizeof(uint16_t) / sizeof(uint8_t);
|
||||
const size_t chunk_size = (vcn_count + jpeg_count) * elem_size;
|
||||
|
||||
// Validate total size
|
||||
const size_t expected_size = 5 + (chunk_count * chunk_size);
|
||||
if(serialized_data.size() != expected_size)
|
||||
{
|
||||
throw std::runtime_error("Invalid serialized data: size mismatch");
|
||||
}
|
||||
|
||||
auto deserialize_uint16_array = [](const std::vector<uint8_t>& data,
|
||||
size_t& _offset, int array_size) {
|
||||
std::vector<uint16_t> _result;
|
||||
_result.reserve(array_size);
|
||||
|
||||
for(int i = 0; i < array_size; ++i)
|
||||
{
|
||||
if(_offset + 1 >= data.size())
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"Invalid serialized data: unexpected end of data");
|
||||
}
|
||||
|
||||
uint16_t value = static_cast<uint16_t>(data[_offset]) |
|
||||
(static_cast<uint16_t>(data[_offset + 1]) << 8);
|
||||
_result.push_back(value);
|
||||
_offset += 2;
|
||||
}
|
||||
|
||||
return _result;
|
||||
};
|
||||
|
||||
result.reserve(chunk_count);
|
||||
|
||||
for(size_t count = 0; count < chunk_count; ++count)
|
||||
{
|
||||
xcp_metrics_t entry;
|
||||
entry.vcn_busy = deserialize_uint16_array(serialized_data, offset, vcn_count);
|
||||
entry.jpeg_busy =
|
||||
deserialize_uint16_array(serialized_data, offset, jpeg_count);
|
||||
|
||||
result.emplace_back(std::move(entry));
|
||||
}
|
||||
};
|
||||
// Use the shared gpu_metrics_t from core/gpu_metrics.hpp
|
||||
using gpu_metrics_t = gpu::gpu_metrics_t;
|
||||
|
||||
return [&](const storage_parsed_type_base& parsed) {
|
||||
auto _amd_smi = static_cast<const struct amd_smi_sample&>(parsed);
|
||||
@@ -502,6 +437,8 @@ rocpd_post_processing::get_amd_smi_sample_callback() const
|
||||
|
||||
bool is_vcn_enabled = settings_bits.test(static_cast<int>(pos::vcn_activity));
|
||||
bool is_jpeg_enabled = settings_bits.test(static_cast<int>(pos::jpeg_activity));
|
||||
bool is_xgmi_enabled = settings_bits.test(static_cast<int>(pos::xgmi));
|
||||
bool is_pcie_enabled = settings_bits.test(static_cast<int>(pos::pcie));
|
||||
|
||||
insert_event_and_sample(
|
||||
is_busy_enabled, trait::name<category::amd_smi_gfx_busy>::value,
|
||||
@@ -536,55 +473,145 @@ rocpd_post_processing::get_amd_smi_sample_callback() const
|
||||
.c_str(),
|
||||
_amd_smi.mem_usage);
|
||||
|
||||
if(!is_vcn_enabled && !is_jpeg_enabled)
|
||||
{
|
||||
if(!is_vcn_enabled && !is_jpeg_enabled && !is_xgmi_enabled && !is_pcie_enabled)
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<xcp_metrics_t> xcp_metrics;
|
||||
bool is_vcn_activity_supported;
|
||||
bool is_jpeg_activity_supported;
|
||||
deserialize_xcp_metrics(_amd_smi.xcp_activity, is_vcn_activity_supported,
|
||||
is_jpeg_activity_supported, xcp_metrics);
|
||||
gpu_metrics_t gpu_metrics;
|
||||
gpu::gpu_metrics_capabilities_t capabilities;
|
||||
gpu::deserialize_gpu_metrics(_amd_smi.gpu_activity, gpu_metrics, is_vcn_enabled,
|
||||
is_jpeg_enabled, is_xgmi_enabled, is_pcie_enabled,
|
||||
capabilities);
|
||||
|
||||
auto insert_xcp_metrics = [&](auto category, bool _is_enabled,
|
||||
const std::vector<uint16_t>& data,
|
||||
std::optional<size_t> _idx = std::nullopt) {
|
||||
if(!_is_enabled)
|
||||
{
|
||||
return;
|
||||
}
|
||||
// Insert VCN and JPEG activity metrics
|
||||
auto insert_decode_vector_metrics = [&](auto category, bool _is_enabled,
|
||||
const std::vector<uint16_t>& data,
|
||||
std::optional<size_t> _idx =
|
||||
std::nullopt) {
|
||||
if(!_is_enabled) return;
|
||||
|
||||
using Category = std::decay_t<decltype(category)>;
|
||||
|
||||
for(size_t clk = 0; clk < data.size(); ++clk)
|
||||
for(size_t i = 0; i < data.size(); ++i)
|
||||
{
|
||||
const auto value = data[clk];
|
||||
if(value == std::numeric_limits<uint16_t>::max())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
const auto value = data[i];
|
||||
if(value == std::numeric_limits<uint16_t>::max()) continue;
|
||||
|
||||
auto pmc_name = info::annotate_category<Category>(_idx, clk);
|
||||
auto track_name = info::annotate_with_device_id<Category>(
|
||||
_amd_smi.device_id, _idx, clk);
|
||||
auto pmc_name = info::annotate_category<Category>(_idx, i);
|
||||
auto track_name =
|
||||
info::annotate_with_device_id<Category>(_amd_smi.device_id, _idx, i);
|
||||
|
||||
insert_event_and_sample(_is_enabled, pmc_name.c_str(), track_name.c_str(),
|
||||
value);
|
||||
static_cast<double>(value));
|
||||
}
|
||||
};
|
||||
|
||||
for(size_t idx = 0; idx < xcp_metrics.size(); ++idx)
|
||||
// Insert XGMI read/write data metrics
|
||||
auto insert_xgmi_vector_metrics = [&](auto category, bool _is_enabled,
|
||||
const std::vector<uint64_t>& data,
|
||||
std::optional<size_t> _idx = std::nullopt) {
|
||||
if(!_is_enabled) return;
|
||||
|
||||
using Category = std::decay_t<decltype(category)>;
|
||||
|
||||
for(size_t i = 0; i < data.size(); ++i)
|
||||
{
|
||||
const auto value = data[i];
|
||||
if(value == std::numeric_limits<uint64_t>::max()) continue;
|
||||
|
||||
auto pmc_name = info::annotate_category<Category>(_idx, i);
|
||||
auto track_name =
|
||||
info::annotate_with_device_id<Category>(_amd_smi.device_id, _idx, i);
|
||||
|
||||
insert_event_and_sample(_is_enabled, pmc_name.c_str(), track_name.c_str(),
|
||||
static_cast<double>(value));
|
||||
}
|
||||
};
|
||||
|
||||
// Insert VCN activity metrics
|
||||
if(capabilities.flags.vcn_is_device_level_only)
|
||||
{
|
||||
auto dimension =
|
||||
xcp_metrics.size() == 1 ? std::nullopt : std::make_optional<size_t>(idx);
|
||||
|
||||
insert_xcp_metrics(category::amd_smi_vcn_activity{}, is_vcn_enabled,
|
||||
xcp_metrics[idx].vcn_busy, dimension);
|
||||
|
||||
insert_xcp_metrics(category::amd_smi_jpeg_activity{}, is_jpeg_enabled,
|
||||
xcp_metrics[idx].jpeg_busy, dimension);
|
||||
// Device-level: use vcn_activity vector
|
||||
insert_decode_vector_metrics(category::amd_smi_vcn_activity{}, is_vcn_enabled,
|
||||
gpu_metrics.vcn_activity, std::nullopt);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Per-XCP: iterate through actual XCPs in vcn_busy
|
||||
for(size_t xcp = 0; xcp < gpu_metrics.vcn_busy.size(); ++xcp)
|
||||
{
|
||||
insert_decode_vector_metrics(category::amd_smi_vcn_activity{},
|
||||
is_vcn_enabled, gpu_metrics.vcn_busy[xcp],
|
||||
xcp);
|
||||
}
|
||||
}
|
||||
|
||||
// Insert JPEG activity metrics
|
||||
if(capabilities.flags.jpeg_is_device_level_only)
|
||||
{
|
||||
// Device-level: use jpeg_activity vector
|
||||
insert_decode_vector_metrics(category::amd_smi_jpeg_activity{},
|
||||
is_jpeg_enabled, gpu_metrics.jpeg_activity,
|
||||
std::nullopt);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Per-XCP: iterate through actual XCPs in jpeg_busy
|
||||
for(size_t xcp = 0; xcp < gpu_metrics.jpeg_busy.size(); ++xcp)
|
||||
{
|
||||
insert_decode_vector_metrics(category::amd_smi_jpeg_activity{},
|
||||
is_jpeg_enabled, gpu_metrics.jpeg_busy[xcp],
|
||||
xcp);
|
||||
}
|
||||
}
|
||||
|
||||
// Insert XGMI metrics (scalar values)
|
||||
insert_event_and_sample(
|
||||
is_xgmi_enabled, trait::name<category::amd_smi_xgmi_link_width>::value,
|
||||
info::annotate_with_device_id<category::amd_smi_xgmi_link_width>(
|
||||
_amd_smi.device_id)
|
||||
.c_str(),
|
||||
gpu_metrics.xgmi_link_width);
|
||||
|
||||
insert_event_and_sample(
|
||||
is_xgmi_enabled, trait::name<category::amd_smi_xgmi_link_speed>::value,
|
||||
info::annotate_with_device_id<category::amd_smi_xgmi_link_speed>(
|
||||
_amd_smi.device_id)
|
||||
.c_str(),
|
||||
gpu_metrics.xgmi_link_speed);
|
||||
|
||||
insert_xgmi_vector_metrics(category::amd_smi_xgmi_read_data{}, is_xgmi_enabled,
|
||||
gpu_metrics.xgmi_read_data_acc, std::nullopt);
|
||||
|
||||
insert_xgmi_vector_metrics(category::amd_smi_xgmi_write_data{}, is_xgmi_enabled,
|
||||
gpu_metrics.xgmi_write_data_acc, std::nullopt);
|
||||
|
||||
insert_event_and_sample(
|
||||
is_pcie_enabled, trait::name<category::amd_smi_pcie_link_width>::value,
|
||||
info::annotate_with_device_id<category::amd_smi_pcie_link_width>(
|
||||
_amd_smi.device_id)
|
||||
.c_str(),
|
||||
gpu_metrics.pcie_link_width);
|
||||
|
||||
insert_event_and_sample(
|
||||
is_pcie_enabled, trait::name<category::amd_smi_pcie_link_speed>::value,
|
||||
info::annotate_with_device_id<category::amd_smi_pcie_link_speed>(
|
||||
_amd_smi.device_id)
|
||||
.c_str(),
|
||||
gpu_metrics.pcie_link_speed);
|
||||
|
||||
insert_event_and_sample(
|
||||
is_pcie_enabled, trait::name<category::amd_smi_pcie_bandwidth_acc>::value,
|
||||
info::annotate_with_device_id<category::amd_smi_pcie_bandwidth_acc>(
|
||||
_amd_smi.device_id)
|
||||
.c_str(),
|
||||
static_cast<double>(gpu_metrics.pcie_bandwidth_acc));
|
||||
|
||||
insert_event_and_sample(
|
||||
is_pcie_enabled, trait::name<category::amd_smi_pcie_bandwidth_inst>::value,
|
||||
info::annotate_with_device_id<category::amd_smi_pcie_bandwidth_inst>(
|
||||
_amd_smi.device_id)
|
||||
.c_str(),
|
||||
static_cast<double>(gpu_metrics.pcie_bandwidth_inst));
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -188,7 +188,9 @@ struct amd_smi_sample : storage_parsed_type_base
|
||||
power,
|
||||
mem_usage,
|
||||
vcn_activity,
|
||||
jpeg_activity
|
||||
jpeg_activity,
|
||||
xgmi,
|
||||
pcie
|
||||
};
|
||||
|
||||
uint64_t settings; // bitfield
|
||||
@@ -200,7 +202,7 @@ struct amd_smi_sample : storage_parsed_type_base
|
||||
uint32_t power;
|
||||
int64_t temperature;
|
||||
size_t mem_usage;
|
||||
std::vector<uint8_t> xcp_activity;
|
||||
std::vector<uint8_t> gpu_activity;
|
||||
};
|
||||
|
||||
struct cpu_freq_sample : storage_parsed_type_base
|
||||
|
||||
@@ -213,7 +213,7 @@ storage_parser::consume_storage()
|
||||
_amd_smi_sample.gfx_activity, _amd_smi_sample.umc_activity,
|
||||
_amd_smi_sample.mm_activity, _amd_smi_sample.power,
|
||||
_amd_smi_sample.temperature, _amd_smi_sample.mem_usage,
|
||||
_amd_smi_sample.xcp_activity);
|
||||
_amd_smi_sample.gpu_activity);
|
||||
invoke_callbacks(header.type, _amd_smi_sample);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -67,6 +67,14 @@ extern "C"
|
||||
ROCPROFSYS_CATEGORY_AMD_SMI_MEMORY_USAGE,
|
||||
ROCPROFSYS_CATEGORY_AMD_SMI_VCN_ACTIVITY,
|
||||
ROCPROFSYS_CATEGORY_AMD_SMI_JPEG_ACTIVITY,
|
||||
ROCPROFSYS_CATEGORY_AMD_SMI_XGMI_LINK_WIDTH,
|
||||
ROCPROFSYS_CATEGORY_AMD_SMI_XGMI_LINK_SPEED,
|
||||
ROCPROFSYS_CATEGORY_AMD_SMI_XGMI_READ_DATA,
|
||||
ROCPROFSYS_CATEGORY_AMD_SMI_XGMI_WRITE_DATA,
|
||||
ROCPROFSYS_CATEGORY_AMD_SMI_PCIE_LINK_WIDTH,
|
||||
ROCPROFSYS_CATEGORY_AMD_SMI_PCIE_LINK_SPEED,
|
||||
ROCPROFSYS_CATEGORY_AMD_SMI_PCIE_BANDWIDTH_ACC,
|
||||
ROCPROFSYS_CATEGORY_AMD_SMI_PCIE_BANDWIDTH_INST,
|
||||
ROCPROFSYS_CATEGORY_ROCM_RCCL,
|
||||
ROCPROFSYS_CATEGORY_SAMPLING,
|
||||
ROCPROFSYS_CATEGORY_PTHREAD,
|
||||
|
||||
@@ -41,6 +41,7 @@
|
||||
#include "core/config.hpp"
|
||||
#include "core/debug.hpp"
|
||||
#include "core/gpu.hpp"
|
||||
#include "core/gpu_metrics.hpp"
|
||||
#include "core/node_info.hpp"
|
||||
#include "core/perfetto.hpp"
|
||||
#include "core/state.hpp"
|
||||
@@ -127,7 +128,7 @@ metadata_initialize_smi_tracks(size_t gpu_id)
|
||||
}
|
||||
};
|
||||
|
||||
if(gpu::is_vcn_activity_supported(gpu_id))
|
||||
if(gpu::vcn_is_device_level_only(gpu_id))
|
||||
{
|
||||
add_vcn_track(std::nullopt);
|
||||
}
|
||||
@@ -139,7 +140,7 @@ metadata_initialize_smi_tracks(size_t gpu_id)
|
||||
}
|
||||
}
|
||||
|
||||
if(gpu::is_jpeg_activity_supported(gpu_id))
|
||||
if(gpu::jpeg_is_device_level_only(gpu_id))
|
||||
{
|
||||
add_jpeg_track(std::nullopt);
|
||||
}
|
||||
@@ -150,6 +151,49 @@ metadata_initialize_smi_tracks(size_t gpu_id)
|
||||
add_jpeg_track(xcp);
|
||||
}
|
||||
}
|
||||
|
||||
// Add XGMI tracks using specific categories for each metric type
|
||||
trace_cache::get_metadata_registry().add_track(
|
||||
{ trace_cache::info::annotate_with_device_id<category::amd_smi_xgmi_link_width>(
|
||||
gpu_id),
|
||||
thread_id, "{}" });
|
||||
trace_cache::get_metadata_registry().add_track(
|
||||
{ trace_cache::info::annotate_with_device_id<category::amd_smi_xgmi_link_speed>(
|
||||
gpu_id),
|
||||
thread_id, "{}" });
|
||||
|
||||
for(size_t i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i)
|
||||
{
|
||||
auto read_name =
|
||||
trace_cache::info::annotate_with_device_id<category::amd_smi_xgmi_read_data>(
|
||||
gpu_id, std::nullopt, i);
|
||||
trace_cache::get_metadata_registry().add_track(
|
||||
{ read_name.c_str(), thread_id, "{}" });
|
||||
|
||||
auto write_name =
|
||||
trace_cache::info::annotate_with_device_id<category::amd_smi_xgmi_write_data>(
|
||||
gpu_id, std::nullopt, i);
|
||||
trace_cache::get_metadata_registry().add_track(
|
||||
{ write_name.c_str(), thread_id, "{}" });
|
||||
}
|
||||
|
||||
// Add PCIe tracks using specific categories for each metric
|
||||
trace_cache::get_metadata_registry().add_track(
|
||||
{ trace_cache::info::annotate_with_device_id<category::amd_smi_pcie_link_width>(
|
||||
gpu_id),
|
||||
thread_id, "{}" });
|
||||
trace_cache::get_metadata_registry().add_track(
|
||||
{ trace_cache::info::annotate_with_device_id<category::amd_smi_pcie_link_speed>(
|
||||
gpu_id),
|
||||
thread_id, "{}" });
|
||||
trace_cache::get_metadata_registry().add_track(
|
||||
{ trace_cache::info::annotate_with_device_id<
|
||||
category::amd_smi_pcie_bandwidth_acc>(gpu_id),
|
||||
thread_id, "{}" });
|
||||
trace_cache::get_metadata_registry().add_track(
|
||||
{ trace_cache::info::annotate_with_device_id<
|
||||
category::amd_smi_pcie_bandwidth_inst>(gpu_id),
|
||||
thread_id, "{}" });
|
||||
}
|
||||
|
||||
void
|
||||
@@ -250,7 +294,7 @@ metadata_initialize_smi_pmc(size_t gpu_id)
|
||||
}
|
||||
};
|
||||
|
||||
if(gpu::is_vcn_activity_supported(gpu_id))
|
||||
if(gpu::vcn_is_device_level_only(gpu_id))
|
||||
{
|
||||
add_vcn_pmc(std::nullopt);
|
||||
}
|
||||
@@ -262,7 +306,7 @@ metadata_initialize_smi_pmc(size_t gpu_id)
|
||||
}
|
||||
}
|
||||
|
||||
if(gpu::is_jpeg_activity_supported(gpu_id))
|
||||
if(gpu::jpeg_is_device_level_only(gpu_id))
|
||||
{
|
||||
add_jpeg_pmc(std::nullopt);
|
||||
}
|
||||
@@ -273,6 +317,75 @@ metadata_initialize_smi_pmc(size_t gpu_id)
|
||||
add_jpeg_pmc(xcp);
|
||||
}
|
||||
}
|
||||
|
||||
// Add XGMI PMC info using specific categories for each metric type
|
||||
trace_cache::get_metadata_registry().add_pmc_info(
|
||||
{ agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
|
||||
trait::name<category::amd_smi_xgmi_link_width>::value, "XgmiLinkWidth",
|
||||
trait::name<category::amd_smi_xgmi_link_width>::description, LONG_DESCRIPTION,
|
||||
COMPONENT, "bits", rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0,
|
||||
0 });
|
||||
|
||||
trace_cache::get_metadata_registry().add_pmc_info(
|
||||
{ agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
|
||||
trait::name<category::amd_smi_xgmi_link_speed>::value, "XgmiLinkSpeed",
|
||||
trait::name<category::amd_smi_xgmi_link_speed>::description, LONG_DESCRIPTION,
|
||||
COMPONENT, "GT/s", rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0,
|
||||
0 });
|
||||
|
||||
for(size_t i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i)
|
||||
{
|
||||
std::stringstream read_name_ss, read_symbol_ss;
|
||||
read_name_ss << trait::name<category::amd_smi_xgmi_read_data>::value << "_" << i;
|
||||
read_symbol_ss << "XgmiRead_" << i;
|
||||
|
||||
trace_cache::get_metadata_registry().add_pmc_info(
|
||||
{ agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
|
||||
read_name_ss.str(), read_symbol_ss.str(),
|
||||
trait::name<category::amd_smi_xgmi_read_data>::description,
|
||||
LONG_DESCRIPTION, COMPONENT, "KB", rocprofsys::trace_cache::ABSOLUTE, BLOCK,
|
||||
EXPRESSION, 0, 0 });
|
||||
|
||||
std::stringstream write_name_ss, write_symbol_ss;
|
||||
write_name_ss << trait::name<category::amd_smi_xgmi_write_data>::value << "_"
|
||||
<< i;
|
||||
write_symbol_ss << "XgmiWrite_" << i;
|
||||
|
||||
trace_cache::get_metadata_registry().add_pmc_info(
|
||||
{ agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
|
||||
write_name_ss.str(), write_symbol_ss.str(),
|
||||
trait::name<category::amd_smi_xgmi_write_data>::description,
|
||||
LONG_DESCRIPTION, COMPONENT, "KB", rocprofsys::trace_cache::ABSOLUTE, BLOCK,
|
||||
EXPRESSION, 0, 0 });
|
||||
}
|
||||
|
||||
// Add PCIe PMC info using specific categories for each metric
|
||||
trace_cache::get_metadata_registry().add_pmc_info(
|
||||
{ agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
|
||||
trait::name<category::amd_smi_pcie_link_width>::value, "PcieLinkWidth",
|
||||
trait::name<category::amd_smi_pcie_link_width>::description, LONG_DESCRIPTION,
|
||||
COMPONENT, "", rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0, 0 });
|
||||
|
||||
trace_cache::get_metadata_registry().add_pmc_info(
|
||||
{ agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
|
||||
trait::name<category::amd_smi_pcie_link_speed>::value, "PcieLinkSpeed",
|
||||
trait::name<category::amd_smi_pcie_link_speed>::description, LONG_DESCRIPTION,
|
||||
COMPONENT, "GT/s", rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0,
|
||||
0 });
|
||||
|
||||
trace_cache::get_metadata_registry().add_pmc_info(
|
||||
{ agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
|
||||
trait::name<category::amd_smi_pcie_bandwidth_acc>::value, "PcieBwAcc",
|
||||
trait::name<category::amd_smi_pcie_bandwidth_acc>::description,
|
||||
LONG_DESCRIPTION, COMPONENT, "MB", rocprofsys::trace_cache::ABSOLUTE, BLOCK,
|
||||
EXPRESSION, 0, 0 });
|
||||
|
||||
trace_cache::get_metadata_registry().add_pmc_info(
|
||||
{ agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
|
||||
trait::name<category::amd_smi_pcie_bandwidth_inst>::value, "PcieBwInst",
|
||||
trait::name<category::amd_smi_pcie_bandwidth_inst>::description,
|
||||
LONG_DESCRIPTION, COMPONENT, "MB/s", rocprofsys::trace_cache::ABSOLUTE, BLOCK,
|
||||
EXPRESSION, 0, 0 });
|
||||
}
|
||||
|
||||
auto&
|
||||
@@ -335,70 +448,21 @@ get_state()
|
||||
}
|
||||
|
||||
std::vector<uint8_t>
|
||||
serialize_xcp_metrics(const bool& use_vcn_activity, const bool& use_jpeg_activity,
|
||||
const amdsmi_gpu_metrics_t& gpu_metrics)
|
||||
serialize_gpu_metrics(uint32_t device_id, const data::gpu_metrics_t& metrics,
|
||||
const gpu::gpu_metrics_capabilities_t& capabilities)
|
||||
{
|
||||
// Chunk:
|
||||
// <vcn_data_0>..<vcn_data_[vcn_count]> // lower and higher byte
|
||||
// <jpeg_data_0>..<jpeg_data_[jpeg_count]> // lower and higher byte
|
||||
// Get settings for this device
|
||||
auto settings = get_settings(device_id);
|
||||
|
||||
// Serialized:
|
||||
// <is_vcn_supported>
|
||||
// <is_jpeg_supported>
|
||||
// <xcp_count>
|
||||
// <vcn_count>
|
||||
// <jpeg_count>
|
||||
// Chunk_0
|
||||
// ...
|
||||
// Chunk_[xcp_count]
|
||||
// Convert amd_smi::settings to gpu::gpu_metrics_settings_t
|
||||
gpu::gpu_metrics_settings_t gpu_settings;
|
||||
gpu_settings.vcn_activity = settings.vcn_activity;
|
||||
gpu_settings.jpeg_activity = settings.jpeg_activity;
|
||||
gpu_settings.xgmi = settings.xgmi;
|
||||
gpu_settings.pcie = settings.pcie;
|
||||
|
||||
constexpr uint8_t vcn_count = AMDSMI_MAX_NUM_VCN;
|
||||
constexpr uint8_t jpeg_count = AMDSMI_MAX_NUM_JPEG;
|
||||
constexpr uint8_t xcp_count = AMDSMI_MAX_NUM_XCP;
|
||||
constexpr size_t elem_size = sizeof(uint16_t) / sizeof(uint8_t);
|
||||
constexpr uint8_t vector_size_header = sizeof(uint8_t);
|
||||
constexpr uint8_t serialized_data_headers =
|
||||
5 * vector_size_header; // is_vcn_supported + is_jpeg_supported + xcp_count +
|
||||
// vcn_count + jpeg_count
|
||||
constexpr size_t chunk_size = ((vcn_count + jpeg_count) * elem_size);
|
||||
|
||||
auto serialize_uint16_array = [](std::vector<uint8_t>& data, const uint16_t* arr,
|
||||
int array_size) {
|
||||
for(int i = 0; i < array_size; ++i)
|
||||
{
|
||||
data.push_back(static_cast<uint8_t>(arr[i] & 0xFF));
|
||||
data.push_back(static_cast<uint8_t>((arr[i] >> 8) & 0xFF));
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<uint8_t> result;
|
||||
|
||||
const bool is_vcn_jpeg_supported = (use_vcn_activity || use_jpeg_activity);
|
||||
const size_t chunk_count = is_vcn_jpeg_supported ? 1 : xcp_count;
|
||||
const size_t total_size = serialized_data_headers + (chunk_count * chunk_size);
|
||||
|
||||
result.reserve(total_size);
|
||||
|
||||
result.push_back((uint8_t) use_vcn_activity);
|
||||
result.push_back((uint8_t) use_jpeg_activity);
|
||||
result.push_back(chunk_count);
|
||||
result.push_back(vcn_count);
|
||||
result.push_back(jpeg_count);
|
||||
|
||||
for(size_t count = 0; count < chunk_count; ++count)
|
||||
{
|
||||
const auto* vcn_data =
|
||||
(is_vcn_jpeg_supported ? gpu_metrics.vcn_activity
|
||||
: gpu_metrics.xcp_stats[count].vcn_busy);
|
||||
const auto* jpeg_data =
|
||||
(is_vcn_jpeg_supported ? gpu_metrics.jpeg_activity
|
||||
: gpu_metrics.xcp_stats[count].jpeg_busy);
|
||||
|
||||
serialize_uint16_array(result, vcn_data, vcn_count);
|
||||
serialize_uint16_array(result, jpeg_data, jpeg_count);
|
||||
}
|
||||
|
||||
return result;
|
||||
// Use the shared serialization function
|
||||
return gpu::serialize_gpu_metrics(metrics, capabilities, gpu_settings);
|
||||
}
|
||||
|
||||
size_t
|
||||
@@ -425,6 +489,12 @@ serialize_settings(uint32_t _device_id)
|
||||
settings_bits.set(
|
||||
static_cast<int>(trace_cache::amd_smi_sample::settings_positions::jpeg_activity),
|
||||
settings.jpeg_activity);
|
||||
settings_bits.set(
|
||||
static_cast<int>(trace_cache::amd_smi_sample::settings_positions::xgmi),
|
||||
settings.xgmi);
|
||||
settings_bits.set(
|
||||
static_cast<int>(trace_cache::amd_smi_sample::settings_positions::pcie),
|
||||
settings.pcie);
|
||||
return settings_bits.to_ulong();
|
||||
}
|
||||
|
||||
@@ -446,7 +516,7 @@ data::sample(uint32_t _device_id)
|
||||
auto _timestamp = tim::get_clock_real_now<size_t, std::nano>();
|
||||
assert(_timestamp < std::numeric_limits<int64_t>::max());
|
||||
amdsmi_gpu_metrics_t _gpu_metrics;
|
||||
bool _vcn_or_jpeg_activity_enabled = false;
|
||||
bool _gpu_metrics_needed = false;
|
||||
|
||||
auto _state = get_state().load();
|
||||
|
||||
@@ -487,68 +557,153 @@ data::sample(uint32_t _device_id)
|
||||
#endif
|
||||
ROCPROFSYS_AMDSMI_GET(get_settings(m_dev_id).mem_usage, amdsmi_get_gpu_memory_usage,
|
||||
sample_handle, AMDSMI_MEM_TYPE_VRAM, &m_mem_usage);
|
||||
_vcn_or_jpeg_activity_enabled =
|
||||
get_settings(m_dev_id).vcn_activity || get_settings(m_dev_id).jpeg_activity;
|
||||
ROCPROFSYS_AMDSMI_GET(_vcn_or_jpeg_activity_enabled, amdsmi_get_gpu_metrics_info,
|
||||
sample_handle, &_gpu_metrics);
|
||||
|
||||
// Process metrics if either VCN or JPEG activity is enabled
|
||||
if(_vcn_or_jpeg_activity_enabled)
|
||||
// Check if GPU metrics are needed for VCN, JPEG, XGMI, or PCIe
|
||||
_gpu_metrics_needed = get_settings(m_dev_id).vcn_activity ||
|
||||
get_settings(m_dev_id).jpeg_activity ||
|
||||
get_settings(m_dev_id).xgmi || get_settings(m_dev_id).pcie;
|
||||
|
||||
ROCPROFSYS_AMDSMI_GET(_gpu_metrics_needed, amdsmi_get_gpu_metrics_info, sample_handle,
|
||||
&_gpu_metrics);
|
||||
|
||||
// Determine if basic metrics are enabled
|
||||
bool _basic_metrics_enabled =
|
||||
get_settings(m_dev_id).busy || get_settings(m_dev_id).temp ||
|
||||
get_settings(m_dev_id).power || get_settings(m_dev_id).mem_usage;
|
||||
|
||||
// Process GPU metrics if needed
|
||||
if(_gpu_metrics_needed || _basic_metrics_enabled)
|
||||
{
|
||||
// Helper lambda to fill busy metrics from a source array
|
||||
auto fill_busy_metrics = [](auto& dest, const auto& src) {
|
||||
for(const auto& val : src)
|
||||
{
|
||||
if(val != UINT16_MAX) dest.push_back(val);
|
||||
}
|
||||
};
|
||||
gpu_metrics_t metrics;
|
||||
bool has_data = false;
|
||||
gpu::gpu_metrics_capabilities_t capabilities;
|
||||
|
||||
if(gpu::is_vcn_activity_supported(m_dev_id) &&
|
||||
gpu::is_jpeg_activity_supported(m_dev_id))
|
||||
if(_gpu_metrics_needed)
|
||||
{
|
||||
// Both VCN and JPEG are supported - create one entry with both metrics
|
||||
xcp_metrics_t metrics;
|
||||
fill_busy_metrics(metrics.vcn_busy, _gpu_metrics.vcn_activity);
|
||||
fill_busy_metrics(metrics.jpeg_busy, _gpu_metrics.jpeg_activity);
|
||||
if(!metrics.vcn_busy.empty() || !metrics.jpeg_busy.empty())
|
||||
m_xcp_metrics.push_back(metrics);
|
||||
}
|
||||
else if(gpu::is_vcn_activity_supported(m_dev_id))
|
||||
{
|
||||
// Only VCN is supported
|
||||
xcp_metrics_t metrics;
|
||||
fill_busy_metrics(metrics.vcn_busy, _gpu_metrics.vcn_activity);
|
||||
if(!metrics.vcn_busy.empty()) m_xcp_metrics.push_back(metrics);
|
||||
}
|
||||
else if(gpu::is_jpeg_activity_supported(m_dev_id))
|
||||
{
|
||||
// Only JPEG is supported
|
||||
xcp_metrics_t metrics;
|
||||
fill_busy_metrics(metrics.jpeg_busy, _gpu_metrics.jpeg_activity);
|
||||
if(!metrics.jpeg_busy.empty()) m_xcp_metrics.push_back(metrics);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Neither is supported - use XCP stats
|
||||
// Each XCP gets one entry with both its VCN and JPEG metrics
|
||||
for(const auto& xcp : _gpu_metrics.xcp_stats)
|
||||
capabilities.flags.vcn_is_device_level_only =
|
||||
gpu::vcn_is_device_level_only(m_dev_id);
|
||||
capabilities.flags.jpeg_is_device_level_only =
|
||||
gpu::jpeg_is_device_level_only(m_dev_id);
|
||||
|
||||
// Helper lambda to filter max uint values (unsupported) - returns 0 if max,
|
||||
// otherwise the value
|
||||
auto filter_max_uint_value = [](const auto& value) {
|
||||
using ValueType = std::decay_t<decltype(value)>;
|
||||
return (value == std::numeric_limits<ValueType>::max()) ? ValueType{ 0 }
|
||||
: value;
|
||||
};
|
||||
|
||||
auto fill_gpu_metrics = [](auto& dest, const auto& src, auto max_val) {
|
||||
for(const auto& val : src)
|
||||
{
|
||||
if(val != max_val) dest.push_back(val);
|
||||
}
|
||||
};
|
||||
|
||||
if(get_settings(m_dev_id).vcn_activity)
|
||||
{
|
||||
xcp_metrics_t metrics;
|
||||
fill_busy_metrics(metrics.vcn_busy, xcp.vcn_busy);
|
||||
fill_busy_metrics(metrics.jpeg_busy, xcp.jpeg_busy);
|
||||
if(!metrics.vcn_busy.empty() || !metrics.jpeg_busy.empty())
|
||||
m_xcp_metrics.push_back(metrics);
|
||||
if(capabilities.flags.vcn_is_device_level_only)
|
||||
{
|
||||
fill_gpu_metrics(metrics.vcn_activity, _gpu_metrics.vcn_activity,
|
||||
UINT16_MAX);
|
||||
if(!metrics.vcn_activity.empty()) has_data = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
for(const auto& xcp : _gpu_metrics.xcp_stats)
|
||||
{
|
||||
std::vector<uint16_t> xcp_vcn_data;
|
||||
fill_gpu_metrics(xcp_vcn_data, xcp.vcn_busy, UINT16_MAX);
|
||||
if(!xcp_vcn_data.empty())
|
||||
{
|
||||
metrics.vcn_busy.push_back(std::move(xcp_vcn_data));
|
||||
has_data = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(get_settings(m_dev_id).jpeg_activity)
|
||||
{
|
||||
if(capabilities.flags.jpeg_is_device_level_only)
|
||||
{
|
||||
fill_gpu_metrics(metrics.jpeg_activity, _gpu_metrics.jpeg_activity,
|
||||
UINT16_MAX);
|
||||
if(!metrics.jpeg_activity.empty()) has_data = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
for(const auto& xcp : _gpu_metrics.xcp_stats)
|
||||
{
|
||||
std::vector<uint16_t> xcp_jpeg_data;
|
||||
fill_gpu_metrics(xcp_jpeg_data, xcp.jpeg_busy, UINT16_MAX);
|
||||
if(!xcp_jpeg_data.empty())
|
||||
{
|
||||
metrics.jpeg_busy.push_back(std::move(xcp_jpeg_data));
|
||||
has_data = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Process XGMI metrics if enabled
|
||||
if(get_settings(m_dev_id).xgmi)
|
||||
{
|
||||
// Filter scalar values - returns 0 if unsupported (max value)
|
||||
metrics.xgmi_link_width =
|
||||
filter_max_uint_value(_gpu_metrics.xgmi_link_width);
|
||||
metrics.xgmi_link_speed =
|
||||
filter_max_uint_value(_gpu_metrics.xgmi_link_speed);
|
||||
|
||||
// Vector values filtered by fill_gpu_metrics
|
||||
fill_gpu_metrics(metrics.xgmi_read_data_acc,
|
||||
_gpu_metrics.xgmi_read_data_acc, UINT64_MAX);
|
||||
fill_gpu_metrics(metrics.xgmi_write_data_acc,
|
||||
_gpu_metrics.xgmi_write_data_acc, UINT64_MAX);
|
||||
|
||||
if(metrics.xgmi_link_width != 0 || metrics.xgmi_link_speed != 0 ||
|
||||
!metrics.xgmi_read_data_acc.empty() ||
|
||||
!metrics.xgmi_write_data_acc.empty())
|
||||
{
|
||||
has_data = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Process PCIe metrics if enabled
|
||||
if(get_settings(m_dev_id).pcie)
|
||||
{
|
||||
// Filter scalar values - returns 0 if unsupported (max value)
|
||||
metrics.pcie_link_width =
|
||||
filter_max_uint_value(_gpu_metrics.pcie_link_width);
|
||||
metrics.pcie_link_speed =
|
||||
filter_max_uint_value(_gpu_metrics.pcie_link_speed);
|
||||
metrics.pcie_bandwidth_acc =
|
||||
filter_max_uint_value(_gpu_metrics.pcie_bandwidth_acc);
|
||||
metrics.pcie_bandwidth_inst =
|
||||
filter_max_uint_value(_gpu_metrics.pcie_bandwidth_inst);
|
||||
|
||||
if(metrics.pcie_link_width != 0 || metrics.pcie_link_speed != 0 ||
|
||||
metrics.pcie_bandwidth_acc != 0 || metrics.pcie_bandwidth_inst != 0)
|
||||
{
|
||||
has_data = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Store samples if basic metrics are enabled OR if there's advanced metric data
|
||||
if(_basic_metrics_enabled || has_data)
|
||||
{
|
||||
trace_cache::get_buffer_storage().store(
|
||||
trace_cache::entry_type::amd_smi_sample, serialize_settings(m_dev_id),
|
||||
_device_id, _timestamp, m_busy_perc.gfx_activity,
|
||||
m_busy_perc.umc_activity, m_busy_perc.mm_activity,
|
||||
m_power.current_socket_power, m_temp, m_mem_usage,
|
||||
serialize_gpu_metrics(m_dev_id, metrics, capabilities));
|
||||
|
||||
if(has_data) m_gpu_metrics.push_back(metrics);
|
||||
}
|
||||
}
|
||||
#undef ROCPROFSYS_AMDSMI_GET
|
||||
|
||||
trace_cache::get_buffer_storage().store(
|
||||
trace_cache::entry_type::amd_smi_sample, serialize_settings(m_dev_id), _device_id,
|
||||
_timestamp, m_busy_perc.gfx_activity, m_busy_perc.umc_activity,
|
||||
m_busy_perc.mm_activity, m_power.current_socket_power, m_temp, m_mem_usage,
|
||||
serialize_xcp_metrics(gpu::is_vcn_activity_supported(m_dev_id),
|
||||
gpu::is_jpeg_activity_supported(m_dev_id), _gpu_metrics));
|
||||
}
|
||||
|
||||
void
|
||||
@@ -741,25 +896,28 @@ data::post_process(uint32_t _dev_id)
|
||||
}
|
||||
if(_settings.vcn_activity)
|
||||
{
|
||||
if(itr.m_xcp_metrics.empty())
|
||||
if(itr.m_gpu_metrics.empty())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE(
|
||||
1, "No VCN activity data collected from device %u\n", _dev_id);
|
||||
}
|
||||
else if(gpu::is_vcn_activity_supported(_dev_id))
|
||||
else if(gpu::vcn_is_device_level_only(_dev_id))
|
||||
{
|
||||
// For VCN activity, use simple indexing
|
||||
for(std::size_t i = 0; i < std::size(itr.m_xcp_metrics[0].vcn_busy);
|
||||
++i)
|
||||
// For VCN activity supported: use vcn_activity vector
|
||||
for(std::size_t i = 0;
|
||||
i < std::size(itr.m_gpu_metrics[0].vcn_activity); ++i)
|
||||
counter_track::emplace(_dev_id, addendum_blk(i, "VCN Activity"),
|
||||
"%");
|
||||
}
|
||||
else
|
||||
{
|
||||
for(std::size_t xcp = 0; xcp < std::size(itr.m_xcp_metrics); ++xcp)
|
||||
// For VCN activity NOT supported: use vcn_busy vector with per-XCP
|
||||
// organization
|
||||
for(size_t xcp = 0; xcp < itr.m_gpu_metrics[0].vcn_busy.size(); ++xcp)
|
||||
{
|
||||
for(std::size_t i = 0;
|
||||
i < std::size(itr.m_xcp_metrics[xcp].vcn_busy); ++i)
|
||||
// Loop through each XCP's VCN busy values
|
||||
for(size_t i = 0; i < itr.m_gpu_metrics[0].vcn_busy[xcp].size();
|
||||
++i)
|
||||
{
|
||||
counter_track::emplace(
|
||||
_dev_id, addendum_blk(i, "VCN Activity", xcp), "%");
|
||||
@@ -769,29 +927,73 @@ data::post_process(uint32_t _dev_id)
|
||||
}
|
||||
if(_settings.jpeg_activity)
|
||||
{
|
||||
if(itr.m_xcp_metrics.empty())
|
||||
if(itr.m_gpu_metrics.empty())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE(
|
||||
1, "No JPEG activity data collected from device %u\n", _dev_id);
|
||||
}
|
||||
else if(gpu::is_jpeg_activity_supported(_dev_id))
|
||||
else if(gpu::jpeg_is_device_level_only(_dev_id))
|
||||
{
|
||||
for(std::size_t i = 0; i < std::size(itr.m_xcp_metrics[0].jpeg_busy);
|
||||
++i)
|
||||
// For JPEG activity supported: use jpeg_activity vector
|
||||
for(std::size_t i = 0;
|
||||
i < std::size(itr.m_gpu_metrics[0].jpeg_activity); ++i)
|
||||
counter_track::emplace(_dev_id, addendum_blk(i, "JPEG Activity"),
|
||||
"%");
|
||||
}
|
||||
else
|
||||
{
|
||||
for(std::size_t xcp = 0; xcp < std::size(itr.m_xcp_metrics); ++xcp)
|
||||
// For JPEG activity NOT supported: use jpeg_busy vector with per-XCP
|
||||
// organization
|
||||
for(size_t xcp = 0; xcp < itr.m_gpu_metrics[0].jpeg_busy.size();
|
||||
++xcp)
|
||||
{
|
||||
for(std::size_t i = 0;
|
||||
i < std::size(itr.m_xcp_metrics[xcp].jpeg_busy); ++i)
|
||||
// Loop through each XCP's JPEG busy values
|
||||
for(size_t i = 0; i < itr.m_gpu_metrics[0].jpeg_busy[xcp].size();
|
||||
++i)
|
||||
{
|
||||
counter_track::emplace(
|
||||
_dev_id, addendum_blk(i, "JPEG Activity", xcp), "%");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if(_settings.xgmi)
|
||||
{
|
||||
if(itr.m_gpu_metrics.empty())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE(
|
||||
1, "No XGMI activity data collected from device %u\n", _dev_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
counter_track::emplace(_dev_id, addendum("XGMI Link Width"), "bits");
|
||||
counter_track::emplace(_dev_id, addendum("XGMI Link Speed"), "GT/s");
|
||||
for(std::size_t i = 0;
|
||||
i < std::size(itr.m_gpu_metrics[0].xgmi_read_data_acc); ++i)
|
||||
counter_track::emplace(_dev_id, addendum_blk(i, "XGMI Read Data"),
|
||||
"KB");
|
||||
for(std::size_t i = 0;
|
||||
i < std::size(itr.m_gpu_metrics[0].xgmi_write_data_acc); ++i)
|
||||
counter_track::emplace(_dev_id,
|
||||
addendum_blk(i, "XGMI Write Data"), "KB");
|
||||
}
|
||||
}
|
||||
if(_settings.pcie)
|
||||
{
|
||||
if(itr.m_gpu_metrics.empty())
|
||||
{
|
||||
ROCPROFSYS_VERBOSE(
|
||||
1, "No PCIe activity data collected from device %u\n", _dev_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
counter_track::emplace(_dev_id, addendum("PCIe Link Width"), "");
|
||||
counter_track::emplace(_dev_id, addendum("PCIe Link Speed"), "GT/s");
|
||||
counter_track::emplace(_dev_id, addendum("PCIe Bandwidth Acc"), "MB");
|
||||
counter_track::emplace(_dev_id, addendum("PCIe Bandwidth Inst"),
|
||||
"MB/s");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
auto write_perfetto_metrics = [&]() {
|
||||
@@ -822,32 +1024,97 @@ data::post_process(uint32_t _dev_id)
|
||||
counter_track::at(_dev_id, track_index++), _ts, _usage);
|
||||
}
|
||||
|
||||
if(_settings.vcn_activity && !itr.m_xcp_metrics.empty())
|
||||
if(_settings.vcn_activity && !itr.m_gpu_metrics.empty())
|
||||
{
|
||||
// Iterate over all XCPs and their VCN busy/activity values
|
||||
for(const auto& metrics : itr.m_xcp_metrics)
|
||||
if(gpu::vcn_is_device_level_only(_dev_id))
|
||||
{
|
||||
for(const auto& vcn_val : metrics.vcn_busy)
|
||||
// Device-level VCN activity
|
||||
for(const auto& vcn_val : itr.m_gpu_metrics[0].vcn_activity)
|
||||
{
|
||||
TRACE_COUNTER("device_vcn_activity",
|
||||
counter_track::at(_dev_id, track_index++), _ts,
|
||||
vcn_val);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// XCP-level VCN busy (per-XCP organization)
|
||||
for(const auto& xcp_data : itr.m_gpu_metrics[0].vcn_busy)
|
||||
{
|
||||
for(const auto& vcn_val : xcp_data)
|
||||
{
|
||||
TRACE_COUNTER("device_vcn_activity",
|
||||
counter_track::at(_dev_id, track_index++), _ts,
|
||||
vcn_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(_settings.jpeg_activity && !itr.m_xcp_metrics.empty())
|
||||
if(_settings.jpeg_activity && !itr.m_gpu_metrics.empty())
|
||||
{
|
||||
// Iterate over all XCPs and their JPEG busy/activity values
|
||||
for(const auto& metrics : itr.m_xcp_metrics)
|
||||
if(gpu::jpeg_is_device_level_only(_dev_id))
|
||||
{
|
||||
for(const auto& jpeg_val : metrics.jpeg_busy)
|
||||
// Device-level JPEG activity
|
||||
for(const auto& jpeg_val : itr.m_gpu_metrics[0].jpeg_activity)
|
||||
{
|
||||
TRACE_COUNTER("device_jpeg_activity",
|
||||
counter_track::at(_dev_id, track_index++), _ts,
|
||||
jpeg_val);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// XCP-level JPEG busy (per-XCP organization)
|
||||
for(const auto& xcp_data : itr.m_gpu_metrics[0].jpeg_busy)
|
||||
{
|
||||
for(const auto& jpeg_val : xcp_data)
|
||||
{
|
||||
TRACE_COUNTER("device_jpeg_activity",
|
||||
counter_track::at(_dev_id, track_index++), _ts,
|
||||
jpeg_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(_settings.xgmi && !itr.m_gpu_metrics.empty())
|
||||
{
|
||||
TRACE_COUNTER("device_xgmi_link_width",
|
||||
counter_track::at(_dev_id, track_index++), _ts,
|
||||
itr.m_gpu_metrics[0].xgmi_link_width);
|
||||
TRACE_COUNTER("device_xgmi_link_speed",
|
||||
counter_track::at(_dev_id, track_index++), _ts,
|
||||
itr.m_gpu_metrics[0].xgmi_link_speed);
|
||||
for(const auto& read_val : itr.m_gpu_metrics[0].xgmi_read_data_acc)
|
||||
{
|
||||
TRACE_COUNTER("device_xgmi_read_data",
|
||||
counter_track::at(_dev_id, track_index++), _ts,
|
||||
read_val);
|
||||
}
|
||||
|
||||
for(const auto& write_val : itr.m_gpu_metrics[0].xgmi_write_data_acc)
|
||||
{
|
||||
TRACE_COUNTER("device_xgmi_write_data",
|
||||
counter_track::at(_dev_id, track_index++), _ts,
|
||||
write_val);
|
||||
}
|
||||
}
|
||||
|
||||
if(_settings.pcie && !itr.m_gpu_metrics.empty())
|
||||
{
|
||||
TRACE_COUNTER("device_pcie_link_width",
|
||||
counter_track::at(_dev_id, track_index++), _ts,
|
||||
itr.m_gpu_metrics[0].pcie_link_width);
|
||||
TRACE_COUNTER("device_pcie_link_speed",
|
||||
counter_track::at(_dev_id, track_index++), _ts,
|
||||
itr.m_gpu_metrics[0].pcie_link_speed);
|
||||
TRACE_COUNTER("device_pcie_bandwidth_acc",
|
||||
counter_track::at(_dev_id, track_index++), _ts,
|
||||
itr.m_gpu_metrics[0].pcie_bandwidth_acc);
|
||||
TRACE_COUNTER("device_pcie_bandwidth_inst",
|
||||
counter_track::at(_dev_id, track_index++), _ts,
|
||||
itr.m_gpu_metrics[0].pcie_bandwidth_inst);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -951,6 +1218,8 @@ setup()
|
||||
key_pair_t{ "mem_usage", get_settings(itr).mem_usage },
|
||||
key_pair_t{ "vcn_activity", get_settings(itr).vcn_activity },
|
||||
key_pair_t{ "jpeg_activity", get_settings(itr).jpeg_activity },
|
||||
key_pair_t{ "xgmi", get_settings(itr).xgmi },
|
||||
key_pair_t{ "pcie", get_settings(itr).pcie },
|
||||
};
|
||||
|
||||
// Initialize all metrics to false
|
||||
|
||||
@@ -31,6 +31,7 @@
|
||||
#include "core/common.hpp"
|
||||
#include "core/components/fwd.hpp"
|
||||
#include "core/defines.hpp"
|
||||
#include "core/gpu_metrics.hpp"
|
||||
#include "core/state.hpp"
|
||||
#include "library/thread_data.hpp"
|
||||
|
||||
@@ -78,6 +79,8 @@ struct settings
|
||||
bool mem_usage = true;
|
||||
bool vcn_activity = true;
|
||||
bool jpeg_activity = true;
|
||||
bool xgmi = true;
|
||||
bool pcie = true;
|
||||
};
|
||||
|
||||
struct data
|
||||
@@ -93,11 +96,8 @@ struct data
|
||||
using mem_usage_t = uint64_t;
|
||||
using temp_t = int64_t;
|
||||
|
||||
struct xcp_metrics_t
|
||||
{
|
||||
std::vector<uint16_t> vcn_busy;
|
||||
std::vector<uint16_t> jpeg_busy;
|
||||
};
|
||||
// Use the shared gpu_metrics_t from core/gpu_metrics.hpp
|
||||
using gpu_metrics_t = rocprofsys::gpu::gpu_metrics_t;
|
||||
|
||||
ROCPROFSYS_DEFAULT_OBJECT(data)
|
||||
|
||||
@@ -112,7 +112,7 @@ struct data
|
||||
timestamp_t m_ts = 0;
|
||||
temp_t m_temp = 0;
|
||||
mem_usage_t m_mem_usage = 0;
|
||||
std::vector<xcp_metrics_t> m_xcp_metrics = {};
|
||||
std::vector<gpu_metrics_t> m_gpu_metrics = {};
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
amdsmi_engine_usage_t m_busy_perc = {};
|
||||
amdsmi_power_info_t m_power = {};
|
||||
|
||||
@@ -46,6 +46,7 @@ include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-annotate-tests.cmake)
|
||||
include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-causal-tests.cmake)
|
||||
include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-python-tests.cmake)
|
||||
include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-decode-tests.cmake)
|
||||
include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-gpu-connect-tests.cmake)
|
||||
include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-nic-perf.cmake)
|
||||
include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-roctx-tests.cmake)
|
||||
include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-rocm-hip-stream.cmake)
|
||||
|
||||
@@ -0,0 +1,100 @@
|
||||
{
|
||||
"required_tables": [
|
||||
{
|
||||
"min_rows": 1,
|
||||
"name_prefix": "rocpd_info_pmc",
|
||||
"required_columns": [
|
||||
"agent_id",
|
||||
"target_arch",
|
||||
"name",
|
||||
"symbol",
|
||||
"description",
|
||||
"units",
|
||||
"value_type"
|
||||
],
|
||||
"validation_queries": [
|
||||
{
|
||||
"comparison": "greater_than",
|
||||
"description": "Check for Xgmi amd-smi metrics",
|
||||
"error_message": "Did not find Xgmi data in amd-smi metrics",
|
||||
"expected_result": 1,
|
||||
"query": "SELECT COUNT(*) as count FROM {table_name} WHERE symbol LIKE 'Xgmi%'"
|
||||
},
|
||||
{
|
||||
"comparison": "greater_than",
|
||||
"description": "Check for Pcie amd-smi metrics",
|
||||
"error_message": "Did not find Pcie data in amd-smi metrics",
|
||||
"expected_result": 1,
|
||||
"query": "SELECT COUNT(*) as count FROM {table_name} WHERE symbol LIKE 'Pcie%'"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"min_rows": 500,
|
||||
"name_prefix": "rocpd_pmc_event",
|
||||
"required_columns": [
|
||||
"event_id",
|
||||
"pmc_id",
|
||||
"value"
|
||||
],
|
||||
"validation_queries": [
|
||||
{
|
||||
"comparison": "greater_than",
|
||||
"description": "Check for amd-smi XGMI link speed samples",
|
||||
"error_message": "Less than expected number of captured amd-smi xgmi link speed samples!",
|
||||
"expected_result": 100,
|
||||
"query": "SELECT COUNT(*) as count FROM {table_name} event JOIN rocpd_info_pmc info ON event.pmc_id = info.id WHERE info.name = 'device_xgmi_link_speed'"
|
||||
},
|
||||
{
|
||||
"comparison": "greater_than",
|
||||
"description": "Check for amd-smi XGMI link width samples",
|
||||
"error_message": "Less than expected number of captured amd-smi xgmi link width samples!",
|
||||
"expected_result": 100,
|
||||
"query": "SELECT COUNT(*) as count FROM {table_name} event JOIN rocpd_info_pmc info ON event.pmc_id = info.id WHERE info.name = 'device_xgmi_link_width'"
|
||||
},
|
||||
{
|
||||
"comparison": "greater_than",
|
||||
"description": "Check for amd-smi XGMI read data samples",
|
||||
"error_message": "Less than expected number of captured amd-smi xgmi read data samples!",
|
||||
"expected_result": 100,
|
||||
"query": "SELECT COUNT(*) as count FROM {table_name} event JOIN rocpd_info_pmc info ON event.pmc_id = info.id WHERE info.name LIKE 'device_xgmi_read_data%'"
|
||||
},
|
||||
{
|
||||
"comparison": "greater_than",
|
||||
"description": "Check for amd-smi XGMI write data samples",
|
||||
"error_message": "Less than expected number of captured amd-smi xgmi write data samples!",
|
||||
"expected_result": 100,
|
||||
"query": "SELECT COUNT(*) as count FROM {table_name} event JOIN rocpd_info_pmc info ON event.pmc_id = info.id WHERE info.name LIKE 'device_xgmi_write_data%'"
|
||||
},
|
||||
{
|
||||
"comparison": "greater_than",
|
||||
"description": "Check for amd-smi PCIe instantaneous bandwidth samples",
|
||||
"error_message": "Less than expected number of captured amd-smi pcie bandwidth instantaneous samples!",
|
||||
"expected_result": 100,
|
||||
"query": "SELECT COUNT(*) as count FROM {table_name} event JOIN rocpd_info_pmc info ON event.pmc_id = info.id WHERE info.name = 'device_pcie_bandwidth_inst'"
|
||||
},
|
||||
{
|
||||
"comparison": "greater_than",
|
||||
"description": "Check for amd-smi PCIe accumulated bandwidth samples",
|
||||
"error_message": "Less than expected number of captured amd-smi pcie bandwidth accumulated samples!",
|
||||
"expected_result": 100,
|
||||
"query": "SELECT COUNT(*) as count FROM {table_name} event JOIN rocpd_info_pmc info ON event.pmc_id = info.id WHERE info.name = 'device_pcie_bandwidth_acc'"
|
||||
},
|
||||
{
|
||||
"comparison": "greater_than",
|
||||
"description": "Check for amd-smi PCIe link speed samples",
|
||||
"error_message": "Less than expected number of captured amd-smi pcie link speed samples!",
|
||||
"expected_result": 100,
|
||||
"query": "SELECT COUNT(*) as count FROM {table_name} event JOIN rocpd_info_pmc info ON event.pmc_id = info.id WHERE info.name = 'device_pcie_link_speed'"
|
||||
},
|
||||
{
|
||||
"comparison": "greater_than",
|
||||
"description": "Check for amd-smi PCIe link width samples",
|
||||
"error_message": "Less than expected number of captured amd-smi pcie link width samples!",
|
||||
"expected_result": 100,
|
||||
"query": "SELECT COUNT(*) as count FROM {table_name} event JOIN rocpd_info_pmc info ON event.pmc_id = info.id WHERE info.name = 'device_pcie_link_width'"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
+93
@@ -0,0 +1,93 @@
|
||||
{
|
||||
"required_tables": [
|
||||
{
|
||||
"commit": "Validation rules for hip_api",
|
||||
"name": "events_args",
|
||||
"required_columns": [
|
||||
"event_id",
|
||||
"category",
|
||||
"stack_id",
|
||||
"parent_stack_id",
|
||||
"correlation_id"
|
||||
],
|
||||
"validation_queries": [
|
||||
{
|
||||
"comparison": "greater_than",
|
||||
"description": "Verify that 'rocm_hip_api' appears in category at least 100 times in table events_args",
|
||||
"error_message": "'rocm_hip_api' category entries are fewer than expected in events_args",
|
||||
"expected_result": 100,
|
||||
"query": "SELECT COUNT(*) FROM events_args WHERE category = 'rocm_hip_api';"
|
||||
},
|
||||
{
|
||||
"comparison": "equals",
|
||||
"description": "Check for missing category entries",
|
||||
"error_message": "Empty or NULL category entries found in events_args",
|
||||
"expected_result": 0,
|
||||
"query": "SELECT COUNT(*) FROM events_args WHERE category IS NULL OR TRIM(category) = '';"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"commit": "Validation rules for hip_api",
|
||||
"name": "regions",
|
||||
"required_columns": [
|
||||
"id",
|
||||
"guid",
|
||||
"category",
|
||||
"name"
|
||||
],
|
||||
"validation_queries": [
|
||||
{
|
||||
"comparison": "greater_than",
|
||||
"description": "Verify that 'rocm_hip_api' appears in category at least 50 times in table regions",
|
||||
"error_message": "'rocm_hip_api' category entries are fewer than expected in regions",
|
||||
"expected_result": 50,
|
||||
"query": "SELECT COUNT(*) FROM regions WHERE category = 'rocm_hip_api';"
|
||||
},
|
||||
{
|
||||
"comparison": "equals",
|
||||
"description": "Ensure there are no HIP API calls that last 0 seconds",
|
||||
"error_message": "Found HIP API captures where duration is 0",
|
||||
"expected_result": 0,
|
||||
"query": "SELECT COUNT(*) FROM regions WHERE category = 'rocm_hip_api' AND duration = 0;"
|
||||
},
|
||||
{
|
||||
"comparison": "equals",
|
||||
"description": "Check for any NULL values in the 'name' column of regions",
|
||||
"error_message": "NULL entries found in the name column of regions",
|
||||
"expected_result": 0,
|
||||
"query": "SELECT COUNT(*) FROM regions WHERE name IS NULL;"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "rocpd_info_agent",
|
||||
"required_columns": [
|
||||
"id",
|
||||
"guid",
|
||||
"nid",
|
||||
"pid",
|
||||
"type",
|
||||
"name"
|
||||
],
|
||||
"validation_queries": [
|
||||
{
|
||||
"comparison": "greater_than",
|
||||
"description": "Check that we have GPU agents detected",
|
||||
"error_message": "No GPU agents found",
|
||||
"expected_result": 0,
|
||||
"query": "SELECT COUNT(*) as count FROM rocpd_info_agent WHERE type = 'GPU'"
|
||||
},
|
||||
{
|
||||
"comparison": "equals",
|
||||
"description": "Check for NULL agent names",
|
||||
"error_message": "Found agents with NULL names",
|
||||
"expected_result": 0,
|
||||
"query": "SELECT COUNT(*) as count FROM rocpd_info_agent WHERE name IS NULL"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,96 @@
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
|
||||
# -------------------------------------------------------------------------------------- #
|
||||
#
|
||||
# GPU connectivity tests (transferBench)
|
||||
#
|
||||
# -------------------------------------------------------------------------------------- #
|
||||
|
||||
set(_gpu_connect_environment
|
||||
"ROCPROFSYS_ROCM_DOMAINS=hip_runtime_api"
|
||||
"ROCPROFSYS_AMD_SMI_METRICS=busy,temp,power,xgmi,pcie"
|
||||
"ROCPROFSYS_SAMPLING_CPUS=none"
|
||||
"ROCPROFSYS_USE_SAMPLING=OFF"
|
||||
"ROCPROFSYS_PROCESS_SAMPLING_FREQ=10"
|
||||
"ROCPROFSYS_CPU_FREQ_ENABLED=OFF"
|
||||
)
|
||||
|
||||
set(_gpu_connect_rocpd_validation_rules
|
||||
"${CMAKE_CURRENT_LIST_DIR}/rocpd-validation-rules/gpu-connect/validation-rules.json"
|
||||
"${CMAKE_CURRENT_LIST_DIR}/rocpd-validation-rules/gpu-connect/amd-smi-rules.json"
|
||||
)
|
||||
|
||||
# Enable ROCPD for tests only if valid ROCm is installed and a valid GPU is detected
|
||||
if(${ENABLE_ROCPD_TEST} AND ${_VALID_GPU})
|
||||
list(APPEND _gpu_connect_environment "ROCPROFSYS_USE_ROCPD=ON")
|
||||
endif()
|
||||
|
||||
set(skip_validation FALSE)
|
||||
|
||||
if(EXISTS "${PROJECT_BINARY_DIR}/transferBench")
|
||||
execute_process(
|
||||
COMMAND ${PROJECT_BINARY_DIR}/transferBench
|
||||
OUTPUT_VARIABLE _transfer_output
|
||||
ERROR_VARIABLE _transfer_output
|
||||
RESULT_VARIABLE _transfer_result
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
ERROR_STRIP_TRAILING_WHITESPACE
|
||||
)
|
||||
|
||||
if(_transfer_output MATCHES "Error: No valid transfers created")
|
||||
set(skip_validation TRUE)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
rocprofiler_systems_add_test(
|
||||
SKIP_BASELINE SKIP_REWRITE SKIP_SAMPLING SKIP_RUNTIME
|
||||
NAME transferbench
|
||||
TARGET transferBench
|
||||
GPU ON
|
||||
ENVIRONMENT "${_base_environment};${_gpu_connect_environment}"
|
||||
LABELS "transferbench;xgmi;pcie"
|
||||
SYS_RUN_SKIP_REGEX "Error: No valid transfers created"
|
||||
)
|
||||
|
||||
if(NOT skip_validation)
|
||||
rocprofiler_systems_add_validation_test(
|
||||
NAME transferbench-sys-run
|
||||
PERFETTO_FILE "perfetto-trace.proto"
|
||||
LABELS "transferbench;perfetto"
|
||||
ARGS --counter-names "XGMI Read Data" "XGMI Write Data" -p
|
||||
)
|
||||
|
||||
if(${ENABLE_ROCPD_TEST} AND ${_VALID_GPU})
|
||||
set_property(TEST transferbench-sys-run APPEND PROPERTY LABELS rocpd)
|
||||
|
||||
rocprofiler_systems_add_validation_test(
|
||||
NAME transferbench-sys-run
|
||||
ROCPD_FILE "rocpd.db"
|
||||
LABELS "transferbench;rocpd"
|
||||
ARGS --validation-rules
|
||||
${_gpu_connect_rocpd_validation_rules}
|
||||
)
|
||||
endif()
|
||||
else()
|
||||
message(STATUS "TransferBench: No valid transfers created, skipping tests")
|
||||
endif()
|
||||
@@ -530,6 +530,7 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
|
||||
REWRITE
|
||||
REWRITE_RUN
|
||||
BASELINE
|
||||
SYS_RUN
|
||||
)
|
||||
foreach(_TYPE PASS FAIL SKIP)
|
||||
list(APPEND _REGEX_OPTS "${_PREFIX}_${_TYPE}_REGEX")
|
||||
@@ -548,8 +549,8 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
|
||||
|
||||
cmake_parse_arguments(
|
||||
TEST
|
||||
"SKIP_BASELINE;SKIP_SAMPLING;SKIP_REWRITE;SKIP_RUNTIME"
|
||||
"NAME;TARGET;MPI;GPU;NUM_PROCS;SAMPLING_TIMEOUT;REWRITE_TIMEOUT;RUNTIME_TIMEOUT;WILL_FAIL;DISABLED"
|
||||
"SKIP_BASELINE;SKIP_SAMPLING;SKIP_REWRITE;SKIP_RUNTIME;SKIP_SYS_RUN"
|
||||
"NAME;TARGET;MPI;GPU;NUM_PROCS;SAMPLING_TIMEOUT;REWRITE_TIMEOUT;RUNTIME_TIMEOUT;SYS_RUN_TIMEOUT;WILL_FAIL;DISABLED"
|
||||
"${_KWARGS}"
|
||||
${ARGN}
|
||||
)
|
||||
@@ -561,6 +562,7 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
|
||||
REWRITE
|
||||
REWRITE_RUN
|
||||
BASELINE
|
||||
SYS_RUN
|
||||
)
|
||||
if("${${_PREFIX}_FAIL_REGEX}" STREQUAL "")
|
||||
set(${_PREFIX}_FAIL_REGEX "(${ROCPROFSYS_ABORT_FAIL_REGEX})")
|
||||
@@ -601,6 +603,10 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
|
||||
set(TEST_SAMPLING_TIMEOUT 120)
|
||||
endif()
|
||||
|
||||
if(NOT TEST_SYS_RUN_TIMEOUT)
|
||||
set(TEST_SYS_RUN_TIMEOUT 300)
|
||||
endif()
|
||||
|
||||
if(NOT TEST_DISABLED)
|
||||
set(TEST_DISABLED OFF)
|
||||
endif()
|
||||
@@ -711,6 +717,16 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
|
||||
)
|
||||
endif()
|
||||
|
||||
if(NOT TEST_SKIP_SYS_RUN)
|
||||
add_test(
|
||||
NAME ${TEST_NAME}-sys-run
|
||||
COMMAND
|
||||
${COMMAND_PREFIX} $<TARGET_FILE:rocprofiler-systems-run> --
|
||||
$<TARGET_FILE:${TEST_TARGET}> ${TEST_RUN_ARGS}
|
||||
WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
|
||||
)
|
||||
endif()
|
||||
|
||||
if(TEST ${TEST_NAME}-binary-rewrite-run)
|
||||
set_tests_properties(
|
||||
${TEST_NAME}-binary-rewrite-run
|
||||
@@ -725,10 +741,17 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
|
||||
binary-rewrite
|
||||
binary-rewrite-run
|
||||
runtime-instrument
|
||||
sys-run
|
||||
)
|
||||
string(REGEX REPLACE "-run(-|/)" "\\1" _prefix "${TEST_NAME}-${_TEST}/")
|
||||
string(
|
||||
REGEX REPLACE
|
||||
"rewrite-run(-|/)"
|
||||
"rewrite\\1"
|
||||
_prefix
|
||||
"${TEST_NAME}-${_TEST}/"
|
||||
)
|
||||
set(_labels "${_TEST}")
|
||||
string(REPLACE "-run" "" _labels "${_TEST}")
|
||||
string(REPLACE "rewrite-run" "rewrite" _labels "${_TEST}")
|
||||
if(TEST_TARGET)
|
||||
list(APPEND _labels "${TEST_TARGET}")
|
||||
endif()
|
||||
@@ -748,10 +771,12 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
|
||||
set(_timeout ${TEST_SAMPLING_TIMEOUT})
|
||||
elseif("${_TEST}" MATCHES "runtime-instrument")
|
||||
set(_timeout ${TEST_RUNTIME_TIMEOUT})
|
||||
elseif("${_TEST}" MATCHES "sys-run")
|
||||
set(_timeout ${TEST_SYS_RUN_TIMEOUT})
|
||||
endif()
|
||||
|
||||
set(_props)
|
||||
if("${_TEST}" MATCHES "run|sampling|baseline")
|
||||
if("${_TEST}" MATCHES "sys-run|sampling|baseline")
|
||||
set(_props ${TEST_PROPERTIES})
|
||||
if(NOT "RUN_SERIAL" IN_LIST _props)
|
||||
list(APPEND _props RUN_SERIAL ON)
|
||||
@@ -768,11 +793,17 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
|
||||
set(_REGEX_VAR BASELINE)
|
||||
elseif("${_TEST}" MATCHES "sampling")
|
||||
set(_REGEX_VAR SAMPLING)
|
||||
elseif("${_TEST}" MATCHES "sys-run")
|
||||
set(_REGEX_VAR SYS_RUN)
|
||||
else()
|
||||
set(_REGEX_VAR)
|
||||
endif()
|
||||
|
||||
if("${_TEST}" MATCHES "binary-rewrite-run|runtime-instrument|sampling")
|
||||
if(
|
||||
"${_TEST}"
|
||||
MATCHES
|
||||
"binary-rewrite-run|runtime-instrument|sampling|sys-run"
|
||||
)
|
||||
rocprofiler_systems_patch_sanitizer_environment(_environ)
|
||||
endif()
|
||||
|
||||
|
||||
新增問題並參考
封鎖使用者