[Rocprofiler-systems] : Add XGMI and PCIe metrics to the profiling data (#1628)

* Add XGMI and PCIe metrics to the profiling data

Add support for AMD XGMI (GPU-to-GPU interconnect) and PCIe
metrics:
  * XGMI link width in bits
  * XGMI link speed in GT/s
  * Per-link read bandwidth (KB)
  * Per-link write bandwidth (KB)

- Add new categories for PCIe metrics:
  * PCIe link width
  * PCIe link speed in GT/s
  * Accumulated bandwidth (MB)
  * Instantaneous bandwidth (MB/s)

* Fix VCN/JPEG insert logic

* Modify the gpu_metrics struct to accommodate XCP structure

* Add ctest automation for gpu interconnect metrics

* Refactor to move gpu_metrics struct and serialization to another file

* Possible fix for timeout in CI

Fix redundant skip check in ctest
Add xgmi and pcie option in rocprof-sys-avail.

* Change2: Address review comments

Change ctest sampling to avoid timeout
Change variable name and code structuring

* Add option in ctest to run rocprof-sys-run without rewrite

Run transferbench with rocprof-sys-run without sampling

* Change3: Fix sample insert bug and address review comments

xgmi and pci support check
renaming variables
additional hip_api validation in rocpd

* Reduce the load from the transferBench sample

The CI builds were timing out when flushing a big temporary file to the
DB: (2720824.23 KB / 2720.82 MB / 2.72 GB)...
此提交包含在:
Sajina PK
2025-11-14 19:42:33 -05:00
提交者 GitHub
父節點 c9dd49c48a
當前提交 09b8342e22
共有 22 個檔案被更改,包括 7133 行新增288 行删除
+1
查看文件
@@ -79,3 +79,4 @@ add_subdirectory(videodecode)
add_subdirectory(jpegdecode)
add_subdirectory(roctx)
add_subdirectory(thread-limit)
add_subdirectory(transferBench)
+810
查看文件
@@ -0,0 +1,810 @@
/*
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <algorithm>
#include <cmath>
#include <cstdarg>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <iostream>
#include <limits>
#include <map>
#include <random>
#include <set>
#include <sstream>
#include <string>
#include <vector>
// Include necessary headers
#include "TransferBench.hpp"
using namespace TransferBench;
// Helper macro for catching HIP errors.
// Evaluates `cmd` exactly once. If the result is not hipSuccess, the HIP error
// string and the call site (line and file) are written to stderr and the process
// terminates via exit(-1).
// The status local uses a macro-specific name so that an argument expression which
// mentions a caller variable named `error` is not captured by the macro's own local.
#define HIP_CALL(cmd)                                                                   \
    do                                                                                  \
    {                                                                                   \
        hipError_t hipCallStatus_ = (cmd);                                              \
        if(hipCallStatus_ != hipSuccess)                                                \
        {                                                                               \
            std::cerr << "Encountered HIP error (" << hipGetErrorString(hipCallStatus_) \
                      << ") at line " << __LINE__ << " in file " << __FILE__ << "\n";   \
            exit(-1);                                                                   \
        }                                                                               \
    } while(0)
// Default configuration values
// Bytes copied per Transfer when no size argument is given on the command line.
// Reduced to 16KB (1 << 14) for minimal data capture during profiling
size_t const DEFAULT_BYTES_PER_TRANSFER = (1 << 14);
// Short executor labels (3 characters plus terminator), indexed by ExeType in PrintResults.
// NOTE(review): the last two entries are both "NIC" — presumably one per NIC executor
// kind in TransferBench's ExeType enum; confirm the ordering against TransferBench.hpp.
char const ExeTypeName[5][4] = { "CPU", "GPU", "DMA", "NIC", "NIC" };
// Simplified EnvVars class for standalone use.
// Holds the benchmark settings, fills them from environment variables when
// constructed, can print them, and converts them into TransferBench::ConfigOptions.
class EnvVars
{
public:
    // Environment variables (using minimal defaults for profiling)
    int                           numIterations    = 1;
    int                           numSubIterations = 1;
    int                           numWarmups       = 0;
    int                           showIterations   = 0;
    int                           useInteractive   = 0;
    int                           alwaysValidate   = 0;
    int                           blockBytes       = 256;
    int                           byteOffset       = 0;
    std::vector<float>            fillPattern;
    std::vector<int>              fillCompress;
    int                           validateDirect = 0;
    int                           validateSource = 0;
    int                           useHsaDma      = 0;
    int                           gfxBlockOrder  = 0;
    int                           gfxBlockSize   = 256;
    std::vector<uint32_t>         cuMask;
    std::vector<std::vector<int>> prefXccTable;
    int                           gfxTemporal      = 0;
    int                           gfxUnroll        = 4;
    int                           useHipEvents     = 1;
    int                           useSingleStream  = 1;
    int                           gfxSingleTeam    = 1;
    int                           gfxWaveOrder     = 0;
    int                           gfxWordSize      = 4;
    int                           hideEnv          = 0;
    int                           minNumVarSubExec = 1;
    int                           maxNumVarSubExec = 0;
    int                           outputToCsv      = 0;
    int                           samplingFactor   = 1;
    int                           ibGidIndex       = -1;
    int                           roceVersion      = 2;
    int                           ipAddressFamily  = 4;
    uint8_t                       ibPort           = 1;
    int                           nicRelaxedOrder  = 1;
    std::string                   closestNicStr    = "";
    int                           gpuMaxHwQueues   = 4;

    // Constructor that collects values from environment.
    // Queries HIP device 0 for its architecture name to pick the GFX_UNROLL default;
    // HIP_CALL terminates the process if that query fails.
    EnvVars()
    {
        int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
        (void) numDetectedGpus;  // May be unused

        // Get architecture-specific defaults
        hipDeviceProp_t prop;
        HIP_CALL(hipGetDeviceProperties(&prop, 0));
        std::string fullName = prop.gcnArchName;
        // Keep only the part of the architecture name before the first ':'
        std::string archName = fullName.substr(0, fullName.find(':'));

        // gfx906 and gfx90a default to an unroll of 8; every other architecture
        // (including gfx942 and gfx950) defaults to 4
        int defaultGfxUnroll = 4;
        if(archName == "gfx906" || archName == "gfx90a") defaultGfxUnroll = 8;

        // Read environment variables
        alwaysValidate   = GetEnvVar("ALWAYS_VALIDATE", 0);
        blockBytes       = GetEnvVar("BLOCK_BYTES", 256);
        byteOffset       = GetEnvVar("BYTE_OFFSET", 0);
        gfxBlockOrder    = GetEnvVar("GFX_BLOCK_ORDER", 0);
        gfxBlockSize     = GetEnvVar("GFX_BLOCK_SIZE", 256);
        gfxSingleTeam    = GetEnvVar("GFX_SINGLE_TEAM", 1);
        gfxTemporal      = GetEnvVar("GFX_TEMPORAL", 0);
        gfxUnroll        = GetEnvVar("GFX_UNROLL", defaultGfxUnroll);
        gfxWaveOrder     = GetEnvVar("GFX_WAVE_ORDER", 0);
        gfxWordSize      = GetEnvVar("GFX_WORD_SIZE", 4);
        hideEnv          = GetEnvVar("HIDE_ENV", 0);
        minNumVarSubExec = GetEnvVar("MIN_VAR_SUBEXEC", 1);
        maxNumVarSubExec = GetEnvVar("MAX_VAR_SUBEXEC", 0);
        numIterations    = GetEnvVar("NUM_ITERATIONS", 1);
        numSubIterations = GetEnvVar("NUM_SUBITERATIONS", 1);
        numWarmups       = GetEnvVar("NUM_WARMUPS", 0);
        outputToCsv      = GetEnvVar("OUTPUT_TO_CSV", 0);
        samplingFactor   = GetEnvVar("SAMPLING_FACTOR", 1);
        showIterations   = GetEnvVar("SHOW_ITERATIONS", 0);
        useHipEvents     = GetEnvVar("USE_HIP_EVENTS", 1);
        useHsaDma        = GetEnvVar("USE_HSA_DMA", 0);
        useInteractive   = GetEnvVar("USE_INTERACTIVE", 0);
        useSingleStream  = GetEnvVar("USE_SINGLE_STREAM", 1);
        validateDirect   = GetEnvVar("VALIDATE_DIRECT", 0);
        validateSource   = GetEnvVar("VALIDATE_SOURCE", 0);
        ibGidIndex       = GetEnvVar("IB_GID_INDEX", -1);
        // The port is stored as uint8_t; make the int -> uint8_t conversion explicit
        ibPort          = static_cast<uint8_t>(GetEnvVar("IB_PORT_NUMBER", 1));
        roceVersion     = GetEnvVar("ROCE_VERSION", 2);
        ipAddressFamily = GetEnvVar("IP_ADDRESS_FAMILY", 4);
        nicRelaxedOrder = GetEnvVar("NIC_RELAX_ORDER", 1);
        closestNicStr   = GetEnvVar("CLOSEST_NIC", "");
        gpuMaxHwQueues  = GetEnvVar("GPU_MAX_HW_QUEUES", 4);
    }

    // Helper function that gets environment variable or sets to default value.
    // The value is converted with atoi, so non-numeric text yields 0.
    static int GetEnvVar(std::string const& varname, int defaultValue)
    {
        char const* raw = getenv(varname.c_str());
        return raw ? atoi(raw) : defaultValue;
    }

    // String overload: returns the variable's text, or defaultValue when it is unset
    static std::string GetEnvVar(std::string const& varname,
                                 std::string const& defaultValue)
    {
        char const* raw = getenv(varname.c_str());
        return raw ? std::string(raw) : defaultValue;
    }

    // Prints one setting line: name, integer value, then a printf-style description.
    // Uses ',' separators when outputToCsv is set, " = " / " : " otherwise.
    void Print(std::string const& name, int32_t const value, const char* format,
               ...) const
    {
        printf("%-20s%s%12d%s", name.c_str(), outputToCsv ? "," : " = ", value,
               outputToCsv ? "," : " : ");
        va_list args;
        va_start(args, format);
        vprintf(format, args);
        va_end(args);
        printf("\n");
    }

    // Same as above for a string-valued setting
    void Print(std::string const& name, std::string const& value, const char* format,
               ...) const
    {
        printf("%-20s%s%12s%s", name.c_str(), outputToCsv ? "," : " = ", value.c_str(),
               outputToCsv ? "," : " : ");
        va_list args;
        va_start(args, format);
        vprintf(format, args);
        va_end(args);
        printf("\n");
    }

    // Display env var settings (simplified).
    // The banner is printed in non-CSV mode even when HIDE_ENV is set; the
    // per-variable listing is skipped entirely when hideEnv is non-zero.
    void DisplayEnvVars() const
    {
        std::string nicSupport = "";
#if NIC_EXEC_ENABLED
        nicSupport = " (with NIC support)";
#endif
        if(!outputToCsv)
        {
            printf("Standalone AllToAll v%s%s\n", TransferBench::VERSION,
                   nicSupport.c_str());
            printf("===============================================================\n");
            if(!hideEnv)
                printf("[Common] (Suppress by setting "
                       "HIDE_ENV=1)\n");
        }
        else if(!hideEnv)
            printf("EnvVar,Value,Description,(Standalone AllToAll v%s)\n",
                   TransferBench::VERSION);
        if(hideEnv) return;
        Print("NUM_ITERATIONS", numIterations, "Running %d timed iteration(s)",
              numIterations);
        Print("NUM_WARMUPS", numWarmups, "Running %d warmup iteration(s) per Test",
              numWarmups);
        Print("USE_SINGLE_STREAM", useSingleStream, "Using single stream per GFX %s",
              useSingleStream ? "device" : "Transfer");
        Print("GFX_UNROLL", gfxUnroll, "Using GFX unroll factor of %d", gfxUnroll);
        printf("\n");
    }

    // Display usage instructions
    static void DisplayUsage()
    {
        printf("Environment variables:\n");
        printf("======================\n");
        printf(" NUM_ITERATIONS - # of timed iterations per test (default=1)\n");
        printf(
            " NUM_WARMUPS - # of untimed warmup iterations per test (default=0)\n");
        printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor "
               "(default=1)\n");
        printf(" GFX_UNROLL - Unroll factor for GFX kernel (default=4)\n");
        printf(
            " HIDE_ENV - Hide environment variable value listing (default=0)\n");
        printf(" OUTPUT_TO_CSV - Outputs to CSV format if set (default=0)\n");
        printf(" SHOW_ITERATIONS - Show per-iteration timing info (default=0)\n");
        printf("\n");
        printf("AllToAll specific variables:\n");
        printf(" A2A_DIRECT - Only using direct links (default=1)\n");
        printf(" A2A_LOCAL - Include local transfers (default=0)\n");
        printf(" A2A_MODE - Transfer mode: 0=Copy, 1=Read-Only, 2=Write-Only "
               "(default=0)\n");
        printf(" NUM_GPU_DEVICES - Number of GPUs to use (default=4 detected)\n");
        printf(
            " NUM_SUB_EXEC - Number of subexecutors/CUs per Transfer (default=1)\n");
        printf(" USE_DMA_EXEC - Use DMA executor instead of GFX (default=0)\n");
        printf(" USE_FINE_GRAIN - Use fine-grained memory (default=1)\n");
        printf(" USE_REMOTE_READ - Use DST as executor instead of SRC (default=0)\n");
    }

    // Copies the collected settings into a TransferBench::ConfigOptions.
    // CLOSEST_NIC is parsed as a comma-separated list of integers; any entry that
    // std::stoi rejects (non-numeric or out of int range) is reported and the
    // process exits with status 1.
    TransferBench::ConfigOptions ToConfigOptions()
    {
        TransferBench::ConfigOptions cfg;
        cfg.general.numIterations      = numIterations;
        cfg.general.numSubIterations   = numSubIterations;
        cfg.general.numWarmups         = numWarmups;
        cfg.general.recordPerIteration = showIterations;
        cfg.general.useInteractive     = useInteractive;
        cfg.data.alwaysValidate        = alwaysValidate;
        cfg.data.blockBytes            = blockBytes;
        cfg.data.byteOffset            = byteOffset;
        cfg.data.fillCompress          = fillCompress;
        cfg.data.fillPattern           = fillPattern;
        cfg.data.validateDirect        = validateDirect;
        cfg.data.validateSource        = validateSource;
        cfg.dma.useHipEvents           = useHipEvents;
        cfg.dma.useHsaCopy             = useHsaDma;
        cfg.gfx.blockOrder             = gfxBlockOrder;
        cfg.gfx.blockSize              = gfxBlockSize;
        cfg.gfx.cuMask                 = cuMask;
        cfg.gfx.prefXccTable           = prefXccTable;
        cfg.gfx.unrollFactor           = gfxUnroll;
        cfg.gfx.temporalMode           = gfxTemporal;
        cfg.gfx.useHipEvents           = useHipEvents;
        cfg.gfx.useMultiStream         = !useSingleStream;
        cfg.gfx.useSingleTeam          = gfxSingleTeam;
        cfg.gfx.waveOrder              = gfxWaveOrder;
        cfg.gfx.wordSize               = gfxWordSize;
        cfg.nic.ibGidIndex             = ibGidIndex;
        cfg.nic.ibPort                 = ibPort;
        cfg.nic.ipAddressFamily        = ipAddressFamily;
        cfg.nic.useRelaxedOrder        = nicRelaxedOrder;
        cfg.nic.roceVersion            = roceVersion;

        std::vector<int> closestNics;
        if(closestNicStr != "")
        {
            std::stringstream ss(closestNicStr);
            std::string       item;
            while(std::getline(ss, item, ','))
            {
                try
                {
                    int nic = std::stoi(item);
                    closestNics.push_back(nic);
                } catch(std::exception const&)
                {
                    // std::stoi throws std::invalid_argument for non-numeric text
                    // and std::out_of_range for values that do not fit in an int;
                    // both are user input errors and are reported the same way
                    printf("[ERROR] Invalid NIC index (%s) by user in %s\n", item.c_str(),
                           closestNicStr.c_str());
                    exit(1);
                }
            }
            cfg.nic.closestNics = closestNics;
        }
        return cfg;
    }
};
// Forward declarations of the helpers defined later in this file
// Prints per-executor and per-Transfer results for one test
void
PrintResults(EnvVars const& ev, int const testNum, std::vector<Transfer> const& transfers,
TransferBench::TestResults const& results);
// Prints every entry of an error list and exits if any of them is fatal
void
PrintErrors(std::vector<ErrResult> const& errors);
// Prints a single warning, or prints and exits on a fatal error
void
CheckForError(ErrResult const& error);
// Renders a list of MemDevices as a compact string
std::string
MemDevicesToStr(std::vector<MemDevice> const& memDevices);
// Helper function that converts MemDevices to a string
std::string
MemDevicesToStr(std::vector<MemDevice> const& memDevices)
{
if(memDevices.empty()) return "N";
std::stringstream ss;
for(auto const& m : memDevices)
ss << TransferBench::MemTypeStr[m.memType] << m.memIndex;
return ss.str();
}
// Reports a single TransferBench error result:
//   ERR_NONE  - nothing is printed
//   ERR_WARN  - the message is printed with a [WARN] prefix
//   ERR_FATAL - the message is printed with an [ERROR] prefix and the process exits(1)
// Any other error type is silently ignored.
void
CheckForError(ErrResult const& error)
{
    if(error.errType == ERR_NONE) return;

    if(error.errType == ERR_WARN)
    {
        printf("[WARN] %s\n", error.errMsg.c_str());
        return;
    }

    if(error.errType == ERR_FATAL)
    {
        printf("[ERROR] %s\n", error.errMsg.c_str());
        exit(1);
    }
}
// Helper function to print list of errors
void
PrintErrors(std::vector<ErrResult> const& errors)
{
bool isFatal = false;
for(auto const& err : errors)
{
printf("[%s] %s\n", err.errType == ERR_FATAL ? "ERROR" : "WARN",
err.errMsg.c_str());
isFatal |= (err.errType == ERR_FATAL);
}
if(isFatal) exit(1);
}
// Print TransferBench test results.
//   ev        - settings; outputToCsv selects the column separator and
//               showIterations enables the per-iteration breakdown
//   testNum   - test number shown in the header (non-CSV mode only)
//   transfers - the Transfers that were executed, indexed like results.tfrResults
//   results   - the results to print
// Output: one line per executor, one line per Transfer of that executor, optional
// per-iteration lines sorted fastest to slowest with a standard deviation line,
// and a final aggregate line. Exits with status 1 if per-iteration output was
// requested but the recorded iteration count does not match.
void
PrintResults(EnvVars const& ev, int const testNum, std::vector<Transfer> const& transfers,
             TransferBench::TestResults const& results)
{
    char   sep                = ev.outputToCsv ? ',' : '|';
    size_t numTimedIterations = results.numTimedIterations;
    if(!ev.outputToCsv) printf("Test %d:\n", testNum);

    // Loop over each executor (by reference: each entry owns containers that are
    // not worth copying per iteration)
    for(auto const& exeInfoPair : results.exeResults)
    {
        ExeDevice const& exeDevice = exeInfoPair.first;
        ExeResult const& exeResult = exeInfoPair.second;
        ExeType const    exeType   = exeDevice.exeType;
        int32_t const    exeIndex  = exeDevice.exeIndex;
        printf(" Executor: %3s %02d %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f "
               "GB/s (sum)\n",
               ExeTypeName[exeType], exeIndex, sep, exeResult.avgBandwidthGbPerSec, sep,
               exeResult.avgDurationMsec, sep, exeResult.numBytes, sep,
               exeResult.sumBandwidthGbPerSec);

        // Loop over each transfer
        for(int idx : exeResult.transferIdx)
        {
            Transfer const&       t = transfers[idx];
            TransferResult const& r = results.tfrResults[idx];

            // Optional ".<subIndex>" suffix; left empty when exeSubIndex is -1
            char exeSubIndexStr[32] = "";
            if(t.exeSubIndex != -1)
                snprintf(exeSubIndexStr, sizeof(exeSubIndexStr), ".%d", t.exeSubIndex);
            printf(" Transfer %02d %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c %s "
                   "-> %c%03d%s:%03d -> %s\n",
                   idx, sep, r.avgBandwidthGbPerSec, sep, r.avgDurationMsec, sep,
                   r.numBytes, sep, MemDevicesToStr(t.srcs).c_str(),
                   TransferBench::ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
                   exeSubIndexStr, t.numSubExecs, MemDevicesToStr(t.dsts).c_str());

            // Show per-iteration timing information
            if(ev.showIterations)
            {
                // Check that per-iteration information exists
                if(r.perIterMsec.size() != numTimedIterations)
                {
                    printf("[ERROR] Per iteration timing data unavailable: Expected %lu "
                           "data points, but have %lu\n",
                           numTimedIterations, r.perIterMsec.size());
                    exit(1);
                }

                // Compute standard deviation and track iterations by speed.
                // The set orders (duration, 1-based iteration number) pairs, so
                // iterating it visits iterations from fastest to slowest.
                std::set<std::pair<double, int>> times;
                double                           stdDevTime = 0;
                double                           stdDevBw   = 0;
                for(size_t i = 0; i < numTimedIterations; i++)
                {
                    times.insert(
                        std::make_pair(r.perIterMsec[i], static_cast<int>(i + 1)));
                    double const varTime =
                        std::fabs(r.avgDurationMsec - r.perIterMsec[i]);
                    stdDevTime += varTime * varTime;
                    double iterBandwidthGbs =
                        (t.numBytes / 1.0E9) / r.perIterMsec[i] * 1000.0f;
                    double const varBw =
                        std::fabs(iterBandwidthGbs - r.avgBandwidthGbPerSec);
                    stdDevBw += varBw * varBw;
                }
                stdDevTime = std::sqrt(stdDevTime / numTimedIterations);
                stdDevBw   = std::sqrt(stdDevBw / numTimedIterations);

                // Loop over iterations (fastest to slowest)
                for(auto const& time : times)
                {
                    double iterDurationMsec = time.first;
                    double iterBandwidthGbs =
                        (t.numBytes / 1.0E9) / iterDurationMsec * 1000.0f;
                    printf(" Iter %03d %c %8.3f GB/s %c %8.3f ms %c", time.second,
                           sep, iterBandwidthGbs, sep, iterDurationMsec, sep);

                    // CU details are printed only when recorded for this iteration;
                    // the first element of each pair is collected as the XCC id
                    std::set<int> usedXccs;
                    if(static_cast<size_t>(time.second - 1) < r.perIterCUs.size())
                    {
                        printf(" CUs:");
                        for(auto const& x : r.perIterCUs[time.second - 1])
                        {
                            printf(" %02d:%02d", x.first, x.second);
                            usedXccs.insert(x.first);
                        }
                    }
                    printf(" XCCs:");
                    for(auto x : usedXccs)
                        printf(" %02d", x);
                    printf("\n");
                }
                printf(" StandardDev %c %8.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw,
                       sep, stdDevTime, sep);
            }
        }
    }
    printf(" Aggregate (CPU) %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f "
           "ms\n",
           sep, results.avgTotalBandwidthGbPerSec, sep, results.avgTotalDurationMsec, sep,
           results.totalBytesTransferred, sep, results.overheadMsec);
}
// AllToAll Preset Implementation
// Builds one Transfer per selected (src GPU, dst GPU) pair, optionally adds a ring of
// NIC Transfers, runs everything through TransferBench::RunTransfers and prints a
// per-pair bandwidth table followed by summary lines.
//   ev                  - settings; useSingleStream and gfxUnroll are overwritten here
//   numBytesPerTransfer - bytes assigned to every Transfer
//   presetName          - currently unused
// Preset-specific environment variables read here: A2A_DIRECT, A2A_LOCAL, A2A_MODE,
// NUM_GPU_DEVICES, NUM_QUEUE_PAIRS, NUM_SUB_EXEC, USE_DMA_EXEC, USE_FINE_GRAIN,
// USE_REMOTE_READ (and GFX_UNROLL for the unroll override).
// Exits with status 1 on invalid settings; returns early without running anything
// when no Transfers could be created.
void
AllToAllPreset(EnvVars& ev, size_t const numBytesPerTransfer,
std::string const presetName)
{
(void) presetName; // May be unused
enum
{
A2A_COPY = 0,
A2A_READ_ONLY = 1,
A2A_WRITE_ONLY = 2,
A2A_CUSTOM = 3,
};
// Display names, indexed by the mode values above
char a2aModeStr[4][20] = { "Copy", "Read-Only", "Write-Only", "Custom" };
// Force single-stream mode for all-to-all benchmark
ev.useSingleStream = 1;
// Force to gfx unroll 2 unless explicitly set
ev.gfxUnroll = EnvVars::GetEnvVar("GFX_UNROLL", 2);
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Collect env vars for this preset
int a2aDirect = EnvVars::GetEnvVar("A2A_DIRECT", 1);
int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL", 0);
// Default GPU count is capped at 4 even when more GPUs are detected
int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", std::min(4, numDetectedGpus));
int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0);
int numSubExecs = EnvVars::GetEnvVar("NUM_SUB_EXEC", 1);
int useDmaExec = EnvVars::GetEnvVar("USE_DMA_EXEC", 0);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN", 1);
int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
// A2A_MODE may be 0,1,2 or else custom numSrcs:numDsts
// numSrcs/numDsts are assigned on both branches below: by sscanf in the custom case,
// from the mode otherwise
int numSrcs, numDsts;
int a2aMode = 0;
if(getenv("A2A_MODE") && sscanf(getenv("A2A_MODE"), "%d:%d", &numSrcs, &numDsts) == 2)
{
a2aMode = A2A_CUSTOM;
}
else
{
a2aMode = EnvVars::GetEnvVar("A2A_MODE", 0);
if(a2aMode < 0 || a2aMode > 2)
{
printf("[ERROR] a2aMode must be between 0 and 2, or else numSrcs:numDsts\n");
exit(1);
}
// Write-only uses no source, read-only uses no destination, copy uses one of each
numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1);
numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1);
}
// Print off environment variables
ev.DisplayEnvVars();
if(!ev.hideEnv)
{
if(!ev.outputToCsv) printf("[AllToAll Related]\n");
ev.Print("A2A_DIRECT", a2aDirect,
a2aDirect ? "Only using direct links" : "Full all-to-all");
ev.Print("A2A_LOCAL", a2aLocal, "%s local transfers",
a2aLocal ? "Include" : "Exclude");
// Custom mode is shown as "numSrcs:numDsts"; the description string is a temporary
// that stays alive until this Print call returns
ev.Print("A2A_MODE",
(a2aMode == A2A_CUSTOM)
? std::to_string(numSrcs) + ":" + std::to_string(numDsts)
: std::to_string(a2aMode),
(a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
std::to_string(numDsts) + " write(s)")
.c_str()
: a2aModeStr[a2aMode]);
ev.Print("NUM_GPU_DEVICES", numGpus, "Using %d GPUs", numGpus);
ev.Print("NUM_QUEUE_PAIRS", numQueuePairs,
"Using %d queue pairs for NIC transfers", numQueuePairs);
ev.Print("NUM_SUB_EXEC", numSubExecs, "Using %d subexecutors/CUs per Transfer",
numSubExecs);
ev.Print("USE_DMA_EXEC", useDmaExec, "Using %s executor",
useDmaExec ? "DMA" : "GFX");
ev.Print("USE_FINE_GRAIN", useFineGrain, "Using %s-grained memory",
useFineGrain ? "fine" : "coarse");
ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor",
useRemoteRead ? "DST" : "SRC");
printf("\n");
}
// Validate env vars
if(numGpus < 0 || numGpus > numDetectedGpus)
{
printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus,
numDetectedGpus);
exit(1);
}
if(useDmaExec && (numSrcs != 1 || numDsts != 1))
{
printf("[ERROR] DMA execution can only be used for copies (A2A_MODE=0)\n");
exit(1);
}
// Collect the number of GPU devices to use
MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX;
// Maps a (src, dst) GPU pair to the index of its Transfer in `transfers`;
// pairs that were skipped have no entry
std::map<std::pair<int, int>, int> reIndex;
std::vector<Transfer> transfers;
for(int i = 0; i < numGpus; i++)
{
for(int j = 0; j < numGpus; j++)
{
// Check whether or not to execute this pair
if(i == j)
{
if(!a2aLocal) continue;
}
else if(a2aDirect)
{
#if !defined(__NVCC__)
// Direct mode keeps only pairs whose reported hop count is exactly 1
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
if(hopCount != 1) continue;
#endif
}
// Build Transfer and add it to list
TransferBench::Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
for(int x = 0; x < numSrcs; x++)
transfer.srcs.push_back({ memType, i });
// When using multiple destinations, the additional destinations are "local"
if(numDsts) transfer.dsts.push_back({ memType, j });
for(int x = 1; x < numDsts; x++)
transfer.dsts.push_back({ memType, i });
// The executor is the source GPU unless USE_REMOTE_READ selects the destination
transfer.exeDevice = { exeType, (useRemoteRead ? j : i) };
transfer.exeSubIndex = -1;
transfer.numSubExecs = numSubExecs;
reIndex[std::make_pair(i, j)] = transfers.size();
transfers.push_back(transfer);
}
}
// Create a ring using NICs
// nicTransferIdx[i] records the index in `transfers` of GPU i's NIC Transfer
std::vector<int> nicTransferIdx(numGpus);
if(numQueuePairs > 0)
{
int numNics = TransferBench::GetNumExecutors(EXE_NIC);
(void) numNics; // May be unused
for(int i = 0; i < numGpus; i++)
{
// GPU i sends to GPU (i + 1) % numGpus, closing the ring at the last GPU
TransferBench::Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
transfer.srcs.push_back({ memType, i });
transfer.dsts.push_back({ memType, (i + 1) % numGpus });
transfer.exeDevice = { TransferBench::EXE_NIC_NEAREST, i };
transfer.exeSubIndex = (i + 1) % numGpus;
transfer.numSubExecs = numQueuePairs;
nicTransferIdx[i] = transfers.size();
transfers.push_back(transfer);
}
}
printf("GPU-GFX All-To-All benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes between %s pairs of GPUs using %d CUs (%lu Transfers)\n",
numBytesPerTransfer, a2aDirect ? "directly connected" : "all", numSubExecs,
transfers.size());
// Nothing to run; returning here also protects the average computed at the end
// from dividing by zero
if(transfers.size() == 0)
{
printf("Error: No valid transfers created. Check GPU count, a2aLocal=%d, "
"a2aDirect=%d settings, and GPU topology/connectivity.\n",
a2aLocal, a2aDirect);
return;
}
// Execute Transfers
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
if(!TransferBench::RunTransfers(cfg, transfers, results))
{
for(auto const& err : results.errResults)
printf("%s\n", err.errMsg.c_str());
// NOTE(review): a failed run exits with status 0, so callers and ctest see
// success — confirm whether this is intentional
exit(0);
}
else
{
PrintResults(ev, 1, transfers, results);
}
// Print results
char separator = (ev.outputToCsv ? ',' : ' ');
printf("\nSummary: [%lu bytes per Transfer] [%s:%d] [%d Read(s) %d Write(s)]\n",
numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs, numSrcs,
numDsts);
printf(
"===========================================================================\n");
printf("SRC\\DST ");
for(int dst = 0; dst < numGpus; dst++)
printf("%cGPU %02d ", separator, dst);
if(numQueuePairs > 0) printf("%cNIC(%02d QP)", separator, numQueuePairs);
printf(" %cSTotal %cActual\n", separator, separator);
double totalBandwidthGpu = 0.0;
double minActualBandwidth = std::numeric_limits<double>::max();
double maxActualBandwidth = 0.0;
// Column sums: [0, numGpus) per destination GPU, [numGpus] the NIC column,
// [numGpus + 1] the sum of all row totals
std::vector<double> colTotalBandwidth(numGpus + 2, 0.0);
for(int src = 0; src < numGpus; src++)
{
double rowTotalBandwidth = 0;
int transferCount = 0;
double minBandwidth = std::numeric_limits<double>::max();
printf("GPU %02d", src);
for(int dst = 0; dst < numGpus; dst++)
{
if(reIndex.count(std::make_pair(src, dst)))
{
int const transferIdx = reIndex[std::make_pair(src, dst)];
TransferBench::TransferResult const& r = results.tfrResults[transferIdx];
colTotalBandwidth[dst] += r.avgBandwidthGbPerSec;
rowTotalBandwidth += r.avgBandwidthGbPerSec;
totalBandwidthGpu += r.avgBandwidthGbPerSec;
minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
transferCount++;
printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
}
else
{
// No Transfer was created for this pair
printf("%c%8s ", separator, "N/A");
}
}
if(numQueuePairs > 0)
{
TransferBench::TransferResult const& r =
results.tfrResults[nicTransferIdx[src]];
colTotalBandwidth[numGpus] += r.avgBandwidthGbPerSec;
rowTotalBandwidth += r.avgBandwidthGbPerSec;
totalBandwidthGpu += r.avgBandwidthGbPerSec;
minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
transferCount++;
printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
}
// "Actual" is the slowest Transfer of this row multiplied by the row's Transfer count
double actualBandwidth = minBandwidth * transferCount;
printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator,
actualBandwidth);
minActualBandwidth = std::min(minActualBandwidth, actualBandwidth);
maxActualBandwidth = std::max(maxActualBandwidth, actualBandwidth);
colTotalBandwidth[numGpus + 1] += rowTotalBandwidth;
}
printf("\nRTotal");
for(int dst = 0; dst < numGpus; dst++)
{
printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
}
if(numQueuePairs > 0)
{
printf("%c%8.3f ", separator, colTotalBandwidth[numGpus]);
}
printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus + 1],
separator, minActualBandwidth, separator, maxActualBandwidth);
printf("\n");
printf("Average bandwidth (GPU Timed): %8.3f GB/s\n",
totalBandwidthGpu / transfers.size());
printf("Aggregate bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu);
printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n",
results.avgTotalBandwidthGbPerSec);
// Prints any recorded warnings/errors; exits with status 1 if one of them is fatal
PrintErrors(results.errResults);
}
// Prints the command-line help: version banner, the optional byte-count argument
// (with its default and accepted suffixes), then the environment variable listing
// from EnvVars::DisplayUsage().
//   cmdName - program name shown in the "Usage:" line
void
DisplayUsage(char const* cmdName)
{
    char const* nicSuffix = "";
#if NIC_EXEC_ENABLED
    nicSuffix = " (with NIC support)";
#endif
    printf("Standalone AllToAll v%s%s\n", TransferBench::VERSION, nicSuffix);
    fputs("========================================\n", stdout);
    printf("Usage: %s [N]\n", cmdName);
    fputs(" N : (Optional) Number of bytes to copy per Transfer.\n", stdout);
    printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 "
           "bytes\n",
           DEFAULT_BYTES_PER_TRANSFER);
    fputs(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / "
          "gigabytes\n",
          stdout);
    fputs("\n", stdout);
    EnvVars::DisplayUsage();
}
// Main function.
// Usage: <exe> [N]   - N is the number of bytes per Transfer, optionally followed by
//                      a K/M/G suffix (powers of 1024); defaults to
//                      DEFAULT_BYTES_PER_TRANSFER when omitted
//        <exe> -h | --help - print usage and exit
// Exits with status 1 when N is empty, negative, or not a multiple of 4.
int
main(int argc, char** argv)
{
    // Collect environment variables
    EnvVars ev;

    // Display usage instructions if requested
    if(argc > 1 && (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0))
    {
        DisplayUsage(argv[0]);
        exit(0);
    }

    // Determine number of bytes to run per Transfer
    size_t numBytesPerTransfer = DEFAULT_BYTES_PER_TRANSFER;
    if(argc > 1)
    {
        char const*     arg    = argv[1];
        size_t const    argLen = strlen(arg);
        long long const parsed = atoll(arg);

        // An empty argument has no last character to inspect, and a negative count
        // would wrap around to a huge unsigned value
        if(argLen == 0 || parsed < 0)
        {
            printf("[ERROR] Invalid number of bytes per Transfer (%s)\n", arg);
            exit(1);
        }
        numBytesPerTransfer = static_cast<size_t>(parsed);

        // Adjust bytes if unit specified
        size_t const kilo = 1024;
        switch(arg[argLen - 1])
        {
            case 'G':
            case 'g': numBytesPerTransfer *= kilo * kilo * kilo; break;
            case 'M':
            case 'm': numBytesPerTransfer *= kilo * kilo; break;
            case 'K':
            case 'k': numBytesPerTransfer *= kilo; break;
            default: break;
        }
    }

    if(numBytesPerTransfer % 4)
    {
        printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n",
               numBytesPerTransfer);
        exit(1);
    }
    printf("Running AllToAll benchmark with %lu bytes per transfer\n\n",
           numBytesPerTransfer);

    // Run AllToAll preset
    AllToAllPreset(ev, numBytesPerTransfer, "AllToAll");
    return 0;
}
+125
查看文件
@@ -0,0 +1,125 @@
# Build script for the transferBench example (AllToAll.cpp).
# The target is skipped (with an AUTHOR_WARNING where noted) when the example is
# disabled, when hipcc cannot be found, or when the HSA runtime cannot be found.
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-transferBench-example LANGUAGES CXX)
# Honor ROCPROFSYS_DISABLE_EXAMPLES: skip this directory when the list contains
# either this project's name or this directory's name
if(ROCPROFSYS_DISABLE_EXAMPLES)
get_filename_component(_DIR ${CMAKE_CURRENT_LIST_DIR} NAME)
if(
${PROJECT_NAME} IN_LIST ROCPROFSYS_DISABLE_EXAMPLES
OR ${_DIR} IN_LIST ROCPROFSYS_DISABLE_EXAMPLES
)
return()
endif()
endif()
# HIP package is optional here (QUIET); hipcc itself is located separately below
find_package(hip QUIET HINTS ${ROCmVersion_DIR} PATHS ${ROCmVersion_DIR})
# NO_CACHE: the hipcc lookup result is kept as a normal variable, not cached
find_program(
HIPCC_EXECUTABLE
NAMES hipcc
HINTS ${ROCmVersion_DIR} ${ROCM_PATH}
ENV ROCM_PATH
/opt/rocm
PATHS ${ROCmVersion_DIR} ${ROCM_PATH}
ENV ROCM_PATH
/opt/rocm
NO_CACHE
)
mark_as_advanced(HIPCC_EXECUTABLE)
# hipcc is mandatory for this example
if(NOT HIPCC_EXECUTABLE)
message(AUTHOR_WARNING "hipcc could not be found. Cannot build transferBench target")
return()
endif()
# Record whether the configured C++ compiler already is hipcc
if(NOT CMAKE_CXX_COMPILER_IS_HIPCC AND HIPCC_EXECUTABLE)
if(
CMAKE_CXX_COMPILER STREQUAL HIPCC_EXECUTABLE
OR "${CMAKE_CXX_COMPILER}" MATCHES "hipcc"
)
set(CMAKE_CXX_COMPILER_IS_HIPCC 1 CACHE BOOL "HIP compiler")
endif()
endif()
# NOTE(review): HIPCC_EXECUTABLE is always set at this point (the script returned
# above otherwise), so the "NOT HIPCC_EXECUTABLE" term makes this whole condition
# false — confirm whether this guard is still needed
if(
(
NOT CMAKE_CXX_COMPILER_IS_HIPCC
OR (NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NOT hip_FOUND)
)
AND (NOT COMMAND rocprofiler_systems_custom_compilation AND NOT HIPCC_EXECUTABLE)
)
message(AUTHOR_WARNING "transferBench target could not be built")
return()
endif()
find_package(Threads REQUIRED)
# Find HSA runtime library
find_library(
HSA_RUNTIME_LIBRARY
NAMES hsa-runtime64
HINTS ${ROCmVersion_DIR} ${ROCM_PATH}
ENV ROCM_PATH
/opt/rocm
PATHS ${ROCmVersion_DIR} ${ROCM_PATH}
ENV ROCM_PATH
/opt/rocm
PATH_SUFFIXES lib lib64
)
# Find the HSA runtime headers (hsa/hsa.h)
find_path(
HSA_RUNTIME_INCLUDE_DIR
NAMES hsa/hsa.h
HINTS ${ROCmVersion_DIR} ${ROCM_PATH}
ENV ROCM_PATH
/opt/rocm
PATHS ${ROCmVersion_DIR} ${ROCM_PATH}
ENV ROCM_PATH
/opt/rocm
PATH_SUFFIXES include
)
# Both the HSA library and its headers are required
if(NOT HSA_RUNTIME_LIBRARY OR NOT HSA_RUNTIME_INCLUDE_DIR)
message(
AUTHOR_WARNING
"HSA runtime library not found. Cannot build transferBench target"
)
return()
endif()
add_executable(transferBench AllToAll.cpp)
target_link_libraries(transferBench PRIVATE Threads::Threads ${HSA_RUNTIME_LIBRARY})
# The current source directory is on the include path so that AllToAll.cpp's
# quoted #include "TransferBench.hpp" resolves
target_include_directories(
transferBench
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${HSA_RUNTIME_INCLUDE_DIR}
)
# NOTE(review): as above, "NOT HIPCC_EXECUTABLE" cannot hold here, so only the
# else() branch (-W -Wall) appears reachable — confirm
if(
CMAKE_CXX_COMPILER_ID MATCHES "Clang"
AND NOT CMAKE_CXX_COMPILER_IS_HIPCC
AND NOT HIPCC_EXECUTABLE
)
target_link_libraries(
transferBench
PRIVATE
$<TARGET_NAME_IF_EXISTS:rocprofiler-systems::rocprofiler-systems-compile-options>
$<TARGET_NAME_IF_EXISTS:hip::host>
$<TARGET_NAME_IF_EXISTS:hip::device>
)
else()
target_compile_options(transferBench PRIVATE -W -Wall)
endif()
# Add -g1 for build types whose name matches "Release"
if("${CMAKE_BUILD_TYPE}" MATCHES "Release")
target_compile_options(transferBench PRIVATE -g1)
endif()
# When the configured compiler is not hipcc, compile this target with hipcc instead
if(NOT CMAKE_CXX_COMPILER_IS_HIPCC AND HIPCC_EXECUTABLE)
# defined in MacroUtilities.cmake
rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET transferBench)
endif()
# Optional install of the example binary
if(ROCPROFSYS_INSTALL_EXAMPLES)
install(TARGETS transferBench DESTINATION bin COMPONENT rocprofiler-systems-examples)
endif()
檔案差異因為檔案過大而無法顯示 載入差異
+2
查看文件
@@ -39,6 +39,7 @@ set(core_sources
${CMAKE_CURRENT_LIST_DIR}/dynamic_library.cpp
${CMAKE_CURRENT_LIST_DIR}/exception.cpp
${CMAKE_CURRENT_LIST_DIR}/gpu.cpp
${CMAKE_CURRENT_LIST_DIR}/gpu_metrics.cpp
${CMAKE_CURRENT_LIST_DIR}/mproc.cpp
${CMAKE_CURRENT_LIST_DIR}/node_info.cpp
${CMAKE_CURRENT_LIST_DIR}/perf.cpp
@@ -66,6 +67,7 @@ set(core_headers
${CMAKE_CURRENT_LIST_DIR}/dynamic_library.hpp
${CMAKE_CURRENT_LIST_DIR}/exception.hpp
${CMAKE_CURRENT_LIST_DIR}/gpu.hpp
${CMAKE_CURRENT_LIST_DIR}/gpu_metrics.hpp
${CMAKE_CURRENT_LIST_DIR}/locking.hpp
${CMAKE_CURRENT_LIST_DIR}/mpi.hpp
${CMAKE_CURRENT_LIST_DIR}/mproc.hpp
+21 -3
查看文件
@@ -70,11 +70,13 @@ config_settings(const std::shared_ptr<settings>& _config)
// No distinction between busy and activity shown in description
std::string jpeg_activity_support = "";
std::string vcn_activity_support = "";
std::string xgmi_support = "";
std::string pcie_support = "";
size_t device_count = gpu::get_processor_count();
for(size_t i = 0; i < device_count; i++)
{
if(gpu::is_vcn_activity_supported(i) || gpu::is_vcn_busy_supported(i))
if(gpu::vcn_is_device_level_only(i) || gpu::is_vcn_busy_supported(i))
{
vcn_activity_support += ", vcn_activity";
break;
@@ -82,17 +84,33 @@ config_settings(const std::shared_ptr<settings>& _config)
}
for(size_t i = 0; i < device_count; i++)
{
if(gpu::is_jpeg_activity_supported(i) || gpu::is_jpeg_busy_supported(i))
if(gpu::jpeg_is_device_level_only(i) || gpu::is_jpeg_busy_supported(i))
{
jpeg_activity_support += ", jpeg_activity";
break;
}
}
for(size_t i = 0; i < device_count; i++)
{
if(gpu::is_xgmi_supported(i))
{
xgmi_support += ", xgmi";
break;
}
}
for(size_t i = 0; i < device_count; i++)
{
if(gpu::is_pcie_supported(i))
{
pcie_support += ", pcie";
break;
}
}
ROCPROFSYS_CONFIG_SETTING(
std::string, "ROCPROFSYS_AMD_SMI_METRICS",
"amd-smi metrics to collect: " + default_metrics + jpeg_activity_support +
vcn_activity_support + ". " +
vcn_activity_support + xgmi_support + pcie_support + ". " +
"An empty value implies 'all' and 'none' suppresses all.",
"busy, temp, power, mem_usage", "backend", "amd_smi", "rocm", "process_sampling");
}
+16
查看文件
@@ -115,6 +115,14 @@ ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_power, ROCPROFSYS_CATEGORY_AMD_SMI_
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_memory_usage, ROCPROFSYS_CATEGORY_AMD_SMI_MEMORY_USAGE, "device_memory_usage", "Memory usage of a GPU device")
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_vcn_activity, ROCPROFSYS_CATEGORY_AMD_SMI_VCN_ACTIVITY, "device_vcn_activity", "VCN Activity of a GPU device")
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_jpeg_activity, ROCPROFSYS_CATEGORY_AMD_SMI_JPEG_ACTIVITY, "device_jpeg_activity", "JPEG Activity of a GPU device")
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_xgmi_link_width, ROCPROFSYS_CATEGORY_AMD_SMI_XGMI_LINK_WIDTH, "device_xgmi_link_width", "XGMI Link Width")
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_xgmi_link_speed, ROCPROFSYS_CATEGORY_AMD_SMI_XGMI_LINK_SPEED, "device_xgmi_link_speed", "XGMI Link Speed")
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_xgmi_read_data, ROCPROFSYS_CATEGORY_AMD_SMI_XGMI_READ_DATA, "device_xgmi_read_data", "XGMI Read Data Accumulator")
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_xgmi_write_data, ROCPROFSYS_CATEGORY_AMD_SMI_XGMI_WRITE_DATA, "device_xgmi_write_data", "XGMI Write Data Accumulator")
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_pcie_link_width, ROCPROFSYS_CATEGORY_AMD_SMI_PCIE_LINK_WIDTH, "device_pcie_link_width", "PCIe Link Width")
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_pcie_link_speed, ROCPROFSYS_CATEGORY_AMD_SMI_PCIE_LINK_SPEED, "device_pcie_link_speed", "PCIe Link Speed")
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_pcie_bandwidth_acc, ROCPROFSYS_CATEGORY_AMD_SMI_PCIE_BANDWIDTH_ACC, "device_pcie_bandwidth_acc", "PCIe Bandwidth Accumulated")
ROCPROFSYS_DEFINE_CATEGORY(category, amd_smi_pcie_bandwidth_inst, ROCPROFSYS_CATEGORY_AMD_SMI_PCIE_BANDWIDTH_INST, "device_pcie_bandwidth_inst", "PCIe Bandwidth Instantaneous")
ROCPROFSYS_DEFINE_CATEGORY(category, rocm_rccl, ROCPROFSYS_CATEGORY_ROCM_RCCL, "rccl", "ROCm Communication Collectives Library (RCCL) regions")
ROCPROFSYS_DEFINE_CATEGORY(category, pthread, ROCPROFSYS_CATEGORY_PTHREAD, "pthread", "POSIX threading functions")
ROCPROFSYS_DEFINE_CATEGORY(category, kokkos, ROCPROFSYS_CATEGORY_KOKKOS, "kokkos", "KokkosTools regions")
@@ -187,6 +195,14 @@ using name = perfetto_category<Tp...>;
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_memory_usage), \
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_vcn_activity), \
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_jpeg_activity), \
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_xgmi_link_width), \
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_xgmi_link_speed), \
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_xgmi_read_data), \
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_xgmi_write_data), \
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_pcie_link_width), \
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_pcie_link_speed), \
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_pcie_bandwidth_acc), \
ROCPROFSYS_PERFETTO_CATEGORY(category::amd_smi_pcie_bandwidth_inst), \
ROCPROFSYS_PERFETTO_CATEGORY(category::rocm_rccl), \
ROCPROFSYS_PERFETTO_CATEGORY(category::pthread), \
ROCPROFSYS_PERFETTO_CATEGORY(category::kokkos), \
+61 -20
查看文件
@@ -245,12 +245,14 @@ add_device_metadata()
* Required amdsmi methods to get processors and handles
*/
uint32_t processors::total_processor_count = 0;
std::vector<amdsmi_processor_handle> processors::processors_list = {};
std::vector<bool> processors::vcn_activity_supported = {};
std::vector<bool> processors::jpeg_activity_supported = {};
std::vector<bool> processors::vcn_busy_supported = {};
std::vector<bool> processors::jpeg_busy_supported = {};
uint32_t processors::total_processor_count = 0;
std::vector<amdsmi_processor_handle> processors::processors_list = {};
std::vector<bool> processors::vcn_device_level_only = {};
std::vector<bool> processors::jpeg_device_level_only = {};
std::vector<bool> processors::vcn_busy_supported = {};
std::vector<bool> processors::jpeg_busy_supported = {};
std::vector<bool> processors::xgmi_supported = {};
std::vector<bool> processors::pcie_supported = {};
void
get_processor_handles()
@@ -299,49 +301,74 @@ get_processor_handles()
amdsmi_gpu_metrics_t gpu_metrics;
bool vcn_supported = false, jpeg_supported = false;
bool v_busy_supported = false, j_busy_supported = false;
bool xgmi_supported = false, pcie_supported = false;
// AMD SMI will not report VCN_activity and JPEG_activity, if VCN_busy or
// JPEG_busy fields are available.
if(amdsmi_get_gpu_metrics_info(processor, &gpu_metrics) ==
AMDSMI_STATUS_SUCCESS)
{
// Helper lambda to check if any value in the array is valid
auto has_valid = [](const auto& arr) {
// Helper lambda to check if any value in the array is valid (not
// UINT16_MAX)
auto has_valid_u16 = [](const auto& arr) {
return std::any_of(std::begin(arr), std::end(arr),
[](auto val) { return val != UINT16_MAX; });
};
vcn_supported = has_valid(gpu_metrics.vcn_activity);
jpeg_supported = has_valid(gpu_metrics.jpeg_activity);
// Helper lambda to check if any value in the array is valid (not
// UINT64_MAX)
auto has_valid_u64 = [](const auto& arr) {
return std::any_of(std::begin(arr), std::end(arr),
[](auto val) { return val != UINT64_MAX; });
};
vcn_supported = has_valid_u16(gpu_metrics.vcn_activity);
jpeg_supported = has_valid_u16(gpu_metrics.jpeg_activity);
// Check if VCN and JPEG busy metrics are available
for(const auto& xcp : gpu_metrics.xcp_stats)
{
if(!v_busy_supported && has_valid(xcp.vcn_busy))
if(!v_busy_supported && has_valid_u16(xcp.vcn_busy))
v_busy_supported = true;
if(!j_busy_supported && has_valid(xcp.jpeg_busy))
if(!j_busy_supported && has_valid_u16(xcp.jpeg_busy))
j_busy_supported = true;
if(v_busy_supported && j_busy_supported) break;
}
// Check if XGMI metrics are supported (any value not at max)
xgmi_supported = (gpu_metrics.xgmi_link_width != UINT16_MAX) ||
(gpu_metrics.xgmi_link_speed != UINT16_MAX) ||
has_valid_u64(gpu_metrics.xgmi_read_data_acc) ||
has_valid_u64(gpu_metrics.xgmi_write_data_acc);
// Check if PCIe metrics are supported (any value not at max)
pcie_supported = (gpu_metrics.pcie_link_width != UINT16_MAX) ||
(gpu_metrics.pcie_link_speed != UINT16_MAX) ||
(gpu_metrics.pcie_bandwidth_acc != UINT64_MAX) ||
(gpu_metrics.pcie_bandwidth_inst != UINT64_MAX);
}
processors::vcn_activity_supported.push_back(vcn_supported);
processors::jpeg_activity_supported.push_back(jpeg_supported);
processors::vcn_device_level_only.push_back(vcn_supported);
processors::jpeg_device_level_only.push_back(jpeg_supported);
processors::vcn_busy_supported.push_back(v_busy_supported);
processors::jpeg_busy_supported.push_back(j_busy_supported);
processors::xgmi_supported.push_back(xgmi_supported);
processors::pcie_supported.push_back(pcie_supported);
}
}
processors::total_processor_count = processors::processors_list.size();
}
bool
is_vcn_activity_supported(uint32_t dev_id)
vcn_is_device_level_only(uint32_t dev_id)
{
if(dev_id >= processors::vcn_activity_supported.size()) return false;
return processors::vcn_activity_supported[dev_id];
if(dev_id >= processors::vcn_device_level_only.size()) return false;
return processors::vcn_device_level_only[dev_id];
}
bool
is_jpeg_activity_supported(uint32_t dev_id)
jpeg_is_device_level_only(uint32_t dev_id)
{
if(dev_id >= processors::jpeg_activity_supported.size()) return false;
return processors::jpeg_activity_supported[dev_id];
if(dev_id >= processors::jpeg_device_level_only.size()) return false;
return processors::jpeg_device_level_only[dev_id];
}
bool
@@ -358,6 +385,20 @@ is_jpeg_busy_supported(uint32_t dev_id)
return processors::jpeg_busy_supported[dev_id];
}
/// Looks up the cached XGMI support flag for device index @p dev_id.
/// An index outside the recorded per-device list yields false.
bool
is_xgmi_supported(uint32_t dev_id)
{
    const auto& flags = processors::xgmi_supported;
    return dev_id < flags.size() ? static_cast<bool>(flags[dev_id]) : false;
}
/// Looks up the cached PCIe support flag for device index @p dev_id.
/// An index outside the recorded per-device list yields false.
bool
is_pcie_supported(uint32_t dev_id)
{
    const auto& flags = processors::pcie_supported;
    return dev_id < flags.size() ? static_cast<bool>(flags[dev_id]) : false;
}
uint32_t
get_processor_count()
{
+16 -6
查看文件
@@ -41,10 +41,10 @@ amdsmi_processor_handle
get_handle_from_id(uint32_t dev_id);
bool
is_vcn_activity_supported(uint32_t dev_id);
vcn_is_device_level_only(uint32_t dev_id);
bool
is_jpeg_activity_supported(uint32_t dev_id);
jpeg_is_device_level_only(uint32_t dev_id);
bool
is_vcn_busy_supported(uint32_t dev_id);
@@ -52,23 +52,33 @@ is_vcn_busy_supported(uint32_t dev_id);
bool
is_jpeg_busy_supported(uint32_t dev_id);
bool
is_xgmi_supported(uint32_t dev_id);
bool
is_pcie_supported(uint32_t dev_id);
struct processors
{
static uint32_t total_processor_count;
static std::vector<amdsmi_processor_handle> processors_list;
static std::vector<bool> vcn_activity_supported;
static std::vector<bool> jpeg_activity_supported;
static std::vector<bool> vcn_device_level_only;
static std::vector<bool> jpeg_device_level_only;
static std::vector<bool> vcn_busy_supported;
static std::vector<bool> jpeg_busy_supported;
static std::vector<bool> xgmi_supported;
static std::vector<bool> pcie_supported;
private:
friend void rocprofsys::gpu::get_processor_handles();
friend uint32_t rocprofsys::gpu::get_processor_count();
friend amdsmi_processor_handle rocprofsys::gpu::get_handle_from_id(uint32_t dev_id);
friend bool rocprofsys::gpu::is_vcn_activity_supported(uint32_t dev_id);
friend bool rocprofsys::gpu::is_jpeg_activity_supported(uint32_t dev_id);
friend bool rocprofsys::gpu::vcn_is_device_level_only(uint32_t dev_id);
friend bool rocprofsys::gpu::jpeg_is_device_level_only(uint32_t dev_id);
friend bool rocprofsys::gpu::is_vcn_busy_supported(uint32_t dev_id);
friend bool rocprofsys::gpu::is_jpeg_busy_supported(uint32_t dev_id);
friend bool rocprofsys::gpu::is_xgmi_supported(uint32_t dev_id);
friend bool rocprofsys::gpu::is_pcie_supported(uint32_t dev_id);
};
#endif
+332
查看文件
@@ -0,0 +1,332 @@
// MIT License
//
// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "gpu_metrics.hpp"
#include <stdexcept>
namespace rocprofsys
{
namespace gpu
{
namespace
{
// Helper functions for serialization
/// Appends the single byte @p val to the end of @p data.
void
serialize_uint8(std::vector<uint8_t>& data, uint8_t val)
{
    data.insert(data.end(), val);
}
/// Appends @p val to @p data as two bytes, least-significant byte first.
void
serialize_uint16(std::vector<uint8_t>& data, uint16_t val)
{
    const uint8_t low_byte  = static_cast<uint8_t>(val & 0xFF);
    const uint8_t high_byte = static_cast<uint8_t>((val >> 8) & 0xFF);
    data.insert(data.end(), { low_byte, high_byte });
}
/// Appends the first @p count elements of @p vec to @p data, each as two
/// bytes with the least-significant byte first.
/// No bounds check is done here: the caller must ensure count <= vec.size().
void
serialize_uint16_vector(std::vector<uint8_t>& data, const std::vector<uint16_t>& vec,
                        uint8_t count)
{
    for(size_t idx = 0; idx < count; ++idx)
    {
        const uint16_t word = vec[idx];
        data.push_back(static_cast<uint8_t>(word & 0xFF));
        data.push_back(static_cast<uint8_t>(word >> 8));
    }
}
/// Appends @p val to @p data as eight bytes, least-significant byte first.
void
serialize_uint64(std::vector<uint8_t>& data, uint64_t val)
{
    for(unsigned shift = 0; shift < 64; shift += 8)
    {
        const uint64_t shifted = val >> shift;
        data.push_back(static_cast<uint8_t>(shifted & 0xFF));
    }
}
/// Appends the first @p count elements of @p vec to @p data, each as eight
/// bytes with the least-significant byte first.
/// No bounds check is done here: the caller must ensure count <= vec.size().
void
serialize_uint64_vector(std::vector<uint8_t>& data, const std::vector<uint64_t>& vec,
                        uint8_t count)
{
    for(size_t idx = 0; idx < count; ++idx)
    {
        const uint64_t word = vec[idx];
        for(unsigned shift = 0; shift < 64; shift += 8)
            data.push_back(static_cast<uint8_t>(word >> shift));
    }
}
// Helper functions for deserialization
/// Reads one byte from @p data at @p offset and advances @p offset past it.
/// @throws std::runtime_error if @p offset is at or beyond the end of @p data.
uint8_t
deserialize_uint8(const std::vector<uint8_t>& data, size_t& offset)
{
    if(offset < data.size())
    {
        const uint8_t byte = data[offset];
        ++offset;
        return byte;
    }
    throw std::runtime_error("Invalid serialized data: unexpected end");
}
/// Reads a little-endian 16-bit value from @p data at @p offset and advances
/// @p offset by two bytes.
/// @throws std::runtime_error if fewer than two bytes remain at @p offset;
///         @p offset is left untouched in that case.
uint16_t
deserialize_uint16(const std::vector<uint8_t>& data, size_t& offset)
{
    // Compare against the number of remaining bytes rather than computing
    // `offset + 1`, which wraps around for offsets near SIZE_MAX and would
    // let an out-of-range read through.
    if(offset > data.size() || data.size() - offset < 2)
        throw std::runtime_error("Invalid serialized data: unexpected end");

    const uint16_t value = static_cast<uint16_t>(
        data[offset] | (static_cast<unsigned>(data[offset + 1]) << 8));
    offset += 2;
    return value;
}
/// Reads a little-endian 64-bit value from @p data at @p offset and advances
/// @p offset by eight bytes.
/// @throws std::runtime_error if fewer than eight bytes remain at @p offset;
///         @p offset is left untouched in that case.
uint64_t
deserialize_uint64(const std::vector<uint8_t>& data, size_t& offset)
{
    // Compare against the number of remaining bytes rather than computing
    // `offset + 7`, which wraps around for offsets near SIZE_MAX and would
    // let an out-of-range read through.
    if(offset > data.size() || data.size() - offset < 8)
        throw std::runtime_error("Invalid serialized data: unexpected end");

    uint64_t value = 0;
    for(size_t i = 0; i < 8; ++i)
        value |= static_cast<uint64_t>(data[offset + i]) << (i * 8);
    offset += 8;
    return value;
}
/// Reads @p count consecutive 16-bit values from @p data starting at
/// @p offset, advancing @p offset past everything consumed.
/// Errors raised by deserialize_uint16() propagate to the caller.
std::vector<uint16_t>
deserialize_uint16_vector(const std::vector<uint8_t>& data, size_t& offset, uint8_t count)
{
    std::vector<uint16_t> values(count);
    for(auto& slot : values)
        slot = deserialize_uint16(data, offset);
    return values;
}
/// Reads @p count consecutive 64-bit values from @p data starting at
/// @p offset, advancing @p offset past everything consumed.
/// Errors raised by deserialize_uint64() propagate to the caller.
std::vector<uint64_t>
deserialize_uint64_vector(const std::vector<uint8_t>& data, size_t& offset, uint8_t count)
{
    std::vector<uint64_t> values(count);
    for(auto& slot : values)
        slot = deserialize_uint64(data, offset);
    return values;
}
} // namespace
/**
 * Packs GPU metrics into a byte stream.
 *
 * Layout, in order: the capability byte, six one-byte counts (vcn, jpeg,
 * vcn_xcp, jpeg_xcp, xgmi_read, xgmi_write), the per-XCP size bytes, then the
 * VCN / JPEG / XGMI / PCIe payload sections. A payload section is written only
 * when its flag in `settings` is set (VCN and JPEG additionally require a
 * non-zero count). Multi-byte values are written least-significant byte first.
 *
 * Every count and per-XCP size occupies a single byte. Inputs with more than
 * 255 entries are truncated (saturated) so that the header always agrees with
 * the payload that follows it; previously the counts wrapped modulo 256, which
 * could drop a whole section or leave per-XCP sizes that exceed the payload.
 *
 * @param metrics      Values to serialize
 * @param capabilities Selects device-level (vcn_activity / jpeg_activity) or
 *                     per-XCP (vcn_busy / jpeg_busy) data for VCN and JPEG
 * @param settings     Selects which payload sections are written
 * @return The serialized bytes
 */
std::vector<uint8_t>
serialize_gpu_metrics(const gpu_metrics_t&              metrics,
                      const gpu_metrics_capabilities_t& capabilities,
                      const gpu_metrics_settings_t&     settings)
{
    // Clamp an element count to what fits into one header byte.
    auto saturate_u8 = [](size_t n) {
        return static_cast<uint8_t>(n < UINT8_MAX ? n : UINT8_MAX);
    };

    // Flatten per-XCP data and record how many values each XCP contributed.
    // Example:
    //   XCP 0: [10, 20, 30], XCP 1: [15, 25], XCP 2: [5, 10, 15, 20]
    //   sizes: [3, 2, 4]
    //   flat:  [10, 20, 30, 15, 25, 5, 10, 15, 20]
    // Both the number of XCPs and the total number of flattened values are
    // capped at UINT8_MAX so the recorded sizes always sum to flat.size().
    auto flatten_per_xcp = [](const std::vector<std::vector<uint16_t>>& per_xcp,
                              std::vector<uint16_t>&                    flat,
                              std::vector<uint8_t>&                     sizes) {
        for(const auto& xcp_data : per_xcp)
        {
            if(sizes.size() >= UINT8_MAX) break;  // XCP count byte is full
            const size_t room = UINT8_MAX - flat.size();
            const size_t take = xcp_data.size() < room ? xcp_data.size() : room;
            sizes.push_back(static_cast<uint8_t>(take));
            flat.insert(flat.end(), xcp_data.begin(), xcp_data.begin() + take);
        }
    };

    std::vector<uint16_t> vcn_data_flat;
    std::vector<uint16_t> jpeg_data_flat;
    std::vector<uint8_t>  vcn_xcp_sizes;   // Size of each XCP's VCN data
    std::vector<uint8_t>  jpeg_xcp_sizes;  // Size of each XCP's JPEG data

    if(capabilities.flags.vcn_is_device_level_only)
        vcn_data_flat = metrics.vcn_activity;
    else
        flatten_per_xcp(metrics.vcn_busy, vcn_data_flat, vcn_xcp_sizes);

    if(capabilities.flags.jpeg_is_device_level_only)
        jpeg_data_flat = metrics.jpeg_activity;
    else
        flatten_per_xcp(metrics.jpeg_busy, jpeg_data_flat, jpeg_xcp_sizes);

    const uint8_t vcn_count        = saturate_u8(vcn_data_flat.size());
    const uint8_t jpeg_count       = saturate_u8(jpeg_data_flat.size());
    const uint8_t vcn_xcp_count    = saturate_u8(vcn_xcp_sizes.size());
    const uint8_t jpeg_xcp_count   = saturate_u8(jpeg_xcp_sizes.size());
    const uint8_t xgmi_read_count  = saturate_u8(metrics.xgmi_read_data_acc.size());
    const uint8_t xgmi_write_count = saturate_u8(metrics.xgmi_write_data_acc.size());

    std::vector<uint8_t> result;

    // Capability flags (1 byte): tell the reader whether VCN / JPEG data is
    // device-level or organized per XCP.
    serialize_uint8(result, capabilities.value);

    // Counts are always written, even for sections disabled in `settings`.
    serialize_uint8(result, vcn_count);
    serialize_uint8(result, jpeg_count);
    serialize_uint8(result, vcn_xcp_count);
    serialize_uint8(result, jpeg_xcp_count);
    serialize_uint8(result, xgmi_read_count);
    serialize_uint8(result, xgmi_write_count);

    // Per-XCP sizes (empty for device-level data)
    for(uint8_t size : vcn_xcp_sizes)
        serialize_uint8(result, size);
    for(uint8_t size : jpeg_xcp_sizes)
        serialize_uint8(result, size);

    // Payload sections
    if(settings.vcn_activity && vcn_count > 0)
        serialize_uint16_vector(result, vcn_data_flat, vcn_count);

    if(settings.jpeg_activity && jpeg_count > 0)
        serialize_uint16_vector(result, jpeg_data_flat, jpeg_count);

    if(settings.xgmi)
    {
        serialize_uint16(result, metrics.xgmi_link_width);
        serialize_uint16(result, metrics.xgmi_link_speed);
        serialize_uint64_vector(result, metrics.xgmi_read_data_acc, xgmi_read_count);
        serialize_uint64_vector(result, metrics.xgmi_write_data_acc, xgmi_write_count);
    }

    if(settings.pcie)
    {
        serialize_uint16(result, metrics.pcie_link_width);
        serialize_uint16(result, metrics.pcie_link_speed);
        serialize_uint64(result, metrics.pcie_bandwidth_acc);
        serialize_uint64(result, metrics.pcie_bandwidth_inst);
    }

    return result;
}
/**
 * Reconstructs GPU metrics from the byte stream produced by
 * serialize_gpu_metrics().
 *
 * The header (capability byte, six counts, per-XCP sizes) is always read. Each
 * payload section is read only when its is_*_enabled flag is set (VCN and JPEG
 * additionally require a non-zero count), so the flags must match the settings
 * used when the data was serialized.
 *
 * Per-XCP VCN/JPEG data is appended to result.vcn_busy / result.jpeg_busy;
 * device-level data replaces result.vcn_activity / result.jpeg_activity.
 *
 * @param serialized_data Bytes to decode
 * @param result          Output metrics structure
 * @param is_vcn_enabled  Whether a VCN section is expected
 * @param is_jpeg_enabled Whether a JPEG section is expected
 * @param is_xgmi_enabled Whether an XGMI section is expected
 * @param is_pcie_enabled Whether a PCIe section is expected
 * @param capabilities    Output: capability flags read from the first byte
 * @throws std::runtime_error if the data is empty, ends early, or the per-XCP
 *         sizes describe more values than the flattened payload contains
 */
void
deserialize_gpu_metrics(const std::vector<uint8_t>& serialized_data,
                        gpu_metrics_t& result, bool is_vcn_enabled, bool is_jpeg_enabled,
                        bool is_xgmi_enabled, bool is_pcie_enabled,
                        gpu_metrics_capabilities_t& capabilities)
{
    if(serialized_data.empty())
    {
        throw std::runtime_error("Invalid serialized data: insufficient header size");
    }

    size_t offset = 0;

    // Capability flags (1 byte); see serialize_gpu_metrics() for the layout
    capabilities.value = deserialize_uint8(serialized_data, offset);

    // Counts
    const uint8_t vcn_count        = deserialize_uint8(serialized_data, offset);
    const uint8_t jpeg_count       = deserialize_uint8(serialized_data, offset);
    const uint8_t vcn_xcp_count    = deserialize_uint8(serialized_data, offset);
    const uint8_t jpeg_xcp_count   = deserialize_uint8(serialized_data, offset);
    const uint8_t xgmi_read_count  = deserialize_uint8(serialized_data, offset);
    const uint8_t xgmi_write_count = deserialize_uint8(serialized_data, offset);

    // Per-XCP sizes
    std::vector<uint8_t> vcn_xcp_sizes;
    std::vector<uint8_t> jpeg_xcp_sizes;
    vcn_xcp_sizes.reserve(vcn_xcp_count);
    jpeg_xcp_sizes.reserve(jpeg_xcp_count);
    for(uint8_t i = 0; i < vcn_xcp_count; ++i)
        vcn_xcp_sizes.push_back(deserialize_uint8(serialized_data, offset));
    for(uint8_t i = 0; i < jpeg_xcp_count; ++i)
        jpeg_xcp_sizes.push_back(deserialize_uint8(serialized_data, offset));

    // Splits a flattened array back into one vector per XCP. The sizes come
    // from the (untrusted) header, so each slice is validated against what is
    // left of the payload before any iterator past the end is formed.
    auto split_per_xcp = [](const std::vector<uint16_t>&        flat,
                            const std::vector<uint8_t>&         sizes,
                            std::vector<std::vector<uint16_t>>& out) {
        size_t consumed = 0;
        for(const uint8_t xcp_size : sizes)
        {
            if(xcp_size > flat.size() - consumed)
                throw std::runtime_error(
                    "Invalid serialized data: per-XCP sizes exceed payload");
            const auto first = flat.begin() + consumed;
            out.emplace_back(first, first + xcp_size);
            consumed += xcp_size;
        }
    };

    // VCN section
    if(is_vcn_enabled && vcn_count > 0)
    {
        auto flat_data = deserialize_uint16_vector(serialized_data, offset, vcn_count);
        if(capabilities.flags.vcn_is_device_level_only)
            result.vcn_activity = flat_data;
        else
            split_per_xcp(flat_data, vcn_xcp_sizes, result.vcn_busy);
    }

    // JPEG section
    if(is_jpeg_enabled && jpeg_count > 0)
    {
        auto flat_data = deserialize_uint16_vector(serialized_data, offset, jpeg_count);
        if(capabilities.flags.jpeg_is_device_level_only)
            result.jpeg_activity = flat_data;
        else
            split_per_xcp(flat_data, jpeg_xcp_sizes, result.jpeg_busy);
    }

    // XGMI section
    if(is_xgmi_enabled)
    {
        result.xgmi_link_width = deserialize_uint16(serialized_data, offset);
        result.xgmi_link_speed = deserialize_uint16(serialized_data, offset);
        result.xgmi_read_data_acc =
            deserialize_uint64_vector(serialized_data, offset, xgmi_read_count);
        result.xgmi_write_data_acc =
            deserialize_uint64_vector(serialized_data, offset, xgmi_write_count);
    }

    // PCIe section
    if(is_pcie_enabled)
    {
        result.pcie_link_width     = deserialize_uint16(serialized_data, offset);
        result.pcie_link_speed     = deserialize_uint16(serialized_data, offset);
        result.pcie_bandwidth_acc  = deserialize_uint64(serialized_data, offset);
        result.pcie_bandwidth_inst = deserialize_uint64(serialized_data, offset);
    }
}
} // namespace gpu
} // namespace rocprofsys
+144
查看文件
@@ -0,0 +1,144 @@
// MIT License
//
// Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <cstdint>
#include <vector>
namespace rocprofsys
{
namespace gpu
{
/// Container for the VCN, JPEG, XGMI and PCIe values of one GPU sample.
/// All scalar members default to zero and all vectors default to empty.
struct gpu_metrics_t
{
    /// VCN values reported at device level
    std::vector<uint16_t> vcn_activity{};
    /// VCN values organized per XCP (outer index = XCP)
    std::vector<std::vector<uint16_t>> vcn_busy{};

    /// JPEG values reported at device level
    std::vector<uint16_t> jpeg_activity{};
    /// JPEG values organized per XCP (outer index = XCP)
    std::vector<std::vector<uint16_t>> jpeg_busy{};

    /// XGMI link width and speed, plus the read/write data accumulators
    uint16_t              xgmi_link_width{ 0 };
    uint16_t              xgmi_link_speed{ 0 };
    std::vector<uint64_t> xgmi_read_data_acc{};
    std::vector<uint64_t> xgmi_write_data_acc{};

    /// PCIe link width and speed, plus accumulated/instantaneous bandwidth
    uint16_t pcie_link_width{ 0 };
    uint16_t pcie_link_speed{ 0 };
    uint64_t pcie_bandwidth_acc{ 0 };
    uint64_t pcie_bandwidth_inst{ 0 };
};
/// Per-section switches selecting which metrics are serialized.
/// Every section is enabled by default.
struct gpu_metrics_settings_t
{
    bool vcn_activity{ true };   ///< Include the VCN section
    bool jpeg_activity{ true };  ///< Include the JPEG section
    bool xgmi{ true };           ///< Include the XGMI section
    bool pcie{ true };           ///< Include the PCIe section
};
/// GPU metrics capabilities structure with bitfield flags
///
/// The anonymous union overlays the named bit-fields in `flags` with the raw
/// byte `value`, so the whole flag set can be written or read as one byte
/// while individual flags stay addressable by name.
///
/// NOTE(review): which bit of `value` each bit-field occupies is decided by
/// the compiler (bit-field allocation order is implementation-defined), and
/// accessing `flags` after storing through `value` (or the reverse) relies on
/// compiler-supported union punning. Confirm both hold for every supported
/// toolchain before relying on a fixed bit layout.
struct gpu_metrics_capabilities_t
{
union
{
struct
{
uint8_t vcn_is_device_level_only : 1; ///< VCN is device-level (vs per-XCP)
uint8_t jpeg_is_device_level_only : 1; ///< JPEG is device-level (vs per-XCP)
uint8_t reserved : 6; ///< Reserved for future use
} flags;
uint8_t value; ///< Raw byte value for easy serialization
};
/// Default constructor - initializes all flags to zero
/// (done by zeroing the raw byte that shares storage with `flags`)
gpu_metrics_capabilities_t()
: value(0)
{}
};
/**
* @brief Serializes GPU metrics into a compact binary format
*
* Serialization format:
* 1. Support flags byte (1 byte):
* - bit 0: vcn_is_device_level_only (device-level vs per-XCP)
* - bit 1: jpeg_is_device_level_only (device-level vs per-XCP)
* - bits 2-7: reserved
* 2. Data element counts (6 bytes):
* - vcn_count (1 byte): total VCN values (flattened across all XCPs)
* - jpeg_count (1 byte): total JPEG values (flattened across all XCPs)
* - vcn_xcp_count (1 byte): number of XCPs with VCN data
* - jpeg_xcp_count (1 byte): number of XCPs with JPEG data
* - xgmi_read_count (1 byte): number of XGMI read data values
* - xgmi_write_count (1 byte): number of XGMI write data values
* 3. Per-XCP size arrays (variable):
* - vcn_xcp_sizes[0..vcn_xcp_count-1]: size of each XCP's VCN data (1 byte each)
* - jpeg_xcp_sizes[0..jpeg_xcp_count-1]: size of each XCP's JPEG data (1 byte each)
* 4. Flattened data arrays (conditionally serialized based on settings):
* - VCN data (if vcn_activity setting enabled): flattened uint16 values
* - JPEG data (if jpeg_activity setting enabled): flattened uint16 values
* - XGMI data (if xgmi setting enabled):
* link_width (uint16), link_speed (uint16)
* xgmi_read_data array (uint64[xgmi_read_count])
* xgmi_write_data array (uint64[xgmi_write_count])
* - PCIe data (if pcie setting enabled):
* link_width (uint16), link_speed (uint16)
* bandwidth_acc (uint64), bandwidth_inst (uint64)
*
* @param metrics GPU metrics to serialize
* @param capabilities Capability flags (vcn/jpeg device-level status)
* @param settings Controls which metrics to include in serialization
* @return Binary serialized data
*/
std::vector<uint8_t>
serialize_gpu_metrics(const gpu_metrics_t& metrics,
const gpu_metrics_capabilities_t& capabilities,
const gpu_metrics_settings_t& settings);
/**
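* @brief Deserializes GPU metrics from binary format
*
* The serialized stream carries no per-section presence markers, so the
* is_*_enabled flags must match the settings that were passed to
* serialize_gpu_metrics() when the data was produced.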
*
* @param serialized_data Binary data to deserialize
* @param result Output GPU metrics structure
* @param is_vcn_enabled Whether to deserialize VCN data
* @param is_jpeg_enabled Whether to deserialize JPEG data
* @param is_xgmi_enabled Whether to deserialize XGMI data
* @param is_pcie_enabled Whether to deserialize PCIe data
* @param capabilities Output: capability flags (vcn/jpeg device-level status)
* @throws std::runtime_error if serialized data is invalid
*/
void
deserialize_gpu_metrics(const std::vector<uint8_t>& serialized_data,
gpu_metrics_t& result, bool is_vcn_enabled, bool is_jpeg_enabled,
bool is_xgmi_enabled, bool is_pcie_enabled,
gpu_metrics_capabilities_t& capabilities);
} // namespace gpu
} // namespace rocprofsys
+129 -102
查看文件
@@ -24,6 +24,7 @@
#include "agent_manager.hpp"
#include "config.hpp"
#include "debug.hpp"
#include "gpu_metrics.hpp"
#include "library/thread_info.hpp"
#include "node_info.hpp"
#include "rocpd/data_processor.hpp"
@@ -50,7 +51,6 @@ namespace trace_cache
{
namespace
{
#if ROCPROFSYS_USE_ROCM > 0
auto
get_handle_from_code_object(
@@ -405,73 +405,8 @@ rocpd_post_processing::get_pmc_event_with_sample_callback() const
postprocessing_callback
rocpd_post_processing::get_amd_smi_sample_callback() const
{
struct xcp_metrics_t
{
std::vector<uint16_t> vcn_busy;
std::vector<uint16_t> jpeg_busy;
};
auto deserialize_xcp_metrics = [](const std::vector<uint8_t>& serialized_data,
bool& _is_vcn_supported, bool& _is_jpeg_supported,
std::vector<xcp_metrics_t>& result) {
if(serialized_data.size() < 5)
{
throw std::runtime_error("Invalid serialized data: insufficient header size");
}
size_t offset = 0;
// Read header
_is_vcn_supported = static_cast<bool>(serialized_data[offset++]);
_is_jpeg_supported = static_cast<bool>(serialized_data[offset++]);
uint8_t chunk_count = serialized_data[offset++];
uint8_t vcn_count = serialized_data[offset++];
uint8_t jpeg_count = serialized_data[offset++];
constexpr size_t elem_size = sizeof(uint16_t) / sizeof(uint8_t);
const size_t chunk_size = (vcn_count + jpeg_count) * elem_size;
// Validate total size
const size_t expected_size = 5 + (chunk_count * chunk_size);
if(serialized_data.size() != expected_size)
{
throw std::runtime_error("Invalid serialized data: size mismatch");
}
auto deserialize_uint16_array = [](const std::vector<uint8_t>& data,
size_t& _offset, int array_size) {
std::vector<uint16_t> _result;
_result.reserve(array_size);
for(int i = 0; i < array_size; ++i)
{
if(_offset + 1 >= data.size())
{
throw std::runtime_error(
"Invalid serialized data: unexpected end of data");
}
uint16_t value = static_cast<uint16_t>(data[_offset]) |
(static_cast<uint16_t>(data[_offset + 1]) << 8);
_result.push_back(value);
_offset += 2;
}
return _result;
};
result.reserve(chunk_count);
for(size_t count = 0; count < chunk_count; ++count)
{
xcp_metrics_t entry;
entry.vcn_busy = deserialize_uint16_array(serialized_data, offset, vcn_count);
entry.jpeg_busy =
deserialize_uint16_array(serialized_data, offset, jpeg_count);
result.emplace_back(std::move(entry));
}
};
// Use the shared gpu_metrics_t from core/gpu_metrics.hpp
using gpu_metrics_t = gpu::gpu_metrics_t;
return [&](const storage_parsed_type_base& parsed) {
auto _amd_smi = static_cast<const struct amd_smi_sample&>(parsed);
@@ -502,6 +437,8 @@ rocpd_post_processing::get_amd_smi_sample_callback() const
bool is_vcn_enabled = settings_bits.test(static_cast<int>(pos::vcn_activity));
bool is_jpeg_enabled = settings_bits.test(static_cast<int>(pos::jpeg_activity));
bool is_xgmi_enabled = settings_bits.test(static_cast<int>(pos::xgmi));
bool is_pcie_enabled = settings_bits.test(static_cast<int>(pos::pcie));
insert_event_and_sample(
is_busy_enabled, trait::name<category::amd_smi_gfx_busy>::value,
@@ -536,55 +473,145 @@ rocpd_post_processing::get_amd_smi_sample_callback() const
.c_str(),
_amd_smi.mem_usage);
if(!is_vcn_enabled && !is_jpeg_enabled)
{
if(!is_vcn_enabled && !is_jpeg_enabled && !is_xgmi_enabled && !is_pcie_enabled)
return;
}
std::vector<xcp_metrics_t> xcp_metrics;
bool is_vcn_activity_supported;
bool is_jpeg_activity_supported;
deserialize_xcp_metrics(_amd_smi.xcp_activity, is_vcn_activity_supported,
is_jpeg_activity_supported, xcp_metrics);
gpu_metrics_t gpu_metrics;
gpu::gpu_metrics_capabilities_t capabilities;
gpu::deserialize_gpu_metrics(_amd_smi.gpu_activity, gpu_metrics, is_vcn_enabled,
is_jpeg_enabled, is_xgmi_enabled, is_pcie_enabled,
capabilities);
auto insert_xcp_metrics = [&](auto category, bool _is_enabled,
const std::vector<uint16_t>& data,
std::optional<size_t> _idx = std::nullopt) {
if(!_is_enabled)
{
return;
}
// Insert VCN and JPEG activity metrics
auto insert_decode_vector_metrics = [&](auto category, bool _is_enabled,
const std::vector<uint16_t>& data,
std::optional<size_t> _idx =
std::nullopt) {
if(!_is_enabled) return;
using Category = std::decay_t<decltype(category)>;
for(size_t clk = 0; clk < data.size(); ++clk)
for(size_t i = 0; i < data.size(); ++i)
{
const auto value = data[clk];
if(value == std::numeric_limits<uint16_t>::max())
{
continue;
}
const auto value = data[i];
if(value == std::numeric_limits<uint16_t>::max()) continue;
auto pmc_name = info::annotate_category<Category>(_idx, clk);
auto track_name = info::annotate_with_device_id<Category>(
_amd_smi.device_id, _idx, clk);
auto pmc_name = info::annotate_category<Category>(_idx, i);
auto track_name =
info::annotate_with_device_id<Category>(_amd_smi.device_id, _idx, i);
insert_event_and_sample(_is_enabled, pmc_name.c_str(), track_name.c_str(),
value);
static_cast<double>(value));
}
};
for(size_t idx = 0; idx < xcp_metrics.size(); ++idx)
// Insert XGMI read/write data metrics
auto insert_xgmi_vector_metrics = [&](auto category, bool _is_enabled,
const std::vector<uint64_t>& data,
std::optional<size_t> _idx = std::nullopt) {
if(!_is_enabled) return;
using Category = std::decay_t<decltype(category)>;
for(size_t i = 0; i < data.size(); ++i)
{
const auto value = data[i];
if(value == std::numeric_limits<uint64_t>::max()) continue;
auto pmc_name = info::annotate_category<Category>(_idx, i);
auto track_name =
info::annotate_with_device_id<Category>(_amd_smi.device_id, _idx, i);
insert_event_and_sample(_is_enabled, pmc_name.c_str(), track_name.c_str(),
static_cast<double>(value));
}
};
// Insert VCN activity metrics
if(capabilities.flags.vcn_is_device_level_only)
{
auto dimension =
xcp_metrics.size() == 1 ? std::nullopt : std::make_optional<size_t>(idx);
insert_xcp_metrics(category::amd_smi_vcn_activity{}, is_vcn_enabled,
xcp_metrics[idx].vcn_busy, dimension);
insert_xcp_metrics(category::amd_smi_jpeg_activity{}, is_jpeg_enabled,
xcp_metrics[idx].jpeg_busy, dimension);
// Device-level: use vcn_activity vector
insert_decode_vector_metrics(category::amd_smi_vcn_activity{}, is_vcn_enabled,
gpu_metrics.vcn_activity, std::nullopt);
}
else
{
// Per-XCP: iterate through actual XCPs in vcn_busy
for(size_t xcp = 0; xcp < gpu_metrics.vcn_busy.size(); ++xcp)
{
insert_decode_vector_metrics(category::amd_smi_vcn_activity{},
is_vcn_enabled, gpu_metrics.vcn_busy[xcp],
xcp);
}
}
// Insert JPEG activity metrics
if(capabilities.flags.jpeg_is_device_level_only)
{
// Device-level: use jpeg_activity vector
insert_decode_vector_metrics(category::amd_smi_jpeg_activity{},
is_jpeg_enabled, gpu_metrics.jpeg_activity,
std::nullopt);
}
else
{
// Per-XCP: iterate through actual XCPs in jpeg_busy
for(size_t xcp = 0; xcp < gpu_metrics.jpeg_busy.size(); ++xcp)
{
insert_decode_vector_metrics(category::amd_smi_jpeg_activity{},
is_jpeg_enabled, gpu_metrics.jpeg_busy[xcp],
xcp);
}
}
// Insert XGMI metrics (scalar values)
insert_event_and_sample(
is_xgmi_enabled, trait::name<category::amd_smi_xgmi_link_width>::value,
info::annotate_with_device_id<category::amd_smi_xgmi_link_width>(
_amd_smi.device_id)
.c_str(),
gpu_metrics.xgmi_link_width);
insert_event_and_sample(
is_xgmi_enabled, trait::name<category::amd_smi_xgmi_link_speed>::value,
info::annotate_with_device_id<category::amd_smi_xgmi_link_speed>(
_amd_smi.device_id)
.c_str(),
gpu_metrics.xgmi_link_speed);
insert_xgmi_vector_metrics(category::amd_smi_xgmi_read_data{}, is_xgmi_enabled,
gpu_metrics.xgmi_read_data_acc, std::nullopt);
insert_xgmi_vector_metrics(category::amd_smi_xgmi_write_data{}, is_xgmi_enabled,
gpu_metrics.xgmi_write_data_acc, std::nullopt);
insert_event_and_sample(
is_pcie_enabled, trait::name<category::amd_smi_pcie_link_width>::value,
info::annotate_with_device_id<category::amd_smi_pcie_link_width>(
_amd_smi.device_id)
.c_str(),
gpu_metrics.pcie_link_width);
insert_event_and_sample(
is_pcie_enabled, trait::name<category::amd_smi_pcie_link_speed>::value,
info::annotate_with_device_id<category::amd_smi_pcie_link_speed>(
_amd_smi.device_id)
.c_str(),
gpu_metrics.pcie_link_speed);
insert_event_and_sample(
is_pcie_enabled, trait::name<category::amd_smi_pcie_bandwidth_acc>::value,
info::annotate_with_device_id<category::amd_smi_pcie_bandwidth_acc>(
_amd_smi.device_id)
.c_str(),
static_cast<double>(gpu_metrics.pcie_bandwidth_acc));
insert_event_and_sample(
is_pcie_enabled, trait::name<category::amd_smi_pcie_bandwidth_inst>::value,
info::annotate_with_device_id<category::amd_smi_pcie_bandwidth_inst>(
_amd_smi.device_id)
.c_str(),
static_cast<double>(gpu_metrics.pcie_bandwidth_inst));
};
}
+4 -2
查看文件
@@ -188,7 +188,9 @@ struct amd_smi_sample : storage_parsed_type_base
power,
mem_usage,
vcn_activity,
jpeg_activity
jpeg_activity,
xgmi,
pcie
};
uint64_t settings; // bitfield
@@ -200,7 +202,7 @@ struct amd_smi_sample : storage_parsed_type_base
uint32_t power;
int64_t temperature;
size_t mem_usage;
std::vector<uint8_t> xcp_activity;
std::vector<uint8_t> gpu_activity;
};
struct cpu_freq_sample : storage_parsed_type_base
+1 -1
查看文件
@@ -213,7 +213,7 @@ storage_parser::consume_storage()
_amd_smi_sample.gfx_activity, _amd_smi_sample.umc_activity,
_amd_smi_sample.mm_activity, _amd_smi_sample.power,
_amd_smi_sample.temperature, _amd_smi_sample.mem_usage,
_amd_smi_sample.xcp_activity);
_amd_smi_sample.gpu_activity);
invoke_callbacks(header.type, _amd_smi_sample);
break;
}
@@ -67,6 +67,14 @@ extern "C"
ROCPROFSYS_CATEGORY_AMD_SMI_MEMORY_USAGE,
ROCPROFSYS_CATEGORY_AMD_SMI_VCN_ACTIVITY,
ROCPROFSYS_CATEGORY_AMD_SMI_JPEG_ACTIVITY,
ROCPROFSYS_CATEGORY_AMD_SMI_XGMI_LINK_WIDTH,
ROCPROFSYS_CATEGORY_AMD_SMI_XGMI_LINK_SPEED,
ROCPROFSYS_CATEGORY_AMD_SMI_XGMI_READ_DATA,
ROCPROFSYS_CATEGORY_AMD_SMI_XGMI_WRITE_DATA,
ROCPROFSYS_CATEGORY_AMD_SMI_PCIE_LINK_WIDTH,
ROCPROFSYS_CATEGORY_AMD_SMI_PCIE_LINK_SPEED,
ROCPROFSYS_CATEGORY_AMD_SMI_PCIE_BANDWIDTH_ACC,
ROCPROFSYS_CATEGORY_AMD_SMI_PCIE_BANDWIDTH_INST,
ROCPROFSYS_CATEGORY_ROCM_RCCL,
ROCPROFSYS_CATEGORY_SAMPLING,
ROCPROFSYS_CATEGORY_PTHREAD,
+411 -142
查看文件
@@ -41,6 +41,7 @@
#include "core/config.hpp"
#include "core/debug.hpp"
#include "core/gpu.hpp"
#include "core/gpu_metrics.hpp"
#include "core/node_info.hpp"
#include "core/perfetto.hpp"
#include "core/state.hpp"
@@ -127,7 +128,7 @@ metadata_initialize_smi_tracks(size_t gpu_id)
}
};
if(gpu::is_vcn_activity_supported(gpu_id))
if(gpu::vcn_is_device_level_only(gpu_id))
{
add_vcn_track(std::nullopt);
}
@@ -139,7 +140,7 @@ metadata_initialize_smi_tracks(size_t gpu_id)
}
}
if(gpu::is_jpeg_activity_supported(gpu_id))
if(gpu::jpeg_is_device_level_only(gpu_id))
{
add_jpeg_track(std::nullopt);
}
@@ -150,6 +151,49 @@ metadata_initialize_smi_tracks(size_t gpu_id)
add_jpeg_track(xcp);
}
}
// Add XGMI tracks using specific categories for each metric type
trace_cache::get_metadata_registry().add_track(
{ trace_cache::info::annotate_with_device_id<category::amd_smi_xgmi_link_width>(
gpu_id),
thread_id, "{}" });
trace_cache::get_metadata_registry().add_track(
{ trace_cache::info::annotate_with_device_id<category::amd_smi_xgmi_link_speed>(
gpu_id),
thread_id, "{}" });
for(size_t i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i)
{
auto read_name =
trace_cache::info::annotate_with_device_id<category::amd_smi_xgmi_read_data>(
gpu_id, std::nullopt, i);
trace_cache::get_metadata_registry().add_track(
{ read_name.c_str(), thread_id, "{}" });
auto write_name =
trace_cache::info::annotate_with_device_id<category::amd_smi_xgmi_write_data>(
gpu_id, std::nullopt, i);
trace_cache::get_metadata_registry().add_track(
{ write_name.c_str(), thread_id, "{}" });
}
// Add PCIe tracks using specific categories for each metric
trace_cache::get_metadata_registry().add_track(
{ trace_cache::info::annotate_with_device_id<category::amd_smi_pcie_link_width>(
gpu_id),
thread_id, "{}" });
trace_cache::get_metadata_registry().add_track(
{ trace_cache::info::annotate_with_device_id<category::amd_smi_pcie_link_speed>(
gpu_id),
thread_id, "{}" });
trace_cache::get_metadata_registry().add_track(
{ trace_cache::info::annotate_with_device_id<
category::amd_smi_pcie_bandwidth_acc>(gpu_id),
thread_id, "{}" });
trace_cache::get_metadata_registry().add_track(
{ trace_cache::info::annotate_with_device_id<
category::amd_smi_pcie_bandwidth_inst>(gpu_id),
thread_id, "{}" });
}
void
@@ -250,7 +294,7 @@ metadata_initialize_smi_pmc(size_t gpu_id)
}
};
if(gpu::is_vcn_activity_supported(gpu_id))
if(gpu::vcn_is_device_level_only(gpu_id))
{
add_vcn_pmc(std::nullopt);
}
@@ -262,7 +306,7 @@ metadata_initialize_smi_pmc(size_t gpu_id)
}
}
if(gpu::is_jpeg_activity_supported(gpu_id))
if(gpu::jpeg_is_device_level_only(gpu_id))
{
add_jpeg_pmc(std::nullopt);
}
@@ -273,6 +317,75 @@ metadata_initialize_smi_pmc(size_t gpu_id)
add_jpeg_pmc(xcp);
}
}
// Add XGMI PMC info using specific categories for each metric type
trace_cache::get_metadata_registry().add_pmc_info(
{ agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
trait::name<category::amd_smi_xgmi_link_width>::value, "XgmiLinkWidth",
trait::name<category::amd_smi_xgmi_link_width>::description, LONG_DESCRIPTION,
COMPONENT, "bits", rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0,
0 });
trace_cache::get_metadata_registry().add_pmc_info(
{ agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
trait::name<category::amd_smi_xgmi_link_speed>::value, "XgmiLinkSpeed",
trait::name<category::amd_smi_xgmi_link_speed>::description, LONG_DESCRIPTION,
COMPONENT, "GT/s", rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0,
0 });
for(size_t i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i)
{
std::stringstream read_name_ss, read_symbol_ss;
read_name_ss << trait::name<category::amd_smi_xgmi_read_data>::value << "_" << i;
read_symbol_ss << "XgmiRead_" << i;
trace_cache::get_metadata_registry().add_pmc_info(
{ agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
read_name_ss.str(), read_symbol_ss.str(),
trait::name<category::amd_smi_xgmi_read_data>::description,
LONG_DESCRIPTION, COMPONENT, "KB", rocprofsys::trace_cache::ABSOLUTE, BLOCK,
EXPRESSION, 0, 0 });
std::stringstream write_name_ss, write_symbol_ss;
write_name_ss << trait::name<category::amd_smi_xgmi_write_data>::value << "_"
<< i;
write_symbol_ss << "XgmiWrite_" << i;
trace_cache::get_metadata_registry().add_pmc_info(
{ agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
write_name_ss.str(), write_symbol_ss.str(),
trait::name<category::amd_smi_xgmi_write_data>::description,
LONG_DESCRIPTION, COMPONENT, "KB", rocprofsys::trace_cache::ABSOLUTE, BLOCK,
EXPRESSION, 0, 0 });
}
// Add PCIe PMC info using specific categories for each metric
trace_cache::get_metadata_registry().add_pmc_info(
{ agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
trait::name<category::amd_smi_pcie_link_width>::value, "PcieLinkWidth",
trait::name<category::amd_smi_pcie_link_width>::description, LONG_DESCRIPTION,
COMPONENT, "", rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0, 0 });
trace_cache::get_metadata_registry().add_pmc_info(
{ agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
trait::name<category::amd_smi_pcie_link_speed>::value, "PcieLinkSpeed",
trait::name<category::amd_smi_pcie_link_speed>::description, LONG_DESCRIPTION,
COMPONENT, "GT/s", rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0,
0 });
trace_cache::get_metadata_registry().add_pmc_info(
{ agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
trait::name<category::amd_smi_pcie_bandwidth_acc>::value, "PcieBwAcc",
trait::name<category::amd_smi_pcie_bandwidth_acc>::description,
LONG_DESCRIPTION, COMPONENT, "MB", rocprofsys::trace_cache::ABSOLUTE, BLOCK,
EXPRESSION, 0, 0 });
trace_cache::get_metadata_registry().add_pmc_info(
{ agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
trait::name<category::amd_smi_pcie_bandwidth_inst>::value, "PcieBwInst",
trait::name<category::amd_smi_pcie_bandwidth_inst>::description,
LONG_DESCRIPTION, COMPONENT, "MB/s", rocprofsys::trace_cache::ABSOLUTE, BLOCK,
EXPRESSION, 0, 0 });
}
auto&
@@ -335,70 +448,21 @@ get_state()
}
std::vector<uint8_t>
serialize_xcp_metrics(const bool& use_vcn_activity, const bool& use_jpeg_activity,
const amdsmi_gpu_metrics_t& gpu_metrics)
serialize_gpu_metrics(uint32_t device_id, const data::gpu_metrics_t& metrics,
const gpu::gpu_metrics_capabilities_t& capabilities)
{
// Chunk:
// <vcn_data_0>..<vcn_data_[vcn_count]> // lower and higher byte
// <jpeg_data_0>..<jpeg_data_[jpeg_count]> // lower and higher byte
// Get settings for this device
auto settings = get_settings(device_id);
// Serialized:
// <is_vcn_supported>
// <is_jpeg_supported>
// <xcp_count>
// <vcn_count>
// <jpeg_count>
// Chunk_0
// ...
// Chunk_[xcp_count]
// Convert amd_smi::settings to gpu::gpu_metrics_settings_t
gpu::gpu_metrics_settings_t gpu_settings;
gpu_settings.vcn_activity = settings.vcn_activity;
gpu_settings.jpeg_activity = settings.jpeg_activity;
gpu_settings.xgmi = settings.xgmi;
gpu_settings.pcie = settings.pcie;
constexpr uint8_t vcn_count = AMDSMI_MAX_NUM_VCN;
constexpr uint8_t jpeg_count = AMDSMI_MAX_NUM_JPEG;
constexpr uint8_t xcp_count = AMDSMI_MAX_NUM_XCP;
constexpr size_t elem_size = sizeof(uint16_t) / sizeof(uint8_t);
constexpr uint8_t vector_size_header = sizeof(uint8_t);
constexpr uint8_t serialized_data_headers =
5 * vector_size_header; // is_vcn_supported + is_jpeg_supported + xcp_count +
// vcn_count + jpeg_count
constexpr size_t chunk_size = ((vcn_count + jpeg_count) * elem_size);
auto serialize_uint16_array = [](std::vector<uint8_t>& data, const uint16_t* arr,
int array_size) {
for(int i = 0; i < array_size; ++i)
{
data.push_back(static_cast<uint8_t>(arr[i] & 0xFF));
data.push_back(static_cast<uint8_t>((arr[i] >> 8) & 0xFF));
}
};
std::vector<uint8_t> result;
const bool is_vcn_jpeg_supported = (use_vcn_activity || use_jpeg_activity);
const size_t chunk_count = is_vcn_jpeg_supported ? 1 : xcp_count;
const size_t total_size = serialized_data_headers + (chunk_count * chunk_size);
result.reserve(total_size);
result.push_back((uint8_t) use_vcn_activity);
result.push_back((uint8_t) use_jpeg_activity);
result.push_back(chunk_count);
result.push_back(vcn_count);
result.push_back(jpeg_count);
for(size_t count = 0; count < chunk_count; ++count)
{
const auto* vcn_data =
(is_vcn_jpeg_supported ? gpu_metrics.vcn_activity
: gpu_metrics.xcp_stats[count].vcn_busy);
const auto* jpeg_data =
(is_vcn_jpeg_supported ? gpu_metrics.jpeg_activity
: gpu_metrics.xcp_stats[count].jpeg_busy);
serialize_uint16_array(result, vcn_data, vcn_count);
serialize_uint16_array(result, jpeg_data, jpeg_count);
}
return result;
// Use the shared serialization function
return gpu::serialize_gpu_metrics(metrics, capabilities, gpu_settings);
}
size_t
@@ -425,6 +489,12 @@ serialize_settings(uint32_t _device_id)
settings_bits.set(
static_cast<int>(trace_cache::amd_smi_sample::settings_positions::jpeg_activity),
settings.jpeg_activity);
settings_bits.set(
static_cast<int>(trace_cache::amd_smi_sample::settings_positions::xgmi),
settings.xgmi);
settings_bits.set(
static_cast<int>(trace_cache::amd_smi_sample::settings_positions::pcie),
settings.pcie);
return settings_bits.to_ulong();
}
@@ -446,7 +516,7 @@ data::sample(uint32_t _device_id)
auto _timestamp = tim::get_clock_real_now<size_t, std::nano>();
assert(_timestamp < std::numeric_limits<int64_t>::max());
amdsmi_gpu_metrics_t _gpu_metrics;
bool _vcn_or_jpeg_activity_enabled = false;
bool _gpu_metrics_needed = false;
auto _state = get_state().load();
@@ -487,68 +557,153 @@ data::sample(uint32_t _device_id)
#endif
ROCPROFSYS_AMDSMI_GET(get_settings(m_dev_id).mem_usage, amdsmi_get_gpu_memory_usage,
sample_handle, AMDSMI_MEM_TYPE_VRAM, &m_mem_usage);
_vcn_or_jpeg_activity_enabled =
get_settings(m_dev_id).vcn_activity || get_settings(m_dev_id).jpeg_activity;
ROCPROFSYS_AMDSMI_GET(_vcn_or_jpeg_activity_enabled, amdsmi_get_gpu_metrics_info,
sample_handle, &_gpu_metrics);
// Process metrics if either VCN or JPEG activity is enabled
if(_vcn_or_jpeg_activity_enabled)
// Check if GPU metrics are needed for VCN, JPEG, XGMI, or PCIe
_gpu_metrics_needed = get_settings(m_dev_id).vcn_activity ||
get_settings(m_dev_id).jpeg_activity ||
get_settings(m_dev_id).xgmi || get_settings(m_dev_id).pcie;
ROCPROFSYS_AMDSMI_GET(_gpu_metrics_needed, amdsmi_get_gpu_metrics_info, sample_handle,
&_gpu_metrics);
// Determine if basic metrics are enabled
bool _basic_metrics_enabled =
get_settings(m_dev_id).busy || get_settings(m_dev_id).temp ||
get_settings(m_dev_id).power || get_settings(m_dev_id).mem_usage;
// Process GPU metrics if needed
if(_gpu_metrics_needed || _basic_metrics_enabled)
{
// Helper lambda to fill busy metrics from a source array
auto fill_busy_metrics = [](auto& dest, const auto& src) {
for(const auto& val : src)
{
if(val != UINT16_MAX) dest.push_back(val);
}
};
gpu_metrics_t metrics;
bool has_data = false;
gpu::gpu_metrics_capabilities_t capabilities;
if(gpu::is_vcn_activity_supported(m_dev_id) &&
gpu::is_jpeg_activity_supported(m_dev_id))
if(_gpu_metrics_needed)
{
// Both VCN and JPEG are supported - create one entry with both metrics
xcp_metrics_t metrics;
fill_busy_metrics(metrics.vcn_busy, _gpu_metrics.vcn_activity);
fill_busy_metrics(metrics.jpeg_busy, _gpu_metrics.jpeg_activity);
if(!metrics.vcn_busy.empty() || !metrics.jpeg_busy.empty())
m_xcp_metrics.push_back(metrics);
}
else if(gpu::is_vcn_activity_supported(m_dev_id))
{
// Only VCN is supported
xcp_metrics_t metrics;
fill_busy_metrics(metrics.vcn_busy, _gpu_metrics.vcn_activity);
if(!metrics.vcn_busy.empty()) m_xcp_metrics.push_back(metrics);
}
else if(gpu::is_jpeg_activity_supported(m_dev_id))
{
// Only JPEG is supported
xcp_metrics_t metrics;
fill_busy_metrics(metrics.jpeg_busy, _gpu_metrics.jpeg_activity);
if(!metrics.jpeg_busy.empty()) m_xcp_metrics.push_back(metrics);
}
else
{
// Neither is supported - use XCP stats
// Each XCP gets one entry with both its VCN and JPEG metrics
for(const auto& xcp : _gpu_metrics.xcp_stats)
capabilities.flags.vcn_is_device_level_only =
gpu::vcn_is_device_level_only(m_dev_id);
capabilities.flags.jpeg_is_device_level_only =
gpu::jpeg_is_device_level_only(m_dev_id);
// Helper lambda that maps the maximum representable value of the argument's
// type (used here as the "unsupported" marker) to 0 and passes every other
// value through unchanged. The result has the same (decayed) type as the input.
auto filter_max_uint_value = [](const auto& value) {
    using ValueType = std::decay_t<decltype(value)>;
    if(value == std::numeric_limits<ValueType>::max())
    {
        return ValueType{ 0 };
    }
    return ValueType{ value };
};
// Appends every element of `src` that differs from `max_val` to `dest`,
// preserving order. Existing contents of `dest` are kept. Because filtered
// elements are dropped, positions in `dest` need not match positions in `src`.
auto fill_gpu_metrics = [](auto& dest, const auto& src, auto max_val) {
    for(auto itr = std::begin(src); itr != std::end(src); ++itr)
    {
        if(*itr == max_val) continue;
        dest.push_back(*itr);
    }
};
if(get_settings(m_dev_id).vcn_activity)
{
xcp_metrics_t metrics;
fill_busy_metrics(metrics.vcn_busy, xcp.vcn_busy);
fill_busy_metrics(metrics.jpeg_busy, xcp.jpeg_busy);
if(!metrics.vcn_busy.empty() || !metrics.jpeg_busy.empty())
m_xcp_metrics.push_back(metrics);
if(capabilities.flags.vcn_is_device_level_only)
{
fill_gpu_metrics(metrics.vcn_activity, _gpu_metrics.vcn_activity,
UINT16_MAX);
if(!metrics.vcn_activity.empty()) has_data = true;
}
else
{
for(const auto& xcp : _gpu_metrics.xcp_stats)
{
std::vector<uint16_t> xcp_vcn_data;
fill_gpu_metrics(xcp_vcn_data, xcp.vcn_busy, UINT16_MAX);
if(!xcp_vcn_data.empty())
{
metrics.vcn_busy.push_back(std::move(xcp_vcn_data));
has_data = true;
}
}
}
}
if(get_settings(m_dev_id).jpeg_activity)
{
if(capabilities.flags.jpeg_is_device_level_only)
{
fill_gpu_metrics(metrics.jpeg_activity, _gpu_metrics.jpeg_activity,
UINT16_MAX);
if(!metrics.jpeg_activity.empty()) has_data = true;
}
else
{
for(const auto& xcp : _gpu_metrics.xcp_stats)
{
std::vector<uint16_t> xcp_jpeg_data;
fill_gpu_metrics(xcp_jpeg_data, xcp.jpeg_busy, UINT16_MAX);
if(!xcp_jpeg_data.empty())
{
metrics.jpeg_busy.push_back(std::move(xcp_jpeg_data));
has_data = true;
}
}
}
}
// Process XGMI metrics if enabled
if(get_settings(m_dev_id).xgmi)
{
// Filter scalar values - returns 0 if unsupported (max value)
metrics.xgmi_link_width =
filter_max_uint_value(_gpu_metrics.xgmi_link_width);
metrics.xgmi_link_speed =
filter_max_uint_value(_gpu_metrics.xgmi_link_speed);
// Vector values filtered by fill_gpu_metrics
fill_gpu_metrics(metrics.xgmi_read_data_acc,
_gpu_metrics.xgmi_read_data_acc, UINT64_MAX);
fill_gpu_metrics(metrics.xgmi_write_data_acc,
_gpu_metrics.xgmi_write_data_acc, UINT64_MAX);
if(metrics.xgmi_link_width != 0 || metrics.xgmi_link_speed != 0 ||
!metrics.xgmi_read_data_acc.empty() ||
!metrics.xgmi_write_data_acc.empty())
{
has_data = true;
}
}
// Process PCIe metrics if enabled
if(get_settings(m_dev_id).pcie)
{
// Filter scalar values - returns 0 if unsupported (max value)
metrics.pcie_link_width =
filter_max_uint_value(_gpu_metrics.pcie_link_width);
metrics.pcie_link_speed =
filter_max_uint_value(_gpu_metrics.pcie_link_speed);
metrics.pcie_bandwidth_acc =
filter_max_uint_value(_gpu_metrics.pcie_bandwidth_acc);
metrics.pcie_bandwidth_inst =
filter_max_uint_value(_gpu_metrics.pcie_bandwidth_inst);
if(metrics.pcie_link_width != 0 || metrics.pcie_link_speed != 0 ||
metrics.pcie_bandwidth_acc != 0 || metrics.pcie_bandwidth_inst != 0)
{
has_data = true;
}
}
}
// Store samples if basic metrics are enabled OR if there's advanced metric data
if(_basic_metrics_enabled || has_data)
{
trace_cache::get_buffer_storage().store(
trace_cache::entry_type::amd_smi_sample, serialize_settings(m_dev_id),
_device_id, _timestamp, m_busy_perc.gfx_activity,
m_busy_perc.umc_activity, m_busy_perc.mm_activity,
m_power.current_socket_power, m_temp, m_mem_usage,
serialize_gpu_metrics(m_dev_id, metrics, capabilities));
if(has_data) m_gpu_metrics.push_back(metrics);
}
}
#undef ROCPROFSYS_AMDSMI_GET
trace_cache::get_buffer_storage().store(
trace_cache::entry_type::amd_smi_sample, serialize_settings(m_dev_id), _device_id,
_timestamp, m_busy_perc.gfx_activity, m_busy_perc.umc_activity,
m_busy_perc.mm_activity, m_power.current_socket_power, m_temp, m_mem_usage,
serialize_xcp_metrics(gpu::is_vcn_activity_supported(m_dev_id),
gpu::is_jpeg_activity_supported(m_dev_id), _gpu_metrics));
}
void
@@ -741,25 +896,28 @@ data::post_process(uint32_t _dev_id)
}
if(_settings.vcn_activity)
{
if(itr.m_xcp_metrics.empty())
if(itr.m_gpu_metrics.empty())
{
ROCPROFSYS_VERBOSE(
1, "No VCN activity data collected from device %u\n", _dev_id);
}
else if(gpu::is_vcn_activity_supported(_dev_id))
else if(gpu::vcn_is_device_level_only(_dev_id))
{
// For VCN activity, use simple indexing
for(std::size_t i = 0; i < std::size(itr.m_xcp_metrics[0].vcn_busy);
++i)
// For VCN activity supported: use vcn_activity vector
for(std::size_t i = 0;
i < std::size(itr.m_gpu_metrics[0].vcn_activity); ++i)
counter_track::emplace(_dev_id, addendum_blk(i, "VCN Activity"),
"%");
}
else
{
for(std::size_t xcp = 0; xcp < std::size(itr.m_xcp_metrics); ++xcp)
// For VCN activity NOT supported: use vcn_busy vector with per-XCP
// organization
for(size_t xcp = 0; xcp < itr.m_gpu_metrics[0].vcn_busy.size(); ++xcp)
{
for(std::size_t i = 0;
i < std::size(itr.m_xcp_metrics[xcp].vcn_busy); ++i)
// Loop through each XCP's VCN busy values
for(size_t i = 0; i < itr.m_gpu_metrics[0].vcn_busy[xcp].size();
++i)
{
counter_track::emplace(
_dev_id, addendum_blk(i, "VCN Activity", xcp), "%");
@@ -769,29 +927,73 @@ data::post_process(uint32_t _dev_id)
}
if(_settings.jpeg_activity)
{
if(itr.m_xcp_metrics.empty())
if(itr.m_gpu_metrics.empty())
{
ROCPROFSYS_VERBOSE(
1, "No JPEG activity data collected from device %u\n", _dev_id);
}
else if(gpu::is_jpeg_activity_supported(_dev_id))
else if(gpu::jpeg_is_device_level_only(_dev_id))
{
for(std::size_t i = 0; i < std::size(itr.m_xcp_metrics[0].jpeg_busy);
++i)
// For JPEG activity supported: use jpeg_activity vector
for(std::size_t i = 0;
i < std::size(itr.m_gpu_metrics[0].jpeg_activity); ++i)
counter_track::emplace(_dev_id, addendum_blk(i, "JPEG Activity"),
"%");
}
else
{
for(std::size_t xcp = 0; xcp < std::size(itr.m_xcp_metrics); ++xcp)
// For JPEG activity NOT supported: use jpeg_busy vector with per-XCP
// organization
for(size_t xcp = 0; xcp < itr.m_gpu_metrics[0].jpeg_busy.size();
++xcp)
{
for(std::size_t i = 0;
i < std::size(itr.m_xcp_metrics[xcp].jpeg_busy); ++i)
// Loop through each XCP's JPEG busy values
for(size_t i = 0; i < itr.m_gpu_metrics[0].jpeg_busy[xcp].size();
++i)
{
counter_track::emplace(
_dev_id, addendum_blk(i, "JPEG Activity", xcp), "%");
}
}
}
}
if(_settings.xgmi)
{
if(itr.m_gpu_metrics.empty())
{
ROCPROFSYS_VERBOSE(
1, "No XGMI activity data collected from device %u\n", _dev_id);
}
else
{
counter_track::emplace(_dev_id, addendum("XGMI Link Width"), "bits");
counter_track::emplace(_dev_id, addendum("XGMI Link Speed"), "GT/s");
for(std::size_t i = 0;
i < std::size(itr.m_gpu_metrics[0].xgmi_read_data_acc); ++i)
counter_track::emplace(_dev_id, addendum_blk(i, "XGMI Read Data"),
"KB");
for(std::size_t i = 0;
i < std::size(itr.m_gpu_metrics[0].xgmi_write_data_acc); ++i)
counter_track::emplace(_dev_id,
addendum_blk(i, "XGMI Write Data"), "KB");
}
}
if(_settings.pcie)
{
if(itr.m_gpu_metrics.empty())
{
ROCPROFSYS_VERBOSE(
1, "No PCIe activity data collected from device %u\n", _dev_id);
}
else
{
counter_track::emplace(_dev_id, addendum("PCIe Link Width"), "");
counter_track::emplace(_dev_id, addendum("PCIe Link Speed"), "GT/s");
counter_track::emplace(_dev_id, addendum("PCIe Bandwidth Acc"), "MB");
counter_track::emplace(_dev_id, addendum("PCIe Bandwidth Inst"),
"MB/s");
}
}
};
auto write_perfetto_metrics = [&]() {
@@ -822,32 +1024,97 @@ data::post_process(uint32_t _dev_id)
counter_track::at(_dev_id, track_index++), _ts, _usage);
}
if(_settings.vcn_activity && !itr.m_xcp_metrics.empty())
if(_settings.vcn_activity && !itr.m_gpu_metrics.empty())
{
// Iterate over all XCPs and their VCN busy/activity values
for(const auto& metrics : itr.m_xcp_metrics)
if(gpu::vcn_is_device_level_only(_dev_id))
{
for(const auto& vcn_val : metrics.vcn_busy)
// Device-level VCN activity
for(const auto& vcn_val : itr.m_gpu_metrics[0].vcn_activity)
{
TRACE_COUNTER("device_vcn_activity",
counter_track::at(_dev_id, track_index++), _ts,
vcn_val);
}
}
else
{
// XCP-level VCN busy (per-XCP organization)
for(const auto& xcp_data : itr.m_gpu_metrics[0].vcn_busy)
{
for(const auto& vcn_val : xcp_data)
{
TRACE_COUNTER("device_vcn_activity",
counter_track::at(_dev_id, track_index++), _ts,
vcn_val);
}
}
}
}
if(_settings.jpeg_activity && !itr.m_xcp_metrics.empty())
if(_settings.jpeg_activity && !itr.m_gpu_metrics.empty())
{
// Iterate over all XCPs and their JPEG busy/activity values
for(const auto& metrics : itr.m_xcp_metrics)
if(gpu::jpeg_is_device_level_only(_dev_id))
{
for(const auto& jpeg_val : metrics.jpeg_busy)
// Device-level JPEG activity
for(const auto& jpeg_val : itr.m_gpu_metrics[0].jpeg_activity)
{
TRACE_COUNTER("device_jpeg_activity",
counter_track::at(_dev_id, track_index++), _ts,
jpeg_val);
}
}
else
{
// XCP-level JPEG busy (per-XCP organization)
for(const auto& xcp_data : itr.m_gpu_metrics[0].jpeg_busy)
{
for(const auto& jpeg_val : xcp_data)
{
TRACE_COUNTER("device_jpeg_activity",
counter_track::at(_dev_id, track_index++), _ts,
jpeg_val);
}
}
}
}
if(_settings.xgmi && !itr.m_gpu_metrics.empty())
{
TRACE_COUNTER("device_xgmi_link_width",
counter_track::at(_dev_id, track_index++), _ts,
itr.m_gpu_metrics[0].xgmi_link_width);
TRACE_COUNTER("device_xgmi_link_speed",
counter_track::at(_dev_id, track_index++), _ts,
itr.m_gpu_metrics[0].xgmi_link_speed);
for(const auto& read_val : itr.m_gpu_metrics[0].xgmi_read_data_acc)
{
TRACE_COUNTER("device_xgmi_read_data",
counter_track::at(_dev_id, track_index++), _ts,
read_val);
}
for(const auto& write_val : itr.m_gpu_metrics[0].xgmi_write_data_acc)
{
TRACE_COUNTER("device_xgmi_write_data",
counter_track::at(_dev_id, track_index++), _ts,
write_val);
}
}
if(_settings.pcie && !itr.m_gpu_metrics.empty())
{
TRACE_COUNTER("device_pcie_link_width",
counter_track::at(_dev_id, track_index++), _ts,
itr.m_gpu_metrics[0].pcie_link_width);
TRACE_COUNTER("device_pcie_link_speed",
counter_track::at(_dev_id, track_index++), _ts,
itr.m_gpu_metrics[0].pcie_link_speed);
TRACE_COUNTER("device_pcie_bandwidth_acc",
counter_track::at(_dev_id, track_index++), _ts,
itr.m_gpu_metrics[0].pcie_bandwidth_acc);
TRACE_COUNTER("device_pcie_bandwidth_inst",
counter_track::at(_dev_id, track_index++), _ts,
itr.m_gpu_metrics[0].pcie_bandwidth_inst);
}
};
@@ -951,6 +1218,8 @@ setup()
key_pair_t{ "mem_usage", get_settings(itr).mem_usage },
key_pair_t{ "vcn_activity", get_settings(itr).vcn_activity },
key_pair_t{ "jpeg_activity", get_settings(itr).jpeg_activity },
key_pair_t{ "xgmi", get_settings(itr).xgmi },
key_pair_t{ "pcie", get_settings(itr).pcie },
};
// Initialize all metrics to false
+6 -6
查看文件
@@ -31,6 +31,7 @@
#include "core/common.hpp"
#include "core/components/fwd.hpp"
#include "core/defines.hpp"
#include "core/gpu_metrics.hpp"
#include "core/state.hpp"
#include "library/thread_data.hpp"
@@ -78,6 +79,8 @@ struct settings
bool mem_usage = true;
bool vcn_activity = true;
bool jpeg_activity = true;
bool xgmi = true;
bool pcie = true;
};
struct data
@@ -93,11 +96,8 @@ struct data
using mem_usage_t = uint64_t;
using temp_t = int64_t;
struct xcp_metrics_t
{
std::vector<uint16_t> vcn_busy;
std::vector<uint16_t> jpeg_busy;
};
// Use the shared gpu_metrics_t from core/gpu_metrics.hpp
using gpu_metrics_t = rocprofsys::gpu::gpu_metrics_t;
ROCPROFSYS_DEFAULT_OBJECT(data)
@@ -112,7 +112,7 @@ struct data
timestamp_t m_ts = 0;
temp_t m_temp = 0;
mem_usage_t m_mem_usage = 0;
std::vector<xcp_metrics_t> m_xcp_metrics = {};
std::vector<gpu_metrics_t> m_gpu_metrics = {};
#if ROCPROFSYS_USE_ROCM > 0
amdsmi_engine_usage_t m_busy_perc = {};
amdsmi_power_info_t m_power = {};
+1
查看文件
@@ -46,6 +46,7 @@ include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-annotate-tests.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-causal-tests.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-python-tests.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-decode-tests.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-gpu-connect-tests.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-nic-perf.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-roctx-tests.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/rocprof-sys-rocm-hip-stream.cmake)
@@ -0,0 +1,100 @@
{
"required_tables": [
{
"min_rows": 1,
"name_prefix": "rocpd_info_pmc",
"required_columns": [
"agent_id",
"target_arch",
"name",
"symbol",
"description",
"units",
"value_type"
],
"validation_queries": [
{
"comparison": "greater_than",
"description": "Check for Xgmi amd-smi metrics",
"error_message": "Did not find Xgmi data in amd-smi metrics",
"expected_result": 1,
"query": "SELECT COUNT(*) as count FROM {table_name} WHERE symbol LIKE 'Xgmi%'"
},
{
"comparison": "greater_than",
"description": "Check for Pcie amd-smi metrics",
"error_message": "Did not find Pcie data in amd-smi metrics",
"expected_result": 1,
"query": "SELECT COUNT(*) as count FROM {table_name} WHERE symbol LIKE 'Pcie%'"
}
]
},
{
"min_rows": 500,
"name_prefix": "rocpd_pmc_event",
"required_columns": [
"event_id",
"pmc_id",
"value"
],
"validation_queries": [
{
"comparison": "greater_than",
"description": "Check for amd-smi XGMI link speed samples",
"error_message": "Less than expected number of captured amd-smi xgmi link speed samples!",
"expected_result": 100,
"query": "SELECT COUNT(*) as count FROM {table_name} event JOIN rocpd_info_pmc info ON event.pmc_id = info.id WHERE info.name = 'device_xgmi_link_speed'"
},
{
"comparison": "greater_than",
"description": "Check for amd-smi XGMI link width samples",
"error_message": "Less than expected number of captured amd-smi xgmi link width samples!",
"expected_result": 100,
"query": "SELECT COUNT(*) as count FROM {table_name} event JOIN rocpd_info_pmc info ON event.pmc_id = info.id WHERE info.name = 'device_xgmi_link_width'"
},
{
"comparison": "greater_than",
"description": "Check for amd-smi XGMI read data samples",
"error_message": "Less than expected number of captured amd-smi xgmi read data samples!",
"expected_result": 100,
"query": "SELECT COUNT(*) as count FROM {table_name} event JOIN rocpd_info_pmc info ON event.pmc_id = info.id WHERE info.name LIKE 'device_xgmi_read_data%'"
},
{
"comparison": "greater_than",
"description": "Check for amd-smi XGMI write data samples",
"error_message": "Less than expected number of captured amd-smi xgmi write data samples!",
"expected_result": 100,
"query": "SELECT COUNT(*) as count FROM {table_name} event JOIN rocpd_info_pmc info ON event.pmc_id = info.id WHERE info.name LIKE 'device_xgmi_write_data%'"
},
{
"comparison": "greater_than",
"description": "Check for amd-smi PCIe instantaneous bandwidth samples",
"error_message": "Less than expected number of captured amd-smi pcie bandwidth instantaneous samples!",
"expected_result": 100,
"query": "SELECT COUNT(*) as count FROM {table_name} event JOIN rocpd_info_pmc info ON event.pmc_id = info.id WHERE info.name = 'device_pcie_bandwidth_inst'"
},
{
"comparison": "greater_than",
"description": "Check for amd-smi PCIe accumulated bandwidth samples",
"error_message": "Less than expected number of captured amd-smi pcie bandwidth accumulated samples!",
"expected_result": 100,
"query": "SELECT COUNT(*) as count FROM {table_name} event JOIN rocpd_info_pmc info ON event.pmc_id = info.id WHERE info.name = 'device_pcie_bandwidth_acc'"
},
{
"comparison": "greater_than",
"description": "Check for amd-smi PCIe link speed samples",
"error_message": "Less than expected number of captured amd-smi pcie link speed samples!",
"expected_result": 100,
"query": "SELECT COUNT(*) as count FROM {table_name} event JOIN rocpd_info_pmc info ON event.pmc_id = info.id WHERE info.name = 'device_pcie_link_speed'"
},
{
"comparison": "greater_than",
"description": "Check for amd-smi PCIe link width samples",
"error_message": "Less than expected number of captured amd-smi pcie link width samples!",
"expected_result": 100,
"query": "SELECT COUNT(*) as count FROM {table_name} event JOIN rocpd_info_pmc info ON event.pmc_id = info.id WHERE info.name = 'device_pcie_link_width'"
}
]
}
]
}
@@ -0,0 +1,93 @@
{
"required_tables": [
{
"commit": "Validation rules for hip_api",
"name": "events_args",
"required_columns": [
"event_id",
"category",
"stack_id",
"parent_stack_id",
"correlation_id"
],
"validation_queries": [
{
"comparison": "greater_than",
"description": "Verify that 'rocm_hip_api' appears in category more than 100 times in table events_args",
"error_message": "'rocm_hip_api' category entries are fewer than expected in events_args",
"expected_result": 100,
"query": "SELECT COUNT(*) FROM events_args WHERE category = 'rocm_hip_api';"
},
{
"comparison": "equals",
"description": "Check for missing category entries",
"error_message": "Empty or NULL category entries found in events_args",
"expected_result": 0,
"query": "SELECT COUNT(*) FROM events_args WHERE category IS NULL OR TRIM(category) = '';"
}
]
},
{
"commit": "Validation rules for hip_api",
"name": "regions",
"required_columns": [
"id",
"guid",
"category",
"name"
],
"validation_queries": [
{
"comparison": "greater_than",
"description": "Verify that 'rocm_hip_api' appears in category more than 50 times in table regions",
"error_message": "'rocm_hip_api' category entries are fewer than expected in regions",
"expected_result": 50,
"query": "SELECT COUNT(*) FROM regions WHERE category = 'rocm_hip_api';"
},
{
"comparison": "equals",
"description": "Ensure there are no HIP API calls that last 0 seconds",
"error_message": "Found HIP API captures where duration is 0",
"expected_result": 0,
"query": "SELECT COUNT(*) FROM regions WHERE category = 'rocm_hip_api' AND duration = 0;"
},
{
"comparison": "equals",
"description": "Check for any NULL values in the 'name' column of regions",
"error_message": "NULL entries found in the name column of regions",
"expected_result": 0,
"query": "SELECT COUNT(*) FROM regions WHERE name IS NULL;"
}
]
},
{
"name": "rocpd_info_agent",
"required_columns": [
"id",
"guid",
"nid",
"pid",
"type",
"name"
],
"validation_queries": [
{
"comparison": "greater_than",
"description": "Check that we have GPU agents detected",
"error_message": "No GPU agents found",
"expected_result": 0,
"query": "SELECT COUNT(*) as count FROM rocpd_info_agent WHERE type = 'GPU'"
},
{
"comparison": "equals",
"description": "Check for NULL agent names",
"error_message": "Found agents with NULL names",
"expected_result": 0,
"query": "SELECT COUNT(*) as count FROM rocpd_info_agent WHERE name IS NULL"
}
]
}
]
}
+96
查看文件
@@ -0,0 +1,96 @@
# MIT License
#
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# -------------------------------------------------------------------------------------- #
#
# GPU connectivity tests (transferBench)
#
# -------------------------------------------------------------------------------------- #
# Environment handed to the profiled transferBench run. Besides busy/temp/power it
# requests the "xgmi" and "pcie" AMD SMI metric groups that the validation rules
# below check for.
set(_gpu_connect_environment
    "ROCPROFSYS_ROCM_DOMAINS=hip_runtime_api"
    "ROCPROFSYS_AMD_SMI_METRICS=busy,temp,power,xgmi,pcie"
    "ROCPROFSYS_SAMPLING_CPUS=none"
    "ROCPROFSYS_USE_SAMPLING=OFF"
    "ROCPROFSYS_PROCESS_SAMPLING_FREQ=10"
    "ROCPROFSYS_CPU_FREQ_ENABLED=OFF"
)

# Rule files passed to the ROCPD validation test via --validation-rules.
set(_gpu_connect_rocpd_validation_rules
    "${CMAKE_CURRENT_LIST_DIR}/rocpd-validation-rules/gpu-connect/validation-rules.json"
    "${CMAKE_CURRENT_LIST_DIR}/rocpd-validation-rules/gpu-connect/amd-smi-rules.json"
)

# Enable ROCPD for tests only if valid ROCm is installed and a valid GPU is detected.
# The expansions are quoted on purpose: an unset or empty variable then evaluates to
# false instead of collapsing the expression into a malformed `if(AND ...)`, which
# would abort configuration. The result is computed once and reused further down so
# the environment and the ROCPD validation test can never disagree.
set(_gpu_connect_use_rocpd OFF)
if("${ENABLE_ROCPD_TEST}" AND "${_VALID_GPU}")
    set(_gpu_connect_use_rocpd ON)
    list(APPEND _gpu_connect_environment "ROCPROFSYS_USE_ROCPD=ON")
endif()

# Configure-time probe: if a transferBench binary is already present in the build
# tree, run it once and look for the "no valid transfers" error. stdout and stderr
# are captured into the same variable, so the message is found regardless of the
# stream it is printed on. When the binary does not exist yet (e.g. a fresh build
# tree) the probe is skipped and the validation tests are registered.
set(skip_validation FALSE)
if(EXISTS "${PROJECT_BINARY_DIR}/transferBench")
    execute_process(
        COMMAND "${PROJECT_BINARY_DIR}/transferBench"
        OUTPUT_VARIABLE _transfer_output
        ERROR_VARIABLE _transfer_output
        # The exit code is captured but not inspected; the decision below relies
        # solely on the captured text.
        RESULT_VARIABLE _transfer_result
        OUTPUT_STRIP_TRAILING_WHITESPACE
        ERROR_STRIP_TRAILING_WHITESPACE
    )
    if(_transfer_output MATCHES "Error: No valid transfers created")
        set(skip_validation TRUE)
    endif()
endif()

# Only the sys-run variant is registered (baseline, rewrite, sampling and runtime
# instrumentation are all skipped). The skip regex uses the same error text as the
# configure-time probe above.
rocprofiler_systems_add_test(
    SKIP_BASELINE SKIP_REWRITE SKIP_SAMPLING SKIP_RUNTIME
    NAME transferbench
    TARGET transferBench
    GPU ON
    ENVIRONMENT "${_base_environment};${_gpu_connect_environment}"
    LABELS "transferbench;xgmi;pcie"
    SYS_RUN_SKIP_REGEX "Error: No valid transfers created"
)

if(skip_validation)
    message(STATUS "TransferBench: No valid transfers created, skipping tests")
else()
    # Perfetto validation: checks the trace for the XGMI read/write data counters.
    rocprofiler_systems_add_validation_test(
        NAME transferbench-sys-run
        PERFETTO_FILE "perfetto-trace.proto"
        LABELS "transferbench;perfetto"
        ARGS --counter-names "XGMI Read Data" "XGMI Write Data" -p
    )

    if(_gpu_connect_use_rocpd)
        # Tag the profiling run itself as a ROCPD test, then validate the database
        # it produces against the gpu-connect rule files.
        set_property(TEST transferbench-sys-run APPEND PROPERTY LABELS rocpd)
        rocprofiler_systems_add_validation_test(
            NAME transferbench-sys-run
            ROCPD_FILE "rocpd.db"
            LABELS "transferbench;rocpd"
            ARGS --validation-rules
                 ${_gpu_connect_rocpd_validation_rules}
        )
    endif()
endif()
+37 -6
查看文件
@@ -530,6 +530,7 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
REWRITE
REWRITE_RUN
BASELINE
SYS_RUN
)
foreach(_TYPE PASS FAIL SKIP)
list(APPEND _REGEX_OPTS "${_PREFIX}_${_TYPE}_REGEX")
@@ -548,8 +549,8 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
cmake_parse_arguments(
TEST
"SKIP_BASELINE;SKIP_SAMPLING;SKIP_REWRITE;SKIP_RUNTIME"
"NAME;TARGET;MPI;GPU;NUM_PROCS;SAMPLING_TIMEOUT;REWRITE_TIMEOUT;RUNTIME_TIMEOUT;WILL_FAIL;DISABLED"
"SKIP_BASELINE;SKIP_SAMPLING;SKIP_REWRITE;SKIP_RUNTIME;SKIP_SYS_RUN"
"NAME;TARGET;MPI;GPU;NUM_PROCS;SAMPLING_TIMEOUT;REWRITE_TIMEOUT;RUNTIME_TIMEOUT;SYS_RUN_TIMEOUT;WILL_FAIL;DISABLED"
"${_KWARGS}"
${ARGN}
)
@@ -561,6 +562,7 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
REWRITE
REWRITE_RUN
BASELINE
SYS_RUN
)
if("${${_PREFIX}_FAIL_REGEX}" STREQUAL "")
set(${_PREFIX}_FAIL_REGEX "(${ROCPROFSYS_ABORT_FAIL_REGEX})")
@@ -601,6 +603,10 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
set(TEST_SAMPLING_TIMEOUT 120)
endif()
if(NOT TEST_SYS_RUN_TIMEOUT)
set(TEST_SYS_RUN_TIMEOUT 300)
endif()
if(NOT TEST_DISABLED)
set(TEST_DISABLED OFF)
endif()
@@ -711,6 +717,16 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
)
endif()
if(NOT TEST_SKIP_SYS_RUN)
add_test(
NAME ${TEST_NAME}-sys-run
COMMAND
${COMMAND_PREFIX} $<TARGET_FILE:rocprofiler-systems-run> --
$<TARGET_FILE:${TEST_TARGET}> ${TEST_RUN_ARGS}
WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
)
endif()
if(TEST ${TEST_NAME}-binary-rewrite-run)
set_tests_properties(
${TEST_NAME}-binary-rewrite-run
@@ -725,10 +741,17 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
binary-rewrite
binary-rewrite-run
runtime-instrument
sys-run
)
string(REGEX REPLACE "-run(-|/)" "\\1" _prefix "${TEST_NAME}-${_TEST}/")
string(
REGEX REPLACE
"rewrite-run(-|/)"
"rewrite\\1"
_prefix
"${TEST_NAME}-${_TEST}/"
)
set(_labels "${_TEST}")
string(REPLACE "-run" "" _labels "${_TEST}")
string(REPLACE "rewrite-run" "rewrite" _labels "${_TEST}")
if(TEST_TARGET)
list(APPEND _labels "${TEST_TARGET}")
endif()
@@ -748,10 +771,12 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
set(_timeout ${TEST_SAMPLING_TIMEOUT})
elseif("${_TEST}" MATCHES "runtime-instrument")
set(_timeout ${TEST_RUNTIME_TIMEOUT})
elseif("${_TEST}" MATCHES "sys-run")
set(_timeout ${TEST_SYS_RUN_TIMEOUT})
endif()
set(_props)
if("${_TEST}" MATCHES "run|sampling|baseline")
if("${_TEST}" MATCHES "sys-run|sampling|baseline")
set(_props ${TEST_PROPERTIES})
if(NOT "RUN_SERIAL" IN_LIST _props)
list(APPEND _props RUN_SERIAL ON)
@@ -768,11 +793,17 @@ function(ROCPROFILER_SYSTEMS_ADD_TEST)
set(_REGEX_VAR BASELINE)
elseif("${_TEST}" MATCHES "sampling")
set(_REGEX_VAR SAMPLING)
elseif("${_TEST}" MATCHES "sys-run")
set(_REGEX_VAR SYS_RUN)
else()
set(_REGEX_VAR)
endif()
if("${_TEST}" MATCHES "binary-rewrite-run|runtime-instrument|sampling")
if(
"${_TEST}"
MATCHES
"binary-rewrite-run|runtime-instrument|sampling|sys-run"
)
rocprofiler_systems_patch_sanitizer_environment(_environ)
endif()