[rocprofiler-compute] Fix for multi process workload profiling (#2418)

* Fix for multi process workload profiling

Native counter collection tool updates:
    * Do not dump empty counter data for a process
    * Use PID instead of UUID for dumped csv files to facilitate correlation
    * Handle merging multiple pairs of rocpd (from sdk tool) and csv (from
      native tool) files
    * Handle merging multiple pairs of csv (from sdk tool) and csv (from
      native tool) files

Rocpd output format updates:
    * Merge multiple rocpd databases into a single csv
    * Reset dispatch id and kernel id for unique dispatches and unique
      kernels respectively
    * Retain multiple rocpd databases per run for multi process workloads

* Add test case for multiprocess profiling using rocflop workload

* Add rocflop

* Fix native counter csv to rocprofv3 csv conversion

* Use kernel_id instead of dispatch_id to correlate native counter csv
  and kernel trace csv

* python formatting using ruff 0.14 instead of 0.13
This commit is contained in:
vedithal-amd
2025-12-23 13:12:18 -05:00
zatwierdzone przez GitHub
rodzic 3e49440495
commit 588773f9bf
13 zmienionych plików z 880 dodań i 144 usunięć
@@ -45,6 +45,9 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
* Fix the check to prevent showing table where a column is full of N/A
* Improve detection of empty values when metric evaluation fails due to missing counter data
* Fix the wrong logic in native counter csv to rocprofv3 csv conversion
* Use kernel_id instead of dispatch_id to correlate native counter csv and kernel trace csv
### Removed
* Removed "VL1 Lat" metric for AMD Instinct MI300 series GPUs, due to MI300 series not supporting TCP_TCP_LATENCY_sum counter.
@@ -748,6 +748,7 @@ if(INSTALL_TESTS)
tests/hip_dynamic_shared
tests/laplace_eqn
tests/mat_mul_max
tests/rocflop
DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/${PROJECT_NAME}/tests
COMPONENT tests
)
@@ -0,0 +1,681 @@
// Copied from https://github.com/benrichard-amd/rocflop/tree/82f197e12314bab694fc70451a2b495b4f51bf90
#include <algorithm>
#include <cassert>
#include <cerrno>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <vector>
#include <fcntl.h>
#include <sys/wait.h>
#include <unistd.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
// Half-precision scalar type used by the FP16 tests.
using float16 = _Float16;
// Vector types. Useful for packed math (where supported) and MFMA inputs.
// vecT<T, Rank> is a compiler extended vector of Rank elements of type T.
template<typename T, uint32_t Rank>
using vecT = T __attribute__((ext_vector_type(Rank)));
// Convenience aliases for the 4- and 8-element vectors used by the kernels.
template<typename T> using vec4 = vecT<T, 4>;
template<typename T> using vec8 = vecT<T, 8>;
// Kernels
// Vector ALU throughput kernel: a long chain of multiply-adds on four
// independent vec4<T> values per thread.
//
// Launch layout: 1D grid of 1D blocks. With grid_size = gridDim.x * blockDim.x
// and tid the flat thread index, every thread reads buffer[r * grid_size + tid]
// for r = 0..3, so `buffer` must hold at least 4 * grid_size elements.
//
// buffer  in/out: inputs as described above; the sum of the four results is
//         written to buffer[tid] (presumably so the arithmetic cannot be
//         discarded by the compiler).
// count   number of outer iterations; each performs 64 inner iterations of
//         16 scalar multiply-adds (4 vec4 operations).
template<typename T> __global__ void fma_throughput(vec4<T>* buffer, int count)
{
    const T k = 1.0;
    const int grid_size = gridDim.x * blockDim.x;
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;

    vec4<T>* ptr = buffer;
    vec4<T> value0 = ptr[0 * grid_size + tid];
    vec4<T> value1 = ptr[1 * grid_size + tid];
    vec4<T> value2 = ptr[2 * grid_size + tid];
    vec4<T> value3 = ptr[3 * grid_size + tid];

    // The outer index is named `i` (as in the other kernels of this file) so
    // that the inner `j` no longer shadows it.
    for(int i = 0; i < count; i++) {
        for(int j = 0; j < 64; j++) {
            // 16 FMA ops
            value0 = value0 * value0 + k;
            value1 = value1 * value1 + k;
            value2 = value2 * value2 + k;
            value3 = value3 * value3 + k;
        }
    }

    ptr[tid] = value0 + value1 + value2 + value3;
}
// FP16 MFMA throughput kernel: issues four independent chains of
// __builtin_amdgcn_mfma_f32_16x16x16f16 per thread.
//
// Launch layout: 1D grid of 1D blocks. With grid_size = gridDim.x * blockDim.x
// and tid the flat thread index, every thread reads inputs[r * grid_size + tid]
// for r = 0..3 and writes outputs[tid].
//
// inputs   at least 4 * grid_size vec4<float16> elements.
// outputs  at least grid_size vec4<float> elements; receives the sum of the
//          four accumulators.
// count    number of outer iterations; each performs 64 inner iterations of
//          4 MFMA instructions.
//
// Requires hardware with MFMA instructions; the host code in this file gates
// the launch on the architecture reported by get_gcn_arch().
__global__ void matmul_fp16_throughput(vec4<float16>* inputs, vec4<float>* outputs, int count)
{
    int grid_size = gridDim.x * blockDim.x;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;

    vec4<float16>* ptr = inputs;
    vec4<float16> value0 = ptr[0 * grid_size + tid];
    vec4<float16> value1 = ptr[1 * grid_size + tid];
    vec4<float16> value2 = ptr[2 * grid_size + tid];
    vec4<float16> value3 = ptr[3 * grid_size + tid];

    // The accumulators feed the first MFMA as its C operand, so they must hold
    // defined values; start every chain from zero.
    vec4<float> accum0 = {0.0f, 0.0f, 0.0f, 0.0f};
    vec4<float> accum1 = {0.0f, 0.0f, 0.0f, 0.0f};
    vec4<float> accum2 = {0.0f, 0.0f, 0.0f, 0.0f};
    vec4<float> accum3 = {0.0f, 0.0f, 0.0f, 0.0f};

    for(int i = 0; i < count; i++) {
        for(int j = 0; j < 64; j++) {
            // 4 MFMA ops
            accum0 = __builtin_amdgcn_mfma_f32_16x16x16f16(value0, value0, accum0, 0, 0, 0);
            accum1 = __builtin_amdgcn_mfma_f32_16x16x16f16(value1, value1, accum1, 0, 0, 0);
            accum2 = __builtin_amdgcn_mfma_f32_16x16x16f16(value2, value2, accum2, 0, 0, 0);
            accum3 = __builtin_amdgcn_mfma_f32_16x16x16f16(value3, value3, accum3, 0, 0, 0);
        }
    }

    outputs[tid] = accum0 + accum1 + accum2 + accum3;
}
// Sparse FP16 matrix throughput kernel: issues four independent chains of
// __builtin_amdgcn_smfmac_f32_16x16x32_f16 per thread.
//
// Launch layout: 1D grid of 1D blocks. With grid_size = gridDim.x * blockDim.x
// and tid the flat thread index, every thread reads input0[r * grid_size + tid]
// and input1[r * grid_size + tid] for r = 0..3 and writes outputs[tid].
//
// input0   at least 4 * grid_size vec4<float16> elements (first operand).
// input1   at least 4 * grid_size vec8<float16> elements (second operand).
// outputs  at least grid_size vec4<float> elements; receives the sum of the
//          four accumulators.
// count    number of outer iterations; each performs 64 inner iterations of
//          4 SMFMAC instructions.
//
// Requires hardware with SMFMAC instructions; the host code in this file gates
// the launch on the architecture reported by get_gcn_arch().
__global__ void sparse_matmul_fp16_throughput(vec4<float16>* input0, vec8<float16>* input1, vec4<float>* outputs, int count)
{
    int grid_size = gridDim.x * blockDim.x;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;

    vec4<float16>* x_ptr = input0;
    vec8<float16>* y_ptr = input1;

    vec4<float16> x0 = x_ptr[0 * grid_size + tid];
    vec4<float16> x1 = x_ptr[1 * grid_size + tid];
    vec4<float16> x2 = x_ptr[2 * grid_size + tid];
    vec4<float16> x3 = x_ptr[3 * grid_size + tid];

    vec8<float16> y0 = y_ptr[0 * grid_size + tid];
    vec8<float16> y1 = y_ptr[1 * grid_size + tid];
    vec8<float16> y2 = y_ptr[2 * grid_size + tid];
    vec8<float16> y3 = y_ptr[3 * grid_size + tid];

    // The accumulators are inputs to the first instruction of each chain, so
    // they must hold defined values; start every chain from zero.
    vec4<float> accum0 = {0.0f, 0.0f, 0.0f, 0.0f};
    vec4<float> accum1 = {0.0f, 0.0f, 0.0f, 0.0f};
    vec4<float> accum2 = {0.0f, 0.0f, 0.0f, 0.0f};
    vec4<float> accum3 = {0.0f, 0.0f, 0.0f, 0.0f};

    for(int i = 0; i < count; i++) {
        for(int j = 0; j < 64; j++) {
            // 4 SMFMAC ops
            accum0 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x0, y0, accum0, 0, 0, 0);
            accum1 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x1, y1, accum1, 0, 0, 0);
            accum2 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x2, y2, accum2, 0, 0, 0);
            accum3 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x3, y3, accum3, 0, 0, 0);
        }
    }

    outputs[tid] = accum0 + accum1 + accum2 + accum3;
}
// FP32 MFMA throughput kernel: issues four independent chains of
// __builtin_amdgcn_mfma_f32_16x16x4f32 per thread.
//
// Launch layout: 1D grid of 1D blocks. With grid_size = gridDim.x * blockDim.x
// and tid the flat thread index, every thread reads inputs[r * grid_size + tid]
// for r = 0..3 and writes outputs[tid].
//
// inputs   at least 4 * grid_size floats.
// outputs  at least grid_size vec4<float> elements; receives the sum of the
//          four accumulators.
// count    number of outer iterations; each performs 64 inner iterations of
//          4 MFMA instructions.
//
// Requires hardware with MFMA instructions; the host code in this file gates
// the launch on the architecture reported by get_gcn_arch().
__global__ void matmul_fp32_throughput(float* inputs, vec4<float>* outputs, int count)
{
    int grid_size = gridDim.x * blockDim.x;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;

    float* ptr = inputs;
    float value0 = ptr[0 * grid_size + tid];
    float value1 = ptr[1 * grid_size + tid];
    float value2 = ptr[2 * grid_size + tid];
    // Fourth operand comes from the fourth slot (the original re-read slot 2).
    float value3 = ptr[3 * grid_size + tid];

    // The accumulators feed the first MFMA as its C operand, so they must hold
    // defined values; start every chain from zero.
    vec4<float> accum0 = {0.0f, 0.0f, 0.0f, 0.0f};
    vec4<float> accum1 = {0.0f, 0.0f, 0.0f, 0.0f};
    vec4<float> accum2 = {0.0f, 0.0f, 0.0f, 0.0f};
    vec4<float> accum3 = {0.0f, 0.0f, 0.0f, 0.0f};

    for(int i = 0; i < count; i++) {
        for(int j = 0; j < 64; j++) {
            // 4 MFMA ops
            accum0 = __builtin_amdgcn_mfma_f32_16x16x4f32(value0, value0, accum0, 0, 0, 0);
            accum1 = __builtin_amdgcn_mfma_f32_16x16x4f32(value1, value1, accum1, 0, 0, 0);
            accum2 = __builtin_amdgcn_mfma_f32_16x16x4f32(value2, value2, accum2, 0, 0, 0);
            accum3 = __builtin_amdgcn_mfma_f32_16x16x4f32(value3, value3, accum3, 0, 0, 0);
        }
    }

    outputs[tid] = accum0 + accum1 + accum2 + accum3;
}
// Checks the status returned by a HIP runtime call. On anything other than
// hipSuccess the numeric code and its description are printed and the
// process terminates with exit status 1.
void HIP_CALL(hipError_t err)
{
    if(err == hipSuccess) {
        return;
    }

    const char* description = hipGetErrorString(err);
    std::cout << "HIP Error: " << static_cast<int>(err) << " " << description << std::endl;
    exit(1);
}
// Components of a "gfx" architecture number as decoded by get_gcn_arch().
// Example: gfx90a yields major 9, minor 0, rev 0xa.
struct GCNArch {
    int major; // architecture generation
    int minor; // second-to-last digit of the gfx number
    int rev;   // last (hexadecimal) digit of the gfx number
};
// Decodes the architecture number of a HIP device.
//
// The device's gcnArchName has the form "gfx<number>[:feature...]", e.g.
// "gfx908:sramecc+:xnack-". The last two characters of <number> are the minor
// version and the revision (hexadecimal digits, e.g. the 'a' in gfx90a); the
// characters before them are the major version in decimal (9 for gfx90a,
// 11 for gfx1100).
//
// device  HIP device ordinal to query.
// Returns the decoded major/minor/rev triple. Prints a message and exits with
// status 1 when the name does not follow the expected pattern.
GCNArch get_gcn_arch(int device)
{
    hipDeviceProp_t props;
    HIP_CALL(hipGetDeviceProperties(&props, device));

    // Example: gfx908:sramecc+:xnack-
    const std::string arch_full(props.gcnArchName);
    const std::string prefix = "gfx";

    // The number ends at the first ':' or, without feature flags, at the end
    // of the string. Note that substr() takes a length, not an end position.
    const size_t colon = arch_full.find_first_of(':');
    const size_t end = (colon == std::string::npos) ? arch_full.size() : colon;

    // Need the prefix followed by at least major + minor + rev characters.
    const bool well_formed = arch_full.compare(0, prefix.size(), prefix) == 0
                             && end >= prefix.size() + 3;

    GCNArch arch = {0, 0, 0};
    bool parsed = false;
    if(well_formed) {
        // Extract number e.g. "908"
        const std::string gfx_str = arch_full.substr(prefix.size(), end - prefix.size());
        const size_t major_len = gfx_str.size() - 2;
        try {
            arch.major = std::stoi(gfx_str.substr(0, major_len), nullptr, 10);
            arch.minor = std::stoi(gfx_str.substr(major_len, 1), nullptr, 16);
            arch.rev = std::stoi(gfx_str.substr(major_len + 1, 1), nullptr, 16);
            parsed = true;
        } catch(const std::exception&) {
            parsed = false;
        }
    }

    if(!parsed) {
        std::cout << "Unrecognized GPU architecture name '" << arch_full << "'" << std::endl;
        exit(1);
    }

    return arch;
}
// Bit flags selecting which throughput tests to run. Flags are OR-ed into a
// uint32_t mask; ALL has every bit set.
enum : uint32_t {
    VALU_FP32    = 0x01u, // bit 0
    VALU_FP16    = 0x02u, // bit 1
    VALU_FP64    = 0x04u, // bit 2
    MATRIX_FP16  = 0x08u, // bit 3
    MATRIX_FP32  = 0x10u, // bit 4
    SMATRIX_FP16 = 0x20u, // bit 5
    VALU_INT32   = 0x40u, // bit 6
    ALL          = 0xFFFFFFFFu
};
// Timer for measuring kernel duration
class HIPTimer {
private:
hipEvent_t m_start;
hipEvent_t m_stop;
public:
HIPTimer()
{
HIP_CALL(hipEventCreate(&m_start));
HIP_CALL(hipEventCreate(&m_stop));
}
void start()
{
HIP_CALL(hipEventRecord(m_start));
}
void stop()
{
HIP_CALL(hipEventRecord(m_stop));
}
double elapsed()
{
float ms;
HIP_CALL(hipEventElapsedTime(&ms, m_start, m_stop));
return (double)ms / 1000.0;
}
};
// Host code
template<typename T> double fma_throughput_test(int device, int count, int runs = 1)
{
vec4<T>* buffer = nullptr;
hipDeviceProp_t props;
HIP_CALL(hipGetDeviceProperties(&props, device));
int blocks = props.multiProcessorCount * 512;
int threads_per_block = 64;
int total_threads = blocks * threads_per_block;
HIP_CALL(hipMalloc(&buffer, sizeof(vec4<T>) * total_threads * 4));
HIPTimer t;
t.start();
for(int i = 0; i < runs; i++) {
fma_throughput<T><<<blocks, threads_per_block>>>(buffer, count);
}
t.stop();
HIP_CALL(hipDeviceSynchronize());
double elapsed = t.elapsed();
double ops = (double)total_threads * count * 64 * 16 * runs;
double flops = (double)ops * 2.0 / elapsed;
HIP_CALL(hipFree(buffer));
return flops;
}
template<typename matT, typename accumT> double matmul_throughput_test(int device, int count, int runs = 1)
{
const int wave_size = 64;
int k;
int m;
int n;
if(std::is_same<matT, float16>::value) {
m = 16;
n = 16;
k = 16;
} else if(std::is_same<matT, float>::value) {
m = 16;
n = 16;
k = 4;
} else {
assert(false);
}
int ops_per_matmul = k * m * n * 2;
void* buffer = nullptr;
void* accum = nullptr;
hipDeviceProp_t props;
HIP_CALL(hipGetDeviceProperties(&props, device));
int blocks = props.multiProcessorCount * 512;
int threads_per_block = wave_size;
int total_threads = blocks * threads_per_block;
HIP_CALL(hipMalloc(&buffer, 4 * sizeof(matT) * m * k * total_threads));
HIP_CALL(hipMalloc(&accum, sizeof(accumT) * m * n * total_threads));
HIPTimer t;
t.start();
for(int i = 0; i < runs; i++) {
if(std::is_same<matT, float16>::value && std::is_same<accumT, float>::value) {
matmul_fp16_throughput<<<blocks, threads_per_block>>>((vec4<float16>*)buffer, (vec4<float>*)accum, count);
} else if(std::is_same<matT,float>::value && std::is_same<accumT, float>::value) {
matmul_fp32_throughput<<<blocks, threads_per_block>>>((float*)buffer, (vec4<float>*)accum, count);
}
}
t.stop();
HIP_CALL(hipDeviceSynchronize());
double elapsed = t.elapsed();
double ops = (double)blocks * count * 64 * 4 * runs;
double flops = (double)ops * ops_per_matmul / elapsed;
HIP_CALL(hipFree(buffer));
HIP_CALL(hipFree(accum));
return flops;
}
template<typename matT, typename accumT> double sparse_matmul_throughput_test(int device, int count, int runs = 1)
{
const int wave_size = 64;
int k;
int m;
int n;
if(std::is_same<matT, float16>::value) {
m = 16;
n = 16;
k = 32;
} else {
assert(false);
}
int ops_per_matmul = k * m * n * 2;
void* buffer1 = nullptr;
void* buffer2 = nullptr;
void* accum = nullptr;
hipDeviceProp_t props;
HIP_CALL(hipGetDeviceProperties(&props, device));
int blocks = props.multiProcessorCount * 512;
int threads_per_block = wave_size;
int total_threads = blocks * threads_per_block;
HIP_CALL(hipMalloc(&buffer1, 4 * sizeof(matT) * m * k * total_threads));
HIP_CALL(hipMalloc(&buffer2, 8 * sizeof(matT) * n * k * total_threads));
HIP_CALL(hipMalloc(&accum, sizeof(accumT) * m * n * total_threads));
HIPTimer t;
t.start();
for(int i = 0; i < runs; i++) {
if(std::is_same<matT, float16>::value && std::is_same<accumT, float>::value) {
sparse_matmul_fp16_throughput<<<blocks, threads_per_block>>>((vec4<float16>*)buffer1,
(vec8<float16>*)buffer2, (vec4<float>*)accum, count);
}
}
t.stop();
HIP_CALL(hipDeviceSynchronize());
double elapsed = t.elapsed();
double ops = (double)blocks * count * 64 * 4 * runs;
double flops = (double)ops * ops_per_matmul / elapsed;
HIP_CALL(hipFree(buffer1));
HIP_CALL(hipFree(buffer2));
HIP_CALL(hipFree(accum));
return flops;
}
// Throughput figures measured on one device, in operations per second.
// A field stays 0 when the corresponding test was not run. The struct is
// transferred between processes as raw bytes, so it holds plain values only.
struct Result {
    int device = -1;        // HIP device ordinal, -1 when unset

    double valu_fp16 = 0;
    double valu_fp32 = 0;
    double valu_fp64 = 0;
    double valu_int32 = 0;
    double mfma_fp16 = 0;
    double mfma_fp32 = 0;
    double smfmac_fp16 = 0;

    // Used for sorting: orders results by device ordinal. Declared const so
    // that const objects and references can be compared as well.
    bool operator<(const Result& other) const {
        return device < other.device;
    }
};
// Prints one line per test selected in `mask`, converting the stored
// operations-per-second values to tera-operations per second.
//
// res   measured values to print.
// mask  OR-combination of the test flags; only flagged tests are printed.
void print_result(const Result& res, uint32_t mask)
{
    struct Row {
        uint32_t flag;      // test flag that enables this line
        const char* format; // printf format taking one double
        double value;       // operations per second
    };

    // Rows are listed in output order.
    const Row rows[] = {
        {VALU_FP16,    "VALU FP16: %8.2f TFLOPS\n",   res.valu_fp16},
        {VALU_FP32,    "VALU FP32: %8.2f TFLOPS\n",   res.valu_fp32},
        {VALU_FP64,    "VALU FP64: %8.2f TFLOPS\n",   res.valu_fp64},
        {VALU_INT32,   "VALU INT32: %8.2f TIOPS\n",   res.valu_int32},
        {MATRIX_FP16,  "MFMA FP16: %8.2f TFLOPS\n",   res.mfma_fp16},
        {MATRIX_FP32,  "MFMA FP32: %8.2f TFLOPS\n",   res.mfma_fp32},
        {SMATRIX_FP16, "SMFMAC FP16: %8.2f TFLOPS\n", res.smfmac_fp16},
    };

    for(const Row& row : rows) {
        if(mask & row.flag) {
            printf(row.format, row.value / 1e12);
        }
    }
}
// Runs every test selected in `mask` on one device.
//
// device  HIP device ordinal; the process exits with status 1 when it is not
//         below the device count.
// runs    number of kernel launches per test.
// mask    OR-combination of the test flags.
// Returns the measured values. Matrix tests are only executed when the
// architecture reported by get_gcn_arch() passes the checks below; otherwise
// their fields remain 0.
Result run_tests(int device, int runs, uint32_t mask)
{
    int available = 0;
    HIP_CALL(hipGetDeviceCount(&available));
    if(device >= available) {
        std::cout << "Device " << device << " does not exist. Skipping..." << std::endl;
        exit(1);
    }
    HIP_CALL(hipSetDevice(device));

    const GCNArch arch = get_gcn_arch(device);
    const bool is_gfx9 = (arch.major == 0x9);
    // Dense MFMA tests: gfx9 with minor >= 4, or minor 0 with revision >= 8.
    const bool run_mfma = is_gfx9 && (arch.minor >= 0x4 || (arch.minor == 0 && arch.rev >= 8));
    // Sparse test: gfx9 with minor >= 4.
    const bool run_smfmac = is_gfx9 && arch.minor >= 4;

    const int iterations = 4096;

    // All throughput fields start at 0 and are only overwritten by tests that run.
    Result res;
    res.device = device;

    if(mask & VALU_FP16) {
        res.valu_fp16 = fma_throughput_test<float16>(device, iterations, runs);
    }
    if(mask & VALU_FP32) {
        res.valu_fp32 = fma_throughput_test<float>(device, iterations, runs);
    }
    if(mask & VALU_FP64) {
        res.valu_fp64 = fma_throughput_test<double>(device, iterations, runs);
    }
    if(mask & VALU_INT32) {
        res.valu_int32 = fma_throughput_test<int>(device, iterations, runs);
    }
    if((mask & MATRIX_FP16) && run_mfma) {
        res.mfma_fp16 = matmul_throughput_test<float16, float>(device, iterations, runs);
    }
    if((mask & MATRIX_FP32) && run_mfma) {
        res.mfma_fp32 = matmul_throughput_test<float, float>(device, iterations, runs);
    }
    if((mask & SMATRIX_FP16) && run_smfmac) {
        res.smfmac_fp16 = sparse_matmul_throughput_test<float16, float>(device, iterations, runs);
    }

    return res;
}
// Use fork() followed by exec() to run child process. For some reason
// rocprof does not pick up the child processes when only fork() is
// used.
//
// The child re-executes this binary with argv[0] set to "CHILD" followed by
// the device, run count, mask and pipe descriptor as decimal strings.
//
// device  HIP device ordinal the child should test.
// runs    number of kernel launches per test.
// mask    OR-combination of the test flags.
// fd      descriptor the child writes its Result to.
// Returns the child's pid in the parent. Does not return in the child. Exits
// the calling process with status 1 when fork() fails.
pid_t fork_process(int device, int runs, uint32_t mask, int fd)
{
    // Build the argument vector before forking: the parent may already have
    // runtime threads, and allocating memory in the child of a multithreaded
    // process between fork() and exec() is not safe.
    const std::string str_device = std::to_string(device);
    const std::string str_runs = std::to_string(runs);
    const std::string str_mask = std::to_string(mask);
    const std::string str_fd = std::to_string(fd);

    char* const args[] = {
        (char*)"CHILD",
        (char*)str_device.c_str(),
        (char*)str_runs.c_str(),
        (char*)str_mask.c_str(),
        (char*)str_fd.c_str(),
        NULL
    };

    pid_t pid = fork();
    if(pid < 0) {
        // Returning -1 would make the caller's waitpid() wait for any child.
        std::cout << "fork() failed: " << std::strerror(errno) << std::endl;
        exit(1);
    }
    if(pid != 0) {
        return pid;
    }

    execv("/proc/self/exe", args);

    // Only reached when execv() fails. Leave with _exit() so the child does
    // not run the parent's exit handlers or flush its inherited buffers.
    std::cout << "execv() failed: " << std::strerror(errno) << std::endl;
    _exit(1);
}
void run(std::vector<int>& devices, int runs, uint32_t mask)
{
std::vector<pid_t> pids;
// We will receive results from the child processes using a pipe
int fd[2];
if(pipe(fd)) {
std::cout << std::strerror(errno) << std::endl;
exit(1);
}
// Start a new process for each GPU
for(auto d : devices) {
pid_t pid = fork_process(d, runs, mask, fd[1]);
pids.push_back(pid);
}
// Wait for all processes to finish
for(auto pid : pids) {
int status;
waitpid(pid, &status, 0);
}
// Set the read to non-blocking
int flags = fcntl(fd[0], F_GETFL, 0);
fcntl(fd[0], F_SETFL, flags | O_NONBLOCK);
// Read records from pipe
std::vector<Result> results(pids.size());
int count = read(fd[0], results.data(), results.size() * sizeof(Result)) / sizeof(Result);
results.resize(count);
// Sort results by GPU id
std::sort(results.begin(), results.end());
// Print results
for(auto r : results) {
std::cout << std::endl << "GPU " << r.device << std::endl;
print_result(r, mask);
}
Result total;
for(auto r : results) {
total.valu_fp16 += r.valu_fp16;
total.valu_fp32 += r.valu_fp32;
total.valu_fp64 += r.valu_fp64;
total.valu_int32 += r.valu_int32;
total.mfma_fp16 += r.mfma_fp16;
total.mfma_fp32 += r.mfma_fp32;
total.smfmac_fp16 += r.smfmac_fp16;
}
std::cout << std::endl << "System total" << std::endl;
print_result(total, mask);
}
// Prints the supported command line options to stdout, including --int32
// and --help, which main() accepts.
void usage()
{
    std::cout << "--help Show this help message" << std::endl;
    std::cout << "--device ID Use device with the given numerical ID" << std::endl;
    std::cout << "--devices IDS | ALL Comma-separated list of device Ids (e.g., 1,2,3)" << std::endl;
    std::cout << " ALL for all devices" << std::endl;
    std::cout << "--runs RUNS Number of times each kernel is dispatched" << std::endl;
    std::cout << "--fp16 Run FP16 (VALU) test" << std::endl;
    std::cout << "--fp32 Run FP32 (VALU) test" << std::endl;
    std::cout << "--fp64 Run FP64 (VALU) test" << std::endl;
    std::cout << "--int32 Run INT32 (VALU) test" << std::endl;
    std::cout << "--matfp16 Run FP16 (MFMA) test" << std::endl;
    std::cout << "--matfp32 Run FP32 (MFMA) test" << std::endl;
    std::cout << "--smatfp16 Run FP16 (SMFMAC) test" << std::endl;
}
// Program entry point.
//
// Child mode (argv[0] == "CHILD", set up by fork_process()): runs the tests
// for one device and writes the Result to the given pipe descriptor.
// Arguments are: device, runs, mask, fd.
//
// Parent mode: parses the command line (see usage()), validates the device
// ordinals and calls run(). Without device options device 0 is used; without
// test options all tests run.
//
// Returns 0 on success and 1 on invalid arguments or a failed result write.
int main(int argc, char** argv)
{
    if(argc > 0 && std::string(argv[0]) == "CHILD") {
        if(argc < 5) {
            std::cout << "Invalid child process arguments" << std::endl;
            return 1;
        }

        int device = atoi(argv[1]);
        int runs = atoi(argv[2]);
        // The mask can be 4294967295 (ALL), which does not fit in an int.
        uint32_t mask = (uint32_t)strtoul(argv[3], nullptr, 10);
        int fd = atoi(argv[4]);

        Result res = run_tests(device, runs, mask);
        if(write(fd, &res, sizeof(res)) != (ssize_t)sizeof(res)) {
            std::cout << "Failed to send results: " << std::strerror(errno) << std::endl;
            return 1;
        }

        return 0;
    }

    int runs = 1;
    uint32_t mask = 0;
    bool all_devices = false;

    std::vector<int> devices;
    int device_count;

    HIP_CALL(hipGetDeviceCount(&device_count));

    int i = 1;

    // Returns the value following option `opt` and advances past it, or
    // nullptr (after printing an error) when the option is the last argument.
    auto option_value = [&](const std::string& opt) -> const char* {
        if(i + 1 >= argc) {
            std::cout << "Missing value for '" << opt << "'" << std::endl;
            std::cout << std::endl;
            usage();
            return nullptr;
        }
        return argv[++i];
    };

    while(i < argc) {
        std::string arg = std::string(argv[i]);

        if(arg == "--help") {
            usage();
            return 0;
        } else if(arg == "--device") {
            const char* value = option_value(arg);
            if(value == nullptr) {
                return 1;
            }
            devices.push_back(atoi(value));
        } else if(arg == "--devices") {
            const char* value = option_value(arg);
            if(value == nullptr) {
                return 1;
            }

            // Parse comma-separated string of numbers
            std::string s(value);
            if(s == "all" || s == "ALL") {
                all_devices = true;
            } else {
                std::stringstream ss(s);
                std::string r;
                while(getline(ss, r, ',')) {
                    try {
                        devices.push_back(std::stoi(r));
                    } catch(const std::exception&) {
                        std::cout << "Invalid device list '" << s << "'" << std::endl;
                        return 1;
                    }
                }
            }
        } else if(arg == "--runs") {
            const char* value = option_value(arg);
            if(value == nullptr) {
                return 1;
            }
            runs = atoi(value);
        } else if(arg == "--fp32") {
            mask |= VALU_FP32;
        } else if(arg == "--fp64") {
            mask |= VALU_FP64;
        } else if(arg == "--fp16") {
            mask |= VALU_FP16;
        } else if(arg == "--int32") {
            mask |= VALU_INT32;
        } else if(arg == "--matfp16") {
            mask |= MATRIX_FP16;
        } else if(arg == "--matfp32") {
            mask |= MATRIX_FP32;
        } else if(arg == "--smatfp16") {
            mask |= SMATRIX_FP16;
        } else {
            std::cout << "Invalid argument '" << arg << "'" << std::endl;
            std::cout << std::endl;
            usage();
            return 1;
        }
        i++;
    }

    if(all_devices) {
        for(int d = 0; d < device_count; d++) {
            devices.push_back(d);
        }
    }

    // Verify device ID's
    for(auto d : devices) {
        if(d < 0 || d >= device_count) {
            std::cout << "Invalid device ordinal: " << d << std::endl;
            return 1;
        }
    }

    if(devices.size() == 0) {
        devices.push_back(0);
    }

    if(mask == 0) {
        mask = ALL;
    }

    run(devices, runs, mask);

    return 0;
}
@@ -77,11 +77,11 @@ for the agent and returns a pointer to it.
#include <iostream>
#include <memory>
#include <mutex>
#include <random>
#include <set>
#include <shared_mutex>
#include <sstream>
#include <string>
#include <unistd.h>
#include <unordered_map>
#include <vector>
@@ -148,7 +148,7 @@ struct counter_info_record_t {
// Tool data struct, now includes a vector of counter_info_record_t
struct tool_data_t {
std::mutex mut{};
std::unique_ptr<std::ostream> output_stream{nullptr};
std::string output_filename{};
std::unordered_map<uint64_t, std::string> counter_id_name_map{};
std::string requested_counters{};
std::string kernel_filter_include_regex{};
@@ -614,14 +614,28 @@ void generate_output(tool_data_t *tool_data) {
}),
tool_data->counter_records.end());
}
if (tool_data->counter_records.empty()) {
return;
}
// Write collected counter records and clean up
if (auto &os = tool_data->output_stream) {
if (!tool_data->output_filename.empty()) {
std::ofstream ofs(tool_data->output_filename);
if (!ofs.is_open()) {
std::cerr << "Failed to open output file: " << tool_data->output_filename
<< std::endl;
return;
}
// Write header at the beginning of the file
ofs << "dispatch_id,gpu_id,kernel_id,lds_per_workgroup,"
"counter_id,counter_name,counter_value\n";
for (const auto &r : tool_data->counter_records)
*os << r.dispatch_id << ',' << r.agent_id << "," << r.kernel_id << ','
ofs << r.dispatch_id << ',' << r.agent_id << "," << r.kernel_id << ','
<< r.LDS_memory_size << ',' << r.counter_id << ',' << r.counter_name
<< ',' << r.counter_value << '\n';
os->flush();
ofs.flush();
std::clog << "[rocprofiler-compute] [" << __FUNCTION__
<< "] Counter collection data has been written to: "
<< tool_data->output_filename << std::endl;
}
}
@@ -638,18 +652,13 @@ void tool_fini(void *user_data) {
} // namespace
std::unique_ptr<tool_data_t> create_tool_data(rocprofiler_client_id_t *id) {
std::unique_ptr<tool_data_t>
create_tool_data(rocprofiler_client_id_t * /*id*/) {
auto tool_data = std::make_unique<tool_data_t>();
// Generate a unique output filename using a random hex string (no libuuid
// dependency)
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<uint32_t> dis(0, 0xFFFFFFFF);
std::stringstream filename_ss;
filename_ss << std::hex << dis(gen);
// Generate a unique output filename using the process ID
std::string base_filename =
"counter_collection_" + filename_ss.str().substr(0, 8) + ".csv";
std::to_string(getpid()) + "_native_counter_collection.csv";
// Require ROCPROF_OUTPUT_PATH to be set, otherwise error out
std::string filename;
@@ -664,20 +673,7 @@ std::unique_ptr<tool_data_t> create_tool_data(rocprofiler_client_id_t *id) {
// Use the generated base filename along with ROCPROF_OUTPUT_PATH
filename += base_filename;
// Set output stream to file
auto ofs = std::make_unique<std::ofstream>(filename);
if (!ofs->is_open()) {
throw std::runtime_error("Failed to open output file: " + filename);
}
tool_data->output_stream = std::move(ofs);
// Write header at the beginning of the file
*tool_data->output_stream << "dispatch_id,gpu_id,kernel_id,lds_per_workgroup,"
"counter_id,counter_name,counter_value\n";
tool_data->output_stream->flush();
// Write to clog the path of the logging file
std::clog << id->name << " [" << __FUNCTION__
<< "] Logging counter collection to: " << filename << std::endl;
tool_data->output_filename = filename;
// Store ROCPROF env. vars. in tool_data
@@ -61,7 +61,8 @@ def simple_bar(df: pd.DataFrame, title: Optional[str] = None) -> Optional[str]:
if "Metric" in df.columns and "Avg" in df.columns:
metric_dict = (
pd.DataFrame([df["Metric"], df["Avg"]])
pd
.DataFrame([df["Metric"], df["Avg"]])
.replace("", 0)
.replace(float("inf"), -1) # It should not happen
.replace(float("-inf"), -1)
@@ -258,7 +259,8 @@ def px_simple_multi_bar(
for group, metric in nested_bar.items():
dfigs.append(
px.bar(
px
.bar(
title=group,
x=metric.values(),
y=metric.keys(),
@@ -219,7 +219,8 @@ def get_views() -> list[TextClause]:
select(
Kernel.kernel_name,
(Dispatch.end_timestamp - Dispatch.start_timestamp).label("duration"),
func.row_number()
func
.row_number()
.over(
partition_by=Kernel.kernel_name,
order_by=Dispatch.end_timestamp - Dispatch.start_timestamp,
@@ -132,7 +132,8 @@ class MIGPUSpecs:
cls._all_gpu_models.append(curr_gpu_model)
cls._gpu_model_dict[curr_gpu_arch].append(curr_gpu_model)
cls._num_xcds_dict[curr_gpu_model] = (
models.get("partition_mode", {})
models
.get("partition_mode", {})
.get("compute_partition_mode", {})
.get("num_xcds", {})
)
@@ -580,7 +580,8 @@ def gen_counter_list(formula: str) -> tuple[bool, list[str]]:
return visited, counters
try:
tree = ast.parse(
formula.replace("$normUnit", "SQ_WAVES")
formula
.replace("$normUnit", "SQ_WAVES")
.replace("$denom", "SQ_WAVES")
.replace(
"$numActiveCUs",
@@ -1606,9 +1607,9 @@ def load_pc_sampling_data_per_kernel(
pc_sample_instructions = search_key_in_json(file_name, "pc_sample_instructions")
df["instruction"] = (
df["inst_index"].apply(
lambda x: pc_sample_instructions[x]
if x < len(pc_sample_instructions)
else None
lambda x: (
pc_sample_instructions[x] if x < len(pc_sample_instructions) else None
)
)
if pc_sample_instructions
else None
@@ -1618,9 +1619,11 @@ def load_pc_sampling_data_per_kernel(
pc_sample_comments = search_key_in_json(file_name, "pc_sample_comments")
df["source_line"] = (
df["inst_index"].apply(
lambda x: f".../{Path(pc_sample_comments[x]).name}"
if x < len(pc_sample_comments)
else None
lambda x: (
f".../{Path(pc_sample_comments[x]).name}"
if x < len(pc_sample_comments)
else None
)
)
if pc_sample_comments
else None
@@ -1719,7 +1722,8 @@ def load_pc_sampling_data(
# Group by Instruction_Comment and aggregate
grouped_counts = (
merged_df.groupby("Instruction_Comment")
merged_df
.groupby("Instruction_Comment")
.agg(
count=("Instruction_Comment", "count"),
instruction=("Instruction", "first"),
@@ -38,6 +38,7 @@ COUNTERS_COLLECTION_QUERY = """
SELECT
agent_id as GPU_ID,
dispatch_id as Dispatch_ID,
pid as PID,
grid_size as Grid_Size,
workgroup_size as Workgroup_Size,
lds_block_size as LDS_Per_Workgroup,
@@ -61,24 +62,28 @@ TABLE_NAME_PREFIX_QUERY = (
INSERT_QUERY = "INSERT INTO {table_name} ({columns}) VALUES ({placeholders})"
def convert_db_to_csv(
db_path: str,
def convert_dbs_to_csv(
db_paths: list[str],
csv_file_path: str,
) -> None:
"""
Read rocpd database and write to CSV file
Read rocpd databases and write to CSV file
"""
# Read counters_collection view from the database and write to CSV
# Read counters_collection view from the databases and write to CSV
try:
with closing(sqlite3.connect(db_path)) as conn:
with closing(conn.execute(COUNTERS_COLLECTION_QUERY)) as cursor:
with open(csv_file_path, "w", newline="") as csvfile:
writer = csv.writer(csvfile)
writer.writerow([
description[0] for description in cursor.description
])
for row in cursor:
writer.writerow(row)
with open(csv_file_path, "w", newline="") as csvfile:
writer = csv.writer(csvfile)
header_written = False
for db_path in db_paths:
with closing(sqlite3.connect(db_path)) as conn:
with closing(conn.execute(COUNTERS_COLLECTION_QUERY)) as cursor:
if not header_written:
writer.writerow([
description[0] for description in cursor.description
])
header_written = True
for row in cursor:
writer.writerow(row)
except OSError as e:
console_error(f"Database error while converting to CSV: {e}")
except Exception as e:
@@ -426,7 +426,8 @@ def format_table_output(
and "Value" in df.columns
):
mem_data = (
pd.DataFrame([df["Metric"], df["Value"]])
pd
.DataFrame([df["Metric"], df["Value"]])
.transpose()
.set_index("Metric")
.to_dict()["Value"]
@@ -885,24 +885,48 @@ def run_prof(
rocprof_cmd == "rocprofiler-sdk"
and options["ROCPROF_COUNTER_COLLECTION"] == "0"
):
# Update rocpd database with counter csv created by native tool
rocpd_data.update_rocpd_pmc_events(
pd.read_csv(glob.glob(workload_dir + "/out/pmc_1/*.csv")[0]),
glob.glob(workload_dir + "/out/pmc_1/*/*.db")[0],
)
for db_name in glob.glob(workload_dir + "/out/pmc_1/*/*.db"):
pid = Path(db_name).stem.split("_")[0]
rocpd_data.update_rocpd_pmc_events(
pd.read_csv(
f"{workload_dir}/out/pmc_1/{pid}_native_counter_collection.csv"
),
db_name,
)
console_debug(f"Updated rocpd db {db_name} with native tool counters.")
# Write results_fbase.csv
rocpd_data.convert_db_to_csv(
glob.glob(workload_dir + "/out/pmc_1/*/*.db")[0],
rocpd_data.convert_dbs_to_csv(
glob.glob(workload_dir + "/out/pmc_1/*/*.db"),
workload_dir + f"/results_{fbase}.csv",
)
combined_df = pd.read_csv(workload_dir + f"/results_{fbase}.csv")
# Reset Dispatch_ID based on PID, Kernel_Name, Grid_Size,
# Workgroup_Size, LDS_Per_Workgroup
combined_df["Dispatch_ID"] = combined_df.groupby(
["PID", "Kernel_Name", "Grid_Size", "Workgroup_Size", "LDS_Per_Workgroup"],
sort=False,
).ngroup()
# Reset Kernel_ID based on Kernel_Name, Grid_Size,
# Workgroup_Size, LDS_Per_Workgroup
combined_df["Kernel_ID"] = combined_df.groupby(
["Kernel_Name", "Grid_Size", "Workgroup_Size", "LDS_Per_Workgroup"],
sort=False,
).ngroup()
# Drop PID since its not required
combined_df = combined_df.drop(columns=["PID"])
combined_df.to_csv(workload_dir + f"/results_{fbase}.csv", index=False)
if retain_rocpd_output:
shutil.copyfile(
glob.glob(workload_dir + "/out/pmc_1/*/*.db")[0],
workload_dir + "/" + fbase + ".db",
)
console_warning(
f"Retaining large raw rocpd database: {workload_dir}/{fbase}.db"
)
for db_path in glob.glob(workload_dir + "/out/pmc_1/*/*.db"):
pid = Path(db_path).stem.split("_")[0]
shutil.copyfile(
db_path,
workload_dir + f"/{fbase}_{pid}.db",
)
console_warning(
f"Retaining large raw rocpd database: "
f"{workload_dir}/{fbase}_{pid}.db"
)
# Remove temp directory
shutil.rmtree(workload_dir + "/" + "out")
return
@@ -1064,81 +1088,66 @@ def convert_native_counter_collection_csv(workload_dir: str) -> None:
trace to write counter collection csv in rocprofiler-sdk format
for further processing to pmc_perf.csv file
"""
counter_data = pd.read_csv(
glob.glob(f"{workload_dir}/out/pmc_1/*.csv")[0], index_col=False
)
# Group by on counter_data based on dispatch_id and
# counter_id and sum the counter_value
counter_data = counter_data.groupby(
["dispatch_id", "counter_name"], as_index=False
).agg({"counter_value": "sum"})
kernel_data_filename = glob.glob(f"{workload_dir}/out/pmc_1/*/*_kernel_trace.csv")[
0
]
kernel_data = pd.read_csv(kernel_data_filename)
rocprofv3_counter_data = pd.DataFrame({
"Correlation_Id": counter_data["dispatch_id"],
"Dispatch_Id": counter_data["dispatch_id"],
"Agent_Id": kernel_data.iloc[counter_data["dispatch_id"] - 1][
"Agent_Id"
].values,
"Queue_Id": kernel_data.iloc[counter_data["dispatch_id"] - 1][
"Queue_Id"
].values,
"Process_Id": kernel_data.iloc[counter_data["dispatch_id"] - 1][
"Thread_Id"
].values,
"Thread_Id": kernel_data.iloc[counter_data["dispatch_id"] - 1][
"Thread_Id"
].values,
"Grid_Size": (
kernel_data.iloc[counter_data["dispatch_id"] - 1][
["Grid_Size_X", "Grid_Size_Y", "Grid_Size_Z"]
]
.prod(axis=1)
.values
),
"Kernel_Id": kernel_data.iloc[counter_data["dispatch_id"] - 1][
"Kernel_Id"
].values,
"Kernel_Name": kernel_data.iloc[counter_data["dispatch_id"] - 1][
"Kernel_Name"
].values,
"Workgroup_Size": (
kernel_data.iloc[counter_data["dispatch_id"] - 1][
["Workgroup_Size_X", "Workgroup_Size_Y", "Workgroup_Size_Z"]
]
.prod(axis=1)
.values
),
"LDS_Block_Size": kernel_data.iloc[counter_data["dispatch_id"] - 1][
"LDS_Block_Size"
].values,
"Scratch_Size": kernel_data.iloc[counter_data["dispatch_id"] - 1][
"Scratch_Size"
].values,
"VGPR_Count": kernel_data.iloc[counter_data["dispatch_id"] - 1][
"VGPR_Count"
].values,
"Accum_VGPR_Count": kernel_data.iloc[counter_data["dispatch_id"] - 1][
"Accum_VGPR_Count"
].values,
"SGPR_Count": kernel_data.iloc[counter_data["dispatch_id"] - 1][
"SGPR_Count"
].values,
"Counter_Name": counter_data["counter_name"],
"Counter_Value": counter_data["counter_value"],
"Start_Timestamp": kernel_data.iloc[counter_data["dispatch_id"] - 1][
"Start_Timestamp"
].values,
"End_Timestamp": kernel_data.iloc[counter_data["dispatch_id"] - 1][
"End_Timestamp"
].values,
})
rocprofv3_counter_data.to_csv(
kernel_data_filename.replace("kernel_trace", "counter_collection"),
index=False,
)
for native_filename in glob.glob(
f"{workload_dir}/out/pmc_1/*_native_counter_collection.csv"
):
counter_data = pd.read_csv(native_filename, index_col=False)
# Group by on dispatch_id and counter_id and sum the counter_value,
# Other rows in group have the same value, so take the first one
groupby_cols = ["dispatch_id", "counter_name"]
agg_dict = {
col: "first" for col in counter_data.columns if col not in groupby_cols
}
# Overwrite counter_value aggregation to sum
agg_dict["counter_value"] = "sum"
counter_data = counter_data.groupby(groupby_cols, as_index=False).agg(agg_dict)
pid = Path(native_filename).stem.split("_")[0]
kernel_data_filename = glob.glob(
f"{workload_dir}/out/pmc_1/*/{pid}_kernel_trace.csv"
)[0]
kernel_data = pd.read_csv(kernel_data_filename)
# Merge counter_data with kernel_data on kernel_id
merged_data = pd.merge(
counter_data,
kernel_data,
left_on="kernel_id",
right_on="Kernel_Id",
how="left",
)
rocprofv3_counter_data = pd.DataFrame({
"Correlation_Id": merged_data["dispatch_id"],
"Dispatch_Id": merged_data["dispatch_id"],
"Agent_Id": merged_data["Agent_Id"],
"Queue_Id": merged_data["Queue_Id"],
"Process_Id": merged_data["Thread_Id"],
"Thread_Id": merged_data["Thread_Id"],
"Grid_Size": (
merged_data[["Grid_Size_X", "Grid_Size_Y", "Grid_Size_Z"]].prod(axis=1)
),
"Kernel_Id": merged_data["Kernel_Id"],
"Kernel_Name": merged_data["Kernel_Name"],
"Workgroup_Size": (
merged_data[
["Workgroup_Size_X", "Workgroup_Size_Y", "Workgroup_Size_Z"]
].prod(axis=1)
),
"LDS_Block_Size": merged_data["LDS_Block_Size"],
"Scratch_Size": merged_data["Scratch_Size"],
"VGPR_Count": merged_data["VGPR_Count"],
"Accum_VGPR_Count": merged_data["Accum_VGPR_Count"],
"SGPR_Count": merged_data["SGPR_Count"],
"Counter_Name": merged_data["counter_name"],
"Counter_Value": merged_data["counter_value"],
"Start_Timestamp": merged_data["Start_Timestamp"],
"End_Timestamp": merged_data["End_Timestamp"],
})
rocprofv3_counter_data.to_csv(
kernel_data_filename.replace("kernel_trace", "counter_collection"),
index=False,
)
def process_rocprofv3_output(workload_dir: str, using_native_tool: bool) -> list[str]:
@@ -67,3 +67,11 @@ set_target_properties(
laplace_eqn
PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests
)
# rocflop sample: the source file is compiled as HIP and the resulting
# executable is placed in the tests directory of the source tree.
set(ROCFLOP_SOURCES ../sample/rocflop.cpp)
# The file has a .cpp extension, so its language is set to HIP explicitly.
set_source_files_properties(${ROCFLOP_SOURCES} PROPERTIES LANGUAGE HIP)
add_executable(rocflop ${ROCFLOP_SOURCES})
set_target_properties(
rocflop
PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests
)
@@ -68,6 +68,7 @@ config["app_mat_mul_max"] = ["./tests/mat_mul_max"]
config["app_hip_dynamic_shared"] = ["./tests/hip_dynamic_shared"]
config["app_laplace_eqn"] = ["./tests/laplace_eqn", "-i", "5000"]
config["app_laplace_eqn_iter"] = ["./tests/laplace_eqn", "-i", "15000"]
config["rocflop"] = ["./tests/rocflop", "--device", "0"]
config["cleanup"] = True
config["COUNTER_LOGGING"] = False
config["METRIC_COMPARE"] = False
@@ -637,6 +638,29 @@ def test_path(binary_handler_profile_rocprof_compute):
test_utils.clean_output_dir(config["cleanup"], workload_dir)
@pytest.mark.path
def test_path_rocflop(
    binary_handler_profile_rocprof_compute,
):
    """Profile the multiprocess rocflop workload and expect a non-empty pmc_perf.csv."""
    workload_dir = test_utils.get_output_dir()

    binary_handler_profile_rocprof_compute(
        config,
        workload_dir,
        ["--block", "2.1.1"],
        check_success=True,
        roof=False,
        app_name="rocflop",
    )

    csv_frames = test_utils.check_csv_files(workload_dir, num_devices, num_kernels)
    pmc_perf_df = csv_frames["pmc_perf.csv"]
    # Ensure non zero length of df
    assert len(pmc_perf_df) > 0

    test_utils.clean_output_dir(config["cleanup"], workload_dir)
@pytest.mark.path
def test_path_no_native(binary_handler_profile_rocprof_compute):
workload_dir = test_utils.get_output_dir()