[rocprofiler-compute] Fix for multi process workload profiling (#2418)
* Fix for multi process workload profiling
Native counter collection tool updates:
* Do not dump empty counter data for a process
* Use PID instead of UUID for dumped csv files to facilitate correlation
* Handle merging multiple pairs of rocpd (from sdk tool) and csv (from
native tool) files
* Handle merging multiple pairs of csv (from sdk tool) and csv (from
native tool) files
Rocpd output format updates:
* Merge multiple rocpd databases into a single csv
* Reset dispatch id and kernel id for unique dispatches and unique
kernels respectively
* Retain multiple rocpd databases per run for multi process workloads
* Add test case for multiprocess profiling using rocflop workload
* Add rocflop
* Fix native counter csv to rocprofv3 csv conversion
* Use kernel_id instead of dispatch_id to correlate native counter csv
and kernel trace csv
* python formatting using ruff 0.14 instead of 0.13
This commit is contained in:
zatwierdzone przez
GitHub
rodzic
3e49440495
commit
588773f9bf
@@ -45,6 +45,9 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
|
||||
* Fix the check to prevent showing table where a column is full of N/A
|
||||
* Improve detection of empty values when metric evaluation fails due to counter data missing
|
||||
|
||||
* Fix the wrong logic in native counter csv to rocprofv3 csv conversion
|
||||
* Use kernel_id instead of dispatch_id to correlate native counter csv and kernel trace csv
|
||||
|
||||
### Removed
|
||||
|
||||
* Removed "VL1 Lat" metric for AMD Instinct MI300 series GPUs, due to MI300 series not supporting TCP_TCP_LATENCY_sum counter.
|
||||
|
||||
@@ -748,6 +748,7 @@ if(INSTALL_TESTS)
|
||||
tests/hip_dynamic_shared
|
||||
tests/laplace_eqn
|
||||
tests/mat_mul_max
|
||||
tests/rocflop
|
||||
DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/${PROJECT_NAME}/tests
|
||||
COMPONENT tests
|
||||
)
|
||||
|
||||
@@ -0,0 +1,681 @@
|
||||
// Copied from https://github.com/benrichard-amd/rocflop/tree/82f197e12314bab694fc70451a2b495b4f51bf90
|
||||
|
||||
#include <algorithm>
#include <cassert>
#include <cerrno>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <sstream>
#include <string>
#include <type_traits>
#include <vector>

#include <fcntl.h>
#include <sys/wait.h>
#include <unistd.h>

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
|
||||
|
||||
using float16 = _Float16;
|
||||
|
||||
// Vector types. Useful for packed math (where supported) and MFMA inputs.
|
||||
template<typename T, uint32_t Rank>
|
||||
using vecT = T __attribute__((ext_vector_type(Rank)));
|
||||
|
||||
template<typename T> using vec4 = vecT<T, 4>;
|
||||
template<typename T> using vec8 = vecT<T, 8>;
|
||||
|
||||
|
||||
// Kernels
|
||||
|
||||
|
||||
// Vector multiply-add throughput kernel.
//
// Each thread loads four vec4<T> values from `buffer` (one from each of four
// consecutive slices of gridDim.x * blockDim.x elements), then runs
// `count` * 64 iterations in which every value is squared and incremented by
// one. The four results are summed and written back to `buffer` at the
// thread's global index.
template<typename T> __global__ void fma_throughput(vec4<T>* buffer, int count)
{
    const T addend = 1.0;

    const int stride = gridDim.x * blockDim.x;
    const int gid = blockDim.x * blockIdx.x + threadIdx.x;

    vec4<T> v0 = buffer[gid];
    vec4<T> v1 = buffer[stride + gid];
    vec4<T> v2 = buffer[2 * stride + gid];
    vec4<T> v3 = buffer[3 * stride + gid];

    for(int outer = 0; outer < count; outer++) {
        for(int inner = 0; inner < 64; inner++) {
            // Four vec4 multiply-adds, i.e. 16 scalar FMA ops per iteration
            v0 = v0 * v0 + addend;
            v1 = v1 * v1 + addend;
            v2 = v2 * v2 + addend;
            v3 = v3 * v3 + addend;
        }
    }

    buffer[gid] = v0 + v1 + v2 + v3;
}
|
||||
|
||||
// FP16 MFMA throughput kernel.
//
// Each thread loads four vec4<float16> operands from `inputs` (one from each
// of four consecutive slices of gridDim.x * blockDim.x elements) and issues
// `count` * 64 * 4 calls to the 16x16x16 f16 MFMA builtin, using each operand
// as both A and B input. The four accumulators are summed and stored to
// `outputs` at the thread's global index.
__global__ void matmul_fp16_throughput(vec4<float16>* inputs, vec4<float>* outputs, int count)
{
    const int grid_size = gridDim.x * blockDim.x;
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;

    vec4<float16>* ptr = inputs;

    vec4<float16> value0 = ptr[0 * grid_size + tid];
    vec4<float16> value1 = ptr[1 * grid_size + tid];
    vec4<float16> value2 = ptr[2 * grid_size + tid];
    vec4<float16> value3 = ptr[3 * grid_size + tid];

    // The accumulators are inputs to the first MFMA, so they must start from
    // a defined value rather than being read uninitialized.
    vec4<float> accum0 = {0.0f, 0.0f, 0.0f, 0.0f};
    vec4<float> accum1 = {0.0f, 0.0f, 0.0f, 0.0f};
    vec4<float> accum2 = {0.0f, 0.0f, 0.0f, 0.0f};
    vec4<float> accum3 = {0.0f, 0.0f, 0.0f, 0.0f};

    for(int i = 0; i < count; i++) {
        for(int j = 0; j < 64; j++) {
            // 4 MFMA ops
            accum0 = __builtin_amdgcn_mfma_f32_16x16x16f16(value0, value0, accum0, 0, 0, 0);
            accum1 = __builtin_amdgcn_mfma_f32_16x16x16f16(value1, value1, accum1, 0, 0, 0);
            accum2 = __builtin_amdgcn_mfma_f32_16x16x16f16(value2, value2, accum2, 0, 0, 0);
            accum3 = __builtin_amdgcn_mfma_f32_16x16x16f16(value3, value3, accum3, 0, 0, 0);
        }
    }

    outputs[tid] = accum0 + accum1 + accum2 + accum3;
}
|
||||
|
||||
// FP16 sparse MFMA (SMFMAC) throughput kernel.
//
// Each thread loads four vec4<float16> operands from `input0` and four
// vec8<float16> operands from `input1` (one from each of four consecutive
// slices of gridDim.x * blockDim.x elements) and issues `count` * 64 * 4
// calls to the 16x16x32 f16 SMFMAC builtin. The four accumulators are summed
// and stored to `outputs` at the thread's global index.
__global__ void sparse_matmul_fp16_throughput(vec4<float16>* input0, vec8<float16>* input1, vec4<float>* outputs, int count)
{
    const int grid_size = gridDim.x * blockDim.x;
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;

    vec4<float16>* x_ptr = input0;
    vec8<float16>* y_ptr = input1;

    vec4<float16> x0 = x_ptr[0 * grid_size + tid];
    vec4<float16> x1 = x_ptr[1 * grid_size + tid];
    vec4<float16> x2 = x_ptr[2 * grid_size + tid];
    vec4<float16> x3 = x_ptr[3 * grid_size + tid];

    vec8<float16> y0 = y_ptr[0 * grid_size + tid];
    vec8<float16> y1 = y_ptr[1 * grid_size + tid];
    vec8<float16> y2 = y_ptr[2 * grid_size + tid];
    vec8<float16> y3 = y_ptr[3 * grid_size + tid];

    // The accumulators are inputs to the first SMFMAC, so they must start
    // from a defined value rather than being read uninitialized.
    vec4<float> accum0 = {0.0f, 0.0f, 0.0f, 0.0f};
    vec4<float> accum1 = {0.0f, 0.0f, 0.0f, 0.0f};
    vec4<float> accum2 = {0.0f, 0.0f, 0.0f, 0.0f};
    vec4<float> accum3 = {0.0f, 0.0f, 0.0f, 0.0f};

    for(int i = 0; i < count; i++) {
        for(int j = 0; j < 64; j++) {
            // 4 SMFMAC ops
            accum0 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x0, y0, accum0, 0, 0, 0);
            accum1 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x1, y1, accum1, 0, 0, 0);
            accum2 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x2, y2, accum2, 0, 0, 0);
            accum3 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x3, y3, accum3, 0, 0, 0);
        }
    }

    outputs[tid] = accum0 + accum1 + accum2 + accum3;
}
|
||||
|
||||
// FP32 MFMA throughput kernel.
//
// Each thread loads four float operands from `inputs` (one from each of four
// consecutive slices of gridDim.x * blockDim.x elements) and issues
// `count` * 64 * 4 calls to the 16x16x4 f32 MFMA builtin, using each operand
// as both A and B input. The four accumulators are summed and stored to
// `outputs` at the thread's global index.
__global__ void matmul_fp32_throughput(float* inputs, vec4<float>* outputs, int count)
{
    const int grid_size = gridDim.x * blockDim.x;
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;

    float* ptr = inputs;

    float value0 = ptr[0 * grid_size + tid];
    float value1 = ptr[1 * grid_size + tid];
    float value2 = ptr[2 * grid_size + tid];
    // Fourth slice; this previously re-read slice 2, unlike the sibling kernels.
    float value3 = ptr[3 * grid_size + tid];

    // The accumulators are inputs to the first MFMA, so they must start from
    // a defined value rather than being read uninitialized.
    vec4<float> accum0 = {0.0f, 0.0f, 0.0f, 0.0f};
    vec4<float> accum1 = {0.0f, 0.0f, 0.0f, 0.0f};
    vec4<float> accum2 = {0.0f, 0.0f, 0.0f, 0.0f};
    vec4<float> accum3 = {0.0f, 0.0f, 0.0f, 0.0f};

    for(int i = 0; i < count; i++) {
        for(int j = 0; j < 64; j++) {
            // 4 MFMA ops
            accum0 = __builtin_amdgcn_mfma_f32_16x16x4f32(value0, value0, accum0, 0, 0, 0);
            accum1 = __builtin_amdgcn_mfma_f32_16x16x4f32(value1, value1, accum1, 0, 0, 0);
            accum2 = __builtin_amdgcn_mfma_f32_16x16x4f32(value2, value2, accum2, 0, 0, 0);
            accum3 = __builtin_amdgcn_mfma_f32_16x16x4f32(value3, value3, accum3, 0, 0, 0);
        }
    }

    outputs[tid] = accum0 + accum1 + accum2 + accum3;
}
|
||||
|
||||
// Checks the status of a HIP runtime call. On anything other than hipSuccess
// the numeric code and its description are printed to stdout and the process
// exits with status 1.
void HIP_CALL(hipError_t err)
{
    if(err == hipSuccess) {
        return;
    }

    std::cout << "HIP Error: " << static_cast<int>(err) << " " << hipGetErrorString(err) << std::endl;
    exit(1);
}
|
||||
|
||||
// Components of a GPU architecture number as decoded by get_gcn_arch():
// the hex digits of the "gfx" name split into major / minor / revision.
// Members default to zero so a default-constructed value is never
// indeterminate.
struct GCNArch {
    int major = 0; // bits 8..15 of the hex architecture number
    int minor = 0; // bits 4..7
    int rev = 0;   // bits 0..3
};
|
||||
|
||||
// Decodes the architecture of `device` from its gcnArchName property.
//
// The name looks like "gfx908:sramecc+:xnack-". The characters between the
// "gfx" prefix and the first ':' (or the end of the string when there is no
// ':') are parsed as a hexadecimal number and split into nibbles:
// major = bits 8..15, minor = bits 4..7, rev = bits 0..3.
//
// Exits the process with status 1 if the name does not start with "gfx"
// followed by at least one character. std::stoi still throws if those
// characters are not hexadecimal.
GCNArch get_gcn_arch(int device)
{
    hipDeviceProp_t props;

    HIP_CALL(hipGetDeviceProperties(&props, device));

    // Example: gfx908:sramecc+:xnack-
    const std::string arch_full(props.gcnArchName);
    const std::string prefix = "gfx";

    if(arch_full.size() <= prefix.size() || arch_full.compare(0, prefix.size(), prefix) != 0) {
        std::cout << "Unexpected architecture name: '" << arch_full << "'" << std::endl;
        exit(1);
    }

    // Extract number e.g. "908". substr() takes a length, not an end
    // position, so convert the position of ':' into a character count.
    const std::size_t colon = arch_full.find_first_of(':');
    const std::size_t digits =
        (colon == std::string::npos) ? std::string::npos : colon - prefix.size();
    const std::string gfx_str = arch_full.substr(prefix.size(), digits);

    const int gfx_num = std::stoi(gfx_str, nullptr, 16);

    GCNArch arch;
    arch.major = (gfx_num & 0xff00) >> 8;
    arch.minor = (gfx_num & 0x00f0) >> 4;
    arch.rev = (gfx_num & 0x000f);

    return arch;
}
|
||||
|
||||
// Bit flags selecting which throughput tests to run; combined into a
// uint32_t mask. ALL has every bit set.
enum : uint32_t {
    VALU_FP32    = 0x01u,
    VALU_FP16    = 0x02u,
    VALU_FP64    = 0x04u,
    MATRIX_FP16  = 0x08u,
    MATRIX_FP32  = 0x10u,
    SMATRIX_FP16 = 0x20u,
    VALU_INT32   = 0x40u,

    ALL = 0xFFFFFFFFu
};
|
||||
|
||||
// Timer for measuring kernel duration
|
||||
class HIPTimer {
|
||||
|
||||
private:
|
||||
hipEvent_t m_start;
|
||||
hipEvent_t m_stop;
|
||||
|
||||
public:
|
||||
HIPTimer()
|
||||
{
|
||||
HIP_CALL(hipEventCreate(&m_start));
|
||||
HIP_CALL(hipEventCreate(&m_stop));
|
||||
}
|
||||
|
||||
void start()
|
||||
{
|
||||
HIP_CALL(hipEventRecord(m_start));
|
||||
}
|
||||
|
||||
void stop()
|
||||
{
|
||||
HIP_CALL(hipEventRecord(m_stop));
|
||||
}
|
||||
|
||||
double elapsed()
|
||||
{
|
||||
float ms;
|
||||
HIP_CALL(hipEventElapsedTime(&ms, m_start, m_stop));
|
||||
|
||||
return (double)ms / 1000.0;
|
||||
}
|
||||
};
|
||||
|
||||
// Host code
|
||||
|
||||
template<typename T> double fma_throughput_test(int device, int count, int runs = 1)
|
||||
{
|
||||
vec4<T>* buffer = nullptr;
|
||||
|
||||
hipDeviceProp_t props;
|
||||
HIP_CALL(hipGetDeviceProperties(&props, device));
|
||||
|
||||
int blocks = props.multiProcessorCount * 512;
|
||||
int threads_per_block = 64;
|
||||
int total_threads = blocks * threads_per_block;
|
||||
|
||||
HIP_CALL(hipMalloc(&buffer, sizeof(vec4<T>) * total_threads * 4));
|
||||
|
||||
HIPTimer t;
|
||||
t.start();
|
||||
for(int i = 0; i < runs; i++) {
|
||||
fma_throughput<T><<<blocks, threads_per_block>>>(buffer, count);
|
||||
}
|
||||
t.stop();
|
||||
HIP_CALL(hipDeviceSynchronize());
|
||||
|
||||
double elapsed = t.elapsed();
|
||||
double ops = (double)total_threads * count * 64 * 16 * runs;
|
||||
double flops = (double)ops * 2.0 / elapsed;
|
||||
|
||||
HIP_CALL(hipFree(buffer));
|
||||
|
||||
return flops;
|
||||
}
|
||||
|
||||
template<typename matT, typename accumT> double matmul_throughput_test(int device, int count, int runs = 1)
|
||||
{
|
||||
const int wave_size = 64;
|
||||
int k;
|
||||
int m;
|
||||
int n;
|
||||
|
||||
if(std::is_same<matT, float16>::value) {
|
||||
m = 16;
|
||||
n = 16;
|
||||
k = 16;
|
||||
} else if(std::is_same<matT, float>::value) {
|
||||
m = 16;
|
||||
n = 16;
|
||||
k = 4;
|
||||
} else {
|
||||
assert(false);
|
||||
}
|
||||
|
||||
int ops_per_matmul = k * m * n * 2;
|
||||
|
||||
void* buffer = nullptr;
|
||||
void* accum = nullptr;
|
||||
|
||||
hipDeviceProp_t props;
|
||||
HIP_CALL(hipGetDeviceProperties(&props, device));
|
||||
|
||||
int blocks = props.multiProcessorCount * 512;
|
||||
int threads_per_block = wave_size;
|
||||
int total_threads = blocks * threads_per_block;
|
||||
|
||||
HIP_CALL(hipMalloc(&buffer, 4 * sizeof(matT) * m * k * total_threads));
|
||||
HIP_CALL(hipMalloc(&accum, sizeof(accumT) * m * n * total_threads));
|
||||
|
||||
HIPTimer t;
|
||||
t.start();
|
||||
for(int i = 0; i < runs; i++) {
|
||||
if(std::is_same<matT, float16>::value && std::is_same<accumT, float>::value) {
|
||||
matmul_fp16_throughput<<<blocks, threads_per_block>>>((vec4<float16>*)buffer, (vec4<float>*)accum, count);
|
||||
} else if(std::is_same<matT,float>::value && std::is_same<accumT, float>::value) {
|
||||
matmul_fp32_throughput<<<blocks, threads_per_block>>>((float*)buffer, (vec4<float>*)accum, count);
|
||||
}
|
||||
}
|
||||
t.stop();
|
||||
HIP_CALL(hipDeviceSynchronize());
|
||||
|
||||
double elapsed = t.elapsed();
|
||||
double ops = (double)blocks * count * 64 * 4 * runs;
|
||||
double flops = (double)ops * ops_per_matmul / elapsed;
|
||||
|
||||
HIP_CALL(hipFree(buffer));
|
||||
HIP_CALL(hipFree(accum));
|
||||
|
||||
return flops;
|
||||
}
|
||||
|
||||
template<typename matT, typename accumT> double sparse_matmul_throughput_test(int device, int count, int runs = 1)
|
||||
{
|
||||
const int wave_size = 64;
|
||||
int k;
|
||||
int m;
|
||||
int n;
|
||||
|
||||
if(std::is_same<matT, float16>::value) {
|
||||
m = 16;
|
||||
n = 16;
|
||||
k = 32;
|
||||
} else {
|
||||
assert(false);
|
||||
}
|
||||
|
||||
int ops_per_matmul = k * m * n * 2;
|
||||
|
||||
void* buffer1 = nullptr;
|
||||
void* buffer2 = nullptr;
|
||||
void* accum = nullptr;
|
||||
|
||||
hipDeviceProp_t props;
|
||||
HIP_CALL(hipGetDeviceProperties(&props, device));
|
||||
|
||||
int blocks = props.multiProcessorCount * 512;
|
||||
int threads_per_block = wave_size;
|
||||
int total_threads = blocks * threads_per_block;
|
||||
|
||||
HIP_CALL(hipMalloc(&buffer1, 4 * sizeof(matT) * m * k * total_threads));
|
||||
HIP_CALL(hipMalloc(&buffer2, 8 * sizeof(matT) * n * k * total_threads));
|
||||
HIP_CALL(hipMalloc(&accum, sizeof(accumT) * m * n * total_threads));
|
||||
|
||||
HIPTimer t;
|
||||
t.start();
|
||||
for(int i = 0; i < runs; i++) {
|
||||
if(std::is_same<matT, float16>::value && std::is_same<accumT, float>::value) {
|
||||
sparse_matmul_fp16_throughput<<<blocks, threads_per_block>>>((vec4<float16>*)buffer1,
|
||||
(vec8<float16>*)buffer2, (vec4<float>*)accum, count);
|
||||
}
|
||||
}
|
||||
t.stop();
|
||||
HIP_CALL(hipDeviceSynchronize());
|
||||
|
||||
double elapsed = t.elapsed();
|
||||
double ops = (double)blocks * count * 64 * 4 * runs;
|
||||
double flops = (double)ops * ops_per_matmul / elapsed;
|
||||
|
||||
HIP_CALL(hipFree(buffer1));
|
||||
HIP_CALL(hipFree(buffer2));
|
||||
HIP_CALL(hipFree(accum));
|
||||
|
||||
return flops;
|
||||
}
|
||||
|
||||
// Per-device benchmark results, in operations per second. A field stays 0
// when the corresponding test was not run. Instances are sent between
// processes as raw bytes, so the layout must stay plain data.
struct Result {
    int device = -1;
    double valu_fp16 = 0;
    double valu_fp32 = 0;
    double valu_fp64 = 0;
    double valu_int32 = 0;
    double mfma_fp16 = 0;
    double mfma_fp32 = 0;
    double smfmac_fp16 = 0;

    // Used for sorting: orders results by device id. Const-qualified so it
    // can be applied to const objects and references.
    bool operator<(const Result& other) const {
        return device < other.device;
    }
};
|
||||
|
||||
void print_result(const Result& res, uint32_t mask)
|
||||
{
|
||||
if(mask & VALU_FP16) {
|
||||
printf("VALU FP16: %8.2f TFLOPS\n", res.valu_fp16 / 1e12);
|
||||
}
|
||||
if(mask & VALU_FP32) {
|
||||
printf("VALU FP32: %8.2f TFLOPS\n", res.valu_fp32 / 1e12);
|
||||
}
|
||||
if(mask & VALU_FP64) {
|
||||
printf("VALU FP64: %8.2f TFLOPS\n", res.valu_fp64 / 1e12);
|
||||
}
|
||||
if(mask & VALU_INT32) {
|
||||
printf("VALU INT32: %8.2f TIOPS\n", res.valu_int32 / 1e12);
|
||||
}
|
||||
if(mask & MATRIX_FP16) {
|
||||
printf("MFMA FP16: %8.2f TFLOPS\n", res.mfma_fp16 / 1e12);
|
||||
}
|
||||
if(mask & MATRIX_FP32) {
|
||||
printf("MFMA FP32: %8.2f TFLOPS\n", res.mfma_fp32 / 1e12);
|
||||
}
|
||||
if(mask & SMATRIX_FP16) {
|
||||
printf("SMFMAC FP16: %8.2f TFLOPS\n", res.smfmac_fp16 / 1e12);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// Runs every test selected in `mask` on `device`, dispatching each kernel
// `runs` times, and returns the measured rates.
//
// Exits the process with status 1 when `device` is not below the HIP device
// count. The matrix tests only run on architectures with major == 9 that
// pass the minor/rev checks below; otherwise their results stay 0.
Result run_tests(int device, int runs, uint32_t mask)
{
    int available;
    HIP_CALL(hipGetDeviceCount(&available));

    if(device >= available) {
        std::cout << "Device " << device << " does not exist. Skipping..." << std::endl;
        exit(1);
    }

    HIP_CALL(hipSetDevice(device));
    const GCNArch arch = get_gcn_arch(device);

    const bool is_gfx9 = (arch.major == 0x9);
    // minor >= 4, or minor == 0 with rev >= 8
    const bool mfma_capable = is_gfx9 && (arch.minor >= 0x4 || (arch.minor == 0 && arch.rev >= 8));
    const bool smfmac_capable = is_gfx9 && arch.minor >= 4;

    // Outer-loop iteration count handed to every kernel
    const int iterations = 4096;

    // All rates start at 0 (the struct defaults); only the device is set here.
    Result res;
    res.device = device;

    if(mask & VALU_FP16) {
        res.valu_fp16 = fma_throughput_test<float16>(device, iterations, runs);
    }

    if(mask & VALU_FP32) {
        res.valu_fp32 = fma_throughput_test<float>(device, iterations, runs);
    }

    if(mask & VALU_FP64) {
        res.valu_fp64 = fma_throughput_test<double>(device, iterations, runs);
    }

    if(mask & VALU_INT32) {
        res.valu_int32 = fma_throughput_test<int>(device, iterations, runs);
    }

    if((mask & MATRIX_FP16) && mfma_capable) {
        res.mfma_fp16 = matmul_throughput_test<float16, float>(device, iterations, runs);
    }

    if((mask & MATRIX_FP32) && mfma_capable) {
        res.mfma_fp32 = matmul_throughput_test<float, float>(device, iterations, runs);
    }

    if((mask & SMATRIX_FP16) && smfmac_capable) {
        res.smfmac_fp16 = sparse_matmul_throughput_test<float16, float>(device, iterations, runs);
    }

    return res;
}
|
||||
|
||||
// Use fork() followed by exec() to run child process. For some reason
|
||||
// rocprof does not pick up the child processes when only fork() is
|
||||
// used.
|
||||
// Starts a child process that benchmarks `device`.
//
// The child re-executes this binary through /proc/self/exe with argv[0] set
// to "CHILD" followed by the device id, run count, test mask and the pipe
// file descriptor, all as decimal strings; main() recognises that form.
//
// Returns the child's pid in the parent, or -1 (after printing the reason)
// when fork() fails. Never returns in the child.
pid_t fork_process(int device, int runs, uint32_t mask, int fd)
{
    const pid_t pid = fork();

    if(pid < 0) {
        std::cout << "fork() failed: " << std::strerror(errno) << std::endl;
        return pid;
    }

    if(pid > 0) {
        return pid;
    }

    // Child process from here on
    const std::string str_device = std::to_string(device);
    const std::string str_runs = std::to_string(runs);
    const std::string str_mask = std::to_string(mask);
    const std::string str_fd = std::to_string(fd);

    char* const args[] = {
        const_cast<char*>("CHILD"),
        const_cast<char*>(str_device.c_str()),
        const_cast<char*>(str_runs.c_str()),
        const_cast<char*>(str_mask.c_str()),
        const_cast<char*>(str_fd.c_str()),
        NULL
    };

    execv("/proc/self/exe", args);

    // Only reached when execv() fails
    std::cout << "execv() failed: " << std::strerror(errno) << std::endl;

    // _exit() rather than exit(): the forked child must not run the parent's
    // atexit handlers or flush stdio buffers inherited from it.
    _exit(1);
}
|
||||
|
||||
void run(std::vector<int>& devices, int runs, uint32_t mask)
|
||||
{
|
||||
std::vector<pid_t> pids;
|
||||
|
||||
// We will receive results from the child processes using a pipe
|
||||
int fd[2];
|
||||
|
||||
if(pipe(fd)) {
|
||||
std::cout << std::strerror(errno) << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Start a new process for each GPU
|
||||
for(auto d : devices) {
|
||||
pid_t pid = fork_process(d, runs, mask, fd[1]);
|
||||
|
||||
pids.push_back(pid);
|
||||
}
|
||||
|
||||
// Wait for all processes to finish
|
||||
for(auto pid : pids) {
|
||||
int status;
|
||||
waitpid(pid, &status, 0);
|
||||
}
|
||||
|
||||
// Set the read to non-blocking
|
||||
int flags = fcntl(fd[0], F_GETFL, 0);
|
||||
fcntl(fd[0], F_SETFL, flags | O_NONBLOCK);
|
||||
|
||||
// Read records from pipe
|
||||
std::vector<Result> results(pids.size());
|
||||
int count = read(fd[0], results.data(), results.size() * sizeof(Result)) / sizeof(Result);
|
||||
|
||||
results.resize(count);
|
||||
|
||||
// Sort results by GPU id
|
||||
std::sort(results.begin(), results.end());
|
||||
|
||||
// Print results
|
||||
for(auto r : results) {
|
||||
std::cout << std::endl << "GPU " << r.device << std::endl;
|
||||
print_result(r, mask);
|
||||
}
|
||||
|
||||
Result total;
|
||||
for(auto r : results) {
|
||||
total.valu_fp16 += r.valu_fp16;
|
||||
total.valu_fp32 += r.valu_fp32;
|
||||
total.valu_fp64 += r.valu_fp64;
|
||||
total.valu_int32 += r.valu_int32;
|
||||
total.mfma_fp16 += r.mfma_fp16;
|
||||
total.mfma_fp32 += r.mfma_fp32;
|
||||
total.smfmac_fp16 += r.smfmac_fp16;
|
||||
}
|
||||
std::cout << std::endl << "System total" << std::endl;
|
||||
print_result(total, mask);
|
||||
}
|
||||
|
||||
|
||||
// Prints the command-line options accepted by main() to stdout.
void usage()
{
    std::cout << "--device ID Use device with the given numerical ID" << std::endl;
    std::cout << "--devices IDS | ALL Comma-separated list of device Ids (e.g., 1,2,3)" << std::endl;
    std::cout << " ALL for all devices" << std::endl;
    std::cout << "--runs RUNS Number of times each kernel is dispatched" << std::endl;

    std::cout << "--fp16 Run FP16 (VALU) test" << std::endl;
    std::cout << "--fp32 Run FP32 (VALU) test" << std::endl;
    std::cout << "--fp64 Run FP64 (VALU) test" << std::endl;
    // main() accepts --int32, so it is listed here as well
    std::cout << "--int32 Run INT32 (VALU) test" << std::endl;
    std::cout << "--matfp16 Run FP16 (MFMA) test" << std::endl;
    std::cout << "--matfp32 Run FP32 (MFMA) test" << std::endl;
    std::cout << "--smatfp16 Run FP16 (SMFMAC) test" << std::endl;
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
if(std::string(argv[0]) == "CHILD") {
|
||||
int device = atoi(argv[1]);
|
||||
int runs = atoi(argv[2]);
|
||||
uint32_t mask = atoi(argv[3]);
|
||||
int fd = atoi(argv[4]);
|
||||
|
||||
Result res = run_tests(device, runs, mask);
|
||||
|
||||
write(fd, &res, sizeof(res));
|
||||
return 0;
|
||||
}
|
||||
|
||||
int runs = 1;
|
||||
|
||||
uint32_t mask = 0;
|
||||
bool all_devices = false;
|
||||
std::vector<int> devices;
|
||||
int device_count;
|
||||
int device = 0;
|
||||
|
||||
HIP_CALL(hipGetDeviceCount(&device_count));
|
||||
|
||||
int i = 1;
|
||||
while(i < argc) {
|
||||
std::string arg = std::string(argv[i]);
|
||||
|
||||
if(arg == "--help") {
|
||||
usage();
|
||||
return 0;
|
||||
} else if(arg == "--device") {
|
||||
devices.push_back(atoi(argv[i + 1]));
|
||||
// Skip next
|
||||
i++;
|
||||
} else if(arg == "--devices") {
|
||||
// Parse comma-separated string of numbers
|
||||
std::string s(argv[i + 1]);
|
||||
|
||||
if(s == "all" || s == "ALL") {
|
||||
all_devices = true;
|
||||
} else {
|
||||
std::stringstream ss(s);
|
||||
std::string r;
|
||||
while(getline(ss, r, ',')) {
|
||||
devices.push_back(std::stoi(r));
|
||||
}
|
||||
}
|
||||
// Skip next
|
||||
i++;
|
||||
} else if(arg == "--runs") {
|
||||
runs = atoi(argv[i + 1]);
|
||||
|
||||
// Skip next
|
||||
i++;
|
||||
} else if(arg == "--fp32") {
|
||||
mask |= VALU_FP32;
|
||||
} else if(arg == "--fp64") {
|
||||
mask |= VALU_FP64;
|
||||
} else if(arg == "--fp16") {
|
||||
mask |= VALU_FP16;
|
||||
} else if(arg == "--int32") {
|
||||
mask |= VALU_INT32;
|
||||
} else if(arg == "--matfp16") {
|
||||
mask |= MATRIX_FP16;
|
||||
} else if(arg == "--matfp32") {
|
||||
mask |= MATRIX_FP32;
|
||||
} else if(arg == "--smatfp16") {
|
||||
mask |= SMATRIX_FP16;
|
||||
} else {
|
||||
std::cout << "Invalid argument '" << arg << "'" << std::endl;
|
||||
std::cout << std::endl;
|
||||
usage();
|
||||
return 1;
|
||||
}
|
||||
|
||||
i++;
|
||||
}
|
||||
|
||||
if(all_devices) {
|
||||
for(int i = 0; i < device_count; i++ ){
|
||||
devices.push_back(i);
|
||||
}
|
||||
}
|
||||
|
||||
// Verify device ID's
|
||||
for(auto d : devices) {
|
||||
if(d >= device_count) {
|
||||
std::cout << "Invalid device ordinal: " << d << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if(devices.size() == 0) {
|
||||
devices.push_back(0);
|
||||
}
|
||||
|
||||
if(mask == 0) {
|
||||
mask = ALL;
|
||||
}
|
||||
|
||||
run(devices, runs, mask);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -77,11 +77,11 @@ for the agent and returns a pointer to it.
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <random>
|
||||
#include <set>
|
||||
#include <shared_mutex>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <unistd.h>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
@@ -148,7 +148,7 @@ struct counter_info_record_t {
|
||||
// Tool data struct, now includes a vector of counter_info_record_t
|
||||
struct tool_data_t {
|
||||
std::mutex mut{};
|
||||
std::unique_ptr<std::ostream> output_stream{nullptr};
|
||||
std::string output_filename{};
|
||||
std::unordered_map<uint64_t, std::string> counter_id_name_map{};
|
||||
std::string requested_counters{};
|
||||
std::string kernel_filter_include_regex{};
|
||||
@@ -614,14 +614,28 @@ void generate_output(tool_data_t *tool_data) {
|
||||
}),
|
||||
tool_data->counter_records.end());
|
||||
}
|
||||
|
||||
if (tool_data->counter_records.empty()) {
|
||||
return;
|
||||
}
|
||||
// Write collected counter records and clean up
|
||||
if (auto &os = tool_data->output_stream) {
|
||||
if (!tool_data->output_filename.empty()) {
|
||||
std::ofstream ofs(tool_data->output_filename);
|
||||
if (!ofs.is_open()) {
|
||||
std::cerr << "Failed to open output file: " << tool_data->output_filename
|
||||
<< std::endl;
|
||||
return;
|
||||
}
|
||||
// Write header at the beginning of the file
|
||||
ofs << "dispatch_id,gpu_id,kernel_id,lds_per_workgroup,"
|
||||
"counter_id,counter_name,counter_value\n";
|
||||
for (const auto &r : tool_data->counter_records)
|
||||
*os << r.dispatch_id << ',' << r.agent_id << "," << r.kernel_id << ','
|
||||
ofs << r.dispatch_id << ',' << r.agent_id << "," << r.kernel_id << ','
|
||||
<< r.LDS_memory_size << ',' << r.counter_id << ',' << r.counter_name
|
||||
<< ',' << r.counter_value << '\n';
|
||||
os->flush();
|
||||
ofs.flush();
|
||||
std::clog << "[rocprofiler-compute] [" << __FUNCTION__
|
||||
<< "] Counter collection data has been written to: "
|
||||
<< tool_data->output_filename << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -638,18 +652,13 @@ void tool_fini(void *user_data) {
|
||||
|
||||
} // namespace
|
||||
|
||||
std::unique_ptr<tool_data_t> create_tool_data(rocprofiler_client_id_t *id) {
|
||||
std::unique_ptr<tool_data_t>
|
||||
create_tool_data(rocprofiler_client_id_t * /*id*/) {
|
||||
auto tool_data = std::make_unique<tool_data_t>();
|
||||
|
||||
// Generate a unique output filename using a random hex string (no libuuid
|
||||
// dependency)
|
||||
std::random_device rd;
|
||||
std::mt19937 gen(rd());
|
||||
std::uniform_int_distribution<uint32_t> dis(0, 0xFFFFFFFF);
|
||||
std::stringstream filename_ss;
|
||||
filename_ss << std::hex << dis(gen);
|
||||
// Generate a unique output filename using the process ID
|
||||
std::string base_filename =
|
||||
"counter_collection_" + filename_ss.str().substr(0, 8) + ".csv";
|
||||
std::to_string(getpid()) + "_native_counter_collection.csv";
|
||||
|
||||
// Require ROCPROF_OUTPUT_PATH to be set, otherwise error out
|
||||
std::string filename;
|
||||
@@ -664,20 +673,7 @@ std::unique_ptr<tool_data_t> create_tool_data(rocprofiler_client_id_t *id) {
|
||||
// Use the generated base filename along with ROCPROF_OUTPUT_PATH
|
||||
filename += base_filename;
|
||||
|
||||
// Set output stream to file
|
||||
auto ofs = std::make_unique<std::ofstream>(filename);
|
||||
if (!ofs->is_open()) {
|
||||
throw std::runtime_error("Failed to open output file: " + filename);
|
||||
}
|
||||
tool_data->output_stream = std::move(ofs);
|
||||
// Write header at the beginning of the file
|
||||
*tool_data->output_stream << "dispatch_id,gpu_id,kernel_id,lds_per_workgroup,"
|
||||
"counter_id,counter_name,counter_value\n";
|
||||
tool_data->output_stream->flush();
|
||||
|
||||
// Write to clog the path of the logging file
|
||||
std::clog << id->name << " [" << __FUNCTION__
|
||||
<< "] Logging counter collection to: " << filename << std::endl;
|
||||
tool_data->output_filename = filename;
|
||||
|
||||
// Store ROCPROF env. vars. in tool_data
|
||||
|
||||
|
||||
@@ -61,7 +61,8 @@ def simple_bar(df: pd.DataFrame, title: Optional[str] = None) -> Optional[str]:
|
||||
|
||||
if "Metric" in df.columns and "Avg" in df.columns:
|
||||
metric_dict = (
|
||||
pd.DataFrame([df["Metric"], df["Avg"]])
|
||||
pd
|
||||
.DataFrame([df["Metric"], df["Avg"]])
|
||||
.replace("", 0)
|
||||
.replace(float("inf"), -1) # It should not happen
|
||||
.replace(float("-inf"), -1)
|
||||
@@ -258,7 +259,8 @@ def px_simple_multi_bar(
|
||||
|
||||
for group, metric in nested_bar.items():
|
||||
dfigs.append(
|
||||
px.bar(
|
||||
px
|
||||
.bar(
|
||||
title=group,
|
||||
x=metric.values(),
|
||||
y=metric.keys(),
|
||||
|
||||
@@ -219,7 +219,8 @@ def get_views() -> list[TextClause]:
|
||||
select(
|
||||
Kernel.kernel_name,
|
||||
(Dispatch.end_timestamp - Dispatch.start_timestamp).label("duration"),
|
||||
func.row_number()
|
||||
func
|
||||
.row_number()
|
||||
.over(
|
||||
partition_by=Kernel.kernel_name,
|
||||
order_by=Dispatch.end_timestamp - Dispatch.start_timestamp,
|
||||
|
||||
@@ -132,7 +132,8 @@ class MIGPUSpecs:
|
||||
cls._all_gpu_models.append(curr_gpu_model)
|
||||
cls._gpu_model_dict[curr_gpu_arch].append(curr_gpu_model)
|
||||
cls._num_xcds_dict[curr_gpu_model] = (
|
||||
models.get("partition_mode", {})
|
||||
models
|
||||
.get("partition_mode", {})
|
||||
.get("compute_partition_mode", {})
|
||||
.get("num_xcds", {})
|
||||
)
|
||||
|
||||
@@ -580,7 +580,8 @@ def gen_counter_list(formula: str) -> tuple[bool, list[str]]:
|
||||
return visited, counters
|
||||
try:
|
||||
tree = ast.parse(
|
||||
formula.replace("$normUnit", "SQ_WAVES")
|
||||
formula
|
||||
.replace("$normUnit", "SQ_WAVES")
|
||||
.replace("$denom", "SQ_WAVES")
|
||||
.replace(
|
||||
"$numActiveCUs",
|
||||
@@ -1606,9 +1607,9 @@ def load_pc_sampling_data_per_kernel(
|
||||
pc_sample_instructions = search_key_in_json(file_name, "pc_sample_instructions")
|
||||
df["instruction"] = (
|
||||
df["inst_index"].apply(
|
||||
lambda x: pc_sample_instructions[x]
|
||||
if x < len(pc_sample_instructions)
|
||||
else None
|
||||
lambda x: (
|
||||
pc_sample_instructions[x] if x < len(pc_sample_instructions) else None
|
||||
)
|
||||
)
|
||||
if pc_sample_instructions
|
||||
else None
|
||||
@@ -1618,9 +1619,11 @@ def load_pc_sampling_data_per_kernel(
|
||||
pc_sample_comments = search_key_in_json(file_name, "pc_sample_comments")
|
||||
df["source_line"] = (
|
||||
df["inst_index"].apply(
|
||||
lambda x: f".../{Path(pc_sample_comments[x]).name}"
|
||||
if x < len(pc_sample_comments)
|
||||
else None
|
||||
lambda x: (
|
||||
f".../{Path(pc_sample_comments[x]).name}"
|
||||
if x < len(pc_sample_comments)
|
||||
else None
|
||||
)
|
||||
)
|
||||
if pc_sample_comments
|
||||
else None
|
||||
@@ -1719,7 +1722,8 @@ def load_pc_sampling_data(
|
||||
|
||||
# Group by Instruction_Comment and aggregate
|
||||
grouped_counts = (
|
||||
merged_df.groupby("Instruction_Comment")
|
||||
merged_df
|
||||
.groupby("Instruction_Comment")
|
||||
.agg(
|
||||
count=("Instruction_Comment", "count"),
|
||||
instruction=("Instruction", "first"),
|
||||
|
||||
@@ -38,6 +38,7 @@ COUNTERS_COLLECTION_QUERY = """
|
||||
SELECT
|
||||
agent_id as GPU_ID,
|
||||
dispatch_id as Dispatch_ID,
|
||||
pid as PID,
|
||||
grid_size as Grid_Size,
|
||||
workgroup_size as Workgroup_Size,
|
||||
lds_block_size as LDS_Per_Workgroup,
|
||||
@@ -61,24 +62,28 @@ TABLE_NAME_PREFIX_QUERY = (
|
||||
INSERT_QUERY = "INSERT INTO {table_name} ({columns}) VALUES ({placeholders})"
|
||||
|
||||
|
||||
def convert_db_to_csv(
|
||||
db_path: str,
|
||||
def convert_dbs_to_csv(
|
||||
db_paths: list[str],
|
||||
csv_file_path: str,
|
||||
) -> None:
|
||||
"""
|
||||
Read rocpd database and write to CSV file
|
||||
Read rocpd databases and write to CSV file
|
||||
"""
|
||||
# Read counters_collection view from the database and write to CSV
|
||||
# Read counters_collection view from the databases and write to CSV
|
||||
try:
|
||||
with closing(sqlite3.connect(db_path)) as conn:
|
||||
with closing(conn.execute(COUNTERS_COLLECTION_QUERY)) as cursor:
|
||||
with open(csv_file_path, "w", newline="") as csvfile:
|
||||
writer = csv.writer(csvfile)
|
||||
writer.writerow([
|
||||
description[0] for description in cursor.description
|
||||
])
|
||||
for row in cursor:
|
||||
writer.writerow(row)
|
||||
with open(csv_file_path, "w", newline="") as csvfile:
|
||||
writer = csv.writer(csvfile)
|
||||
header_written = False
|
||||
for db_path in db_paths:
|
||||
with closing(sqlite3.connect(db_path)) as conn:
|
||||
with closing(conn.execute(COUNTERS_COLLECTION_QUERY)) as cursor:
|
||||
if not header_written:
|
||||
writer.writerow([
|
||||
description[0] for description in cursor.description
|
||||
])
|
||||
header_written = True
|
||||
for row in cursor:
|
||||
writer.writerow(row)
|
||||
except OSError as e:
|
||||
console_error(f"Database error while converting to CSV: {e}")
|
||||
except Exception as e:
|
||||
|
||||
@@ -426,7 +426,8 @@ def format_table_output(
|
||||
and "Value" in df.columns
|
||||
):
|
||||
mem_data = (
|
||||
pd.DataFrame([df["Metric"], df["Value"]])
|
||||
pd
|
||||
.DataFrame([df["Metric"], df["Value"]])
|
||||
.transpose()
|
||||
.set_index("Metric")
|
||||
.to_dict()["Value"]
|
||||
|
||||
@@ -885,24 +885,48 @@ def run_prof(
|
||||
rocprof_cmd == "rocprofiler-sdk"
|
||||
and options["ROCPROF_COUNTER_COLLECTION"] == "0"
|
||||
):
|
||||
# Update rocpd database with counter csv created by native tool
|
||||
rocpd_data.update_rocpd_pmc_events(
|
||||
pd.read_csv(glob.glob(workload_dir + "/out/pmc_1/*.csv")[0]),
|
||||
glob.glob(workload_dir + "/out/pmc_1/*/*.db")[0],
|
||||
)
|
||||
for db_name in glob.glob(workload_dir + "/out/pmc_1/*/*.db"):
|
||||
pid = Path(db_name).stem.split("_")[0]
|
||||
rocpd_data.update_rocpd_pmc_events(
|
||||
pd.read_csv(
|
||||
f"{workload_dir}/out/pmc_1/{pid}_native_counter_collection.csv"
|
||||
),
|
||||
db_name,
|
||||
)
|
||||
console_debug(f"Updated rocpd db {db_name} with native tool counters.")
|
||||
# Write results_fbase.csv
|
||||
rocpd_data.convert_db_to_csv(
|
||||
glob.glob(workload_dir + "/out/pmc_1/*/*.db")[0],
|
||||
rocpd_data.convert_dbs_to_csv(
|
||||
glob.glob(workload_dir + "/out/pmc_1/*/*.db"),
|
||||
workload_dir + f"/results_{fbase}.csv",
|
||||
)
|
||||
combined_df = pd.read_csv(workload_dir + f"/results_{fbase}.csv")
|
||||
# Reset Dispatch_ID based on PID, Kernel_Name, Grid_Size,
|
||||
# Workgroup_Size, LDS_Per_Workgroup
|
||||
combined_df["Dispatch_ID"] = combined_df.groupby(
|
||||
["PID", "Kernel_Name", "Grid_Size", "Workgroup_Size", "LDS_Per_Workgroup"],
|
||||
sort=False,
|
||||
).ngroup()
|
||||
# Reset Kernel_ID based on Kernel_Name, Grid_Size,
|
||||
# Workgroup_Size, LDS_Per_Workgroup
|
||||
combined_df["Kernel_ID"] = combined_df.groupby(
|
||||
["Kernel_Name", "Grid_Size", "Workgroup_Size", "LDS_Per_Workgroup"],
|
||||
sort=False,
|
||||
).ngroup()
|
||||
# Drop PID since it's not required
|
||||
combined_df = combined_df.drop(columns=["PID"])
|
||||
combined_df.to_csv(workload_dir + f"/results_{fbase}.csv", index=False)
|
||||
|
||||
if retain_rocpd_output:
|
||||
shutil.copyfile(
|
||||
glob.glob(workload_dir + "/out/pmc_1/*/*.db")[0],
|
||||
workload_dir + "/" + fbase + ".db",
|
||||
)
|
||||
console_warning(
|
||||
f"Retaining large raw rocpd database: {workload_dir}/{fbase}.db"
|
||||
)
|
||||
for db_path in glob.glob(workload_dir + "/out/pmc_1/*/*.db"):
|
||||
pid = Path(db_path).stem.split("_")[0]
|
||||
shutil.copyfile(
|
||||
db_path,
|
||||
workload_dir + f"/{fbase}_{pid}.db",
|
||||
)
|
||||
console_warning(
|
||||
f"Retaining large raw rocpd database: "
|
||||
f"{workload_dir}/{fbase}_{pid}.db"
|
||||
)
|
||||
# Remove temp directory
|
||||
shutil.rmtree(workload_dir + "/" + "out")
|
||||
return
|
||||
@@ -1064,81 +1088,66 @@ def convert_native_counter_collection_csv(workload_dir: str) -> None:
|
||||
trace to write counter collection csv in rocprofiler-sdk format
|
||||
for further processing to pmc_perf.csv file
|
||||
"""
|
||||
counter_data = pd.read_csv(
|
||||
glob.glob(f"{workload_dir}/out/pmc_1/*.csv")[0], index_col=False
|
||||
)
|
||||
# Group by on counter_data based on dispatch_id and
|
||||
# counter_id and sum the counter_value
|
||||
counter_data = counter_data.groupby(
|
||||
["dispatch_id", "counter_name"], as_index=False
|
||||
).agg({"counter_value": "sum"})
|
||||
kernel_data_filename = glob.glob(f"{workload_dir}/out/pmc_1/*/*_kernel_trace.csv")[
|
||||
0
|
||||
]
|
||||
kernel_data = pd.read_csv(kernel_data_filename)
|
||||
rocprofv3_counter_data = pd.DataFrame({
|
||||
"Correlation_Id": counter_data["dispatch_id"],
|
||||
"Dispatch_Id": counter_data["dispatch_id"],
|
||||
"Agent_Id": kernel_data.iloc[counter_data["dispatch_id"] - 1][
|
||||
"Agent_Id"
|
||||
].values,
|
||||
"Queue_Id": kernel_data.iloc[counter_data["dispatch_id"] - 1][
|
||||
"Queue_Id"
|
||||
].values,
|
||||
"Process_Id": kernel_data.iloc[counter_data["dispatch_id"] - 1][
|
||||
"Thread_Id"
|
||||
].values,
|
||||
"Thread_Id": kernel_data.iloc[counter_data["dispatch_id"] - 1][
|
||||
"Thread_Id"
|
||||
].values,
|
||||
"Grid_Size": (
|
||||
kernel_data.iloc[counter_data["dispatch_id"] - 1][
|
||||
["Grid_Size_X", "Grid_Size_Y", "Grid_Size_Z"]
|
||||
]
|
||||
.prod(axis=1)
|
||||
.values
|
||||
),
|
||||
"Kernel_Id": kernel_data.iloc[counter_data["dispatch_id"] - 1][
|
||||
"Kernel_Id"
|
||||
].values,
|
||||
"Kernel_Name": kernel_data.iloc[counter_data["dispatch_id"] - 1][
|
||||
"Kernel_Name"
|
||||
].values,
|
||||
"Workgroup_Size": (
|
||||
kernel_data.iloc[counter_data["dispatch_id"] - 1][
|
||||
["Workgroup_Size_X", "Workgroup_Size_Y", "Workgroup_Size_Z"]
|
||||
]
|
||||
.prod(axis=1)
|
||||
.values
|
||||
),
|
||||
"LDS_Block_Size": kernel_data.iloc[counter_data["dispatch_id"] - 1][
|
||||
"LDS_Block_Size"
|
||||
].values,
|
||||
"Scratch_Size": kernel_data.iloc[counter_data["dispatch_id"] - 1][
|
||||
"Scratch_Size"
|
||||
].values,
|
||||
"VGPR_Count": kernel_data.iloc[counter_data["dispatch_id"] - 1][
|
||||
"VGPR_Count"
|
||||
].values,
|
||||
"Accum_VGPR_Count": kernel_data.iloc[counter_data["dispatch_id"] - 1][
|
||||
"Accum_VGPR_Count"
|
||||
].values,
|
||||
"SGPR_Count": kernel_data.iloc[counter_data["dispatch_id"] - 1][
|
||||
"SGPR_Count"
|
||||
].values,
|
||||
"Counter_Name": counter_data["counter_name"],
|
||||
"Counter_Value": counter_data["counter_value"],
|
||||
"Start_Timestamp": kernel_data.iloc[counter_data["dispatch_id"] - 1][
|
||||
"Start_Timestamp"
|
||||
].values,
|
||||
"End_Timestamp": kernel_data.iloc[counter_data["dispatch_id"] - 1][
|
||||
"End_Timestamp"
|
||||
].values,
|
||||
})
|
||||
rocprofv3_counter_data.to_csv(
|
||||
kernel_data_filename.replace("kernel_trace", "counter_collection"),
|
||||
index=False,
|
||||
)
|
||||
for native_filename in glob.glob(
|
||||
f"{workload_dir}/out/pmc_1/*_native_counter_collection.csv"
|
||||
):
|
||||
counter_data = pd.read_csv(native_filename, index_col=False)
|
||||
# Group by dispatch_id and counter_name and sum the counter_value,
|
||||
# Other columns in each group have the same value, so take the first one
|
||||
groupby_cols = ["dispatch_id", "counter_name"]
|
||||
agg_dict = {
|
||||
col: "first" for col in counter_data.columns if col not in groupby_cols
|
||||
}
|
||||
# Overwrite counter_value aggregation to sum
|
||||
agg_dict["counter_value"] = "sum"
|
||||
counter_data = counter_data.groupby(groupby_cols, as_index=False).agg(agg_dict)
|
||||
|
||||
pid = Path(native_filename).stem.split("_")[0]
|
||||
kernel_data_filename = glob.glob(
|
||||
f"{workload_dir}/out/pmc_1/*/{pid}_kernel_trace.csv"
|
||||
)[0]
|
||||
kernel_data = pd.read_csv(kernel_data_filename)
|
||||
|
||||
# Merge counter_data with kernel_data on kernel_id
|
||||
merged_data = pd.merge(
|
||||
counter_data,
|
||||
kernel_data,
|
||||
left_on="kernel_id",
|
||||
right_on="Kernel_Id",
|
||||
how="left",
|
||||
)
|
||||
|
||||
rocprofv3_counter_data = pd.DataFrame({
|
||||
"Correlation_Id": merged_data["dispatch_id"],
|
||||
"Dispatch_Id": merged_data["dispatch_id"],
|
||||
"Agent_Id": merged_data["Agent_Id"],
|
||||
"Queue_Id": merged_data["Queue_Id"],
|
||||
"Process_Id": merged_data["Thread_Id"],
|
||||
"Thread_Id": merged_data["Thread_Id"],
|
||||
"Grid_Size": (
|
||||
merged_data[["Grid_Size_X", "Grid_Size_Y", "Grid_Size_Z"]].prod(axis=1)
|
||||
),
|
||||
"Kernel_Id": merged_data["Kernel_Id"],
|
||||
"Kernel_Name": merged_data["Kernel_Name"],
|
||||
"Workgroup_Size": (
|
||||
merged_data[
|
||||
["Workgroup_Size_X", "Workgroup_Size_Y", "Workgroup_Size_Z"]
|
||||
].prod(axis=1)
|
||||
),
|
||||
"LDS_Block_Size": merged_data["LDS_Block_Size"],
|
||||
"Scratch_Size": merged_data["Scratch_Size"],
|
||||
"VGPR_Count": merged_data["VGPR_Count"],
|
||||
"Accum_VGPR_Count": merged_data["Accum_VGPR_Count"],
|
||||
"SGPR_Count": merged_data["SGPR_Count"],
|
||||
"Counter_Name": merged_data["counter_name"],
|
||||
"Counter_Value": merged_data["counter_value"],
|
||||
"Start_Timestamp": merged_data["Start_Timestamp"],
|
||||
"End_Timestamp": merged_data["End_Timestamp"],
|
||||
})
|
||||
rocprofv3_counter_data.to_csv(
|
||||
kernel_data_filename.replace("kernel_trace", "counter_collection"),
|
||||
index=False,
|
||||
)
|
||||
|
||||
|
||||
def process_rocprofv3_output(workload_dir: str, using_native_tool: bool) -> list[str]:
|
||||
|
||||
@@ -67,3 +67,11 @@ set_target_properties(
|
||||
laplace_eqn
|
||||
PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests
|
||||
)
|
||||
|
||||
set(ROCFLOP_SOURCES ../sample/rocflop.cpp)
|
||||
set_source_files_properties(${ROCFLOP_SOURCES} PROPERTIES LANGUAGE HIP)
|
||||
add_executable(rocflop ${ROCFLOP_SOURCES})
|
||||
set_target_properties(
|
||||
rocflop
|
||||
PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests
|
||||
)
|
||||
|
||||
@@ -68,6 +68,7 @@ config["app_mat_mul_max"] = ["./tests/mat_mul_max"]
|
||||
config["app_hip_dynamic_shared"] = ["./tests/hip_dynamic_shared"]
|
||||
config["app_laplace_eqn"] = ["./tests/laplace_eqn", "-i", "5000"]
|
||||
config["app_laplace_eqn_iter"] = ["./tests/laplace_eqn", "-i", "15000"]
|
||||
config["rocflop"] = ["./tests/rocflop", "--device", "0"]
|
||||
config["cleanup"] = True
|
||||
config["COUNTER_LOGGING"] = False
|
||||
config["METRIC_COMPARE"] = False
|
||||
@@ -637,6 +638,29 @@ def test_path(binary_handler_profile_rocprof_compute):
|
||||
test_utils.clean_output_dir(config["cleanup"], workload_dir)
|
||||
|
||||
|
||||
@pytest.mark.path
|
||||
def test_path_rocflop(
|
||||
binary_handler_profile_rocprof_compute,
|
||||
):
|
||||
# Test whether multiprocess workloads like rocflop are handled correctly
|
||||
workload_dir = test_utils.get_output_dir()
|
||||
options = ["--block", "2.1.1"]
|
||||
_ = binary_handler_profile_rocprof_compute(
|
||||
config,
|
||||
workload_dir,
|
||||
options,
|
||||
check_success=True,
|
||||
roof=False,
|
||||
app_name="rocflop",
|
||||
)
|
||||
pmc_perf_df = test_utils.check_csv_files(workload_dir, num_devices, num_kernels)[
|
||||
"pmc_perf.csv"
|
||||
]
|
||||
# Ensure non zero length of df
|
||||
assert len(pmc_perf_df) > 0
|
||||
test_utils.clean_output_dir(config["cleanup"], workload_dir)
|
||||
|
||||
|
||||
@pytest.mark.path
|
||||
def test_path_no_native(binary_handler_profile_rocprof_compute):
|
||||
workload_dir = test_utils.get_output_dir()
|
||||
|
||||
Reference in New Issue
Block a user