[rocprofiler-compute] Fix for multi process workload profiling (#2418)

* Fix for multi process workload profiling Native counter collection tool updates: * Do not dump empty counter data for a process * Use PID instead of UUID for dumped csv files to facilitate correlation * Handle merging multiple pairs of rocpd (from sdk tool) and csv (from native tool) files * Handle merging multiple pairs of csv (from sdk tool) and csv (from native tool) files Rocpd output format updates: * Merge multiple rocpd databases into a single csv * Reset dispatch id and kernel id for unique dispatches and unique kernels respectively * Retain multiple rocpd databases per run for multi process workloads * Add test case for multiprocess profiling using rocflop workload * Add rocflop * Fix native counter csv to rocprofv3 csv conversion * Use kernel_id instead of dispatch_id to correlate native counter csv and kernel trace csv * python formatting using ruff 0.14 instead of 0.13
2025-12-23 13:12:18 -05:00
commit 588773f9bf
@@ -45,6 +45,9 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
  * Fix the check to prevent showing table where a column is full of N/A
  * Improve detection of empty values when metric evalulation fails due to counter data missing

+* Fix incorrect logic in the native counter CSV to rocprofv3 CSV conversion
+  * Use kernel_id instead of dispatch_id to correlate the native counter CSV with the kernel trace CSV
+
 ### Removed

 * Removed "VL1 Lat" metric for AMD Instinct MI300 series GPUs, due to MI300 series not supporting TCP_TCP_LATENCY_sum counter.
@@ -748,6 +748,7 @@ if(INSTALL_TESTS)
            tests/hip_dynamic_shared
            tests/laplace_eqn
            tests/mat_mul_max
+            tests/rocflop
        DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/${PROJECT_NAME}/tests
        COMPONENT tests
    )
@@ -0,0 +1,681 @@
+// Copied from https://github.com/benrichard-amd/rocflop/tree/82f197e12314bab694fc70451a2b495b4f51bf90
+
+#include <algorithm>
+#include <cassert>
+#include <cerrno>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <fcntl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+
+// Half-precision scalar type used by the FP16 kernels below.
+using float16 = _Float16;
+
+// Vector types. Useful for packed math (where supported) and MFMA inputs.
+// vecT<T, Rank> is a Clang ext_vector_type holding Rank elements of type T.
+template<typename T, uint32_t Rank>
+using vecT = T __attribute__((ext_vector_type(Rank)));
+
+// Convenience aliases for the 4- and 8-element vectors used by the kernels.
+template<typename T> using vec4 = vecT<T, 4>;
+template<typename T> using vec8 = vecT<T, 8>;
+
+
+// Kernels
+
+
+// Multiply-add throughput kernel.
+//
+// Each thread loads four vec4<T> values from `buffer` (at stride
+// gridDim.x * blockDim.x), then runs `count` * 64 iterations of four
+// vector multiply-adds (16 scalar operations per iteration) on them.
+//
+// buffer: must hold at least 4 * gridDim.x * blockDim.x vec4<T> elements.
+// count:  number of outer iterations; each runs 64 inner iterations.
+//
+// The sum of the four values is stored back to buffer[tid]; presumably this
+// keeps the compiler from discarding the arithmetic.
+template<typename T> __global__ void fma_throughput(vec4<T>* buffer, int count)
+{
+    const T k = 1.0;
+
+    const int grid_size = gridDim.x * blockDim.x;
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+    vec4<T>* ptr = buffer;
+
+    vec4<T> value0 = ptr[0 * grid_size + tid];
+    vec4<T> value1 = ptr[1 * grid_size + tid];
+    vec4<T> value2 = ptr[2 * grid_size + tid];
+    vec4<T> value3 = ptr[3 * grid_size + tid];
+
+    // Distinct loop variables: the original reused `j` for both loops, with
+    // the inner declaration shadowing the outer one.
+    for(int i = 0; i < count; i++) {
+        for(int j = 0; j < 64; j++) {
+
+            // 16 FMA ops
+            value0 = value0 * value0 + k;
+            value1 = value1 * value1 + k;
+            value2 = value2 * value2 + k;
+            value3 = value3 * value3 + k;
+        }
+    }
+
+    ptr[tid] = value0 + value1 + value2 + value3;
+}
+
+// FP16 MFMA throughput kernel.
+//
+// Each thread loads four vec4<float16> operands from `inputs` (at stride
+// gridDim.x * blockDim.x) and issues `count` * 64 * 4 calls to
+// __builtin_amdgcn_mfma_f32_16x16x16f16, accumulating in FP32.
+//
+// inputs:  must hold at least 4 * gridDim.x * blockDim.x vec4<float16> elements.
+// outputs: must hold at least gridDim.x * blockDim.x vec4<float> elements.
+// count:   number of outer iterations; each runs 64 inner iterations.
+__global__ void matmul_fp16_throughput(vec4<float16>* inputs, vec4<float>* outputs, int count)
+{
+    int grid_size = gridDim.x * blockDim.x;
+    int tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+    vec4<float16>* ptr = inputs;
+
+    vec4<float16> value0 = ptr[0 * grid_size + tid];
+    vec4<float16> value1 = ptr[1 * grid_size + tid];
+    vec4<float16> value2 = ptr[2 * grid_size + tid];
+    vec4<float16> value3 = ptr[3 * grid_size + tid];
+
+    // Accumulators are inputs to the first MFMA, so they must start from a
+    // defined value; the original read them uninitialized.
+    vec4<float> accum0 = {0.0f, 0.0f, 0.0f, 0.0f};
+    vec4<float> accum1 = {0.0f, 0.0f, 0.0f, 0.0f};
+    vec4<float> accum2 = {0.0f, 0.0f, 0.0f, 0.0f};
+    vec4<float> accum3 = {0.0f, 0.0f, 0.0f, 0.0f};
+    for(int i = 0; i < count; i++) {
+        for(int j = 0; j < 64; j++) {
+            // 4 MFMA ops
+            accum0 = __builtin_amdgcn_mfma_f32_16x16x16f16(value0, value0, accum0, 0, 0, 0);
+            accum1 = __builtin_amdgcn_mfma_f32_16x16x16f16(value1, value1, accum1, 0, 0, 0);
+            accum2 = __builtin_amdgcn_mfma_f32_16x16x16f16(value2, value2, accum2, 0, 0, 0);
+            accum3 = __builtin_amdgcn_mfma_f32_16x16x16f16(value3, value3, accum3, 0, 0, 0);
+        }
+    }
+
+    outputs[tid] = accum0 + accum1 + accum2 + accum3;
+}
+
+// FP16 sparse MFMA (SMFMAC) throughput kernel.
+//
+// Each thread loads four vec4<float16> operands from `input0` and four
+// vec8<float16> operands from `input1` (both at stride
+// gridDim.x * blockDim.x) and issues `count` * 64 * 4 calls to
+// __builtin_amdgcn_smfmac_f32_16x16x32_f16, accumulating in FP32.
+//
+// input0:  must hold at least 4 * gridDim.x * blockDim.x vec4<float16> elements.
+// input1:  must hold at least 4 * gridDim.x * blockDim.x vec8<float16> elements.
+// outputs: must hold at least gridDim.x * blockDim.x vec4<float> elements.
+// count:   number of outer iterations; each runs 64 inner iterations.
+__global__ void sparse_matmul_fp16_throughput(vec4<float16>* input0, vec8<float16>* input1, vec4<float>* outputs, int count)
+{
+    int grid_size = gridDim.x * blockDim.x;
+    int tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+    vec4<float16>* x_ptr = input0;
+    vec8<float16>* y_ptr = input1;
+
+    vec4<float16> x0 = x_ptr[0 * grid_size + tid];
+    vec4<float16> x1 = x_ptr[1 * grid_size + tid];
+    vec4<float16> x2 = x_ptr[2 * grid_size + tid];
+    vec4<float16> x3 = x_ptr[3 * grid_size + tid];
+
+    vec8<float16> y0 = y_ptr[0 * grid_size + tid];
+    vec8<float16> y1 = y_ptr[1 * grid_size + tid];
+    vec8<float16> y2 = y_ptr[2 * grid_size + tid];
+    vec8<float16> y3 = y_ptr[3 * grid_size + tid];
+
+    // Accumulators are inputs to the first SMFMAC, so they must start from a
+    // defined value; the original read them uninitialized.
+    vec4<float> accum0 = {0.0f, 0.0f, 0.0f, 0.0f};
+    vec4<float> accum1 = {0.0f, 0.0f, 0.0f, 0.0f};
+    vec4<float> accum2 = {0.0f, 0.0f, 0.0f, 0.0f};
+    vec4<float> accum3 = {0.0f, 0.0f, 0.0f, 0.0f};
+
+    for(int i = 0; i < count; i++) {
+        for(int j = 0; j < 64; j++) {
+            // 4 SMFMAC ops
+            accum0 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x0, y0, accum0, 0, 0, 0);
+            accum1 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x1, y1, accum1, 0, 0, 0);
+            accum2 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x2, y2, accum2, 0, 0, 0);
+            accum3 = __builtin_amdgcn_smfmac_f32_16x16x32_f16(x3, y3, accum3, 0, 0, 0);
+        }
+    }
+
+    outputs[tid] = accum0 + accum1 + accum2 + accum3;
+}
+
+// FP32 MFMA throughput kernel.
+//
+// Each thread loads four float operands from `inputs` (at stride
+// gridDim.x * blockDim.x) and issues `count` * 64 * 4 calls to
+// __builtin_amdgcn_mfma_f32_16x16x4f32, accumulating in FP32.
+//
+// inputs:  must hold at least 4 * gridDim.x * blockDim.x floats.
+// outputs: must hold at least gridDim.x * blockDim.x vec4<float> elements.
+// count:   number of outer iterations; each runs 64 inner iterations.
+__global__ void matmul_fp32_throughput(float* inputs, vec4<float>* outputs, int count)
+{
+    int grid_size = gridDim.x * blockDim.x;
+    int tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+    float* ptr = inputs;
+
+    float value0 = ptr[0 * grid_size + tid];
+    float value1 = ptr[1 * grid_size + tid];
+    float value2 = ptr[2 * grid_size + tid];
+    // Fourth operand comes from the fourth slot; the original re-read slot 2.
+    float value3 = ptr[3 * grid_size + tid];
+
+    // Accumulators are inputs to the first MFMA, so they must start from a
+    // defined value; the original read them uninitialized.
+    vec4<float> accum0 = {0.0f, 0.0f, 0.0f, 0.0f};
+    vec4<float> accum1 = {0.0f, 0.0f, 0.0f, 0.0f};
+    vec4<float> accum2 = {0.0f, 0.0f, 0.0f, 0.0f};
+    vec4<float> accum3 = {0.0f, 0.0f, 0.0f, 0.0f};
+    for(int i = 0; i < count; i++) {
+        for(int j = 0; j < 64; j++) {
+            // 4 MFMA ops
+            accum0 = __builtin_amdgcn_mfma_f32_16x16x4f32(value0, value0, accum0, 0, 0, 0);
+            accum1 = __builtin_amdgcn_mfma_f32_16x16x4f32(value1, value1, accum1, 0, 0, 0);
+            accum2 = __builtin_amdgcn_mfma_f32_16x16x4f32(value2, value2, accum2, 0, 0, 0);
+            accum3 = __builtin_amdgcn_mfma_f32_16x16x4f32(value3, value3, accum3, 0, 0, 0);
+        }
+    }
+
+    outputs[tid] = accum0 + accum1 + accum2 + accum3;
+}
+
+// Checks the status of a HIP API call. On any status other than hipSuccess,
+// prints the numeric code and its description, then terminates the process
+// with exit code 1.
+void HIP_CALL(hipError_t err)
+{
+    if(err == hipSuccess) {
+        return;
+    }
+
+    std::cout << "HIP Error: " << static_cast<int>(err) << " " << hipGetErrorString(err) << std::endl;
+    exit(1);
+}
+
+// GPU architecture version split into its components. get_gcn_arch() fills
+// these from the hexadecimal value that follows "gfx" in the device's
+// architecture name.
+struct GCNArch {
+    int major;  // bits 8-15 of the parsed value
+    int minor;  // bits 4-7 of the parsed value
+    int rev;    // bits 0-3 of the parsed value
+};
+
+// Returns the architecture version of `device`, parsed from the device
+// property gcnArchName.
+//
+// The text between the 3-character "gfx" prefix and the first ':' (or the
+// end of the string) is parsed as a hexadecimal number and split into
+// major / minor / rev. Exits with code 1 if the property query fails or the
+// name cannot be parsed.
+GCNArch get_gcn_arch(int device)
+{
+    hipDeviceProp_t props;
+
+    HIP_CALL(hipGetDeviceProperties(&props, device));
+
+    // Example: gfx908:sramecc+:xnack-
+    const std::string arch_full(props.gcnArchName);
+
+    const std::string::size_type start = 3;  // length of the "gfx" prefix
+    const std::string::size_type end = arch_full.find(':');
+
+    int gfx_num = 0;
+    bool parsed = false;
+
+    if(arch_full.size() > start && (end == std::string::npos || end > start)) {
+        // Extract number e.g. "908". substr() takes a length, not an end
+        // index; the original passed the ':' position as the length and only
+        // worked because std::stoi stops at the ':'.
+        const std::string::size_type len =
+            (end == std::string::npos) ? std::string::npos : end - start;
+        const std::string gfx_str = arch_full.substr(start, len);
+
+        try {
+            gfx_num = std::stoi(gfx_str, nullptr, 16);
+            parsed = true;
+        } catch(const std::exception&) {
+            parsed = false;
+        }
+    }
+
+    if(!parsed) {
+        std::cout << "Unable to parse GPU architecture name '" << arch_full << "'" << std::endl;
+        exit(1);
+    }
+
+    GCNArch arch;
+    arch.major = (gfx_num & 0xff00) >> 8;
+    arch.minor = (gfx_num & 0x00f0) >> 4;
+    arch.rev   = (gfx_num & 0x000f);
+
+    return arch;
+}
+
+// Bit flags selecting which throughput tests to run and print. Flags are
+// OR-ed together into a uint32_t mask.
+enum : uint32_t {
+    VALU_FP32   = 1 << 0,
+    VALU_FP16   = 1 << 1,
+    VALU_FP64   = 1 << 2,
+    MATRIX_FP16 = 1 << 3,
+    MATRIX_FP32 = 1 << 4,
+    SMATRIX_FP16 = 1 << 5,
+    VALU_INT32  = 1 << 6,
+
+    // Every bit set: selects all tests.
+    ALL         = (uint32_t)-1
+};
+
+// Timer for measuring kernel duration.
+//
+// Owns a pair of HIP events. start() and stop() record them (no stream
+// argument is passed to hipEventRecord); elapsed() reports the time between
+// the two. Callers in this file synchronize the device before calling
+// elapsed().
+class HIPTimer {
+
+private:
+    hipEvent_t m_start;
+    hipEvent_t m_stop;
+
+public:
+    HIPTimer()
+    {
+        HIP_CALL(hipEventCreate(&m_start));
+        HIP_CALL(hipEventCreate(&m_stop));
+    }
+
+    // Release the events; the original never destroyed them. Failures are
+    // ignored because a destructor should not terminate the process.
+    ~HIPTimer()
+    {
+        (void)hipEventDestroy(m_start);
+        (void)hipEventDestroy(m_stop);
+    }
+
+    // The events are owned exclusively; copying would destroy them twice.
+    HIPTimer(const HIPTimer&) = delete;
+    HIPTimer& operator=(const HIPTimer&) = delete;
+
+    // Records the start event.
+    void start()
+    {
+        HIP_CALL(hipEventRecord(m_start));
+    }
+
+    // Records the stop event.
+    void stop()
+    {
+        HIP_CALL(hipEventRecord(m_stop));
+    }
+
+    // Returns the time between the start and stop events in seconds
+    // (hipEventElapsedTime reports milliseconds).
+    double elapsed()
+    {
+        float ms;
+        HIP_CALL(hipEventElapsedTime(&ms, m_start, m_stop));
+
+        return (double)ms / 1000.0;
+    }
+};
+
+// Host code
+
+// Runs the fma_throughput<T> kernel `runs` times and returns the measured
+// rate in operations per second.
+//
+// device: only used to query the compute-unit count; the kernel is launched
+//         on whatever device is current.
+// count:  outer iteration count passed to the kernel.
+// runs:   number of back-to-back kernel dispatches that are timed together.
+//
+// The launch uses multiProcessorCount * 512 blocks of 64 threads. The rate
+// counts each multiply-add as two operations. The input buffer is not
+// initialized before the kernel reads it.
+template<typename T> double fma_throughput_test(int device, int count, int runs = 1)
+{
+    vec4<T>* buffer = nullptr;
+
+    hipDeviceProp_t props;
+    HIP_CALL(hipGetDeviceProperties(&props, device));
+
+    int blocks = props.multiProcessorCount * 512;
+    int threads_per_block = 64;
+    int total_threads = blocks * threads_per_block;
+
+    // The kernel reads four vec4<T> values per thread.
+    HIP_CALL(hipMalloc(&buffer, sizeof(vec4<T>) * total_threads * 4));
+
+    HIPTimer t;
+    t.start();
+    for(int i = 0; i < runs; i++) {
+        fma_throughput<T><<<blocks, threads_per_block>>>(buffer, count);
+    }
+    t.stop();
+    // Kernel launches do not return a status; surface launch failures here
+    // instead of reporting a meaningless throughput.
+    HIP_CALL(hipGetLastError());
+    HIP_CALL(hipDeviceSynchronize());
+
+    double elapsed = t.elapsed();
+    // 64 inner iterations of 16 multiply-adds per outer iteration per thread.
+    double ops = (double)total_threads * count * 64 * 16 * runs;
+    double flops = (double)ops * 2.0 / elapsed;
+
+    HIP_CALL(hipFree(buffer));
+
+    return flops;
+}
+
+// Runs the dense MFMA throughput kernel for the given input type `runs`
+// times and returns the measured rate in operations per second.
+//
+// Supported combinations: <float16, float> (16x16x16 MFMA) and
+// <float, float> (16x16x4 MFMA). Any other combination is rejected at
+// compile time; the original only asserted at run time and left m/n/k
+// uninitialized when assertions were disabled.
+//
+// device: only used to query the compute-unit count; the kernel is launched
+//         on whatever device is current.
+// count:  outer iteration count passed to the kernel.
+// runs:   number of back-to-back kernel dispatches that are timed together.
+template<typename matT, typename accumT> double matmul_throughput_test(int device, int count, int runs = 1)
+{
+    constexpr bool is_fp16 = std::is_same<matT, float16>::value;
+    constexpr bool is_fp32 = std::is_same<matT, float>::value;
+    static_assert(is_fp16 || is_fp32, "matmul_throughput_test supports float16 and float inputs only");
+    static_assert(std::is_same<accumT, float>::value, "matmul_throughput_test supports float accumulation only");
+
+    const int wave_size = 64;
+    const int m = 16;
+    const int n = 16;
+    const int k = is_fp16 ? 16 : 4;
+
+    // Each multiply-accumulate counts as two operations.
+    int ops_per_matmul = k * m * n * 2;
+
+    void* buffer = nullptr;
+    void* accum = nullptr;
+
+    hipDeviceProp_t props;
+    HIP_CALL(hipGetDeviceProperties(&props, device));
+
+    int blocks = props.multiProcessorCount * 512;
+    int threads_per_block = wave_size;
+    size_t total_threads = (size_t)blocks * threads_per_block;
+
+    // Size the buffers to what the kernels actually touch: four input
+    // elements per thread (vec4<float16> for FP16, float for FP32) and one
+    // vec4<float> output per thread. The original reserved a full matrix
+    // per thread, which amounts to many gigabytes on large devices.
+    const size_t input_elem_size = is_fp16 ? sizeof(vec4<float16>) : sizeof(float);
+    HIP_CALL(hipMalloc(&buffer, 4 * input_elem_size * total_threads));
+    HIP_CALL(hipMalloc(&accum, sizeof(vec4<float>) * total_threads));
+
+    HIPTimer t;
+    t.start();
+    for(int i = 0; i < runs; i++) {
+        if(is_fp16) {
+            matmul_fp16_throughput<<<blocks, threads_per_block>>>((vec4<float16>*)buffer, (vec4<float>*)accum, count);
+        } else {
+            matmul_fp32_throughput<<<blocks, threads_per_block>>>((float*)buffer, (vec4<float>*)accum, count);
+        }
+    }
+    t.stop();
+    // Kernel launches do not return a status; surface launch failures here
+    // instead of reporting a meaningless throughput.
+    HIP_CALL(hipGetLastError());
+    HIP_CALL(hipDeviceSynchronize());
+
+    double elapsed = t.elapsed();
+    // One wave per block; 64 inner iterations of 4 MFMA ops per outer iteration.
+    double ops = (double)blocks * count * 64 * 4 * runs;
+    double flops = (double)ops * ops_per_matmul / elapsed;
+
+    HIP_CALL(hipFree(buffer));
+    HIP_CALL(hipFree(accum));
+
+    return flops;
+}
+
+// Runs the sparse MFMA (SMFMAC) throughput kernel `runs` times and returns
+// the measured rate in operations per second.
+//
+// Only <float16, float> (16x16x32 SMFMAC) is supported. Any other
+// combination is rejected at compile time; the original only asserted at
+// run time and left m/n/k uninitialized when assertions were disabled.
+//
+// device: only used to query the compute-unit count; the kernel is launched
+//         on whatever device is current.
+// count:  outer iteration count passed to the kernel.
+// runs:   number of back-to-back kernel dispatches that are timed together.
+template<typename matT, typename accumT> double sparse_matmul_throughput_test(int device, int count, int runs = 1)
+{
+    static_assert(std::is_same<matT, float16>::value, "sparse_matmul_throughput_test supports float16 inputs only");
+    static_assert(std::is_same<accumT, float>::value, "sparse_matmul_throughput_test supports float accumulation only");
+
+    const int wave_size = 64;
+    const int m = 16;
+    const int n = 16;
+    const int k = 32;
+
+    // Each multiply-accumulate counts as two operations.
+    int ops_per_matmul = k * m * n * 2;
+
+    void* buffer1 = nullptr;
+    void* buffer2 = nullptr;
+    void* accum = nullptr;
+
+    hipDeviceProp_t props;
+    HIP_CALL(hipGetDeviceProperties(&props, device));
+
+    int blocks = props.multiProcessorCount * 512;
+    int threads_per_block = wave_size;
+    size_t total_threads = (size_t)blocks * threads_per_block;
+
+    // Size the buffers to what the kernel actually touches: four
+    // vec4<float16> and four vec8<float16> inputs per thread, and one
+    // vec4<float> output per thread. The original reserved full matrices
+    // per thread, which amounts to tens of gigabytes on large devices.
+    HIP_CALL(hipMalloc(&buffer1, 4 * sizeof(vec4<float16>) * total_threads));
+    HIP_CALL(hipMalloc(&buffer2, 4 * sizeof(vec8<float16>) * total_threads));
+    HIP_CALL(hipMalloc(&accum, sizeof(vec4<float>) * total_threads));
+
+    HIPTimer t;
+    t.start();
+    for(int i = 0; i < runs; i++) {
+        sparse_matmul_fp16_throughput<<<blocks, threads_per_block>>>((vec4<float16>*)buffer1,
+        (vec8<float16>*)buffer2, (vec4<float>*)accum, count);
+    }
+    t.stop();
+    // Kernel launches do not return a status; surface launch failures here
+    // instead of reporting a meaningless throughput.
+    HIP_CALL(hipGetLastError());
+    HIP_CALL(hipDeviceSynchronize());
+
+    double elapsed = t.elapsed();
+    // One wave per block; 64 inner iterations of 4 SMFMAC ops per outer iteration.
+    double ops = (double)blocks * count * 64 * 4 * runs;
+    double flops = (double)ops * ops_per_matmul / elapsed;
+
+    HIP_CALL(hipFree(buffer1));
+    HIP_CALL(hipFree(buffer2));
+    HIP_CALL(hipFree(accum));
+
+    return flops;
+}
+
+// Throughput results for one device, in operations per second. This struct
+// is written byte-for-byte through a pipe from the child process to the
+// parent, so it holds plain values only.
+struct Result {
+    int device = -1;          // device ordinal, -1 when unset
+    double valu_fp16 = 0;
+    double valu_fp32 = 0;
+    double valu_fp64 = 0;
+    double valu_int32 = 0;
+    double mfma_fp16 = 0;
+    double mfma_fp32 = 0;
+    double smfmac_fp16 = 0;
+
+    // Used for sorting: orders results by device ordinal. Const-qualified so
+    // it can be called on const objects and references.
+    bool operator<(const Result& other) const {
+        return device < other.device;
+    }
+};
+
+// Prints one line per test selected in `mask`, with the stored rate divided
+// by 1e12. Lines appear in a fixed order regardless of the bit order in
+// `mask`.
+void print_result(const Result& res, uint32_t mask)
+{
+    // Prints `value` with `fmt` when `flag` is selected.
+    const auto emit = [mask](uint32_t flag, const char* fmt, double value) {
+        if((mask & flag) == 0) {
+            return;
+        }
+        printf(fmt, value / 1e12);
+    };
+
+    emit(VALU_FP16, "VALU FP16: %8.2f TFLOPS\n", res.valu_fp16);
+    emit(VALU_FP32, "VALU FP32: %8.2f TFLOPS\n", res.valu_fp32);
+    emit(VALU_FP64, "VALU FP64: %8.2f TFLOPS\n", res.valu_fp64);
+    emit(VALU_INT32, "VALU INT32: %8.2f TIOPS\n", res.valu_int32);
+    emit(MATRIX_FP16, "MFMA FP16: %8.2f TFLOPS\n", res.mfma_fp16);
+    emit(MATRIX_FP32, "MFMA FP32: %8.2f TFLOPS\n", res.mfma_fp32);
+    emit(SMATRIX_FP16, "SMFMAC FP16: %8.2f TFLOPS\n", res.smfmac_fp16);
+}
+
+// Runs every test selected in `mask` on `device` and returns the measured
+// rates. Tests that are not selected, or that the device architecture does
+// not qualify for, are left at 0.
+//
+// Exits with code 1 when `device` is not below the device count or when a
+// HIP call fails.
+Result run_tests(int device, int runs, uint32_t mask)
+{
+    int num_devices;
+    HIP_CALL(hipGetDeviceCount(&num_devices));
+
+    if(device >= num_devices) {
+        std::cout << "Device " << device << " does not exist. Skipping..." << std::endl;
+        exit(1);
+    }
+
+    HIP_CALL(hipSetDevice(device));
+    const GCNArch arch = get_gcn_arch(device);
+
+    // Architecture gates for the matrix tests.
+    const bool is_gfx9 = (arch.major == 0x9);
+    const bool dense_mfma_ok = is_gfx9 && (arch.minor >= 0x4 || (arch.minor == 0 && arch.rev >= 8));
+    const bool sparse_mfma_ok = is_gfx9 && arch.minor >= 4;
+
+    // Outer iteration count handed to every kernel.
+    const int iterations = 4096;
+
+    // All rate fields default to 0, so skipped tests need no explicit reset.
+    Result out;
+    out.device = device;
+
+    if(mask & VALU_FP16) {
+        out.valu_fp16 = fma_throughput_test<float16>(device, iterations, runs);
+    }
+    if(mask & VALU_FP32) {
+        out.valu_fp32 = fma_throughput_test<float>(device, iterations, runs);
+    }
+    if(mask & VALU_FP64) {
+        out.valu_fp64 = fma_throughput_test<double>(device, iterations, runs);
+    }
+    if(mask & VALU_INT32) {
+        out.valu_int32 = fma_throughput_test<int>(device, iterations, runs);
+    }
+    if((mask & MATRIX_FP16) && dense_mfma_ok) {
+        out.mfma_fp16 = matmul_throughput_test<float16, float>(device, iterations, runs);
+    }
+    if((mask & MATRIX_FP32) && dense_mfma_ok) {
+        out.mfma_fp32 = matmul_throughput_test<float, float>(device, iterations, runs);
+    }
+    if((mask & SMATRIX_FP16) && sparse_mfma_ok) {
+        out.smfmac_fp16 = sparse_matmul_throughput_test<float16, float>(device, iterations, runs);
+    }
+
+    return out;
+}
+
+// Use fork() followed by exec() to run child process. For some reason
+// rocprof does not pick up the child processes when only fork() is
+// used.
+//
+// The child re-executes this binary (/proc/self/exe) with argv[0] set to
+// "CHILD" followed by the device, run count, mask and pipe descriptor as
+// decimal strings.
+//
+// Returns the child's pid in the parent, or -1 if fork() failed (an error is
+// printed in that case). Never returns in the child.
+pid_t fork_process(int device, int runs, uint32_t mask, int fd)
+{
+    pid_t pid = fork();
+
+    if(pid < 0) {
+        std::cerr << "fork() failed: " << std::strerror(errno) << std::endl;
+        return pid;
+    }
+
+    if(pid != 0) {
+        return pid;
+    }
+
+    std::string str_device = std::to_string(device);
+    std::string str_runs = std::to_string(runs);
+    std::string str_mask = std::to_string(mask);
+    std::string str_fd = std::to_string(fd);
+
+    char* const args[] = {
+        (char*)"CHILD",
+        (char*)str_device.c_str(),
+        (char*)str_runs.c_str(),
+        (char*)str_mask.c_str(),
+        (char*)str_fd.c_str(),
+        NULL
+    };
+
+    execv("/proc/self/exe", args);
+
+    // Only reached when execv() fails. Report on the unbuffered error stream
+    // and leave with _exit() so the forked copy does not run the parent's
+    // exit handlers or flush its inherited stdio buffers a second time.
+    std::cerr << "execv() failed: " << std::strerror(errno) << std::endl;
+    _exit(1);
+}
+
+// Runs the selected tests on every device in `devices`, one child process
+// per device, then prints the per-device results and a system-wide total.
+//
+// Children report their Result through a shared pipe. Devices whose child
+// could not be started, or whose child did not report a result, are left
+// out of the output.
+void run(std::vector<int>& devices, int runs, uint32_t mask)
+{
+    std::vector<pid_t> pids;
+
+    // We will receive results from the child processes using a pipe
+    int fd[2];
+
+    if(pipe(fd)) {
+        std::cout << std::strerror(errno) << std::endl;
+        exit(1);
+    }
+
+    // Start a new process for each GPU
+    for(auto d : devices) {
+        pid_t pid = fork_process(d, runs, mask, fd[1]);
+
+        // A negative pid means the process was not created. Passing it to
+        // waitpid() would wait for an arbitrary child instead.
+        if(pid < 0) {
+            std::cerr << "Unable to start a process for device " << d << std::endl;
+            continue;
+        }
+
+        pids.push_back(pid);
+    }
+
+    // The parent only reads from the pipe; every child holds its own copy of
+    // the write end.
+    close(fd[1]);
+
+    // Wait for all processes to finish
+    for(auto pid : pids) {
+        int status;
+        waitpid(pid, &status, 0);
+    }
+
+    // Set the read to non-blocking
+    int flags = fcntl(fd[0], F_GETFL, 0);
+    fcntl(fd[0], F_SETFL, flags | O_NONBLOCK);
+
+    // Read records from pipe. read() may return -1 (for example when no
+    // child wrote anything), so the byte count is tracked explicitly rather
+    // than dividing the raw return value.
+    std::vector<Result> results(pids.size());
+    char* dst = reinterpret_cast<char*>(results.data());
+    const size_t capacity = results.size() * sizeof(Result);
+    size_t received = 0;
+
+    while(received < capacity) {
+        ssize_t n = read(fd[0], dst + received, capacity - received);
+        if(n > 0) {
+            received += (size_t)n;
+        } else if(n < 0 && errno == EINTR) {
+            continue;
+        } else {
+            // End of data, nothing available, or a read error.
+            break;
+        }
+    }
+    close(fd[0]);
+
+    results.resize(received / sizeof(Result));
+
+    if(results.size() < pids.size()) {
+        std::cerr << "Received " << results.size() << " of " << pids.size()
+                  << " expected results" << std::endl;
+    }
+
+    // Sort results by GPU id
+    std::sort(results.begin(), results.end(),
+              [](const Result& a, const Result& b) { return a.device < b.device; });
+
+    // Print results
+    for(const auto& r : results) {
+        std::cout << std::endl << "GPU " << r.device << std::endl;
+        print_result(r, mask);
+    }
+
+    Result total;
+    for(const auto& r : results) {
+        total.valu_fp16 += r.valu_fp16;
+        total.valu_fp32 += r.valu_fp32;
+        total.valu_fp64 += r.valu_fp64;
+        total.valu_int32 += r.valu_int32;
+        total.mfma_fp16 += r.mfma_fp16;
+        total.mfma_fp32 += r.mfma_fp32;
+        total.smfmac_fp16 += r.smfmac_fp16;
+    }
+    std::cout << std::endl << "System total" << std::endl;
+    print_result(total, mask);
+}
+
+
+// Prints the command-line options accepted by main().
+void usage()
+{
+    std::cout << "--help                Print this message" << std::endl;
+    std::cout << "--device  ID          Use device with the given numerical ID" << std::endl;
+    std::cout << "--devices IDS | ALL   Comma-separated list of device Ids (e.g., 1,2,3)" << std::endl;
+    std::cout << "                      ALL for all devices" << std::endl;
+    std::cout << "--runs    RUNS        Number of times each kernel is dispatched" << std::endl;
+
+    std::cout << "--fp16                Run FP16 (VALU) test" << std::endl;
+    std::cout << "--fp32                Run FP32 (VALU) test" << std::endl;
+    std::cout << "--fp64                Run FP64 (VALU) test" << std::endl;
+    // --int32 is accepted by main() but was missing from the help text.
+    std::cout << "--int32               Run INT32 (VALU) test" << std::endl;
+    std::cout << "--matfp16             Run FP16 (MFMA) test" << std::endl;
+    std::cout << "--matfp32             Run FP32 (MFMA) test" << std::endl;
+    std::cout << "--smatfp16            Run FP16 (SMFMAC) test" << std::endl;
+}
+
+// Entry point with two modes.
+//
+// Child mode (argv[0] == "CHILD", set by fork_process()): runs the tests on
+// one device and writes the Result to the inherited pipe descriptor.
+// Arguments are: device, runs, mask, fd.
+//
+// Parent mode: parses the command line (see usage()), then starts one child
+// per selected device via run(). Device 0 and all tests are used when none
+// are selected.
+//
+// Returns 0 on success and 1 on invalid arguments or a failed result write.
+int main(int argc, char** argv)
+{
+    if(argc > 0 && std::string(argv[0]) == "CHILD") {
+        if(argc < 5) {
+            std::cerr << "CHILD mode expects 4 arguments" << std::endl;
+            return 1;
+        }
+
+        int device = atoi(argv[1]);
+        int runs = atoi(argv[2]);
+        // The mask is sent as an unsigned decimal and may be 4294967295
+        // (ALL), which does not fit in the int that atoi() returns.
+        uint32_t mask = (uint32_t)strtoul(argv[3], nullptr, 10);
+        int fd = atoi(argv[4]);
+
+        Result res = run_tests(device, runs, mask);
+
+        if(write(fd, &res, sizeof(res)) != (ssize_t)sizeof(res)) {
+            std::cerr << "Failed to write result: " << std::strerror(errno) << std::endl;
+            return 1;
+        }
+        return 0;
+    }
+
+    int runs = 1;
+
+    uint32_t mask = 0;
+    bool all_devices = false;
+    std::vector<int> devices;
+    int device_count;
+
+    int i = 1;
+
+    // Returns the value following the option at argv[i] and advances past
+    // it, or nullptr (after printing an error) when the option is the last
+    // argument.
+    auto option_value = [&](const std::string& option) -> const char* {
+        if(i + 1 >= argc) {
+            std::cout << "Missing value for '" << option << "'" << std::endl;
+            std::cout << std::endl;
+            usage();
+            return nullptr;
+        }
+        return argv[++i];
+    };
+
+    while(i < argc) {
+        std::string arg = std::string(argv[i]);
+
+        if(arg == "--help") {
+            usage();
+            return 0;
+        } else if(arg == "--device") {
+            const char* value = option_value(arg);
+            if(value == nullptr) {
+                return 1;
+            }
+            devices.push_back(atoi(value));
+        } else if(arg == "--devices") {
+            const char* value = option_value(arg);
+            if(value == nullptr) {
+                return 1;
+            }
+
+            // Parse comma-separated string of numbers
+            std::string s(value);
+
+            if(s == "all" || s == "ALL") {
+                all_devices = true;
+            } else {
+                std::stringstream ss(s);
+                std::string r;
+                while(getline(ss, r, ',')) {
+                    try {
+                        devices.push_back(std::stoi(r));
+                    } catch(const std::exception&) {
+                        std::cout << "Invalid device ordinal: " << r << std::endl;
+                        return 1;
+                    }
+                }
+            }
+        } else if(arg == "--runs") {
+            const char* value = option_value(arg);
+            if(value == nullptr) {
+                return 1;
+            }
+            runs = atoi(value);
+        } else if(arg == "--fp32") {
+            mask |= VALU_FP32;
+        } else if(arg == "--fp64") {
+            mask |= VALU_FP64;
+        } else if(arg == "--fp16") {
+            mask |= VALU_FP16;
+        } else if(arg == "--int32") {
+            mask |= VALU_INT32;
+        } else if(arg == "--matfp16") {
+            mask |= MATRIX_FP16;
+        } else if(arg == "--matfp32") {
+            mask |= MATRIX_FP32;
+        } else if(arg == "--smatfp16") {
+            mask |= SMATRIX_FP16;
+        } else {
+            std::cout << "Invalid argument '" << arg << "'" << std::endl;
+            std::cout << std::endl;
+            usage();
+            return 1;
+        }
+
+        i++;
+    }
+
+    // Queried after argument parsing so that --help and argument errors do
+    // not require a working HIP runtime.
+    HIP_CALL(hipGetDeviceCount(&device_count));
+
+    if(all_devices) {
+        for(int d = 0; d < device_count; d++) {
+            devices.push_back(d);
+        }
+    }
+
+    // Verify device ID's
+    for(auto d : devices) {
+        if(d < 0 || d >= device_count) {
+            std::cout << "Invalid device ordinal: " << d << std::endl;
+            return 1;
+        }
+    }
+
+    if(devices.size() == 0) {
+        devices.push_back(0);
+    }
+
+    if(mask == 0) {
+        mask = ALL;
+    }
+
+    run(devices, runs, mask);
+
+    return 0;
+}
+
+
@@ -77,11 +77,11 @@ for the agent and returns a pointer to it.
 #include <iostream>
 #include <memory>
 #include <mutex>
-#include <random>
 #include <set>
 #include <shared_mutex>
 #include <sstream>
 #include <string>
+#include <unistd.h>
 #include <unordered_map>
 #include <vector>

@@ -148,7 +148,7 @@ struct counter_info_record_t {
 // Tool data struct, now includes a vector of counter_info_record_t
 struct tool_data_t {
  std::mutex mut{};
-  std::unique_ptr<std::ostream> output_stream{nullptr};
+  std::string output_filename{};
  std::unordered_map<uint64_t, std::string> counter_id_name_map{};
  std::string requested_counters{};
  std::string kernel_filter_include_regex{};
@@ -614,14 +614,28 @@ void generate_output(tool_data_t *tool_data) {
                       }),
        tool_data->counter_records.end());
  }
-
+  if (tool_data->counter_records.empty()) {
+    return;
+  }
  // Write collected counter records and clean up
-  if (auto &os = tool_data->output_stream) {
+  if (!tool_data->output_filename.empty()) {
+    std::ofstream ofs(tool_data->output_filename);
+    if (!ofs.is_open()) {
+      std::cerr << "Failed to open output file: " << tool_data->output_filename
+                << std::endl;
+      return;
+    }
+    // Write header at the beginning of the file
+    ofs << "dispatch_id,gpu_id,kernel_id,lds_per_workgroup,"
+           "counter_id,counter_name,counter_value\n";
    for (const auto &r : tool_data->counter_records)
-      *os << r.dispatch_id << ',' << r.agent_id << "," << r.kernel_id << ','
+      ofs << r.dispatch_id << ',' << r.agent_id << "," << r.kernel_id << ','
          << r.LDS_memory_size << ',' << r.counter_id << ',' << r.counter_name
          << ',' << r.counter_value << '\n';
-    os->flush();
+    ofs.flush();
+    std::clog << "[rocprofiler-compute] [" << __FUNCTION__
+              << "] Counter collection data has been written to: "
+              << tool_data->output_filename << std::endl;
  }
 }

@@ -638,18 +652,13 @@ void tool_fini(void *user_data) {

 } // namespace

-std::unique_ptr<tool_data_t> create_tool_data(rocprofiler_client_id_t *id) {
+std::unique_ptr<tool_data_t>
+create_tool_data(rocprofiler_client_id_t * /*id*/) {
  auto tool_data = std::make_unique<tool_data_t>();

-  // Generate a unique output filename using a random hex string (no libuuid
-  // dependency)
-  std::random_device rd;
-  std::mt19937 gen(rd());
-  std::uniform_int_distribution<uint32_t> dis(0, 0xFFFFFFFF);
-  std::stringstream filename_ss;
-  filename_ss << std::hex << dis(gen);
+  // Generate a unique output filename using the process ID
  std::string base_filename =
-      "counter_collection_" + filename_ss.str().substr(0, 8) + ".csv";
+      std::to_string(getpid()) + "_native_counter_collection.csv";

  // Require ROCPROF_OUTPUT_PATH to be set, otherwise error out
  std::string filename;
@@ -664,20 +673,7 @@ std::unique_ptr<tool_data_t> create_tool_data(rocprofiler_client_id_t *id) {
  // Use the generated base filename along with ROCPROF_OUTPUT_PATH
  filename += base_filename;

-  // Set output stream to file
-  auto ofs = std::make_unique<std::ofstream>(filename);
-  if (!ofs->is_open()) {
-    throw std::runtime_error("Failed to open output file: " + filename);
-  }
-  tool_data->output_stream = std::move(ofs);
-  // Write header at the beginning of the file
-  *tool_data->output_stream << "dispatch_id,gpu_id,kernel_id,lds_per_workgroup,"
-                               "counter_id,counter_name,counter_value\n";
-  tool_data->output_stream->flush();
-
-  // Write to clog the path of the logging file
-  std::clog << id->name << " [" << __FUNCTION__
-            << "] Logging counter collection to: " << filename << std::endl;
+  tool_data->output_filename = filename;

  // Store ROCPROF env. vars. in tool_data

@@ -61,7 +61,8 @@ def simple_bar(df: pd.DataFrame, title: Optional[str] = None) -> Optional[str]:

    if "Metric" in df.columns and "Avg" in df.columns:
        metric_dict = (
-            pd.DataFrame([df["Metric"], df["Avg"]])
+            pd
+            .DataFrame([df["Metric"], df["Avg"]])
            .replace("", 0)
            .replace(float("inf"), -1)  # It should not happen
            .replace(float("-inf"), -1)
@@ -258,7 +259,8 @@ def px_simple_multi_bar(

    for group, metric in nested_bar.items():
        dfigs.append(
-            px.bar(
+            px
+            .bar(
                title=group,
                x=metric.values(),
                y=metric.keys(),
@@ -219,7 +219,8 @@ def get_views() -> list[TextClause]:
        select(
            Kernel.kernel_name,
            (Dispatch.end_timestamp - Dispatch.start_timestamp).label("duration"),
-            func.row_number()
+            func
+            .row_number()
            .over(
                partition_by=Kernel.kernel_name,
                order_by=Dispatch.end_timestamp - Dispatch.start_timestamp,
@@ -132,7 +132,8 @@ class MIGPUSpecs:
                    cls._all_gpu_models.append(curr_gpu_model)
                    cls._gpu_model_dict[curr_gpu_arch].append(curr_gpu_model)
                    cls._num_xcds_dict[curr_gpu_model] = (
-                        models.get("partition_mode", {})
+                        models
+                        .get("partition_mode", {})
                        .get("compute_partition_mode", {})
                        .get("num_xcds", {})
                    )
@@ -580,7 +580,8 @@ def gen_counter_list(formula: str) -> tuple[bool, list[str]]:
        return visited, counters
    try:
        tree = ast.parse(
-            formula.replace("$normUnit", "SQ_WAVES")
+            formula
+            .replace("$normUnit", "SQ_WAVES")
            .replace("$denom", "SQ_WAVES")
            .replace(
                "$numActiveCUs",
@@ -1606,9 +1607,9 @@ def load_pc_sampling_data_per_kernel(
    pc_sample_instructions = search_key_in_json(file_name, "pc_sample_instructions")
    df["instruction"] = (
        df["inst_index"].apply(
-            lambda x: pc_sample_instructions[x]
-            if x < len(pc_sample_instructions)
-            else None
+            lambda x: (
+                pc_sample_instructions[x] if x < len(pc_sample_instructions) else None
+            )
        )
        if pc_sample_instructions
        else None
@@ -1618,9 +1619,11 @@ def load_pc_sampling_data_per_kernel(
    pc_sample_comments = search_key_in_json(file_name, "pc_sample_comments")
    df["source_line"] = (
        df["inst_index"].apply(
-            lambda x: f".../{Path(pc_sample_comments[x]).name}"
-            if x < len(pc_sample_comments)
-            else None
+            lambda x: (
+                f".../{Path(pc_sample_comments[x]).name}"
+                if x < len(pc_sample_comments)
+                else None
+            )
        )
        if pc_sample_comments
        else None
@@ -1719,7 +1722,8 @@ def load_pc_sampling_data(

        # Group by Instruction_Comment and aggregate
        grouped_counts = (
-            merged_df.groupby("Instruction_Comment")
+            merged_df
+            .groupby("Instruction_Comment")
            .agg(
                count=("Instruction_Comment", "count"),
                instruction=("Instruction", "first"),
@@ -38,6 +38,7 @@ COUNTERS_COLLECTION_QUERY = """
 SELECT
    agent_id as GPU_ID,
    dispatch_id as Dispatch_ID,
+    pid as PID,
    grid_size as Grid_Size,
    workgroup_size as Workgroup_Size,
    lds_block_size as LDS_Per_Workgroup,
@@ -61,24 +62,28 @@ TABLE_NAME_PREFIX_QUERY = (
 INSERT_QUERY = "INSERT INTO {table_name} ({columns}) VALUES ({placeholders})"


-def convert_db_to_csv(
-    db_path: str,
+def convert_dbs_to_csv(
+    db_paths: list[str],
    csv_file_path: str,
 ) -> None:
    """
-    Read rocpd database and write to CSV file
+    Read rocpd databases and write to CSV file
    """
-    # Read counters_collection view from the database and write to CSV
+    # Read counters_collection view from the databases and write to CSV
    try:
-        with closing(sqlite3.connect(db_path)) as conn:
-            with closing(conn.execute(COUNTERS_COLLECTION_QUERY)) as cursor:
-                with open(csv_file_path, "w", newline="") as csvfile:
-                    writer = csv.writer(csvfile)
-                    writer.writerow([
-                        description[0] for description in cursor.description
-                    ])
-                    for row in cursor:
-                        writer.writerow(row)
+        with open(csv_file_path, "w", newline="") as csvfile:
+            writer = csv.writer(csvfile)
+            header_written = False
+            for db_path in db_paths:
+                with closing(sqlite3.connect(db_path)) as conn:
+                    with closing(conn.execute(COUNTERS_COLLECTION_QUERY)) as cursor:
+                        if not header_written:
+                            writer.writerow([
+                                description[0] for description in cursor.description
+                            ])
+                            header_written = True
+                        for row in cursor:
+                            writer.writerow(row)
    except OSError as e:
        console_error(f"Database error while converting to CSV: {e}")
    except Exception as e:
@@ -426,7 +426,8 @@ def format_table_output(
        and "Value" in df.columns
    ):
        mem_data = (
-            pd.DataFrame([df["Metric"], df["Value"]])
+            pd
+            .DataFrame([df["Metric"], df["Value"]])
            .transpose()
            .set_index("Metric")
            .to_dict()["Value"]
@@ -885,24 +885,48 @@ def run_prof(
            rocprof_cmd == "rocprofiler-sdk"
            and options["ROCPROF_COUNTER_COLLECTION"] == "0"
        ):
-            # Update rocpd database with counter csv created by native tool
-            rocpd_data.update_rocpd_pmc_events(
-                pd.read_csv(glob.glob(workload_dir + "/out/pmc_1/*.csv")[0]),
-                glob.glob(workload_dir + "/out/pmc_1/*/*.db")[0],
-            )
+            for db_name in glob.glob(workload_dir + "/out/pmc_1/*/*.db"):
+                pid = Path(db_name).stem.split("_")[0]
+                rocpd_data.update_rocpd_pmc_events(
+                    pd.read_csv(
+                        f"{workload_dir}/out/pmc_1/{pid}_native_counter_collection.csv"
+                    ),
+                    db_name,
+                )
+                console_debug(f"Updated rocpd db {db_name} with native tool counters.")
        # Write results_fbase.csv
-        rocpd_data.convert_db_to_csv(
-            glob.glob(workload_dir + "/out/pmc_1/*/*.db")[0],
+        rocpd_data.convert_dbs_to_csv(
+            glob.glob(workload_dir + "/out/pmc_1/*/*.db"),
            workload_dir + f"/results_{fbase}.csv",
        )
+        combined_df = pd.read_csv(workload_dir + f"/results_{fbase}.csv")
+        # Reset Dispatch_ID based on PID, Kernel_Name, Grid_Size,
+        # Workgroup_Size, LDS_Per_Workgroup
+        combined_df["Dispatch_ID"] = combined_df.groupby(
+            ["PID", "Kernel_Name", "Grid_Size", "Workgroup_Size", "LDS_Per_Workgroup"],
+            sort=False,
+        ).ngroup()
+        # Reset Kernel_ID based on Kernel_Name, Grid_Size,
+        # Workgroup_Size, LDS_Per_Workgroup
+        combined_df["Kernel_ID"] = combined_df.groupby(
+            ["Kernel_Name", "Grid_Size", "Workgroup_Size", "LDS_Per_Workgroup"],
+            sort=False,
+        ).ngroup()
+        # Drop PID since it's not required
+        combined_df = combined_df.drop(columns=["PID"])
+        combined_df.to_csv(workload_dir + f"/results_{fbase}.csv", index=False)
+
        if retain_rocpd_output:
-            shutil.copyfile(
-                glob.glob(workload_dir + "/out/pmc_1/*/*.db")[0],
-                workload_dir + "/" + fbase + ".db",
-            )
-            console_warning(
-                f"Retaining large raw rocpd database: {workload_dir}/{fbase}.db"
-            )
+            for db_path in glob.glob(workload_dir + "/out/pmc_1/*/*.db"):
+                pid = Path(db_path).stem.split("_")[0]
+                shutil.copyfile(
+                    db_path,
+                    workload_dir + f"/{fbase}_{pid}.db",
+                )
+                console_warning(
+                    f"Retaining large raw rocpd database: "
+                    f"{workload_dir}/{fbase}_{pid}.db"
+                )
        # Remove temp directory
        shutil.rmtree(workload_dir + "/" + "out")
        return
@@ -1064,81 +1088,66 @@ def convert_native_counter_collection_csv(workload_dir: str) -> None:
    trace to write counter collection csv in rocprofiler-sdk format
    for further processing to pmc_perf.csv file
    """
-    counter_data = pd.read_csv(
-        glob.glob(f"{workload_dir}/out/pmc_1/*.csv")[0], index_col=False
-    )
-    # Group by on counter_data based on dispatch_id and
-    # counter_id and sum the counter_value
-    counter_data = counter_data.groupby(
-        ["dispatch_id", "counter_name"], as_index=False
-    ).agg({"counter_value": "sum"})
-    kernel_data_filename = glob.glob(f"{workload_dir}/out/pmc_1/*/*_kernel_trace.csv")[
-        0
-    ]
-    kernel_data = pd.read_csv(kernel_data_filename)
-    rocprofv3_counter_data = pd.DataFrame({
-        "Correlation_Id": counter_data["dispatch_id"],
-        "Dispatch_Id": counter_data["dispatch_id"],
-        "Agent_Id": kernel_data.iloc[counter_data["dispatch_id"] - 1][
-            "Agent_Id"
-        ].values,
-        "Queue_Id": kernel_data.iloc[counter_data["dispatch_id"] - 1][
-            "Queue_Id"
-        ].values,
-        "Process_Id": kernel_data.iloc[counter_data["dispatch_id"] - 1][
-            "Thread_Id"
-        ].values,
-        "Thread_Id": kernel_data.iloc[counter_data["dispatch_id"] - 1][
-            "Thread_Id"
-        ].values,
-        "Grid_Size": (
-            kernel_data.iloc[counter_data["dispatch_id"] - 1][
-                ["Grid_Size_X", "Grid_Size_Y", "Grid_Size_Z"]
-            ]
-            .prod(axis=1)
-            .values
-        ),
-        "Kernel_Id": kernel_data.iloc[counter_data["dispatch_id"] - 1][
-            "Kernel_Id"
-        ].values,
-        "Kernel_Name": kernel_data.iloc[counter_data["dispatch_id"] - 1][
-            "Kernel_Name"
-        ].values,
-        "Workgroup_Size": (
-            kernel_data.iloc[counter_data["dispatch_id"] - 1][
-                ["Workgroup_Size_X", "Workgroup_Size_Y", "Workgroup_Size_Z"]
-            ]
-            .prod(axis=1)
-            .values
-        ),
-        "LDS_Block_Size": kernel_data.iloc[counter_data["dispatch_id"] - 1][
-            "LDS_Block_Size"
-        ].values,
-        "Scratch_Size": kernel_data.iloc[counter_data["dispatch_id"] - 1][
-            "Scratch_Size"
-        ].values,
-        "VGPR_Count": kernel_data.iloc[counter_data["dispatch_id"] - 1][
-            "VGPR_Count"
-        ].values,
-        "Accum_VGPR_Count": kernel_data.iloc[counter_data["dispatch_id"] - 1][
-            "Accum_VGPR_Count"
-        ].values,
-        "SGPR_Count": kernel_data.iloc[counter_data["dispatch_id"] - 1][
-            "SGPR_Count"
-        ].values,
-        "Counter_Name": counter_data["counter_name"],
-        "Counter_Value": counter_data["counter_value"],
-        "Start_Timestamp": kernel_data.iloc[counter_data["dispatch_id"] - 1][
-            "Start_Timestamp"
-        ].values,
-        "End_Timestamp": kernel_data.iloc[counter_data["dispatch_id"] - 1][
-            "End_Timestamp"
-        ].values,
-    })
-    rocprofv3_counter_data.to_csv(
-        kernel_data_filename.replace("kernel_trace", "counter_collection"),
-        index=False,
-    )
+    for native_filename in glob.glob(
+        f"{workload_dir}/out/pmc_1/*_native_counter_collection.csv"
+    ):
+        counter_data = pd.read_csv(native_filename, index_col=False)
+        # Group by dispatch_id and counter_name and sum the counter_value.
+        # Other columns are assumed constant within a group, so take the first one
+        groupby_cols = ["dispatch_id", "counter_name"]
+        agg_dict = {
+            col: "first" for col in counter_data.columns if col not in groupby_cols
+        }
+        # Overwrite counter_value aggregation to sum
+        agg_dict["counter_value"] = "sum"
+        counter_data = counter_data.groupby(groupby_cols, as_index=False).agg(agg_dict)
+
+        pid = Path(native_filename).stem.split("_")[0]
+        kernel_data_filename = glob.glob(
+            f"{workload_dir}/out/pmc_1/*/{pid}_kernel_trace.csv"
+        )[0]
+        kernel_data = pd.read_csv(kernel_data_filename)
+
+        # Merge counter_data with kernel_data on kernel_id
+        merged_data = pd.merge(
+            counter_data,
+            kernel_data,
+            left_on="kernel_id",
+            right_on="Kernel_Id",
+            how="left",
+        )
+
+        rocprofv3_counter_data = pd.DataFrame({
+            "Correlation_Id": merged_data["dispatch_id"],
+            "Dispatch_Id": merged_data["dispatch_id"],
+            "Agent_Id": merged_data["Agent_Id"],
+            "Queue_Id": merged_data["Queue_Id"],
+            "Process_Id": merged_data["Thread_Id"],
+            "Thread_Id": merged_data["Thread_Id"],
+            "Grid_Size": (
+                merged_data[["Grid_Size_X", "Grid_Size_Y", "Grid_Size_Z"]].prod(axis=1)
+            ),
+            "Kernel_Id": merged_data["Kernel_Id"],
+            "Kernel_Name": merged_data["Kernel_Name"],
+            "Workgroup_Size": (
+                merged_data[
+                    ["Workgroup_Size_X", "Workgroup_Size_Y", "Workgroup_Size_Z"]
+                ].prod(axis=1)
+            ),
+            "LDS_Block_Size": merged_data["LDS_Block_Size"],
+            "Scratch_Size": merged_data["Scratch_Size"],
+            "VGPR_Count": merged_data["VGPR_Count"],
+            "Accum_VGPR_Count": merged_data["Accum_VGPR_Count"],
+            "SGPR_Count": merged_data["SGPR_Count"],
+            "Counter_Name": merged_data["counter_name"],
+            "Counter_Value": merged_data["counter_value"],
+            "Start_Timestamp": merged_data["Start_Timestamp"],
+            "End_Timestamp": merged_data["End_Timestamp"],
+        })
+        rocprofv3_counter_data.to_csv(
+            kernel_data_filename.replace("kernel_trace", "counter_collection"),
+            index=False,
+        )


 def process_rocprofv3_output(workload_dir: str, using_native_tool: bool) -> list[str]:
@@ -67,3 +67,11 @@ set_target_properties(
    laplace_eqn
    PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests
 )
+
+set(ROCFLOP_SOURCES ../sample/rocflop.cpp)
+set_source_files_properties(${ROCFLOP_SOURCES} PROPERTIES LANGUAGE HIP)
+add_executable(rocflop ${ROCFLOP_SOURCES})
+set_target_properties(
+    rocflop
+    PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests
+)
@@ -68,6 +68,7 @@ config["app_mat_mul_max"] = ["./tests/mat_mul_max"]
 config["app_hip_dynamic_shared"] = ["./tests/hip_dynamic_shared"]
 config["app_laplace_eqn"] = ["./tests/laplace_eqn", "-i", "5000"]
 config["app_laplace_eqn_iter"] = ["./tests/laplace_eqn", "-i", "15000"]
+config["rocflop"] = ["./tests/rocflop", "--device", "0"]
 config["cleanup"] = True
 config["COUNTER_LOGGING"] = False
 config["METRIC_COMPARE"] = False
@@ -637,6 +638,29 @@ def test_path(binary_handler_profile_rocprof_compute):
    test_utils.clean_output_dir(config["cleanup"], workload_dir)


+@pytest.mark.path
+def test_path_rocflop(
+    binary_handler_profile_rocprof_compute,
+):
+    # Test whether multiprocess workloads like rocflop are handled correctly
+    workload_dir = test_utils.get_output_dir()
+    options = ["--block", "2.1.1"]
+    _ = binary_handler_profile_rocprof_compute(
+        config,
+        workload_dir,
+        options,
+        check_success=True,
+        roof=False,
+        app_name="rocflop",
+    )
+    pmc_perf_df = test_utils.check_csv_files(workload_dir, num_devices, num_kernels)[
+        "pmc_perf.csv"
+    ]
+    # Ensure the pmc_perf.csv dataframe is not empty
+    assert len(pmc_perf_df) > 0
+    test_utils.clean_output_dir(config["cleanup"], workload_dir)
+
+
 @pytest.mark.path
 def test_path_no_native(binary_handler_profile_rocprof_compute):
    workload_dir = test_utils.get_output_dir()