Files
Sourav Chakraborty 046af13751 Fix build failure in rccl_prim_test (#1984)
Added missing header in rccl_prim_test

[ROCm/rccl commit: 5b345d105c]
2025-10-21 12:51:14 -05:00

753 строки
28 KiB
C++

/*
Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
* @file rccl_prim_test.cpp
*
* test performance if individual rccl primitives
*/
#include <cstdio> //fprintf
#include <iostream> //cerr
#include <unistd.h> //usleep
#include <cstring>
#include <chrono>
#include <hip/hip_runtime_api.h>
#include <hip/hip_runtime.h>
#include "copy_kernel.h"
#define MAX_GPU 16
#define MAX_WORKGROUPS 56
#define THREADS 256
#define NGPUS 2
#define COPY_UNROLL 4
#define REDUCE_UNROLL 2
#define DOUBLECOPY_UNROLL 2
#define DOUBLECOPYLOCAL_UNROLL 2
#define REDUCECOPY_UNROLL 2
#define PRINT_GPU0_ONLY 1
#define RST "\x1B[0m"
#define KBLU "\x1B[34m"
#define FBLU(x) KBLU x RST
#define BOLD(x) "\x1B[1m" x RST
struct transfer_data_t {
float *dest0[MAX_WORKGROUPS]; //remote fine grain
float *src0[MAX_WORKGROUPS]; //local fine grain
float *dest1[MAX_WORKGROUPS]; //local coarse grain
float *dest2[MAX_WORKGROUPS]; //local fine grain
float *src1[MAX_WORKGROUPS]; //local coarse grain
int N;
int gpu;
int ngpu;
uint64_t *remOpCount;
};
struct profiling_data_t {
uint64_t write_cycles[MAX_WORKGROUPS];
uint64_t bytes_transferred[MAX_WORKGROUPS];
};
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
void print_table_header(void) {
fprintf(stderr, "%120s","=================================================================================================================================\n");
fprintf(stderr, "%-20s %-13s %-13s %-13s %-13s %-20s %-20s %-10s\n","[Originating GPU]", "[Directions]", "[WorkGroup]", "[linktype]", "[time(ms)]" , "[bytes_transferred]", "[throughput(GB/s)]", "[StdDev]");
fprintf(stderr, "%120s","=================================================================================================================================\n");
}
void print_table_summary_line(void) {
fprintf(stderr, "%120s","---------------------------------------------------------------------------------------------------------------------------------\n");
}
enum Ops {
OP_COPY,
OP_LOCALCOPY,
OP_DOUBLECOPY,
OP_DOUBLECOPYLOCAL,
OP_REDUCE,
OP_REDUCECOPY,
OP_READ,
NUM_OPS,
};
template<int op, int sync>
__global__ void flag_sync_kernel(struct transfer_data_t* transfer_data, struct profiling_data_t* profiling_data, uint64_t opCount) {
size_t tid = threadIdx.x;
uint64_t curr_time;
int bid = blockIdx.x;
int n = transfer_data->N;
const float *srcs[NGPUS];
float *dsts[NGPUS];
// signal self ready and wait until all GPUs are ready
if (tid == 0) {
__atomic_fetch_add(&transfer_data->remOpCount[transfer_data->gpu], 1, __ATOMIC_SEQ_CST);
if (sync) {
for (int i = 0; i < transfer_data->ngpu; i++) {
while (LOAD(&transfer_data->remOpCount[i]) < opCount) {};
}
}
}
__syncthreads();
if (tid == 0)
curr_time = wall_clock64();
if (op == OP_COPY) {
srcs[0] = transfer_data->src0[bid];
dsts[0] = transfer_data->dest0[bid];
ReduceOrCopyMulti<COPY_UNROLL, FuncPassA<float>, float, 1, 1, 1, 1>(threadIdx.x, THREADS,
1, srcs, 1, dsts, n);
}
if (op == OP_LOCALCOPY) {
srcs[0] = transfer_data->src0[bid];
dsts[0] = transfer_data->dest1[bid];
ReduceOrCopyMulti<COPY_UNROLL, FuncPassA<float>, float, 1, 1, 1, 1>(threadIdx.x, THREADS,
1, srcs, 1, dsts, n);
}
if (op == OP_DOUBLECOPY) {
srcs[0] = transfer_data->src0[bid];
dsts[0] = transfer_data->dest0[bid];
dsts[1] = transfer_data->dest1[bid];
ReduceOrCopyMulti<DOUBLECOPY_UNROLL, FuncPassA<float>, float, 1, 1, 1, 2>(threadIdx.x, THREADS,
1, srcs, 2, dsts, n);
}
if (op == OP_DOUBLECOPYLOCAL) {
srcs[0] = transfer_data->src0[bid];
dsts[0] = transfer_data->dest1[bid];
dsts[1] = transfer_data->dest2[bid];
ReduceOrCopyMulti<DOUBLECOPYLOCAL_UNROLL, FuncPassA<float>, float, 1, 1, 1, 2>(threadIdx.x, THREADS,
1, srcs, 2, dsts, n);
}
if (op == OP_REDUCE) {
srcs[0] = transfer_data->src0[bid];
srcs[1] = transfer_data->src1[bid];
dsts[0] = transfer_data->dest0[bid];
ReduceOrCopyMulti<REDUCE_UNROLL, FuncSum<float>, float, 1, 2, 1, 1>(threadIdx.x, THREADS,
2, srcs, 1, dsts, n);
}
if (op == OP_REDUCECOPY) {
srcs[0] = transfer_data->src0[bid];
srcs[1] = transfer_data->src1[bid];
dsts[0] = transfer_data->dest0[bid];
dsts[1] = transfer_data->dest1[bid];
ReduceOrCopyMulti<REDUCECOPY_UNROLL, FuncSum<float>, float, 1, 2, 1, 2>(threadIdx.x, THREADS,
2, srcs, 2, dsts, n);
}
if (op == OP_READ) {
// Swapped the dest0 and src0 in passed parameter of copy kernel so that it can utilized for as a read kernel.
// fetch op will happen on transfer_data->dest0[bid] and store op will happen on transfer_data->src0[bid]
srcs[0] = transfer_data->dest0[bid];
dsts[0] = transfer_data->src0[bid];
ReduceOrCopyMulti<COPY_UNROLL, FuncPassA<float>, float, 1, 1, 1, 1>(threadIdx.x, THREADS,
1, srcs, 1, dsts, n);
}
__syncthreads();
if (tid == 0) {
__atomic_fetch_add(&(profiling_data->write_cycles[bid]), __builtin_amdgcn_s_memrealtime() - curr_time, __ATOMIC_SEQ_CST);
__atomic_fetch_add(&(profiling_data->bytes_transferred[bid]), n * sizeof(float), __ATOMIC_SEQ_CST);
}
}
typedef void(*flag_sync_kernel_t)(struct transfer_data_t* transfer_data, struct profiling_data_t* profiling_data, uint64_t opCount);
static flag_sync_kernel_t const flagSyncKerns[NUM_OPS*2] = {
flag_sync_kernel<OP_COPY, 0>,
flag_sync_kernel<OP_COPY, 1>,
flag_sync_kernel<OP_LOCALCOPY, 0>,
flag_sync_kernel<OP_LOCALCOPY, 1>,
flag_sync_kernel<OP_DOUBLECOPY, 0>,
flag_sync_kernel<OP_DOUBLECOPY, 1>,
flag_sync_kernel<OP_DOUBLECOPYLOCAL, 0>,
flag_sync_kernel<OP_DOUBLECOPYLOCAL, 1>,
flag_sync_kernel<OP_REDUCE, 0>,
flag_sync_kernel<OP_REDUCE, 1>,
flag_sync_kernel<OP_REDUCECOPY, 0>,
flag_sync_kernel<OP_REDUCECOPY, 1>,
flag_sync_kernel<OP_READ, 0>,
flag_sync_kernel<OP_READ, 1>,
};
__global__ void initTestDataKernel(float* data, const size_t N, const int gpu) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
while (tid < N) {
data[tid] = 1.0/(float)(gpu*17 + tid%77);
tid += blockDim.x * gridDim.x;
}
}
#define HIPCHECK(cmd) \
do { \
hipError_t error = (cmd); \
if (error != hipSuccess) \
{ \
std::cerr << "Encountered HIP error (" << error << ") at line " \
<< __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
static void setupPeers(uint32_t *info, bool* is_xgmi) {
int deviceCnt, dev;
// is_xgmi indicates all link are one hop XGMI
*is_xgmi = 1;
HIPCHECK(hipGetDeviceCount(&deviceCnt));
HIPCHECK(hipGetDevice(&dev));
//! If gpus are not peer enabled, enable them
for (int i = 0; i < deviceCnt; i++) {
HIPCHECK(hipSetDevice(i));
for (int j = 0; j < deviceCnt; j++) {
if (i != j) {
int p2p;
HIPCHECK(hipDeviceCanAccessPeer(&p2p, i, j));
if (!p2p) {
printf("Cannot enable peer access between device %d and %d. You may use HIP_VISIBLE_DEVICES to limit GPUs.\n",
i, j);
exit(-1);
}
HIPCHECK(hipDeviceEnablePeerAccess(j, 0));
uint32_t linktype;
hipError_t error = hipExtGetLinkTypeAndHopCount(i, j, &linktype, &info[i*deviceCnt+j]);
if (error != hipSuccess)
*is_xgmi = 0;
if (linktype != 4 || info[i*deviceCnt+j] != 1) *is_xgmi = 0;
}
else
info[i*deviceCnt+j] = 0;
}
}
HIPCHECK(hipSetDevice(dev));
}
static void parseChordalRing(char **str) {
static const char *ringBase = "0 6 7 4 5 3 2 1|0 5 6 3 7 1 4 2|0 4 6 2 7 5 1 3|0 1 2 3 5 4 7 6|0 2 4 1 7 3 6 5|0 3 1 5 7 2 6 4";
static char ringRemap[256];
int id[8], dist[8];
int i;
int ngpus;
HIPCHECK(hipGetDeviceCount(&ngpus));
// single node CR8G only
if (ngpus != 8)
return;
// validate chordal ring and calculate distance
for (i=0; i<ngpus; i++) {
int sum = ngpus*(ngpus-1)/2 - i;
int count = 0;
for (int n = 0; n<ngpus; n++) {
uint32_t linktype, hop;
hipError_t error = hipExtGetLinkTypeAndHopCount(i, n, &linktype, &hop);
if (error != hipSuccess)
return;
if (linktype != 4 || hop != 1) continue;
sum -= n;
count ++;
}
if(count != ngpus-2 || sum < 0 || sum > ngpus-1) {
return;
}
dist[i] = sum;
}
// remap GPU ids
for (i = 0; i<ngpus; i++) id[i] = i;
for (i = 0; i<ngpus; i++) {
if (dist[i] == ngpus-1-i) continue;
int j, m, n, temp;
for (j=i+1; j < ngpus; j++)
if(dist[j] == ngpus-1-i) break;
m = dist[i]; n = dist[j]; dist[i] = n; dist[j] = m;
temp = id[m]; id[m] = id[n]; id[n] = temp; temp =dist[m];
dist[m] = dist[n]; dist[n] = temp;
}
// create chordal ring based on reference and remapped ids
for (i = 0; i <strlen(ringBase); i++) {
if (ringBase[i] >= '0' && ringBase[i] <= '9')
ringRemap[i] = id[ringBase[i]-'0']+'0';
else
ringRemap[i] = ringBase[i];
}
ringRemap[i] = 0;
*str = ringRemap;
return;
}
static void printRing(int id, int *ring, int deviceCnt) {
printf("Ring %d: ", id);
for (int i = 0; i < deviceCnt; i++)
printf("%1d ", ring[i]);
printf("\n");
}
static void findConnect(uint32_t *info, int *ring, int deviceCnt) {
int n = 0, curr = 0, best;
uint32_t temp[MAX_GPU*MAX_GPU];
for (int i = 0; i < deviceCnt*deviceCnt; i++) temp[i] = 0;
for (int i = 0; i < deviceCnt; i++) {
for (int j = 0; j < deviceCnt; j++) temp[j*deviceCnt+curr] = 1;
ring[n] = curr;
n++;
int hops = 99;
for (int j = 0; j < deviceCnt; j++) {
if (temp[curr*deviceCnt+j]) continue;
if (info[curr*deviceCnt+j] < hops) {
best = j;
hops = info[curr*deviceCnt+j];
}
}
curr = best;
}
}
static int findNextGpu(int *ring, int gpu, int deviceCnt) {
int i;
for (i = 0; i < deviceCnt; i ++)
if (ring[i] == gpu) break;
return ring[(i+1)%deviceCnt];
}
static void setupRings(uint32_t *info, int *ring_0, int *ring_1) {
int deviceCnt, dev;
HIPCHECK(hipGetDeviceCount(&deviceCnt));
printf("Connection matrix:\n");
for (int i = 0; i < deviceCnt; i++) {
for (int j = 0; j < deviceCnt; j++)
printf("%2d ", info[i*deviceCnt+j]);
printf("\n");
}
findConnect(info, ring_0, deviceCnt);
ring_1[0] =0;
for (int i = 1; i < deviceCnt; i++)
ring_1[i] = ring_0[deviceCnt-i];
}
char* getCmdOption(char ** begin, char ** end, const std::string & option) {
char ** itr = std::find(begin, end, option);
if (itr != end && ++itr != end)
{
return *itr;
}
return 0;
}
bool cmdOptionExists(char** begin, char** end, const std::string& option) {
return std::find(begin, end, option) != end;
}
static const char* link_type_name[] = {"HT", "QPI", "PCIE", "IB", "XGMI"};
int main(int argc,char* argv[])
{
if (cmdOptionExists(argv, argv + argc, "-h")) {
printf("./rccl_prim_test -w num_workgroups -p copy|localcopy|doublecopy|doublecopylocal|reduce|reducecopy|all -i iterations -n bytes -r \"0 1 2 3|3 2 1 0\"\n");
exit(0);
}
int workgroups = 0;
char *wg = getCmdOption(argv, argv + argc, "-w");
if (wg)
workgroups = atol(wg);
printf("Benchmarking using %d workgroups\n", workgroups);
int iters = 1000;
char *it = getCmdOption(argv, argv + argc, "-i");
if (it)
iters = atol(it);
printf("Benchmarking using %d iterations\n", iters);
uint64_t nBytes = 2097152;
char *nb = getCmdOption(argv, argv + argc, "-n");
if (nb)
nBytes = atol(nb);
printf("Benchmarking using %ld bytes\n", nBytes);
uint64_t N = nBytes/sizeof(float);
int sync = 0;
char *s = getCmdOption(argv, argv + argc, "-s");
if (s)
sync = atol(s);
if (sync) printf("Sync all GPUs before operation\n");
char *r = getCmdOption(argv, argv + argc, "-r");
if (r) printf("User specified ring topology: %s\n", r);
const char *ops[] = {"copy", "localcopy", "doublecopy", "doublecopylocal", "reduce", "reducecopy", "read", "all"};
char *prim = getCmdOption(argv, argv + argc, "-p");
int op = NUM_OPS, begin_op, end_op;
if (prim) {
for (op = 0; op < sizeof(ops); op++)
if (!strcmp((const char *)prim, ops[op]))
break;
}
if (op < NUM_OPS ) {
begin_op = op;
end_op = op + 1;
} else {
begin_op = 0;
end_op = NUM_OPS;
printf("Benchmarking all ops\n");
}
int nGpu = 1;
HIPCHECK(hipGetDeviceCount(&nGpu));
uint32_t connection_info[MAX_GPU*MAX_GPU];
// Enable peer access
bool is_xgmi;
char *cr8g = 0;
static const char *ring_4p3l = "0 1 2 3|0 1 3 2|0 2 1 3|0 2 3 1|0 3 1 2|0 3 2 1";
static const char *ring_8p1h = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0";
static const char *ring_16p1h = "0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4|0 1 2 3 7 6 13 12 8 9 10 11 15 14 5 4|0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1|4 5 13 12 8 9 11 10 14 15 7 6 2 3 1 0|4 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0|1 5 4 12 13 9 8 10 11 15 14 6 7 3 2 0";
static const char *ring_gfx942_8p = "0 1 2 3 4 5 6 7|0 1 2 3 4 5 7 6|0 2 4 1 3 6 5 7|0 2 4 6 1 7 3 5|0 3 1 5 2 7 4 6|0 3 5 1 6 2 7 4|0 4 1 7 3 6 2 5|7 6 5 4 3 2 1 0|6 7 5 4 3 2 1 0|7 5 6 3 1 4 2 0|5 3 7 1 6 4 2 0|6 4 7 2 5 1 3 0|4 7 2 6 1 5 3 0|5 2 6 3 7 1 4 0";
setupPeers(connection_info, &is_xgmi);
if (!r) {
parseChordalRing(&cr8g);
if (nGpu == 4 && is_xgmi) r = (char *)ring_4p3l;
if (nGpu == 8 && cr8g) r = (char *)cr8g;
if (nGpu == 8 && !cr8g) {
hipDeviceProp_t prop;
HIPCHECK(hipGetDeviceProperties(&prop, 0));
if (strncmp(prop.gcnArchName, "gfx942", 6) == 0) {
r = (char *)ring_gfx942_8p;
if(!workgroups) workgroups = 28;
} else {
r = (char *)ring_8p1h;
if(!workgroups) workgroups = 16;
}
}
if (nGpu == 16) {
r = (char *)ring_16p1h;
if(!workgroups) workgroups = 24;
}
}
if(!workgroups) workgroups = 1;
// clockwise and counter clockwise rings
int ring[MAX_WORKGROUPS][MAX_GPU];
for (int i = 0; i < MAX_WORKGROUPS; i++)
for (int j = 0; j <MAX_GPU; j++)
ring[i][j] = -1;
int num_rings = 0;
if (r) {
int j = 0, n = 0;
int state = 0;
do {
int digit = r[n] - '0';
if (digit >= 0 && digit <= 9) {
if (state)
ring[num_rings][j] = ring[num_rings][j]*10 + digit;
else {
ring[num_rings][j] = digit;
state = 1;
}
}
else {
state = 0;
j++;
if (r[n] == ' ') continue;
if (r[n] == '|') {
num_rings ++;
j = 0;
continue;
}
}
} while (r[n++] != 0x0);
num_rings ++;
} else {
setupRings(connection_info, ring[0], ring[1]);
num_rings = 2;
}
// duplicate rings
for (int i = num_rings; i < MAX_WORKGROUPS; i++) {
for (int j = 0; j <MAX_GPU; j++)
ring[i][j] = ring[i%num_rings][j];
}
// data buffers
float *buff[MAX_GPU*MAX_WORKGROUPS], *buff_coarse[MAX_GPU*MAX_WORKGROUPS];
float *buff_fine[MAX_GPU*MAX_WORKGROUPS]; // additional fine grain buffer for local double copy
struct transfer_data_t h_transfer_data[MAX_GPU], *transfer_data[MAX_GPU];
struct profiling_data_t *profiling_data[MAX_GPU], *d_profiling_data[MAX_GPU];
hipStream_t stream[MAX_GPU];
uint64_t *remOpCount, *d_remOpCount;
HIPCHECK(hipHostMalloc((void**)&remOpCount, sizeof(uint64_t)*MAX_GPU, hipHostMallocMapped));
HIPCHECK(hipHostGetDevicePointer((void**)&d_remOpCount, (void*)remOpCount, 0));
// print rings
for (int i = 0; i < workgroups; i++) {
printRing(i, ring[i], nGpu);
}
for (int i = 0; i < nGpu; i ++) {
HIPCHECK(hipSetDevice(i));
hipDeviceProp_t prop;
HIPCHECK(hipGetDeviceProperties(&prop, i));
printf("# device %d [0x%02x] %s\n",
i, prop.pciBusID, prop.name);
//create stream
HIPCHECK(hipStreamCreate(&stream[i]));
profiling_data[i] = (struct profiling_data_t *)malloc(sizeof(struct profiling_data_t)*iters);
HIPCHECK(hipMalloc((void**) &d_profiling_data[i], sizeof(struct profiling_data_t)*iters));
HIPCHECK(hipExtMallocWithFlags((void**) &transfer_data[i], sizeof(struct transfer_data_t), strncmp(prop.gcnArchName, "gfx942", 6) == 0 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
for (int j = 0; j < workgroups; j++) {
HIPCHECK(hipExtMallocWithFlags((void**) &buff[i*MAX_WORKGROUPS+j], 2*N*sizeof(float), strncmp(prop.gcnArchName, "gfx942", 6) == 0 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
// additional fine grained buffer for local doublecopy, only need 1 buffer (not used by remote)
HIPCHECK(hipExtMallocWithFlags((void**) &buff_fine[i*MAX_WORKGROUPS+j], N*sizeof(float), strncmp(prop.gcnArchName, "gfx942", 6) == 0 ? hipDeviceMallocUncached : hipDeviceMallocFinegrained));
HIPCHECK(hipMalloc((void**) &buff_coarse[i*MAX_WORKGROUPS+j], 2*N*sizeof(float)));
//randomize test data
hipLaunchKernelGGL(initTestDataKernel,
/*grid dim x,y,z*/ dim3(32, 1, 1),
/*block dim x,y,z*/ dim3(THREADS, 1, 1),
/*dynamic shared mem*/ 0,
/*stream*/ stream[i],
/*kernel args*/ buff[i*MAX_WORKGROUPS+j], 2*N, 0);
hipLaunchKernelGGL(initTestDataKernel,
/*grid dim x,y,z*/ dim3(32, 1, 1),
/*block dim x,y,z*/ dim3(THREADS, 1, 1),
/*dynamic shared mem*/ 0,
/*stream*/ stream[i],
/*kernel args*/ buff_fine[i*MAX_WORKGROUPS+j], N, 0);
hipLaunchKernelGGL(initTestDataKernel,
/*grid dim x,y,z*/ dim3(32, 1, 1),
/*block dim x,y,z*/ dim3(THREADS, 1, 1),
/*dynamic shared mem*/ 0,
/*stream*/ stream[i],
/*kernel args*/ buff_coarse[i*MAX_WORKGROUPS+j], 2*N, 0);
}
}
for (int i = 0; i < nGpu; i ++) {
for (int j = 0; j < workgroups; j++) {
int next_gpu;
next_gpu = findNextGpu(ring[j], i, nGpu);
//printf("GPU %d Ring %d -> Next GPU %d\n", i, j, next_gpu);
h_transfer_data[i].dest0[j] = buff[next_gpu*MAX_WORKGROUPS+j] + N;
h_transfer_data[i].dest1[j] = buff_coarse[i*MAX_WORKGROUPS+j] + N;
h_transfer_data[i].dest2[j] = buff_fine[i*MAX_WORKGROUPS+j]; // additional local fine grain
h_transfer_data[i].src0[j] = buff[i*MAX_WORKGROUPS+j];
h_transfer_data[i].src1[j] = buff_coarse[i*MAX_WORKGROUPS+j];
}
h_transfer_data[i].N = N;
h_transfer_data[i].gpu = i;
h_transfer_data[i].ngpu = nGpu;
h_transfer_data[i].remOpCount = d_remOpCount;
}
for (int i = 0; i < nGpu; i ++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipMemcpyAsync(transfer_data[i], &h_transfer_data[i],
sizeof(struct transfer_data_t), hipMemcpyHostToDevice,
stream[i]));
HIPCHECK(hipStreamSynchronize(stream[i]));
}
void *args[MAX_GPU*3];
hipLaunchParams *launchParamsList= reinterpret_cast<hipLaunchParams *>(
malloc(sizeof(hipLaunchParams)*MAX_GPU));
uint64_t opCount = workgroups;
for (int op = begin_op; op < end_op; op ++) {
const char *OpsName[] = {"Copy", "Local Copy", "Double Copy", "doublecopylocal", "Reduce", "ReduceCopy", "Read"};
printf("\n[Testing %s]: \n", OpsName[op]);
// 20 warm up cycles
for (int j = 0; j < 20; j ++) {
for (int i = 0; i < nGpu; i ++) {
#if 0
args[i*3] = &transfer_data[i];
args[i*3+1] = &d_profiling_data[i];
args[i*3+2] = &opCount;
launchParamsList[i].func =
reinterpret_cast<void *>(flagSyncKerns[op*2 + sync]);
launchParamsList[i].gridDim = dim3(workgroups, 1, 1),
launchParamsList[i].blockDim = dim3(THREADS, 1, 1),
launchParamsList[i].sharedMem = 0;
launchParamsList[i].stream = stream[i];
launchParamsList[i].args = args + i*3;
}
hipExtLaunchMultiKernelMultiDevice(launchParamsList, nGpu,
hipCooperativeLaunchMultiDeviceNoPreSync|hipCooperativeLaunchMultiDeviceNoPostSync);
#else
HIPCHECK(hipSetDevice(i));
//launch the kernel
hipLaunchKernelGGL(flagSyncKerns[op*2 + sync],
/*grid dim x,y,z*/ dim3(workgroups, 1, 1),
/*block dim x,y,z*/ dim3(THREADS, 1, 1),
/*dynamic shared mem*/ 0,
/*stream*/ stream[i],
/*kernel args*/ transfer_data[i], d_profiling_data[i]+j, opCount);
}
#endif
opCount+=workgroups;
}
for (int i = 0; i < nGpu; i ++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipMemsetAsync(d_profiling_data[i], 0, sizeof(struct profiling_data_t)*iters, stream[i]));
HIPCHECK(hipStreamSynchronize(stream[i]));
}
auto start = std::chrono::high_resolution_clock::now();
for (int j = 0; j < iters; j ++) {
for (int i = 0; i < nGpu; i ++) {
#if 0
args[i*3] = &transfer_data[i];
args[i*3+1] = &d_profiling_data[i];
args[i*3+2] = &opCount;
launchParamsList[i].func =
reinterpret_cast<void *>(flagSyncKerns[op*2 + sync]);
launchParamsList[i].gridDim = dim3(workgroups, 1, 1),
launchParamsList[i].blockDim = dim3(THREADS, 1, 1),
launchParamsList[i].sharedMem = 0;
launchParamsList[i].stream = stream[i];
launchParamsList[i].args = args + i*3;
}
hipExtLaunchMultiKernelMultiDevice(launchParamsList, nGpu,
hipCooperativeLaunchMultiDeviceNoPreSync|hipCooperativeLaunchMultiDeviceNoPostSync);
#else
HIPCHECK(hipSetDevice(i));
//launch the kernel
hipLaunchKernelGGL(flagSyncKerns[op*2 + sync],
/*grid dim x,y,z*/ dim3(workgroups, 1, 1),
/*block dim x,y,z*/ dim3(THREADS, 1, 1),
/*dynamic shared mem*/ 0,
/*stream*/ stream[i],
/*kernel args*/ transfer_data[i], d_profiling_data[i]+j, opCount);
}
#endif
opCount+=workgroups;
}
for (int i = 0; i < nGpu; i ++) {
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipStreamSynchronize(stream[i]));
}
auto delta = std::chrono::high_resolution_clock::now() - start;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
std::cout << BOLD(FBLU("[GPU to GPU Transfer Profiling Data]"))<<std::endl;
print_table_header();
for (int i = 0; i < nGpu; i ++) {
HIPCHECK(hipMemcpyAsync(profiling_data[i], d_profiling_data[i],
sizeof(struct profiling_data_t)*iters, hipMemcpyDeviceToHost,
stream[i]));
HIPCHECK(hipStreamSynchronize(stream[i]));
uint64_t max_write_cycle = 0;
uint64_t bytes_transferred = 0;
hipDeviceProp_t prop;
HIPCHECK(hipGetDeviceProperties(&prop, i));
double vega_gpu_rtc_freq, bw_std_dev = 0, mean_write_cycle = 0;
if (strncmp(prop.gcnArchName, "gfx942", 6) == 0)
vega_gpu_rtc_freq = 1.0E8;
else
vega_gpu_rtc_freq = 2.5E7;
for (int j = 0; j < workgroups; j++) {
int next_gpu;
next_gpu = findNextGpu(ring[j], i, nGpu);
uint32_t linktype;
uint32_t hopcount;
HIPCHECK(hipExtGetLinkTypeAndHopCount(i, next_gpu , &linktype, &hopcount));
//find mean/max/stddev of iterations
uint64_t iter_max_write_cycle = 0, iter_bytes_transferred = 0;
double iter_bw_std_dev = 0, iter_total_write_cycle = 0;
for (int k = 0; k < iters; k++) {
iter_max_write_cycle = std::max(iter_max_write_cycle, (profiling_data[i]+k)->write_cycles[j]);
iter_total_write_cycle = iter_total_write_cycle + (profiling_data[i]+k)->write_cycles[j];
iter_bytes_transferred = iter_bytes_transferred + (profiling_data[i]+k)->bytes_transferred[j];
}
bytes_transferred += iter_bytes_transferred;
double t1 = iter_total_write_cycle/vega_gpu_rtc_freq;
max_write_cycle = std::max(max_write_cycle, (uint64_t)iter_total_write_cycle);
mean_write_cycle = mean_write_cycle + iter_total_write_cycle;
for (int k = 0; k < iters; k++) {
double t0 = (double)(profiling_data[i]+k)->write_cycles[j]/vega_gpu_rtc_freq;
iter_bw_std_dev += std::pow((double)(profiling_data[i]+k)->bytes_transferred[j]/(t0*1.0E9) - (double)(profiling_data[i]+k)->bytes_transferred[j]*iters/(iter_total_write_cycle*1.0E9/vega_gpu_rtc_freq), 2);
}
iter_bw_std_dev = std::sqrt(iter_bw_std_dev/iters);
//store bytes_transferred and write_cycle from all itres into in first iter entry
profiling_data[i]->write_cycles[j] = (uint64_t)iter_total_write_cycle;
profiling_data[i]->bytes_transferred[j] = iter_bytes_transferred;
fprintf(stderr, "%-20d %-d->%-10d %-13d %-13s %-13.3f %-20lu %-8.2f %.3f\n",
i,i, next_gpu,j,link_type_name[linktype], t1*1000, iter_bytes_transferred, (double)(iter_bytes_transferred)/(t1*1.0E9), iter_bw_std_dev);
}
//calculate stddev for rings
mean_write_cycle /= workgroups;
for (int j = 0; j < workgroups; j++) {
double t0 = (double)profiling_data[i]->write_cycles[j]/vega_gpu_rtc_freq;
bw_std_dev += std::pow((double)profiling_data[i]->bytes_transferred[j]/(t0*1.0E9) - (double)profiling_data[i]->bytes_transferred[j]/(mean_write_cycle*1.0E9/vega_gpu_rtc_freq), 2);
}
bw_std_dev = std::sqrt(bw_std_dev/workgroups);
print_table_summary_line();
double total = 0;
total = (double)max_write_cycle/vega_gpu_rtc_freq;
fprintf(stderr, " Workgroups throughput standard deviation %-20.3f %-13.3f %-20lu %-.2f\n",
bw_std_dev, total*1000, bytes_transferred, (double)bytes_transferred/(total*1.0E9));
print_table_summary_line();
#ifdef PRINT_GPU0_ONLY
break;
#endif
}
std::cout << BOLD(FBLU("[Application Level Transfer Profiling Data]"))<<std::endl;
uint64_t total_bytes_transferred = profiling_data[0]->bytes_transferred[0] * workgroups ;
print_table_summary_line();
fprintf(stderr, " %-61s %-13.3f %-20lu %-.2f\n",
"Total" , deltaSec*1000, total_bytes_transferred, (double)total_bytes_transferred/(deltaSec*1.0E9));
print_table_summary_line();
}
for (int i = 0; i < nGpu; i ++) {
HIPCHECK(hipStreamDestroy(stream[i]));
HIPCHECK(hipFree((void*) transfer_data[i]));
for (int j = 0; j < workgroups; j++) {
HIPCHECK(hipFree((void*) buff[i*MAX_WORKGROUPS+j]));
HIPCHECK(hipFree((void*) buff_coarse[i*MAX_WORKGROUPS+j]));
HIPCHECK(hipFree((void*) buff_fine[i*MAX_WORKGROUPS+j]));
}
HIPCHECK(hipFree((void*) d_profiling_data[i]));
free(profiling_data[i]);
}
printf("opCount: ");
for (int i = 0; i < nGpu; i++)
printf("%ld ", remOpCount[i]);
printf("\n");
HIPCHECK(hipHostFree((void*)remOpCount));
}