From 390c63cf0d849295ece7dceb40fa1901ab187fa1 Mon Sep 17 00:00:00 2001 From: MurtadhaAldallal <43885974+MurtadhaAldallal@users.noreply.github.com> Date: Fri, 7 Aug 2020 11:20:14 -0400 Subject: [PATCH] Update rccl_prim_test.cpp (#246) Adding doublelocalcopy operation and freeing buffer memory at end. DoubleLocalCopy Patch Added --- tools/rccl-prim-test/rccl_prim_test.cpp | 26 +++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/tools/rccl-prim-test/rccl_prim_test.cpp b/tools/rccl-prim-test/rccl_prim_test.cpp index ea45e177d0..bee96442c6 100644 --- a/tools/rccl-prim-test/rccl_prim_test.cpp +++ b/tools/rccl-prim-test/rccl_prim_test.cpp @@ -40,6 +40,7 @@ THE SOFTWARE. #define COPY_UNROLL 4 #define REDUCE_UNROLL 2 #define DOUBLECOPY_UNROLL 2 +#define DOUBLECOPYLOCAL_UNROLL 2 #define REDUCECOPY_UNROLL 2 #define ALL2ALL_UNROLL 2 @@ -72,6 +73,7 @@ struct transfer_data_t { float *dest0[MAX_WORKGROUPS]; //remote fine grain float *src0[MAX_WORKGROUPS]; //local fine grain float *dest1[MAX_WORKGROUPS]; //local coarse grain + float *dest2[MAX_WORKGROUPS]; //local fine grain float *src1[MAX_WORKGROUPS]; //local coarse grain // Buffers for all to all const float *srcs[MAX_WORKGROUPS][MAX_GPU]; @@ -104,6 +106,7 @@ enum Ops { OP_COPY, OP_LOCALCOPY, OP_DOUBLECOPY, + OP_DOUBLECOPYLOCAL, OP_REDUCE, OP_REDUCECOPY, OP_READ, @@ -144,6 +147,13 @@ __global__ void flag_sync_kernel(struct transfer_data_t* transfer_data, struct p ReduceOrCopyMulti, float, 1, 1, 1, 2>(threadIdx.x, THREADS, 1, srcs, 2, dsts, n); } + if (op == OP_DOUBLECOPYLOCAL) { + srcs[0] = transfer_data->src0[bid]; + dsts[0] = transfer_data->dest1[bid]; + dsts[1] = transfer_data->dest2[bid]; + ReduceOrCopyMulti, float, 1, 1, 1, 2>(threadIdx.x, THREADS, + 1, srcs, 2, dsts, n); + } if (op == OP_REDUCE) { srcs[0] = transfer_data->src0[bid]; srcs[1] = transfer_data->src1[bid]; @@ -192,6 +202,7 @@ static flag_sync_kernel_t const flagSyncKerns[NUM_OPS+1] = { flag_sync_kernel, flag_sync_kernel, flag_sync_kernel, + flag_sync_kernel, flag_sync_kernel, flag_sync_kernel, flag_sync_kernel, @@ -368,7 +379,7 @@ static const char* link_type_name[] = {"HT", "QPI", "PCIE", "IB", "XGMI"}; int main(int argc,char* argv[]) { if (cmdOptionExists(argv, argv + argc, "-h")) { - printf("./rccl_prim_test -w num_workgroups -p copy|localcopy|doublecopy|reduce|reducecopy|all2all -i iterations -n bytes -r \"0 1 2 3|3 2 1 0\"\n"); + printf("./rccl_prim_test -w num_workgroups -p copy|localcopy|doublecopy|doublecopylocal|reduce|reducecopy|all2all -i iterations -n bytes -r \"0 1 2 3|3 2 1 0\"\n"); exit(0); } @@ -394,7 +405,7 @@ int main(int argc,char* argv[]) char *r = getCmdOption(argv, argv + argc, "-r"); if (r) printf("User specified ring topology: %s\n", r); - const char *ops[] = {"copy", "localcopy", "doublecopy", "reduce", "reducecopy", "read", "all2all"}; + const char *ops[] = {"copy", "localcopy", "doublecopy", "doublecopylocal", "reduce", "reducecopy", "read", "all2all"}; char *prim = getCmdOption(argv, argv + argc, "-p"); int op = NUM_OPS, begin_op, end_op; if (prim) { @@ -456,6 +467,7 @@ int main(int argc,char* argv[]) // data buffers float *buff[MAX_GPU*MAX_WORKGROUPS], *buff_coarse[MAX_GPU*MAX_WORKGROUPS]; + float *buff_fine[MAX_GPU*MAX_WORKGROUPS]; // additional fine grain buffer for local double copy struct transfer_data_t h_transfer_data[MAX_GPU], *transfer_data[MAX_GPU]; struct profiling_data_t *profiling_data[MAX_GPU], *d_profiling_data[MAX_GPU]; hipStream_t stream[MAX_GPU]; @@ -479,6 +491,8 @@ int main(int argc,char* argv[]) HIPCHECK(hipExtMallocWithFlags((void**) &transfer_data[i], sizeof(struct transfer_data_t), hipDeviceMallocFinegrained)); for (int j = 0; j < workgroups; j++) { HIPCHECK(hipExtMallocWithFlags((void**) &buff[i*MAX_WORKGROUPS+j], 2*N*sizeof(float), hipDeviceMallocFinegrained)); + // additional fine grained buffer for local doublecopy, only need 1 buffer (not used by remote) + HIPCHECK(hipExtMallocWithFlags((void**) &buff_fine[i*MAX_WORKGROUPS+j], N*sizeof(float), hipDeviceMallocFinegrained)); HIPCHECK(hipMalloc((void**) &buff_coarse[i*MAX_WORKGROUPS+j], 2*N*sizeof(float))); //randomize test data hipLaunchKernelGGL(initTestDataKernel, @@ -487,6 +501,12 @@ int main(int argc,char* argv[]) /*dynamic shared mem*/ 0, /*stream*/ stream[i], /*kernel args*/ buff[i*MAX_WORKGROUPS+j], 2*N, 0); + hipLaunchKernelGGL(initTestDataKernel, + /*grid dim x,y,z*/ dim3(32, 1, 1), + /*block dim x,y,z*/ dim3(THREADS, 1, 1), + /*dynamic shared mem*/ 0, + /*stream*/ stream[i], + /*kernel args*/ buff_fine[i*MAX_WORKGROUPS+j], N, 0); hipLaunchKernelGGL(initTestDataKernel, /*grid dim x,y,z*/ dim3(32, 1, 1), /*block dim x,y,z*/ dim3(THREADS, 1, 1), @@ -503,6 +523,7 @@ int main(int argc,char* argv[]) //printf("GPU %d Ring %d -> Next GPU %d\n", i, j, next_gpu); h_transfer_data[i].dest0[j] = buff[next_gpu*MAX_WORKGROUPS+j] + N; h_transfer_data[i].dest1[j] = buff_coarse[i*MAX_WORKGROUPS+j] + N; + h_transfer_data[i].dest2[j] = buff_fine[i*MAX_WORKGROUPS+j]; // additional local fine grain h_transfer_data[i].src0[j] = buff[i*MAX_WORKGROUPS+j]; h_transfer_data[i].src1[j] = buff_coarse[i*MAX_WORKGROUPS+j]; } @@ -685,6 +706,7 @@ int main(int argc,char* argv[]) for (int j = 0; j < workgroups; j++) { HIPCHECK(hipFree((void*) buff[i*MAX_WORKGROUPS+j])); HIPCHECK(hipFree((void*) buff_coarse[i*MAX_WORKGROUPS+j])); + HIPCHECK(hipFree((void*) buff_fine[i*MAX_WORKGROUPS+j])); } HIPCHECK(hipFree((void*) d_profiling_data[i])); free(profiling_data[i]);