From 1044cd1f3277edffacd69dc5496619bc60b0f257 Mon Sep 17 00:00:00 2001 From: David Addison Date: Mon, 28 Jun 2021 18:23:12 -0700 Subject: [PATCH] Resync with changes in gitilab-master code [ROCm/rccl-tests commit: 1ae8cdc315d10c3f65764a1915e0fb0f1563d893] --- projects/rccl-tests/src/common.cu | 81 ++++++++++++++----------------- projects/rccl-tests/src/common.h | 5 +- 2 files changed, 40 insertions(+), 46 deletions(-) diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu index c180294644..7aad2c1868 100644 --- a/projects/rccl-tests/src/common.cu +++ b/projects/rccl-tests/src/common.cu @@ -67,6 +67,8 @@ static int parallel_init = 0; static int blocking_coll = 0; static int cudaGraphLaunches = 0; +#define NUM_BLOCKS 32 + double parsesize(char *value) { long long int units; double size; @@ -137,9 +139,9 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) { const T* A = (const T*)A_; const T* B = (const T*)B_; __shared__ double temp[BSIZE]; - int tid = threadIdx.x; + int tid = blockIdx.x*blockDim.x + threadIdx.x; double locmax = 0.0; - for(int i=tid; i locmax ) { @@ -150,6 +152,7 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) { } } + tid = threadIdx.x; temp[tid] = locmax; for(int stride = BSIZE/2; stride > 1; stride>>=1) { __syncthreads(); @@ -158,38 +161,38 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) { } __syncthreads(); if( threadIdx.x == 0) - *max = temp[0] > temp[1] ? temp[0] : temp[1]; + max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; } - -testResult_t CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) { +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { switch (type) { #if defined(__CUDA_BF16_TYPES_EXIST__) case ncclBfloat16: - deltaKern<__nv_bfloat16, 512><<<1, 512>>>(results, expected, count, devmax); break; + deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; #endif case ncclHalf: - deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + deltaKern<<>>(results, expected, count, devmax); break; case ncclFloat: - deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + deltaKern<<>>(results, expected, count, devmax); break; case ncclDouble: - deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + deltaKern<<>>(results, expected, count, devmax); break; case ncclChar: #if NCCL_MAJOR >= 2 case ncclUint8: #endif - deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + deltaKern<<>>(results, expected, count, devmax); break; case ncclInt: #if NCCL_MAJOR >= 2 case ncclUint32: #endif - deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + deltaKern<<>>(results, expected, count, devmax); break; case ncclInt64: case ncclUint64: - deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + deltaKern<<>>(results, expected, count, devmax); break; } CUDACHECK(cudaDeviceSynchronize()); + for (int i=1; isendBytes, args->expectedBytes); - size_t shift = (totalnbytes * iter) % args->maxbytes; - if (shift + totalnbytes > args->maxbytes) shift = 0; + size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1; + size_t shift = totalnbytes * (iter % steps); if (args->nGpus > 1) NCCLCHECK(ncclGroupStart()); for (int i = 0; i < args->nGpus; i++) { @@ -475,6 +478,10 @@ testResult_t completeColl(struct threadArgs* args) { testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { size_t count = args->nbytes / wordSize(type); + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); + } // Sync TESTCHECK(startColl(args, type, op, root, in_place, 0)); @@ -598,10 +605,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } double timeUsec = deltaSec*1.0E6; - char timeStr[10]; + char timeStr[100]; if (timeUsec > 10000.0) { sprintf(timeStr, "%7.0f", timeUsec); - } else if (timeUsec > 100.0) { + } else if (timeUsec >= 100.0) { sprintf(timeStr, "%7.1f", timeUsec); } else { sprintf(timeStr, "%7.2f", timeUsec); @@ -812,31 +819,6 @@ int main(int argc, char* argv[]) { printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n"); #endif break; - case 'h': - printf("USAGE: %s \n\t" - "[-t,--nthreads ] \n\t" - "[-g,--ngpus ] \n\t" - "[-b,--minbytes ] \n\t" - "[-e,--maxbytes ] \n\t" - "[-i,--stepbytes ] \n\t" - "[-f,--stepfactor ] \n\t" - "[-n,--iters ] \n\t" - "[-m,--agg_iters ] \n\t" - "[-w,--warmup_iters ] \n\t" - "[-p,--parallel_init <0/1>] \n\t" - "[-c,--check <0/1>] \n\t" -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - "[-o,--op ] \n\t" -#else - "[-o,--op ] \n\t" -#endif - "[-d,--datatype ] \n\t" - "[-r,--root ] \n\t" - "[-z,--blocking <0/1>] \n\t" - "[-G,--cudagraph ] \n\t" - "[-h,--help]\n", - basename(argv[0])); - return 0; default: if (c != 'h') printf("invalid option '%c'\n", c); printf("USAGE: %s \n\t" @@ -868,7 +850,8 @@ int main(int argc, char* argv[]) { #ifdef MPI_SUPPORT MPI_Init(&argc, &argv); #endif - return run(); + TESTCHECK(run()); + return 0; } testResult_t run() { @@ -900,6 +883,7 @@ testResult_t run() { #define MAX_LINE 2048 char line[MAX_LINE]; int len = 0; + size_t maxMem = ~0; for (int i=0; i memMaxBytes) { + maxBytes = memMaxBytes; + if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes); + } + ncclUniqueId ncclId; if (proc == 0) { NCCLCHECK(ncclGetUniqueId(&ncclId)); @@ -963,7 +956,7 @@ testResult_t run() { int errors[nThreads]; double bw[nThreads]; double* delta; - CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads, cudaHostAllocPortable | cudaHostAllocMapped)); + CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, cudaHostAllocPortable | cudaHostAllocMapped)); int bw_count[nThreads]; for (int t=0; t