diff --git a/projects/rccl-tests/README.md b/projects/rccl-tests/README.md index 9630c50fef..c56eac9fcb 100644 --- a/projects/rccl-tests/README.md +++ b/projects/rccl-tests/README.md @@ -81,9 +81,9 @@ All tests support the same set of arguments : * `-m,--agg_iters ` number of operations to aggregate together in each iteration. Default : 1. * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1. * Test operation - * `-p,--parallel_init <0/1>` use threads to initialize RCCL in parallel. Default : 0. - * `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1. - * `-z,--blocking <0/1>` Make RCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. + * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0. + * `-c,--check ` perform count iterations, checking correctness of results on each iteration. This can be quite slow on large numbers of GPUs. Default : 1. + * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. * `-G,--cudagraph ` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0. ## Unit tests diff --git a/projects/rccl-tests/src/all_gather.cu b/projects/rccl-tests/src/all_gather.cu index 759f347d98..f18ce0cb65 100644 --- a/projects/rccl-tests/src/all_gather.cu +++ b/projects/rccl-tests/src/all_gather.cu @@ -22,21 +22,16 @@ void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; + int nranks = args->nProcs*args->nThreads*args->nGpus; - int k=0; for (int i=0; inGpus; i++) { HIPCHECK(hipSetDevice(args->gpus[i])); - - for (int l=0; lnRanks; l++) { - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); - HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); - void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k]; - TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); - for (int j=0; jexpected[k])+args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0)); - } - k++; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); + for (int j=0; jexpected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0)); } HIPCHECK(hipDeviceSynchronize()); } diff --git a/projects/rccl-tests/src/all_reduce.cu b/projects/rccl-tests/src/all_reduce.cu index 92fdbadb4e..de03a206ff 100644 --- a/projects/rccl-tests/src/all_reduce.cu +++ b/projects/rccl-tests/src/all_reduce.cu @@ -19,20 +19,15 @@ void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; + int nranks = args->nProcs*args->nThreads*args->nGpus; - int k = 0; for (int i=0; inGpus; i++) { HIPCHECK(hipSetDevice(args->gpus[i])); - - for (int l=0; lnRanks; l++) { - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); - HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); - void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k]; - TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank)); - TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks)); - k++; - } + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); HIPCHECK(hipDeviceSynchronize()); } return testSuccess; diff --git a/projects/rccl-tests/src/alltoall.cu b/projects/rccl-tests/src/alltoall.cu index 77546f4eb7..acfeb7d8ee 100644 --- a/projects/rccl-tests/src/alltoall.cu +++ b/projects/rccl-tests/src/alltoall.cu @@ -19,22 +19,17 @@ void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *para testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; + int nranks = args->nProcs*args->nThreads*args->nGpus; - int k=0; for (int i=0; inGpus; i++) { HIPCHECK(hipSetDevice(args->gpus[i])); - - for (int l=0; lnRanks; l++) { - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); - HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); - void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k]; - TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); - for (int j=0; jexpected[k])+ j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); - } - k++; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); + for (int j=0; jexpected[i] + j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); } HIPCHECK(hipDeviceSynchronize()); } diff --git a/projects/rccl-tests/src/alltoallv.cu b/projects/rccl-tests/src/alltoallv.cu index 3f2204cd49..73b53d20c3 100644 --- a/projects/rccl-tests/src/alltoallv.cu +++ b/projects/rccl-tests/src/alltoallv.cu @@ -29,49 +29,44 @@ void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; + int nranks = args->nProcs*args->nThreads*args->nGpus; - int k=0; for (int i=0; inGpus; i++) { HIPCHECK(hipSetDevice(args->gpus[i])); - - for (int l=0; lnRanks; l++) { - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); - HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); - void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k]; - TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0)); #if 0 - int *dataHost = (int *)malloc(args->sendBytes); - hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost); - printf(" Rank [%d] Original: ", rank); - for(int j=0; jsendBytes); + hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost); + printf(" Rank [%d] Original: ", rank); + for(int j=0; jexpected[k])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0)); - rdisp += rcount; + size_t rdisp = 0; + size_t data_count = sendcount*2/nranks; + size_t chunksize = data_count/nranks; + for (int j=0; jexpected[i])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0)); + rdisp += rcount; } HIPCHECK(hipDeviceSynchronize()); } diff --git a/projects/rccl-tests/src/broadcast.cu b/projects/rccl-tests/src/broadcast.cu index 3797a84ee9..5cd6147f10 100644 --- a/projects/rccl-tests/src/broadcast.cu +++ b/projects/rccl-tests/src/broadcast.cu @@ -20,18 +20,13 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); - int k=0; for (int i=0; inGpus; i++) { HIPCHECK(hipSetDevice(args->gpus[i])); - - for (int l=0; lnRanks; l++) { - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); - HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); - void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k]; - if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0)); - TESTCHECK(InitData(args->expected[k], recvcount, 0, type, ncclSum, rep, 1, 0)); - k++; - } + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0)); + TESTCHECK(InitData(args->expected[i], recvcount, 0, type, ncclSum, rep, 1, 0)); HIPCHECK(hipDeviceSynchronize()); } return testSuccess; diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu index d71f0a7a35..0096ecb729 100644 --- a/projects/rccl-tests/src/common.cu +++ b/projects/rccl-tests/src/common.cu @@ -91,8 +91,6 @@ static int report_cputime = 0; // Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) static int average = 1; static int numDevices = 1; -static int ranksPerGpu = 1; -static int enable_multiranks = 0; static int delay_inout_place = 0; static int enable_out_of_place = 1; @@ -275,11 +273,9 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t int64_t *wrongPerGpu = nullptr; HIPCHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), hipHostMallocMapped)); - for (int i=0; inGpus*args->nRanks; i++) { - int device; - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i); - NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); - HIPCHECK(hipSetDevice(device)); + for (int i=0; inGpus; i++) { + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + HIPCHECK(hipSetDevice(args->gpus[i])); void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i)); @@ -317,16 +313,16 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t return testSuccess; } -testResult_t testStreamSynchronize(int nStreams, hipStream_t* streams, ncclComm_t* comms) { +testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* comms) { hipError_t hipErr; - int remaining = nStreams; - int* done = (int*)malloc(sizeof(int)*nStreams); - memset(done, 0, sizeof(int)*nStreams); + int remaining = ngpus; + int* done = (int*)malloc(sizeof(int)*ngpus); + memset(done, 0, sizeof(int)*ngpus); timer tim; while (remaining) { int idle = 1; - for (int i=0; i timeout && timeout > 0) { - for (int i=0; imaxbytes / totalnbytes : 1; size_t shift = totalnbytes * (iter % steps); - if (args->nGpus> 1 || args->nRanks > 1) NCCLCHECK(ncclGroupStart()); - for (int i = 0; i < args->nGpus*args->nRanks; i++) { + if (args->nGpus > 1) NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < args->nGpus; i++) { #ifndef NCCL_MAJOR - int hipDev; - NCCLCHECK(ncclCommCuDevice(args->comms[i], &hipDev)); - HIPCHECK(hipSetDevice(hipDev)); + HIPCHECK(hipSetDevice(args->gpus[i])); #endif - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); char* recvBuff = ((char*)args->recvbuffs[i]) + shift; char* sendBuff = ((char*)args->sendbuffs[i]) + shift; ncclRedOp_t op; @@ -436,11 +430,11 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } #endif } - if (args->nGpus > 1 || args->nRanks > 1) NCCLCHECK(ncclGroupEnd()); + if (args->nGpus > 1) NCCLCHECK(ncclGroupEnd()); if (blocking_coll) { // Complete op before returning - TESTCHECK(testStreamSynchronize(args->nGpus*args->nRanks, args->streams, args->comms)); + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); } if (blocking_coll) Barrier(args); return testSuccess; @@ -449,11 +443,10 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t testResult_t completeColl(struct threadArgs* args) { if (blocking_coll) return testSuccess; - TESTCHECK(testStreamSynchronize(args->nGpus*args->nRanks, args->streams, args->comms)); + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); return testSuccess; } -//RCCL: Revisit because of cudaGraphLaunches testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { size_t count = args->nbytes / wordSize(type); if (datacheck) { @@ -470,15 +463,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t Barrier(args); #if HIP_VERSION >= 50221310 - hipGraph_t graphs[args->nGpus*args->nRanks]; - hipGraphExec_t graphExec[args->nGpus*args->nRanks]; + hipGraph_t graphs[args->nGpus]; + hipGraphExec_t graphExec[args->nGpus]; if (cudaGraphLaunches >= 1) { // Begin cuda graph capture - for (int i=0; inGpus*args->nRanks; i++) { + for (int i=0; inGpus; i++) { // Thread local mdoe is needed for: // - Multi-thread mode: where graph capture and instantiation can happen concurrently across threads // - P2P pre-connect: when there is no warm-up, P2P pre-connect is done during graph capture. - // Since pre-connect calls cudaMalloc, we cannot use global capture mode + // Since pre-connect calls hipMalloc, we cannot use global capture mode HIPCHECK(hipStreamBeginCapture(args->streams[i], hipStreamCaptureModeThreadLocal)); } } @@ -497,18 +490,18 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t #if HIP_VERSION >= 50221310 if (cudaGraphLaunches >= 1) { // End cuda graph capture - for (int i=0; inGpus*args->nRanks; i++) { + for (int i=0; inGpus; i++) { HIPCHECK(hipStreamEndCapture(args->streams[i], graphs+i)); } // Instantiate cuda graph - for (int i=0; inGpus*args->nRanks; i++) { + for (int i=0; inGpus; i++) { HIPCHECK(hipGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); } // Resync CPU, restart timing, launch cuda graph Barrier(args); tim.reset(); for (int l=0; lnGpus*args->nRanks; i++) { + for (int i=0; inGpus; i++) { HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i])); } } @@ -526,7 +519,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t #if HIP_VERSION >= 50221310 if (cudaGraphLaunches >= 1) { //destroy cuda graph - for (int i=0; inGpus*args->nRanks; i++) { + for (int i=0; inGpus; i++) { HIPCHECK(hipGraphExecDestroy(graphExec[i])); HIPCHECK(hipGraphDestroy(graphs[i])); } @@ -534,21 +527,21 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t #endif double algBw, busBw; - args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus*args->nRanks); + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); Barrier(args); int64_t wrongElts = 0; static __thread int rep = 0; rep++; - if (datacheck) { + for (int c = 0; c < datacheck; c++) { // Initialize sendbuffs, recvbuffs and expected TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); #if HIP_VERSION >= 50221310 if (cudaGraphLaunches >= 1) { // Begin cuda graph capture for data check - for (int i=0; inGpus*args->nRanks; i++) { + for (int i=0; inGpus; i++) { HIPCHECK(hipStreamBeginCapture(args->streams[i], args->nThreads > 1 ? hipStreamCaptureModeThreadLocal : hipStreamCaptureModeGlobal)); } } @@ -560,15 +553,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t #if HIP_VERSION >= 50221310 if (cudaGraphLaunches >= 1) { // End cuda graph capture - for (int i=0; inGpus*args->nRanks; i++) { + for (int i=0; inGpus; i++) { HIPCHECK(hipStreamEndCapture(args->streams[i], graphs+i)); } // Instantiate cuda graph - for (int i=0; inGpus*args->nRanks; i++) { + for (int i=0; inGpus; i++) { HIPCHECK(hipGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); } // Launch cuda graph - for (int i=0; inGpus*args->nRanks; i++) { + for (int i=0; inGpus; i++) { HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i])); } } @@ -579,7 +572,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t #if HIP_VERSION >= 50221310 if (cudaGraphLaunches >= 1) { //destroy cuda graph - for (int i=0; inGpus*args->nRanks; i++) { + for (int i=0; inGpus; i++) { HIPCHECK(hipGraphExecDestroy(graphExec[i])); HIPCHECK(hipGraphDestroy(graphs[i])); } @@ -590,8 +583,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t //aggregate delta from all threads and procs long long wrongElts1 = wrongElts; + //if (wrongElts) fprintf(stderr, "\nERROR: Data corruption : rank %d size %ld wrongElts %ld\n", args->proc, args->expectedBytes, wrongElts); Allreduce(args, &wrongElts1, /*sum*/4); wrongElts = wrongElts1; + if (wrongElts) break; } double timeUsec = (report_cputime ? cputimeSec : deltaSec)*1.0E6; @@ -615,7 +610,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { - int nranks = args->nProcs*args->nGpus*args->nThreads*args->nRanks; + int nranks = args->nProcs*args->nGpus*args->nThreads; size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; count = size / wordSize(type); @@ -677,30 +672,22 @@ testResult_t threadRunTests(struct threadArgs* args) { testResult_t threadInit(struct threadArgs* args) { char hostname[1024]; getHostName(hostname, 1024); - int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; + int nranks = args->nProcs*args->nThreads*args->nGpus; //set main thread again is_main_thread = (is_main_proc && args->thread == 0) ? 1 : 0; NCCLCHECK(ncclGroupStart()); for (int i=0; inGpus; i++) { + int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; HIPCHECK(hipSetDevice(args->gpus[i])); - - for (int j=0; jnRanks; j++) { - int rank = (args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + j; - if (args->enable_multiranks) - NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); -#ifdef RCCL_MULTIRANKPERGPU - else - NCCLCHECK(ncclCommInitRankMulti(args->comms+i*args->nRanks+j, nranks, args->ncclId, rank, rank)); -#endif - } + NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); } NCCLCHECK(ncclGroupEnd()); TESTCHECK(threadRunTests(args)); - for (int i=0; inGpus*args->nRanks; i++) { + for (int i=0; inGpus; i++) { NCCLCHECK(ncclCommDestroy(args->comms[i])); } return testSuccess; @@ -799,10 +786,6 @@ int main(int argc, char* argv[]) { {"report_cputime", required_argument, 0, 'C'}, {"average", required_argument, 0, 'a'}, {"out_of_place", required_argument, 0, 'O'}, -#ifdef RCCL_MULTIRANKPERGPU - {"enable_multiranks", required_argument, 0, 'x'}, - {"ranks_per_gpu", required_argument, 0, 'R'}, -#endif {"help", no_argument, 0, 'h'}, {} }; @@ -810,11 +793,7 @@ int main(int argc, char* argv[]) { while(1) { int c; -#ifdef RCCL_MULTIRANKPERGPU - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O:a:y:s:u:h:R:x:q:", longopts, &longindex); -#else c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:O:a:y:s:u:h:q:", longopts, &longindex); -#endif if (c == -1) break; @@ -917,14 +896,6 @@ int main(int argc, char* argv[]) { case 'a': average = (int)strtol(optarg, NULL, 0); break; -#ifdef RCCL_MULTIRANKPERGPU - case 'x': - enable_multiranks = (int)strtol(optarg, NULL, 0); - break; - case 'R': - ranksPerGpu = (int)strtol(optarg, NULL, 0); - break; -#endif case 'q': delay_inout_place = (int)strtol(optarg, NULL, 10); break; @@ -942,7 +913,7 @@ int main(int argc, char* argv[]) { "[-m,--agg_iters ] \n\t" "[-w,--warmup_iters ] \n\t" "[-p,--parallel_init <0/1>] \n\t" - "[-c,--check <0/1>] \n\t" + "[-c,--check ] \n\t" #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) "[-o,--op ] \n\t" #elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) @@ -962,10 +933,6 @@ int main(int argc, char* argv[]) { "[-C,--report_cputime <0/1>] \n\t" "[-O,--out_of_place <0/1>] \n\t" "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" -#ifdef RCCL_MULTIRANKPERGPU - "[-x,--enable_multiranks <0/1> enable using multiple ranks per GPU] \n\t" - "[-R,--ranks_per_gpu] \n\t" -#endif "[-q,--delay ] \n\t" "[-h,--help]\n", basename(argv[0])); @@ -987,21 +954,6 @@ int main(int argc, char* argv[]) { (unsigned long long)maxBytes); return -1; } - if (!minReqVersion(2, 12, 12) && enable_multiranks) { - fprintf(stderr, "Multiple Ranks per GPU requested, but rccl library found does not support this feature.\n"); - fprintf(stderr, "Please check LD_LIBRARY_PATH. Resetting enable_multiranks and ranksPerGpu to default values.\n"); - enable_multiranks = 0; - ranksPerGpu = 1; - } - - if (enable_multiranks && parallel_init) { - fprintf(stderr, "Cannot use parallel_init when using multiple ranks per GPU.\n"); - return -1; - } - if (ranksPerGpu > 1 && !enable_multiranks) { - fprintf(stderr, "Need to enable multiranks option to use multiple ranks per GPU\n"); - return -1; - } #ifdef MPI_SUPPORT MPI_Init(&argc, &argv); #endif @@ -1046,10 +998,10 @@ testResult_t run() { #endif is_main_thread = is_main_proc = (proc == 0) ? 1 : 0; - PRINT("# nThreads: %d nGpus: %d nRanks: %d minBytes: %ld maxBytes: %ld step: %ld(%s) warmupIters: %d iters: %d agg iters: %d validation: %d graph: %d\n", - nThreads, nGpus, ranksPerGpu, minBytes, maxBytes, - (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", - warmup_iters, iters, agg_iters, datacheck, cudaGraphLaunches); + PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n", + nThreads, nGpus, minBytes, maxBytes, + (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", + warmup_iters, iters, agg_iters, datacheck, cudaGraphLaunches); if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); PRINT("#\n"); @@ -1062,20 +1014,15 @@ testResult_t run() { char* envstr = getenv("NCCL_TESTS_DEVICE"); int gpu0 = envstr ? atoi(envstr) : -1; for (int i=0; ilen ? MAX_LINE-len : 0, "# Rank %2d Pid %6d on %10s device %2d [%s] %s\n", - rank, getpid(), hostname, hipDev, busIdStr, prop.name); - maxMem = std::min(maxMem, prop.totalGlobalMem); - } + HIPCHECK(hipGetDeviceProperties(&prop, cudaDev)); + char busIdStr[] = "00000000:00:00.0"; + HIPCHECK(hipDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev)); + len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "# Rank %2d Pid %6d on %10s device %2d [%s] %s\n", + rank, getpid(), hostname, cudaDev, busIdStr, prop.name); + maxMem = std::min(maxMem, prop.totalGlobalMem); } #if MPI_SUPPORT char *lines = (proc == 0) ? (char *)malloc(totalProcs*MAX_LINE) : NULL; @@ -1104,64 +1051,40 @@ testResult_t run() { } #ifdef MPI_SUPPORT MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, mpi_comm); + MPI_Barrier(MPI_COMM_WORLD); // Ensure Bcast is complete for HCOLL #endif int gpus[nGpus*nThreads]; - hipStream_t streams[nGpus*nThreads*ranksPerGpu]; - void* sendbuffs[nGpus*nThreads*ranksPerGpu]; - void* recvbuffs[nGpus*nThreads*ranksPerGpu]; - void* expected[nGpus*nThreads*ranksPerGpu]; + hipStream_t streams[nGpus*nThreads]; + void* sendbuffs[nGpus*nThreads]; + void* recvbuffs[nGpus*nThreads]; + void* expected[nGpus*nThreads]; size_t sendBytes, recvBytes; - ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)ncclProcs*nGpus*nThreads*ranksPerGpu); + ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)ncclProcs*nGpus*nThreads); envstr = getenv("NCCL_TESTS_DEVICE"); gpu0 = envstr ? atoi(envstr) : -1; - for (int ii=0; iisendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; + int nranks = args->nProcs*args->nThreads*args->nGpus; - int k=0; for (int i=0; inGpus; i++) { HIPCHECK(hipSetDevice(args->gpus[i])); - - for (int l=0; lnRanks; l++) { - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); - HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); - void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k]; - TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0)); - HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault)); - if (rank == root) { - for (int j=0; jexpected[k]), nranks*sendcount, 0, type, ncclSum, rep, 1, 0)); - } - } - k++; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0)); + HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault)); + if (rank == root) { + TESTCHECK(InitData(args->expected[i], nranks*sendcount, 0, type, ncclSum, rep, 1, 0)); } HIPCHECK(hipDeviceSynchronize()); } diff --git a/projects/rccl-tests/src/hypercube.cu b/projects/rccl-tests/src/hypercube.cu index 5cab39c298..2058de1dd3 100644 --- a/projects/rccl-tests/src/hypercube.cu +++ b/projects/rccl-tests/src/hypercube.cu @@ -22,21 +22,16 @@ void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; + int nranks = args->nProcs*args->nThreads*args->nGpus; - int k=0; for (int i=0; inGpus; i++) { HIPCHECK(hipSetDevice(args->gpus[i])); - - for (int l=0; lnRanks; l++) { - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); - HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); - void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k]; - TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); - for (int j=0; jexpected[k])+args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0)); - } - k++; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); + for (int j=0; jexpected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0)); } HIPCHECK(hipDeviceSynchronize()); } diff --git a/projects/rccl-tests/src/reduce.cu b/projects/rccl-tests/src/reduce.cu index 44c8c4ff7e..62850f8212 100644 --- a/projects/rccl-tests/src/reduce.cu +++ b/projects/rccl-tests/src/reduce.cu @@ -19,21 +19,16 @@ void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramc testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; + int nranks = args->nProcs*args->nThreads*args->nGpus; - int k=0; for (int i=0; inGpus; i++) { HIPCHECK(hipSetDevice(args->gpus[i])); - - for (int l=0; lnRanks; l++) { - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); - HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); - void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k]; - TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank)); - HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault)); - if (rank == root) TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks)); - k++; - } + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank)); + HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault)); + if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); HIPCHECK(hipDeviceSynchronize()); } return testSuccess; diff --git a/projects/rccl-tests/src/reduce_scatter.cu b/projects/rccl-tests/src/reduce_scatter.cu index 2abfa8af53..a58d2578af 100644 --- a/projects/rccl-tests/src/reduce_scatter.cu +++ b/projects/rccl-tests/src/reduce_scatter.cu @@ -22,21 +22,16 @@ void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; + int nranks = args->nProcs*args->nThreads*args->nGpus; - int k=0; for (int i=0; inGpus; i++) { HIPCHECK(hipSetDevice(args->gpus[i])); - - for (int l=0; lnRanks; l++) { - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); - HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); - void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k]; - TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank)); - HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault)); - TESTCHECK(InitDataReduce(args->expected[k], recvcount, rank*recvcount, type, op, rep, nranks)); - k++; - } + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank)); + HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks)); HIPCHECK(hipDeviceSynchronize()); } return testSuccess; diff --git a/projects/rccl-tests/src/scatter.cu b/projects/rccl-tests/src/scatter.cu index 517596dea7..7445624b71 100644 --- a/projects/rccl-tests/src/scatter.cu +++ b/projects/rccl-tests/src/scatter.cu @@ -20,19 +20,13 @@ testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclR size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); - int k=0; for (int i=0; inGpus; i++) { HIPCHECK(hipSetDevice(args->gpus[i])); - - for (int l=0; lnRanks; l++) { - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); - HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); - void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k]; - if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0)); - TESTCHECK(InitData(args->expected[k], recvcount, rank*recvcount, type, ncclSum, rep, 1, 0)); - k++; - - } + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0)); + TESTCHECK(InitData(args->expected[i], recvcount, rank*recvcount, type, ncclSum, rep, 1, 0)); HIPCHECK(hipDeviceSynchronize()); } return testSuccess; diff --git a/projects/rccl-tests/src/sendrecv.cu b/projects/rccl-tests/src/sendrecv.cu index 0d2ae9b6f3..d5b0300cdf 100644 --- a/projects/rccl-tests/src/sendrecv.cu +++ b/projects/rccl-tests/src/sendrecv.cu @@ -19,21 +19,16 @@ void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *para testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; + int nranks = args->nProcs*args->nThreads*args->nGpus; - int k=0; for (int i=0; inGpus; i++) { HIPCHECK(hipSetDevice(args->gpus[i])); - - for (int l=0; lnRanks; l++) { - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); - HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); - void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k]; - TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0)); - int peer = (rank-1+nranks)%nranks; - TESTCHECK(InitData(args->expected[k], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0)); - k++; - } + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0)); + int peer = (rank-1+nranks)%nranks; + TESTCHECK(InitData(args->expected[i], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0)); HIPCHECK(hipDeviceSynchronize()); } // We don't support in-place sendrecv