From b188a152997740a84b3ce0da864bb4f1423eb35a Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Tue, 8 Aug 2017 16:18:34 -0700 Subject: [PATCH 001/233] Initial commit --- LICENSE.txt | 27 ++ Makefile | 20 + README.md | 62 +++ src/Makefile | 78 ++++ src/all_gather.cu | 106 +++++ src/all_reduce.cu | 130 ++++++ src/broadcast.cu | 121 +++++ src/common.cu | 1036 +++++++++++++++++++++++++++++++++++++++++ src/common.h | 158 +++++++ src/nccl1_compat.h | 47 ++ src/reduce.cu | 159 +++++++ src/reduce_scatter.cu | 139 ++++++ 12 files changed, 2083 insertions(+) create mode 100644 LICENSE.txt create mode 100644 Makefile create mode 100644 README.md create mode 100644 src/Makefile create mode 100644 src/all_gather.cu create mode 100644 src/all_reduce.cu create mode 100644 src/broadcast.cu create mode 100644 src/common.cu create mode 100644 src/common.h create mode 100644 src/nccl1_compat.h create mode 100644 src/reduce.cu create mode 100644 src/reduce_scatter.cu diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000000..4573c07c44 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,27 @@ + + Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of NVIDIA CORPORATION, nor the names of their + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000..29409a8422 --- /dev/null +++ b/Makefile @@ -0,0 +1,20 @@ +# +# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +.PHONY : all clean + +default : src.build + +TARGETS=src + +all: ${TARGETS:%=%.build} +clean: ${TARGETS:%=%.clean} + +%.build: + ${MAKE} -C $* build + +%.clean: + ${MAKE} -C $* clean diff --git a/README.md b/README.md new file mode 100644 index 0000000000..d70bb1f54c --- /dev/null +++ b/README.md @@ -0,0 +1,62 @@ +# NCCL Tests + +These tests check both the performance and the correctness of NCCL operations. They can be compiled against [NCCL 1](http://github.com/nvidia/nccl) and [NCCL 2](http://developer.nvidia.com/nccl). + +## Build + +To build the tests, just type `make`. + +If CUDA is not installed in /usr/local/cuda, you may specify CUDA\_HOME. Similarly, if NCCL is not installed in /usr, you may specify NCCL\_HOME. + +```shell +$ make CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl +``` + +NCCL tests rely on MPI to work on multiple processes, hence multiple nodes. 
If you want to compile the tests with MPI support, you need to set MPI=1 and set MPI\_HOME to the path where MPI is installed. + +```shell +$ make MPI=1 MPI_HOME=/path/to/mpi CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl +``` + +## Usage + +NCCL tests can run on multiple processes, multiple threads, and multiple CUDA devices per thread. The number of processes is managed by MPI and is therefore not passed to the tests as an argument. The total number of ranks (=CUDA devices) will be equal to (number of processes)\*(number of threads)\*(number of gpus per thread). + +### Quick examples + +Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes : +```shell +$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8 +``` + +Run with MPI on 40 processes (potentially on multiple nodes) with 4 GPUs each, disabling checks : +```shell +$ mpirun -np 40 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4 -c 0 +``` + +All tests support the same arguments : +* Number of GPUs + * `-t,--nthreads <num threads>` number of threads per process. Default : 1. + * `-g,--ngpus <gpus per thread>` number of gpus per thread. Default : 1. +* Sizes to scan + * `-b,--minbytes <min size in bytes>` minimum size to start with. Default : 32M. + * `-e,--maxbytes <max size in bytes>` maximum size to end at. Default : 32M. + * Increments can be either fixed or a multiplication factor. Only one of those should be used. + * `-i,--stepbytes <increment size>` fixed increment between sizes. Default : (max-min)/10. + * `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default : disabled. +* Performance + * `-n,--iters <iteration count>` number of iterations. Default : 20. + * `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5. +* `-s,--swap_args <0/1>` when used with multiple threads, have threads manage different GPUs for each iteration. Default : 0. +* `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0. +* `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1. 
+* NCCL operations arguments + * `-o,--op <sum/prod/min/max/all>` Specify which reduction operation to perform. Only relevant for reduction operations. Default : Sum. + * `-d,--datatype <nccltype/all>` Specify which datatype to use. Default : Float. + * `-r,--root <root>` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0. + * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. + +## Copyright + +NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved. + diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000000..6188d01424 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,78 @@ +# +# Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +CUDA_HOME ?= /usr/local/cuda +PREFIX ?= /usr/local +VERBOSE ?= 0 +DEBUG ?= 0 + +CUDA_LIB ?= $(CUDA_HOME)/lib64 +CUDA_INC ?= $(CUDA_HOME)/include +NVCC = $(CUDA_HOME)/bin/nvcc + +# Better define NVCC_GENCODE in your environment to the minimal set +# of archs to reduce compile time. 
+NVCC_GENCODE ?= -gencode=arch=compute_30,code=sm_30 \ + -gencode=arch=compute_35,code=sm_35 \ + -gencode=arch=compute_50,code=sm_50 \ + -gencode=arch=compute_52,code=sm_52 \ + -gencode=arch=compute_60,code=sm_60 \ + -gencode=arch=compute_61,code=sm_61 \ + -gencode=arch=compute_61,code=compute_61 + +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 + +LDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt + +ifeq ($(DEBUG), 0) +NVCUFLAGS += -O3 +CXXFLAGS += -O3 +else +NVCUFLAGS += -O0 -G -g +CXXFLAGS += -O0 -g -ggdb3 +endif + +ifeq ($(VERBOSE), 0) +.SILENT: +endif + +.PHONY: build clean + +BUILDDIR ?= ../build +ifneq ($(NCCLDIR), "") +NVCUFLAGS += -I$(NCCLDIR)/include/ +NVLDFLAGS += -L$(NCCLDIR)/lib +endif + +ifeq ($(MPI), 1) +NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include +NVLDFLAGS += -L$(MPI_HOME)/lib -lmpi +endif +LIBRARIES += curand nccl nvToolsExt +NVLDFLAGS += $(LIBRARIES:%=-l%) + +DST_DIR := $(BUILDDIR) +SRC_FILES := $(wildcard *.cu) +OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) +BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce +BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) + +build: ${BIN_FILES} + +clean: + rm -rf ${DST_DIR} + +${DST_DIR}/%.o: %.cu + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) -c $< + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o + @printf "Linking %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} + diff --git a/src/all_gather.cu b/src/all_gather.cu new file mode 100644 index 0000000000..2386842cdd --- /dev/null +++ b/src/all_gather.cu @@ -0,0 +1,106 @@ +/************************************************************************* + * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENCE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common.h" + + +void print_header() { + PRINT("# %10s %12s %6s %6s out-of-place in-place\n", "", "", "", ""); + PRINT("# %10s %12s %6s %7s %5s %5s %7s %7s %5s %5s %7s\n", "bytes", "N", "type", + "time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res"); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %6s", size, count, typeName); +} + +void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) { + *sendcount = count/nranks; + *recvcount = (count/nranks)*nranks; + *sameExpected = 1; + *procSharedCount = 0; + *sendInplaceOffset = count/nranks; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) { + size_t nBytes = args->nbytes; + size_t count = nBytes / wordSize(type); + int proc = args->proc; + int nThreads = args->nThreads; + int t = args->thread; + int nGpus = args->nGpus; + + while (args->sync[args->sync_idx] != t) pthread_yield(); + + for (int i=0; iproc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + + void* data = in_place ? 
(void *)((uintptr_t)args->recvbuffs[i] + args->sendInplaceOffset*rank) : args->sendbuffs[i]; + + CUDACHECK(cudaMemcpy((void *)((uintptr_t)args->expectedHost[0] + ((proc*nThreads + t)*nGpus + i)*nBytes), + data, + nBytes, cudaMemcpyDeviceToHost)); + + if (in_place == 0) { + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + } + CUDACHECK(cudaDeviceSynchronize()); + } + + args->sync[args->sync_idx] = t + 1; + + if (t+1 == nThreads) { +#ifdef MPI_SUPPORT + // Last thread does the MPI allgather + MPI_Allgather(MPI_IN_PLACE, nBytes*nThreads*nGpus, MPI_BYTE, + args->expectedHost[0], + nBytes*nThreads*nGpus, MPI_BYTE, MPI_COMM_WORLD); +#endif + args->sync[args->sync_idx] = 0; + } else { + while (args->sync[args->sync_idx]) pthread_yield(); + } + + args->sync_idx=!args->sync_idx; +} + +void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = 1; + *busBw = baseBw * factor; +} + +void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream)); +} + +void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = ncclNumTypes; + run_types = test_types; + run_typenames = test_typenames; + } + + for (int i=0; inbytes / wordSize(type); + + while (args->sync[args->sync_idx] != args->thread) pthread_yield(); + + for (int i=0; inGpus; i++) { + int device; + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; + + if (is_first && i == 0) { + CUDACHECK(cudaMemcpy(args->expected[0], data, count*wordSize(type), cudaMemcpyDeviceToHost)); + } else { + Accumulate(args->expected[0], data, count, type, op); + } + + if (in_place == 0) { + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->nbytes)); + } + CUDACHECK(cudaDeviceSynchronize()); + } + + args->sync[args->sync_idx] = args->thread + 1; + + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + // Last thread does the MPI reduction + if (args->nbytes > 0) { + void* remote, *remoteHost = malloc(args->nbytes); + void* myInitialData = malloc(args->nbytes); + memcpy(myInitialData, args->expectedHost[0], args->nbytes); + CUDACHECK(cudaHostRegister(remoteHost, args->nbytes, cudaHostRegisterPortable | cudaHostRegisterMapped)); + CUDACHECK(cudaHostGetDevicePointer(&remote, remoteHost, 0)); + for (int i=0; inProcs; i++) { + if (i == args->proc) { + MPI_Bcast(myInitialData, args->nbytes, MPI_BYTE, i, MPI_COMM_WORLD); + free(myInitialData); + } else { + MPI_Bcast(remoteHost, args->nbytes, MPI_BYTE, i, MPI_COMM_WORLD); + Accumulate(args->expected[0], remote, count, type, op); + cudaDeviceSynchronize(); + } + } + CUDACHECK(cudaHostUnregister(remoteHost)); + free(remoteHost); + } +#endif + args->sync[args->sync_idx] = 0; + } else { + while (args->sync[args->sync_idx]) pthread_yield(); + } + + args->sync_idx = !args->sync_idx; +} + +void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(2*(nranks - 1)))/((double)nranks); + *busBw = baseBw * factor; +} + +void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); +} + + +void RunTest(struct threadArgs_t* args, int root, 
ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = ncclNumTypes; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = ncclNumOps; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i + +void print_header() { + PRINT("# %10s %12s %6s %6s out-of-place\n", "", "", "", ""); + PRINT("# %10s %12s %6s %6s %7s %5s %5s %7s\n", "bytes", "N", "type", "root", + "time", "algbw", "busbw", "res"); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %6s %6i", size, count, typeName, root); +} + +void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sameExpected = 0; + *procSharedCount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) { + int root_proc = root/(args->nThreads*args->nGpus); + int root_thread = (root/args->nGpus)%(args->nThreads); + int root_gpu = root%args->nGpus; + + assert(args->expectedBytes == args->nbytes); + + if (root_thread == args->thread) { + if (root_proc == args->proc) { + CUDACHECK(cudaMemcpy(args->procSharedHost, + args->sendbuffs[root_gpu], + args->nbytes, cudaMemcpyDeviceToHost)); + } +#ifdef MPI_SUPPORT + MPI_Bcast(args->procSharedHost, args->nbytes, MPI_BYTE, root_proc, MPI_COMM_WORLD); +#endif + + 
args->sync[0] = 0; + } + + Barrier(args); + + for (int i=0; inGpus; i++) { + int device; + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + + //set expected buf to zero at root, copy over source data at others + if ((root_proc == args->proc) + && (root_thread == args->thread) + && (root_gpu == i)) { + memset(args->expectedHost[i], 0, args->nbytes); + } else { + memcpy(args->expectedHost[i], args->procSharedHost, args->nbytes); + } + + //reset recvbufs to zero + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->nbytes)); + CUDACHECK(cudaDeviceSynchronize()); + } + + Barrier(args); +} + +void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = 1; + *busBw = baseBw * factor; +} + +void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + int rank; + NCCLCHECK(ncclCommUserRank(comm, &rank)); + if (rank == root) { + NCCLCHECK(ncclBcast(sendbuff, count, type, root, comm, stream)); + } else { + NCCLCHECK(ncclBcast(recvbuff, count, type, root, comm, stream)); + } +} + +void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + int begin_root, end_root; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = ncclNumTypes; + run_types = test_types; + run_typenames = test_typenames; + } + + if (root != -1) { + begin_root = end_root = root; + } else { + begin_root = 0; + end_root = args->nProcs*args->nThreads*args->nGpus-1; + } + + for (int i=0; i +#include +#include +#include "cuda.h" + +#if NCCL_MAJOR >= 2 +ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, 
ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble}; +const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"}; +#else +ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; +const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; +#endif +ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin}; +const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min"}; + +thread_local int is_main_thread = 0; + +static int datacheck = 1; +static int warmup_iters = 5; +static int iters = 20; +static int ncclop = ncclSum; +static int nccltype = ncclFloat; +static int ncclroot = 0; +static int swap_args = 0; +static int parallel_init = 0; +static int blocking_coll = 0; + +double parsesize(char *value) { + long long int units; + double size; + + if (strchr(value, 'G') != NULL) { + units=1024*1024*1024; + } else if (strchr(value, 'M') != NULL) { + units=1024*1024; + } else if (strchr(value, 'K') != NULL) { + units=1024; + } else { + units=1; + } + + size = atof(value)*units; + return size; +} + +double DeltaMaxValue(ncclDataType_t type) { + switch(type) { + case ncclHalf: return 1e-2; + case ncclFloat: return 1e-5; + case ncclDouble: return 1e-12; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint8: + //case ncclInt32: + case ncclUint32: +#endif + case ncclInt64: + case ncclUint64: return 1e-200; + } + return 1e-200; +} + +template __device__ +double absDiff(T a, T b) { + return fabs((double)(b - a)); +} + +template<> __device__ +double absDiff(half a, half b) { + float x = __half2float(a); + float y = __half2float(b); + return fabs((double)(y-x)); +} + +template __device__ +float toFloat(T a) { + return (float)a; +} +template<> __device__ +float toFloat(half a) { + return __half2float(a); +} + + +template __global__ +void deltaKern(void* A_, void* B_, size_t count, 
double* max) { + const T* A = (const T*)A_; + const T* B = (const T*)B_; + __shared__ double temp[BSIZE]; + int tid = threadIdx.x; + double locmax = 0.0; + for(int i=tid; i locmax ) { + locmax = delta; +#ifdef DEBUG_PRINT + if (delta > .1) printf("Error at %d/%d : %f != %f\n", i, count, toFloat(A[i]), toFloat(B[i])); +#endif + } + } + + temp[tid] = locmax; + for(int stride = BSIZE/2; stride > 1; stride>>=1) { + __syncthreads(); + if( tid < stride ) + temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride]; + } + __syncthreads(); + if( threadIdx.x == 0) + *max = temp[0] > temp[1] ? temp[0] : temp[1]; +} + + +void CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) { + switch (type) { + case ncclHalf: + deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + case ncclFloat: + deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + case ncclDouble: + deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + + case ncclChar: +#if NCCL_MAJOR >= 2 + case ncclUint8: +#endif + deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + case ncclInt: +#if NCCL_MAJOR >= 2 + case ncclUint32: +#endif + deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + case ncclInt64: + case ncclUint64: + deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + } +} + +#define CURAND_CHK(cmd) \ + do { \ + curandStatus_t error = (cmd); \ + if (error != CURAND_STATUS_SUCCESS) { \ + printf("CuRAND error %i at %s:%i\n", error, __FILE__ , __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } while (false) + + +template +void GenerateRandom(curandGenerator_t generator, T * const dest, + const size_t N); + +template<> +void GenerateRandom(curandGenerator_t generator, int8_t * const dest, + const size_t N) { + size_t align = (4 - (((size_t)dest) & 3)) % 4; + CURAND_CHK(curandGenerate(generator, (unsigned int*)(dest+align), + N * sizeof(int8_t) / sizeof(int))); + CUDACHECK(cudaMemcpy(dest, 
dest+4, align, cudaMemcpyDeviceToDevice)); +} +template<> +void GenerateRandom(curandGenerator_t generator, uint8_t * const dest, + const size_t N) { + size_t align = (4 - (((size_t)dest) & 3)) % 4; + CURAND_CHK(curandGenerate(generator, (unsigned int*)(dest+align), + N * sizeof(uint8_t) / sizeof(int))); + CUDACHECK(cudaMemcpy(dest, dest+4, align, cudaMemcpyDeviceToDevice)); +} + +template<> +void GenerateRandom(curandGenerator_t generator, int32_t * const dest, + const size_t N) { + CURAND_CHK(curandGenerate(generator, (unsigned int*)dest, N)); +} + +template<> +void GenerateRandom(curandGenerator_t generator, uint32_t * const dest, + const size_t N) { + CURAND_CHK(curandGenerate(generator, (unsigned int*)dest, N)); +} + +template<> +void GenerateRandom(curandGenerator_t generator, float * const dest, + const size_t N) { + CURAND_CHK(curandGenerateUniform(generator, dest, N)); +} + +template<> +void GenerateRandom(curandGenerator_t generator, double * const dest, + const size_t N) { + CURAND_CHK(curandGenerateUniformDouble(generator, dest, N)); +} + +template<> +void GenerateRandom(curandGenerator_t generator, uint64_t * const dest, + const size_t N) { + CURAND_CHK(curandGenerate(generator, (unsigned int *)dest, N*2)); +} + +template<> +void GenerateRandom(curandGenerator_t generator, int64_t * const dest, + const size_t N) { + CURAND_CHK(curandGenerate(generator, (unsigned int *)dest, N*2)); +} + +template +void RandomizeType(void* dest, const size_t N, const int randomSeed) { + T* ptr = (T*)dest; + curandGenerator_t gen; + CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32)); + CURAND_CHK(curandSetPseudoRandomGeneratorSeed(gen, randomSeed)); + GenerateRandom(gen, ptr, N); + CURAND_CHK(curandDestroyGenerator(gen)); + CUDACHECK(cudaDeviceSynchronize()); +} + +__global__ void halve(const float * src, half* dest, size_t N) { + for(int tid = threadIdx.x + blockIdx.x*blockDim.x; + tid < N; tid += blockDim.x * gridDim.x) + dest[tid] = 
__float2half(src[tid]); +} + +void RandomizeHalf(void* dest, const size_t N, const int randomSeed) { + half* ptr = (half*)dest; + curandGenerator_t gen; + CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32)); + CURAND_CHK(curandSetPseudoRandomGeneratorSeed(gen, randomSeed)); + + float* temp; + CUDACHECK(cudaMalloc(&temp, N*sizeof(float))); + GenerateRandom(gen, temp, N); + halve<<<128, 512>>>(temp, ptr, N); + CURAND_CHK(curandDestroyGenerator(gen)); + CUDACHECK(cudaFree(temp)); + CUDACHECK(cudaDeviceSynchronize()); +} + +void Randomize(void* ptr, const size_t count, ncclDataType_t type, const int seed) { + switch (type) { + case ncclChar: RandomizeType (ptr, count, seed); break; +#if NCCL_MAJOR >= 2 + case ncclUint8: RandomizeType (ptr, count, seed); break; +#endif + case ncclInt: RandomizeType (ptr, count, seed); break; +#if NCCL_MAJOR >= 2 + case ncclUint32: RandomizeType(ptr, count, seed); break; +#endif + case ncclInt64: RandomizeType (ptr, count, seed); break; + case ncclUint64: RandomizeType(ptr, count, seed); break; + case ncclHalf: RandomizeHalf (ptr, count, seed); break; + case ncclFloat: RandomizeType (ptr, count, seed); break; + case ncclDouble: RandomizeType (ptr, count, seed); break; + } +} + +template __global__ static +void accumKern(T* acum, const T* contrib, size_t N) { + int tid = threadIdx.x + blockIdx.x*blockDim.x; + int offset = blockDim.x*gridDim.x; + for(int i=tid; i c) ? a : c; + } else if(OP == ncclMin) { + acum[i] = (a < c) ? 
a : c; + } + } +} + +template<> __global__ +void accumKern(half* acum, const half* contrib, size_t N) { + int tid = threadIdx.x + blockIdx.x*blockDim.x; + int offset = blockDim.x*gridDim.x; + for(int i=tid; i __global__ +void accumKern(half* acum, const half* contrib, size_t N) { + int tid = threadIdx.x + blockIdx.x*blockDim.x; + int offset = blockDim.x*gridDim.x; + for(int i=tid; i __global__ +void accumKern(half* acum, const half* contrib, size_t N) { + int tid = threadIdx.x + blockIdx.x*blockDim.x; + int offset = blockDim.x*gridDim.x; + for(int i=tid; ic) ? a : c ); + } +} + +template<> __global__ +void accumKern(half* acum, const half* contrib, size_t N) { + int tid = threadIdx.x + blockIdx.x*blockDim.x; + int offset = blockDim.x*gridDim.x; + for(int i=tid; i +void accVecType(void* out, void* in, size_t n, ncclRedOp_t op) { + switch(op) { + case ncclSum: accumKern <<<256,256>>>((T*)out, (T*)in, n); break; + case ncclProd: accumKern<<<256,256>>>((T*)out, (T*)in, n); break; + case ncclMax: accumKern <<<256,256>>>((T*)out, (T*)in, n); break; + case ncclMin: accumKern <<<256,256>>>((T*)out, (T*)in, n); break; + default: + printf("Unknown reduction operation.\n"); + exit(EXIT_FAILURE); + } +} + +void Accumulate(void* out, void* in, size_t n, ncclDataType_t type, ncclRedOp_t op) { + switch (type) { + case ncclChar: accVecType (out, in, n, op); break; +#if NCCL_MAJOR >= 2 + case ncclUint8: accVecType (out, in, n, op); break; +#endif + case ncclInt: accVecType (out, in, n, op); break; +#if NCCL_MAJOR >= 2 + case ncclUint32: accVecType (out, in, n, op); break; +#endif + case ncclInt64: accVecType (out, in, n, op); break; + case ncclUint64: accVecType (out, in, n, op); break; + case ncclHalf: accVecType (out, in, n, op); break; + case ncclFloat: accVecType (out, in, n, op); break; + case ncclDouble: accVecType (out, in, n, op); break; + default: + printf("Unknown reduction type.\n"); + exit(EXIT_FAILURE); + } +} + +void Barrier(struct threadArgs_t* args) +{ + while 
(args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + + args->barrier[args->barrier_idx] = args->thread + 1; + + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); +#endif + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + + args->barrier_idx=!args->barrier_idx; +} + +void RandomizeAccumulate(void* data, void* accum, size_t count, ncclDataType_t type, ncclRedOp_t op, int seed, int rank) { + Randomize(data, count, type, seed); + if (rank == 0) { + CUDACHECK(cudaMemcpy(accum, data, count*wordSize(type), cudaMemcpyDeviceToHost)); + } else { + Accumulate(accum, data, count, type, op); + } +} + +double CheckData(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { + size_t count = args->expectedBytes/wordSize(type); + double maxDelta = 0.0; + for (int i=0; inGpus; i++) { + int device; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; + CheckDelta(data , args->expected[i], count, type, args->delta); + cudaDeviceSynchronize(); + maxDelta = std::max(*(args->deltaHost), maxDelta); + +#ifdef DEBUG_PRINT + if (rank == 0) { + int *temp = (int *)malloc(args->expectedBytes); + + printf("\n Expected: "); + for(int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, *((int *)args->expectedHost[0] + j)); + } + printf("\n"); + + cudaMemcpy(temp, data, args->expectedBytes, cudaMemcpyDeviceToHost); + printf("\n Actual: "); + for (int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, *((int *)temp + j)); + } + printf("\n"); + free(temp); + } +#endif + } + double nranks = args->nProcs*args->nThreads*args->nGpus; + if (maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + return maxDelta; +} + +void InitSend(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) { + size_t count = args->sendBytes / wordSize(type); + static int rep = 1; + for (int i=0; inGpus; i++) { + int device; + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void* data = in_place ? 
(void *)((uintptr_t)args->recvbuffs[i] + args->sendInplaceOffset*rank) : args->sendbuffs[i]; + int seed = rank+count+rep+in_place; + Randomize(data, count, type, seed); + +#ifdef DEBUG_PRINT + if (rank == 2) { + int *temp = (int *)malloc(args->sendBytes); + cudaMemcpy(temp, data, args->sendBytes, cudaMemcpyDeviceToHost); + printf("\n Send Data at rank %d:", rank); + for (int i=0; isendBytes/sizeof(int); i++) { + printf("%d:%d ", i, *((int *)temp + i)); + } + printf("\n"); + free(temp); + } +#endif + + cudaDeviceSynchronize(); + } + rep++; +} + +#define CHECK 1 + +void startColl(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int thread_offset) { + size_t count = args->nbytes / wordSize(type); + + if (swap_args) { + args = (struct threadArgs_t*)args->proc_args + (args->thread + thread_offset)%args->nThreads; + } + + if (args->nGpus == 1) { + int rank = args->proc*args->nThreads + args->thread; + RunColl((void*)(in_place ? ((void *)((uintptr_t)args->recvbuffs[0] + args->sendInplaceOffset*rank)) : args->sendbuffs[0]), + (void*)(in_place ? (void*)((uintptr_t)args->recvbuffs[0] + args->recvInplaceOffset*rank) : args->recvbuffs[0]), + count, type, op, root, args->comms[0], args->streams[0]); + } else { + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < args->nGpus; i++) { +#ifndef NCCL_MAJOR + int cudaDev; + NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); +#endif + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + RunColl((void*)(in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->sendInplaceOffset*rank)) : args->sendbuffs[i]), + (void*)(in_place ? 
(void*)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank) : args->recvbuffs[i]), + count, type, op, root, args->comms[i], args->streams[i]); + } + NCCLCHECK(ncclGroupEnd()); + } + + if (swap_args || blocking_coll) { + //if args have been swapped, complete op before returning + for (int i = 0; i < args->nGpus; ++i) { + cudaError_t err = cudaErrorNotReady; + while (err == cudaErrorNotReady) { + err = cudaStreamQuery(args->streams[i]); + pthread_yield(); + } + CUDACHECK(err); + } + } + if (blocking_coll) Barrier(args); +} + +void completeColl(struct threadArgs_t* args) { + //it swap_args was enabled, op would have been completed immediately + if (swap_args || blocking_coll) return; + + for (int i = 0; i < args->nGpus; ++i) { + cudaError_t err = cudaErrorNotReady; + while (err == cudaErrorNotReady) { + err = cudaStreamQuery(args->streams[i]); + pthread_yield(); + } + CUDACHECK(err); + } +} + +void BenchTime(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { + size_t count = args->nbytes / wordSize(type); + + // Sync + startColl(args, type, op, root, in_place, 0); + completeColl(args); + + Barrier(args); + + // Performance Benchmark + auto start = std::chrono::high_resolution_clock::now(); + for (int iter = 0; iter < iters; iter++) { + startColl(args, type, op, root, in_place, iter); + } + completeColl(args); + + auto delta = std::chrono::high_resolution_clock::now() - start; + double deltaSec = std::chrono::duration_cast>(delta).count(); + deltaSec = deltaSec/iters; + + double algBw, busBw; + GetBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); + + Barrier(args); + + if (datacheck) { + InitSend(args, type, op, root, in_place, args->thread == 0 ? 1 : 0); + InitRecvResult(args, type, op, root, in_place, args->thread == 0 ? 
1 : 0); + cudaDeviceSynchronize(); + } + + //test validation in single itertion, should ideally be included into the multi-iteration run + startColl(args, type, op, root, in_place, 0); + completeColl(args); + + double maxDelta = 0; +#ifdef CHECK + if (datacheck) { + maxDelta = CheckData(args, type, op, root, in_place); + } else { + maxDelta = -1.0; + } +#else + maxDelta = -1.0; +#endif + + //aggregate delta from all threads and procs + Barrier(args); + if (args->thread == 0) { + for (int i=1; inThreads; i++) { + maxDelta += args->deltaThreads[i]; + } +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &maxDelta, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); +#endif + } + Barrier(args); + + if (datacheck) { + PRINT(" %7.3f %5.2f %5.2f %7.0le", deltaSec * 1.0E3, algBw, busBw, + maxDelta); + } else { + PRINT(" %7.3f %5.2f %5.2f \tN/A", deltaSec * 1.0E3, algBw, busBw); + } + + args->bw[0] += busBw; + args->bw_count[0]++; +} + +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs_t* args) { + int nranks = args->nProcs*args->nGpus*args->nThreads; + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset, procSharedCount; + int sameExpected; + + count = size / wordSize(type); + getCollByteCount(&sendCount, &recvCount, ¶mCount, &sendInplaceOffset, &recvInplaceOffset, &procSharedCount, &sameExpected, (size_t)count, (size_t)nranks); + + args->nbytes = paramCount * wordSize(type); + args->sendBytes = sendCount * wordSize(type); + args->expectedBytes = recvCount * wordSize(type); + args->sendInplaceOffset = sendInplaceOffset * wordSize(type); + args->recvInplaceOffset = recvInplaceOffset * wordSize(type); +} + +void TimeTest(struct threadArgs_t* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root, int inPlace) { + // Warm-up + setupArgs(args->maxbytes, type, args); + for (int iter = 0; iter < warmup_iters; iter++) { + startColl(args, type, op, root, 0, iter); + } + completeColl(args); + + // 
Benchmark + for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { + setupArgs(size, type, args); + print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + BenchTime(args, type, op, root, 0); + if (inPlace) BenchTime(args, type, op, root, 1); + PRINT("\n"); + } +} + + +void* threadRunTests(void* args) { + struct threadArgs_t* targs = (struct threadArgs_t*)args; + // Set device to the first of our GPUs. If we don't do that, some operations + // will be done on the current GPU (by default : 0) and if the GPUs are in + // exclusive mode those operations will fail. + int gpuid = targs->localRank*targs->nThreads*targs->nGpus + targs->thread*targs->nGpus; + CUDACHECK(cudaSetDevice(gpuid)); + + RunTest(targs, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]); + + return NULL; +} + +void* threadInit(void* args) { + struct threadArgs_t* targs = (struct threadArgs_t*)args; + char hostname[1024]; + getHostName(hostname, 1024); + int nranks = targs->nProcs*targs->nThreads*targs->nGpus; + + //set main thread again + is_main_thread = (targs->proc == 0 && targs->thread == 0) ? 
1 : 0; + + NCCLCHECK(ncclGroupStart()); + for (int i=0; inGpus; i++) { + int rank = targs->proc*targs->nThreads*targs->nGpus + targs->thread*targs->nGpus + i; + int gpuid = targs->localRank*targs->nThreads*targs->nGpus + targs->thread*targs->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + NCCLCHECK(ncclCommInitRank(targs->comms+i, nranks, targs->ncclId, rank)); + } + NCCLCHECK(ncclGroupEnd()); + + PRINT("# Using devices\n"); + for (int p=0; pnProcs; p++) { + if (p == targs->proc) { + for (int t=0; tnThreads; t++) { + if (t == targs->thread) { + for (int i=0; inGpus; i++) { + int cudaDev; + int rank; + cudaDeviceProp prop; + NCCLCHECK(ncclCommCuDevice(targs->comms[i], &cudaDev)); + NCCLCHECK(ncclCommUserRank(targs->comms[i], &rank)); + CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); + printf("# Rank %2d on %10s device %2d [0x%02x] %s\n", rank, hostname, cudaDev, + prop.pciBusID, prop.name); + fflush(stdout); + } + Barrier(targs); + fflush(stdout); + } + } + } + } + + threadRunTests(args); + + return NULL; +} + +void AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, void **expectedHost, size_t nbytes, int nranks, int sameExpected) { + static int is_first = 1; + static void *cached_ptr = NULL; + static void *cached_hostptr = NULL; + + CUDACHECK(cudaMalloc(sendbuff, sendBytes)); + //work around for inline reduce scatter where recv count is smaller that send count + CUDACHECK(cudaMalloc(recvbuff, (sendBytes > recvBytes) ? 
sendBytes : recvBytes)); + + if (is_first || !sameExpected) { + *expectedHost = malloc(recvBytes); + CUDACHECK(cudaHostRegister(*expectedHost, recvBytes, cudaHostRegisterPortable | cudaHostRegisterMapped)); + CUDACHECK(cudaHostGetDevicePointer(expected, *expectedHost, 0)); + cached_ptr = *expected; + cached_hostptr = *expectedHost; + is_first = 0; + } else { + *expected = cached_ptr; + *expectedHost = cached_hostptr; + } +} + +int ncclstringtotype(char *str) { + for (int t=0; t] \n\t " + "[-g,--ngpus ] \n\t " + "[-b,--minbytes ] \n\t " + "[-e,--maxbytes ] \n\t " + "[-i,--stepbytes ] \n\t " + "[-f,--stepfactor ] \n\t " + "[-n,--iters ] \n\t " + "[-w,--warmup_iters ] \n\t" + "[-s,--swap_args <0/1>] \n\t " + "[-p,--parallel_init <0/1>] \n\t " + "[-c,--check <0/1>] \n\t " + "[-o,--op ] \n\t " + "[-d,--datatype ] \n\t " + "[-r,--root ] \n\t " + "[-z,--blocking <0/1>] \n\t " + "[-h,--help]\n"); + return 0; + default: + printf("invalid option \n"); + printf("USAGE: ./test \n\t" + "[-t,--nthreads ] \n\t " + "[-g,--ngpus ] \n\t " + "[-b,--minbytes ] \n\t " + "[-e,--maxbytes ] \n\t " + "[-i,--stepbytes ] \n\t " + "[-f,--stepfactor ] \n\t " + "[-n,--iters ] \n\t " + "[-w,--warmup_iters ] \n\t" + "[-s,--swap_args <0/1>] \n\t " + "[-p,--parallel_init <0/1>] \n\t " + "[-c,--check <0/1>] \n\t " + "[-o,--op ] \n\t " + "[-d,--datatype ] \n\t " + "[-r,--root ] \n\t " + "[-z,--blocking <0/1>] \n\t " + "[-h,--help]\n"); + return 0; + } + } + + // Make sure everyline is flushed so that we see the progress of the test + setlinebuf(stdout); + +#ifdef MPI_SUPPORT + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &nProcs); + MPI_Comm_rank(MPI_COMM_WORLD, &proc); + uint64_t hostHashs[nProcs]; + hostHashs[proc] = getHostHash(hostname); + MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD); + for (int p=0; p 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); + if (swap_args) printf("Swap 
Comms Enabled: swapping communicators among threads for each iteration \n"); + if (blocking_coll) printf("Blocking Enabled: wait for completion and barrier after each collective \n"); + if (parallel_init) printf("Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); + } + + ncclUniqueId ncclId; + if (proc == 0) { + NCCLCHECK(ncclGetUniqueId(&ncclId)); + } +#ifdef MPI_SUPPORT + MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); +#endif + cudaStream_t streams[nGpus*nThreads]; + void* sendbuffs[nGpus*nThreads]; + void* recvbuffs[nGpus*nThreads]; + void* expected[nGpus*nThreads]; + void* expectedHost[nGpus*nThreads]; + void *procSharedHost, *procShared; + size_t sendBytes, recvBytes, paramBytes, procSharedBytes, sendInplaceOffset, recvInplaceOffset; + int sameExpected; + + getCollByteCount(&sendBytes, &recvBytes, ¶mBytes, &sendInplaceOffset, &recvInplaceOffset, &procSharedBytes, &sameExpected, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); + + for (int i=0; i 0) { + procSharedHost = malloc(procSharedBytes); + CUDACHECK(cudaHostRegister(procSharedHost, procSharedBytes, cudaHostRegisterPortable | cudaHostRegisterMapped)); + CUDACHECK(cudaHostGetDevicePointer(&procShared, procSharedHost, 0)); + } + + //if parallel init is not selected, use main thread to initialize NCCL + ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus); + if (!parallel_init) { + if (nProcs == 1) { + int gpuArray[nGpus*nThreads]; + for (int i=0; i=0; t--) { + args[t].proc_args = (void *)args; + args[t].minbytes=minBytes; + args[t].maxbytes=maxBytes; + args[t].stepbytes=stepBytes; + args[t].stepfactor=stepFactor; + args[t].localRank = localRank; + + args[t].nProcs=nProcs; + args[t].proc=proc; + args[t].nThreads=nThreads; + args[t].thread=t; + args[t].nGpus=nGpus; + args[t].sendbuffs = sendbuffs+t*nGpus; + args[t].recvbuffs = recvbuffs+t*nGpus; + args[t].ncclId = ncclId; + args[t].comms=comms+t*nGpus; + args[t].streams=streams+t*nGpus; + + 
args[t].expectedHost = expectedHost + t*nGpus; + args[t].expected = expected + t*nGpus; + args[t].procSharedHost = procSharedHost; + args[t].procShared = procShared; + args[t].barrier = (volatile int*)barrier; + args[t].barrier_idx = 0; + args[t].sync = (volatile int*)sync; + args[t].sync_idx = 0; + args[t].deltaThreads = delta; + args[t].deltaHost = (delta + t); + CUDACHECK(cudaHostRegister(args[t].deltaHost, sizeof(double), cudaHostRegisterPortable|cudaHostRegisterMapped)); + CUDACHECK(cudaHostGetDevicePointer(&args[t].delta, args[t].deltaHost, 0)); + args[t].errors=errors+t; + args[t].bw=bw+t; + args[t].bw_count=bw_count+t; + + if (!parallel_init) { + if (t) + pthread_create(threads+t, NULL, threadRunTests, args+t); + else + threadRunTests(args); + } else { + if (t || (parallel_init && (proc == 0))) + pthread_create(threads+t, NULL, threadInit, args+t); + else + threadInit(args); + } + } + + // Wait for other threads + for (int t=nThreads-1; t>=0; t--) { + if (t || (parallel_init && (proc == 0))) pthread_join(threads[t], NULL); + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; + } + +#ifdef MPI_SUPPORT + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); +#endif + + for(int i=0; i +#include +#include +#ifdef MPI_SUPPORT +#include "mpi.h" +#endif +#include +#include "nccl1_compat.h" + +#define CUDACHECK(cmd) do { \ + cudaError_t e = cmd; \ + if( e != cudaSuccess ) { \ + printf("Cuda failure %s:%d '%s'\n", \ + __FILE__,__LINE__,cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define NCCLCHECK(cmd) do { \ + ncclResult_t r = cmd; \ + if (r!= ncclSuccess) { \ + printf("NCCL failure %s:%d '%s'\n", \ + __FILE__,__LINE__,ncclGetErrorString(r)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +struct threadArgs_t { + void *proc_args; + size_t nbytes; + size_t minbytes; + size_t maxbytes; + size_t stepbytes; + size_t stepfactor; + + int nProcs; + int proc; + int nThreads; + int thread; + int nGpus; + 
int localRank; + void** sendbuffs; + size_t sendBytes; + size_t sendInplaceOffset; + void** recvbuffs; + size_t recvInplaceOffset; + ncclUniqueId ncclId; + ncclComm_t* comms; + cudaStream_t* streams; + + void** expectedHost; + void** expected; + size_t expectedBytes; + void* procSharedHost; + void* procShared; + volatile int* sync; + int sync_idx; + volatile int* barrier; + int barrier_idx; + int syncRank; + int syncNranks; + double* deltaThreads; + double* deltaHost; + double* delta; + int* errors; + double* bw; + int* bw_count; +}; + +#include + +// Provided by common.cu +extern void Barrier(struct threadArgs_t* args); +extern void TimeTest(struct threadArgs_t* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root, int inPlace); +extern void Randomize(void* ptr, size_t count, ncclDataType_t type, int seed); +extern void Accumulate(void* out, void* in, size_t n, ncclDataType_t type, ncclRedOp_t op); +extern void CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax); +extern double DeltaMaxValue(ncclDataType_t type); + +// Provided by each coll +void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName); +extern void GetBw(size_t count, int typeSize, double sec, double* algBw, double* busBw, int nranks); +extern void RunColl(void* sendbuf, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); +extern void InitData(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int in_place, int is_first); +extern double CheckData(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op); +extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); +extern void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first); +extern 
void getCollByteCount(size_t *sendbytes, size_t *recvbytes, size_t *parambytes, size_t *sendInlineOffset, size_t *recvInlineOffset, size_t *procSharedBytes, int *sameexpected, size_t nbytes, int nranks); +extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root); +extern void print_header(); + +#include + +static void getHostName(char* hostname, int maxlen) { + gethostname(hostname, maxlen); + for (int i=0; i< maxlen; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + return; + } + } +} + +#include + +static uint64_t getHostHash(const char* string) { + // Based on DJB2, result = result * 33 + char + uint64_t result = 5381; + for (int c = 0; string[c] != '\0'; c++){ + result = ((result << 5) + result) + string[c]; + } + return result; +} + +static size_t wordSize(ncclDataType_t type) { + switch(type) { + case ncclChar: +#if NCCL_MAJOR >= 2 + //case ncclInt8: + case ncclUint8: +#endif + return 1; + case ncclHalf: + //case ncclFloat16: + return 2; + case ncclInt: + case ncclFloat: +#if NCCL_MAJOR >= 2 + //case ncclInt32: + case ncclUint32: + //case ncclFloat32: +#endif + return 4; + case ncclInt64: + case ncclUint64: + case ncclDouble: + //case ncclFloat64: + return 8; + default: return 0; + } +} + +extern ncclDataType_t test_types[ncclNumTypes]; +extern const char *test_typenames[ncclNumTypes]; +extern ncclRedOp_t test_ops[ncclNumOps]; +extern const char *test_opnames[ncclNumOps]; + +extern thread_local int is_main_thread; +#define PRINT if (is_main_thread) printf + + diff --git a/src/nccl1_compat.h b/src/nccl1_compat.h new file mode 100644 index 0000000000..4279789af6 --- /dev/null +++ b/src/nccl1_compat.h @@ -0,0 +1,47 @@ +/************************************************************************* + * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENCE.txt for license information + ************************************************************************/ + +#ifndef NCCL1_COMPAT_H +#define NCCL1_COMPAT_H + +#ifndef NCCL_MAJOR // NCCL 1.x +#define ncclNumOps nccl_NUM_OPS +#define ncclNumTypes nccl_NUM_TYPES + +static ncclResult_t ncclGroupStart() { return ncclSuccess; } +static ncclResult_t ncclGroupEnd() { return ncclSuccess; } + +#define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument; + +static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); +} +static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); +} +static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(count); + return ncclBcast(buff, (int)count, datatype, root, comm, stream); +} +static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream) { + CHECKCOUNT(recvcount); + return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); +} +static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + CHECKCOUNT(sendcount); + return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); +} +#endif + +#endif diff --git a/src/reduce.cu b/src/reduce.cu new file mode 100644 index 0000000000..0bc9a7db83 --- /dev/null +++ b/src/reduce.cu @@ -0,0 
+1,159 @@ +/************************************************************************* + * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved. + * + * See LICENCE.txt for license information + ************************************************************************/ + +#include +#include "cuda_runtime.h" +#include "common.h" + +void print_header() { + PRINT("# %10s %12s %6s %6s out-of-place in-place\n", "", "", "", ""); + PRINT("# %10s %12s %6s %6s %6s %7s %5s %5s %7s %7s %5s %5s %7s\n", "bytes", "N", "type", "op", "root", + "time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res"); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %6s %6s %6i", size, count, typeName, opName, root); +} + +void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sameExpected = 0; + *procSharedCount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; + } + +void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) { + size_t count = args->expectedBytes / wordSize(type); + int root_gpu = root%args->nGpus; + + assert(args->expectedBytes == args->nbytes); + + while (args->sync[args->sync_idx] != args->thread) pthread_yield(); + + for (int i=0; inGpus; i++) { + int device; + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; + + if (is_first && i == 0) { + CUDACHECK(cudaMemcpy(args->procSharedHost, data, count*wordSize(type), cudaMemcpyDeviceToHost)); + } else { + Accumulate(args->procShared, data, count, type, op); + } + + if (in_place == 0) { + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + } + CUDACHECK(cudaDeviceSynchronize()); + } + + args->sync[args->sync_idx] = args->thread + 1; + + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + int root_proc = root/(args->nThreads*args->nGpus); + if (args->expectedBytes) { + // Last thread does the MPI reduction + if (root_proc == args->proc) { + void* temp, *tempHost = malloc(args->expectedBytes); + CUDACHECK(cudaHostRegister(tempHost, args->expectedBytes, 0)); + CUDACHECK(cudaHostGetDevicePointer(&temp, tempHost, 0)); + + for (int i=0; inProcs; i++) { + if (i == args->proc) continue; + MPI_Recv(tempHost, args->expectedBytes, MPI_BYTE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + + Accumulate(args->procShared, temp, count, type, op); + CUDACHECK(cudaDeviceSynchronize()); + } + + CUDACHECK(cudaHostUnregister(tempHost)); + free(tempHost); + } else { + MPI_Send(args->procSharedHost, args->expectedBytes, MPI_BYTE, root_proc, 0, MPI_COMM_WORLD); + } + } +#endif + args->sync[args->sync_idx] = 0; + } else { + while (args->sync[args->sync_idx]) pthread_yield(); + } + + //if root fill expected bytes with reduced data + // else if in_place, leave fill it with original data, else set to zero + for (int i=0; inGpus; i++) { + int rank = (args->proc*args->nThreads + args->thread)*args->nGpus + i; + if (rank == root) { + memcpy(args->expectedHost[root_gpu], args->procSharedHost, args->expectedBytes); + } else { + if (in_place == 1) { + CUDACHECK(cudaMemcpy(args->expectedHost[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDeviceToHost)); + } else { + memset(args->expectedHost[i], 0, args->expectedBytes); + } + } + } + + args->sync_idx = !args->sync_idx; +} + +void 
GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + *algBw = baseBw; + *busBw = baseBw; +} + +void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream)); +} + + +void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + int begin_root, end_root; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = ncclNumTypes; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = ncclNumOps; + run_ops = test_ops; + run_opnames = test_opnames; + } + + if (root != -1) { + begin_root = end_root = root; + } else { + begin_root = 0; + end_root = args->nProcs*args->nThreads*args->nGpus-1; + } + + for (int i=0; iexpectedBytes; + size_t recvcount = args->expectedBytes / wordSize(type); + size_t sendbytes = args->sendBytes; + size_t sendcount = args->sendBytes / wordSize(type); + + while (args->sync[args->sync_idx] != args->thread) pthread_yield(); + + for (int i=0; inGpus; i++) { + int device; + NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); + CUDACHECK(cudaSetDevice(device)); + void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; + + if (is_first && i == 0) { + CUDACHECK(cudaMemcpy(args->procSharedHost, data, sendbytes, cudaMemcpyDeviceToHost)); + } else { + Accumulate(args->procShared, data, sendcount, type, op); + } + + CUDACHECK(cudaDeviceSynchronize()); + if (in_place == 0) { + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, recvbytes)); + } + CUDACHECK(cudaDeviceSynchronize()); + } + + args->sync[args->sync_idx] = args->thread + 1; + + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + if (sendbytes > 0) { + // Last thread does the MPI reduction + void* remote, *remoteHost = malloc(sendbytes); + void* myInitialData = malloc(sendbytes); + memcpy(myInitialData, args->procSharedHost, sendbytes); + CUDACHECK(cudaHostRegister(remoteHost, sendbytes, 0)); + CUDACHECK(cudaHostGetDevicePointer(&remote, remoteHost, 0)); + + for (int i=0; inProcs; i++) { + if (i == args->proc) { + MPI_Bcast(myInitialData, sendbytes, MPI_BYTE, i, MPI_COMM_WORLD); + free(myInitialData); + } else { + MPI_Bcast(remoteHost, sendbytes, MPI_BYTE, i, MPI_COMM_WORLD); + Accumulate(args->procShared, remote, sendcount, type, op); + cudaDeviceSynchronize(); + } + } + CUDACHECK(cudaHostUnregister(remoteHost)); + free(remoteHost); + } +#endif + args->sync[args->sync_idx] = 0; + } else { + while (args->sync[args->sync_idx]) pthread_yield(); + } + + for (int i=0; inGpus; i++) { + int offset = ((args->proc*args->nThreads + args->thread)*args->nGpus + i)*recvbytes; + memcpy(args->expectedHost[i], (void *)((uintptr_t)args->procSharedHost + offset), recvbytes); + } + + args->sync_idx = !args->sync_idx; +} + +void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = 1; + *busBw = baseBw * factor; +} + +void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t 
stream) { + NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream)); +} + +void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = ncclNumTypes; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + run_ops = &op; + run_opnames = &opName; + op_count = 1; + } else { + op_count = sizeof(test_ops)/sizeof(test_ops[0]); + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i Date: Tue, 8 Aug 2017 16:25:07 -0700 Subject: [PATCH 002/233] Improve Readme --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d70bb1f54c..1532a658fb 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,10 @@ Run with MPI on 40 processes (potentially on multiple nodes) with 4 GPUs each, d $ mpirun -np 40 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4 -c 0 ``` -All tests support the same arguments : +### Arguments + +All tests support the same set of arguments : + * Number of GPUs * `-t,--nthreads ` number of threads per process. Default : 1. * `-g,--ngpus ` number of gpus per process. Default : 1. From a15599f5cfc6043e3514800c92ac9e55b8dec835 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Tue, 8 Aug 2017 16:28:46 -0700 Subject: [PATCH 003/233] Improve Readme --- README.md | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 1532a658fb..92b122c2f2 100644 --- a/README.md +++ b/README.md @@ -40,23 +40,24 @@ All tests support the same set of arguments : * Number of GPUs * `-t,--nthreads ` number of threads per process. Default : 1. - * `-g,--ngpus ` number of gpus per process. 
Default : 1. + * `-g,--ngpus ` number of gpus per thread. Default : 1. * Sizes to scan * `-b,--minbytes ` minimum size to start with. Default : 32M. * `-e,--maxbytes ` maximum size to end at. Default : 32M. * Increments can be either fixes of a multiplication factor. Only one of those should be used - * `-i,--stepbytes ` fixed increment between sizes. Default : (max-min)/10. - * `-f,--stepfactor ` multiplication factor between sizes. Default : disabled. + * `-i,--stepbytes ` fixed increment between sizes. Default : (max-min)/10. + * `-f,--stepfactor ` multiplication factor between sizes. Default : disabled. +* NCCL operations arguments + * `-o,--op ` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum. + * `-d,--datatype ` Specify which datatype to use. Default : Float. + * `-r,--root ` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0. * Performance * `-n,--iters ` number of iterations. Default : 20. * `-w,--warmup_iters ` number of warmup iterations (not timed). Default : 5. -* `-s,--swap_args <0/1>` when used with multiple threads, have threads manage different GPUs for each iteration. Default : 0. -* `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. -* `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1. -* NCCL operations arguments - * `-o,--op ` Specify which reduction operation to perform. Only relevant for reduction operations. Default : Sum. - * `-d,--datatype ` Specify which datatype to use. Default : Float. - * `-r,--root ` Specify which root to use. Only for operations with a root like broadcast or reduce. +* Test operation + * `-s,--swap_args <0/1>` when used with multiple threads, have threads manage different GPUs for each iteration. Default : 0. + * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0. 
+ * `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1. * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. ## Copyright From 9ec3e352769c1ec9900c59755fad98b61404f5a0 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Tue, 8 Aug 2017 16:29:25 -0700 Subject: [PATCH 004/233] Fix typo in Readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 92b122c2f2..10c255dae9 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ All tests support the same set of arguments : * Sizes to scan * `-b,--minbytes ` minimum size to start with. Default : 32M. * `-e,--maxbytes ` maximum size to end at. Default : 32M. - * Increments can be either fixes of a multiplication factor. Only one of those should be used + * Increments can be either fixed or a multiplication factor. Only one of those should be used * `-i,--stepbytes ` fixed increment between sizes. Default : (max-min)/10. * `-f,--stepfactor ` multiplication factor between sizes. Default : disabled. 
* NCCL operations arguments From 25016c8eebbf8200208bfce9ebfbc1ea2254e915 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Wed, 9 Aug 2017 10:41:31 -0700 Subject: [PATCH 005/233] Fix NCCL_HOME to be consistent with README --- src/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Makefile b/src/Makefile index 6188d01424..45d31d54b0 100644 --- a/src/Makefile +++ b/src/Makefile @@ -43,9 +43,9 @@ endif .PHONY: build clean BUILDDIR ?= ../build -ifneq ($(NCCLDIR), "") -NVCUFLAGS += -I$(NCCLDIR)/include/ -NVLDFLAGS += -L$(NCCLDIR)/lib +ifneq ($(NCCL_HOME), "") +NVCUFLAGS += -I$(NCCL_HOME)/include/ +NVLDFLAGS += -L$(NCCL_HOME)/lib endif ifeq ($(MPI), 1) From 925a70576e584e77bc930606c59595e9f66b71dd Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Thu, 21 Dec 2017 15:10:09 -0800 Subject: [PATCH 006/233] Print NCCL version at start --- src/common.cu | 1 + src/nccl1_compat.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/src/common.cu b/src/common.cu index a14c3aac01..f47e0f5da5 100644 --- a/src/common.cu +++ b/src/common.cu @@ -915,6 +915,7 @@ int main(int argc, char* argv[]) { NCCLCHECK(ncclGroupEnd()); } + PRINT("# NCCL Tests compiled with NCCL %d.%d\n", NCCL_MAJOR, NCCL_MINOR); PRINT("# Using devices\n"); for (int p=0; p Date: Mon, 29 Jan 2018 13:40:45 -0800 Subject: [PATCH 007/233] Added explanation about performance numbers --- README.md | 4 ++ doc/PERFORMANCE.md | 140 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+) create mode 100644 doc/PERFORMANCE.md diff --git a/README.md b/README.md index 10c255dae9..d036c69644 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,10 @@ Run with MPI on 40 processes (potentially on multiple nodes) with 4 GPUs each, d $ mpirun -np 40 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4 -c 0 ``` +### Performance + +See the [Performance](doc/PERFORMANCE.md) page for an explanation of the numbers, and in particular the "busbw" column. 
+ ### Arguments All tests support the same set of arguments : diff --git a/doc/PERFORMANCE.md b/doc/PERFORMANCE.md new file mode 100644 index 0000000000..bc01e57b5f --- /dev/null +++ b/doc/PERFORMANCE.md @@ -0,0 +1,140 @@ +# Performance reported by NCCL tests + +NCCL tests report the average operation time in ms, and two bandwidths in GB/s : algorithm bandwidth and bus bandwidth. This page explains what those numbers mean and what you should expect depending on the hardware used. + +# Time + +Time is useful with small sizes, to measure the constant overhead (or latency) associated with operations. + +On large sizes, the time becomes linear with the size (since it is roughly equal to overhead + size / bw) and is no longer measuring only the latency but +also the size divided by the bandwidth. + +Therefore, on large sizes, it makes more sense to look at the bandwidth. + +# Bandwidth + +## Algorithm bandwidth + +Algorithm bandwidth uses the most commonly used formula for bandwidth : size (_S_) / time (_t_). It is useful to compute how much time any large operation would take by simply dividing the size of the operation by the algorithm bandwidth. + +`algbw = S/t` + +## Bus bandwidth + +While the algorithm bandwidth makes sense for point-to-point operations like Send/Receive, it is not always helpful to measure collective operations speed, since the theoretical peak algorithm bandwidth is not equal to the hardware peak bandwidth, usually depending on the number of ranks. +Most benchmarks only provide time measurements, which are hard to interpret for large sizes. Some others also provide algorithm bandwidth, but that bandwidth varies with the number of ranks (and decreases as the number of ranks increases). + +To provide a number which reflects how optimally the hardware is used, NCCL tests introduce the notion of "Bus Bandwidth" ("busbw" column in the tests output). 
+This number is obtained applying a formula to the algorithm bandwidth to reflect the speed of the inter-GPU communication. +Using this bus bandwidth, we can compare it with the hardware peak bandwidth, independently of the number of ranks used. + +The formula depends on the collective operation. + +### AllReduce + +An allreduce operation, for each element of the N arrays (input i_X and output o_X, each situated on rank X), is performing the following operation : + +`o_0 = o_1 = o_2 = ... = o_{n-1} = i_0 + i_1 + i_2 + ... + i_{n-1}` + +**Note : this is independent of the algorithm used (ring, tree, or other) as long as they use point-to-point operations (send/receive).** + +A ring would do that in an order which follows the ring : + +`i_0 + i_1 + ... + i_{n-1} -> o_{n-1} -> o_0 -> o_1 -> .. -> o_{n-2}` + +A tree would do this hierchically : + +`(((((i_{n-1} + i_{n-2}) + (i_{n-3} + i_{n-4})) + ... + (i_1 + i_0))))) -> o_0 -> (o_{n/2} -> (o_{3n/4} ...))` + +In all cases, we need n-1 additions and n assignations for each element. Since every step is on a different rank except potentially one (the last input and the first output), +we need 2(n-1) data transfers (x number of elements) to perform an allReduce operation. + +Considering that each rank has a bandwidth to the outside world of _B_, the time to perform an allReduce operation of _S_ elements is at best : + + `t = (S*2*(n-1)) / (n*B)` + +Indeed, we have _S_ elements, 2*(n-1) operations per element, and _n_ links of bandwidth _B_ to perform them. +Reordering the elements, we find that + + `t = (S/B) * (2*(n-1)/n)` + +Therefore, to get an AllReduce bandwidth measurement which we can compare to the hardware peak bandwidth, we compute : + + `B = S/t * (2*(n-1)/n) = algbw * (2*(n-1)/n)` + +### ReduceScatter + +The ReduceScatter operation requires only to perform the addition part of the allReduce operation : + + `o_K = i_0 + i_1 + i_2 + ... 
+ i_{n-1}` + +With K being the rank which is getting the final result (K=offset/recvsize). + +The perfect reduceScatter time with a rank bandwidth of B would therefore be : + + `t = S*(n-1) / (B*n)` + +And the Bus Bandwidth is therefore computed as : + + `B = S/t * (n-1)/n = algbw * (n-1)/n` + +### AllGather + +The AllGather operation requires only to perform the assignation part of the allReduce operation : + + `o_0 = o_1 = o_2 = ... = o_{n-1} = i_K` + +With K being the rank where the data originates from (K=offset/sendsize). + +The perfect allGather time with a rank bandwidth of B would therefore be : + + `t = S*(n-1) / (B*n)` + +And the Bus Bandwidth is therefore computed as : + + `B = S/t * (n-1)/n = algbw * (n-1)/n` + +### Broadcast + +The broadcast operation representation is similar to allGather : + + `o_0 = o_1 = o_2 = ... = o_{n-1} = i_R` + +R being the root of the operation. + +However, in this case, since the i_R input is not evenly distributed on the ranks, we cannot use all N links to perform the transfer operations. +Indeed, *all* data has to get out of the root rank, hence the bottleneck is on the root rank which only has B as capacity to get data out : + + `t = S/B` + +And : + + `B = S/t` + +### Reduce + +The reduce operation performs : + + `o_R = i_0 + i_1 + i_2 + ... + i_{n-1}` + +R being the root of the operation. + +Similarly to broadcast, all data need to be sent to the root, hence : + + `t = S/B` + +And : + + `B = S/t` + +### Summary + +To obtain a bus bandwidth which should be independent of the number of ranks _n_, we apply a correction factor to the algorithm bandwidth : + +* AllReduce : 2*(_n_-1)/_n_ +* ReduceScatter : (_n_-1)/_n_ +* AllGather : (_n_-1)/_n_ +* Broadcast : 1 +* Reduce : 1 + +The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network.
From db39a88f8a88730e1d5ca428ee764486d87a5805 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Tue, 30 Jan 2018 09:14:49 -0800 Subject: [PATCH 008/233] Fix link to performance page --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d036c69644..0fd7a24bc3 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ $ mpirun -np 40 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4 -c 0 ### Performance -See the [doc/PERFORMANCE.md](Performance) page for explanation about numbers, and in particular the "busbw" column. +See the [Performance](doc/PERFORMANCE.md) page for explanation about numbers, and in particular the "busbw" column. ### Arguments From e00cb1f1c429eb524f2e0903f986b46fe0d15e1f Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Tue, 30 Jan 2018 09:15:58 -0800 Subject: [PATCH 009/233] Typos/Clarifications --- doc/PERFORMANCE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/PERFORMANCE.md b/doc/PERFORMANCE.md index bc01e57b5f..b9afbb4ecb 100644 --- a/doc/PERFORMANCE.md +++ b/doc/PERFORMANCE.md @@ -38,11 +38,11 @@ An allreduce operation, for each element of the N arrays (input i_X and output o **Note : this is independent of the algorithm used (ring, tree, or other) as long as they use point-to-point operations (send/receive).** -A ring would do that in an order which follows the ring : +A ring would do that operation in an order which follows the ring : `i_0 + i_1 + ... + i_{n-1} -> o_{n-1} -> o_0 -> o_1 -> .. -> o_{n-2}` -A tree would do this hierchically : +A tree would do it hierarchically : `(((((i_{n-1} + i_{n-2}) + (i_{n-3} + i_{n-4})) + ... 
+ (i_1 + i_0))))) -> o_0 -> (o_{n/2} -> (o_{3n/4} ...))` From eb4c43ff3d37d656efdf2ed75ce49e7f73efa581 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Tue, 30 Jan 2018 09:17:29 -0800 Subject: [PATCH 010/233] Clarification --- doc/PERFORMANCE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/PERFORMANCE.md b/doc/PERFORMANCE.md index b9afbb4ecb..97419ecde9 100644 --- a/doc/PERFORMANCE.md +++ b/doc/PERFORMANCE.md @@ -54,7 +54,7 @@ Considering that each rank has a bandwidth to the outside world of _B_, the time `t = (S*2*(n-1)) / (n*B)` Indeed, we have _S_ elements, 2*(n-1) operations per element, and _n_ links of bandwidth _B_ to perform them. -Reordering the elements, we find that +Reordering the equation, we find that `t = (S/B) * (2*(n-1)/n)` From dcf818955fa6e279e03263c984e95384164c24ad Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Fri, 17 Aug 2018 14:58:44 -0700 Subject: [PATCH 011/233] Added a precision for AllGather and ReduceScatter sizes since NCCL uses the size per rank. --- doc/PERFORMANCE.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/PERFORMANCE.md b/doc/PERFORMANCE.md index 97419ecde9..7cc6ecee66 100644 --- a/doc/PERFORMANCE.md +++ b/doc/PERFORMANCE.md @@ -78,6 +78,8 @@ And the Bus Bandwidth is therefore computed as : `B = S/t * (n-1)/n = algbw * (n-1)/n` +Note that here, S is the size in bytes of the total array, which for NCCL is equal to `recvcount*sizeof(datatype)*n` as the `recvcount` argument is the count per rank. + ### AllGather The AllGather operation requires only to perform the assignation part of the allReduce operation : @@ -94,6 +96,8 @@ And the Bus Bandwidth is therefore computed as : `B = S/t * (n-1)/n = algbw * (n-1)/n` +Note that here, S is the size in bytes of the total array, which for NCCL is equal to `sendcount*sizeof(datatype)*n` as the `sendcount` argument is the count per rank. 
+ ### Broadcast The broadcast operation representation is similar to allGather : From cbe7f654001d4b4123d8b104c863d983fa746a02 Mon Sep 17 00:00:00 2001 From: David Addison Date: Wed, 6 Mar 2019 18:17:20 -0800 Subject: [PATCH 012/233] Resync all tests with test code from NCCL 2.4 Major rework to merge most of the changes from the NCCL internal tests into the public ones Added "-m " operation aggregation option. Data integrity checking is now much more performant at scale. Startup times at scale are improved. Test latency units are now displayed in usec. --- README.md | 12 +- src/Makefile | 24 +- src/all_gather.cu | 117 ++--- src/all_reduce.cu | 146 +++--- src/broadcast.cu | 145 +++--- src/common.cu | 1148 ++++++++++++++++++----------------------- src/common.h | 130 ++++- src/nccl1_compat.h | 4 +- src/reduce.cu | 180 +++---- src/reduce_scatter.cu | 140 +++-- 10 files changed, 949 insertions(+), 1097 deletions(-) diff --git a/README.md b/README.md index 0fd7a24bc3..7a4bbbc6ca 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # NCCL Tests -These tests check both the performance and the correctness of NCCL operations. They can be compiled against [NCCL 1](http://github.com/nvidia/nccl) and [NCCL 2](http://developer.nvidia.com/nccl). +These tests check both the performance and the correctness of NCCL operations. They can be compiled against [NCCL](http://github.com/nvidia/nccl) ## Build @@ -20,7 +20,7 @@ $ make MPI=1 MPI_HOME=/path/to/mpi CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nc ## Usage -NCCL tests can run on multiple processes, multiple threads, and multiple CUDA devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=CUDA devices) will be equal to (number of processes)\*(number of threads)\*(number of gpus per thread). +NCCL tests can run on multiple processes, multiple threads, and multiple CUDA devices per thread. 
The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=CUDA devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread). ### Quick examples @@ -44,7 +44,7 @@ All tests support the same set of arguments : * Number of GPUs * `-t,--nthreads ` number of threads per process. Default : 1. - * `-g,--ngpus ` number of gpus per thread. Default : 1. + * `-g,--ngpus ` number of gpus per thread. Default : 1. * Sizes to scan * `-b,--minbytes ` minimum size to start with. Default : 32M. * `-e,--maxbytes ` maximum size to end at. Default : 32M. @@ -55,16 +55,16 @@ All tests support the same set of arguments : * `-o,--op ` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum. * `-d,--datatype ` Specify which datatype to use. Default : Float. * `-r,--root ` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0. -* Performance +* Performance * `-n,--iters ` number of iterations. Default : 20. * `-w,--warmup_iters ` number of warmup iterations (not timed). Default : 5. + * `-m,--agg_iters ` number of operations to aggregate together in each iteration. Default : 1. * Test operation - * `-s,--swap_args <0/1>` when used with multiple threads, have threads manage different GPUs for each iteration. Default : 0. * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0. * `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1. * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. ## Copyright -NCCL tests are provided under the BSD licence. All source code and accompanying documentation is copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved. +NCCL tests are provided under the BSD license. 
All source code and accompanying documentation is copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. diff --git a/src/Makefile b/src/Makefile index 45d31d54b0..034cc672fa 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,7 +1,7 @@ # -# Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # -# See LICENCE.txt for license information +# See LICENSE.txt for license information # CUDA_HOME ?= /usr/local/cuda @@ -18,10 +18,10 @@ NVCC = $(CUDA_HOME)/bin/nvcc NVCC_GENCODE ?= -gencode=arch=compute_30,code=sm_30 \ -gencode=arch=compute_35,code=sm_35 \ -gencode=arch=compute_50,code=sm_50 \ - -gencode=arch=compute_52,code=sm_52 \ - -gencode=arch=compute_60,code=sm_60 \ + -gencode=arch=compute_60,code=sm_60 \ -gencode=arch=compute_61,code=sm_61 \ - -gencode=arch=compute_61,code=compute_61 + -gencode=arch=compute_70,code=compute_70 \ + -gencode=arch=compute_70,code=sm_70 NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 @@ -29,14 +29,16 @@ LDFLAGS := -L${CUDA_LIB} -lcudart -lrt NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt ifeq ($(DEBUG), 0) -NVCUFLAGS += -O3 -CXXFLAGS += -O3 +NVCUFLAGS += -O3 -g +CXXFLAGS += -O3 -g else NVCUFLAGS += -O0 -G -g CXXFLAGS += -O0 -g -ggdb3 endif -ifeq ($(VERBOSE), 0) +ifneq ($(VERBOSE), 0) +NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter +else .SILENT: endif @@ -45,7 +47,7 @@ endif BUILDDIR ?= ../build ifneq ($(NCCL_HOME), "") NVCUFLAGS += -I$(NCCL_HOME)/include/ -NVLDFLAGS += -L$(NCCL_HOME)/lib +NVLDFLAGS += -L$(NCCL_HOME)/lib endif ifeq ($(MPI), 1) @@ -53,7 +55,7 @@ NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include NVLDFLAGS += -L$(MPI_HOME)/lib -lmpi endif LIBRARIES += curand nccl nvToolsExt -NVLDFLAGS += $(LIBRARIES:%=-l%) +NVLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) @@ -66,7 +68,7 @@ build: ${BIN_FILES} clean: rm -rf ${DST_DIR} -${DST_DIR}/%.o: %.cu +${DST_DIR}/%.o: %.cu common.h @printf 
"Compiling %-35s > %s\n" $< $@ @mkdir -p ${DST_DIR} $(NVCC) -o $@ $(NVCUFLAGS) -c $< diff --git a/src/all_gather.cu b/src/all_gather.cu index 2386842cdd..cfb2ec356b 100644 --- a/src/all_gather.cu +++ b/src/all_gather.cu @@ -1,79 +1,53 @@ /************************************************************************* - * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * - * See LICENCE.txt for license information + * See LICENSE.txt for license information ************************************************************************/ #include "cuda_runtime.h" #include "common.h" - void print_header() { - PRINT("# %10s %12s %6s %6s out-of-place in-place\n", "", "", "", ""); - PRINT("# %10s %12s %6s %7s %5s %5s %7s %7s %5s %5s %7s\n", "bytes", "N", "type", - "time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res"); + PRINT("# %10s %12s %6s out-of-place in-place \n", "", "", ""); + PRINT("# %10s %12s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { PRINT("%12li %12li %6s", size, count, typeName); } -void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) { - *sendcount = count/nranks; - *recvcount = (count/nranks)*nranks; - *sameExpected = 1; - *procSharedCount = 0; - *sendInplaceOffset = count/nranks; - *recvInplaceOffset = 0; - *paramcount = *sendcount; +void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, 
size_t count, int nranks) { + *sendcount = count/nranks; + *recvcount = (count/nranks)*nranks; + *sendInplaceOffset = count/nranks; + *recvInplaceOffset = 0; + *paramcount = *sendcount; } -void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) { - size_t nBytes = args->nbytes; - size_t count = nBytes / wordSize(type); - int proc = args->proc; - int nThreads = args->nThreads; - int t = args->thread; - int nGpus = args->nGpus; +testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; - while (args->sync[args->sync_idx] != t) pthread_yield(); - - for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); - CUDACHECK(cudaSetDevice(device)); - - void* data = in_place ? (void *)((uintptr_t)args->recvbuffs[i] + args->sendInplaceOffset*rank) : args->sendbuffs[i]; - - CUDACHECK(cudaMemcpy((void *)((uintptr_t)args->expectedHost[0] + ((proc*nThreads + t)*nGpus + i)*nBytes), - data, - nBytes, cudaMemcpyDeviceToHost)); - - if (in_place == 0) { - CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? 
((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + for (int j=0; jexpected[i])+args->sendBytes*j, sendcount, type, rep, j)); } CUDACHECK(cudaDeviceSynchronize()); } - - args->sync[args->sync_idx] = t + 1; - - if (t+1 == nThreads) { -#ifdef MPI_SUPPORT - // Last thread does the MPI allgather - MPI_Allgather(MPI_IN_PLACE, nBytes*nThreads*nGpus, MPI_BYTE, - args->expectedHost[0], - nBytes*nThreads*nGpus, MPI_BYTE, MPI_COMM_WORLD); -#endif - args->sync[args->sync_idx] = 0; - } else { - while (args->sync[args->sync_idx]) pthread_yield(); - } - - args->sync_idx=!args->sync_idx; + return testSuccess; } -void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { +void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec; *algBw = baseBw; @@ -81,26 +55,49 @@ void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, *busBw = baseBw * factor; } -void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream)); + return testSuccess; } -void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { +struct testColl allGatherTest = { + "AllGather", + AllGatherGetCollByteCount, + AllGatherInitData, + AllGatherGetBw, + AllGatherRunColl +}; + +void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllGatherGetCollByteCount(sendcount, recvcount, ¶mcount, 
&sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allGatherTest; ncclDataType_t *run_types; const char **run_typenames; int type_count; - if ((int)type != -1) { + if ((int)type != -1) { type_count = 1; run_types = &type; run_typenames = &typeName; - } else { + } else { type_count = ncclNumTypes; run_types = test_types; run_typenames = test_typenames; } - for (int i=0; inbytes / wordSize(type); - - while (args->sync[args->sync_idx] != args->thread) pthread_yield(); - - for (int i=0; inGpus; i++) { - int device; - NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); - CUDACHECK(cudaSetDevice(device)); - void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; - - if (is_first && i == 0) { - CUDACHECK(cudaMemcpy(args->expected[0], data, count*wordSize(type), cudaMemcpyDeviceToHost)); - } else { - Accumulate(args->expected[0], data, count, type, op); - } - - if (in_place == 0) { - CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->nbytes)); - } - CUDACHECK(cudaDeviceSynchronize()); - } - - args->sync[args->sync_idx] = args->thread + 1; - - if (args->thread+1 == args->nThreads) { -#ifdef MPI_SUPPORT - // Last thread does the MPI reduction - if (args->nbytes > 0) { - void* remote, *remoteHost = malloc(args->nbytes); - void* myInitialData = malloc(args->nbytes); - memcpy(myInitialData, args->expectedHost[0], args->nbytes); - CUDACHECK(cudaHostRegister(remoteHost, args->nbytes, cudaHostRegisterPortable | cudaHostRegisterMapped)); - CUDACHECK(cudaHostGetDevicePointer(&remote, remoteHost, 0)); - for (int i=0; inProcs; i++) { - if (i == args->proc) { - MPI_Bcast(myInitialData, args->nbytes, MPI_BYTE, i, MPI_COMM_WORLD); - free(myInitialData); - } else { - MPI_Bcast(remoteHost, args->nbytes, MPI_BYTE, i, MPI_COMM_WORLD); - Accumulate(args->expected[0], remote, count, type, op); - 
cudaDeviceSynchronize(); - } - } - CUDACHECK(cudaHostUnregister(remoteHost)); - free(remoteHost); - } -#endif - args->sync[args->sync_idx] = 0; - } else { - while (args->sync[args->sync_idx]) pthread_yield(); - } - - args->sync_idx = !args->sync_idx; +void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; } -void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { +testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { double baseBw = (double)(count * typesize) / 1.0E9 / sec; *algBw = baseBw; @@ -91,40 +53,62 @@ void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, *busBw = baseBw * factor; } -void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); + return testSuccess; } +struct testColl allReduceTest = { + "AllReduce", + AllReduceGetCollByteCount, + AllReduceInitData, + AllReduceGetBw, + AllReduceRunColl +}; -void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { +void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + AllReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &allReduceTest; ncclDataType_t *run_types; ncclRedOp_t *run_ops; const char **run_typenames, **run_opnames; int type_count, op_count; - if ((int)type != -1) { + if ((int)type != -1) { type_count = 1; run_types = &type; run_typenames = &typeName; - } else { + } else { type_count = ncclNumTypes; run_types = 
test_types; run_typenames = test_typenames; } - if ((int)op != -1) { + if ((int)op != -1) { op_count = 1; run_ops = &op; run_opnames = &opName; - } else { + } else { op_count = ncclNumOps; run_ops = test_ops; run_opnames = test_opnames; } - for (int i=0; i void print_header() { - PRINT("# %10s %12s %6s %6s out-of-place\n", "", "", "", ""); - PRINT("# %10s %12s %6s %6s %7s %5s %5s %7s\n", "bytes", "N", "type", "root", - "time", "algbw", "busbw", "res"); + PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { PRINT("%12li %12li %6s %6i", size, count, typeName, root); } -void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) { - *sendcount = count; - *recvcount = count; - *sameExpected = 0; - *procSharedCount = count; - *sendInplaceOffset = 0; - *recvInplaceOffset = 0; - *paramcount = *sendcount; +void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; } -void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) { - int root_proc = root/(args->nThreads*args->nGpus); - int root_thread = (root/args->nGpus)%(args->nThreads); - int root_gpu = root%args->nGpus; - - 
assert(args->expectedBytes == args->nbytes); - - if (root_thread == args->thread) { - if (root_proc == args->proc) { - CUDACHECK(cudaMemcpy(args->procSharedHost, - args->sendbuffs[root_gpu], - args->nbytes, cudaMemcpyDeviceToHost)); - } -#ifdef MPI_SUPPORT - MPI_Bcast(args->procSharedHost, args->nbytes, MPI_BYTE, root_proc, MPI_COMM_WORLD); -#endif - - args->sync[0] = 0; - } - - Barrier(args); +testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); for (int i=0; inGpus; i++) { - int device; - NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); - CUDACHECK(cudaSetDevice(device)); - - //set expected buf to zero at root, copy over source data at others - if ((root_proc == args->proc) - && (root_thread == args->thread) - && (root_gpu == i)) { - memset(args->expectedHost[i], 0, args->nbytes); - } else { - memcpy(args->expectedHost[i], args->procSharedHost, args->nbytes); - } - - //reset recvbufs to zero - CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->nbytes)); - CUDACHECK(cudaDeviceSynchronize()); + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; + if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(args->expected[i], recvcount, type, rep, root)); + CUDACHECK(cudaDeviceSynchronize()); } - - Barrier(args); + return testSuccess; } -void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { +void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { double baseBw = (double)(count * typesize) / 1.0E9 / sec; *algBw = baseBw; @@ -80,42 +52,69 @@ void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, *busBw = baseBw * factor; } -void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - int rank; +testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + int rank; NCCLCHECK(ncclCommUserRank(comm, &rank)); - if (rank == root) { +#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2 + NCCLCHECK(ncclBroadcast(sendbuff, recvbuff, count, type, root, comm, stream)); +#else + if (rank == root) { NCCLCHECK(ncclBcast(sendbuff, count, type, root, comm, stream)); - } else { + } else { NCCLCHECK(ncclBcast(recvbuff, count, type, root, comm, stream)); - } + } +#endif + return testSuccess; } -void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { +struct testColl broadcastTest = { + "Broadcast", + BroadcastGetCollByteCount, + BroadcastInitData, + BroadcastGetBw, + BroadcastRunColl +}; + +void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + BroadcastGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t BroadcastRunTest(struct 
threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &broadcastTest; ncclDataType_t *run_types; const char **run_typenames; int type_count; - int begin_root, end_root; + int begin_root, end_root; - if ((int)type != -1) { + if ((int)type != -1) { type_count = 1; run_types = &type; run_typenames = &typeName; - } else { + } else { type_count = ncclNumTypes; run_types = test_types; run_typenames = test_typenames; } - if (root != -1) { - begin_root = end_root = root; - } else { - begin_root = 0; - end_root = args->nProcs*args->nThreads*args->nGpus-1; + if (root != -1) { + begin_root = end_root = root; + } else { + begin_root = 0; + end_root = args->nProcs*args->nThreads*args->nGpus-1; } - for (int i=0; i #include #include +#include #include "cuda.h" #if NCCL_MAJOR >= 2 @@ -22,13 +23,20 @@ const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min"}; thread_local int is_main_thread = 0; +// Command line parameter defaults +static int nThreads = 1; +static int nGpus = 1; +static size_t minBytes = 32*1024*1024; +static size_t maxBytes = 32*1024*1024; +static size_t stepBytes = 1*1024*1024; +static size_t stepFactor = 1; static int datacheck = 1; static int warmup_iters = 5; static int iters = 20; +static int agg_iters = 1; static int ncclop = ncclSum; static int nccltype = ncclFloat; static int ncclroot = 0; -static int swap_args = 0; static int parallel_init = 0; static int blocking_coll = 0; @@ -83,12 +91,11 @@ template __device__ float toFloat(T a) { return (float)a; } -template<> __device__ +template<> __device__ float toFloat(half a) { return __half2float(a); } - template __global__ void deltaKern(void* A_, void* B_, size_t count, double* max) { const T* A = (const T*)A_; @@ -102,7 +109,7 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) { if( delta > locmax ) { locmax = delta; #ifdef DEBUG_PRINT - if (delta > .1) printf("Error at %d/%d : %f != %f\n", i, count, 
toFloat(A[i]), toFloat(B[i])); + if (delta > .1) printf("Error at %d/%ld : %f != %f\n", i, count, toFloat(A[i]), toFloat(B[i])); #endif } } @@ -119,7 +126,7 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) { } -void CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) { +testResult_t CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) { switch (type) { case ncclHalf: deltaKern<<<1, 512>>>(results, expected, count, devmax); break; @@ -142,223 +149,112 @@ void CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type case ncclUint64: deltaKern<<<1, 512>>>(results, expected, count, devmax); break; } -} - -#define CURAND_CHK(cmd) \ - do { \ - curandStatus_t error = (cmd); \ - if (error != CURAND_STATUS_SUCCESS) { \ - printf("CuRAND error %i at %s:%i\n", error, __FILE__ , __LINE__); \ - exit(EXIT_FAILURE); \ - } \ - } while (false) - - -template -void GenerateRandom(curandGenerator_t generator, T * const dest, - const size_t N); - -template<> -void GenerateRandom(curandGenerator_t generator, int8_t * const dest, - const size_t N) { - size_t align = (4 - (((size_t)dest) & 3)) % 4; - CURAND_CHK(curandGenerate(generator, (unsigned int*)(dest+align), - N * sizeof(int8_t) / sizeof(int))); - CUDACHECK(cudaMemcpy(dest, dest+4, align, cudaMemcpyDeviceToDevice)); -} -template<> -void GenerateRandom(curandGenerator_t generator, uint8_t * const dest, - const size_t N) { - size_t align = (4 - (((size_t)dest) & 3)) % 4; - CURAND_CHK(curandGenerate(generator, (unsigned int*)(dest+align), - N * sizeof(uint8_t) / sizeof(int))); - CUDACHECK(cudaMemcpy(dest, dest+4, align, cudaMemcpyDeviceToDevice)); -} - -template<> -void GenerateRandom(curandGenerator_t generator, int32_t * const dest, - const size_t N) { - CURAND_CHK(curandGenerate(generator, (unsigned int*)dest, N)); -} - -template<> -void GenerateRandom(curandGenerator_t generator, uint32_t * const dest, - const size_t 
N) { - CURAND_CHK(curandGenerate(generator, (unsigned int*)dest, N)); -} - -template<> -void GenerateRandom(curandGenerator_t generator, float * const dest, - const size_t N) { - CURAND_CHK(curandGenerateUniform(generator, dest, N)); -} - -template<> -void GenerateRandom(curandGenerator_t generator, double * const dest, - const size_t N) { - CURAND_CHK(curandGenerateUniformDouble(generator, dest, N)); -} - -template<> -void GenerateRandom(curandGenerator_t generator, uint64_t * const dest, - const size_t N) { - CURAND_CHK(curandGenerate(generator, (unsigned int *)dest, N*2)); -} - -template<> -void GenerateRandom(curandGenerator_t generator, int64_t * const dest, - const size_t N) { - CURAND_CHK(curandGenerate(generator, (unsigned int *)dest, N*2)); -} - -template -void RandomizeType(void* dest, const size_t N, const int randomSeed) { - T* ptr = (T*)dest; - curandGenerator_t gen; - CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32)); - CURAND_CHK(curandSetPseudoRandomGeneratorSeed(gen, randomSeed)); - GenerateRandom(gen, ptr, N); - CURAND_CHK(curandDestroyGenerator(gen)); CUDACHECK(cudaDeviceSynchronize()); + return testSuccess; } -__global__ void halve(const float * src, half* dest, size_t N) { - for(int tid = threadIdx.x + blockIdx.x*blockDim.x; - tid < N; tid += blockDim.x * gridDim.x) - dest[tid] = __float2half(src[tid]); +// For integer values, we use values between 0 and 255 +template +__device__ T testValue(const size_t offset, const int rep, const int rank) { + uint8_t v = (rep+rank+offset) % 256; + return (T)v; } -void RandomizeHalf(void* dest, const size_t N, const int randomSeed) { - half* ptr = (half*)dest; - curandGenerator_t gen; - CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32)); - CURAND_CHK(curandSetPseudoRandomGeneratorSeed(gen, randomSeed)); - - float* temp; - CUDACHECK(cudaMalloc(&temp, N*sizeof(float))); - GenerateRandom(gen, temp, N); - halve<<<128, 512>>>(temp, ptr, N); - CURAND_CHK(curandDestroyGenerator(gen)); 
- CUDACHECK(cudaFree(temp)); - CUDACHECK(cudaDeviceSynchronize()); +// For floating point datatype, we use values between 0 and 1 otherwise the +// Product operation will produce NaNs. +template<> +__device__ double testValue(const size_t offset, const int rep, const int rank) { + return 1.0/(1.0+(double)testValue(offset, rep, rank)); +} +template<> +__device__ float testValue(const size_t offset, const int rep, const int rank) { + return 1.0/(1.0+(float)testValue(offset, rep, rank)); +} +template<> +__device__ half testValue(const size_t offset, const int rep, const int rank) { + return __float2half(testValue(offset, rep, rank)); } -void Randomize(void* ptr, const size_t count, ncclDataType_t type, const int seed) { - switch (type) { - case ncclChar: RandomizeType (ptr, count, seed); break; -#if NCCL_MAJOR >= 2 - case ncclUint8: RandomizeType (ptr, count, seed); break; -#endif - case ncclInt: RandomizeType (ptr, count, seed); break; -#if NCCL_MAJOR >= 2 - case ncclUint32: RandomizeType(ptr, count, seed); break; -#endif - case ncclInt64: RandomizeType (ptr, count, seed); break; - case ncclUint64: RandomizeType(ptr, count, seed); break; - case ncclHalf: RandomizeHalf (ptr, count, seed); break; - case ncclFloat: RandomizeType (ptr, count, seed); break; - case ncclDouble: RandomizeType (ptr, count, seed); break; - } -} +// Operations +template +__device__ T ncclOpSum(T a, T b) { return a+b; } +template +__device__ T ncclOpProd(T a, T b) { return a*b; } +template +__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; } +template +__device__ T ncclOpMin(T a, T b) { return a __global__ static -void accumKern(T* acum, const T* contrib, size_t N) { - int tid = threadIdx.x + blockIdx.x*blockDim.x; - int offset = blockDim.x*gridDim.x; - for(int i=tid; i c) ? a : c; - } else if(OP == ncclMin) { - acum[i] = (a < c) ? 
a : c; +// Definitions for half +template<> +__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); } +template<> +__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); } +template<> +__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; } +template<> +__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? a : b; } + +template +__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { + for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o+offset, rep, 0); + for (int i=1; i(o+offset, rep, i)); } + data[o] = val; } } -template<> __global__ -void accumKern(half* acum, const half* contrib, size_t N) { - int tid = threadIdx.x + blockIdx.x*blockDim.x; - int offset = blockDim.x*gridDim.x; - for(int i=tid; i> +#define OPS(type) KERN(type, ncclOpSum), KERN(type, ncclOpProd), KERN(type, ncclOpMax), KERN(type, ncclOpMin) -template<> __global__ -void accumKern(half* acum, const half* contrib, size_t N) { - int tid = threadIdx.x + blockIdx.x*blockDim.x; - int offset = blockDim.x*gridDim.x; - for(int i=tid; i __global__ -void accumKern(half* acum, const half* contrib, size_t N) { - int tid = threadIdx.x + blockIdx.x*blockDim.x; - int offset = blockDim.x*gridDim.x; - for(int i=tid; ic) ? 
a : c ); - } -} - -template<> __global__ -void accumKern(half* acum, const half* contrib, size_t N) { - int tid = threadIdx.x + blockIdx.x*blockDim.x; - int offset = blockDim.x*gridDim.x; - for(int i=tid; i -void accVecType(void* out, void* in, size_t n, ncclRedOp_t op) { - switch(op) { - case ncclSum: accumKern <<<256,256>>>((T*)out, (T*)in, n); break; - case ncclProd: accumKern<<<256,256>>>((T*)out, (T*)in, n); break; - case ncclMax: accumKern <<<256,256>>>((T*)out, (T*)in, n); break; - case ncclMin: accumKern <<<256,256>>>((T*)out, (T*)in, n); break; - default: - printf("Unknown reduction operation.\n"); - exit(EXIT_FAILURE); - } +__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) { + for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o, rep, rank); } -void Accumulate(void* out, void* in, size_t n, ncclDataType_t type, ncclRedOp_t op) { - switch (type) { - case ncclChar: accVecType (out, in, n, op); break; -#if NCCL_MAJOR >= 2 - case ncclUint8: accVecType (out, in, n, op); break; -#endif - case ncclInt: accVecType (out, in, n, op); break; -#if NCCL_MAJOR >= 2 - case ncclUint32: accVecType (out, in, n, op); break; -#endif - case ncclInt64: accVecType (out, in, n, op); break; - case ncclUint64: accVecType (out, in, n, op); break; - case ncclHalf: accVecType (out, in, n, op); break; - case ncclFloat: accVecType (out, in, n, op); break; - case ncclDouble: accVecType (out, in, n, op); break; - default: - printf("Unknown reduction type.\n"); - exit(EXIT_FAILURE); - } +static void* const initDataKerns[ncclNumTypes] = { + (void*)InitDataKernel< int8_t>, + (void*)InitDataKernel< uint8_t>, + (void*)InitDataKernel< int32_t>, + (void*)InitDataKernel, + (void*)InitDataKernel< int64_t>, + (void*)InitDataKernel, + (void*)InitDataKernel< half>, + (void*)InitDataKernel< float>, + (void*)InitDataKernel< double> +}; + +template +testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) { + T* ptr = (T*)dest; + 
InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); + return testSuccess; } -void Barrier(struct threadArgs_t* args) +testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { + dim3 grid = { 32, 1, 1 }; + dim3 block = { 256, 1, 1 }; + void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; + CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); + return testSuccess; +} + +void Barrier(struct threadArgs* args) { while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); @@ -376,16 +272,7 @@ void Barrier(struct threadArgs_t* args) args->barrier_idx=!args->barrier_idx; } -void RandomizeAccumulate(void* data, void* accum, size_t count, ncclDataType_t type, ncclRedOp_t op, int seed, int rank) { - Randomize(data, count, type, seed); - if (rank == 0) { - CUDACHECK(cudaMemcpy(accum, data, count*wordSize(type), cudaMemcpyDeviceToHost)); - } else { - Accumulate(accum, data, count, type, op); - } -} - -double CheckData(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { size_t count = args->expectedBytes/wordSize(type); double maxDelta = 0.0; for (int i=0; inGpus; i++) { @@ -394,24 +281,25 @@ double CheckData(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); CUDACHECK(cudaSetDevice(device)); void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; - CheckDelta(data , args->expected[i], count, type, args->delta); - cudaDeviceSynchronize(); + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->delta)); maxDelta = std::max(*(args->deltaHost), maxDelta); #ifdef DEBUG_PRINT - if (rank == 0) { - int *temp = (int *)malloc(args->expectedBytes); + if (rank == 0) { + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); + cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); printf("\n Expected: "); - for(int j=0; jexpectedBytes/sizeof(int); j++) { - printf("%d:%d ", j, *((int *)args->expectedHost[0] + j)); + for(int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, expectedHost[j]); } printf("\n"); - cudaMemcpy(temp, data, args->expectedBytes, cudaMemcpyDeviceToHost); + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); printf("\n Actual: "); - for (int j=0; jexpectedBytes/sizeof(int); j++) { - printf("%d:%d ", j, *((int *)temp + j)); + for (int j=0; jexpectedBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); } printf("\n"); free(temp); @@ -420,173 +308,173 @@ double CheckData(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, } double nranks = args->nProcs*args->nThreads*args->nGpus; if (maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; - return maxDelta; + *delta = maxDelta; + return testSuccess; } -void InitSend(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int is_first) { - size_t count = args->sendBytes / wordSize(type); - static int rep = 1; - for (int i=0; inGpus; i++) { - int device; - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); - CUDACHECK(cudaSetDevice(device)); - void* data = in_place ? 
(void *)((uintptr_t)args->recvbuffs[i] + args->sendInplaceOffset*rank) : args->sendbuffs[i]; - int seed = rank+count+rep+in_place; - Randomize(data, count, type, seed); +testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { + cudaError_t cudaErr; + int remaining = ngpus; + int* done = (int*)malloc(sizeof(int)*ngpus); + memset(done, 0, sizeof(int)*ngpus); + while (remaining) { + int idle = 1; + for (int i=0; isendBytes); - cudaMemcpy(temp, data, args->sendBytes, cudaMemcpyDeviceToHost); - printf("\n Send Data at rank %d:", rank); - for (int i=0; isendBytes/sizeof(int); i++) { - printf("%d:%d ", i, *((int *)temp + i)); + cudaErr = cudaStreamQuery(streams[i]); + if (cudaErr == cudaSuccess) { + done[i] = 1; + remaining--; + idle = 0; + continue; + } + + if (cudaErr != cudaErrorNotReady) CUDACHECK(cudaErr); + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) + if (comms) { + ncclResult_t ncclAsyncErr; + NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); + if (ncclAsyncErr != ncclSuccess) { + // An asynchronous error happened. Stop the operation and destroy + // the communicator + for (int i=0; inbytes / wordSize(type); - if (swap_args) { - args = (struct threadArgs_t*)args->proc_args + (args->thread + thread_offset)%args->nThreads; - } + // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange + size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + size_t shift = (totalnbytes * iter) % args->maxbytes; + if (shift + totalnbytes > args->maxbytes) shift = 0; - if (args->nGpus == 1) { - int rank = args->proc*args->nThreads + args->thread; - RunColl((void*)(in_place ? ((void *)((uintptr_t)args->recvbuffs[0] + args->sendInplaceOffset*rank)) : args->sendbuffs[0]), - (void*)(in_place ? 
(void*)((uintptr_t)args->recvbuffs[0] + args->recvInplaceOffset*rank) : args->recvbuffs[0]), - count, type, op, root, args->comms[0], args->streams[0]); - } else { - NCCLCHECK(ncclGroupStart()); - for (int i = 0; i < args->nGpus; i++) { + if (args->nGpus > 1) NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < args->nGpus; i++) { #ifndef NCCL_MAJOR - int cudaDev; - NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); - CUDACHECK(cudaSetDevice(cudaDev)); + int cudaDev; + NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); #endif - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - RunColl((void*)(in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->sendInplaceOffset*rank)) : args->sendbuffs[i]), - (void*)(in_place ? (void*)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank) : args->recvbuffs[i]), - count, type, op, root, args->comms[i], args->streams[i]); - } - NCCLCHECK(ncclGroupEnd()); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + char* recvBuff = ((char*)args->recvbuffs[i]) + shift; + char* sendBuff = ((char*)args->sendbuffs[i]) + shift; + TESTCHECK(args->collTest->runColl( + (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff), + (void*)(in_place ? 
recvBuff + args->recvInplaceOffset*rank : recvBuff), + count, type, op, root, args->comms[i], args->streams[i])); } + if (args->nGpus > 1) NCCLCHECK(ncclGroupEnd()); - if (swap_args || blocking_coll) { - //if args have been swapped, complete op before returning - for (int i = 0; i < args->nGpus; ++i) { - cudaError_t err = cudaErrorNotReady; - while (err == cudaErrorNotReady) { - err = cudaStreamQuery(args->streams[i]); - pthread_yield(); - } - CUDACHECK(err); - } + if (blocking_coll) { + // Complete op before returning + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); } if (blocking_coll) Barrier(args); + return testSuccess; } -void completeColl(struct threadArgs_t* args) { - //it swap_args was enabled, op would have been completed immediately - if (swap_args || blocking_coll) return; +testResult_t completeColl(struct threadArgs* args) { + if (blocking_coll) return testSuccess; - for (int i = 0; i < args->nGpus; ++i) { - cudaError_t err = cudaErrorNotReady; - while (err == cudaErrorNotReady) { - err = cudaStreamQuery(args->streams[i]); - pthread_yield(); - } - CUDACHECK(err); - } + TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + return testSuccess; } -void BenchTime(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { +testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { size_t count = args->nbytes / wordSize(type); - + // Sync - startColl(args, type, op, root, in_place, 0); - completeColl(args); + TESTCHECK(startColl(args, type, op, root, in_place, 0)); + TESTCHECK(completeColl(args)); Barrier(args); // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); for (int iter = 0; iter < iters; iter++) { - startColl(args, type, op, root, in_place, iter); + if (agg_iters>1) NCCLCHECK(ncclGroupStart()); + for (int aiter = 0; aiter < agg_iters; aiter++) { + TESTCHECK(startColl(args, type, op, root, 
in_place, iter*agg_iters+aiter)); + } + if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); } - completeColl(args); + TESTCHECK(completeColl(args)); auto delta = std::chrono::high_resolution_clock::now() - start; double deltaSec = std::chrono::duration_cast>(delta).count(); - deltaSec = deltaSec/iters; + deltaSec = deltaSec/(iters*agg_iters); double algBw, busBw; - GetBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); Barrier(args); - if (datacheck) { - InitSend(args, type, op, root, in_place, args->thread == 0 ? 1 : 0); - InitRecvResult(args, type, op, root, in_place, args->thread == 0 ? 1 : 0); - cudaDeviceSynchronize(); - } - - //test validation in single itertion, should ideally be included into the multi-iteration run - startColl(args, type, op, root, in_place, 0); - completeColl(args); - double maxDelta = 0; -#ifdef CHECK - if (datacheck) { - maxDelta = CheckData(args, type, op, root, in_place); - } else { - maxDelta = -1.0; - } -#else - maxDelta = -1.0; -#endif + static __thread int rep = 0; + rep++; + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); - //aggregate delta from all threads and procs - Barrier(args); - if (args->thread == 0) { - for (int i=1; inThreads; i++) { + //test validation in single itertion, should ideally be included into the multi-iteration run + TESTCHECK(startColl(args, type, op, root, in_place, 0)); + TESTCHECK(completeColl(args)); + + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + + //aggregate delta from all threads and procs + Barrier(args); + if (args->thread == 0) { + for (int i=1; inThreads; i++) { maxDelta += args->deltaThreads[i]; - } + } #ifdef MPI_SUPPORT - MPI_Allreduce(MPI_IN_PLACE, &maxDelta, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); + 
MPI_Allreduce(MPI_IN_PLACE, &maxDelta, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); #endif + } + Barrier(args); } - Barrier(args); - if (datacheck) { - PRINT(" %7.3f %5.2f %5.2f %7.0le", deltaSec * 1.0E3, algBw, busBw, - maxDelta); + double timeUsec = deltaSec*1.0E6; + char timeStr[10]; + if (timeUsec > 10000.0) { + sprintf(timeStr, "%7.0f", timeUsec); + } else if (timeUsec > 100.0) { + sprintf(timeStr, "%7.1f", timeUsec); } else { - PRINT(" %7.3f %5.2f %5.2f \tN/A", deltaSec * 1.0E3, algBw, busBw); + sprintf(timeStr, "%7.2f", timeUsec); + } + if (datacheck) { + PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + } else { + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); } args->bw[0] += busBw; args->bw_count[0]++; + return testSuccess; } -void setupArgs(size_t size, ncclDataType_t type, struct threadArgs_t* args) { +void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { int nranks = args->nProcs*args->nGpus*args->nThreads; - size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset, procSharedCount; - int sameExpected; - + size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; + count = size / wordSize(type); - getCollByteCount(&sendCount, &recvCount, ¶mCount, &sendInplaceOffset, &recvInplaceOffset, &procSharedCount, &sameExpected, (size_t)count, (size_t)nranks); + args->collTest->getCollByteCount(&sendCount, &recvCount, ¶mCount, &sendInplaceOffset, &recvInplaceOffset, (size_t)count, (size_t)nranks); args->nbytes = paramCount * wordSize(type); args->sendBytes = sendCount * wordSize(type); @@ -595,260 +483,224 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs_t* args) { args->recvInplaceOffset = recvInplaceOffset * wordSize(type); } -void TimeTest(struct threadArgs_t* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root, int inPlace) { - // Warm-up +testResult_t TimeTest(struct threadArgs* args, 
ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { + // Warm-up for large size setupArgs(args->maxbytes, type, args); for (int iter = 0; iter < warmup_iters; iter++) { - startColl(args, type, op, root, 0, iter); + TESTCHECK(startColl(args, type, op, root, 0, iter)); } - completeColl(args); + TESTCHECK(completeColl(args)); + + // Warm-up for small size + setupArgs(args->minbytes, type, args); + for (int iter = 0; iter < warmup_iters; iter++) { + TESTCHECK(startColl(args, type, op, root, 0, iter)); + } + TESTCHECK(completeColl(args)); // Benchmark for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { setupArgs(size, type, args); print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); - BenchTime(args, type, op, root, 0); - if (inPlace) BenchTime(args, type, op, root, 1); + TESTCHECK(BenchTime(args, type, op, root, 0)); + TESTCHECK(BenchTime(args, type, op, root, 1)); PRINT("\n"); } + return testSuccess; } - -void* threadRunTests(void* args) { - struct threadArgs_t* targs = (struct threadArgs_t*)args; +testResult_t threadRunTests(struct threadArgs* args) { // Set device to the first of our GPUs. If we don't do that, some operations // will be done on the current GPU (by default : 0) and if the GPUs are in // exclusive mode those operations will fail. 
- int gpuid = targs->localRank*targs->nThreads*targs->nGpus + targs->thread*targs->nGpus; + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; CUDACHECK(cudaSetDevice(gpuid)); - - RunTest(targs, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]); - - return NULL; + TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); + return testSuccess; } -void* threadInit(void* args) { - struct threadArgs_t* targs = (struct threadArgs_t*)args; +testResult_t threadInit(struct threadArgs* args) { char hostname[1024]; getHostName(hostname, 1024); - int nranks = targs->nProcs*targs->nThreads*targs->nGpus; + int nranks = args->nProcs*args->nThreads*args->nGpus; //set main thread again - is_main_thread = (targs->proc == 0 && targs->thread == 0) ? 1 : 0; + is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0; NCCLCHECK(ncclGroupStart()); - for (int i=0; inGpus; i++) { - int rank = targs->proc*targs->nThreads*targs->nGpus + targs->thread*targs->nGpus + i; - int gpuid = targs->localRank*targs->nThreads*targs->nGpus + targs->thread*targs->nGpus + i; + for (int i=0; inGpus; i++) { + int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; CUDACHECK(cudaSetDevice(gpuid)); - NCCLCHECK(ncclCommInitRank(targs->comms+i, nranks, targs->ncclId, rank)); + NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); } NCCLCHECK(ncclGroupEnd()); - PRINT("# Using devices\n"); - for (int p=0; pnProcs; p++) { - if (p == targs->proc) { - for (int t=0; tnThreads; t++) { - if (t == targs->thread) { - for (int i=0; inGpus; i++) { - int cudaDev; - int rank; - cudaDeviceProp prop; - NCCLCHECK(ncclCommCuDevice(targs->comms[i], &cudaDev)); - NCCLCHECK(ncclCommUserRank(targs->comms[i], &rank)); - 
CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev)); - printf("# Rank %2d on %10s device %2d [0x%02x] %s\n", rank, hostname, cudaDev, - prop.pciBusID, prop.name); - fflush(stdout); - } - Barrier(targs); - fflush(stdout); - } - } - } + TESTCHECK(threadRunTests(args)); + + for (int i=0; inGpus; i++) { + NCCLCHECK(ncclCommDestroy(args->comms[i])); } + return testSuccess; +} - threadRunTests(args); - +void* threadLauncher(void* thread_) { + struct testThread* thread = (struct testThread*)thread_; + thread->ret = thread->func(&thread->args); return NULL; } - -void AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, void **expectedHost, size_t nbytes, int nranks, int sameExpected) { - static int is_first = 1; - static void *cached_ptr = NULL; - static void *cached_hostptr = NULL; - - CUDACHECK(cudaMalloc(sendbuff, sendBytes)); - //work around for inline reduce scatter where recv count is smaller that send count - CUDACHECK(cudaMalloc(recvbuff, (sendBytes > recvBytes) ? 
sendBytes : recvBytes)); - - if (is_first || !sameExpected) { - *expectedHost = malloc(recvBytes); - CUDACHECK(cudaHostRegister(*expectedHost, recvBytes, cudaHostRegisterPortable | cudaHostRegisterMapped)); - CUDACHECK(cudaHostGetDevicePointer(expected, *expectedHost, 0)); - cached_ptr = *expected; - cached_hostptr = *expectedHost; - is_first = 0; - } else { - *expected = cached_ptr; - *expectedHost = cached_hostptr; - } -} - -int ncclstringtotype(char *str) { - for (int t=0; tthread, NULL, threadLauncher, thread); + return testSuccess; } -int ncclstringtoop (char *str) { - for (int o=0; o] \n\t " - "[-g,--ngpus ] \n\t " - "[-b,--minbytes ] \n\t " - "[-e,--maxbytes ] \n\t " - "[-i,--stepbytes ] \n\t " - "[-f,--stepfactor ] \n\t " - "[-n,--iters ] \n\t " - "[-w,--warmup_iters ] \n\t" - "[-s,--swap_args <0/1>] \n\t " - "[-p,--parallel_init <0/1>] \n\t " - "[-c,--check <0/1>] \n\t " - "[-o,--op ] \n\t " - "[-d,--datatype ] \n\t " - "[-r,--root ] \n\t " - "[-z,--blocking <0/1>] \n\t " - "[-h,--help]\n"); - return 0; - default: - printf("invalid option \n"); - printf("USAGE: ./test \n\t" - "[-t,--nthreads ] \n\t " - "[-g,--ngpus ] \n\t " - "[-b,--minbytes ] \n\t " - "[-e,--maxbytes ] \n\t " - "[-i,--stepbytes ] \n\t " - "[-f,--stepfactor ] \n\t " - "[-n,--iters ] \n\t " - "[-w,--warmup_iters ] \n\t" - "[-s,--swap_args <0/1>] \n\t " - "[-p,--parallel_init <0/1>] \n\t " - "[-c,--check <0/1>] \n\t " - "[-o,--op ] \n\t " - "[-d,--datatype ] \n\t " - "[-r,--root ] \n\t " - "[-z,--blocking <0/1>] \n\t " - "[-h,--help]\n"); - return 0; - } - } - // Make sure everyline is flushed so that we see the progress of the test setlinebuf(stdout); + // Parse args + int longindex; + static struct option longopts[] = { + {"nthreads", required_argument, 0, 't'}, + {"ngpus", required_argument, 0, 'g'}, + {"minbytes", required_argument, 0, 'b'}, + {"maxbytes", required_argument, 0, 'e'}, + {"stepbytes", required_argument, 0, 'i'}, + {"stepfactor", required_argument, 0, 'f'}, + {"iters", 
required_argument, 0, 'n'}, + {"agg_iters", required_argument, 0, 'm'}, + {"warmup_iters", required_argument, 0, 'w'}, + {"parallel_init", required_argument, 0, 'p'}, + {"check", required_argument, 0, 'c'}, + {"op", required_argument, 0, 'o'}, + {"datatype", required_argument, 0, 'd'}, + {"root", required_argument, 0, 'r'}, + {"blocking", required_argument, 0, 'z'}, + {"help", no_argument, 0, 'h'} + }; + + while(1) { + int c; + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:h", longopts, &longindex); + + if (c == -1) + break; + + switch(c) { + case 't': + nThreads = strtol(optarg, NULL, 0); + break; + case 'g': + nGpus = strtol(optarg, NULL, 0); + break; + case 'b': + minBytes = (size_t)parsesize(optarg); + break; + case 'e': + maxBytes = (size_t)parsesize(optarg); + break; + case 'i': + stepBytes = strtol(optarg, NULL, 0); + break; + case 'f': + stepFactor = strtol(optarg, NULL, 0); + break; + case 'n': + iters = (int)strtol(optarg, NULL, 0); + break; + case 'm': +#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2 + agg_iters = (int)strtol(optarg, NULL, 0); +#else + printf("Option -m not supported before NCCL 2.2. 
Ignoring\n"); +#endif + break; + case 'w': + warmup_iters = (int)strtol(optarg, NULL, 0); + break; + case 'c': + datacheck = (int)strtol(optarg, NULL, 0); + break; + case 'p': + parallel_init = (int)strtol(optarg, NULL, 0); + break; + case 'o': + ncclop = ncclstringtoop(optarg); + break; + case 'd': + nccltype = ncclstringtotype(optarg); + break; + case 'r': + ncclroot = strtol(optarg, NULL, 0); + break; + case 'z': + blocking_coll = strtol(optarg, NULL, 0); + break; + case 'h': + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" + "[-o,--op ] \n\t" + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; + default: + printf("invalid option \n"); + printf("USAGE: %s \n\t" + "[-t,--nthreads ] \n\t" + "[-g,--ngpus ] \n\t" + "[-b,--minbytes ] \n\t" + "[-e,--maxbytes ] \n\t" + "[-i,--stepbytes ] \n\t" + "[-f,--stepfactor ] \n\t" + "[-n,--iters ] \n\t" + "[-m,--agg_iters ] \n\t" + "[-w,--warmup_iters ] \n\t" + "[-p,--parallel_init <0/1>] \n\t" + "[-c,--check <0/1>] \n\t" + "[-o,--op ] \n\t" + "[-d,--datatype ] \n\t" + "[-r,--root ] \n\t" + "[-z,--blocking <0/1>] \n\t" + "[-h,--help]\n", + basename(argv[0])); + return 0; + } + } #ifdef MPI_SUPPORT MPI_Init(&argc, &argv); +#endif + return run(); +} + +testResult_t run() { + int nProcs = 1, proc = 0; + int localRank = 0; + char hostname[1024]; + getHostName(hostname, 1024); + +#ifdef MPI_SUPPORT MPI_Comm_size(MPI_COMM_WORLD, &nProcs); MPI_Comm_rank(MPI_COMM_WORLD, &proc); uint64_t hostHashs[nProcs]; @@ -861,14 +713,38 @@ int main(int argc, char* argv[]) { #endif is_main_thread = (proc == 0) ? 
1 : 0; - if (proc == 0) { - printf("nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes, - (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); - if (swap_args) printf("Swap Comms Enabled: swapping communicators among threads for each iteration \n"); - if (blocking_coll) printf("Blocking Enabled: wait for completion and barrier after each collective \n"); - if (parallel_init) printf("Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); + PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes, + (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); + if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); + if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); + PRINT("#\n"); + + PRINT("# Using devices\n"); +#define MAX_LINE 2048 + char line[MAX_LINE]; + int len = 0; + for (int i=0; i 0) { - procSharedHost = malloc(procSharedBytes); - CUDACHECK(cudaHostRegister(procSharedHost, procSharedBytes, cudaHostRegisterPortable | cudaHostRegisterMapped)); - CUDACHECK(cudaHostGetDevicePointer(&procShared, procSharedHost, 0)); + AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus); + CUDACHECK(cudaStreamCreateWithFlags(streams+i, cudaStreamNonBlocking)); } //if parallel init is not selected, use main thread to initialize NCCL @@ -910,128 +777,113 @@ int main(int argc, char* argv[]) { NCCLCHECK(ncclGroupStart()); for (int i=0; i=0; t--) { - args[t].proc_args = (void *)args; - args[t].minbytes=minBytes; - args[t].maxbytes=maxBytes; - args[t].stepbytes=stepBytes; - args[t].stepfactor=stepFactor; - args[t].localRank 
= localRank; + threads[t].args.minbytes=minBytes; + threads[t].args.maxbytes=maxBytes; + threads[t].args.stepbytes=stepBytes; + threads[t].args.stepfactor=stepFactor; + threads[t].args.localRank = localRank; - args[t].nProcs=nProcs; - args[t].proc=proc; - args[t].nThreads=nThreads; - args[t].thread=t; - args[t].nGpus=nGpus; - args[t].sendbuffs = sendbuffs+t*nGpus; - args[t].recvbuffs = recvbuffs+t*nGpus; - args[t].ncclId = ncclId; - args[t].comms=comms+t*nGpus; - args[t].streams=streams+t*nGpus; + threads[t].args.nProcs=nProcs; + threads[t].args.proc=proc; + threads[t].args.nThreads=nThreads; + threads[t].args.thread=t; + threads[t].args.nGpus=nGpus; + threads[t].args.sendbuffs = sendbuffs+t*nGpus; + threads[t].args.recvbuffs = recvbuffs+t*nGpus; + threads[t].args.expected = expected+t*nGpus; + threads[t].args.ncclId = ncclId; + threads[t].args.comms=comms+t*nGpus; + threads[t].args.streams=streams+t*nGpus; - args[t].expectedHost = expectedHost + t*nGpus; - args[t].expected = expected + t*nGpus; - args[t].procSharedHost = procSharedHost; - args[t].procShared = procShared; - args[t].barrier = (volatile int*)barrier; - args[t].barrier_idx = 0; - args[t].sync = (volatile int*)sync; - args[t].sync_idx = 0; - args[t].deltaThreads = delta; - args[t].deltaHost = (delta + t); - CUDACHECK(cudaHostRegister(args[t].deltaHost, sizeof(double), cudaHostRegisterPortable|cudaHostRegisterMapped)); - CUDACHECK(cudaHostGetDevicePointer(&args[t].delta, args[t].deltaHost, 0)); - args[t].errors=errors+t; - args[t].bw=bw+t; - args[t].bw_count=bw_count+t; + threads[t].args.barrier = (volatile int*)barrier; + threads[t].args.barrier_idx = 0; + threads[t].args.sync = (volatile int*)sync; + threads[t].args.sync_idx = 0; + threads[t].args.deltaThreads = delta; + threads[t].args.deltaHost = (delta + t); + threads[t].args.delta = delta; + threads[t].args.errors=errors+t; + threads[t].args.bw=bw+t; + threads[t].args.bw_count=bw_count+t; - if (!parallel_init) { - if (t) - 
pthread_create(threads+t, NULL, threadRunTests, args+t); - else - threadRunTests(args); - } else { - if (t || (parallel_init && (proc == 0))) - pthread_create(threads+t, NULL, threadInit, args+t); - else - threadInit(args); + threads[t].func = parallel_init ? threadInit : threadRunTests; + if (t) + TESTCHECK(threadLaunch(threads+t)); + else + TESTCHECK(threads[t].func(&threads[t].args)); + } + + // Wait for other threads and accumulate stats and errors + for (int t=nThreads-1; t>=0; t--) { + if (t) pthread_join(threads[t].thread, NULL); + TESTCHECK(threads[t].ret); + if (t) { + errors[0] += errors[t]; + bw[0] += bw[t]; + bw_count[0] += bw_count[t]; } } - // Wait for other threads - for (int t=nThreads-1; t>=0; t--) { - if (t || (parallel_init && (proc == 0))) pthread_join(threads[t], NULL); - errors[0] += errors[t]; - bw[0] += bw[t]; - bw_count[0] += bw_count[t]; - } - #ifdef MPI_SUPPORT - MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, &errors[0], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); #endif - for(int i=0; i @@ -17,23 +19,75 @@ #define CUDACHECK(cmd) do { \ cudaError_t e = cmd; \ if( e != cudaSuccess ) { \ - printf("Cuda failure %s:%d '%s'\n", \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test CUDA failure %s:%d '%s'\n", \ + hostname, \ __FILE__,__LINE__,cudaGetErrorString(e)); \ - exit(EXIT_FAILURE); \ + return testCudaError; \ } \ } while(0) #define NCCLCHECK(cmd) do { \ ncclResult_t r = cmd; \ if (r!= ncclSuccess) { \ - printf("NCCL failure %s:%d '%s'\n", \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test NCCL failure %s:%d '%s'\n", \ + hostname, \ __FILE__,__LINE__,ncclGetErrorString(r)); \ - exit(EXIT_FAILURE); \ + return testNcclError; \ } \ } while(0) -struct threadArgs_t { - void *proc_args; +typedef enum { + testSuccess = 0, + testInternalError = 1, + testCudaError = 2, + testNcclError = 3, + testCuRandError = 4 +} testResult_t; + +// Relay 
errors up and trace +#define TESTCHECK(cmd) do { \ + testResult_t r = cmd; \ + if (r!= testSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf(" .. %s: Test failure %s:%d\n", \ + hostname, \ + __FILE__,__LINE__); \ + return r; \ + } \ +} while(0) + +struct testColl { + const char name[20]; + void (*getCollByteCount)( + size_t *sendcount, size_t *recvcount, size_t *paramcount, + size_t *sendInplaceOffset, size_t *recvInplaceOffset, + size_t count, int nranks); + testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type, + ncclRedOp_t op, int root, int rep, int in_place); + void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); + testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); +}; +extern struct testColl allReduceTest; +extern struct testColl allGatherTest; +extern struct testColl reduceScatterTest; +extern struct testColl broadcastTest; +extern struct testColl reduceTest; + +struct testEngine { + void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks); + testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type, + const char* typeName, ncclRedOp_t op, const char* opName); +}; + +extern struct testEngine ncclTestEngine; + +struct threadArgs { size_t nbytes; size_t minbytes; size_t maxbytes; @@ -55,11 +109,8 @@ struct threadArgs_t { ncclComm_t* comms; cudaStream_t* streams; - void** expectedHost; void** expected; size_t expectedBytes; - void* procSharedHost; - void* procShared; volatile int* sync; int sync_idx; volatile int* barrier; @@ -72,27 +123,28 @@ struct threadArgs_t { int* errors; double* bw; int* bw_count; + + struct testColl* collTest; +}; + +typedef testResult_t (*threadFunc_t)(struct threadArgs* args); +struct testThread { + pthread_t thread; + threadFunc_t func; + struct threadArgs args; + testResult_t ret; }; #include 
// Provided by common.cu -extern void Barrier(struct threadArgs_t* args); -extern void TimeTest(struct threadArgs_t* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root, int inPlace); -extern void Randomize(void* ptr, size_t count, ncclDataType_t type, int seed); -extern void Accumulate(void* out, void* in, size_t n, ncclDataType_t type, ncclRedOp_t op); -extern void CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax); -extern double DeltaMaxValue(ncclDataType_t type); +extern void Barrier(struct threadArgs* args); +extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root); +extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks); +extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank); +extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); // Provided by each coll -void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName); -extern void GetBw(size_t count, int typeSize, double sec, double* algBw, double* busBw, int nranks); -extern void RunColl(void* sendbuf, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); -extern void InitData(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int in_place, int is_first); -extern double CheckData(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op); -extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); -extern void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int 
in_place, int is_first); -extern void getCollByteCount(size_t *sendbytes, size_t *recvbytes, size_t *parambytes, size_t *sendInlineOffset, size_t *recvInlineOffset, size_t *procSharedBytes, int *sameexpected, size_t nbytes, int nranks); extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root); extern void print_header(); @@ -152,7 +204,33 @@ extern const char *test_typenames[ncclNumTypes]; extern ncclRedOp_t test_ops[ncclNumOps]; extern const char *test_opnames[ncclNumOps]; +static int ncclstringtotype(char *str) { + for (int t=0; t #include "cuda_runtime.h" #include "common.h" void print_header() { - PRINT("# %10s %12s %6s %6s out-of-place in-place\n", "", "", "", ""); - PRINT("# %10s %12s %6s %6s %6s %7s %5s %5s %7s %7s %5s %5s %7s\n", "bytes", "N", "type", "op", "root", - "time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res"); + PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %6s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "root", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %6s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { PRINT("%12li %12li %6s %6s %6i", size, count, typeName, opName, root); } -void getCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t *procSharedCount, int *sameExpected, size_t count, int nranks) { - *sendcount = count; - *recvcount = count; - *sameExpected = 0; - *procSharedCount = count; - *sendInplaceOffset = 0; - *recvInplaceOffset = 0; - *paramcount = *sendcount; - } - -void InitRecvResult(struct threadArgs_t* args, ncclDataType_t type, ncclRedOp_t op, int root, int 
in_place, int is_first) { - size_t count = args->expectedBytes / wordSize(type); - int root_gpu = root%args->nGpus; - - assert(args->expectedBytes == args->nbytes); - - while (args->sync[args->sync_idx] != args->thread) pthread_yield(); - - for (int i=0; inGpus; i++) { - int device; - NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); - CUDACHECK(cudaSetDevice(device)); - void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; - - if (is_first && i == 0) { - CUDACHECK(cudaMemcpy(args->procSharedHost, data, count*wordSize(type), cudaMemcpyDeviceToHost)); - } else { - Accumulate(args->procShared, data, count, type, op); - } - - if (in_place == 0) { - CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); - } - CUDACHECK(cudaDeviceSynchronize()); - } - - args->sync[args->sync_idx] = args->thread + 1; - - if (args->thread+1 == args->nThreads) { -#ifdef MPI_SUPPORT - int root_proc = root/(args->nThreads*args->nGpus); - if (args->expectedBytes) { - // Last thread does the MPI reduction - if (root_proc == args->proc) { - void* temp, *tempHost = malloc(args->expectedBytes); - CUDACHECK(cudaHostRegister(tempHost, args->expectedBytes, 0)); - CUDACHECK(cudaHostGetDevicePointer(&temp, tempHost, 0)); - - for (int i=0; inProcs; i++) { - if (i == args->proc) continue; - MPI_Recv(tempHost, args->expectedBytes, MPI_BYTE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - - Accumulate(args->procShared, temp, count, type, op); - CUDACHECK(cudaDeviceSynchronize()); - } - - CUDACHECK(cudaHostUnregister(tempHost)); - free(tempHost); - } else { - MPI_Send(args->procSharedHost, args->expectedBytes, MPI_BYTE, root_proc, 0, MPI_COMM_WORLD); - } - } -#endif - args->sync[args->sync_idx] = 0; - } else { - while (args->sync[args->sync_idx]) pthread_yield(); - } - - //if root fill expected bytes with reduced data - // else if in_place, leave fill it with original data, else set to zero - for (int i=0; inGpus; i++) { - int rank = (args->proc*args->nThreads + 
args->thread)*args->nGpus + i; - if (rank == root) { - memcpy(args->expectedHost[root_gpu], args->procSharedHost, args->expectedBytes); - } else { - if (in_place == 1) { - CUDACHECK(cudaMemcpy(args->expectedHost[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDeviceToHost)); - } else { - memset(args->expectedHost[i], 0, args->expectedBytes); - } - } - } - - args->sync_idx = !args->sync_idx; +void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; } -void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { +testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); + if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { double baseBw = (double)(count * typesize) / 1.0E9 / sec; *algBw = baseBw; *busBw = baseBw; } -void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream)); + return testSuccess; } +struct testColl reduceTest = { + "Reduce", + ReduceGetCollByteCount, + ReduceInitData, + ReduceGetBw, + ReduceRunColl +}; -void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { +void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + ReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &reduceTest; ncclDataType_t *run_types; ncclRedOp_t *run_ops; const char **run_typenames, **run_opnames; int type_count, op_count; int begin_root, end_root; - if ((int)type != -1) { + if ((int)type != -1) { type_count = 1; run_types = &type; run_typenames = &typeName; - } else { + } else { type_count = ncclNumTypes; run_types = 
test_types; run_typenames = test_typenames; } - if ((int)op != -1) { + if ((int)op != -1) { op_count = 1; run_ops = &op; run_opnames = &opName; - } else { + } else { op_count = ncclNumOps; run_ops = test_ops; run_opnames = test_opnames; } - if (root != -1) { - begin_root = end_root = root; - } else { - begin_root = 0; - end_root = args->nProcs*args->nThreads*args->nGpus-1; + if (root != -1) { + begin_root = end_root = root; + } else { + begin_root = 0; + end_root = args->nProcs*args->nThreads*args->nGpus-1; } - for (int i=0; iexpectedBytes; - size_t recvcount = args->expectedBytes / wordSize(type); - size_t sendbytes = args->sendBytes; +testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); - - while (args->sync[args->sync_idx] != args->thread) pthread_yield(); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; for (int i=0; inGpus; i++) { - int device; - NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); - CUDACHECK(cudaSetDevice(device)); + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; - - if (is_first && i == 0) { - CUDACHECK(cudaMemcpy(args->procSharedHost, data, sendbytes, cudaMemcpyDeviceToHost)); - } else { - Accumulate(args->procShared, data, sendcount, type, op); - } - - CUDACHECK(cudaDeviceSynchronize()); - if (in_place == 0) { - CUDACHECK(cudaMemset(args->recvbuffs[i], 0, recvbytes)); - } + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); + TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks)); CUDACHECK(cudaDeviceSynchronize()); } - - args->sync[args->sync_idx] = args->thread + 1; - - if (args->thread+1 == args->nThreads) { -#ifdef MPI_SUPPORT - if (sendbytes > 0) { - // Last thread does the MPI reduction - void* remote, *remoteHost = malloc(sendbytes); - void* myInitialData = malloc(sendbytes); - memcpy(myInitialData, args->procSharedHost, sendbytes); - CUDACHECK(cudaHostRegister(remoteHost, sendbytes, 0)); - CUDACHECK(cudaHostGetDevicePointer(&remote, remoteHost, 0)); - - for (int i=0; inProcs; i++) { - if (i == args->proc) { - MPI_Bcast(myInitialData, sendbytes, MPI_BYTE, i, MPI_COMM_WORLD); - free(myInitialData); - } else { - MPI_Bcast(remoteHost, sendbytes, MPI_BYTE, i, MPI_COMM_WORLD); - Accumulate(args->procShared, remote, sendcount, type, op); - cudaDeviceSynchronize(); - } - } - CUDACHECK(cudaHostUnregister(remoteHost)); - free(remoteHost); - } -#endif - args->sync[args->sync_idx] = 0; - } else { - while (args->sync[args->sync_idx]) pthread_yield(); - } - - for (int i=0; inGpus; i++) { - int offset = ((args->proc*args->nThreads + args->thread)*args->nGpus + i)*recvbytes; - memcpy(args->expectedHost[i], (void *)((uintptr_t)args->procSharedHost + offset), recvbytes); - } - - args->sync_idx = !args->sync_idx; + return testSuccess; } -void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { +void 
ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec; *algBw = baseBw; @@ -101,17 +54,32 @@ void GetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, *busBw = baseBw * factor; } -void RunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream)); + return testSuccess; } -void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { +struct testColl reduceScatterTest = { + "ReduceScatter", + ReduceScatterGetCollByteCount, + ReduceScatterInitData, + ReduceScatterGetBw, + ReduceScatterRunColl +}; + +void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + ReduceScatterGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &reduceScatterTest; ncclDataType_t *run_types; ncclRedOp_t *run_ops; const char **run_typenames, **run_opnames; int type_count, op_count; - if ((int)type != -1) { + if ((int)type != -1) { type_count = 1; run_types = &type; run_typenames = &typeName; @@ -121,19 +89,27 @@ void RunTest(struct threadArgs_t* args, int root, ncclDataType_t type, const cha run_typenames = test_typenames; } - if ((int)op != -1) { + if ((int)op != -1) { run_ops = &op; run_opnames = &opName; op_count = 1; - } else { + } else { 
op_count = sizeof(test_ops)/sizeof(test_ops[0]); run_ops = test_ops; run_opnames = test_opnames; } - for (int i=0; i Date: Tue, 9 Apr 2019 15:51:40 -0700 Subject: [PATCH 013/233] hipify nccl-tests to become rccl-tests --- README.md | 22 ++++---- doc/PERFORMANCE.md | 10 ++-- src/Makefile | 57 ++++++++----------- src/all_gather.cu | 10 ++-- src/all_reduce.cu | 10 ++-- src/broadcast.cu | 10 ++-- src/common.cu | 124 +++++++++++++++++++++--------------------- src/common.h | 17 +++--- src/nccl1_compat.h | 10 ++-- src/reduce.cu | 12 ++-- src/reduce_scatter.cu | 12 ++-- 11 files changed, 143 insertions(+), 151 deletions(-) diff --git a/README.md b/README.md index 7a4bbbc6ca..13292fb93b 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,26 @@ -# NCCL Tests +# RCCL Tests -These tests check both the performance and the correctness of NCCL operations. They can be compiled against [NCCL](http://github.com/nvidia/nccl) +These tests check both the performance and the correctness of RCCL operations. They can be compiled against [RCCL](https://github.com/ROCmSoftwarePlatform/rccl) ## Build To build the tests, just type `make`. -If CUDA is not installed in /usr/local/cuda, you may specify CUDA\_HOME. Similarly, if NCCL is not installed in /usr, you may specify NCCL\_HOME. +If HIP is not installed in /opt/rocm, you may specify HIP\_HOME. Similarly, if RCCL is not installed in /usr, you may specify RCCL\_HOME. ```shell -$ make CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl +$ make HIP_HOME=/path/to/hip RCCL_HOME=/path/to/rccl ``` -NCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If you want to compile the tests with MPI support, you need to set MPI=1 and set MPI\_HOME to the path where MPI is installed. +RCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If you want to compile the tests with MPI support, you need to set MPI=1 and set MPI\_HOME to the path where MPI is installed. 
```shell -$ make MPI=1 MPI_HOME=/path/to/mpi CUDA_HOME=/path/to/cuda NCCL_HOME=/path/to/nccl +$ make MPI=1 MPI_HOME=/path/to/mpi HIP_HOME=/path/to/hip RCCL_HOME=/path/to/rccl ``` ## Usage -NCCL tests can run on multiple processes, multiple threads, and multiple CUDA devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=CUDA devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread). +RCCL tests can run on multiple processes, multiple threads, and multiple HIP devices per thread. The number of process is managed by MPI and is therefore not passed to the tests as argument. The total number of ranks (=HIP devices) will be equal to (number of processes)\*(number of threads)\*(number of GPUs per thread). ### Quick examples @@ -51,7 +51,7 @@ All tests support the same set of arguments : * Increments can be either fixed or a multiplication factor. Only one of those should be used * `-i,--stepbytes ` fixed increment between sizes. Default : (max-min)/10. * `-f,--stepfactor ` multiplication factor between sizes. Default : disabled. -* NCCL operations arguments +* RCCL operations arguments * `-o,--op ` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum. * `-d,--datatype ` Specify which datatype to use. Default : Float. * `-r,--root ` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0. @@ -60,11 +60,11 @@ All tests support the same set of arguments : * `-w,--warmup_iters ` number of warmup iterations (not timed). Default : 5. * `-m,--agg_iters ` number of operations to aggregate together in each iteration. Default : 1. * Test operation - * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0. + * `-p,--parallel_init <0/1>` use threads to initialize RCCL in parallel. Default : 0. 
* `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1. - * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. + * `-z,--blocking <0/1>` Make RCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. ## Copyright -NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. +RCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. diff --git a/doc/PERFORMANCE.md b/doc/PERFORMANCE.md index 7cc6ecee66..dd049bf6e9 100644 --- a/doc/PERFORMANCE.md +++ b/doc/PERFORMANCE.md @@ -1,6 +1,6 @@ -# Performance reported by NCCL tests +# Performance reported by RCCL tests -NCCL tests report the average operation time in ms, and two bandwidths in GB/s : algorithm bandwidth and bus bandwidth. This page explains what those numbers mean and what you should expect depending on the hardware used. +RCCL tests report the average operation time in ms, and two bandwidths in GB/s : algorithm bandwidth and bus bandwidth. This page explains what those numbers mean and what you should expect depending on the hardware used. # Time @@ -24,7 +24,7 @@ Algorithm bandwidth is using the most commonly used formula for bandwidth : size While the algorithm bandwidth makes sense for point-to-point operations like Send/Receive, it is not always helpful to measure collective operations speed, since the theoretical peak algorithm bandwidth is not equal to the hardware peak bandwidth, usually depending on the number of ranks. Most benchmarks only provide time measurements, which is hard to interpret for large sizes. 
Some others also provide algorithms bandwidth, but see that depending on the number of ranks, that bandwidth varies (and decreases as the number of ranks increase). -To provide a number which reflects how optimally the hardware is used, NCCL tests introduce the notion of "Bus Bandwidth" ("busbw" column in the tests output). +To provide a number which reflects how optimally the hardware is used, RCCL tests introduce the notion of "Bus Bandwidth" ("busbw" column in the tests output). This number is obtained applying a formula to the algorithm bandwidth to reflect the speed of the inter-GPU communication. Using this bus bandwidth, we can compare it with the hardware peak bandwidth, independently of the number of ranks used. @@ -78,7 +78,7 @@ And the Bus Bandwidth is therefore computed as : `B = S/t * (n-1)/n = algbw * (n-1)/n` -Note that here, S is the size in bytes of the total array, which for NCCL is equal to `recvcount*sizeof(datatype)*n` as the `recvcount` argument is the count per rank. +Note that here, S is the size in bytes of the total array, which for RCCL is equal to `recvcount*sizeof(datatype)*n` as the `recvcount` argument is the count per rank. ### AllGather @@ -96,7 +96,7 @@ And the Bus Bandwidth is therefore computed as : `B = S/t * (n-1)/n = algbw * (n-1)/n` -Note that here, S is the size in bytes of the total array, which for NCCL is equal to `sendcount*sizeof(datatype)*n` as the `sendcount` argument is the count per rank. +Note that here, S is the size in bytes of the total array, which for RCCL is equal to `sendcount*sizeof(datatype)*n` as the `sendcount` argument is the count per rank. 
### Broadcast diff --git a/src/Makefile b/src/Makefile index 034cc672fa..bb18157045 100644 --- a/src/Makefile +++ b/src/Makefile @@ -4,41 +4,30 @@ # See LICENSE.txt for license information # -CUDA_HOME ?= /usr/local/cuda +ROCM_HOME ?= /opt/rocm +MPI_HOME ?= /usr/lib/openmpi PREFIX ?= /usr/local VERBOSE ?= 0 DEBUG ?= 0 -CUDA_LIB ?= $(CUDA_HOME)/lib64 -CUDA_INC ?= $(CUDA_HOME)/include -NVCC = $(CUDA_HOME)/bin/nvcc +HIPCC = $(ROCM_HOME)/hip/bin/hipcc +CXX = $(HIPCC) -# Better define NVCC_GENCODE in your environment to the minimal set -# of archs to reduce compile time. -NVCC_GENCODE ?= -gencode=arch=compute_30,code=sm_30 \ - -gencode=arch=compute_35,code=sm_35 \ - -gencode=arch=compute_50,code=sm_50 \ - -gencode=arch=compute_60,code=sm_60 \ - -gencode=arch=compute_61,code=sm_61 \ - -gencode=arch=compute_70,code=compute_70 \ - -gencode=arch=compute_70,code=sm_70 - -NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 - -LDFLAGS := -L${CUDA_LIB} -lcudart -lrt -NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt +HIPCUFLAGS := +HIPCUFLAGS += -I$(ROCM_HOME)/include +HIPCUFLAGS += -I$(ROCM_HOME)/include/rccl +HIPCUFLAGS += -I$(ROCM_HOME)/hip/include/hip +HIPCUFLAGS += -I$(ROCM_HOME)/hiprand/include +LDFLAGS := -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt +HIPLDFLAGS := -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt ifeq ($(DEBUG), 0) -NVCUFLAGS += -O3 -g -CXXFLAGS += -O3 -g +HIPCUFLAGS += -O3 else -NVCUFLAGS += -O0 -G -g -CXXFLAGS += -O0 -g -ggdb3 +HIPCUFLAGS += -O0 -g -ggdb3 endif -ifneq ($(VERBOSE), 0) -NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter -else +ifeq ($(VERBOSE), 0) .SILENT: endif @@ -46,16 +35,16 @@ endif BUILDDIR ?= ../build ifneq ($(NCCL_HOME), "") -NVCUFLAGS += -I$(NCCL_HOME)/include/ -NVLDFLAGS += -L$(NCCL_HOME)/lib +HIPCUFLAGS += -I$(NCCL_HOME)/include/ +HIPLDFLAGS += -L$(NCCL_HOME)/lib endif ifeq ($(MPI), 1) -NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include -NVLDFLAGS += -L$(MPI_HOME)/lib -lmpi +HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include +HIPLDFLAGS += 
-L${MPI_HOME}/lib -lmpi endif -LIBRARIES += curand nccl nvToolsExt -NVLDFLAGS += $(LIBRARIES:%=-l%) +LIBRARIES += rccl +HIPLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) @@ -71,10 +60,12 @@ clean: ${DST_DIR}/%.o: %.cu common.h @printf "Compiling %-35s > %s\n" $< $@ @mkdir -p ${DST_DIR} - $(NVCC) -o $@ $(NVCUFLAGS) -c $< + echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<" + $(HIPCC) -o $@ $(HIPCUFLAGS) -c $< ${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o @printf "Linking %-35s > %s\n" $< $@ @mkdir -p ${DST_DIR} - $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} + echo "$(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS}" + $(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS} diff --git a/src/all_gather.cu b/src/all_gather.cu index cfb2ec356b..e9d382cd69 100644 --- a/src/all_gather.cu +++ b/src/all_gather.cu @@ -4,7 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "cuda_runtime.h" +#include #include "common.h" void print_header() { @@ -34,15 +34,15 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc for (int i=0; inGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - CUDACHECK(cudaSetDevice(gpuid)); + HIPCHECK(hipSetDevice(gpuid)); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? 
((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, type, rep, rank)); for (int j=0; jexpected[i])+args->sendBytes*j, sendcount, type, rep, j)); } - CUDACHECK(cudaDeviceSynchronize()); + HIPCHECK(hipDeviceSynchronize()); } return testSuccess; } @@ -55,7 +55,7 @@ void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream)); return testSuccess; } diff --git a/src/all_reduce.cu b/src/all_reduce.cu index bd8daaf0a2..4fcb9a0e48 100644 --- a/src/all_reduce.cu +++ b/src/all_reduce.cu @@ -4,7 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "cuda_runtime.h" +#include #include "common.h" void print_header() { @@ -34,13 +34,13 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc for (int i=0; inGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - CUDACHECK(cudaSetDevice(gpuid)); + HIPCHECK(hipSetDevice(gpuid)); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, type, rep, rank)); TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); - CUDACHECK(cudaDeviceSynchronize()); + HIPCHECK(hipDeviceSynchronize()); } return testSuccess; } @@ -53,7 +53,7 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); return testSuccess; } diff --git a/src/broadcast.cu b/src/broadcast.cu index c62a99ff62..4a7cdb9ae2 100644 --- a/src/broadcast.cu +++ b/src/broadcast.cu @@ -4,7 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "cuda_runtime.h" +#include #include "common.h" void print_header() { @@ -33,13 +33,13 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc for (int i=0; inGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - CUDACHECK(cudaSetDevice(gpuid)); + HIPCHECK(hipSetDevice(gpuid)); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank)); TESTCHECK(InitData(args->expected[i], recvcount, type, rep, root)); - CUDACHECK(cudaDeviceSynchronize()); + HIPCHECK(hipDeviceSynchronize()); } return testSuccess; } @@ -52,7 +52,7 @@ void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { int rank; NCCLCHECK(ncclCommUserRank(comm, &rank)); #if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2 diff --git a/src/common.cu b/src/common.cu index 5a3ae529d6..9fe70e5986 100644 --- a/src/common.cu +++ b/src/common.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /************************************************************************* * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
* @@ -9,7 +10,6 @@ #include #include #include -#include "cuda.h" #if NCCL_MAJOR >= 2 ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble}; @@ -129,27 +129,27 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) { testResult_t CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) { switch (type) { case ncclHalf: - deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; case ncclFloat: - deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; case ncclDouble: - deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; case ncclChar: #if NCCL_MAJOR >= 2 case ncclUint8: #endif - deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; case ncclInt: #if NCCL_MAJOR >= 2 case ncclUint32: #endif - deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; case ncclInt64: case ncclUint64: - deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; } - CUDACHECK(cudaDeviceSynchronize()); + HIPCHECK(hipDeviceSynchronize()); return testSuccess; } @@ -196,61 +196,63 @@ template<> __device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? 
a : b; } template -__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { +__global__ void InitDataReduceKernel(void* data, const size_t N, const size_t offset, const int rep, const int nranks) { for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o+offset, rep, 0); for (int i=1; i(o+offset, rep, i)); } - data[o] = val; + ((T*)data)[o] = val; } } -#define KERN(type, op) (void*)InitDataReduceKernel> +typedef void(*redInitKern_t)(void* data, const size_t N, const size_t offset, const int rep, const int nranks); + +#define KERN(type, op) InitDataReduceKernel> #define OPS(type) KERN(type, ncclOpSum), KERN(type, ncclOpProd), KERN(type, ncclOpMax), KERN(type, ncclOpMin) -static void* const redInitDataKerns[ncclNumOps*ncclNumTypes] = { +static redInitKern_t const redInitDataKerns[ncclNumOps*ncclNumTypes] = { OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double) }; testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) { dim3 grid = { 32, 1, 1 }; dim3 block = { 256, 1, 1 }; - void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks }; - CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*ncclNumOps+op], grid, block, args, 0, cudaStreamDefault)); + hipLaunchKernelGGL((redInitDataKerns[type*ncclNumOps+op]), grid, block, 0, 0, data, count, offset, rep, nranks); return testSuccess; } template -__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) { +__global__ void InitDataKernel(void* data, const size_t N, const int rep, const int rank) { for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o, rep, rank); + ((T*)data)[o] = testValue(o, rep, rank); } -static void* const initDataKerns[ncclNumTypes] = { - (void*)InitDataKernel< int8_t>, - (void*)InitDataKernel< uint8_t>, - (void*)InitDataKernel< 
int32_t>, - (void*)InitDataKernel, - (void*)InitDataKernel< int64_t>, - (void*)InitDataKernel, - (void*)InitDataKernel< half>, - (void*)InitDataKernel< float>, - (void*)InitDataKernel< double> +typedef void(*initDataKern_t)(void* data, const size_t N, const int rep, const int rank); + +static initDataKern_t const initDataKerns[ncclNumTypes] = { + InitDataKernel< int8_t>, + InitDataKernel< uint8_t>, + InitDataKernel< int32_t>, + InitDataKernel, + InitDataKernel< int64_t>, + InitDataKernel, + InitDataKernel< half>, + InitDataKernel< float>, + InitDataKernel< double> }; template testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) { T* ptr = (T*)dest; - InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); + hipLaunchKernelGGL((InitDataKernel), dim3(16), dim3(512), 0, 0, ptr, N, rep, rank); return testSuccess; } testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { dim3 grid = { 32, 1, 1 }; dim3 block = { 256, 1, 1 }; - void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; - CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); + hipLaunchKernelGGL((initDataKerns[type]), grid, block, 0, 0, data, count, rep, rank); return testSuccess; } @@ -279,7 +281,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t int device; int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); - CUDACHECK(cudaSetDevice(device)); + HIPCHECK(hipSetDevice(device)); void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->delta)); maxDelta = std::max(*(args->deltaHost), maxDelta); @@ -289,14 +291,14 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t int *expectedHost = (int *)malloc(args->expectedBytes); int *dataHost = (int *)malloc(args->expectedBytes); - cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); + hipMemcpy(expectedHost, args->expected[0], args->expectedBytes, hipMemcpyDeviceToHost); printf("\n Expected: "); for(int j=0; jexpectedBytes/sizeof(int); j++) { printf("%d:%d ", j, expectedHost[j]); } printf("\n"); - cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); + hipMemcpy(dataHost, data, args->expectedBytes, hipMemcpyDeviceToHost); printf("\n Actual: "); for (int j=0; jexpectedBytes/sizeof(int); j++) { printf("%d:%d ", j, dataHost[j]); @@ -312,8 +314,8 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t return testSuccess; } -testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { - cudaError_t cudaErr; +testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* comms) { + hipError_t hipErr; int remaining = ngpus; int* done = (int*)malloc(sizeof(int)*ngpus); memset(done, 0, sizeof(int)*ngpus); @@ -322,15 +324,15 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* for (int i=0; i= NCCL_VERSION(2,4,0) if (comms) { @@ -365,9 +367,9 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (args->nGpus > 1) NCCLCHECK(ncclGroupStart()); for (int i = 0; i < args->nGpus; i++) { #ifndef NCCL_MAJOR - int cudaDev; - NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); - CUDACHECK(cudaSetDevice(cudaDev)); + int hipDev; + NCCLCHECK(ncclCommCuDevice(args->comms[i], &hipDev)); + 
HIPCHECK(hipSetDevice(hipDev)); #endif int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); char* recvBuff = ((char*)args->recvbuffs[i]) + shift; @@ -514,7 +516,7 @@ testResult_t threadRunTests(struct threadArgs* args) { // will be done on the current GPU (by default : 0) and if the GPUs are in // exclusive mode those operations will fail. int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; - CUDACHECK(cudaSetDevice(gpuid)); + HIPCHECK(hipSetDevice(gpuid)); TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); return testSuccess; } @@ -531,7 +533,7 @@ testResult_t threadInit(struct threadArgs* args) { for (int i=0; inGpus; i++) { int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - CUDACHECK(cudaSetDevice(gpuid)); + HIPCHECK(hipSetDevice(gpuid)); NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); } NCCLCHECK(ncclGroupEnd()); @@ -555,9 +557,9 @@ testResult_t threadLaunch(struct testThread* thread) { } testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { - CUDACHECK(cudaMalloc(sendbuff, nbytes)); - CUDACHECK(cudaMalloc(recvbuff, nbytes)); - CUDACHECK(cudaMalloc(expected, recvBytes)); + HIPCHECK(hipMalloc(sendbuff, nbytes)); + HIPCHECK(hipMalloc(recvbuff, nbytes)); + HIPCHECK(hipMalloc(expected, recvBytes)); return testSuccess; } @@ -724,12 +726,12 @@ testResult_t run() { char line[MAX_LINE]; int len = 0; for (int i=0; i #include -#include #ifdef MPI_SUPPORT #include "mpi.h" #endif #include #include "nccl1_compat.h" -#define CUDACHECK(cmd) do { \ - cudaError_t e = cmd; \ - if( e != cudaSuccess ) { \ +#define HIPCHECK(cmd) do { \ + hipError_t e = cmd; \ + if( e != hipSuccess ) { \ char hostname[1024]; \ 
getHostName(hostname, 1024); \ - printf("%s: Test CUDA failure %s:%d '%s'\n", \ + printf("%s: Test HIP failure %s:%d '%s'\n", \ hostname, \ - __FILE__,__LINE__,cudaGetErrorString(e)); \ + __FILE__,__LINE__,hipGetErrorString(e)); \ return testCudaError; \ } \ } while(0) @@ -71,7 +70,7 @@ struct testColl { ncclRedOp_t op, int root, int rep, int in_place); void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); + ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream); }; extern struct testColl allReduceTest; extern struct testColl allGatherTest; @@ -107,7 +106,7 @@ struct threadArgs { size_t recvInplaceOffset; ncclUniqueId ncclId; ncclComm_t* comms; - cudaStream_t* streams; + hipStream_t* streams; void** expected; size_t expectedBytes; diff --git a/src/nccl1_compat.h b/src/nccl1_compat.h index 020a4bc36f..726669c885 100644 --- a/src/nccl1_compat.h +++ b/src/nccl1_compat.h @@ -20,28 +20,28 @@ static ncclResult_t ncclGroupEnd() { return ncclSuccess; } #define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument; static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { CHECKCOUNT(count); return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); } static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream) { CHECKCOUNT(count); return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); } static ncclResult_t ncclBcast(void* buff, size_t 
count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream) { + ncclComm_t comm, hipStream_t stream) { CHECKCOUNT(count); return ncclBcast(buff, (int)count, datatype, root, comm, stream); } static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, - cudaStream_t stream) { + hipStream_t stream) { CHECKCOUNT(recvcount); return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); } static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { + ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) { CHECKCOUNT(sendcount); return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); } diff --git a/src/reduce.cu b/src/reduce.cu index 08825e45b0..5a286c1b6b 100644 --- a/src/reduce.cu +++ b/src/reduce.cu @@ -4,7 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "cuda_runtime.h" +#include #include "common.h" void print_header() { @@ -34,14 +34,14 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRe for (int i=0; inGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - CUDACHECK(cudaSetDevice(gpuid)); + HIPCHECK(hipSetDevice(gpuid)); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, type, rep, rank)); - CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); + HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault)); if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); - CUDACHECK(cudaDeviceSynchronize()); + HIPCHECK(hipDeviceSynchronize()); } return testSuccess; } @@ -52,7 +52,7 @@ void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* *busBw = baseBw; } -testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream)); return testSuccess; } diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu index 0b1d986952..3906621e96 100644 --- a/src/reduce_scatter.cu +++ b/src/reduce_scatter.cu @@ -4,7 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "cuda_runtime.h" +#include #include "common.h" void print_header() { @@ -34,14 +34,14 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, for (int i=0; inGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - CUDACHECK(cudaSetDevice(gpuid)); + HIPCHECK(hipSetDevice(gpuid)); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, type, rep, rank)); - CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); + HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault)); TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks)); - CUDACHECK(cudaDeviceSynchronize()); + HIPCHECK(hipDeviceSynchronize()); } return testSuccess; } @@ -54,7 +54,7 @@ void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, d *busBw = baseBw * factor; } -testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { +testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream)); return testSuccess; } From 71e663e62d4ffb124c470dba7a286f291653161f Mon Sep 17 00:00:00 2001 From: Stanley Tsang Date: Wed, 10 Apr 2019 15:28:40 -0700 Subject: [PATCH 014/233] Adding AMD copyright notices --- LICENSE.txt | 1 + Makefile | 1 + README.md | 4 +++- src/Makefile | 1 + src/all_gather.cu | 1 + src/all_reduce.cu | 1 + src/broadcast.cu | 1 + src/common.cu | 4 +++- src/common.h | 3 ++- src/nccl1_compat.h | 1 + src/reduce.cu | 1 + src/reduce_scatter.cu | 1 + 12 files changed, 17 insertions(+), 3 deletions(-) diff --git a/LICENSE.txt b/LICENSE.txt index 4573c07c44..d2e566e3e2 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,5 +1,6 @@ Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved. + Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions diff --git a/Makefile b/Makefile index 29409a8422..6a90587888 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,6 @@ # # Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +# Modifications are Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. # # See LICENCE.txt for license information # diff --git a/README.md b/README.md index 13292fb93b..e96ce21599 100644 --- a/README.md +++ b/README.md @@ -66,5 +66,7 @@ All tests support the same set of arguments : ## Copyright -RCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. +RCCL tests are provided under the BSD license. +All source code and accompanying documentation is copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. +All modifications are copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. diff --git a/src/Makefile b/src/Makefile index bb18157045..56d52405bb 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,6 @@ # # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. +# Modifications are Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. # # See LICENSE.txt for license information # diff --git a/src/all_gather.cu b/src/all_gather.cu index e9d382cd69..45615ccd27 100644 --- a/src/all_gather.cu +++ b/src/all_gather.cu @@ -1,5 +1,6 @@ /************************************************************************* * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/all_reduce.cu b/src/all_reduce.cu index 4fcb9a0e48..177674085e 100644 --- a/src/all_reduce.cu +++ b/src/all_reduce.cu @@ -1,5 +1,6 @@ /************************************************************************* * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/broadcast.cu b/src/broadcast.cu index 4a7cdb9ae2..4119d9eefb 100644 --- a/src/broadcast.cu +++ b/src/broadcast.cu @@ -1,5 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/common.cu b/src/common.cu index 9fe70e5986..cd2974189f 100644 --- a/src/common.cu +++ b/src/common.cu @@ -1,10 +1,12 @@ -#include "hip/hip_runtime.h" + /************************************************************************* * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ +#include "hip/hip_runtime.h" #include "common.h" #include #include diff --git a/src/common.h b/src/common.h index be4000dd64..2ddf40b2ee 100644 --- a/src/common.h +++ b/src/common.h @@ -1,5 +1,6 @@ /************************************************************************* * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -192,7 +193,7 @@ static size_t wordSize(ncclDataType_t type) { case ncclInt64: case ncclUint64: case ncclDouble: - //case ncclFloat64: + //case ncclFloat64: return 8; default: return 0; } diff --git a/src/nccl1_compat.h b/src/nccl1_compat.h index 726669c885..3c241d3d14 100644 --- a/src/nccl1_compat.h +++ b/src/nccl1_compat.h @@ -1,5 +1,6 @@ /************************************************************************* * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/reduce.cu b/src/reduce.cu index 5a286c1b6b..3e9056ad05 100644 --- a/src/reduce.cu +++ b/src/reduce.cu @@ -1,5 +1,6 @@ /************************************************************************* * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu index 3906621e96..c466ca284b 100644 --- a/src/reduce_scatter.cu +++ b/src/reduce_scatter.cu @@ -1,5 +1,6 @@ /************************************************************************* * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ From 4474fe168d5c8b38f56ec2931d093102c74ef3d1 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Thu, 18 Apr 2019 10:34:55 -0700 Subject: [PATCH 015/233] workaround weak symbol issue hcc prints "error: alias must point to a defined variable or function" --- src/all_gather.cu | 6 ++---- src/all_reduce.cu | 6 ++---- src/broadcast.cu | 6 ++---- src/reduce.cu | 6 ++---- src/reduce_scatter.cu | 6 ++---- 5 files changed, 10 insertions(+), 20 deletions(-) diff --git a/src/all_gather.cu b/src/all_gather.cu index 45615ccd27..2e6c880160 100644 --- a/src/all_gather.cu +++ b/src/all_gather.cu @@ -96,9 +96,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t return testSuccess; } -struct testEngine allGatherEngine = { +struct testEngine ncclTestEngine = { AllGatherGetBuffSize, AllGatherRunTest -}; - -#pragma weak ncclTestEngine=allGatherEngine +}; \ No newline at end of file diff --git a/src/all_reduce.cu b/src/all_reduce.cu index 177674085e..acc7c9c69a 100644 --- a/src/all_reduce.cu +++ b/src/all_reduce.cu @@ -107,9 +107,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t return testSuccess; } -struct testEngine allReduceEngine = { +struct testEngine ncclTestEngine = { AllReduceGetBuffSize, AllReduceRunTest -}; - -#pragma weak ncclTestEngine=allReduceEngine +}; \ No newline at end of file diff --git a/src/broadcast.cu b/src/broadcast.cu index 4119d9eefb..bb29738ee0 100644 --- a/src/broadcast.cu +++ b/src/broadcast.cu @@ -113,9 +113,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t return testSuccess; } -struct testEngine broadcastEngine = { +struct testEngine ncclTestEngine = { BroadcastGetBuffSize, BroadcastRunTest -}; - -#pragma weak ncclTestEngine=broadcastEngine +}; \ No newline at end of file diff --git a/src/reduce.cu b/src/reduce.cu index 
3e9056ad05..541930797b 100644 --- a/src/reduce.cu +++ b/src/reduce.cu @@ -116,9 +116,7 @@ testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t typ return testSuccess; } -struct testEngine reduceEngine = { +struct testEngine ncclTestEngine = { ReduceGetBuffSize, ReduceRunTest -}; - -#pragma weak ncclTestEngine=reduceEngine +}; \ No newline at end of file diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu index c466ca284b..10856cc3da 100644 --- a/src/reduce_scatter.cu +++ b/src/reduce_scatter.cu @@ -108,9 +108,7 @@ testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataTyp return testSuccess; } -struct testEngine reduceScatterEngine = { +struct testEngine ncclTestEngine = { ReduceScatterGetBuffSize, ReduceScatterRunTest -}; - -#pragma weak ncclTestEngine=reduceScatterEngine +}; \ No newline at end of file From 10e1572f726054ccef30f08526897e5f08fbe54c Mon Sep 17 00:00:00 2001 From: Gilbert Lee Date: Mon, 22 Apr 2019 10:25:49 -0700 Subject: [PATCH 016/233] Adding way to specify a custom RCCL shared library file to link against --- src/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index 56d52405bb..f1a6a6e2e3 100644 --- a/src/Makefile +++ b/src/Makefile @@ -14,13 +14,14 @@ DEBUG ?= 0 HIPCC = $(ROCM_HOME)/hip/bin/hipcc CXX = $(HIPCC) + HIPCUFLAGS := HIPCUFLAGS += -I$(ROCM_HOME)/include HIPCUFLAGS += -I$(ROCM_HOME)/include/rccl HIPCUFLAGS += -I$(ROCM_HOME)/hip/include/hip HIPCUFLAGS += -I$(ROCM_HOME)/hiprand/include LDFLAGS := -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt -HIPLDFLAGS := -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt +HIPLDFLAGS := $(CUSTOM_RCCL_LIB) -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt ifeq ($(DEBUG), 0) HIPCUFLAGS += -O3 From 7e80ea9d3afd93985413cd5a59e4fd909d666d02 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Fri, 26 Apr 2019 17:00:31 -0700 Subject: [PATCH 017/233] fix build with 1.0 library --- src/common.cu | 24 ++++++++++++++++++++++++ 1 file 
changed, 24 insertions(+) diff --git a/src/common.cu b/src/common.cu index cd2974189f..81351e0433 100644 --- a/src/common.cu +++ b/src/common.cu @@ -214,7 +214,11 @@ typedef void(*redInitKern_t)(void* data, const size_t N, const size_t offset, co #define OPS(type) KERN(type, ncclOpSum), KERN(type, ncclOpProd), KERN(type, ncclOpMax), KERN(type, ncclOpMin) static redInitKern_t const redInitDataKerns[ncclNumOps*ncclNumTypes] = { +#if NCCL_MAJOR >= 2 OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double) +#else + OPS(char), OPS(int32_t), OPS(half), OPS(float), OPS(double), OPS(int64_t), OPS(uint64_t) +#endif }; testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) { @@ -233,6 +237,7 @@ __global__ void InitDataKernel(void* data, const size_t N, const int rep, const typedef void(*initDataKern_t)(void* data, const size_t N, const int rep, const int rank); static initDataKern_t const initDataKerns[ncclNumTypes] = { +#if NCCL_MAJOR >= 2 InitDataKernel< int8_t>, InitDataKernel< uint8_t>, InitDataKernel< int32_t>, @@ -242,6 +247,15 @@ static initDataKern_t const initDataKerns[ncclNumTypes] = { InitDataKernel< half>, InitDataKernel< float>, InitDataKernel< double> +#else + InitDataKernel< char>, + InitDataKernel< int32_t>, + InitDataKernel< half>, + InitDataKernel< float>, + InitDataKernel< double>, + InitDataKernel< int64_t>, + InitDataKernel, +#endif }; template @@ -336,6 +350,7 @@ testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* if (hipErr != hipErrorNotReady) HIPCHECK(hipErr); +#if NCCL_MAJOR >= 2 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) if (comms) { ncclResult_t ncclAsyncErr; @@ -349,6 +364,7 @@ testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* NCCLCHECK(ncclAsyncErr); } } +#endif #endif } @@ -543,7 +559,11 @@ testResult_t threadInit(struct 
threadArgs* args) { TESTCHECK(threadRunTests(args)); for (int i=0; inGpus; i++) { +#if NCCL_MAJOR >= 2 NCCLCHECK(ncclCommDestroy(args->comms[i])); +#else + ncclCommDestroy(args->comms[i]); +#endif } return testSuccess; } @@ -860,7 +880,11 @@ testResult_t run() { if (!parallel_init) { for(int i=0; i= 2 NCCLCHECK(ncclCommDestroy(comms[i])); +#else + ncclCommDestroy(comms[i]); +#endif free(comms); } From 3f89175af5bc93db0d36758a0e4217f49b7147fa Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Wed, 1 May 2019 12:58:04 -0700 Subject: [PATCH 018/233] allow using different memory types for input and output buffers --- src/common.cu | 37 ++++++++++++++++++++++++++++++++----- src/common.h | 15 +++++++++++++++ 2 files changed, 47 insertions(+), 5 deletions(-) diff --git a/src/common.cu b/src/common.cu index 81351e0433..61084eb1bd 100644 --- a/src/common.cu +++ b/src/common.cu @@ -22,6 +22,7 @@ const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "dou #endif ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin}; const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min"}; +const char *test_memorytypes[nccl_NUM_MTYPES] = {"coarse", "fine", "host"}; thread_local int is_main_thread = 0; @@ -41,6 +42,7 @@ static int nccltype = ncclFloat; static int ncclroot = 0; static int parallel_init = 0; static int blocking_coll = 0; +static int memorytype = 0; double parsesize(char *value) { long long int units; @@ -579,10 +581,22 @@ testResult_t threadLaunch(struct testThread* thread) { } testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { + if (memorytype == ncclFine) { + HIPCHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained)); + HIPCHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained)); + HIPCHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained)); + } + else if (memorytype == ncclHost) { 
+ HIPCHECK(hipHostMalloc(sendbuff, nbytes)); + HIPCHECK(hipHostMalloc(recvbuff, nbytes)); + HIPCHECK(hipHostMalloc(expected, recvBytes)); + } + else { HIPCHECK(hipMalloc(sendbuff, nbytes)); HIPCHECK(hipMalloc(recvbuff, nbytes)); HIPCHECK(hipMalloc(expected, recvBytes)); - return testSuccess; + } + return testSuccess; } testResult_t run(); // Main function @@ -609,12 +623,13 @@ int main(int argc, char* argv[]) { {"datatype", required_argument, 0, 'd'}, {"root", required_argument, 0, 'r'}, {"blocking", required_argument, 0, 'z'}, + {"memory_type", required_argument, 0, 'y'}, {"help", no_argument, 0, 'h'} }; while(1) { int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:h", longopts, &longindex); + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:h", longopts, &longindex); if (c == -1) break; @@ -669,6 +684,9 @@ int main(int argc, char* argv[]) { case 'z': blocking_coll = strtol(optarg, NULL, 0); break; + case 'y': + memorytype = ncclstringtomtype(optarg); + break; case 'h': printf("USAGE: %s \n\t" "[-t,--nthreads ] \n\t" @@ -686,6 +704,7 @@ int main(int argc, char* argv[]) { "[-d,--datatype ] \n\t" "[-r,--root ] \n\t" "[-z,--blocking <0/1>] \n\t" + "[-y,--memory_type ] \n\t" "[-h,--help]\n", basename(argv[0])); return 0; @@ -707,6 +726,7 @@ int main(int argc, char* argv[]) { "[-d,--datatype ] \n\t" "[-r,--root ] \n\t" "[-z,--blocking <0/1>] \n\t" + "[-y,--memory_type ] \n\t" "[-h,--help]\n", basename(argv[0])); return 0; @@ -890,9 +910,16 @@ testResult_t run() { // Free off HIP allocated memory for (int i=0; i Date: Tue, 7 May 2019 18:27:25 +0000 Subject: [PATCH 019/233] Adding copyright notice. 
--- NOTICES.txt | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 NOTICES.txt diff --git a/NOTICES.txt b/NOTICES.txt new file mode 100644 index 0000000000..6f49d61763 --- /dev/null +++ b/NOTICES.txt @@ -0,0 +1,66 @@ +Notices and Licenses file +_______________________________________________________________ + +Dependencies on nvidia-nccl-tests v2.0.0 (BSD3) +Copyright (c) 2016-2017, NVIDIA CORPORATION. +Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National + Laboratory, the U.S. Department of Energy, nor the names of their + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The U.S. Department of Energy funded the development of this software +under subcontract 7078610 with Lawrence Berkeley National Laboratory. + + +nvidia-nccl-tests v2.0.0 (BSD2) +Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National + Laboratory, the U.S. Department of Energy, nor the names of their + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The U.S. Department of Energy funded the development of this software +under subcontract 7078610 with Lawrence Berkeley National Laboratory. From 86f053be841023885cb24bef1096229a2e5327df Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Mon, 13 May 2019 23:45:28 +0000 Subject: [PATCH 020/233] enable building with mpich Use following command to build: MPICH=1 make --- src/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Makefile b/src/Makefile index f1a6a6e2e3..acf41d7e5a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -44,7 +44,11 @@ endif ifeq ($(MPI), 1) HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi +else ifeq($(MPICH), 1) +HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich +HIPLDFLAGS += -L/usr/lib -lmpi endif + LIBRARIES += rccl HIPLDFLAGS += $(LIBRARIES:%=-l%) From 79356ec21874624b39a3b5a1bfd8b2ec85b624fc Mon Sep 17 00:00:00 2001 From: Stanley Tsang Date: Wed, 15 May 2019 16:59:47 +0000 Subject: [PATCH 021/233] Updating README to include CUSTOM_RCCL_LIB. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e96ce21599..c442de3934 100644 --- a/README.md +++ b/README.md @@ -6,10 +6,10 @@ These tests check both the performance and the correctness of RCCL operations. T To build the tests, just type `make`. -If HIP is not installed in /opt/rocm, you may specify HIP\_HOME. 
Similarly, if RCCL is not installed in /usr, you may specify RCCL\_HOME. +If HIP is not installed in /opt/rocm, you may specify HIP\_HOME. Similarly, if RCCL is not installed in /usr, you may specify NCCL\_HOME and CUSTOM\_RCCL\_LIB. ```shell -$ make HIP_HOME=/path/to/hip RCCL_HOME=/path/to/rccl +$ make HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl CUSTOM_RCCL_LIB=/path/to/rccl/lib/librccl.so ``` RCCL tests rely on MPI to work on multiple processes, hence multiple nodes. If you want to compile the tests with MPI support, you need to set MPI=1 and set MPI\_HOME to the path where MPI is installed. From 3aa32972f4b0bbb59eda1d7eb8e1a90a28fa7638 Mon Sep 17 00:00:00 2001 From: Stanley Tsang Date: Wed, 15 May 2019 11:22:34 -0600 Subject: [PATCH 022/233] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c442de3934..2731d65c65 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # RCCL Tests -These tests check both the performance and the correctness of RCCL operations. They can be compiled against [RCCL](https://github.com/ROCmSoftwarePlatform/rccl) +These tests check both the performance and the correctness of RCCL operations. They can be compiled against [RCCL](https://github.com/ROCmSoftwarePlatform/rccl). ## Build @@ -67,6 +67,8 @@ All tests support the same set of arguments : ## Copyright RCCL tests are provided under the BSD license. + All source code and accompanying documentation is copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + All modifications are copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
From 3fac1d679be8bb10dc12d786025d35579ff9de58 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Thu, 16 May 2019 13:18:23 -0700 Subject: [PATCH 023/233] Fix missing space in Makefile --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index acf41d7e5a..78470b8f48 100644 --- a/src/Makefile +++ b/src/Makefile @@ -44,7 +44,7 @@ endif ifeq ($(MPI), 1) HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi -else ifeq($(MPICH), 1) +else ifeq ($(MPICH), 1) HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich HIPLDFLAGS += -L/usr/lib -lmpi endif From 924521ff570069f2969377001af9913b3b026065 Mon Sep 17 00:00:00 2001 From: Stanley Tsang Date: Fri, 28 Jun 2019 09:52:44 -0600 Subject: [PATCH 024/233] Adding unit tests and files for CI (#4) * Adding initial unit test and Jenkins code. Fixing scope of unit tests Adding unit tests and files for CI Fixing Jenkinsfile * Removing typos from Jenkinsfile * Making some fixes to the Jenkins file; temporarily disabling MPI * Making corrections to Jenkinsfile * Correcting dockerNodes entry in Jenkinsfile * Fixed Jenkinsfile for CI * Correcting Jenkinsfile for CI * Updating README to include instructions on how to run unit tests. 
--- Jenkinsfile | 82 +++++++++++++++++++++++++++++ README.md | 12 +++++ install.sh | 98 +++++++++++++++++++++++++++++++++++ src/common.cu | 18 +++++-- test/__init__.py | 20 ++++++++ test/conftest.py | 23 +++++++++ test/test_AllGather.py | 102 +++++++++++++++++++++++++++++++++++++ test/test_AllReduce.py | 102 +++++++++++++++++++++++++++++++++++++ test/test_Broadcast.py | 102 +++++++++++++++++++++++++++++++++++++ test/test_Reduce.py | 102 +++++++++++++++++++++++++++++++++++++ test/test_ReduceScatter.py | 102 +++++++++++++++++++++++++++++++++++++ 11 files changed, 758 insertions(+), 5 deletions(-) create mode 100644 Jenkinsfile create mode 100755 install.sh create mode 100644 test/__init__.py create mode 100644 test/conftest.py create mode 100644 test/test_AllGather.py create mode 100644 test/test_AllReduce.py create mode 100644 test/test_Broadcast.py create mode 100644 test/test_Reduce.py create mode 100644 test/test_ReduceScatter.py diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 0000000000..7589636c68 --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,82 @@ +#!/usr/bin/env groovy +// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. +// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS +@Library('rocJenkins@noDocker') _ + +// This file is for internal AMD use. +// If you are interested in running your own Jenkins, please raise a github issue for assistance. 
+ +import com.amd.project.* +import com.amd.docker.* + +//////////////////////////////////////////////////////////////////////// +// Mostly generated from snippet generator 'properties; set job properties' +// Time-based triggers added to execute nightly tests, eg '30 2 * * *' means 2:30 AM +properties([ + pipelineTriggers([cron('0 1 * * *'), [$class: 'PeriodicFolderTrigger', interval: '5m']]), + buildDiscarder(logRotator( + artifactDaysToKeepStr: '', + artifactNumToKeepStr: '', + daysToKeepStr: '', + numToKeepStr: '10')), + disableConcurrentBuilds(), + [$class: 'CopyArtifactPermissionProperty', projectNames: '*'] + ]) + + +//////////////////////////////////////////////////////////////////////// +import java.nio.file.Path; + +rcclTestsCI: +{ + def rcclTests = new rocProject('rcclTests') + // customize for project + rcclTests.paths.build_command = './install.sh' + + // Define test architectures, optional rocm version argument is available + def nodes = new dockerNodes(['RCCL'], rcclTests) + + boolean formatCheck = false + + def compileCommand = + { + platform, project-> + + project.paths.construct_build_prefix() + + def command = """#!/usr/bin/env bash + set -x + rm -rf rccl + git clone https://github.com/ROCmSoftwarePlatform/rccl + cd rccl + export RCCL_PATH=${WORKSPACE}/rccl/rccl-install + ./install.sh -i --prefix=\$RCCL_PATH + cd .. 
+ cd ${project.paths.project_build_prefix} + ${project.paths.build_command} --rccl_home=\$RCCL_PATH + """ + sh command + } + def testCommand = + { + platform, project-> + + def command = """#!/usr/bin/env bash + set -x + LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:${WORKSPACE}/rccl/rccl-install/lib/ python3 -m pytest -k "not MPI" --junitxml=./testreport.xml + """ + + sh command + //junit "${project.paths.project_build_prefix}/build/release/*.xml" + } + + def packageCommand = + { + platform, project-> + + def command = """ + """ + } + + buildProjectNoDocker(rcclTests, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) +} diff --git a/README.md b/README.md index 2731d65c65..dc3120f119 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,18 @@ All tests support the same set of arguments : * `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1. * `-z,--blocking <0/1>` Make RCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. +## Unit tests + +Unit tests for rccl-tests are implemented with pytest (python3 is also required). Several notes for the unit tests: + +1. The LD_LIBRARY_PATH environment variable will need to be set to include /path/to/rccl-install/lib/ in order to run the unit tests. +2. The HSA_FORCE_FINE_GRAIN_PCIE environment variable will need to be set to 1 in order to run the unit tests which use fine-grained memory type. + +The unit tests can be invoked within the rccl-tests root, or in the test subfolder. An example call to the unit tests: +```shell +$ LD_LIBRARY_PATH=/path/to/rccl-install/lib/ HSA_FORCE_FINE_GRAIN_PCIE=1 python3 -m pytest +``` + ## Copyright RCCL tests are provided under the BSD license. diff --git a/install.sh b/install.sh new file mode 100755 index 0000000000..32e5dc4d4e --- /dev/null +++ b/install.sh @@ -0,0 +1,98 @@ +#!/bin/bash +# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
+ +# ################################################# +# helper functions +# ################################################# +function display_help() +{ + echo "RCCL-tests build & installation helper script" + echo "./install [-h|--help] " + echo " [-h|--help] Prints this help message." + echo " [-m|--mpi] Build RCCL-tests with MPI support. (see --mpi_home below.)" + echo " [--rccl_home] Specify custom path for RCCL installation (default: /opt/rocm/rccl)" + echo " [--mpi_home] Specify path to your MPI installation." +} + +# ################################################# +# global variables +# ################################################# +run_tests=false +build_release=true +mpi_enabled=false +rccl_dir=/opt/rocm/rccl +mpi_dir="" +# ################################################# +# Parameter parsing +# ################################################# + +# check if we have a modern version of getopt that can handle whitespace and long parameters +getopt -T +if [[ $? -eq 4 ]]; then + GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,mpi,test,rccl_home:,mpi_home: --options hmt -- "$@") +else + echo "Need a new version of getopt" + exit 1 +fi + +if [[ $? 
-ne 0 ]]; then + echo "getopt invocation failed; could not parse the command line"; + exit 1 +fi + +eval set -- "${GETOPT_PARSE}" + +while true; do + case "${1}" in + -h|--help) + display_help + exit 0 + ;; + -m|--mpi) + mpi_enabled=true + shift ;; + -t|--test) + run_tests=true + shift ;; + --rccl_home) + rccl_dir=${2} + shift 2 ;; + --mpi_home) + mpi_dir=${2} + shift 2 ;; + --) shift ; break ;; + *) echo "Unexpected command line parameter received; aborting"; + exit 1 + ;; + esac + done + +# Install the pre-commit hook +#bash ./githooks/install + +build_dir=./build +# ################################################# +# prep +# ################################################# +# ensure a clean build environment +rm -rf ${build_dir} + +if ($mpi_enabled); then + if [[ ${mpi_dir} -eq "" ]]; then + echo "MPI flag enabled but path to MPI installation not specified. See --mpi_home command line argument." + exit 1 + else + make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so MPI=1 MPI_HOME=${mpi_dir} -j$(nproc) + fi +else + make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so -j$(nproc) +fi + +# Optionally, run tests if they're enabled. 
+if ($run_tests); then + if ($mpi_enabled); then + cd test; LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib:${mpi_dir}/lib PATH=$PATH:${mpi_dir}/bin python3 -m pytest + else + cd test; LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib python3 -m pytest + fi +fi diff --git a/src/common.cu b/src/common.cu index 61084eb1bd..d708a7a916 100644 --- a/src/common.cu +++ b/src/common.cu @@ -292,7 +292,7 @@ void Barrier(struct threadArgs* args) args->barrier_idx=!args->barrier_idx; } -testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta, bool *error) { size_t count = args->expectedBytes/wordSize(type); double maxDelta = 0.0; for (int i=0; inGpus; i++) { @@ -327,7 +327,11 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t #endif } double nranks = args->nProcs*args->nThreads*args->nGpus; - if (maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + if (maxDelta > DeltaMaxValue(type)*(nranks - 1)) + { + args->errors[0]++; + *error = true; + } *delta = maxDelta; return testSuccess; } @@ -446,6 +450,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t Barrier(args); double maxDelta = 0; + bool error = false; static __thread int rep = 0; rep++; if (datacheck) { @@ -456,7 +461,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t TESTCHECK(startColl(args, type, op, root, in_place, 0)); TESTCHECK(completeColl(args)); - TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta, &error)); //aggregate delta from all threads and procs Barrier(args); @@ -466,6 +471,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } #ifdef MPI_SUPPORT MPI_Allreduce(MPI_IN_PLACE, &maxDelta, 1, 
MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, &error, 1, MPI_C_BOOL, MPI_LOR, MPI_COMM_WORLD); #endif } Barrier(args); @@ -481,7 +487,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t sprintf(timeStr, "%7.2f", timeUsec); } if (datacheck) { - PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + PRINT(" %7s %6.2f %6.2f %5.0le%s", timeStr, algBw, busBw, maxDelta, error ? "*" : ""); } else { PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); } @@ -757,7 +763,7 @@ testResult_t run() { #endif is_main_thread = (proc == 0) ? 1 : 0; - PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes, + PRINT("# nThread: %d nGpus: %d minBytes: %ld maxBytes: %ld step: %ld(%s) warmupIters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes, (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); @@ -887,6 +893,7 @@ testResult_t run() { for (int t=nThreads-1; t>=0; t--) { if (t) pthread_join(threads[t].thread, NULL); TESTCHECK(threads[t].ret); + if (t) { errors[0] += errors[t]; bw[0] += bw[t]; @@ -927,6 +934,7 @@ testResult_t run() { double check_avg_bw = str ? atof(str) : -1; bw[0] /= bw_count[0]; + if (datacheck) PRINT("# Errors with asterisks indicate errors that have exceeded the maximum threshold.\n"); PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK"); PRINT("# Avg bus bandwidth : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? 
"FAILED" : "OK")); PRINT("#\n"); diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000000..cfd487930d --- /dev/null +++ b/test/__init__.py @@ -0,0 +1,20 @@ +################################################################################# +# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- +# ies of the Software, and to permit persons to whom the Software is furnished +# to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- +# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- +# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +################################################################################ \ No newline at end of file diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 0000000000..79ce9b8ef8 --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,23 @@ +################################################################################# +# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- +# ies of the Software, and to permit persons to whom the Software is furnished +# to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- +# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- +# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +################################################################################ + +def pytest_addoption(parser): + parser.addoption("--hostfile", action="store", default="", help="specify MPI hostfile") \ No newline at end of file diff --git a/test/test_AllGather.py b/test/test_AllGather.py new file mode 100644 index 0000000000..2d3d74bcef --- /dev/null +++ b/test/test_AllGather.py @@ -0,0 +1,102 @@ +################################################################################# +# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- +# ies of the Software, and to permit persons to whom the Software is furnished +# to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- +# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- +# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+################################################################################ + +import os +import subprocess +import itertools + +import pytest + +nthreads = ["1"] +nprocs = ["2"] +ngpus_single = ["1","2","4"] +ngpus_mpi = ["1","2"] +byte_range = [("4", "128M")] +op = ["sum", "prod", "min", "max"] +step_factor = ["2"] +datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"] +memory_type = ["coarse","fine", "host"] + +path = os.path.dirname(os.path.abspath(__file__)) +executable = path + "/../build/all_gather_perf" + +@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type", + itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type)) +def test_AllGatherSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type): + try: + args = [executable, + "-t", nthreads, + "-g", ngpus_single, + "-b", byte_range[0], + "-e", byte_range[1], + "-o", op, + "-f", step_factor, + "-d", datatype, + "-y", memory_type] + if memory_type == "fine": + args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1") + args_str = " ".join(args) + rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True) + except subprocess.CalledProcessError as err: + print(rccl_test.stdout) + pytest.fail("AllGather test error(s) detected.") + + assert rccl_test.returncode == 0 + +@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype", + itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype)) +def test_AllGatherMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype): + try: + mpi_hostfile = request.config.getoption('--hostfile') + if not mpi_hostfile: + args = ["mpirun -np", nprocs, + executable, + "-p 1", + "-t", nthreads, + "-g", ngpus_mpi, + "-b", byte_range[0], + "-e", byte_range[1], + "-o", op, + "-f", step_factor, + "-d", datatype] + else: 
+ args = ["mpirun -np", nprocs, + "-host", mpi_hostfile, + executable, + "-p 1", + "-t", nthreads, + "-g", ngpus_mpi, + "-b", byte_range[0], + "-e", byte_range[1], + "-o", op, + "-f", step_factor, + "-d", datatype, + "-y", memory_type] + if memory_type == "fine": + args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1") + args_str = " ".join(args) + print(args_str) + rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True) + except subprocess.CalledProcessError as err: + print(rccl_test.stdout) + pytest.fail("AllGather test error(s) detected.") + + assert rccl_test.returncode == 0 \ No newline at end of file diff --git a/test/test_AllReduce.py b/test/test_AllReduce.py new file mode 100644 index 0000000000..b3cb5f99ff --- /dev/null +++ b/test/test_AllReduce.py @@ -0,0 +1,102 @@ +################################################################################# +# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- +# ies of the Software, and to permit persons to whom the Software is furnished +# to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- +# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- +# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +################################################################################ + +import os +import subprocess +import itertools + +import pytest + +nthreads = ["1"] +nprocs = ["2"] +ngpus_single = ["1","2","4"] +ngpus_mpi = ["1","2"] +byte_range = [("4", "128M")] +op = ["sum", "prod", "min", "max"] +step_factor = ["2"] +datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"] +memory_type = ["coarse","fine", "host"] + +path = os.path.dirname(os.path.abspath(__file__)) +executable = path + "/../build/all_reduce_perf" + +@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type", + itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type)) +def test_AllReduceSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type): + try: + args = [executable, + "-t", nthreads, + "-g", ngpus_single, + "-b", byte_range[0], + "-e", byte_range[1], + "-o", op, + "-f", step_factor, + "-d", datatype, + "-y", memory_type] + if memory_type == "fine": + args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1") + args_str = " ".join(args) + rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True) + except subprocess.CalledProcessError as err: + print(rccl_test.stdout) + pytest.fail("AllReduce test error(s) detected.") + + assert rccl_test.returncode == 0 + +@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype", + itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype)) +def test_AllReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype): + try: + 
mpi_hostfile = request.config.getoption('--hostfile') + if not mpi_hostfile: + args = ["mpirun -np", nprocs, + executable, + "-p 1", + "-t", nthreads, + "-g", ngpus_mpi, + "-b", byte_range[0], + "-e", byte_range[1], + "-o", op, + "-f", step_factor, + "-d", datatype] + else: + args = ["mpirun -np", nprocs, + "-host", mpi_hostfile, + executable, + "-p 1", + "-t", nthreads, + "-g", ngpus_mpi, + "-b", byte_range[0], + "-e", byte_range[1], + "-o", op, + "-f", step_factor, + "-d", datatype, + "-y", memory_type] + if memory_type == "fine": + args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1") + args_str = " ".join(args) + print(args_str) + rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True) + except subprocess.CalledProcessError as err: + print(rccl_test.stdout) + pytest.fail("AllReduce test error(s) detected.") + + assert rccl_test.returncode == 0 \ No newline at end of file diff --git a/test/test_Broadcast.py b/test/test_Broadcast.py new file mode 100644 index 0000000000..f4b8b38363 --- /dev/null +++ b/test/test_Broadcast.py @@ -0,0 +1,102 @@ +################################################################################# +# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- +# ies of the Software, and to permit persons to whom the Software is furnished +# to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- +# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- +# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +################################################################################ + +import os +import subprocess +import itertools + +import pytest + +nthreads = ["1"] +nprocs = ["2"] +ngpus_single = ["1","2","4"] +ngpus_mpi = ["1","2"] +byte_range = [("4", "128M")] +op = ["sum", "prod", "min", "max"] +step_factor = ["2"] +datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"] +memory_type = ["coarse","fine", "host"] + +path = os.path.dirname(os.path.abspath(__file__)) +executable = path + "/../build/broadcast_perf" + +@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type", + itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type)) +def test_BroadcastSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type): + try: + args = [executable, + "-t", nthreads, + "-g", ngpus_single, + "-b", byte_range[0], + "-e", byte_range[1], + "-o", op, + "-f", step_factor, + "-d", datatype, + "-y", memory_type] + if memory_type == "fine": + args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1") + args_str = " ".join(args) + rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True) + except subprocess.CalledProcessError as err: + print(rccl_test.stdout) + pytest.fail("Broadcast test error(s) detected.") + + assert rccl_test.returncode == 0 + +@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, 
datatype", + itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype)) +def test_BroadcastMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype): + try: + mpi_hostfile = request.config.getoption('--hostfile') + if not mpi_hostfile: + args = ["mpirun -np", nprocs, + executable, + "-p 1", + "-t", nthreads, + "-g", ngpus_mpi, + "-b", byte_range[0], + "-e", byte_range[1], + "-o", op, + "-f", step_factor, + "-d", datatype] + else: + args = ["mpirun -np", nprocs, + "-host", mpi_hostfile, + executable, + "-p 1", + "-t", nthreads, + "-g", ngpus_mpi, + "-b", byte_range[0], + "-e", byte_range[1], + "-o", op, + "-f", step_factor, + "-d", datatype, + "-y", memory_type] + if memory_type == "fine": + args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1") + args_str = " ".join(args) + print(args_str) + rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True) + except subprocess.CalledProcessError as err: + print(rccl_test.stdout) + pytest.fail("Broadcast test error(s) detected.") + + assert rccl_test.returncode == 0 \ No newline at end of file diff --git a/test/test_Reduce.py b/test/test_Reduce.py new file mode 100644 index 0000000000..5df694490d --- /dev/null +++ b/test/test_Reduce.py @@ -0,0 +1,102 @@ +################################################################################# +# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- +# ies of the Software, and to permit persons to whom the Software is furnished +# to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- +# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- +# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+################################################################################ + +import os +import subprocess +import itertools + +import pytest + +nthreads = ["1"] +nprocs = ["2"] +ngpus_single = ["1","2","4"] +ngpus_mpi = ["1","2"] +byte_range = [("4", "128M")] +op = ["sum", "prod", "min", "max"] +step_factor = ["2"] +datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"] +memory_type = ["coarse","fine", "host"] + +path = os.path.dirname(os.path.abspath(__file__)) +executable = path + "/../build/reduce_perf" + +@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type", + itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type)) +def test_ReduceSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type): + try: + args = [executable, + "-t", nthreads, + "-g", ngpus_single, + "-b", byte_range[0], + "-e", byte_range[1], + "-o", op, + "-f", step_factor, + "-d", datatype, + "-y", memory_type] + if memory_type == "fine": + args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1") + args_str = " ".join(args) + rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True) + except subprocess.CalledProcessError as err: + print(rccl_test.stdout) + pytest.fail("Reduce test error(s) detected.") + + assert rccl_test.returncode == 0 + +@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype", + itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype)) +def test_ReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype): + try: + mpi_hostfile = request.config.getoption('--hostfile') + if not mpi_hostfile: + args = ["mpirun -np", nprocs, + executable, + "-p 1", + "-t", nthreads, + "-g", ngpus_mpi, + "-b", byte_range[0], + "-e", byte_range[1], + "-o", op, + "-f", step_factor, + "-d", datatype] + else: + args = 
["mpirun -np", nprocs, + "-host", mpi_hostfile, + executable, + "-p 1", + "-t", nthreads, + "-g", ngpus_mpi, + "-b", byte_range[0], + "-e", byte_range[1], + "-o", op, + "-f", step_factor, + "-d", datatype, + "-y", memory_type] + if memory_type == "fine": + args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1") + args_str = " ".join(args) + print(args_str) + rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True) + except subprocess.CalledProcessError as err: + print(rccl_test.stdout) + pytest.fail("Reduce test error(s) detected.") + + assert rccl_test.returncode == 0 \ No newline at end of file diff --git a/test/test_ReduceScatter.py b/test/test_ReduceScatter.py new file mode 100644 index 0000000000..66b431b00a --- /dev/null +++ b/test/test_ReduceScatter.py @@ -0,0 +1,102 @@ +################################################################################# +# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- +# ies of the Software, and to permit persons to whom the Software is furnished +# to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- +# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- +# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +################################################################################ + +import os +import subprocess +import itertools + +import pytest + +nthreads = ["1"] +nprocs = ["2"] +ngpus_single = ["1","2","4"] +ngpus_mpi = ["1","2"] +byte_range = [("4", "128M")] +op = ["sum", "prod", "min", "max"] +step_factor = ["2"] +datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"] +memory_type = ["coarse","fine", "host"] + +path = os.path.dirname(os.path.abspath(__file__)) +executable = path + "/../build/reduce_scatter_perf" + +@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type", + itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type)) +def test_ReduceScatterSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type): + try: + args = [executable, + "-t", nthreads, + "-g", ngpus_single, + "-b", byte_range[0], + "-e", byte_range[1], + "-o", op, + "-f", step_factor, + "-d", datatype, + "-y", memory_type] + if memory_type == "fine": + args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1") + args_str = " ".join(args) + rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True) + except subprocess.CalledProcessError as err: + print(rccl_test.stdout) + pytest.fail("ReduceScatter test error(s) detected.") + + assert rccl_test.returncode == 0 + +@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype", + itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype)) +def test_ReduceScatterMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, 
datatype): + try: + mpi_hostfile = request.config.getoption('--hostfile') + if not mpi_hostfile: + args = ["mpirun -np", nprocs, + executable, + "-p 1", + "-t", nthreads, + "-g", ngpus_mpi, + "-b", byte_range[0], + "-e", byte_range[1], + "-o", op, + "-f", step_factor, + "-d", datatype] + else: + args = ["mpirun -np", nprocs, + "-host", mpi_hostfile, + executable, + "-p 1", + "-t", nthreads, + "-g", ngpus_mpi, + "-b", byte_range[0], + "-e", byte_range[1], + "-o", op, + "-f", step_factor, + "-d", datatype, + "-y", memory_type] + if memory_type == "fine": + args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1") + args_str = " ".join(args) + print(args_str) + rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True) + except subprocess.CalledProcessError as err: + print(rccl_test.stdout) + pytest.fail("ReduceScatter test error(s) detected.") + + assert rccl_test.returncode == 0 \ No newline at end of file From 043eef69996a825698c5679d8419ee12768740d0 Mon Sep 17 00:00:00 2001 From: Gilbert Lee Date: Thu, 11 Jul 2019 15:36:21 +0000 Subject: [PATCH 025/233] Checking that number of requested GPUs is not more than number of available GPUs --- src/common.cu | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/common.cu b/src/common.cu index d708a7a916..d4d528ea7c 100644 --- a/src/common.cu +++ b/src/common.cu @@ -738,6 +738,15 @@ int main(int argc, char* argv[]) { return 0; } } + + int numDevices; + HIPCHECK(hipGetDeviceCount(&numDevices)); + if (nGpus > numDevices) + { + fprintf(stderr, "[ERROR] The number of requested GPUs (%d) is greater than the number of GPUs available (%d)\n", nGpus, numDevices); + return testNcclError; + } + #ifdef MPI_SUPPORT MPI_Init(&argc, &argv); #endif From 23c374475f0472a06b461ad5ba5d09b5312a1f3c Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Fri, 26 Jul 2019 00:12:41 +0000 Subject: [PATCH 026/233] Allow call ncclCommAbort on Ctrl+C --- src/common.cu | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 
deletion(-) diff --git a/src/common.cu b/src/common.cu index d4d528ea7c..4f97a4847f 100644 --- a/src/common.cu +++ b/src/common.cu @@ -12,6 +12,7 @@ #include #include #include +#include #if NCCL_MAJOR >= 2 ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble}; @@ -43,6 +44,7 @@ static int ncclroot = 0; static int parallel_init = 0; static int blocking_coll = 0; static int memorytype = 0; +static ncclResult_t ncclabort = ncclSuccess; double parsesize(char *value) { long long int units; @@ -336,6 +338,21 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t return testSuccess; } +void INThandler(int sig) { + char c; + + signal(sig, SIG_IGN); + printf("\nDo you want to call ncclCommAbort before exit? [y/n] "); + c = getchar(); + if (c == 'y' || c == 'Y') { + ncclabort = ncclSystemError; + signal(SIGINT, INThandler); + } + else + exit (0); + getchar(); // Get new line character +} + testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* comms) { hipError_t hipErr; int remaining = ngpus; @@ -361,13 +378,17 @@ testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* if (comms) { ncclResult_t ncclAsyncErr; NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); - if (ncclAsyncErr != ncclSuccess) { + if (ncclAsyncErr != ncclSuccess || ncclabort != ncclSuccess) { // An asynchronous error happened. 
Stop the operation and destroy // the communicator for (int i=0; i= 2 +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) + // may call ncclCommAbort + signal(SIGINT, INThandler); +#endif +#endif // Make sure everyline is flushed so that we see the progress of the test setlinebuf(stdout); From ab82f1af6f5a373bb71fe0c44b8d772db24a03af Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Fri, 9 Aug 2019 10:22:14 -0700 Subject: [PATCH 027/233] Fix memory leak and possible buffer overrun --- src/common.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/common.cu b/src/common.cu index 4f97a4847f..36f2aa0941 100644 --- a/src/common.cu +++ b/src/common.cu @@ -398,6 +398,7 @@ testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* // We might want to let other threads (including NCCL threads) use the CPU. if (idle) pthread_yield(); } + free(done); return testSuccess; } @@ -814,7 +815,7 @@ testResult_t run() { int rank = proc*nThreads*nGpus+i; hipDeviceProp_t prop; HIPCHECK(hipGetDeviceProperties(&prop, hipDev)); - len += snprintf(line+len, MAX_LINE-len, "# Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n", + len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "# Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n", rank, getpid(), hostname, hipDev, prop.pciBusID, prop.name); } From ca7a565236ce9353d1fe56026afa5a2b0e7bb9f1 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Fri, 16 Aug 2019 09:06:28 -0700 Subject: [PATCH 028/233] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7a4bbbc6ca..b8b65676b0 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # NCCL Tests -These tests check both the performance and the correctness of NCCL operations. They can be compiled against [NCCL](http://github.com/nvidia/nccl) +These tests check both the performance and the correctness of [NCCL](http://github.com/nvidia/nccl) operations. 
## Build From 13d0ddd12e93d72ce69a083e5811439a3b658f73 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Wed, 25 Sep 2019 14:07:04 -0700 Subject: [PATCH 029/233] Init data for throughput iterations to avoid all zero data --- src/common.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/common.cu b/src/common.cu index 36f2aa0941..e13aa3521b 100644 --- a/src/common.cu +++ b/src/common.cu @@ -851,6 +851,9 @@ testResult_t run() { HIPCHECK(hipSetDevice(localRank*nThreads*nGpus+i)); AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus); HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking)); + // initialize data buffer to avoid all zero data + TESTCHECK(InitData(sendbuffs[i], maxBytes, ncclUint8, 0, i)); + HIPCHECK(hipDeviceSynchronize()); } //if parallel init is not selected, use main thread to initialize NCCL From a2af1d959dd01a68b55a8f31aa44538d58dc0c35 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Thu, 10 Oct 2019 10:51:05 -0700 Subject: [PATCH 030/233] Update README.md Checks are now fully local, no need to disable them at scale. 
--- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b8b65676b0..791bed2599 100644 --- a/README.md +++ b/README.md @@ -29,9 +29,9 @@ Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes : $ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8 ``` -Run with MPI on 40 processes (potentially on multiple nodes) with 4 GPUs each, disabling checks : +Run with MPI on 40 processes (potentially on multiple nodes) with 4 GPUs each : ```shell -$ mpirun -np 40 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4 -c 0 +$ mpirun -np 40 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4 ``` ### Performance From 24eb972cae0f0cbe228ddcf9d653db38688d8cfd Mon Sep 17 00:00:00 2001 From: Pak Lui <5041261+paklui@users.noreply.github.com> Date: Thu, 17 Oct 2019 15:38:37 -0700 Subject: [PATCH 031/233] fix syntax error for string comparison --- install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install.sh b/install.sh index 32e5dc4d4e..7c8a865ef5 100755 --- a/install.sh +++ b/install.sh @@ -78,7 +78,7 @@ build_dir=./build rm -rf ${build_dir} if ($mpi_enabled); then - if [[ ${mpi_dir} -eq "" ]]; then + if [[ ${mpi_dir} == "" ]]; then echo "MPI flag enabled but path to MPI installation not specified. See --mpi_home command line argument." 
exit 1 else From aa0f02bee034b85450075c5c0d0ed373f42d8e2d Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Mon, 11 Nov 2019 11:37:45 -0800 Subject: [PATCH 032/233] Fix incorrect print out when data size is greater than 4GB --- src/common.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/common.cu b/src/common.cu index e13aa3521b..af0e2d22dd 100644 --- a/src/common.cu +++ b/src/common.cu @@ -13,6 +13,7 @@ #include #include #include +#include #if NCCL_MAJOR >= 2 ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble}; @@ -551,7 +552,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* // Benchmark for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { setupArgs(size, type, args); - print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + print_line_header(std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); TESTCHECK(BenchTime(args, type, op, root, 0)); TESTCHECK(BenchTime(args, type, op, root, 1)); PRINT("\n"); From 32399955afc00d566d481e56c050bf22dcd11e76 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Tue, 12 Nov 2019 23:03:59 +0000 Subject: [PATCH 033/233] Fix build with RCCL 1.x API --- src/common.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/common.cu b/src/common.cu index af0e2d22dd..5bf78eeebe 100644 --- a/src/common.cu +++ b/src/common.cu @@ -853,7 +853,11 @@ testResult_t run() { AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus); HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking)); // initialize data buffer to avoid all zero data +#if NCCL_MAJOR >= 2 TESTCHECK(InitData(sendbuffs[i], maxBytes, ncclUint8, 0, i)); +#else + 
TESTCHECK(InitData(sendbuffs[i], maxBytes, ncclChar, 0, i)); +#endif HIPCHECK(hipDeviceSynchronize()); } From 1cda2f52b6a3b96a5035c049519a988f8e3bccfa Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Fri, 15 Nov 2019 13:46:03 -0800 Subject: [PATCH 034/233] Add bf16 support in rccl-tests --- src/Makefile | 3 +- src/common.cu | 26 ++++- src/common.h | 4 + src/rccl_bfloat16.h | 253 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 283 insertions(+), 3 deletions(-) create mode 100644 src/rccl_bfloat16.h diff --git a/src/Makefile b/src/Makefile index 78470b8f48..157a351e5c 100644 --- a/src/Makefile +++ b/src/Makefile @@ -15,11 +15,10 @@ HIPCC = $(ROCM_HOME)/hip/bin/hipcc CXX = $(HIPCC) -HIPCUFLAGS := +HIPCUFLAGS := -std=c++14 HIPCUFLAGS += -I$(ROCM_HOME)/include HIPCUFLAGS += -I$(ROCM_HOME)/include/rccl HIPCUFLAGS += -I$(ROCM_HOME)/hip/include/hip -HIPCUFLAGS += -I$(ROCM_HOME)/hiprand/include LDFLAGS := -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt HIPLDFLAGS := $(CUSTOM_RCCL_LIB) -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt diff --git a/src/common.cu b/src/common.cu index 5bf78eeebe..07ebcd90a3 100644 --- a/src/common.cu +++ b/src/common.cu @@ -7,6 +7,7 @@ ************************************************************************/ #include "hip/hip_runtime.h" +#include "rccl_bfloat16.h" #include "common.h" #include #include @@ -16,8 +17,13 @@ #include #if NCCL_MAJOR >= 2 +#if RCCL_BFLOAT16 == 1 +ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble, ncclBfloat16}; +const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double", "bf16"}; +#else ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble}; const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"}; +#endif #else 
ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; @@ -78,6 +84,9 @@ double DeltaMaxValue(ncclDataType_t type) { #endif case ncclInt64: case ncclUint64: return 1e-200; +#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1 + case ncclBfloat16: return 1e-2; +#endif } return 1e-200; } @@ -155,6 +164,10 @@ testResult_t CheckDelta(void* expected, void* results, size_t count, ncclDataTyp case ncclInt64: case ncclUint64: hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; +#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1 + case ncclBfloat16: + hipLaunchKernelGGL((deltaKern), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break; +#endif } HIPCHECK(hipDeviceSynchronize()); return testSuccess; @@ -181,6 +194,10 @@ template<> __device__ half testValue(const size_t offset, const int rep, const int rank) { return __float2half(testValue(offset, rep, rank)); } +template<> +__device__ rccl_bfloat16 testValue(const size_t offset, const int rep, const int rank) { + return (float)testValue(offset, rep, rank); +} // Operations template @@ -220,7 +237,11 @@ typedef void(*redInitKern_t)(void* data, const size_t N, const size_t offset, co static redInitKern_t const redInitDataKerns[ncclNumOps*ncclNumTypes] = { #if NCCL_MAJOR >= 2 +#if RCCL_BFLOAT16 == 1 + OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), OPS(rccl_bfloat16) +#else OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double) +#endif #else OPS(char), OPS(int32_t), OPS(half), OPS(float), OPS(double), OPS(int64_t), OPS(uint64_t) #endif @@ -251,7 +272,10 @@ static initDataKern_t const initDataKerns[ncclNumTypes] = { InitDataKernel, InitDataKernel< half>, InitDataKernel< float>, - 
InitDataKernel< double> + InitDataKernel< double>, +#if RCCL_BFLOAT16 == 1 + InitDataKernel +#endif #else InitDataKernel< char>, InitDataKernel< int32_t>, diff --git a/src/common.h b/src/common.h index dd98d547df..54f216c9c5 100644 --- a/src/common.h +++ b/src/common.h @@ -195,6 +195,10 @@ static size_t wordSize(ncclDataType_t type) { case ncclDouble: //case ncclFloat64: return 8; +#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1 + case ncclBfloat16: + return 2; +#endif default: return 0; } } diff --git a/src/rccl_bfloat16.h b/src/rccl_bfloat16.h new file mode 100644 index 0000000000..06b053a626 --- /dev/null +++ b/src/rccl_bfloat16.h @@ -0,0 +1,253 @@ +/** + * MIT License + * + * Copyright 2019 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +/*!\file + * \brief rccl_bfloat16.h provides struct for rccl_bfloat16 typedef + */ + +#ifndef _RCCL_BFLOAT16_H_ +#define _RCCL_BFLOAT16_H_ + +#if __cplusplus < 201402L || (!defined(__HCC__) && !defined(__HIPCC__)) + +// If this is a C compiler, C++ compiler below C++14, or a host-only compiler, we only +// include a minimal definition of rccl_bfloat16 + +#include <stdint.h> +/*! \brief Struct to represent a 16 bit brain floating point number. */ +typedef struct +{ + uint16_t data; +} rccl_bfloat16; + +#else // __cplusplus < 201402L || (!defined(__HCC__) && !defined(__HIPCC__)) + +#include <cmath> +#include <cstddef> +#include <cstdint> +#include <hip/hip_runtime.h> +#include <ostream> +#include <type_traits> + +struct rccl_bfloat16 +{ + uint16_t data; + + __host__ __device__ rccl_bfloat16() = default; + + // round upper 16 bits of IEEE float to convert to bfloat16 + explicit constexpr __host__ __device__ rccl_bfloat16(float f) + : data(float_to_bfloat16(f)) + { + } + + // zero extend lower 16 bits of bfloat16 to convert to IEEE float + constexpr __host__ __device__ operator float() const + { + union + { + uint32_t int32; + float fp32; + } u = {uint32_t(data) << 16}; + return u.fp32; + } + +private: + static constexpr __host__ __device__ uint16_t float_to_bfloat16(float f) + { + union + { + float fp32; + uint32_t int32; + } u = {f}; + if(~u.int32 & 0x7f800000) + { + // When the exponent bits are not all 1s, then the value is zero, normal, + // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus + // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd). + // This causes the bfloat16's mantissa to be incremented by 1 if the 16 + // least significant bits of the float mantissa are greater than 0x8000, + // or if they are equal to 0x8000 and the least significant bit of the + // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when + // the lower 16 bits are exactly 0x8000. 
If the bfloat16 mantissa already + // has the value 0x7f, then incrementing it causes it to become 0x00 and + // the exponent is incremented by one, which is the next higher FP value + // to the unrounded bfloat16 value. When the bfloat16 value is subnormal + // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up + // to a normal value with an exponent of 0x01 and a mantissa of 0x00. + // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F, + // incrementing it causes it to become an exponent of 0xFF and a mantissa + // of 0x00, which is Inf, the next higher value to the unrounded value. + u.int32 += 0x7fff + ((u.int32 >> 16) & 1); // Round to nearest, round to even + } + else if(u.int32 & 0xffff) + { + // When all of the exponent bits are 1, the value is Inf or NaN. + // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero + // mantissa bit. Quiet NaN is indicated by the most significant mantissa + // bit being 1. Signaling NaN is indicated by the most significant + // mantissa bit being 0 but some other bit(s) being 1. If any of the + // lower 16 bits of the mantissa are 1, we set the least significant bit + // of the bfloat16 mantissa, in order to preserve signaling NaN in case + // the bfloat16's mantissa bits are all 0. 
+ u.int32 |= 0x10000; // Preserve signaling NaN + } + return uint16_t(u.int32 >> 16); + } +}; + +typedef struct +{ + uint16_t data; +} rccl_bfloat16_public; + +static_assert(std::is_standard_layout<rccl_bfloat16>{}, + "rccl_bfloat16 is not a standard layout type, and thus is " + "incompatible with C."); + +static_assert(std::is_trivial<rccl_bfloat16>{}, + "rccl_bfloat16 is not a trivial type, and thus is " + "incompatible with C."); + +static_assert(sizeof(rccl_bfloat16) == sizeof(rccl_bfloat16_public) + && offsetof(rccl_bfloat16, data) == offsetof(rccl_bfloat16_public, data), + "internal rccl_bfloat16 does not match public rccl_bfloat16"); + +inline std::ostream& operator<<(std::ostream& os, const rccl_bfloat16& bf16) +{ + return os << float(bf16); +} +constexpr __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a) +{ + return a; +} +constexpr __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a) +{ + a.data ^= 0x8000; + return a; +} +constexpr __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a, rccl_bfloat16 b) +{ + return rccl_bfloat16(float(a) + float(b)); +} +constexpr __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a, rccl_bfloat16 b) +{ + return rccl_bfloat16(float(a) - float(b)); +} +constexpr __host__ __device__ rccl_bfloat16 operator*(rccl_bfloat16 a, rccl_bfloat16 b) +{ + return rccl_bfloat16(float(a) * float(b)); +} +constexpr __host__ __device__ rccl_bfloat16 operator/(rccl_bfloat16 a, rccl_bfloat16 b) +{ + return rccl_bfloat16(float(a) / float(b)); +} +constexpr __host__ __device__ bool operator<(rccl_bfloat16 a, rccl_bfloat16 b) +{ + return float(a) < float(b); +} +constexpr __host__ __device__ bool operator==(rccl_bfloat16 a, rccl_bfloat16 b) +{ + return float(a) == float(b); +} +constexpr __host__ __device__ bool operator>(rccl_bfloat16 a, rccl_bfloat16 b) +{ + return b < a; +} +constexpr __host__ __device__ bool operator<=(rccl_bfloat16 a, rccl_bfloat16 b) +{ + return !(a > b); +} +constexpr __host__ __device__ bool operator!=(rccl_bfloat16 a, 
rccl_bfloat16 b) +{ + return !(a == b); +} +constexpr __host__ __device__ bool operator>=(rccl_bfloat16 a, rccl_bfloat16 b) +{ + return !(a < b); +} +constexpr __host__ __device__ rccl_bfloat16& operator+=(rccl_bfloat16& a, rccl_bfloat16 b) +{ + return a = a + b; +} +constexpr __host__ __device__ rccl_bfloat16& operator-=(rccl_bfloat16& a, rccl_bfloat16 b) +{ + return a = a - b; +} +constexpr __host__ __device__ rccl_bfloat16& operator*=(rccl_bfloat16& a, rccl_bfloat16 b) +{ + return a = a * b; +} +constexpr __host__ __device__ rccl_bfloat16& operator/=(rccl_bfloat16& a, rccl_bfloat16 b) +{ + return a = a / b; +} +constexpr __host__ __device__ rccl_bfloat16& operator++(rccl_bfloat16& a) +{ + return a += rccl_bfloat16(1.0f); +} +constexpr __host__ __device__ rccl_bfloat16& operator--(rccl_bfloat16& a) +{ + return a -= rccl_bfloat16(1.0f); +} +constexpr __host__ __device__ rccl_bfloat16 operator++(rccl_bfloat16& a, int) +{ + rccl_bfloat16 orig = a; + ++a; + return orig; +} +constexpr __host__ __device__ rccl_bfloat16 operator--(rccl_bfloat16& a, int) +{ + rccl_bfloat16 orig = a; + --a; + return orig; +} + +namespace std +{ + constexpr __host__ __device__ bool isinf(rccl_bfloat16 a) + { + return !(~a.data & 0x7f80) && !(a.data & 0x7f); + } + constexpr __host__ __device__ bool isnan(rccl_bfloat16 a) + { + return !(~a.data & 0x7f80) && +(a.data & 0x7f); + } + constexpr __host__ __device__ bool iszero(rccl_bfloat16 a) + { + return !(a.data & 0x7fff); + } + inline rccl_bfloat16 sin(rccl_bfloat16 a) + { + return rccl_bfloat16(sinf(float(a))); + } + inline rccl_bfloat16 cos(rccl_bfloat16 a) + { + return rccl_bfloat16(cosf(float(a))); + } +} + +#endif // __cplusplus < 201402L || (!defined(__HCC__) && !defined(__HIPCC__)) + +#endif // _RCCL_BFLOAT16_H_ From bd53e98df32f0eff71556bdfb74543e1dda3c1d7 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Fri, 22 Nov 2019 10:29:32 -0800 Subject: [PATCH 035/233] Fix build error with hip-clang --- src/common.cu | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/common.cu b/src/common.cu index 07ebcd90a3..3edba38831 100644 --- a/src/common.cu +++ b/src/common.cu @@ -196,7 +196,7 @@ __device__ half testValue(const size_t offset, const int rep, const int ra } template<> __device__ rccl_bfloat16 testValue(const size_t offset, const int rep, const int rank) { - return (float)testValue(offset, rep, rank); + return rccl_bfloat16(testValue(offset, rep, rank)); } // Operations From 0f173234bb2837327d806e9e4de9af3dda9a7043 Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Mon, 16 Dec 2019 16:18:22 -0800 Subject: [PATCH 036/233] Add -L$(MPI_HOME)/lib64 to NVLDFLAGS In some cases, the MPI library is not in $(MPI_HOME)/lib but in $(MPI_HOME)/lib64. For example, on RedHat like Linux system (CentOS, Amazon Linux), and MPI is installed by yum or rpm. Under such circumstance, the current make file will cause failure. This patch address this issue by adding -L$(MPI_HOME)/lib64 to NVLDFLAGS in src/Makefile. Signed-off-by: Wei Zhang --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index 034cc672fa..ed723d4210 100644 --- a/src/Makefile +++ b/src/Makefile @@ -52,7 +52,7 @@ endif ifeq ($(MPI), 1) NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include -NVLDFLAGS += -L$(MPI_HOME)/lib -lmpi +NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi endif LIBRARIES += curand nccl nvToolsExt NVLDFLAGS += $(LIBRARIES:%=-l%) From 6e9e05972b3dfd709304beaebd7359f7a232cb44 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Wed, 11 Mar 2020 13:40:17 -0700 Subject: [PATCH 037/233] Add option for stress testing --- src/common.cu | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/common.cu b/src/common.cu index 3edba38831..6aa2a02450 100644 --- a/src/common.cu +++ b/src/common.cu @@ -51,6 +51,7 @@ static int ncclroot = 0; static int parallel_init = 0; static int blocking_coll = 0; static int memorytype = 0; +static int 
stress_cycles = 1; static ncclResult_t ncclabort = ncclSuccess; double parsesize(char *value) { @@ -573,13 +574,16 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* } TESTCHECK(completeColl(args)); - // Benchmark - for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { - setupArgs(size, type, args); - print_line_header(std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); - TESTCHECK(BenchTime(args, type, op, root, 0)); - TESTCHECK(BenchTime(args, type, op, root, 1)); - PRINT("\n"); + for (size_t iter = 0; iter < stress_cycles; iter++) { + if (iter > 0) PRINT("# Testing %d cycle.\n", iter+1); + // Benchmark + for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { + setupArgs(size, type, args); + print_line_header(std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + TESTCHECK(BenchTime(args, type, op, root, 0)); + TESTCHECK(BenchTime(args, type, op, root, 1)); + PRINT("\n"); + } } return testSuccess; } @@ -683,12 +687,13 @@ int main(int argc, char* argv[]) { {"root", required_argument, 0, 'r'}, {"blocking", required_argument, 0, 'z'}, {"memory_type", required_argument, 0, 'y'}, + {"stress_cycles", required_argument, 0, 's'}, {"help", no_argument, 0, 'h'} }; while(1) { int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:h", longopts, &longindex); + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:s:h", longopts, &longindex); if (c == -1) break; @@ -746,6 +751,9 @@ int main(int argc, char* argv[]) { case 'y': memorytype = ncclstringtomtype(optarg); break; + case 's': + stress_cycles = strtol(optarg, NULL, 0); + break; case 'h': printf("USAGE: %s \n\t" "[-t,--nthreads ] \n\t" From 119a0ecf600f30d6b82897126f6301e15b6582b8 Mon Sep 17 00:00:00 
2001 From: Sylvain Jeaugey Date: Tue, 17 Mar 2020 12:00:19 -0700 Subject: [PATCH 038/233] Add alltoall perf test --- src/Makefile | 2 +- src/alltoall.cu | 117 ++++++++++++++++++++++++++++++++++++++++++++++++ src/common.cu | 4 +- src/common.h | 14 +++--- 4 files changed, 129 insertions(+), 8 deletions(-) create mode 100644 src/alltoall.cu diff --git a/src/Makefile b/src/Makefile index 034cc672fa..33ca479422 100644 --- a/src/Makefile +++ b/src/Makefile @@ -60,7 +60,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce +BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src/alltoall.cu b/src/alltoall.cu new file mode 100644 index 0000000000..aea9370f65 --- /dev/null +++ b/src/alltoall.cu @@ -0,0 +1,117 @@ +/************************************************************************* + * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common.h" + +void print_header() { + PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %6s %6s", size, count, typeName, opName); +} + +void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = (count/nranks)*nranks; + *recvcount = (count/nranks)*nranks; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = count/nranks; +} + +testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + char* str = getenv("NCCL_TESTS_DEVICE"); + int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + for (int j=0; jexpected[i])+args->sendBytes/nranks*j, sendcount/nranks, type, rep+rank*sendcount/nranks, j)); + } + CUDACHECK(cudaDeviceSynchronize()); + } + // We don't support in-place alltoall + args->reportErrors = in_place ? 0 : 1; + return testSuccess; +} + +void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(nranks-1))/((double)(nranks)); + *busBw = baseBw * factor; +} + +testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + int nRanks; + NCCLCHECK(ncclCommCount(comm, &nRanks)); + size_t rankOffset = count * wordSize(type); + if (count == 0) return testSuccess; + + NCCLCHECK(ncclGroupStart()); + for (int r=0; rcollTest = &alltoAllTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = ncclNumTypes; + run_types = test_types; + run_typenames = test_typenames; + } + + for (int i=0; inProcs*args->nThreads*args->nGpus; - if (maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; + if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; *delta = maxDelta; return testSuccess; } @@ -832,6 +832,8 @@ testResult_t run() { threads[t].args.bw=bw+t; threads[t].args.bw_count=bw_count+t; + threads[t].args.reportErrors = 1; + threads[t].func = parallel_init ? 
threadInit : threadRunTests; if (t) TESTCHECK(threadLaunch(threads+t)); diff --git a/src/common.h b/src/common.h index 8fb5b8cadf..a2d7ae2958 100644 --- a/src/common.h +++ b/src/common.h @@ -17,25 +17,25 @@ #include "nccl1_compat.h" #define CUDACHECK(cmd) do { \ - cudaError_t e = cmd; \ - if( e != cudaSuccess ) { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ char hostname[1024]; \ getHostName(hostname, 1024); \ printf("%s: Test CUDA failure %s:%d '%s'\n", \ hostname, \ - __FILE__,__LINE__,cudaGetErrorString(e)); \ + __FILE__,__LINE__,cudaGetErrorString(err)); \ return testCudaError; \ } \ } while(0) #define NCCLCHECK(cmd) do { \ - ncclResult_t r = cmd; \ - if (r!= ncclSuccess) { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ char hostname[1024]; \ getHostName(hostname, 1024); \ printf("%s: Test NCCL failure %s:%d '%s'\n", \ hostname, \ - __FILE__,__LINE__,ncclGetErrorString(r)); \ + __FILE__,__LINE__,ncclGetErrorString(res)); \ return testNcclError; \ } \ } while(0) @@ -124,6 +124,8 @@ struct threadArgs { double* bw; int* bw_count; + int reportErrors; + struct testColl* collTest; }; From 6932a583e74c7685fcc4e7206e367d917f3e0485 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Thu, 19 Mar 2020 10:18:39 -0700 Subject: [PATCH 039/233] Add gather and scatter test --- src/Makefile | 2 +- src/common.cu | 21 ++++---- src/gather.cu | 127 ++++++++++++++++++++++++++++++++++++++++++++++++ src/scatter.cu | 129 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 269 insertions(+), 10 deletions(-) create mode 100644 src/gather.cu create mode 100644 src/scatter.cu diff --git a/src/Makefile b/src/Makefile index b02d6e886d..e109b8c3ae 100644 --- a/src/Makefile +++ b/src/Makefile @@ -54,7 +54,7 @@ HIPLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall +BIN_FILES_LIST := all_reduce 
all_gather broadcast reduce_scatter reduce alltoall gather scatter BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src/common.cu b/src/common.cu index d14c286125..908a69b9c5 100644 --- a/src/common.cu +++ b/src/common.cu @@ -16,6 +16,8 @@ #include #include +//#define DEBUG_PRINT + #if NCCL_MAJOR >= 2 #if RCCL_BFLOAT16 == 1 ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble, ncclBfloat16}; @@ -326,6 +328,8 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t for (int i=0; inGpus; i++) { int device; int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + if (rank != root && strcmp(args->collTest->name, "Gather") == 0) + continue; NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); HIPCHECK(hipSetDevice(device)); void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; @@ -333,25 +337,24 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t maxDelta = std::max(*(args->deltaHost), maxDelta); #ifdef DEBUG_PRINT - if (rank == 0) { + //if (rank == 0) { int *expectedHost = (int *)malloc(args->expectedBytes); int *dataHost = (int *)malloc(args->expectedBytes); - hipMemcpy(expectedHost, args->expected[0], args->expectedBytes, hipMemcpyDeviceToHost); - printf("\n Expected: "); + hipMemcpy(expectedHost, args->expected[rank], args->expectedBytes, hipMemcpyDeviceToHost); + printf("\n Rank [%d] Expected: ", rank); for(int j=0; jexpectedBytes/sizeof(int); j++) { printf("%d:%d ", j, expectedHost[j]); } - printf("\n"); - hipMemcpy(dataHost, data, args->expectedBytes, hipMemcpyDeviceToHost); - printf("\n Actual: "); + printf("\n Rank [%d] Actual: ", rank); for (int j=0; jexpectedBytes/sizeof(int); j++) { printf("%d:%d ", j, dataHost[j]); } printf("\n"); - free(temp); - } + free(dataHost); + free(expectedHost); + //} 
#endif } double nranks = args->nProcs*args->nThreads*args->nGpus; @@ -571,7 +574,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* TESTCHECK(completeColl(args)); for (size_t iter = 0; iter < stress_cycles; iter++) { - if (iter > 0) PRINT("# Testing %ld cycle.\n", iter+1); + if (iter > 0) PRINT("# Testing %lu cycle.\n", iter+1); // Benchmark for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { setupArgs(size, type, args); diff --git a/src/gather.cu b/src/gather.cu new file mode 100644 index 0000000000..5230fdc81d --- /dev/null +++ b/src/gather.cu @@ -0,0 +1,127 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include "common.h" + +//#define DEBUG_PRINT + +void print_header() { + PRINT("# %10s %12s %6s out-of-place in-place \n", "", "", ""); + PRINT("# %10s %12s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %6s", size, count, typeName); +} + +void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count/nranks; + *recvcount = (count/nranks)*nranks; + *sendInplaceOffset = count/nranks; + *recvInplaceOffset = 0; + *paramcount = 
*sendcount; +} + +testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + HIPCHECK(hipSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); +#ifdef DEBUG_PRINT + int *dataHost = (int *)malloc(args->sendBytes); + hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost); + printf("\n Rank [%d] Init: ", rank); + for (int j=0; jsendBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(dataHost); +#endif + for (int j=0; jexpected[i])+args->sendBytes*j, sendcount, type, rep, j)); + } + HIPCHECK(hipDeviceSynchronize()); + } + return testSuccess; +} + +void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = 1; + *busBw = baseBw * factor; +} + +testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { + int nRanks; + NCCLCHECK(ncclCommCount(comm, &nRanks)); + size_t rankOffset = count * wordSize(type); + if (count == 0) return testSuccess; + + int rank; + NCCLCHECK(ncclCommUserRank(comm, &rank)); + NCCLCHECK(ncclGroupStart()); + if (rank == root) { + for (int r=0; rcollTest = &gatherTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; 
+ + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = ncclNumTypes; + run_types = test_types; + run_typenames = test_typenames; + } + + for (int i=0; i +#include "common.h" + +//#define DEBUG_PRINT + +void print_header() { + PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %6s %6s", size, count, typeName, opName); +} + +void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = (count/nranks)*nranks; + *recvcount = count/nranks; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *recvcount; +} + +testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + HIPCHECK(hipSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; + if (rank == root) { + for (int j=0; jexpectedBytes*j, recvcount, type, rep, j)); + } +#ifdef DEBUG_PRINT + int *dataHost = (int *)malloc(args->sendBytes); + hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost); + printf("\n Rank [%d] Init: ", rank); + for (int j=0; jsendBytes/sizeof(int); j++) { + printf("%d:%d ", j, dataHost[j]); + } + printf("\n"); + free(dataHost); +#endif + } + TESTCHECK(InitData(args->expected[i], recvcount, type, rep, rank)); + HIPCHECK(hipDeviceSynchronize()); + } + return testSuccess; +} + +void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = 1; + *busBw = baseBw * factor; +} + +testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { + int nRanks; + NCCLCHECK(ncclCommCount(comm, &nRanks)); + size_t rankOffset = count * wordSize(type); + if (count == 0) return testSuccess; + + int rank; + NCCLCHECK(ncclCommUserRank(comm, &rank)); + NCCLCHECK(ncclGroupStart()); + if (rank == root) { + for (int r=0; rcollTest = &scatterTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = ncclNumTypes; + run_types = test_types; + run_typenames = test_typenames; + } + + for (int i=0; i Date: Fri, 10 Apr 2020 07:51:39 -0700 Subject: [PATCH 040/233] Improve makefile to avoid LD_LIBRARY_PATH --- src/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Makefile b/src/Makefile index e109b8c3ae..8a63340968 100644 --- a/src/Makefile +++ b/src/Makefile @@ -37,11 +37,11 @@ endif BUILDDIR ?= ../build ifneq ($(NCCL_HOME), "") HIPCUFLAGS += -I$(NCCL_HOME)/include/ -HIPLDFLAGS += 
-L$(NCCL_HOME)/lib +HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME)/lib -L$(NCCL_HOME)/lib endif ifeq ($(MPI), 1) -HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include +HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi else ifeq ($(MPICH), 1) HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich From 0d7c4db33ec7d2859b28a2cbe1b07baf227923f0 Mon Sep 17 00:00:00 2001 From: Sourav Chakraborty Date: Fri, 10 Apr 2020 07:53:38 -0700 Subject: [PATCH 041/233] Add sendrecv benchmark --- src/Makefile | 2 +- src/sendrecv.cu | 112 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 src/sendrecv.cu diff --git a/src/Makefile b/src/Makefile index 8a63340968..ac317eb26b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -54,7 +54,7 @@ HIPLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall gather scatter +BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall gather scatter sendrecv BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src/sendrecv.cu b/src/sendrecv.cu new file mode 100644 index 0000000000..180d8c26a9 --- /dev/null +++ b/src/sendrecv.cu @@ -0,0 +1,112 @@ +/************************************************************************* + * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include "common.h" + +void print_header() { + PRINT("# %10s %12s %6s out-of-place in-place \n", "", "", ""); + PRINT("# %10s %12s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %6s", size, count, typeName); +} + +void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + *sendcount = count; + *recvcount = count; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = *sendcount; +} + +testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + HIPCHECK(hipSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + TESTCHECK(InitData(args->sendbuffs[i], sendcount, type, rep, rank)); + TESTCHECK(InitData(args->recvbuffs[i], recvcount, type, rep, rank)); + int src = rank < nranks/2 ? 
rank : rank - nranks/2; + TESTCHECK(InitData(args->expected[i], recvcount, type, rep, src)); + HIPCHECK(hipDeviceSynchronize()); + } + return testSuccess; +} + +void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = nranks/2; + *busBw = baseBw * factor; +} + +testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { + int rank, peer, nranks, npairs; + NCCLCHECK(ncclCommUserRank(comm, &rank)); + NCCLCHECK(ncclCommCount(comm, &nranks)); + npairs = nranks / 2; +#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 7 + if (rank < npairs) { + peer = rank + npairs; + NCCLCHECK(ncclSend(sendbuff, count, type, peer, comm, stream)); + } else if (rank < 2*npairs) { + peer = rank - npairs; + NCCLCHECK(ncclRecv(recvbuff, count, type, peer, comm, stream)); + } +#endif + return testSuccess; +} + +struct testColl sendrecvTest = { + "SendRecv", + SendRecvGetCollByteCount, + SendRecvInitData, + SendRecvGetBw, + SendRecvRunColl +}; + +void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + SendRecvGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) { + args->collTest = &sendrecvTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = ncclNumTypes; + run_types = test_types; + run_typenames = test_typenames; + } + + for (int i=0; i Date: Mon, 13 Apr 2020 15:51:57 -0700 Subject: [PATCH 042/233] Add option to use alltoall, 
gather and scatter API These APIs launche RCCL kernel implementation by default. If environmental variable RCCL_ALLTOALL_KERNEL_DISABLE=1, then the APIs use wrapper around ncclSend and ncclRecv. --- src/alltoall.cu | 8 ++++++++ src/gather.cu | 12 ++++++++++++ src/scatter.cu | 9 +++++++++ 3 files changed, 29 insertions(+) diff --git a/src/alltoall.cu b/src/alltoall.cu index 4d86bdaae5..41bbc780a8 100644 --- a/src/alltoall.cu +++ b/src/alltoall.cu @@ -8,6 +8,8 @@ #include #include "common.h" +#define USE_RCCL_GATHER_SCATTER + void print_header() { PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", @@ -65,12 +67,18 @@ testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclD size_t rankOffset = count * wordSize(type); if (count == 0) return testSuccess; +#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 7 +#if defined(RCCL_GATHER_SCATTER) && defined(USE_RCCL_GATHER_SCATTER) + NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream)); +#else NCCLCHECK(ncclGroupStart()); for (int r=0; rreportErrors = in_place ? 0 : 1; return testSuccess; } @@ -76,6 +79,13 @@ testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDat int rank; NCCLCHECK(ncclCommUserRank(comm, &rank)); +#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 7 +#if defined(RCCL_GATHER_SCATTER) && defined(USE_RCCL_GATHER_SCATTER) + if (rank == root) + NCCLCHECK(ncclGather(sendbuff, recvbuff, count, type, root, comm, stream)); + else + NCCLCHECK(ncclGather(sendbuff, 0, count, type, root, comm, stream)); +#else NCCLCHECK(ncclGroupStart()); if (rank == root) { for (int r=0; rexpected[i], recvcount, type, rep, rank)); HIPCHECK(hipDeviceSynchronize()); } + // We don't support in-place scatter + args->reportErrors = in_place ? 
0 : 1; return testSuccess; } @@ -78,6 +81,10 @@ testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDa int rank; NCCLCHECK(ncclCommUserRank(comm, &rank)); +#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 7 +#if defined(RCCL_GATHER_SCATTER) && defined(USE_RCCL_GATHER_SCATTER) + NCCLCHECK(ncclScatter(sendbuff, recvbuff, count, type, root, comm, stream)); +#else NCCLCHECK(ncclGroupStart()); if (rank == root) { for (int r=0; r Date: Thu, 7 May 2020 12:29:07 -0600 Subject: [PATCH 043/233] Restarting CI (#6) --- .jenkins/common.groovy | 37 +++++++++++++ .jenkins/precheckin.groovy | 81 +++++++++++++++++++++++++++++ docker/dockerfile-build-centos | 41 +++++++++++++++ docker/dockerfile-build-ubuntu-rock | 43 +++++++++++++++ docker/dockerfile-install-centos | 8 +++ docker/dockerfile-install-ubuntu | 8 +++ 6 files changed, 218 insertions(+) create mode 100644 .jenkins/common.groovy create mode 100644 .jenkins/precheckin.groovy create mode 100644 docker/dockerfile-build-centos create mode 100644 docker/dockerfile-build-ubuntu-rock create mode 100644 docker/dockerfile-install-centos create mode 100644 docker/dockerfile-install-ubuntu diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy new file mode 100644 index 0000000000..5bf86a4765 --- /dev/null +++ b/.jenkins/common.groovy @@ -0,0 +1,37 @@ +// This file is for internal AMD use. +// If you are interested in running your own Jenkins, please raise a github issue for assistance. + +def runCompileCommand(platform, project, jobName) +{ + project.paths.construct_build_prefix() + + String hipclangArgs = jobName.contains('hipclang') ? 
'--hip-clang' : '' + def getRCCL = auxiliary.getLibrary('rccl',platform.jenkinsLabel,'develop') + + def command = """#!/usr/bin/env bash + set -x + ${getRCCL} + ${auxiliary.exitIfNotSuccess()} + cd ${project.paths.project_build_prefix} + ${project.paths.build_command} + ${auxiliary.exitIfNotSuccess()} + """ + + platform.runCommand(this,command) +} + +def runTestCommand (platform, project) +{ + String sudo = auxiliary.sudo(platform.jenkinsLabel) + + def command = """#!/usr/bin/env bash + set -x + cd ${project.paths.project_build_prefix} + python3 -m pytest -k "not MPI" --verbose --junitxml=./testreport.xml + """ + + platform.runCommand(this, command) + junit "${project.paths.project_build_prefix}/build/release/test/*.xml" +} + +return this diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy new file mode 100644 index 0000000000..aae81c922e --- /dev/null +++ b/.jenkins/precheckin.groovy @@ -0,0 +1,81 @@ +#!/usr/bin/env groovy +// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ +@Library('rocJenkins@pong') _ + +// This is file for internal AMD use. +// If you are interested in running your own Jenkins, please raise a github issue for assistance. 
+ +import com.amd.project.* +import com.amd.docker.* +import java.nio.file.Path + +def runCI = +{ + nodeDetails, jobName-> + + def prj = new rocProject('rccl-tests', 'PreCheckin') + prj.paths.build_command = './install.sh' + + // Define test architectures, optional rocm version argument is available + def nodes = new dockerNodes(nodeDetails, jobName, prj) + + boolean formatCheck = false + + def commonGroovy + + def compileCommand = + { + platform, project-> + + commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" + commonGroovy.runCompileCommand(platform, project, jobName) + } + + def testCommand = + { + platform, project-> + + commonGroovy.runTestCommand(platform, project) + } + + buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, null) +} + +ci: { + String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) + + def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], + "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], + "rocm-docker":[]] + propertyList = auxiliary.appendPropertyList(propertyList) + + def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu16:['rccl906']]), + "rocm-docker":([ubuntu16:['rccl906']])] + jobNameList = auxiliary.appendJobNameList(jobNameList) + jobNameList['compute-rocm-dkms-no-npi-hipclang'] = [ubuntu16:['rccl906']] + + propertyList.each + { + jobName, property-> + if (urlJobName == jobName) + properties(auxiliary.addCommonProperties(property)) + } + + jobNameList.each + { + jobName, nodeDetails-> + if (urlJobName == jobName) + stage(jobName) { + runCI(nodeDetails, jobName) + } + } + + // For url job names that are not listed by the jobNameList i.e. 
compute-rocm-dkms-no-npi-1901 + if(!jobNameList.keySet().contains(urlJobName)) + { + properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) + stage(urlJobName) { + runCI([ubuntu16:['rccl906']], urlJobName) + } + } +} \ No newline at end of file diff --git a/docker/dockerfile-build-centos b/docker/dockerfile-build-centos new file mode 100644 index 0000000000..6e48134bfa --- /dev/null +++ b/docker/dockerfile-build-centos @@ -0,0 +1,41 @@ +# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. +# This Dockerfile provides a starting point for a ROCm installation of rccl + +# Parameters related to building rccl +ARG base_image + +FROM ${base_image} +LABEL maintainer="rccl-maintainer@amd.com" + +USER root +ARG user_uid + +# Install dependent packages +RUN yum install -y --nogpgcheck \ + sudo \ + chrpath \ + rock-dkms \ + rocm-cmake \ + centos-release-scl \ + devtoolset-7 \ + ca-certificates \ + git \ + cmake3 \ + make \ + libgomp \ + clang \ + clang-devel \ + gcc-c++ \ + pkgconfig \ + numactl-libs + +RUN echo '#!/bin/bash' | tee /etc/profile.d/devtoolset7.sh && echo \ + 'source scl_source enable devtoolset-7' >>/etc/profile.d/devtoolset7.sh + +# docker pipeline runs containers with particular uid +# create a jenkins user with this specific uid so it can use sudo priviledges +# Grant any member of sudo group password-less sudo privileges +RUN useradd --create-home -u ${user_uid} -o -G video --shell /bin/bash jenkins && \ + echo '%video ALL=(ALL) NOPASSWD:ALL' | tee /etc/sudoers.d/sudo-nopasswd && \ + chmod 400 /etc/sudoers.d/sudo-nopasswd + diff --git a/docker/dockerfile-build-ubuntu-rock b/docker/dockerfile-build-ubuntu-rock new file mode 100644 index 0000000000..f7e17d500a --- /dev/null +++ b/docker/dockerfile-build-ubuntu-rock @@ -0,0 +1,43 @@ +# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
+# Parameters related to building rccl +ARG base_image + +FROM ${base_image} +LABEL maintainer="rccl-maintainer@amd.com" + +ARG user_uid + +# Install dependent packages +# Dependencies: +# * hcc-config.cmake: pkg-config +# * tensile: python2.7, python-yaml +# * rocblas-test: gfortran, googletest +# * rocblas-bench: libboost-program-options-dev +# * libhsakmt.so: libnuma1 +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + rock-dkms \ + sudo \ + ca-certificates \ + chrpath \ + git \ + make \ + cmake \ + pkg-config \ + python2.7 \ + python-yaml \ + python3-pytest \ + rocm-cmake \ + libboost-program-options-dev \ + libnuma1 \ + libomp-dev \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# docker pipeline runs containers with particular uid +# create a jenkins user with this specific uid so it can use sudo priviledges +# Grant any member of sudo group password-less sudo privileges +RUN useradd --create-home -u ${user_uid} -o -G video --shell /bin/bash jenkins && \ + mkdir -p /etc/sudoers.d/ && \ + echo '%video ALL=(ALL) NOPASSWD:ALL' | tee /etc/sudoers.d/sudo-nopasswd + diff --git a/docker/dockerfile-install-centos b/docker/dockerfile-install-centos new file mode 100644 index 0000000000..2ccd6337f6 --- /dev/null +++ b/docker/dockerfile-install-centos @@ -0,0 +1,8 @@ +# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. +# Parameters related to building rccl +ARG base_image + +FROM ${base_image} +LABEL maintainer="rccl-maintainer@amd.com" + +#empty for now diff --git a/docker/dockerfile-install-ubuntu b/docker/dockerfile-install-ubuntu new file mode 100644 index 0000000000..d0b70e37c1 --- /dev/null +++ b/docker/dockerfile-install-ubuntu @@ -0,0 +1,8 @@ +# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. 
+# Parameters related to building rccl +ARG base_image + +FROM ${base_image} +LABEL maintainer="rccl-maintainer@amd.com" + +#empty for now From a698b55cf50688041f470c1b15dbe3d35e66b14f Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Mon, 11 May 2020 15:17:53 -0700 Subject: [PATCH 044/233] Update rccl_bfloat16.h to match rocBLAS --- src/rccl_bfloat16.h | 77 ++++++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/src/rccl_bfloat16.h b/src/rccl_bfloat16.h index 06b053a626..cbc6e059a5 100644 --- a/src/rccl_bfloat16.h +++ b/src/rccl_bfloat16.h @@ -1,7 +1,7 @@ /** * MIT License * - * Copyright 2019 Advanced Micro Devices, Inc. All rights reserved. + * Copyright 2019-2020 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -29,9 +29,9 @@ #ifndef _RCCL_BFLOAT16_H_ #define _RCCL_BFLOAT16_H_ -#if __cplusplus < 201402L || (!defined(__HCC__) && !defined(__HIPCC__)) +#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__)) -// If this is a C compiler, C++ compiler below C++14, or a host-only compiler, we only +// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only // include a minimal definition of rccl_bfloat16 #include @@ -41,7 +41,7 @@ typedef struct uint16_t data; } rccl_bfloat16; -#else // __cplusplus < 201402L || (!defined(__HCC__) && !defined(__HIPCC__)) +#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__)) #include #include @@ -54,16 +54,26 @@ struct rccl_bfloat16 { uint16_t data; + enum truncate_t + { + truncate + }; + __host__ __device__ rccl_bfloat16() = default; // round upper 16 bits of IEEE float to convert to bfloat16 - explicit constexpr __host__ __device__ rccl_bfloat16(float f) + explicit __host__ __device__ 
rccl_bfloat16(float f) : data(float_to_bfloat16(f)) { } + explicit __host__ __device__ rccl_bfloat16(float f, truncate_t) + : data(truncate_float_to_bfloat16(f)) + { + } + // zero extend lower 16 bits of bfloat16 to convert to IEEE float - constexpr __host__ __device__ operator float() const + __host__ __device__ operator float() const { union { @@ -74,7 +84,7 @@ struct rccl_bfloat16 } private: - static constexpr __host__ __device__ uint16_t float_to_bfloat16(float f) + static __host__ __device__ uint16_t float_to_bfloat16(float f) { union { @@ -115,6 +125,17 @@ private: } return uint16_t(u.int32 >> 16); } + + // Truncate instead of rounding, preserving SNaN + static __host__ __device__ uint16_t truncate_float_to_bfloat16(float f) + { + union + { + float fp32; + uint32_t int32; + } u = {f}; + return uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff)); + } }; typedef struct @@ -138,86 +159,86 @@ inline std::ostream& operator<<(std::ostream& os, const rccl_bfloat16& bf16) { return os << float(bf16); } -constexpr __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a) +inline __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a) { return a; } -constexpr __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a) +inline __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a) { a.data ^= 0x8000; return a; } -constexpr __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a, rccl_bfloat16 b) +inline __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a, rccl_bfloat16 b) { return rccl_bfloat16(float(a) + float(b)); } -constexpr __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a, rccl_bfloat16 b) +inline __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a, rccl_bfloat16 b) { return rccl_bfloat16(float(a) - float(b)); } -constexpr __host__ __device__ rccl_bfloat16 operator*(rccl_bfloat16 a, rccl_bfloat16 b) +inline __host__ __device__ rccl_bfloat16 operator*(rccl_bfloat16 a, rccl_bfloat16 b) { return 
rccl_bfloat16(float(a) * float(b)); } -constexpr __host__ __device__ rccl_bfloat16 operator/(rccl_bfloat16 a, rccl_bfloat16 b) +inline __host__ __device__ rccl_bfloat16 operator/(rccl_bfloat16 a, rccl_bfloat16 b) { return rccl_bfloat16(float(a) / float(b)); } -constexpr __host__ __device__ bool operator<(rccl_bfloat16 a, rccl_bfloat16 b) +inline __host__ __device__ bool operator<(rccl_bfloat16 a, rccl_bfloat16 b) { return float(a) < float(b); } -constexpr __host__ __device__ bool operator==(rccl_bfloat16 a, rccl_bfloat16 b) +inline __host__ __device__ bool operator==(rccl_bfloat16 a, rccl_bfloat16 b) { return float(a) == float(b); } -constexpr __host__ __device__ bool operator>(rccl_bfloat16 a, rccl_bfloat16 b) +inline __host__ __device__ bool operator>(rccl_bfloat16 a, rccl_bfloat16 b) { return b < a; } -constexpr __host__ __device__ bool operator<=(rccl_bfloat16 a, rccl_bfloat16 b) +inline __host__ __device__ bool operator<=(rccl_bfloat16 a, rccl_bfloat16 b) { return !(a > b); } -constexpr __host__ __device__ bool operator!=(rccl_bfloat16 a, rccl_bfloat16 b) +inline __host__ __device__ bool operator!=(rccl_bfloat16 a, rccl_bfloat16 b) { return !(a == b); } -constexpr __host__ __device__ bool operator>=(rccl_bfloat16 a, rccl_bfloat16 b) +inline __host__ __device__ bool operator>=(rccl_bfloat16 a, rccl_bfloat16 b) { return !(a < b); } -constexpr __host__ __device__ rccl_bfloat16& operator+=(rccl_bfloat16& a, rccl_bfloat16 b) +inline __host__ __device__ rccl_bfloat16& operator+=(rccl_bfloat16& a, rccl_bfloat16 b) { return a = a + b; } -constexpr __host__ __device__ rccl_bfloat16& operator-=(rccl_bfloat16& a, rccl_bfloat16 b) +inline __host__ __device__ rccl_bfloat16& operator-=(rccl_bfloat16& a, rccl_bfloat16 b) { return a = a - b; } -constexpr __host__ __device__ rccl_bfloat16& operator*=(rccl_bfloat16& a, rccl_bfloat16 b) +inline __host__ __device__ rccl_bfloat16& operator*=(rccl_bfloat16& a, rccl_bfloat16 b) { return a = a * b; } -constexpr __host__ __device__ 
rccl_bfloat16& operator/=(rccl_bfloat16& a, rccl_bfloat16 b) +inline __host__ __device__ rccl_bfloat16& operator/=(rccl_bfloat16& a, rccl_bfloat16 b) { return a = a / b; } -constexpr __host__ __device__ rccl_bfloat16& operator++(rccl_bfloat16& a) +inline __host__ __device__ rccl_bfloat16& operator++(rccl_bfloat16& a) { return a += rccl_bfloat16(1.0f); } -constexpr __host__ __device__ rccl_bfloat16& operator--(rccl_bfloat16& a) +inline __host__ __device__ rccl_bfloat16& operator--(rccl_bfloat16& a) { return a -= rccl_bfloat16(1.0f); } -constexpr __host__ __device__ rccl_bfloat16 operator++(rccl_bfloat16& a, int) +inline __host__ __device__ rccl_bfloat16 operator++(rccl_bfloat16& a, int) { rccl_bfloat16 orig = a; ++a; return orig; } -constexpr __host__ __device__ rccl_bfloat16 operator--(rccl_bfloat16& a, int) +inline __host__ __device__ rccl_bfloat16 operator--(rccl_bfloat16& a, int) { rccl_bfloat16 orig = a; --a; @@ -248,6 +269,6 @@ namespace std } } -#endif // __cplusplus < 201402L || (!defined(__HCC__) && !defined(__HIPCC__)) +#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__)) #endif // _RCCL_BFLOAT16_H_ From 97a26afc2626650783ff94f36b1254d809b4d7d0 Mon Sep 17 00:00:00 2001 From: saadrahim <44449863+saadrahim@users.noreply.github.com> Date: Fri, 22 May 2020 09:58:42 -0600 Subject: [PATCH 045/233] Update common.groovy --- .jenkins/common.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy index 5bf86a4765..c893f9fd8e 100644 --- a/.jenkins/common.groovy +++ b/.jenkins/common.groovy @@ -31,7 +31,7 @@ def runTestCommand (platform, project) """ platform.runCommand(this, command) - junit "${project.paths.project_build_prefix}/build/release/test/*.xml" + junit "${project.paths.project_build_prefix}/*.xml" } return this From 622771cc4e5db5a589c6ef0febafe1fe5367d848 Mon Sep 17 00:00:00 2001 From: saadrahim <44449863+saadrahim@users.noreply.github.com> Date: Tue, 2 Jun 2020 
10:19:38 -0600 Subject: [PATCH 046/233] Removing old Jenkinsfile that is no longer needed --- Jenkinsfile | 82 ----------------------------------------------------- 1 file changed, 82 deletions(-) delete mode 100644 Jenkinsfile diff --git a/Jenkinsfile b/Jenkinsfile deleted file mode 100644 index 7589636c68..0000000000 --- a/Jenkinsfile +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env groovy -// Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. -// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS -@Library('rocJenkins@noDocker') _ - -// This is file for internal AMD use. -// If you are interested in running your own Jenkins, please raise a github issue for assistance. - -import com.amd.project.* -import com.amd.docker.* - -//////////////////////////////////////////////////////////////////////// -// Mostly generated from snippet generator 'properties; set job properties' -// Time-based triggers added to execute nightly tests, eg '30 2 * * *' means 2:30 AM -properties([ - pipelineTriggers([cron('0 1 * * *'), [$class: 'PeriodicFolderTrigger', interval: '5m']]), - buildDiscarder(logRotator( - artifactDaysToKeepStr: '', - artifactNumToKeepStr: '', - daysToKeepStr: '', - numToKeepStr: '10')), - disableConcurrentBuilds(), - [$class: 'CopyArtifactPermissionProperty', projectNames: '*'] - ]) - - -//////////////////////////////////////////////////////////////////////// -import java.nio.file.Path; - -rcclTestsCI: -{ - def rcclTests = new rocProject('rcclTests') - // customize for project - rcclTests.paths.build_command = './install.sh' - - // Define test architectures, optional rocm version argument is available - def nodes = new dockerNodes(['RCCL'], rcclTests) - - boolean formatCheck = false - - def compileCommand = - { - platform, project-> - - project.paths.construct_build_prefix() - - def command = """#!/usr/bin/env bash - set -x - rm -rf rccl - git clone https://github.com/ROCmSoftwarePlatform/rccl - cd rccl - 
export RCCL_PATH=${WORKSPACE}/rccl/rccl-install - ./install.sh -i --prefix=\$RCCL_PATH - cd .. - cd ${project.paths.project_build_prefix} - ${project.paths.build_command} --rccl_home=\$RCCL_PATH - """ - sh command - } - def testCommand = - { - platform, project-> - - def command = """#!/usr/bin/env bash - set -x - LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:${WORKSPACE}/rccl/rccl-install/lib/ python3 -m pytest -k "not MPI" --junitxml=./testreport.xml - """ - - sh command - //junit "${project.paths.project_build_prefix}/build/release/*.xml" - } - - def packageCommand = - { - platform, project-> - - def command = """ - """ - } - - buildProjectNoDocker(rcclTests, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) -} From ba924dac95c794540b582cdbc480398fb3f64930 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Wed, 3 Jun 2020 15:07:51 -0700 Subject: [PATCH 047/233] Fix #43 : Add .gitignore for build dir --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..a0a013e438 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENCE.txt for license information +/build From 83b846cf4fb8e16f6e07f5577168c40db7173e65 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Wed, 10 Jun 2020 23:24:08 +0000 Subject: [PATCH 048/233] Correct szie when init sendbuff --- src/common.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common.cu b/src/common.cu index 908a69b9c5..ed88f51905 100644 --- a/src/common.cu +++ b/src/common.cu @@ -885,9 +885,9 @@ testResult_t run() { HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking)); // initialize data buffer to avoid all zero data #if NCCL_MAJOR >= 2 - TESTCHECK(InitData(sendbuffs[i], maxBytes, ncclUint8, 0, i)); + TESTCHECK(InitData(sendbuffs[i], sendBytes, ncclUint8, 0, i)); #else - TESTCHECK(InitData(sendbuffs[i], maxBytes, ncclChar, 0, i)); + TESTCHECK(InitData(sendbuffs[i], sendBytes, ncclChar, 0, i)); #endif HIPCHECK(hipDeviceSynchronize()); } From 7a833631b2ba685627aec257627a966f58e26bd4 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Mon, 15 Jun 2020 08:54:21 -0700 Subject: [PATCH 049/233] Remove sm_30 --- src/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Makefile b/src/Makefile index ed723d4210..56d2e6345d 100644 --- a/src/Makefile +++ b/src/Makefile @@ -15,8 +15,7 @@ NVCC = $(CUDA_HOME)/bin/nvcc # Better define NVCC_GENCODE in your environment to the minimal set # of archs to reduce compile time. 
-NVCC_GENCODE ?= -gencode=arch=compute_30,code=sm_30 \ - -gencode=arch=compute_35,code=sm_35 \ +NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ -gencode=arch=compute_50,code=sm_50 \ -gencode=arch=compute_60,code=sm_60 \ -gencode=arch=compute_61,code=sm_61 \ From af4fa0f4cf7c2c3db0540da7ac8d6efc1d526635 Mon Sep 17 00:00:00 2001 From: Luke Yeager Date: Tue, 7 Jan 2020 13:30:19 -0800 Subject: [PATCH 050/233] Fix some memory leaks --- src/common.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/common.cu b/src/common.cu index 5a3ae529d6..2c5e38eca3 100644 --- a/src/common.cu +++ b/src/common.cu @@ -302,7 +302,8 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t printf("%d:%d ", j, dataHost[j]); } printf("\n"); - free(temp); + free(expectedHost); + free(dataHost); } #endif } @@ -351,6 +352,7 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* // We might want to let other threads (including NCCL threads) use the CPU. if (idle) pthread_yield(); } + free(done); return testSuccess; } From 07ac716c1ac5999964bd583806ec37e928251119 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Thu, 18 Jun 2020 15:00:05 -0700 Subject: [PATCH 051/233] Fix #47 : compilation error on NCCL<2.7 Return an error when trying to run alltoall test when compiled against NCCL<2.7. --- src/alltoall.cu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/alltoall.cu b/src/alltoall.cu index aea9370f65..31cfca090d 100644 --- a/src/alltoall.cu +++ b/src/alltoall.cu @@ -64,14 +64,18 @@ testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclD size_t rankOffset = count * wordSize(type); if (count == 0) return testSuccess; +#if NCCL_MAJOR < 2 || NCCL_MINOR < 7 + printf("NCCL 2.7 or later is needed for alltoall. 
This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); + return testNcclError; +#else NCCLCHECK(ncclGroupStart()); for (int r=0; r Date: Fri, 19 Jun 2020 10:40:33 -0700 Subject: [PATCH 052/233] Change all_gather/reduce_scatter algbw to match the documentation. Fix #45 : All_gather and reduce_scatter algorithm bandwidth was computed as time/count*(nranks-1) which is not consistent with the way we compute it for other collectives. This change makes algbw higher; busbw is unchanged. --- src/all_gather.cu | 4 ++-- src/reduce_scatter.cu | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/all_gather.cu b/src/all_gather.cu index cfb2ec356b..f5bc44c57d 100644 --- a/src/all_gather.cu +++ b/src/all_gather.cu @@ -48,10 +48,10 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc } void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { - double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec; + double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec; *algBw = baseBw; - double factor = 1; + double factor = ((double)(nranks - 1))/((double)nranks); *busBw = baseBw * factor; } diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu index 0b1d986952..86e789c15d 100644 --- a/src/reduce_scatter.cu +++ b/src/reduce_scatter.cu @@ -47,10 +47,10 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, } void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { - double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec; + double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec; *algBw = baseBw; - double factor = 1; + double factor = ((double)(nranks - 1))/((double)nranks); *busBw = baseBw * factor; } From b2603a2e85436b63b80a02b5fc45df84fe42be7b Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Tue, 23 Jun 2020 18:16:46 -0700 Subject: [PATCH 
053/233] Add gencode for CUDA11 --- src/Makefile | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/Makefile b/src/Makefile index 2440db1672..0770f080ed 100644 --- a/src/Makefile +++ b/src/Makefile @@ -13,14 +13,25 @@ CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include NVCC = $(CUDA_HOME)/bin/nvcc +CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) +CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) + # Better define NVCC_GENCODE in your environment to the minimal set # of archs to reduce compile time. +ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) +NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ + -gencode=arch=compute_61,code=sm_61 \ + -genncode=arch=compute_70,code=sm_70 \ + -gencode=arch=compute_80,code=sm_80 \ + -gencode=arch=compute_80,code=compute_80 +else NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ -gencode=arch=compute_50,code=sm_50 \ - -gencode=arch=compute_60,code=sm_60 \ + -gencode=arch=compute_60,code=sm_60 \ -gencode=arch=compute_61,code=sm_61 \ - -gencode=arch=compute_70,code=compute_70 \ - -gencode=arch=compute_70,code=sm_70 + -gencode=arch=compute_70,code=sm_70 \ + -gencode=arch=compute_70,code=compute_70 +endif NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 From afdaf59b3b179af51553614c85925dd2ab0a39a4 Mon Sep 17 00:00:00 2001 From: Luke Yeager Date: Wed, 24 Jun 2020 14:39:22 -0700 Subject: [PATCH 054/233] Fix typo in src/Makefile --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index 0770f080ed..52169bb3e1 100644 --- a/src/Makefile +++ b/src/Makefile @@ -21,7 +21,7 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." 
-f 1) ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ -gencode=arch=compute_61,code=sm_61 \ - -genncode=arch=compute_70,code=sm_70 \ + -gencode=arch=compute_70,code=sm_70 \ -gencode=arch=compute_80,code=sm_80 \ -gencode=arch=compute_80,code=compute_80 else From 346cb164427e3e3de1acbe068587af65d17e6aff Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Mon, 6 Jul 2020 17:12:50 -0700 Subject: [PATCH 055/233] Change scatter and gather bandwidth calculation to match alltoall (#7) --- src/gather.cu | 11 +++++++---- src/scatter.cu | 11 +++++++---- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/gather.cu b/src/gather.cu index 65dc714893..4b98ede241 100644 --- a/src/gather.cu +++ b/src/gather.cu @@ -64,10 +64,10 @@ testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRe } void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { - double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec; + double baseBw = (double)(count * typesize) / 1.0E9 / sec; *algBw = baseBw; - double factor = 1; + double factor = ((double)(nranks-1))/((double)(nranks)); *busBw = baseBw * factor; } @@ -79,7 +79,10 @@ testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDat int rank; NCCLCHECK(ncclCommUserRank(comm, &rank)); -#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 7 +#if NCCL_MAJOR < 2 || NCCL_MINOR < 7 + printf("NCCL 2.7 or later is needed for gather. 
This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); + return testNcclError; +#else #if defined(RCCL_GATHER_SCATTER) && defined(USE_RCCL_GATHER_SCATTER) if (rank == root) NCCLCHECK(ncclGather(sendbuff, recvbuff, count, type, root, comm, stream)); @@ -93,9 +96,9 @@ testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDat } NCCLCHECK(ncclSend(sendbuff, count, type, root, comm, stream)); NCCLCHECK(ncclGroupEnd()); -#endif #endif return testSuccess; +#endif } struct testColl gatherTest = { diff --git a/src/scatter.cu b/src/scatter.cu index d741391300..4dbbda25a3 100644 --- a/src/scatter.cu +++ b/src/scatter.cu @@ -66,10 +66,10 @@ testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclR } void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { - double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec; + double baseBw = (double)(count * typesize) / 1.0E9 / sec; *algBw = baseBw; - double factor = 1; + double factor = ((double)(nranks-1))/((double)(nranks)); *busBw = baseBw * factor; } @@ -81,7 +81,10 @@ testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDa int rank; NCCLCHECK(ncclCommUserRank(comm, &rank)); -#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 7 +#if NCCL_MAJOR < 2 || NCCL_MINOR < 7 + printf("NCCL 2.7 or later is needed for scatter. 
This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); + return testNcclError; +#else #if defined(RCCL_GATHER_SCATTER) && defined(USE_RCCL_GATHER_SCATTER) NCCLCHECK(ncclScatter(sendbuff, recvbuff, count, type, root, comm, stream)); #else @@ -92,9 +95,9 @@ testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDa } NCCLCHECK(ncclRecv(recvbuff, count, type, root, comm, stream)); NCCLCHECK(ncclGroupEnd()); -#endif #endif return testSuccess; +#endif } struct testColl scatterTest = { From 3d63a84d97b49edeb63e9b7e0be407ecbe7f008e Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Fri, 21 Aug 2020 21:34:55 +0000 Subject: [PATCH 056/233] Add cumask option --- src/common.cu | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/common.cu b/src/common.cu index bc9ac3185c..e1fb769c95 100644 --- a/src/common.cu +++ b/src/common.cu @@ -55,6 +55,7 @@ static int blocking_coll = 0; static int memorytype = 0; static int stress_cycles = 1; static ncclResult_t ncclabort = ncclSuccess; +static uint32_t cumask[4]; double parsesize(char *value) { long long int units; @@ -687,12 +688,13 @@ int main(int argc, char* argv[]) { {"blocking", required_argument, 0, 'z'}, {"memory_type", required_argument, 0, 'y'}, {"stress_cycles", required_argument, 0, 's'}, + {"cumask", required_argument, 0, 'u'}, {"help", no_argument, 0, 'h'} }; while(1) { int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:s:h", longopts, &longindex); + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:s:u:h", longopts, &longindex); if (c == -1) break; @@ -753,6 +755,16 @@ int main(int argc, char* argv[]) { case 's': stress_cycles = strtol(optarg, NULL, 0); break; + case 'u': + { + int nmasks = 0; + char *mask = strtok(optarg, ","); + while (mask != NULL && nmasks < 4) { + cumask[nmasks++] = strtol(mask, NULL, 16); + mask = strtok(NULL, ","); + }; + } + break; case 'h': printf("USAGE: %s \n\t" "[-t,--nthreads ] \n\t" @@ 
-771,6 +783,8 @@ int main(int argc, char* argv[]) { "[-r,--root ] \n\t" "[-z,--blocking <0/1>] \n\t" "[-y,--memory_type ] \n\t" + "[-s,--stress_cycles ] \n\t" + "[-u,--cumask ] \n\t" "[-h,--help]\n", basename(argv[0])); return 0; @@ -793,6 +807,8 @@ int main(int argc, char* argv[]) { "[-r,--root ] \n\t" "[-z,--blocking <0/1>] \n\t" "[-y,--memory_type ] \n\t" + "[-s,--stress_cycles ] \n\t" + "[-u,--cumask ] \n\t" "[-h,--help]\n", basename(argv[0])); return 0; @@ -882,7 +898,13 @@ testResult_t run() { for (int i=0; i= 2 TESTCHECK(InitData(sendbuffs[i], sendBytes, ncclUint8, 0, i)); From 58dcd35af23f64a9becb08e8cf4c872177133227 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Thu, 27 Aug 2020 23:45:47 +0000 Subject: [PATCH 057/233] Add alltoallv test --- src/Makefile | 2 +- src/alltoallv.cu | 190 +++++++++++++++++++++++++++++++++++++++++++++++ src/common.cu | 41 +++++----- 3 files changed, 215 insertions(+), 18 deletions(-) create mode 100644 src/alltoallv.cu diff --git a/src/Makefile b/src/Makefile index ac317eb26b..5b7ffa965c 100644 --- a/src/Makefile +++ b/src/Makefile @@ -54,7 +54,7 @@ HIPLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall gather scatter sendrecv +BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall gather scatter sendrecv alltoallv BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src/alltoallv.cu b/src/alltoallv.cu new file mode 100644 index 0000000000..7993059441 --- /dev/null +++ b/src/alltoallv.cu @@ -0,0 +1,190 @@ +/************************************************************************* + * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include "common.h" + +#define USE_RCCL_GATHER_SCATTER + +void print_header() { + PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %6s %6s", size, count, typeName, opName); +} + +void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + if (count < nranks*nranks/2) { + *sendcount = 0; + *recvcount = 0; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = 0; + } else { + *sendcount = (count/nranks)*nranks; + *recvcount = (count/nranks)*nranks; + *sendInplaceOffset = 0; + *recvInplaceOffset = 0; + *paramcount = count/nranks; + } +} + +testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + char* str = getenv("NCCL_TESTS_DEVICE"); + int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + HIPCHECK(hipSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); +#if 0 + int *dataHost = (int *)malloc(args->sendBytes); + hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost); + printf(" Rank [%d] Original: ", rank); + for(int j=0; jexpected[i])+rdisp*wordSize(type), rcount, type, rep+sdisp, j)); + rdisp += rcount; + } + HIPCHECK(hipDeviceSynchronize()); + } + // We don't support in-place alltoall + args->reportErrors = in_place ? 0 : 1; + return testSuccess; +} + +void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(nranks-1))/((double)(nranks)); + *busBw = baseBw * factor; +} + +testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { + int nranks; + NCCLCHECK(ncclCommCount(comm, &nranks)); + int rank; + NCCLCHECK(ncclCommUserRank(comm, &rank)); + #define MAX_ALLTOALLV_RANKS 256 + static size_t sendcounts[MAX_ALLTOALLV_RANKS], recvcounts[MAX_ALLTOALLV_RANKS], sdispls[MAX_ALLTOALLV_RANKS], rdispls[MAX_ALLTOALLV_RANKS]; + if (count == 0) return testSuccess; + if (nranks > MAX_ALLTOALLV_RANKS) { + printf("Number of ranks %d exceeds limit %d\n", nranks, MAX_ALLTOALLV_RANKS); + return testNcclError; + } + + size_t disp = 0; + size_t chunksize = count*2/nranks; + for (int i = 0; i < nranks; i++) { + size_t scount = ((i+rank)%nranks)*chunksize; + if (i+rank == nranks-1) + scount += (count*nranks-chunksize*(nranks-1)*nranks/2); + sendcounts[i] = recvcounts[i] = scount; + sdispls[i] = rdispls[i] = disp; + disp += scount; + } + +#if NCCL_MAJOR < 2 || NCCL_MINOR < 7 + printf("NCCL 2.7 or later is needed for alltoallv. 
This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); + return testNcclError; +#else + NCCLCHECK(ncclGroupStart()); + for (int r=0; rcollTest = &alltoAllTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = ncclNumTypes; + run_types = test_types; + run_typenames = test_typenames; + } + + for (int i=0; i locmax ) { locmax = delta; -#ifdef DEBUG_PRINT +#if 0 if (delta > .1) printf("Error at %d/%ld : %f != %f\n", i, count, toFloat(A[i]), toFloat(B[i])); #endif } @@ -339,23 +339,30 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t #ifdef DEBUG_PRINT //if (rank == 0) { - int *expectedHost = (int *)malloc(args->expectedBytes); - int *dataHost = (int *)malloc(args->expectedBytes); + int *expectedHost = (int *)malloc(args->expectedBytes); + int *dataHost = (int *)malloc(args->expectedBytes); - hipMemcpy(expectedHost, args->expected[rank], args->expectedBytes, hipMemcpyDeviceToHost); - printf("\n Rank [%d] Expected: ", rank); - for(int j=0; jexpectedBytes/sizeof(int); j++) { - printf("%d:%d ", j, expectedHost[j]); - } - hipMemcpy(dataHost, data, args->expectedBytes, hipMemcpyDeviceToHost); - printf("\n Rank [%d] Actual: ", rank); - for (int j=0; jexpectedBytes/sizeof(int); j++) { - printf("%d:%d ", j, dataHost[j]); - } - printf("\n"); - free(expectedHost); - free(dataHost); - } + hipMemcpy(expectedHost, args->expected[rank], args->expectedBytes, hipMemcpyDeviceToHost); + hipMemcpy(dataHost, data, args->expectedBytes, hipMemcpyDeviceToHost); + int j, k, l; + for (j=0; jexpectedBytes/sizeof(int); j++) + if (expectedHost[j] != dataHost[j]) break; + k = j; + for (; jexpectedBytes/sizeof(int); j++) + if (expectedHost[j] == dataHost[j]) break; + l = j; + printf("\n Rank [%d] Expected: ", rank); + for (j=k; jexpectedBytes/sizeof(int) && jexpectedBytes/sizeof(int) && 
jnProcs*args->nThreads*args->nGpus; From 0d1940e18eafa337f558b9d6410399eb3d97f96d Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Thu, 17 Sep 2020 14:54:12 -0700 Subject: [PATCH 058/233] Prioritize NCCL_HOME --- src/Makefile | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/Makefile b/src/Makefile index 5b7ffa965c..260e98a282 100644 --- a/src/Makefile +++ b/src/Makefile @@ -14,13 +14,19 @@ DEBUG ?= 0 HIPCC = $(ROCM_HOME)/hip/bin/hipcc CXX = $(HIPCC) - HIPCUFLAGS := -std=c++14 +LDFLAGS := +HIPLDFLAGS := + +ifneq ($(NCCL_HOME), "") +HIPCUFLAGS += -I$(NCCL_HOME) +HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) +endif HIPCUFLAGS += -I$(ROCM_HOME)/include HIPCUFLAGS += -I$(ROCM_HOME)/include/rccl HIPCUFLAGS += -I$(ROCM_HOME)/hip/include/hip -LDFLAGS := -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt -HIPLDFLAGS := $(CUSTOM_RCCL_LIB) -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt +LDFLAGS += -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt +HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt ifeq ($(DEBUG), 0) HIPCUFLAGS += -O3 @@ -35,10 +41,6 @@ endif .PHONY: build clean BUILDDIR ?= ../build -ifneq ($(NCCL_HOME), "") -HIPCUFLAGS += -I$(NCCL_HOME)/include/ -HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME)/lib -L$(NCCL_HOME)/lib -endif ifeq ($(MPI), 1) HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi From 3f1dfacc9503434560d6e16551865b7d8d816a96 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Thu, 10 Sep 2020 22:31:19 +0000 Subject: [PATCH 059/233] Add test for alltoallv API --- src/alltoallv.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/alltoallv.cu b/src/alltoallv.cu index 7993059441..30577fea43 100644 --- a/src/alltoallv.cu +++ b/src/alltoallv.cu @@ -122,6 +122,9 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl #if NCCL_MAJOR < 2 || NCCL_MINOR < 7 printf("NCCL 2.7 or later is needed for alltoallv. 
This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); return testNcclError; +#else +#if defined(RCCL_ALLTOALLV) && defined(USE_RCCL_GATHER_SCATTER) + NCCLCHECK(ncclAllToAllv(sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls, type, comm, stream)); #else NCCLCHECK(ncclGroupStart()); for (int r=0; r Date: Fri, 25 Sep 2020 18:06:09 +0000 Subject: [PATCH 060/233] Fix build error --- src/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Makefile b/src/Makefile index 260e98a282..f4e1f805c0 100644 --- a/src/Makefile +++ b/src/Makefile @@ -10,6 +10,7 @@ MPI_HOME ?= /usr/lib/openmpi PREFIX ?= /usr/local VERBOSE ?= 0 DEBUG ?= 0 +NCCL_HOME ?= "" HIPCC = $(ROCM_HOME)/hip/bin/hipcc CXX = $(HIPCC) From bf4a866109667bea8c7fa142d940ef00f46fc2d6 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Mon, 19 Oct 2020 14:06:23 -0400 Subject: [PATCH 061/233] Uses nullptr as send buffer for non-root ranks during scatter --- src/scatter.cu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/scatter.cu b/src/scatter.cu index 4dbbda25a3..b18ed382ce 100644 --- a/src/scatter.cu +++ b/src/scatter.cu @@ -86,7 +86,10 @@ testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDa return testNcclError; #else #if defined(RCCL_GATHER_SCATTER) && defined(USE_RCCL_GATHER_SCATTER) - NCCLCHECK(ncclScatter(sendbuff, recvbuff, count, type, root, comm, stream)); + if (rank == root) + NCCLCHECK(ncclScatter(sendbuff, recvbuff, count, type, root, comm, stream)); + else + NCCLCHECK(ncclScatter(0, recvbuff, count, type, root, comm, stream)); #else NCCLCHECK(ncclGroupStart()); if (rank == root) { From e3f9e281f1a03abf983b81e1a1804ef7214dee6b Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Mon, 19 Oct 2020 14:43:01 -0700 Subject: [PATCH 062/233] Fix mpich linking option --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index f4e1f805c0..8b33b66b13 100644 --- a/src/Makefile +++ 
b/src/Makefile @@ -48,7 +48,7 @@ HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi else ifeq ($(MPICH), 1) HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -HIPLDFLAGS += -L/usr/lib -lmpi +HIPLDFLAGS += -L/usr/lib -lmpich endif LIBRARIES += rccl From d310466d882948cf44fc2a75ee419ce2a6958ce3 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Mon, 14 Dec 2020 18:01:04 -0500 Subject: [PATCH 063/233] Fix alltoallv test --- src/alltoallv.cu | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/alltoallv.cu b/src/alltoallv.cu index 30577fea43..fb6d0acde8 100644 --- a/src/alltoallv.cu +++ b/src/alltoallv.cu @@ -101,7 +101,7 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl int rank; NCCLCHECK(ncclCommUserRank(comm, &rank)); #define MAX_ALLTOALLV_RANKS 256 - static size_t sendcounts[MAX_ALLTOALLV_RANKS], recvcounts[MAX_ALLTOALLV_RANKS], sdispls[MAX_ALLTOALLV_RANKS], rdispls[MAX_ALLTOALLV_RANKS]; + static size_t sendcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], recvcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], sdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], rdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS]; if (count == 0) return testSuccess; if (nranks > MAX_ALLTOALLV_RANKS) { printf("Number of ranks %d exceeds limit %d\n", nranks, MAX_ALLTOALLV_RANKS); @@ -114,9 +114,10 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl size_t scount = ((i+rank)%nranks)*chunksize; if (i+rank == nranks-1) scount += (count*nranks-chunksize*(nranks-1)*nranks/2); - sendcounts[i] = recvcounts[i] = scount; - sdispls[i] = rdispls[i] = disp; + sendcounts[i+rank*MAX_ALLTOALLV_RANKS] = recvcounts[i+rank*MAX_ALLTOALLV_RANKS] = scount; + sdispls[i+rank*MAX_ALLTOALLV_RANKS] = rdispls[i+rank*MAX_ALLTOALLV_RANKS] = disp; disp += scount; + //printf("%d->%d: sendcounts/recvcounts %lx sdispls/rdispls %lx\n", rank, i, 
sendcounts[i+rank*MAX_ALLTOALLV_RANKS]*wordSize(type), sdispls[i+rank*MAX_ALLTOALLV_RANKS]*wordSize(type)); } #if NCCL_MAJOR < 2 || NCCL_MINOR < 7 @@ -124,23 +125,23 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl return testNcclError; #else #if defined(RCCL_ALLTOALLV) && defined(USE_RCCL_GATHER_SCATTER) - NCCLCHECK(ncclAllToAllv(sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls, type, comm, stream)); + NCCLCHECK(ncclAllToAllv(sendbuff, sendcounts+rank*MAX_ALLTOALLV_RANKS, sdispls+rank*MAX_ALLTOALLV_RANKS, recvbuff, recvcounts+rank*MAX_ALLTOALLV_RANKS, rdispls+rank*MAX_ALLTOALLV_RANKS, type, comm, stream)); #else NCCLCHECK(ncclGroupStart()); for (int r=0; r Date: Tue, 15 Dec 2020 22:05:50 -0500 Subject: [PATCH 064/233] Add support for testing memory allocated with hipMallocManaged --- src/common.cu | 9 +++++++-- src/common.h | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/common.cu b/src/common.cu index 23d884cb9b..401ba46c2b 100644 --- a/src/common.cu +++ b/src/common.cu @@ -32,7 +32,7 @@ const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "dou #endif ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin}; const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min"}; -const char *test_memorytypes[nccl_NUM_MTYPES] = {"coarse", "fine", "host"}; +const char *test_memorytypes[nccl_NUM_MTYPES] = {"coarse", "fine", "host", "managed"}; thread_local int is_main_thread = 0; @@ -655,6 +655,11 @@ testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, s HIPCHECK(hipHostMalloc(recvbuff, nbytes)); HIPCHECK(hipHostMalloc(expected, recvBytes)); } + else if (memorytype == ncclManaged) { + HIPCHECK(hipMallocManaged(sendbuff, nbytes)); + HIPCHECK(hipMallocManaged(recvbuff, nbytes)); + HIPCHECK(hipMallocManaged(expected, recvBytes)); + } else { HIPCHECK(hipMalloc(sendbuff, nbytes)); HIPCHECK(hipMalloc(recvbuff, nbytes)); @@ -813,7 +818,7 
@@ int main(int argc, char* argv[]) { "[-d,--datatype ] \n\t" "[-r,--root ] \n\t" "[-z,--blocking <0/1>] \n\t" - "[-y,--memory_type ] \n\t" + "[-y,--memory_type ] \n\t" "[-s,--stress_cycles ] \n\t" "[-u,--cumask ] \n\t" "[-h,--help]\n", diff --git a/src/common.h b/src/common.h index 8de2efaa4b..a498cce8e6 100644 --- a/src/common.h +++ b/src/common.h @@ -212,7 +212,8 @@ extern const char *test_opnames[ncclNumOps]; typedef enum { ncclCoarse = 0, ncclFine = 1, ncclHost = 2, - nccl_NUM_MTYPES = 3 } ncclMemoryType_t; + ncclManaged = 3, + nccl_NUM_MTYPES = 4 } ncclMemoryType_t; extern const char *test_memorytypes[nccl_NUM_MTYPES]; static int ncclstringtotype(char *str) { From da67a81c8e43496e442931ccacf5fc3fd1b4e91e Mon Sep 17 00:00:00 2001 From: Jithin Jose Date: Fri, 18 Dec 2020 10:12:54 -0800 Subject: [PATCH 065/233] Use DJB2a hash algorithm in getHostHash() --- src/common.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common.h b/src/common.h index a2d7ae2958..0fb5aa4467 100644 --- a/src/common.h +++ b/src/common.h @@ -165,10 +165,10 @@ static void getHostName(char* hostname, int maxlen) { #include static uint64_t getHostHash(const char* string) { - // Based on DJB2, result = result * 33 + char + // Based on DJB2a, result = result * 33 ^ char uint64_t result = 5381; for (int c = 0; string[c] != '\0'; c++){ - result = ((result << 5) + result) + string[c]; + result = ((result << 5) + result) ^ string[c]; } return result; } From ae1ce98e69dfec377261ad168214dfc8d47aa996 Mon Sep 17 00:00:00 2001 From: David Addison Date: Mon, 4 Jan 2021 11:37:32 -0800 Subject: [PATCH 066/233] Add boot_id to the hostname hash due to collisions on Azure Fixes #60 --- src/common.h | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/src/common.h b/src/common.h index 0fb5aa4467..865ee258cd 100644 --- a/src/common.h +++ b/src/common.h @@ -164,15 +164,46 @@ static void getHostName(char* hostname, int maxlen) { 
#include -static uint64_t getHostHash(const char* string) { +static uint64_t getHash(const char* string, size_t n) { // Based on DJB2a, result = result * 33 ^ char uint64_t result = 5381; - for (int c = 0; string[c] != '\0'; c++){ + for (size_t c = 0; c < n; c++) { result = ((result << 5) + result) ^ string[c]; } return result; } +/* Generate a hash of the unique identifying string for this host + * that will be unique for both bare-metal and container instances + * Equivalent of a hash of; + * + * $(hostname)$(cat /proc/sys/kernel/random/boot_id) + * + */ +#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" +static uint64_t getHostHash(const char* hostname) { + char hostHash[1024]; + + // Fall back is the hostname if something fails + (void) strncpy(hostHash, hostname, sizeof(hostHash)); + int offset = strlen(hostHash); + + FILE *file = fopen(HOSTID_FILE, "r"); + if (file != NULL) { + char *p; + if (fscanf(file, "%ms", &p) == 1) { + strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); + free(p); + } + } + fclose(file); + + // Make sure the string is terminated + hostHash[sizeof(hostHash)-1]='\0'; + + return getHash(hostHash, strlen(hostHash)); +} + static size_t wordSize(ncclDataType_t type) { switch(type) { case ncclChar: From e5f1482efb91c1e0d505ec9b61070dc9f5b60e28 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Mon, 4 Jan 2021 16:51:16 -0500 Subject: [PATCH 067/233] Add tests code that can print info and reset input/output buffers --- src/common.cu | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/common.cu b/src/common.cu index 401ba46c2b..6363899965 100644 --- a/src/common.cu +++ b/src/common.cu @@ -659,6 +659,11 @@ testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, s HIPCHECK(hipMallocManaged(sendbuff, nbytes)); HIPCHECK(hipMallocManaged(recvbuff, nbytes)); HIPCHECK(hipMallocManaged(expected, recvBytes)); +#if 0 + HIPCHECK(hipMemset(*sendbuff, 0, nbytes)); + HIPCHECK(hipMemset(*recvbuff, 0, nbytes)); + 
HIPCHECK(hipMemset(*expected, 0, recvBytes)); +#endif } else { HIPCHECK(hipMalloc(sendbuff, nbytes)); @@ -910,6 +915,7 @@ testResult_t run() { for (int i=0; i Date: Wed, 20 Jan 2021 17:08:40 -0800 Subject: [PATCH 068/233] Do not allocate memory for expected buffer if checking disabled This allows the tests to be run with larger buffers --- src/common.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/common.cu b/src/common.cu index 19129d66ec..ff4e1fd857 100644 --- a/src/common.cu +++ b/src/common.cu @@ -559,7 +559,7 @@ testResult_t threadLaunch(struct testThread* thread) { testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { CUDACHECK(cudaMalloc(sendbuff, nbytes)); CUDACHECK(cudaMalloc(recvbuff, nbytes)); - CUDACHECK(cudaMalloc(expected, recvBytes)); + if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); return testSuccess; } @@ -764,7 +764,7 @@ testResult_t run() { for (int i=0; i Date: Wed, 3 Feb 2021 21:16:18 -0500 Subject: [PATCH 069/233] Revert "Allow call ncclCommAbort on Ctrl+C" This reverts commit 23c374475f0472a06b461ad5ba5d09b5312a1f3c. --- src/common.cu | 29 +---------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/src/common.cu b/src/common.cu index 6363899965..443c140fcc 100644 --- a/src/common.cu +++ b/src/common.cu @@ -13,7 +13,6 @@ #include #include #include -#include #include //#define DEBUG_PRINT @@ -54,7 +53,6 @@ static int parallel_init = 0; static int blocking_coll = 0; static int memorytype = 0; static int stress_cycles = 1; -static ncclResult_t ncclabort = ncclSuccess; static uint32_t cumask[4]; double parsesize(char *value) { @@ -371,21 +369,6 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t return testSuccess; } -void INThandler(int sig) { - char c; - - signal(sig, SIG_IGN); - printf("\nDo you want to call ncclCommAbort before exit? 
[y/n] "); - c = getchar(); - if (c == 'y' || c == 'Y') { - ncclabort = ncclSystemError; - signal(SIGINT, INThandler); - } - else - exit (0); - getchar(); // Get new line character -} - testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* comms) { hipError_t hipErr; int remaining = ngpus; @@ -411,17 +394,13 @@ testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* if (comms) { ncclResult_t ncclAsyncErr; NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); - if (ncclAsyncErr != ncclSuccess || ncclabort != ncclSuccess) { + if (ncclAsyncErr != ncclSuccess) { // An asynchronous error happened. Stop the operation and destroy // the communicator for (int i=0; i= 2 -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) - // may call ncclCommAbort - signal(SIGINT, INThandler); -#endif -#endif // Make sure everyline is flushed so that we see the progress of the test setlinebuf(stdout); From e37545e4911c210558baba789941ea7bf59db00d Mon Sep 17 00:00:00 2001 From: David Addison Date: Mon, 15 Mar 2021 14:44:06 -0700 Subject: [PATCH 070/233] Add support for new datatype: bfloat16 --- src/common.cu | 41 +++++++++++++++++++++++++++++++++++++---- src/common.h | 3 +++ 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/src/common.cu b/src/common.cu index ff4e1fd857..4589593b07 100644 --- a/src/common.cu +++ b/src/common.cu @@ -12,8 +12,16 @@ #include "cuda.h" #if NCCL_MAJOR >= 2 -ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble}; -const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"}; +ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble, +#if defined(__CUDA_BF16_TYPES_EXIST__) + ncclBfloat16 +#endif +}; +const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", 
"uint32", "int64", "uint64", "half", "float", "double", +#if defined(__CUDA_BF16_TYPES_EXIST__) + "bfloat16" +#endif +}; #else ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; @@ -61,6 +69,9 @@ double parsesize(char *value) { double DeltaMaxValue(ncclDataType_t type) { switch(type) { case ncclHalf: return 1e-2; +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: return 1e-2; +#endif case ncclFloat: return 1e-5; case ncclDouble: return 1e-12; case ncclInt: @@ -95,6 +106,12 @@ template<> __device__ float toFloat(half a) { return __half2float(a); } +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> __device__ +float toFloat(__nv_bfloat16 a) { + return __bfloat162float(a); +} +#endif template __global__ void deltaKern(void* A_, void* B_, size_t count, double* max) { @@ -128,6 +145,10 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) { testResult_t CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) { switch (type) { +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + deltaKern<__nv_bfloat16, 512><<<1, 512>>>(results, expected, count, devmax); break; +#endif case ncclHalf: deltaKern<<<1, 512>>>(results, expected, count, devmax); break; case ncclFloat: @@ -174,6 +195,12 @@ template<> __device__ half testValue(const size_t offset, const int rep, const int rank) { return __float2half(testValue(offset, rep, rank)); } +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> +__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) { + return __float2bfloat16(testValue(offset, rep, rank)); +} +#endif // Operations template @@ -210,7 +237,10 @@ __global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offse #define OPS(type) KERN(type, ncclOpSum), KERN(type, ncclOpProd), 
KERN(type, ncclOpMax), KERN(type, ncclOpMin) static void* const redInitDataKerns[ncclNumOps*ncclNumTypes] = { - OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double) + OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), +#if defined(__CUDA_BF16_TYPES_EXIST__) + OPS(__nv_bfloat16) +#endif }; testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) { @@ -236,7 +266,10 @@ static void* const initDataKerns[ncclNumTypes] = { (void*)InitDataKernel, (void*)InitDataKernel< half>, (void*)InitDataKernel< float>, - (void*)InitDataKernel< double> + (void*)InitDataKernel< double>, +#if defined(__CUDA_BF16_TYPES_EXIST__) + (void*)InitDataKernel<__nv_bfloat16>, +#endif }; template diff --git a/src/common.h b/src/common.h index 865ee258cd..c869254669 100644 --- a/src/common.h +++ b/src/common.h @@ -213,6 +213,9 @@ static size_t wordSize(ncclDataType_t type) { #endif return 1; case ncclHalf: +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: +#endif //case ncclFloat16: return 2; case ncclInt: From 5373e3c6307d64711c8b03b86b9eafd7d9d45bbd Mon Sep 17 00:00:00 2001 From: Stanley Tsang Date: Tue, 16 Mar 2021 20:38:13 +0000 Subject: [PATCH 071/233] Disabling host and fine memory types for CI testing --- .jenkins/common.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy index c893f9fd8e..14c644b026 100644 --- a/.jenkins/common.groovy +++ b/.jenkins/common.groovy @@ -27,7 +27,7 @@ def runTestCommand (platform, project) def command = """#!/usr/bin/env bash set -x cd ${project.paths.project_build_prefix} - python3 -m pytest -k "not MPI" --verbose --junitxml=./testreport.xml + python3 -m pytest -k "not MPI and not host and not fine" --verbose --junitxml=./testreport.xml """ 
platform.runCommand(this, command) From 0fccaec26f25e13f78c6d3cc1e4ba30c2c363451 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Fri, 16 Apr 2021 18:23:28 -0400 Subject: [PATCH 072/233] Update mpich include path --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index 8b33b66b13..fa506c567a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -47,7 +47,7 @@ ifeq ($(MPI), 1) HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi else ifeq ($(MPICH), 1) -HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich +HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -I/usr/include/x86_64-linux-gnu/mpich HIPLDFLAGS += -L/usr/lib -lmpich endif From e12c35d84b026acc5fb573f7ac5a732430eedd32 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Thu, 27 May 2021 09:12:52 -0700 Subject: [PATCH 073/233] Update PERFORMANCE.md --- doc/PERFORMANCE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/PERFORMANCE.md b/doc/PERFORMANCE.md index 7cc6ecee66..21fef609af 100644 --- a/doc/PERFORMANCE.md +++ b/doc/PERFORMANCE.md @@ -46,7 +46,7 @@ A tree would do it hierarchically : `(((((i_{n-1} + i_{n-2}) + (i_{n-3} + i_{n-4})) + ... + (i_1 + i_0))))) -> o_0 -> (o_{n/2} -> (o_{3n/4} ...))` -In all cases, we need n-1 additions and n assignations for each element. Since every step is on a different rank except potentially one (the last input and the first output), +In all cases, we need n-1 additions and n assignments for each element. Since every step is on a different rank except potentially one (the last input and the first output), we need 2(n-1) data transfers (x number of elements) to perform an allReduce operation. 
Considering that each rank has a bandwidth to the outside world of _B_, the time to perform an allReduce operation of _S_ elements is at best : @@ -82,7 +82,7 @@ Note that here, S is the size in bytes of the total array, which for NCCL is equ ### AllGather -The AllGather operation requires only to perform the assignation part of the allReduce operation : +The AllGather operation requires only to perform the assignment part of the allReduce operation : `o_0 = o_1 = o_2 = ... = o_{n-1} = i_K` From c4de829d9131d83e4b0ca5c08cd9a8eca2dfc289 Mon Sep 17 00:00:00 2001 From: Greg Inozemtsev Date: Wed, 2 Jun 2021 17:52:11 -0700 Subject: [PATCH 074/233] Cleanup argument error handling and messages Add error checking for minbytes and maxbytes arguments Also accept lowercase literals when parsing size arguments and print errors and usage on stderr. --- src/common.cu | 69 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 17 deletions(-) diff --git a/src/common.cu b/src/common.cu index ff4e1fd857..25fc7dac7a 100644 --- a/src/common.cu +++ b/src/common.cu @@ -40,22 +40,40 @@ static int ncclroot = 0; static int parallel_init = 0; static int blocking_coll = 0; -double parsesize(char *value) { +static double parsesize(const char *value) { long long int units; double size; + char size_lit; - if (strchr(value, 'G') != NULL) { - units=1024*1024*1024; - } else if (strchr(value, 'M') != NULL) { - units=1024*1024; - } else if (strchr(value, 'K') != NULL) { - units=1024; - } else { - units=1; + int count = sscanf(value, "%lf %1s", &size, &size_lit); + + switch (count) { + case 2: + switch (size_lit) { + case 'G': + case 'g': + units = 1024*1024*1024; + break; + case 'M': + case 'm': + units = 1024*1024; + break; + case 'K': + case 'k': + units = 1024; + break; + default: + return -1.0; + }; + break; + case 1: + units = 1; + break; + default: + return -1.0; } - size = atof(value)*units; - return size; + return size * units; } double 
DeltaMaxValue(ncclDataType_t type) { @@ -570,6 +588,7 @@ int main(int argc, char* argv[]) { setlinebuf(stdout); // Parse args + double parsed; int longindex; static struct option longopts[] = { {"nthreads", required_argument, 0, 't'}, @@ -605,10 +624,20 @@ int main(int argc, char* argv[]) { nGpus = strtol(optarg, NULL, 0); break; case 'b': - minBytes = (size_t)parsesize(optarg); + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'minbytes'\n"); + return -1; + } + minBytes = (size_t)parsed; break; case 'e': - maxBytes = (size_t)parsesize(optarg); + parsed = parsesize(optarg); + if (parsed < 0) { + fprintf(stderr, "invalid size specified for 'maxbytes'\n"); + return -1; + } + maxBytes = (size_t)parsed; break; case 'i': stepBytes = strtol(optarg, NULL, 0); @@ -623,7 +652,7 @@ int main(int argc, char* argv[]) { #if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2 agg_iters = (int)strtol(optarg, NULL, 0); #else - printf("Option -m not supported before NCCL 2.2. Ignoring\n"); + fprintf(stderr, "Option -m not supported before NCCL 2.2. 
Ignoring\n"); #endif break; case 'w': @@ -648,7 +677,7 @@ int main(int argc, char* argv[]) { blocking_coll = strtol(optarg, NULL, 0); break; case 'h': - printf("USAGE: %s \n\t" + fprintf(stderr, "USAGE: %s \n\t" "[-t,--nthreads ] \n\t" "[-g,--ngpus ] \n\t" "[-b,--minbytes ] \n\t" @@ -668,8 +697,8 @@ int main(int argc, char* argv[]) { basename(argv[0])); return 0; default: - printf("invalid option \n"); - printf("USAGE: %s \n\t" + fprintf(stderr, "invalid option \n"); + fprintf(stderr, "USAGE: %s \n\t" "[-t,--nthreads ] \n\t" "[-g,--ngpus ] \n\t" "[-b,--minbytes ] \n\t" @@ -690,6 +719,12 @@ int main(int argc, char* argv[]) { return 0; } } + if (minBytes > maxBytes) { + fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", + (unsigned long long)minBytes, + (unsigned long long)maxBytes); + return -1; + } #ifdef MPI_SUPPORT MPI_Init(&argc, &argv); #endif From cde7e769c1879a77daddebe9da164513e030105b Mon Sep 17 00:00:00 2001 From: David Addison Date: Thu, 17 Jun 2021 14:08:43 -0700 Subject: [PATCH 075/233] Add support for ncclAvg operation --- src/all_gather.cu | 2 +- src/all_reduce.cu | 4 +- src/alltoall.cu | 2 +- src/broadcast.cu | 2 +- src/common.cu | 92 ++++++++++++++++++++++++++++++++++++------- src/common.h | 5 ++- src/reduce.cu | 4 +- src/reduce_scatter.cu | 4 +- 8 files changed, 90 insertions(+), 25 deletions(-) diff --git a/src/all_gather.cu b/src/all_gather.cu index f5bc44c57d..ee1d0ea0b9 100644 --- a/src/all_gather.cu +++ b/src/all_gather.cu @@ -84,7 +84,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t run_types = &type; run_typenames = &typeName; } else { - type_count = ncclNumTypes; + type_count = test_typenum; run_types = test_types; run_typenames = test_typenames; } diff --git a/src/all_reduce.cu b/src/all_reduce.cu index bd8daaf0a2..52dce8993c 100644 --- a/src/all_reduce.cu +++ b/src/all_reduce.cu @@ -83,7 +83,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t 
run_types = &type; run_typenames = &typeName; } else { - type_count = ncclNumTypes; + type_count = test_typenum; run_types = test_types; run_typenames = test_typenames; } @@ -93,7 +93,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t run_ops = &op; run_opnames = &opName; } else { - op_count = ncclNumOps; + op_count = test_opnum; run_ops = test_ops; run_opnames = test_opnames; } diff --git a/src/alltoall.cu b/src/alltoall.cu index 31cfca090d..4afd3eb947 100644 --- a/src/alltoall.cu +++ b/src/alltoall.cu @@ -102,7 +102,7 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t run_types = &type; run_typenames = &typeName; } else { - type_count = ncclNumTypes; + type_count = test_typenum; run_types = test_types; run_typenames = test_typenames; } diff --git a/src/broadcast.cu b/src/broadcast.cu index c62a99ff62..f7c0094864 100644 --- a/src/broadcast.cu +++ b/src/broadcast.cu @@ -92,7 +92,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t run_types = &type; run_typenames = &typeName; } else { - type_count = ncclNumTypes; + type_count = test_typenum; run_types = test_types; run_typenames = test_typenames; } diff --git a/src/common.cu b/src/common.cu index 4589593b07..1313079e79 100644 --- a/src/common.cu +++ b/src/common.cu @@ -11,23 +11,41 @@ #include #include "cuda.h" +int test_ncclVersion = 0; // init'd with ncclGetVersion() + #if NCCL_MAJOR >= 2 ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble, -#if defined(__CUDA_BF16_TYPES_EXIST__) +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) ncclBfloat16 #endif }; const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double", -#if defined(__CUDA_BF16_TYPES_EXIST__) +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= 
NCCL_VERSION(2,10,0) "bfloat16" #endif }; + +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) +int test_typenum = 10; +#else +int test_typenum = 9; +#endif + #else ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; +int test_typenum = 7; #endif + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) +ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin, ncclAvg}; +const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min", "avg"}; +int test_opnum = 5; +#else ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin}; const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min"}; +int test_opnum = 4; +#endif thread_local int is_main_thread = 0; @@ -126,7 +144,7 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) { if( delta > locmax ) { locmax = delta; #ifdef DEBUG_PRINT - if (delta > .1) printf("Error at %d/%ld : %f != %f\n", i, count, toFloat(A[i]), toFloat(B[i])); + if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i])); #endif } } @@ -222,23 +240,48 @@ __device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float( template<> __device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? 
a : b; } -template +template +__device__ T ncclPostOpIdent(T x, int n) { return x; } + +template +__device__ T ncclPostOpDiv(T x, int n) { return x/n; } +template<> +__device__ half ncclPostOpDiv(half x, int n) { return __float2half(__half2float(x)/n); } +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> +__device__ __nv_bfloat16 ncclPostOpDiv<__nv_bfloat16>(__nv_bfloat16 x, int n) { return __float2bfloat16(__bfloat162float(x)/n); } +#endif + +template __global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o+offset, rep, 0); for (int i=1; i(o+offset, rep, i)); } - data[o] = val; + data[o] = PostOp(val, nranks); } } -#define KERN(type, op) (void*)InitDataReduceKernel> -#define OPS(type) KERN(type, ncclOpSum), KERN(type, ncclOpProd), KERN(type, ncclOpMax), KERN(type, ncclOpMin) +#define KERN(type, op, postop) (void*)InitDataReduceKernel, postop > +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPostOpIdent), \ + KERN(type, ncclOpProd, ncclPostOpIdent), \ + KERN(type, ncclOpMax, ncclPostOpIdent), \ + KERN(type, ncclOpMin, ncclPostOpIdent), \ + KERN(type, ncclOpSum/*Avg*/, ncclPostOpDiv) +#else + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPostOpIdent), \ + KERN(type, ncclOpProd, ncclPostOpIdent), \ + KERN(type, ncclOpMax, ncclPostOpIdent), \ + KERN(type, ncclOpMin, ncclPostOpIdent) +#endif static void* const redInitDataKerns[ncclNumOps*ncclNumTypes] = { OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), -#if defined(__CUDA_BF16_TYPES_EXIST__) +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) OPS(__nv_bfloat16) #endif }; @@ -267,7 +310,7 @@ static void* const initDataKerns[ncclNumTypes] = { (void*)InitDataKernel< half>, (void*)InitDataKernel< float>, (void*)InitDataKernel< double>, -#if 
defined(__CUDA_BF16_TYPES_EXIST__) +#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) (void*)InitDataKernel<__nv_bfloat16>, #endif }; @@ -367,7 +410,7 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* if (cudaErr != cudaErrorNotReady) CUDACHECK(cudaErr); #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) - if (comms) { + if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) { ncclResult_t ncclAsyncErr; NCCLCHECK(ncclCommGetAsyncError(comms[i], &ncclAsyncErr)); if (ncclAsyncErr != ncclSuccess) { @@ -602,6 +645,17 @@ int main(int argc, char* argv[]) { // Make sure everyline is flushed so that we see the progress of the test setlinebuf(stdout); + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0) + ncclGetVersion(&test_ncclVersion); + #else + test_ncclVersion = NCCL_VERSION_CODE; + #endif + //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion); + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion < NCCL_VERSION(2,10,0)) { + test_opnum -= 1; // exclude ncclAvg + test_typenum -= 1; // exclude bfloat16 + } + // Parse args int longindex; static struct option longopts[] = { @@ -653,7 +707,7 @@ int main(int argc, char* argv[]) { iters = (int)strtol(optarg, NULL, 0); break; case 'm': -#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2 +#if NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 2) agg_iters = (int)strtol(optarg, NULL, 0); #else printf("Option -m not supported before NCCL 2.2. 
Ignoring\n"); @@ -693,7 +747,11 @@ int main(int argc, char* argv[]) { "[-w,--warmup_iters ] \n\t" "[-p,--parallel_init <0/1>] \n\t" "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + "[-o,--op ] \n\t" +#else "[-o,--op ] \n\t" +#endif "[-d,--datatype ] \n\t" "[-r,--root ] \n\t" "[-z,--blocking <0/1>] \n\t" @@ -701,8 +759,8 @@ int main(int argc, char* argv[]) { basename(argv[0])); return 0; default: - printf("invalid option \n"); - printf("USAGE: %s \n\t" + if (c != 'h') printf("invalid option '%c'\n", c); + printf("USAGE: %s \n\t" "[-t,--nthreads ] \n\t" "[-g,--ngpus ] \n\t" "[-b,--minbytes ] \n\t" @@ -714,7 +772,11 @@ int main(int argc, char* argv[]) { "[-w,--warmup_iters ] \n\t" "[-p,--parallel_init <0/1>] \n\t" "[-c,--check <0/1>] \n\t" +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + "[-o,--op ] \n\t" +#else "[-o,--op ] \n\t" +#endif "[-d,--datatype ] \n\t" "[-r,--root ] \n\t" "[-z,--blocking <0/1>] \n\t" @@ -899,8 +961,8 @@ testResult_t run() { // Free off CUDA allocated memory for (int i=0; i Date: Mon, 28 Jun 2021 10:12:34 -0700 Subject: [PATCH 076/233] Fixed formatting for bfloat16 support --- src/all_gather.cu | 8 ++++---- src/all_reduce.cu | 8 ++++---- src/alltoall.cu | 8 ++++---- src/broadcast.cu | 8 ++++---- src/reduce.cu | 8 ++++---- src/reduce_scatter.cu | 8 ++++---- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/all_gather.cu b/src/all_gather.cu index ee1d0ea0b9..0b9e0cc939 100644 --- a/src/all_gather.cu +++ b/src/all_gather.cu @@ -8,15 +8,15 @@ #include "common.h" void print_header() { - PRINT("# %10s %12s %6s out-of-place in-place \n", "", "", ""); - PRINT("# %10s %12s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", + PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", ""); + PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %6s %7s %6s %6s %5s %7s %6s %6s 
%5s\n", "(B)", "(elements)", "", + PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %6s", size, count, typeName); + PRINT("%12li %12li %8s", size, count, typeName); } void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { diff --git a/src/all_reduce.cu b/src/all_reduce.cu index 52dce8993c..9b6b7f02b9 100644 --- a/src/all_reduce.cu +++ b/src/all_reduce.cu @@ -8,15 +8,15 @@ #include "common.h" void print_header() { - PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %6s %6s", size, count, typeName, opName); + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); } void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { diff --git a/src/alltoall.cu b/src/alltoall.cu index 4afd3eb947..865099743d 100644 --- a/src/alltoall.cu +++ b/src/alltoall.cu @@ -8,15 +8,15 @@ #include "common.h" 
void print_header() { - PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %6s %6s", size, count, typeName, opName); + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); } void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { diff --git a/src/broadcast.cu b/src/broadcast.cu index f7c0094864..e2b4421ac5 100644 --- a/src/broadcast.cu +++ b/src/broadcast.cu @@ -8,15 +8,15 @@ #include "common.h" void print_header() { - PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root", + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root", "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t 
count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %6s %6i", size, count, typeName, root); + PRINT("%12li %12li %8s %6i", size, count, typeName, root); } void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { diff --git a/src/reduce.cu b/src/reduce.cu index e40b501b7e..278768881d 100644 --- a/src/reduce.cu +++ b/src/reduce.cu @@ -8,15 +8,15 @@ #include "common.h" void print_header() { - PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %6s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "root", + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "root", "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %6s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "", + PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "", "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %6s %6s %6i", size, count, typeName, opName, root); + PRINT("%12li %12li %8s %6s %6i", size, count, typeName, opName, root); } void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu index c6de434ebe..b0c4fab52e 100644 --- a/src/reduce_scatter.cu +++ b/src/reduce_scatter.cu @@ -8,15 +8,15 @@ #include "common.h" void print_header() { - PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s 
%6s %6s %5s\n", "size", "count", "type", "redop", + PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", + PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); } void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %6s %6s", size, count, typeName, opName); + PRINT("%12li %12li %8s %6s", size, count, typeName, opName); } void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { From e55ad3796d710adcf72778dca02559dc6c9706bb Mon Sep 17 00:00:00 2001 From: David Addison Date: Mon, 28 Jun 2021 14:19:45 -0700 Subject: [PATCH 077/233] Added support for CUDA graph capture/replay (-G) --- src/common.cu | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/src/common.cu b/src/common.cu index 1313079e79..c180294644 100644 --- a/src/common.cu +++ b/src/common.cu @@ -65,6 +65,7 @@ static int nccltype = ncclFloat; static int ncclroot = 0; static int parallel_init = 0; static int blocking_coll = 0; +static int cudaGraphLaunches = 0; double parsesize(char *value) { long long int units; @@ -481,6 +482,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t Barrier(args); + cudaGraph_t graphs[args->nGpus]; + cudaGraphExec_t graphExec[args->nGpus]; + if (cudaGraphLaunches >= 1) { + // Begin cuda graph capture + for (int i=0; inGpus; i++) { + CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? 
cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); + } + } + // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); for (int iter = 0; iter < iters; iter++) { @@ -490,11 +500,40 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); } + + if (cudaGraphLaunches >= 1) { + // End cuda graph capture + for (int i=0; inGpus; i++) { + CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); + } + // Instantiate cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); + } + // Resync CPU, restart timing, launch cuda graph + Barrier(args); + start = std::chrono::high_resolution_clock::now(); + for (int l=0; lnGpus; i++) { + CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); + } + } + } + TESTCHECK(completeColl(args)); auto delta = std::chrono::high_resolution_clock::now() - start; double deltaSec = std::chrono::duration_cast>(delta).count(); deltaSec = deltaSec/(iters*agg_iters); + if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; + + if (cudaGraphLaunches >= 1) { + //destroy cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphExecDestroy(graphExec[i])); + CUDACHECK(cudaGraphDestroy(graphs[i])); + } + } double algBw, busBw; args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); @@ -508,10 +547,41 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t // Initialize sendbuffs, recvbuffs and expected TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); + if (cudaGraphLaunches >= 1) { + // Begin cuda graph capture for data check + for (int i=0; inGpus; i++) { + CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); + } + } + //test validation in single itertion, should ideally be included into the multi-iteration run 
TESTCHECK(startColl(args, type, op, root, in_place, 0)); + + if (cudaGraphLaunches >= 1) { + // End cuda graph capture + for (int i=0; inGpus; i++) { + CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); + } + // Instantiate cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); + } + // Launch cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); + } + } + TESTCHECK(completeColl(args)); + if (cudaGraphLaunches >= 1) { + //destroy cuda graph + for (int i=0; inGpus; i++) { + CUDACHECK(cudaGraphExecDestroy(graphExec[i])); + CUDACHECK(cudaGraphDestroy(graphs[i])); + } + } + TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); //aggregate delta from all threads and procs @@ -674,12 +744,13 @@ int main(int argc, char* argv[]) { {"datatype", required_argument, 0, 'd'}, {"root", required_argument, 0, 'r'}, {"blocking", required_argument, 0, 'z'}, + {"cudagraph", required_argument, 0, 'G'}, {"help", no_argument, 0, 'h'} }; while(1) { int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:h", longopts, &longindex); + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:", longopts, &longindex); if (c == -1) break; @@ -734,6 +805,13 @@ int main(int argc, char* argv[]) { case 'z': blocking_coll = strtol(optarg, NULL, 0); break; + case 'G': +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030 + cudaGraphLaunches = strtol(optarg, NULL, 0); +#else + printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. 
Ignoring\n"); +#endif + break; case 'h': printf("USAGE: %s \n\t" "[-t,--nthreads ] \n\t" @@ -755,6 +833,7 @@ int main(int argc, char* argv[]) { "[-d,--datatype ] \n\t" "[-r,--root ] \n\t" "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph ] \n\t" "[-h,--help]\n", basename(argv[0])); return 0; @@ -780,6 +859,7 @@ int main(int argc, char* argv[]) { "[-d,--datatype ] \n\t" "[-r,--root ] \n\t" "[-z,--blocking <0/1>] \n\t" + "[-G,--cudagraph ] \n\t" "[-h,--help]\n", basename(argv[0])); return 0; From 9dae3d3a37a7505a9eb0622be4268e2d2a3cb5f9 Mon Sep 17 00:00:00 2001 From: David Addison Date: Mon, 28 Jun 2021 16:49:10 -0700 Subject: [PATCH 078/233] Added new tests: scatter, sendrecv, hypercube --- src/Makefile | 4 +- src/hypercube.cu | 124 +++++++++++++++++++++++++++++++++++++++++++++ src/scatter.cu | 125 ++++++++++++++++++++++++++++++++++++++++++++++ src/sendrecv.cu | 127 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 378 insertions(+), 2 deletions(-) create mode 100644 src/hypercube.cu create mode 100644 src/scatter.cu create mode 100644 src/sendrecv.cu diff --git a/src/Makefile b/src/Makefile index 52169bb3e1..26e653e7d6 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 
# # See LICENSE.txt for license information # @@ -70,7 +70,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall +BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter sendrecv hypercube BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src/hypercube.cu b/src/hypercube.cu new file mode 100644 index 0000000000..142f1a6359 --- /dev/null +++ b/src/hypercube.cu @@ -0,0 +1,124 @@ +/************************************************************************* + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "common.h" + +#define ALIGN 4 + +void print_header() { + PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", ""); + PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", + "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); + PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", + "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); +} + +void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { + PRINT("%12li %12li %8s", size, count, typeName); +} + +void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { + size_t base = (count/(ALIGN*nranks))*ALIGN; + *sendcount = base; + *recvcount = base*nranks; + *sendInplaceOffset = base; + *recvInplaceOffset = 0; + *paramcount = base; +} + +testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { + 
size_t sendcount = args->sendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + for (int j=0; jexpected[i])+args->sendBytes*j, sendcount, type, rep, j)); + } + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = 1; + *busBw = baseBw * factor; +} + +testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + char* sbuff = (char*)sendbuff; + char* rbuff = (char*)recvbuff; + int nRanks; + NCCLCHECK(ncclCommCount(comm, &nRanks)); + int rank; + NCCLCHECK(ncclCommUserRank(comm, &rank)); + size_t rankSize = count * wordSize(type); + + if (rbuff+rank*rankSize != sbuff) CUDACHECK(cudaMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, cudaMemcpyDeviceToDevice, stream)); + + // Hypercube AllGather + for (int mask=1; maskcollTest = &hyperCubeTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + for (int i=0; isendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / 
wordSize(type); + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(args->expected[i], recvcount, type, rep+rank*recvcount, root)); + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(nranks-1))/((double)(nranks)); + *busBw = baseBw * factor; +} + +testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + int nRanks; + NCCLCHECK(ncclCommCount(comm, &nRanks)); + int rank; + NCCLCHECK(ncclCommUserRank(comm, &rank)); + size_t rankOffset = count * wordSize(type); + if (count == 0) return testSuccess; + + NCCLCHECK(ncclGroupStart()); + if (rank == root) { + for (int r=0; rcollTest = &scatterTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + int begin_root, end_root; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if (root != -1) { + begin_root = end_root = root; + } else { + begin_root = 0; + end_root = args->nProcs*args->nThreads*args->nGpus-1; + } + + for (int i=0; isendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + 
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + int peer = (rank-1+nranks)%nranks; + TESTCHECK(InitData(args->expected[i], recvcount, type, rep, peer)); + CUDACHECK(cudaDeviceSynchronize()); + } + // We don't support in-place sendrecv + args->reportErrors = in_place ? 0 : 1; + return testSuccess; +} + +void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = 1; + *busBw = baseBw * factor; +} + +testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + int nRanks; + NCCLCHECK(ncclCommCount(comm, &nRanks)); + int rank; + NCCLCHECK(ncclCommUserRank(comm, &rank)); + int recvPeer = (rank-1+nRanks) % nRanks; + int sendPeer = (rank+1) % nRanks; + + NCCLCHECK(ncclGroupStart()); + NCCLCHECK(ncclSend(sendbuff, count, type, sendPeer, comm, stream)); + NCCLCHECK(ncclRecv(recvbuff, count, type, recvPeer, comm, stream)); + NCCLCHECK(ncclGroupEnd()); + return testSuccess; +} + +struct testColl sendRecvTest = { + "SendRecv", + SendRecvGetCollByteCount, + SendRecvInitData, + SendRecvGetBw, + SendRecvRunColl +}; + +void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { + size_t paramcount, sendInplaceOffset, recvInplaceOffset; + SendRecvGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks); +} + +testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t 
op, const char* opName) { + args->collTest = &sendRecvTest; + ncclDataType_t *run_types; + ncclRedOp_t *run_ops; + const char **run_typenames, **run_opnames; + int type_count, op_count; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if ((int)op != -1) { + op_count = 1; + run_ops = &op; + run_opnames = &opName; + } else { + op_count = test_opnum; + run_ops = test_ops; + run_opnames = test_opnames; + } + + for (int i=0; i Date: Mon, 28 Jun 2021 18:23:12 -0700 Subject: [PATCH 079/233] Resync with changes in gitilab-master code --- src/common.cu | 81 +++++++++++++++++++++++---------------------------- src/common.h | 5 ++-- 2 files changed, 40 insertions(+), 46 deletions(-) diff --git a/src/common.cu b/src/common.cu index c180294644..7aad2c1868 100644 --- a/src/common.cu +++ b/src/common.cu @@ -67,6 +67,8 @@ static int parallel_init = 0; static int blocking_coll = 0; static int cudaGraphLaunches = 0; +#define NUM_BLOCKS 32 + double parsesize(char *value) { long long int units; double size; @@ -137,9 +139,9 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) { const T* A = (const T*)A_; const T* B = (const T*)B_; __shared__ double temp[BSIZE]; - int tid = threadIdx.x; + int tid = blockIdx.x*blockDim.x + threadIdx.x; double locmax = 0.0; - for(int i=tid; i locmax ) { @@ -150,6 +152,7 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) { } } + tid = threadIdx.x; temp[tid] = locmax; for(int stride = BSIZE/2; stride > 1; stride>>=1) { __syncthreads(); @@ -158,38 +161,38 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) { } __syncthreads(); if( threadIdx.x == 0) - *max = temp[0] > temp[1] ? temp[0] : temp[1]; + max[blockIdx.x] = temp[0] > temp[1] ? 
temp[0] : temp[1]; } - -testResult_t CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) { +testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { switch (type) { #if defined(__CUDA_BF16_TYPES_EXIST__) case ncclBfloat16: - deltaKern<__nv_bfloat16, 512><<<1, 512>>>(results, expected, count, devmax); break; + deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; #endif case ncclHalf: - deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + deltaKern<<>>(results, expected, count, devmax); break; case ncclFloat: - deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + deltaKern<<>>(results, expected, count, devmax); break; case ncclDouble: - deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + deltaKern<<>>(results, expected, count, devmax); break; case ncclChar: #if NCCL_MAJOR >= 2 case ncclUint8: #endif - deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + deltaKern<<>>(results, expected, count, devmax); break; case ncclInt: #if NCCL_MAJOR >= 2 case ncclUint32: #endif - deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + deltaKern<<>>(results, expected, count, devmax); break; case ncclInt64: case ncclUint64: - deltaKern<<<1, 512>>>(results, expected, count, devmax); break; + deltaKern<<>>(results, expected, count, devmax); break; } CUDACHECK(cudaDeviceSynchronize()); + for (int i=1; isendBytes, args->expectedBytes); - size_t shift = (totalnbytes * iter) % args->maxbytes; - if (shift + totalnbytes > args->maxbytes) shift = 0; + size_t steps = totalnbytes ? 
args->maxbytes / totalnbytes : 1; + size_t shift = totalnbytes * (iter % steps); if (args->nGpus > 1) NCCLCHECK(ncclGroupStart()); for (int i = 0; i < args->nGpus; i++) { @@ -475,6 +478,10 @@ testResult_t completeColl(struct threadArgs* args) { testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { size_t count = args->nbytes / wordSize(type); + if (datacheck) { + // Initialize sendbuffs, recvbuffs and expected + TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place)); + } // Sync TESTCHECK(startColl(args, type, op, root, in_place, 0)); @@ -598,10 +605,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } double timeUsec = deltaSec*1.0E6; - char timeStr[10]; + char timeStr[100]; if (timeUsec > 10000.0) { sprintf(timeStr, "%7.0f", timeUsec); - } else if (timeUsec > 100.0) { + } else if (timeUsec >= 100.0) { sprintf(timeStr, "%7.1f", timeUsec); } else { sprintf(timeStr, "%7.2f", timeUsec); @@ -812,31 +819,6 @@ int main(int argc, char* argv[]) { printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. 
Ignoring\n"); #endif break; - case 'h': - printf("USAGE: %s \n\t" - "[-t,--nthreads ] \n\t" - "[-g,--ngpus ] \n\t" - "[-b,--minbytes ] \n\t" - "[-e,--maxbytes ] \n\t" - "[-i,--stepbytes ] \n\t" - "[-f,--stepfactor ] \n\t" - "[-n,--iters ] \n\t" - "[-m,--agg_iters ] \n\t" - "[-w,--warmup_iters ] \n\t" - "[-p,--parallel_init <0/1>] \n\t" - "[-c,--check <0/1>] \n\t" -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - "[-o,--op ] \n\t" -#else - "[-o,--op ] \n\t" -#endif - "[-d,--datatype ] \n\t" - "[-r,--root ] \n\t" - "[-z,--blocking <0/1>] \n\t" - "[-G,--cudagraph ] \n\t" - "[-h,--help]\n", - basename(argv[0])); - return 0; default: if (c != 'h') printf("invalid option '%c'\n", c); printf("USAGE: %s \n\t" @@ -868,7 +850,8 @@ int main(int argc, char* argv[]) { #ifdef MPI_SUPPORT MPI_Init(&argc, &argv); #endif - return run(); + TESTCHECK(run()); + return 0; } testResult_t run() { @@ -900,6 +883,7 @@ testResult_t run() { #define MAX_LINE 2048 char line[MAX_LINE]; int len = 0; + size_t maxMem = ~0; for (int i=0; i memMaxBytes) { + maxBytes = memMaxBytes; + if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes); + } + ncclUniqueId ncclId; if (proc == 0) { NCCLCHECK(ncclGetUniqueId(&ncclId)); @@ -963,7 +956,7 @@ testResult_t run() { int errors[nThreads]; double bw[nThreads]; double* delta; - CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads, cudaHostAllocPortable | cudaHostAllocMapped)); + CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, cudaHostAllocPortable | cudaHostAllocMapped)); int bw_count[nThreads]; for (int t=0; t Date: Wed, 30 Jun 2021 19:36:07 -0700 Subject: [PATCH 080/233] Added new option to report average iteration time --- src/common.cu | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/src/common.cu b/src/common.cu index 7aad2c1868..d9f036879e 100644 --- a/src/common.cu +++ b/src/common.cu @@ -66,6 +66,10 @@ static int ncclroot = 0; static int 
parallel_init = 0; static int blocking_coll = 0; static int cudaGraphLaunches = 0; +#ifdef MPI_SUPPORT +// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) +static int average = 1; +#endif #define NUM_BLOCKS 32 @@ -533,6 +537,23 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t double deltaSec = std::chrono::duration_cast>(delta).count(); deltaSec = deltaSec/(iters*agg_iters); if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; +#ifdef MPI_SUPPORT + switch (average) { + case 1: + // Calculate the average time across all ranks + MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + deltaSec = deltaSec/(args->nProcs*args->nThreads*args->nGpus); + break; + case 2: + // Obtain the minimum time across all ranks + MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); + break; + case 3: + // Obtain the maximum time across all ranks + MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); + break; + } +#endif if (cudaGraphLaunches >= 1) { //destroy cuda graph @@ -752,12 +773,13 @@ int main(int argc, char* argv[]) { {"root", required_argument, 0, 'r'}, {"blocking", required_argument, 0, 'z'}, {"cudagraph", required_argument, 0, 'G'}, + {"average", required_argument, 0, 'a'}, {"help", no_argument, 0, 'h'} }; while(1) { int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:", longopts, &longindex); + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex); if (c == -1) break; @@ -819,6 +841,11 @@ int main(int argc, char* argv[]) { printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. 
Ignoring\n"); #endif break; +#ifdef MPI_SUPPORT + case 'a': + average = (int)strtol(optarg, NULL, 0); + break; +#endif default: if (c != 'h') printf("invalid option '%c'\n", c); printf("USAGE: %s \n\t" @@ -842,6 +869,9 @@ int main(int argc, char* argv[]) { "[-r,--root ] \n\t" "[-z,--blocking <0/1>] \n\t" "[-G,--cudagraph ] \n\t" +#ifdef MPI_SUPPORT + "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" +#endif "[-h,--help]\n", basename(argv[0])); return 0; From 11cff17a04e268ea0a82cc8517fdcfde3414280e Mon Sep 17 00:00:00 2001 From: David Addison Date: Tue, 6 Jul 2021 14:47:50 -0700 Subject: [PATCH 081/233] Updated with new command line arguments --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 791bed2599..bff6433b89 100644 --- a/README.md +++ b/README.md @@ -52,19 +52,21 @@ All tests support the same set of arguments : * `-i,--stepbytes ` fixed increment between sizes. Default : (max-min)/10. * `-f,--stepfactor ` multiplication factor between sizes. Default : disabled. * NCCL operations arguments - * `-o,--op ` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum. + * `-o,--op ` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum. * `-d,--datatype ` Specify which datatype to use. Default : Float. * `-r,--root ` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0. * Performance * `-n,--iters ` number of iterations. Default : 20. * `-w,--warmup_iters ` number of warmup iterations (not timed). Default : 5. * `-m,--agg_iters ` number of operations to aggregate together in each iteration. Default : 1. + * `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1. 
* Test operation * `-p,--parallel_init <0/1>` use threads to initialize NCCL in parallel. Default : 0. * `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1. * `-z,--blocking <0/1>` Make NCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0. + * `-G,--cudagraph ` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0. ## Copyright -NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. +NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. From 547e119d350a8ad9034c4e75c8664e62c60bf599 Mon Sep 17 00:00:00 2001 From: David Addison Date: Thu, 8 Jul 2021 16:42:40 -0700 Subject: [PATCH 082/233] Fix issues with MPI_Allreduce and multi-threaded tests --- src/common.cu | 71 ++++++++++++++++++++++++++------------------------- src/common.h | 4 +-- 2 files changed, 38 insertions(+), 37 deletions(-) diff --git a/src/common.cu b/src/common.cu index d4ee519107..4768b9be3e 100644 --- a/src/common.cu +++ b/src/common.cu @@ -356,12 +356,9 @@ testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const return testSuccess; } -void Barrier(struct threadArgs* args) -{ +void Barrier(struct threadArgs* args) { while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); - args->barrier[args->barrier_idx] = args->thread + 1; - if (args->thread+1 == args->nThreads) { #ifdef MPI_SUPPORT MPI_Barrier(MPI_COMM_WORLD); @@ -370,7 +367,35 @@ void Barrier(struct threadArgs* args) } else { while (args->barrier[args->barrier_idx]) pthread_yield(); } + args->barrier_idx=!args->barrier_idx; +} +// Inter-thread/process barrier+allreduce +void Allreduce(struct threadArgs* args, double* value, int average) { + while 
(args->barrier[args->barrier_idx] != args->thread) pthread_yield(); + double val = *value; + if (args->thread > 0) { + double val2 = args->reduce[args->barrier_idx]; + if (average == 1) val += val2; + if (average == 2) val = std::min(val, val2); + if (average == 3) val = std::max(val, val2); + } + if (average || args->thread == 0) args->reduce[args->barrier_idx] = val; + args->barrier[args->barrier_idx] = args->thread + 1; + if (args->thread+1 == args->nThreads) { +#ifdef MPI_SUPPORT + if (average != 0) { + MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX; + MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD); + } +#endif + if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads; + args->reduce[1-args->barrier_idx] = 0; + args->barrier[args->barrier_idx] = 0; + } else { + while (args->barrier[args->barrier_idx]) pthread_yield(); + } + *value = args->reduce[args->barrier_idx]; args->barrier_idx=!args->barrier_idx; } @@ -383,7 +408,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); CUDACHECK(cudaSetDevice(device)); void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; - TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->delta)); + TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); maxDelta = std::max(*(args->deltaHost), maxDelta); #ifdef DEBUG_PRINT @@ -555,23 +580,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t double deltaSec = std::chrono::duration_cast>(delta).count(); deltaSec = deltaSec/(iters*agg_iters); if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; -#ifdef MPI_SUPPORT - switch (average) { - case 1: - // Calculate the average time across all ranks - MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - deltaSec = deltaSec/(args->nProcs*args->nThreads*args->nGpus); - break; - case 2: - // Obtain the minimum time across all ranks - MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); - break; - case 3: - // Obtain the maximum time across all ranks - MPI_Allreduce(MPI_IN_PLACE, &deltaSec, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); - break; - } -#endif + Allreduce(args, &deltaSec, average); if (cudaGraphLaunches >= 1) { //destroy cuda graph @@ -631,21 +640,12 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); //aggregate delta from all threads and procs - Barrier(args); - if (args->thread == 0) { - for (int i=1; inThreads; i++) { - maxDelta += args->deltaThreads[i]; - } -#ifdef MPI_SUPPORT - MPI_Allreduce(MPI_IN_PLACE, &maxDelta, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); -#endif - } - Barrier(args); + Allreduce(args, &maxDelta, 3); } double timeUsec = deltaSec*1.0E6; char timeStr[100]; - if (timeUsec > 10000.0) { + if (timeUsec >= 10000.0) { sprintf(timeStr, "%7.0f", timeUsec); } else if (timeUsec >= 100.0) { sprintf(timeStr, "%7.1f", timeUsec); @@ -875,6 +875,7 @@ int main(int argc, 
char* argv[]) { average = (int)strtol(optarg, NULL, 0); break; #endif + case 'h': default: if (c != 'h') printf("invalid option '%c'\n", c); printf("USAGE: %s \n\t" @@ -1033,6 +1034,7 @@ testResult_t run() { int* sync = (int*)calloc(2, sizeof(int)); int* barrier = (int*)calloc(2, sizeof(int)); + double* reduce = (double*)calloc(2, sizeof(double)); struct testThread threads[nThreads]; memset(threads, 0, sizeof(struct testThread)*nThreads); @@ -1058,11 +1060,10 @@ testResult_t run() { threads[t].args.barrier = (volatile int*)barrier; threads[t].args.barrier_idx = 0; + threads[t].args.reduce = (volatile double*)reduce; threads[t].args.sync = (volatile int*)sync; threads[t].args.sync_idx = 0; - threads[t].args.deltaThreads = delta; threads[t].args.deltaHost = (delta + t*NUM_BLOCKS); - threads[t].args.delta = delta; threads[t].args.errors=errors+t; threads[t].args.bw=bw+t; threads[t].args.bw_count=bw_count+t; diff --git a/src/common.h b/src/common.h index 44b298dfd2..f789c787cd 100644 --- a/src/common.h +++ b/src/common.h @@ -8,6 +8,7 @@ #include "nccl.h" #include +#include #include #include #ifdef MPI_SUPPORT @@ -116,11 +117,10 @@ struct threadArgs { int sync_idx; volatile int* barrier; int barrier_idx; + volatile double* reduce; int syncRank; int syncNranks; - double* deltaThreads; double* deltaHost; - double* delta; int* errors; double* bw; int* bw_count; From b9f90d12a906a0dc5e49f1ede6a52c7779289c01 Mon Sep 17 00:00:00 2001 From: David Addison Date: Mon, 12 Jul 2021 11:43:57 -0700 Subject: [PATCH 083/233] Removed MPI_SUPPORT conditional compilation of average flag --- src/common.cu | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/common.cu b/src/common.cu index 4768b9be3e..c343342ffa 100644 --- a/src/common.cu +++ b/src/common.cu @@ -66,10 +66,8 @@ static int ncclroot = 0; static int parallel_init = 0; static int blocking_coll = 0; static int cudaGraphLaunches = 0; -#ifdef MPI_SUPPORT // Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) static int 
average = 1; -#endif #define NUM_BLOCKS 32 @@ -870,11 +868,9 @@ int main(int argc, char* argv[]) { printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n"); #endif break; -#ifdef MPI_SUPPORT case 'a': average = (int)strtol(optarg, NULL, 0); break; -#endif case 'h': default: if (c != 'h') printf("invalid option '%c'\n", c); @@ -899,9 +895,7 @@ int main(int argc, char* argv[]) { "[-r,--root ] \n\t" "[-z,--blocking <0/1>] \n\t" "[-G,--cudagraph ] \n\t" -#ifdef MPI_SUPPORT "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" -#endif "[-h,--help]\n", basename(argv[0])); return 0; From 1f8f5416863a3082975b10eaa05fecee6fe870c8 Mon Sep 17 00:00:00 2001 From: David Addison Date: Tue, 13 Jul 2021 10:17:05 -0700 Subject: [PATCH 084/233] Add CUDA graph support only for CUDA 11.3 and later builds Fixes #90 --- src/common.cu | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/common.cu b/src/common.cu index c343342ffa..6a26c6c4e8 100644 --- a/src/common.cu +++ b/src/common.cu @@ -534,6 +534,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t Barrier(args); +#if CUDART_VERSION >= 11030 cudaGraph_t graphs[args->nGpus]; cudaGraphExec_t graphExec[args->nGpus]; if (cudaGraphLaunches >= 1) { @@ -542,6 +543,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? 
cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); } } +#endif // Performance Benchmark auto start = std::chrono::high_resolution_clock::now(); @@ -553,6 +555,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); } +#if CUDART_VERSION >= 11030 if (cudaGraphLaunches >= 1) { // End cuda graph capture for (int i=0; inGpus; i++) { @@ -571,6 +574,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } } } +#endif TESTCHECK(completeColl(args)); @@ -580,6 +584,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; Allreduce(args, &deltaSec, average); +#if CUDART_VERSION >= 11030 if (cudaGraphLaunches >= 1) { //destroy cuda graph for (int i=0; inGpus; i++) { @@ -587,6 +592,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t CUDACHECK(cudaGraphDestroy(graphs[i])); } } +#endif double algBw, busBw; args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); @@ -600,16 +606,19 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t // Initialize sendbuffs, recvbuffs and expected TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); +#if CUDART_VERSION >= 11030 if (cudaGraphLaunches >= 1) { // Begin cuda graph capture for data check for (int i=0; inGpus; i++) { CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); } } +#endif //test validation in single itertion, should ideally be included into the multi-iteration run TESTCHECK(startColl(args, type, op, root, in_place, 0)); +#if CUDART_VERSION >= 11030 if (cudaGraphLaunches >= 1) { // End cuda graph capture for (int i=0; inGpus; i++) { @@ -624,9 +633,11 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t 
CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); } } +#endif TESTCHECK(completeColl(args)); +#if CUDART_VERSION >= 11030 if (cudaGraphLaunches >= 1) { //destroy cuda graph for (int i=0; inGpus; i++) { @@ -634,6 +645,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t CUDACHECK(cudaGraphDestroy(graphs[i])); } } +#endif TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); From cc34c545098145bc148e5035e4c8e767b4d71ece Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Wed, 21 Jul 2021 14:19:48 -0700 Subject: [PATCH 085/233] Use ROCM_PATH instead of ROCM_HOME --- src/Makefile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Makefile b/src/Makefile index 31e0fda431..cb2bdb09d5 100644 --- a/src/Makefile +++ b/src/Makefile @@ -5,14 +5,14 @@ # See LICENSE.txt for license information # -ROCM_HOME ?= /opt/rocm +ROCM_PATH ?= /opt/rocm MPI_HOME ?= /usr/lib/openmpi PREFIX ?= /usr/local VERBOSE ?= 0 DEBUG ?= 0 NCCL_HOME ?= "" -HIPCC = $(ROCM_HOME)/hip/bin/hipcc +HIPCC = $(ROCM_PATH)/hip/bin/hipcc CXX = $(HIPCC) HIPCUFLAGS := -std=c++14 @@ -23,11 +23,11 @@ ifneq ($(NCCL_HOME), "") HIPCUFLAGS += -I$(NCCL_HOME) HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) endif -HIPCUFLAGS += -I$(ROCM_HOME)/include -HIPCUFLAGS += -I$(ROCM_HOME)/include/rccl -HIPCUFLAGS += -I$(ROCM_HOME)/hip/include/hip -LDFLAGS += -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt -HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_HOME)/lib -lhsa-runtime64 -lrt +HIPCUFLAGS += -I$(ROCM_PATH)/include +HIPCUFLAGS += -I$(ROCM_PATH)/include/rccl +HIPCUFLAGS += -I$(ROCM_PATH)/hip/include/hip +LDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt +HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt ifeq ($(DEBUG), 0) HIPCUFLAGS += -O3 From f773748b464ea76930d3aa4cd24f270f6c955cb8 Mon Sep 17 00:00:00 2001 From: David Addison Date: Mon, 13 Sep 2021 14:43:22 -0700 Subject: [PATCH 086/233] Resync with NCCL 2.11 New operator: mulsum 
New test: gather --- src/Makefile | 2 +- src/common.cu | 201 +++++++++++++++++++++++++++++++++++--------------- src/common.h | 9 ++- src/gather.cu | 131 ++++++++++++++++++++++++++++++++ 4 files changed, 279 insertions(+), 64 deletions(-) create mode 100644 src/gather.cu diff --git a/src/Makefile b/src/Makefile index 26e653e7d6..c8491ea537 100644 --- a/src/Makefile +++ b/src/Makefile @@ -70,7 +70,7 @@ NVLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) SRC_FILES := $(wildcard *.cu) OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o) -BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter sendrecv hypercube +BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv hypercube BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf) build: ${BIN_FILES} diff --git a/src/common.cu b/src/common.cu index 6a26c6c4e8..05f814d923 100644 --- a/src/common.cu +++ b/src/common.cu @@ -14,37 +14,37 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() #if NCCL_MAJOR >= 2 -ncclDataType_t test_types[ncclNumTypes] = {ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble, -#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - ncclBfloat16 -#endif -}; -const char *test_typenames[ncclNumTypes] = {"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double", -#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - "bfloat16" -#endif -}; + ncclDataType_t test_types[ncclNumTypes] = { + ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclBfloat16 + #endif + }; + const char *test_typenames[ncclNumTypes] = { + "int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double" + #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= 
NCCL_VERSION(2,10,0) + , "bfloat16" + #endif + }; + int test_typenum = -1; -#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) -int test_typenum = 10; + const char *test_opnames[] = {"sum", "prod", "max", "min", "avg", "mulsum"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + , ncclAvg + #endif + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + , ncclNumOps // stand in for ncclRedOpCreatePreMulSum() created on-demand + #endif + }; + int test_opnum = -1; #else -int test_typenum = 9; -#endif - -#else -ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; -const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; -int test_typenum = 7; -#endif - -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) -ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin, ncclAvg}; -const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min", "avg"}; -int test_opnum = 5; -#else -ncclRedOp_t test_ops[ncclNumOps] = {ncclSum, ncclProd, ncclMax, ncclMin}; -const char *test_opnames[ncclNumOps] = {"sum", "prod", "max", "min"}; -int test_opnum = 4; + ncclDataType_t test_types[ncclNumTypes] = {ncclChar, ncclInt, ncclHalf, ncclFloat, ncclDouble, ncclInt64, ncclUint64}; + const char *test_typenames[ncclNumTypes] = {"char", "int", "half", "float", "double", "int64", "uint64"}; + int test_typenum = 7; + const char *test_opnames[] = {"sum", "prod", "max", "min"}; + ncclRedOp_t test_ops[] = {ncclSum, ncclProd, ncclMax, ncclMin}; + int test_opnum = 4; #endif thread_local int is_main_thread = 0; @@ -265,45 +265,73 @@ template<> __device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? 
a : b; } template -__device__ T ncclPostOpIdent(T x, int n) { return x; } - +__device__ T ncclPPOpIdent(T x, int arg) { return x; } template -__device__ T ncclPostOpDiv(T x, int n) { return x/n; } +__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); } +template +__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); } template<> -__device__ half ncclPostOpDiv(half x, int n) { return __float2half(__half2float(x)/n); } +__device__ half ncclPPOpMul(half x, int arg) { + return __float2half(__half2float(x)*float(arg)); +} +template<> +__device__ half ncclPPOpDiv(half x, int n) { + return __float2half(__half2float(x)/n); +} #if defined(__CUDA_BF16_TYPES_EXIST__) template<> -__device__ __nv_bfloat16 ncclPostOpDiv<__nv_bfloat16>(__nv_bfloat16 x, int n) { return __float2bfloat16(__bfloat162float(x)/n); } +__device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { + return __float2bfloat16(__bfloat162float(x)*float(arg)); +} +template<> +__device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { + return __float2bfloat16(__bfloat162float(x)/n); +} #endif -template +__host__ __device__ int preMulScalar(int rank) { + return 1 + rank%2; +} + +template __global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o+offset, rep, 0); + val = PreOp(val, preMulScalar(0)); for (int i=1; i(o+offset, rep, i)); + T val1 = testValue(o+offset, rep, i); + val1 = PreOp(val1, preMulScalar(i)); + val = Op(val, val1); } data[o] = PostOp(val, nranks); } } -#define KERN(type, op, postop) (void*)InitDataReduceKernel, postop > -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) +#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel, preop, postop > +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) #define OPS(type) \ - KERN(type, ncclOpSum, ncclPostOpIdent), \ - KERN(type, ncclOpProd, ncclPostOpIdent), \ - KERN(type, ncclOpMax, ncclPostOpIdent), \ - KERN(type, 
ncclOpMin, ncclPostOpIdent), \ - KERN(type, ncclOpSum/*Avg*/, ncclPostOpDiv) + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ + KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + #define OPS(type) \ + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv) #else #define OPS(type) \ - KERN(type, ncclOpSum, ncclPostOpIdent), \ - KERN(type, ncclOpProd, ncclPostOpIdent), \ - KERN(type, ncclOpMax, ncclPostOpIdent), \ - KERN(type, ncclOpMin, ncclPostOpIdent) + KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ + KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) #endif -static void* const redInitDataKerns[ncclNumOps*ncclNumTypes] = { +static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = { OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) OPS(__nv_bfloat16) @@ -314,7 +342,7 @@ testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, dim3 grid = { 32, 1, 1 }; dim3 block = { 256, 1, 1 }; void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks }; - CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*ncclNumOps+op], grid, block, args, 0, cudaStreamDefault)); + 
CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, cudaStreamDefault)); return testSuccess; } @@ -335,7 +363,7 @@ static void* const initDataKerns[ncclNumTypes] = { (void*)InitDataKernel< float>, (void*)InitDataKernel< double>, #if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - (void*)InitDataKernel<__nv_bfloat16>, + (void*)InitDataKernel<__nv_bfloat16> #endif }; @@ -481,7 +509,7 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* return testSuccess; } -testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int iter) { +testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t opIndex, int root, int in_place, int iter) { size_t count = args->nbytes / wordSize(type); // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange @@ -499,10 +527,49 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); char* recvBuff = ((char*)args->recvbuffs[i]) + shift; char* sendBuff = ((char*)args->sendbuffs[i]) + shift; + ncclRedOp_t op; + + if(opIndex < ncclNumOps) { + op = opIndex; + } + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + else { + union { + int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; + half f16; float f32; double f64; + #if defined(__CUDA_BF16_TYPES_EXIST__) + __nv_bfloat16 bf16; + #endif + }; + int scalar = preMulScalar(rank); + switch(type) { + case ncclInt8: i8 = int8_t(scalar); break; + case ncclUint8: u8 = uint8_t(scalar); break; + case ncclInt32: i32 = int32_t(scalar); break; + case ncclUint32: u32 = uint32_t(scalar); break; + case ncclInt64: i64 = int32_t(scalar); break; + case ncclUint64: u64 = uint32_t(scalar); break; + case ncclFloat16: f16 = __float2half(float(scalar)); break; + case 
ncclFloat32: f32 = float(scalar); break; + case ncclFloat64: f64 = double(scalar); break; + #if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break; + #endif + } + NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); + } + #endif + TESTCHECK(args->collTest->runColl( (void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff), (void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff), count, type, op, root, args->comms[i], args->streams[i])); + + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + if(opIndex >= ncclNumOps) { + NCCLCHECK(ncclRedOpDestroy(op, args->comms[i])); + } + #endif } if (args->nGpus > 1) NCCLCHECK(ncclGroupEnd()); @@ -540,7 +607,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (cudaGraphLaunches >= 1) { // Begin cuda graph capture for (int i=0; inGpus; i++) { - CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); + // Thread local mode is needed for: + // - Multi-thread mode + // - P2P pre-connect + CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); } } #endif @@ -610,7 +680,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (cudaGraphLaunches >= 1) { // Begin cuda graph capture for data check for (int i=0; inGpus; i++) { - CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); + CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? 
cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); } } #endif @@ -777,10 +847,19 @@ int main(int argc, char* argv[]) { test_ncclVersion = NCCL_VERSION_CODE; #endif //printf("# NCCL_VERSION_CODE=%d ncclGetVersion=%d\n", NCCL_VERSION_CODE, test_ncclVersion); - if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion < NCCL_VERSION(2,10,0)) { - test_opnum -= 1; // exclude ncclAvg - test_typenum -= 1; // exclude bfloat16 - } + #if NCCL_VERSION_CODE >= NCCL_VERSION(2,0,0) + test_opnum = 4; + test_typenum = 9; + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { + test_opnum++; // ncclAvg + #if defined(__CUDA_BF16_TYPES_EXIST__) + test_typenum++; // bfloat16 + #endif + } + if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) { + test_opnum++; // PreMulSum + } + #endif // Parse args double parsed; @@ -803,7 +882,8 @@ int main(int argc, char* argv[]) { {"blocking", required_argument, 0, 'z'}, {"cudagraph", required_argument, 0, 'G'}, {"average", required_argument, 0, 'a'}, - {"help", no_argument, 0, 'h'} + {"help", no_argument, 0, 'h'}, + {} }; while(1) { @@ -898,7 +978,9 @@ int main(int argc, char* argv[]) { "[-w,--warmup_iters ] \n\t" "[-p,--parallel_init <0/1>] \n\t" "[-c,--check <0/1>] \n\t" -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + "[-o,--op ] \n\t" +#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) "[-o,--op ] \n\t" #else "[-o,--op ] \n\t" @@ -993,6 +1075,7 @@ testResult_t run() { } #ifdef MPI_SUPPORT MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); #endif cudaStream_t streams[nGpus*nThreads]; void* sendbuffs[nGpus*nThreads]; diff --git a/src/common.h b/src/common.h index f789c787cd..e13816f6f8 100644 --- a/src/common.h +++ b/src/common.h @@ -237,12 +237,13 @@ static size_t wordSize(ncclDataType_t type) { } extern int test_ncclVersion; // init'd with ncclGetVersion() 
-extern ncclDataType_t test_types[ncclNumTypes]; -extern const char *test_typenames[ncclNumTypes]; -extern ncclRedOp_t test_ops[ncclNumOps]; -extern const char *test_opnames[ncclNumOps]; +constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 1 : 0); extern int test_opnum; extern int test_typenum; +extern ncclDataType_t test_types[ncclNumTypes]; +extern const char *test_typenames[ncclNumTypes]; +extern ncclRedOp_t test_ops[]; +extern const char *test_opnames[]; static int ncclstringtotype(char *str) { for (int t=0; tsendBytes / wordSize(type); + size_t recvcount = args->expectedBytes / wordSize(type); + int nranks = args->nProcs*args->nThreads*args->nGpus; + + for (int i=0; inGpus; i++) { + int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + CUDACHECK(cudaSetDevice(gpuid)); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); + void* data = in_place ? 
((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); + if (rank == root) { + for (int j=0; jexpected[i])+args->sendBytes*j, sendcount, type, rep, j)); + } + } + CUDACHECK(cudaDeviceSynchronize()); + } + return testSuccess; +} + +void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { + double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec; + + *algBw = baseBw; + double factor = ((double)(nranks-1))/((double)(nranks)); + *busBw = baseBw * factor; +} + +testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + int nRanks; + NCCLCHECK(ncclCommCount(comm, &nRanks)); + int rank; + NCCLCHECK(ncclCommUserRank(comm, &rank)); + size_t rankOffset = count * wordSize(type); + if (count == 0) return testSuccess; + + NCCLCHECK(ncclGroupStart()); + NCCLCHECK(ncclSend(sendbuff, count, type, root, comm, stream)); + if (rank == root) { + for (int r=0; rcollTest = &gatherTest; + ncclDataType_t *run_types; + const char **run_typenames; + int type_count; + int begin_root, end_root; + + if ((int)type != -1) { + type_count = 1; + run_types = &type; + run_typenames = &typeName; + } else { + type_count = test_typenum; + run_types = test_types; + run_typenames = test_typenames; + } + + if (root != -1) { + begin_root = end_root = root; + } else { + begin_root = 0; + end_root = args->nProcs*args->nThreads*args->nGpus-1; + } + + for (int i=0; i Date: Wed, 22 Sep 2021 08:43:01 -0700 Subject: [PATCH 087/233] Fix divide by zero error --- src/common.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common.cu b/src/common.cu index 6db2b4328b..4b62741fa3 100644 --- a/src/common.cu +++ b/src/common.cu @@ -527,7 +527,7 @@ testResult_t 
startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t size_t count = args->nbytes / wordSize(type); // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange - size_t totalnbytes = max(args->sendBytes, args->expectedBytes); + size_t totalnbytes = std::max(args->sendBytes, args->expectedBytes); size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1; size_t shift = totalnbytes * (iter % steps); From 8b35847d36b14442caacfb08ff62ab52d0fc9f31 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Thu, 23 Sep 2021 16:39:11 -0700 Subject: [PATCH 088/233] Use rccl_bfloat16 class --- src/common.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/common.cu b/src/common.cu index 4b62741fa3..64f5e92263 100644 --- a/src/common.cu +++ b/src/common.cu @@ -157,10 +157,10 @@ template<> __device__ float toFloat(half a) { return __half2float(a); } -#if defined(__CUDA_BF16_TYPES_EXIST__) +#if defined(RCCL_BFLOAT16) template<> __device__ -float toFloat(__nv_bfloat16 a) { - return __bfloat162float(a); +float toFloat(rccl_bfloat16 a) { + return (float)(a); } #endif @@ -551,8 +551,8 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t union { int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; half f16; float f32; double f64; - #if defined(__CUDA_BF16_TYPES_EXIST__) - __nv_bfloat16 bf16; + #if defined(RCCL_BFLOAT16) + rccl_bfloat16 bf16; #endif }; int scalar = preMulScalar(rank); @@ -566,8 +566,8 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t case ncclFloat16: f16 = __float2half(float(scalar)); break; case ncclFloat32: f32 = float(scalar); break; case ncclFloat64: f64 = double(scalar); break; - #if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break; + #if defined(RCCL_BFLOAT16) + case ncclBfloat16: bf16 = (rccl_bfloat16)(float(scalar)); 
break; #endif } NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); @@ -892,7 +892,7 @@ int main(int argc, char* argv[]) { test_typenum = 9; if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) { test_opnum++; // ncclAvg - #if defined(__CUDA_BF16_TYPES_EXIST__) + #if defined(RCCL_BFLOAT16) test_typenum++; // bfloat16 #endif } From 7130fa6096466f80b0c310b9a3070b6556c0e158 Mon Sep 17 00:00:00 2001 From: David Addison Date: Mon, 25 Oct 2021 16:30:57 -0700 Subject: [PATCH 089/233] Add MPI_IBM build option --- src/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Makefile b/src/Makefile index c8491ea537..9a1f62eeb0 100644 --- a/src/Makefile +++ b/src/Makefile @@ -64,6 +64,10 @@ ifeq ($(MPI), 1) NVCUFLAGS += -DMPI_SUPPORT -I$(MPI_HOME)/include NVLDFLAGS += -L$(MPI_HOME)/lib -L$(MPI_HOME)/lib64 -lmpi endif +ifeq ($(MPI_IBM),1) +NVCUFLAGS += -DMPI_SUPPORT +NVLDFLAGS += -lmpi_ibm +endif LIBRARIES += curand nccl nvToolsExt NVLDFLAGS += $(LIBRARIES:%=-l%) From de3ddbe261d553d4356ffcd548f4f8d893f193e0 Mon Sep 17 00:00:00 2001 From: David Addison Date: Wed, 10 Nov 2021 09:14:22 -0800 Subject: [PATCH 090/233] Add option to statically link cudart Build with CUDARTLIB=cudart_static to remove dynamic linkage Also removed unused curand and nvToolsExt dependencies BUG 95 --- src/Makefile | 5 +++-- src/common.h | 2 -- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/Makefile b/src/Makefile index 9a1f62eeb0..2a399db7fa 100644 --- a/src/Makefile +++ b/src/Makefile @@ -12,6 +12,7 @@ DEBUG ?= 0 CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include NVCC = $(CUDA_HOME)/bin/nvcc +CUDARTLIB ?= cudart CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." 
-f 1) @@ -36,7 +37,7 @@ endif NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 LDFLAGS := -L${CUDA_LIB} -lcudart -lrt -NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt ifeq ($(DEBUG), 0) NVCUFLAGS += -O3 -g @@ -68,7 +69,7 @@ ifeq ($(MPI_IBM),1) NVCUFLAGS += -DMPI_SUPPORT NVLDFLAGS += -lmpi_ibm endif -LIBRARIES += curand nccl nvToolsExt +LIBRARIES += nccl NVLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) diff --git a/src/common.h b/src/common.h index e13816f6f8..bd84d01853 100644 --- a/src/common.h +++ b/src/common.h @@ -10,7 +10,6 @@ #include #include #include -#include #ifdef MPI_SUPPORT #include "mpi.h" #endif @@ -46,7 +45,6 @@ typedef enum { testInternalError = 1, testCudaError = 2, testNcclError = 3, - testCuRandError = 4 } testResult_t; // Relay errors up and trace From 602b745ff48f4aa7d73e8fd946442b666fae344f Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Tue, 16 Nov 2021 07:50:18 -0800 Subject: [PATCH 091/233] Add missing hipStreamDestroy at test exit --- src/common.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/common.cu b/src/common.cu index 64f5e92263..98fa8dcfb7 100644 --- a/src/common.cu +++ b/src/common.cu @@ -1268,6 +1268,10 @@ testResult_t run() { free(comms); } + for (int i=0; i Date: Sat, 19 Feb 2022 00:31:40 +0800 Subject: [PATCH 092/233] move to a2a api (#9) --- src/alltoall.cu | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/src/alltoall.cu b/src/alltoall.cu index ba3c6f1088..4b8e66d5a2 100644 --- a/src/alltoall.cu +++ b/src/alltoall.cu @@ -60,23 +60,8 @@ void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double } testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { - int nRanks; - NCCLCHECK(ncclCommCount(comm, &nRanks)); - size_t rankOffset = count * wordSize(type); - if (count == 0) return testSuccess; - -#if 
NCCL_MAJOR < 2 || NCCL_MINOR < 7 - printf("NCCL 2.7 or later is needed for alltoall. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); - return testNcclError; -#else - NCCLCHECK(ncclGroupStart()); - for (int r=0; r Date: Thu, 31 Mar 2022 13:18:02 -0400 Subject: [PATCH 093/233] Update include path for custom RCCL build --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index ef77730c70..ec0301b758 100644 --- a/src/Makefile +++ b/src/Makefile @@ -20,7 +20,7 @@ LDFLAGS := HIPLDFLAGS := ifneq ($(NCCL_HOME), "") -HIPCUFLAGS += -I$(NCCL_HOME) +HIPCUFLAGS += -I$(NCCL_HOME) -I$(NCCL_HOME)/rccl/include HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) endif HIPCUFLAGS += -I$(ROCM_PATH)/include From 6156759a40d6f2f39bc78f53edae363943938c47 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Wed, 6 Apr 2022 16:46:17 +0000 Subject: [PATCH 094/233] Print GPU's full PCI bus ID --- src/common.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/common.cu b/src/common.cu index 98fa8dcfb7..b0fc63094f 100644 --- a/src/common.cu +++ b/src/common.cu @@ -1114,8 +1114,10 @@ testResult_t run() { int rank = proc*nThreads*nGpus+i; hipDeviceProp_t prop; HIPCHECK(hipGetDeviceProperties(&prop, hipDev)); - len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "# Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n", - rank, getpid(), hostname, hipDev, prop.pciBusID, prop.name); + char busIdStr[] = "00000000:00:00.0"; + HIPCHECK(hipDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), hipDev)); + len += snprintf(line+len, MAX_LINE>len ? 
MAX_LINE-len : 0, "# Rank %2d Pid %6d on %10s device %2d [%s] %s\n", + rank, getpid(), hostname, hipDev, busIdStr, prop.name); maxMem = std::min(maxMem, prop.totalGlobalMem); } From 3d6f70659a0356d68d396ae5923e5ef95150b4eb Mon Sep 17 00:00:00 2001 From: amdkila <47991923+amdkila@users.noreply.github.com> Date: Thu, 28 May 2020 10:34:30 -0600 Subject: [PATCH 095/233] Check for error code in install script (#2) --- install.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/install.sh b/install.sh index 7c8a865ef5..c56a6bfdde 100755 --- a/install.sh +++ b/install.sh @@ -67,6 +67,14 @@ while true; do esac done +# throw error code after running a command in the install script +check_exit_code( ) +{ + if (( $1 != 0 )); then + exit $1 + fi +} + # Install the pre-commit hook #bash ./githooks/install @@ -87,6 +95,7 @@ if ($mpi_enabled); then else make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so -j$(nproc) fi +check_exit_code "$?" # Optionally, run tests if they're enabled. if ($run_tests); then From 5cd2374edb3c5094d95874bd5361afde0da59de3 Mon Sep 17 00:00:00 2001 From: Edgar Date: Fri, 18 Mar 2022 11:37:05 -0400 Subject: [PATCH 096/233] create branch up-to-date with rccl-test --- src/common.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/common.cu b/src/common.cu index b0fc63094f..45225ff10a 100644 --- a/src/common.cu +++ b/src/common.cu @@ -1118,6 +1118,8 @@ testResult_t run() { HIPCHECK(hipDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), hipDev)); len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "# Rank %2d Pid %6d on %10s device %2d [%s] %s\n", rank, getpid(), hostname, hipDev, busIdStr, prop.name); + len += snprintf(line+len, MAX_LINE>len ? 
MAX_LINE-len : 0, "# Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n", + rank, getpid(), hostname, hipDev, prop.pciBusID, prop.name); maxMem = std::min(maxMem, prop.totalGlobalMem); } From 0500f2f132914e39320e4886af33c0552fcad14b Mon Sep 17 00:00:00 2001 From: Edgar Date: Fri, 18 Mar 2022 11:42:15 -0400 Subject: [PATCH 097/233] implementation of multi-rank support in rccl-tests. --- src/all_gather.cu | 23 ++-- src/all_reduce.cu | 21 ++-- src/alltoall.cu | 21 ++-- src/alltoallv.cu | 69 ++++++----- src/broadcast.cu | 19 +++- src/common.cu | 258 +++++++++++++++++++++++++++++------------- src/common.h | 3 + src/gather.cu | 25 ++-- src/hypercube.cu | 22 ++-- src/reduce.cu | 23 ++-- src/reduce_scatter.cu | 23 ++-- src/scatter.cu | 18 ++- src/sendrecv.cu | 21 ++-- 13 files changed, 365 insertions(+), 181 deletions(-) diff --git a/src/all_gather.cu b/src/all_gather.cu index 0ca428dbed..bc1c59969c 100644 --- a/src/all_gather.cu +++ b/src/all_gather.cu @@ -31,17 +31,24 @@ void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus; + int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; + int k=0; for (int i=0; inGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + if (args->enable_multiranks) + gpuid = gpuid % args->localNumDevices; HIPCHECK(hipSetDevice(gpuid)); - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); - void* data = in_place ? 
((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); - for (int j=0; jexpected[i])+args->sendBytes*j, sendcount, type, rep, j)); + + for (int l=0; lnRanks; l++) { + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); + HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); + void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + for (int j=0; jexpected[k])+args->sendBytes*j, sendcount, type, rep, j)); + } + k++; } HIPCHECK(hipDeviceSynchronize()); } @@ -99,4 +106,4 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t struct testEngine ncclTestEngine = { AllGatherGetBuffSize, AllGatherRunTest -}; \ No newline at end of file +}; diff --git a/src/all_reduce.cu b/src/all_reduce.cu index 1c1d73a9d2..e76ee38dff 100644 --- a/src/all_reduce.cu +++ b/src/all_reduce.cu @@ -31,16 +31,23 @@ void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus; + int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; + int k = 0; for (int i=0; inGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + if (args->enable_multiranks) + gpuid = gpuid % args->localNumDevices; HIPCHECK(hipSetDevice(gpuid)); - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); - void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); - TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + + for (int l=0; lnRanks; l++) { + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); + HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks)); + k++; + } HIPCHECK(hipDeviceSynchronize()); } return testSuccess; @@ -110,4 +117,4 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t struct testEngine ncclTestEngine = { AllReduceGetBuffSize, AllReduceRunTest -}; \ No newline at end of file +}; diff --git a/src/alltoall.cu b/src/alltoall.cu index 4b8e66d5a2..48020e4fa3 100644 --- a/src/alltoall.cu +++ b/src/alltoall.cu @@ -31,18 +31,25 @@ void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *para testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus; + int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; + int k=0; for (int i=0; inGpus; i++) { char* str = getenv("NCCL_TESTS_DEVICE"); int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + if (args->enable_multiranks) + gpuid = gpuid % args->localNumDevices; HIPCHECK(hipSetDevice(gpuid)); - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); - void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); - for (int j=0; jexpected[i])+args->sendBytes/nranks*j, sendcount/nranks, type, rep+rank*sendcount/nranks, j)); + + for (int l=0; lnRanks; l++) { + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); + HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + for (int j=0; jexpected[k])+args->sendBytes/nranks*j, sendcount/nranks, type, rep+rank*sendcount/nranks, j)); + } + k++; } HIPCHECK(hipDeviceSynchronize()); } diff --git a/src/alltoallv.cu b/src/alltoallv.cu index fb6d0acde8..7a39bcce7b 100644 --- a/src/alltoallv.cu +++ b/src/alltoallv.cu @@ -41,44 +41,51 @@ void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus; + int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; + int k=0; for (int i=0; inGpus; i++) { char* str = getenv("NCCL_TESTS_DEVICE"); int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + if (args->enable_multiranks) + gpuid = gpuid % args->localNumDevices; HIPCHECK(hipSetDevice(gpuid)); - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); - void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); + + for (int l=0; lnRanks; l++) { + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); + HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); #if 0 - int *dataHost = (int *)malloc(args->sendBytes); - hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost); - printf(" Rank [%d] Original: ", rank); - for(int j=0; jsendBytes); + hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost); + printf(" Rank [%d] Original: ", rank); + for(int j=0; jexpected[i])+rdisp*wordSize(type), rcount, type, rep+sdisp, j)); - rdisp += rcount; + printf("\n"); + free(dataHost); +#endif + size_t rdisp = 0; + size_t data_count = sendcount*2/nranks; + size_t chunksize = data_count/nranks; + for (int j=0; jexpected[k])+rdisp*wordSize(type), rcount, type, rep+sdisp, j)); + rdisp += rcount; + } + k++; } HIPCHECK(hipDeviceSynchronize()); } diff --git a/src/broadcast.cu b/src/broadcast.cu index 61f0a9952a..dffb6b6256 100644 --- a/src/broadcast.cu +++ b/src/broadcast.cu @@ -32,14 +32,21 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); + int k=0; for (int i=0; inGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + if (args->enable_multiranks) + gpuid = gpuid % args->localNumDevices; HIPCHECK(hipSetDevice(gpuid)); - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); - void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; - if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank)); - TESTCHECK(InitData(args->expected[i], recvcount, type, rep, root)); + + for (int l=0; lnRanks; l++) { + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); + HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k]; + if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(args->expected[k], recvcount, type, rep, root)); + k++; + } HIPCHECK(hipDeviceSynchronize()); } return testSuccess; @@ -116,4 +123,4 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t struct testEngine ncclTestEngine = { BroadcastGetBuffSize, BroadcastRunTest -}; \ No newline at end of file +}; diff --git a/src/common.cu b/src/common.cu index 45225ff10a..c31cff308e 100644 --- a/src/common.cu +++ b/src/common.cu @@ -78,6 +78,9 @@ static uint32_t cumask[4]; static int cudaGraphLaunches = 0; // Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX) static int average = 1; +static int numDevices = 1; +static int ranksPerGpu = 1; +static int enable_multiranks = 0; #define NUM_BLOCKS 32 @@ -117,6 +120,38 @@ static double parsesize(const char *value) { return size * units; } +static bool minReqVersion(int rmajor, int rminor, int rpatch) +{ + int version; + int major, minor, patch, rem; + ncclGetVersion(&version); + + if (version < 10000) { + major = version/1000; + rem = version%1000; + minor = rem/100; + patch = rem%100; + } + else { + major = version/10000; + rem = version%10000; + minor = rem/100; + patch = rem%100; + } + + if (major < rmajor) return false; + else if (major > rmajor) return true; + + // major == rmajor + if (minor < rminor) return false; + else if (minor > rminor) return true; + + // major == rmajor && minor == rminor + if (patch < rpatch) return false; + + return true; +} 
+ double DeltaMaxValue(ncclDataType_t type) { switch(type) { case ncclHalf: return 1e-2; @@ -437,9 +472,9 @@ void Allreduce(struct threadArgs* args, double* value, int average) { testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { size_t count = args->expectedBytes/wordSize(type); double maxDelta = 0.0; - for (int i=0; inGpus; i++) { + for (int i=0; inGpus*args->nRanks; i++) { int device; - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i); NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); HIPCHECK(hipSetDevice(device)); void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; @@ -474,20 +509,20 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t //} #endif } - double nranks = args->nProcs*args->nThreads*args->nGpus; + double nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; *delta = maxDelta; return testSuccess; } -testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* comms) { +testResult_t testStreamSynchronize(int nStreams, hipStream_t* streams, ncclComm_t* comms) { hipError_t hipErr; - int remaining = ngpus; - int* done = (int*)malloc(sizeof(int)*ngpus); - memset(done, 0, sizeof(int)*ngpus); + int remaining = nStreams; + int* done = (int*)malloc(sizeof(int)*nStreams); + memset(done, 0, sizeof(int)*nStreams); while (remaining) { int idle = 1; - for (int i=0; imaxbytes / totalnbytes : 1; size_t shift = totalnbytes * (iter % steps); - if (args->nGpus > 1) NCCLCHECK(ncclGroupStart()); - for (int i = 0; i < args->nGpus; i++) { + if (args->nGpus> 1 || args->nRanks > 1) NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < args->nGpus*args->nRanks; i++) { #ifndef 
NCCL_MAJOR int hipDev; NCCLCHECK(ncclCommCuDevice(args->comms[i], &hipDev)); HIPCHECK(hipSetDevice(hipDev)); #endif - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i); char* recvBuff = ((char*)args->recvbuffs[i]) + shift; char* sendBuff = ((char*)args->sendbuffs[i]) + shift; ncclRedOp_t op; @@ -585,11 +620,11 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } #endif } - if (args->nGpus > 1) NCCLCHECK(ncclGroupEnd()); + if (args->nGpus > 1 || args->nRanks > 1) NCCLCHECK(ncclGroupEnd()); if (blocking_coll) { // Complete op before returning - TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + TESTCHECK(testStreamSynchronize(args->nGpus*args->nRanks, args->streams, args->comms)); } if (blocking_coll) Barrier(args); return testSuccess; @@ -598,10 +633,11 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t testResult_t completeColl(struct threadArgs* args) { if (blocking_coll) return testSuccess; - TESTCHECK(testStreamSynchronize(args->nGpus, args->streams, args->comms)); + TESTCHECK(testStreamSynchronize(args->nGpus*args->nRanks, args->streams, args->comms)); return testSuccess; } +//EDGAR: Revisit because of cudaGraphLaunches testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { size_t count = args->nbytes / wordSize(type); if (datacheck) { @@ -616,11 +652,11 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t Barrier(args); #if CUDART_VERSION >= 11030 - hipGraph_t graphs[args->nGpus]; - hipGraphExec_t graphExec[args->nGpus]; + hipGraph_t graphs[args->nGpus*args->nRanks]; + hipGraphExec_t graphExec[args->nGpus*args->nRanks]; if (cudaGraphLaunches >= 1) { // Begin cuda graph capture - for (int i=0; inGpus; i++) { + for (int i=0; inGpus*args->nRanks; i++) { // Thread local mode is 
needed for: // - Multi-thread mode // - P2P pre-connect @@ -642,18 +678,18 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t #if CUDART_VERSION >= 11030 if (cudaGraphLaunches >= 1) { // End cuda graph capture - for (int i=0; inGpus; i++) { + for (int i=0; inGpus*args->nRanks; i++) { HIPCHECK(hipStreamEndCapture(args->streams[i], graphs+i)); } // Instantiate cuda graph - for (int i=0; inGpus; i++) { + for (int i=0; inGpus*args->nRanks; i++) { HIPCHECK(hipGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); } // Resync CPU, restart timing, launch cuda graph Barrier(args); start = std::chrono::high_resolution_clock::now(); for (int l=0; lnGpus; i++) { + for (int i=0; inGpus*args->nRanks; i++) { HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i])); } } @@ -671,7 +707,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t #if CUDART_VERSION >= 11030 if (cudaGraphLaunches >= 1) { //destroy cuda graph - for (int i=0; inGpus; i++) { + for (int i=0; inGpus*args->nRanks; i++) { HIPCHECK(hipGraphExecDestroy(graphExec[i])); HIPCHECK(hipGraphDestroy(graphs[i])); } @@ -679,7 +715,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t #endif double algBw, busBw; - args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus); + args->collTest->getBw(count, wordSize(type), deltaSec, &algBw, &busBw, args->nProcs*args->nThreads*args->nGpus*args->nRanks); Barrier(args); @@ -694,7 +730,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t #if CUDART_VERSION >= 11030 if (cudaGraphLaunches >= 1) { // Begin cuda graph capture for data check - for (int i=0; inGpus; i++) { + for (int i=0; inGpus*args->nRanks; i++) { HIPCHECK(chiptreamBeginCapture(args->streams[i], args->nThreads > 1 ? 
hipStreamCaptureModeThreadLocal : hipStreamCaptureModeGlobal)); } } @@ -706,15 +742,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t #if CUDART_VERSION >= 11030 if (cudaGraphLaunches >= 1) { // End cuda graph capture - for (int i=0; inGpus; i++) { + for (int i=0; inGpus*args->nRanks; i++) { HIPCHECK(hipStreamEndCapture(args->streams[i], graphs+i)); } // Instantiate cuda graph - for (int i=0; inGpus; i++) { + for (int i=0; inGpus*args->nRanks; i++) { HIPCHECK(hipGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); } // Launch cuda graph - for (int i=0; inGpus; i++) { + for (int i=0; inGpus*args->nRanks; i++) { HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i])); } } @@ -725,7 +761,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t #if CUDART_VERSION >= 11030 if (cudaGraphLaunches >= 1) { //destroy cuda graph - for (int i=0; inGpus; i++) { + for (int i=0; inGpus*args->nRanks; i++) { HIPCHECK(hipGraphExecDestroy(graphExec[i])); HIPCHECK(hipGraphDestroy(graphs[i])); } @@ -759,7 +795,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { - int nranks = args->nProcs*args->nGpus*args->nThreads; + int nranks = args->nProcs*args->nGpus*args->nThreads*args->nRanks; size_t count, sendCount, recvCount, paramCount, sendInplaceOffset, recvInplaceOffset; count = size / wordSize(type); @@ -806,6 +842,8 @@ testResult_t threadRunTests(struct threadArgs* args) { // will be done on the current GPU (by default : 0) and if the GPUs are in // exclusive mode those operations will fail. 
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; + if (enable_multiranks) + gpuid = gpuid % numDevices; HIPCHECK(hipSetDevice(gpuid)); TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); return testSuccess; @@ -814,23 +852,33 @@ testResult_t threadRunTests(struct threadArgs* args) { testResult_t threadInit(struct threadArgs* args) { char hostname[1024]; getHostName(hostname, 1024); - int nranks = args->nProcs*args->nThreads*args->nGpus; + int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; //set main thread again is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0; NCCLCHECK(ncclGroupStart()); for (int i=0; inGpus; i++) { - int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + if (enable_multiranks) + gpuid = gpuid % numDevices; HIPCHECK(hipSetDevice(gpuid)); - NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); + + for (int j=0; jnRanks; j++) { + int rank = (args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + j; + if (args->enable_multiranks) + NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); +#ifdef RCCL_MULTIRANKPERGPU + else + NCCLCHECK(ncclCommInitRankMulti(args->comms+i*args->nRanks+j, nranks, args->ncclId, rank, rank)); +#endif + } } NCCLCHECK(ncclGroupEnd()); TESTCHECK(threadRunTests(args)); - for (int i=0; inGpus; i++) { + for (int i=0; inGpus*args->nRanks; i++) { NCCLCHECK(ncclCommDestroy(args->comms[i])); } return testSuccess; @@ -925,13 +973,21 @@ int main(int argc, char* argv[]) { {"cumask", required_argument, 0, 'u'}, {"cudagraph", required_argument, 0, 'G'}, {"average", required_argument, 0, 'a'}, +#ifdef RCCL_MULTIRANKPERGPU + {"enable_multiranks", required_argument, 0, 'x'}, + {"ranks_per_gpu", 
required_argument, 0, 'R'}, +#endif {"help", no_argument, 0, 'h'}, {} }; while(1) { int c; +#ifdef RCCL_MULTIRANKPERGPU + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:G:a:y:s:u:h:R:x:", longopts, &longindex); +#else c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:G:a:y:s:u:h:", longopts, &longindex); +#endif if (c == -1) break; @@ -1022,6 +1078,14 @@ int main(int argc, char* argv[]) { case 'a': average = (int)strtol(optarg, NULL, 0); break; +#ifdef RCCL_MULTIRANKPERGPU + case 'x': + enable_multiranks = (int)strtol(optarg, NULL, 0); + break; + case 'R': + ranksPerGpu = (int)strtol(optarg, NULL, 0); + break; +#endif case 'h': default: if (c != 'h') printf("invalid option '%c'\n", c); @@ -1052,26 +1116,43 @@ int main(int argc, char* argv[]) { "[-u,--cumask ] \n\t" "[-G,--cudagraph ] \n\t" "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t" +#ifdef RCCL_MULTIRANKPERGPU + "[-x,--enable_multiranks <0/1> enable using multiple ranks per GPU] \n\t" + "[-R,--ranks_per_gpu] \n\t" +#endif "[-h,--help]\n", basename(argv[0])); return 0; } } - int numDevices; HIPCHECK(hipGetDeviceCount(&numDevices)); if (nGpus > numDevices) { fprintf(stderr, "[ERROR] The number of requested GPUs (%d) is greater than the number of GPUs available (%d)\n", nGpus, numDevices); return testNcclError; } - if (minBytes > maxBytes) { fprintf(stderr, "invalid sizes for 'minbytes' and 'maxbytes': %llu > %llu\n", (unsigned long long)minBytes, (unsigned long long)maxBytes); return -1; } + if (!minReqVersion(2, 12, 12) && enable_multiranks) { + fprintf(stderr, "Multiple Ranks per GPU requested, but rccl library found does not support this feature.\n"); + fprintf(stderr, "Please check LD_LIBRARY_PATH. 
Resetting enable_multiranks and ranksPerGpu to default values.\n"); + enable_multiranks = 0; + ranksPerGpu = 1; + } + + if (enable_multiranks && parallel_init) { + fprintf(stderr, "Cannot use parallel_init when using multiple ranks per GPU.\n"); + return -1; + } + if (ranksPerGpu > 1 && !enable_multiranks) { + fprintf(stderr, "Need to enable multiranks option to use multiple ranks per GPU\n"); + return -1; + } #ifdef MPI_SUPPORT MPI_Init(&argc, &argv); #endif @@ -1098,7 +1179,7 @@ testResult_t run() { #endif is_main_thread = (proc == 0) ? 1 : 0; - PRINT("# nThread: %d nGpus: %d minBytes: %ld maxBytes: %ld step: %ld(%s) warmupIters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes, + PRINT("# nThreads: %d nGpus: %d nRanks: %d minBytes: %ld maxBytes: %ld step: %ld(%s) warmupIters: %d iters: %d validation: %d \n", nThreads, nGpus, ranksPerGpu, minBytes, maxBytes, (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); @@ -1111,18 +1192,20 @@ testResult_t run() { size_t maxMem = ~0; for (int i=0; ilen ? MAX_LINE-len : 0, "# Rank %2d Pid %6d on %10s device %2d [%s] %s\n", - rank, getpid(), hostname, hipDev, busIdStr, prop.name); - len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "# Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n", - rank, getpid(), hostname, hipDev, prop.pciBusID, prop.name); - maxMem = std::min(maxMem, prop.totalGlobalMem); - } + for (int j=0; jlen ? MAX_LINE-len : 0, "# Rank %2d Pid %6d on %10s device %2d [%s] %s\n", + rank, getpid(), hostname, hipDev, busIdStr, prop.name); + maxMem = std::min(maxMem, prop.totalGlobalMem); + } + } #if MPI_SUPPORT char *lines = (proc == 0) ? 
(char *)malloc(nProcs*MAX_LINE) : NULL; // Gather all output in rank order to root (0) @@ -1152,42 +1235,61 @@ testResult_t run() { MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); #endif - hipStream_t streams[nGpus*nThreads]; - void* sendbuffs[nGpus*nThreads]; - void* recvbuffs[nGpus*nThreads]; - void* expected[nGpus*nThreads]; + hipStream_t streams[nGpus*nThreads*ranksPerGpu]; + void* sendbuffs[nGpus*nThreads*ranksPerGpu]; + void* recvbuffs[nGpus*nThreads*ranksPerGpu]; + void* expected[nGpus*nThreads*ranksPerGpu]; size_t sendBytes, recvBytes; - ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads); + ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads*ranksPerGpu); - for (int i=0; isendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus; + int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; + int k=0; for (int i=0; inGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + if (args->enable_multiranks) + gpuid = gpuid % args->localNumDevices; HIPCHECK(hipSetDevice(gpuid)); - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); - void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); - HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault)); - if (rank == root) { - for (int j=0; jexpected[i])+args->sendBytes*j, sendcount, type, rep, j)); + + for (int l=0; lnRanks; l++) { + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); + HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); + void* data = in_place ? 
((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault)); + if (rank == root) { + for (int j=0; jexpected[k])+args->sendBytes*j, sendcount, type, rep, j)); + } } + k++; } HIPCHECK(hipDeviceSynchronize()); } diff --git a/src/hypercube.cu b/src/hypercube.cu index 946c9c670b..d654617ccd 100644 --- a/src/hypercube.cu +++ b/src/hypercube.cu @@ -33,17 +33,24 @@ void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *par testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus; + int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; + int k=0; for (int i=0; inGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + if (args->enable_multiranks) + gpuid = gpuid % args->localNumDevices; HIPCHECK(hipSetDevice(gpuid)); - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); - void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); - for (int j=0; jexpected[i])+args->sendBytes*j, sendcount, type, rep, j)); + + for (int l=0; lnRanks; l++) { + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); + HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); + void* data = in_place ? 
((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + for (int j=0; jexpected[k])+args->sendBytes*j, sendcount, type, rep, j)); + } + k++; } HIPCHECK(hipDeviceSynchronize()); } @@ -66,7 +73,6 @@ testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, nccl int rank; NCCLCHECK(ncclCommUserRank(comm, &rank)); size_t rankSize = count * wordSize(type); - if (rbuff+rank*rankSize != sbuff) HIPCHECK(hipMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, hipMemcpyDeviceToDevice, stream)); // Hypercube AllGather diff --git a/src/reduce.cu b/src/reduce.cu index d0792a49f9..7ea7b0f726 100644 --- a/src/reduce.cu +++ b/src/reduce.cu @@ -31,17 +31,24 @@ void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramc testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus; + int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; + int k=0; for (int i=0; inGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + if (args->enable_multiranks) + gpuid = gpuid % args->localNumDevices; HIPCHECK(hipSetDevice(gpuid)); - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); - void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); - HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault)); - if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); + + for (int l=0; lnRanks; l++) { + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); + HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault)); + if (rank == root) TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks)); + k++; + } HIPCHECK(hipDeviceSynchronize()); } return testSuccess; @@ -119,4 +126,4 @@ testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t typ struct testEngine ncclTestEngine = { ReduceGetBuffSize, ReduceRunTest -}; \ No newline at end of file +}; diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu index bf5cbede8d..23b99de35b 100644 --- a/src/reduce_scatter.cu +++ b/src/reduce_scatter.cu @@ -31,17 +31,24 @@ void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus; + int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; + int k=0; for (int i=0; inGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + if (args->enable_multiranks) + gpuid = gpuid % args->localNumDevices; HIPCHECK(hipSetDevice(gpuid)); - int rank = ((args->proc*args->nThreads 
+ args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); - void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); - HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault)); - TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks)); + + for (int l=0; lnRanks; l++) { + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); + HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault)); + TESTCHECK(InitDataReduce(args->expected[k], recvcount, rank*recvcount, type, op, rep, nranks)); + k++; + } HIPCHECK(hipDeviceSynchronize()); } return testSuccess; @@ -111,4 +118,4 @@ testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataTyp struct testEngine ncclTestEngine = { ReduceScatterGetBuffSize, ReduceScatterRunTest -}; \ No newline at end of file +}; diff --git a/src/scatter.cu b/src/scatter.cu index 884ec96a46..ec8c06b092 100644 --- a/src/scatter.cu +++ b/src/scatter.cu @@ -32,14 +32,22 @@ testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclR size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); + int k=0; for (int i=0; inGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + if (args->enable_multiranks) + gpuid = gpuid % args->localNumDevices; HIPCHECK(hipSetDevice(gpuid)); - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); - void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; - if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank)); - TESTCHECK(InitData(args->expected[i], recvcount, type, rep+rank*recvcount, root)); + + for (int l=0; lnRanks; l++) { + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); + HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k]; + if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(args->expected[k], recvcount, type, rep+rank*recvcount, root)); + k++; + + } HIPCHECK(hipDeviceSynchronize()); } return testSuccess; diff --git a/src/sendrecv.cu b/src/sendrecv.cu index 6ded375678..84d7398e42 100644 --- a/src/sendrecv.cu +++ b/src/sendrecv.cu @@ -31,17 +31,24 @@ void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *para testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { size_t sendcount = args->sendBytes / wordSize(type); size_t recvcount = args->expectedBytes / wordSize(type); - int nranks = args->nProcs*args->nThreads*args->nGpus; + int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks; + int k=0; for (int i=0; inGpus; i++) { int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; + if (args->enable_multiranks) + gpuid = gpuid % args->localNumDevices; HIPCHECK(hipSetDevice(gpuid)); - int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); - void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); - int peer = (rank-1+nranks)%nranks; - TESTCHECK(InitData(args->expected[i], recvcount, type, rep, peer)); + + for (int l=0; l<args->nRanks; l++) { + int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); + HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); + void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k]; + TESTCHECK(InitData(data, sendcount, type, rep, rank)); + int peer = (rank-1+nranks)%nranks; + TESTCHECK(InitData(args->expected[k], recvcount, type, rep, peer)); + k++; + } HIPCHECK(hipDeviceSynchronize()); } // We don't support in-place sendrecv From 67544e2c3450cf04a6784f57ebd09b05853e35cb Mon Sep 17 00:00:00 2001 From: Edgar Date: Mon, 13 Jun 2022 09:34:59 -0400 Subject: [PATCH 098/233] update pytest before running CI There seems to be an incompatibility between the python installation used in the CI and pytest. Update pytest before running CI. 
--- .jenkins/common.groovy | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy index 14c644b026..70dbbd7a3c 100644 --- a/.jenkins/common.groovy +++ b/.jenkins/common.groovy @@ -27,6 +27,8 @@ def runTestCommand (platform, project) def command = """#!/usr/bin/env bash set -x cd ${project.paths.project_build_prefix} + python3 -m pip install --upgrade pytest + python3 -m pytest --version python3 -m pytest -k "not MPI and not host and not fine" --verbose --junitxml=./testreport.xml """ From 9925195afc967bc50f82aecaa32c3c09866fbe15 Mon Sep 17 00:00:00 2001 From: akolliasAMD <99202231+akolliasAMD@users.noreply.github.com> Date: Thu, 21 Jul 2022 10:28:53 -0600 Subject: [PATCH 099/233] updated alltoallV test to not have any zero values (#12) updated alltoallV test to not have any zero values between ranks --- src/alltoallv.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/alltoallv.cu b/src/alltoallv.cu index 7a39bcce7b..cb8fcaff0d 100644 --- a/src/alltoallv.cu +++ b/src/alltoallv.cu @@ -71,12 +71,12 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc size_t chunksize = data_count/nranks; for (int j=0; j Date: Thu, 28 Jul 2022 08:19:16 -0700 Subject: [PATCH 100/233] Allow gpu config override in CI (#14) --- .jenkins/precheckin.groovy | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.jenkins/precheckin.groovy b/.jenkins/precheckin.groovy index aae81c922e..d316d47929 100644 --- a/.jenkins/precheckin.groovy +++ b/.jenkins/precheckin.groovy @@ -51,8 +51,10 @@ ci: { def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu16:['rccl906']]), "rocm-docker":([ubuntu16:['rccl906']])] - jobNameList = auxiliary.appendJobNameList(jobNameList) + jobNameList['compute-rocm-dkms-no-npi-hipclang'] = [ubuntu16:['rccl906']] + jobNameList = auxiliary.appendJobNameList(jobNameList) + propertyList.each { From d704668bf7376efd299d15cebe7b0e4ab183d5af Mon Sep 17 00:00:00 
2001 From: Liam Wrubleski Date: Tue, 9 Aug 2022 11:17:07 -0600 Subject: [PATCH 101/233] Add CMake files to build & package (#15) * Add CMake files to build & package * Change build technique on CI * Correct CI build command --- .jenkins/common.groovy | 5 ++- CMakeLists.txt | 61 ++++++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 71 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 CMakeLists.txt create mode 100644 src/CMakeLists.txt diff --git a/.jenkins/common.groovy b/.jenkins/common.groovy index 70dbbd7a3c..7426d35d75 100644 --- a/.jenkins/common.groovy +++ b/.jenkins/common.groovy @@ -13,7 +13,10 @@ def runCompileCommand(platform, project, jobName) ${getRCCL} ${auxiliary.exitIfNotSuccess()} cd ${project.paths.project_build_prefix} - ${project.paths.build_command} + cmake \ + -DCMAKE_CXX_COMPILER=/opt/rocm/hip/bin/hipcc \ + -S . -B build + make -C build -j\$(nproc) ${auxiliary.exitIfNotSuccess()} """ diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000000..539a1eae2b --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,61 @@ +# ######################################################################## +# Copyright 2022 Advanced Micro Devices, Inc. 
+# ######################################################################## + +cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR) + +project(RCCL-tests VERSION 2.12.10 LANGUAGES CXX) + +# Get ROCm path from environment if available +if (DEFINED ENV{ROCM_PATH}) + set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to ROCm installation") +else() + set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to ROCm installation") +endif() + +# Set CMake/CPack variables +list( APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/llvm) +set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Prefix install path") +set(CPACK_PACKAGING_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "Path to install to when packaged.") +set(CMAKE_CXX_STANDARD 14) + +# Get additional packages required +find_package(ROCM 0.7.3 CONFIG REQUIRED PATHS "${ROCM_PATH}") +find_package(RCCL HINTS CONFIG REQUIRED PATHS "${ROCM_PATH}") + +include(ROCMSetupVersion) +include(ROCMCreatePackage) +include(ROCMInstallTargets) +include(ROCMCheckTargetIds) +include(ROCMClients) + +# Build variables +option(USE_MPI "Build RCCL-tests with MPI support. 
Requires the MPI path to be set.") +set(MPI_PATH "" CACHE PATH "Path to MPI installation") +## Get default GPU targets using rocm_check_target_ids +rocm_check_target_ids( + DEFAULT_AMDGPU_TARGETS + TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx1030" +) +set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for these tests to target.") + +# Find the MPI package if we're using MPI +if (USE_MPI) + if(NOT MPI_PATH STREQUAL "") + set(MPI_HOME "${MPI_PATH}") + endif() + find_package(MPI REQUIRED MODULE) + add_definitions(-DOMPI_SKIP_MPICXX -DMPI_SUPPORT) +endif() + +set(ROCM_USE_DEV_COMPONENT OFF) # This repo doesn't have a dev component + +# Add all of the tests +add_subdirectory(src) + +# Create ROCm standard packages +rocm_create_package( + NAME rccl-separate-tests + DESCRIPTION "Tests for the ROCm Communication Collectives Library" + MAINTAINER "RCCL Maintainer " +) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000000..b5a40aefc1 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,71 @@ +# ######################################################################## +# Copyright 2022 Advanced Micro Devices, Inc. 
+# ######################################################################## + +# Compile common object library +set_property(SOURCE common.cu PROPERTY LANGUAGE CXX) +add_library(rccl_common OBJECT common.cu) +if(USE_MPI) + target_link_libraries(rccl_common roc::rccl MPI::MPI_CXX) +else() + target_link_libraries(rccl_common roc::rccl) +endif() + +function(add_relative_test test_name test_target) + get_target_property(EXE_PATH ${test_target} RUNTIME_OUTPUT_DIRECTORY) + if(EXE_PATH STREQUAL "EXE_PATH-NOTFOUND") + set(EXE_PATH ".") + endif() + get_filename_component(EXE_PATH "${EXE_PATH}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}") + get_target_property(EXE_NAME ${test_target} RUNTIME_OUTPUT_NAME) + if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND") + get_target_property(EXE_NAME ${test_target} OUTPUT_NAME) + if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND") + set(EXE_NAME "${test_target}") + endif() + endif() + file(RELATIVE_PATH rel_path "${CMAKE_CURRENT_BINARY_DIR}" "${EXE_PATH}/${EXE_NAME}") + add_test(NAME "${test_name}" COMMAND "./${rel_path}") +endfunction() + +function(add_rccl_test TEST) + set(TEST_SOURCE "${TEST}.cu") + set_property(SOURCE ${TEST_SOURCE} PROPERTY LANGUAGE CXX) + set(TEST_TARGET "${TEST}_perf") + add_executable(${TEST_TARGET} ${TEST_SOURCE}) + target_link_libraries( + ${TEST_TARGET} + PRIVATE + rccl_common + ) + if (NOT WIN32) + foreach(amdgpu_target ${AMDGPU_TARGETS}) + target_link_libraries(${TEST_TARGET} PRIVATE --amdgpu-target=${amdgpu_target}) + endforeach() + endif() + set_target_properties( + ${TEST_TARGET} + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" + # LINKER_LANGUAGE CXX + ) + add_relative_test(${TEST} ${TEST_TARGET}) + rocm_install(TARGETS ${TEST_TARGET}) + # TODO: copy/install DLLs on Windows + set_target_properties( + ${TEST_TARGET} PROPERTIES + INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib;${ROCM_PATH}/lib" + ) +endfunction() + +add_rccl_test(all_gather) +add_rccl_test(all_reduce) +add_rccl_test(alltoall) 
+add_rccl_test(alltoallv) +add_rccl_test(broadcast) +add_rccl_test(gather) +add_rccl_test(hypercube) +add_rccl_test(reduce_scatter) +add_rccl_test(reduce) +add_rccl_test(scatter) +add_rccl_test(sendrecv) From 9025051bbb62fecc15429876d5d7543fad370ec0 Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Tue, 9 Aug 2022 11:04:38 -0700 Subject: [PATCH 102/233] Fix missing error checking for AllocateBuffs due to merge (#17) --- src/common.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common.cu b/src/common.cu index c31cff308e..a4577550ba 100644 --- a/src/common.cu +++ b/src/common.cu @@ -1250,7 +1250,7 @@ testResult_t run() { HIPCHECK(hipSetDevice(gpuid)); for (int j=0; j Date: Tue, 9 Aug 2022 16:45:27 -0600 Subject: [PATCH 103/233] Enabling hipGraph codepath for future support (#18) --- src/common.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/common.cu b/src/common.cu index a4577550ba..bcb8df8b4b 100644 --- a/src/common.cu +++ b/src/common.cu @@ -651,7 +651,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t Barrier(args); -#if CUDART_VERSION >= 11030 +#if HIP_VERSION >= 50221310 hipGraph_t graphs[args->nGpus*args->nRanks]; hipGraphExec_t graphExec[args->nGpus*args->nRanks]; if (cudaGraphLaunches >= 1) { @@ -675,7 +675,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (agg_iters>1) NCCLCHECK(ncclGroupEnd()); } -#if CUDART_VERSION >= 11030 +#if HIP_VERSION >= 50221310 if (cudaGraphLaunches >= 1) { // End cuda graph capture for (int i=0; inGpus*args->nRanks; i++) { @@ -704,7 +704,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches; Allreduce(args, &deltaSec, average); -#if CUDART_VERSION >= 11030 +#if HIP_VERSION >= 50221310 if (cudaGraphLaunches >= 1) { //destroy cuda graph for (int i=0; 
inGpus*args->nRanks; i++) { @@ -727,7 +727,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t // Initialize sendbuffs, recvbuffs and expected TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); -#if CUDART_VERSION >= 11030 +#if HIP_VERSION >= 50221310 if (cudaGraphLaunches >= 1) { // Begin cuda graph capture for data check for (int i=0; inGpus*args->nRanks; i++) { @@ -739,7 +739,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t //test validation in single itertion, should ideally be included into the multi-iteration run TESTCHECK(startColl(args, type, op, root, in_place, 0)); -#if CUDART_VERSION >= 11030 +#if HIP_VERSION >= 50221310 if (cudaGraphLaunches >= 1) { // End cuda graph capture for (int i=0; inGpus*args->nRanks; i++) { @@ -758,7 +758,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t TESTCHECK(completeColl(args)); -#if CUDART_VERSION >= 11030 +#if HIP_VERSION >= 50221310 if (cudaGraphLaunches >= 1) { //destroy cuda graph for (int i=0; inGpus*args->nRanks; i++) { @@ -1069,10 +1069,10 @@ int main(int argc, char* argv[]) { } break; case 'G': -#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030 +#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && HIP_VERSION >= 50221310 cudaGraphLaunches = strtol(optarg, NULL, 0); #else - printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. 
Ignoring\n"); + printf("Option -G (HIP graph) not supported before NCCL 2.9 + ROCm 5.2 Ignoring\n"); #endif break; case 'a': From 45ec598ac4ca7fea8d3fd7b9e5e6206421bb3380 Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Fri, 12 Aug 2022 14:42:17 +0000 Subject: [PATCH 104/233] Fix typo from previous merge --- src/common.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common.cu b/src/common.cu index bcb8df8b4b..332cc3f272 100644 --- a/src/common.cu +++ b/src/common.cu @@ -731,7 +731,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (cudaGraphLaunches >= 1) { // Begin cuda graph capture for data check for (int i=0; i<args->nGpus*args->nRanks; i++) { - HIPCHECK(chiptreamBeginCapture(args->streams[i], args->nThreads > 1 ? hipStreamCaptureModeThreadLocal : hipStreamCaptureModeGlobal)); + HIPCHECK(hipStreamBeginCapture(args->streams[i], args->nThreads > 1 ? hipStreamCaptureModeThreadLocal : hipStreamCaptureModeGlobal)); } } #endif From 51af5572bf8ebf197bac7de8cd6bc7d847339575 Mon Sep 17 00:00:00 2001 From: John Bachan Date: Fri, 19 Aug 2022 15:15:10 -0500 Subject: [PATCH 105/233] Resync with NCCL 2.13 * Added "verifiable", a suite of kernels for generating and verifying reduction input and output arrays in a bit-precise way. * Data corruption errors now reported in number of wrong elements instead of max deviation. * Use ncclGetLastError. * Don't run hypercube on non-powers of 2 ranks. * Fix to hypercube data verification. * Use "thread local" as the default CUDA capture mode. * Replaced pthread_yield -> sched_yield() * Bugfix to the cpu-side barrier/allreduce implementations. 
--- src/Makefile | 8 +- src/all_gather.cu | 16 +- src/all_reduce.cu | 14 +- src/alltoall.cu | 17 +- src/broadcast.cu | 16 +- src/common.cu | 517 +++++--------- src/common.h | 30 +- src/gather.cu | 18 +- src/hypercube.cu | 27 +- src/reduce.cu | 14 +- src/reduce_scatter.cu | 14 +- src/scatter.cu | 16 +- src/sendrecv.cu | 16 +- verifiable/Makefile | 24 + verifiable/inexact_regress.cu | 177 +++++ verifiable/verifiable.cu | 1227 +++++++++++++++++++++++++++++++++ verifiable/verifiable.h | 59 ++ verifiable/verifiable.mk | 11 + 18 files changed, 1706 insertions(+), 515 deletions(-) create mode 100644 verifiable/Makefile create mode 100644 verifiable/inexact_regress.cu create mode 100644 verifiable/verifiable.cu create mode 100644 verifiable/verifiable.h create mode 100644 verifiable/verifiable.mk diff --git a/src/Makefile b/src/Makefile index 2a399db7fa..137b9d7925 100644 --- a/src/Makefile +++ b/src/Makefile @@ -83,12 +83,16 @@ build: ${BIN_FILES} clean: rm -rf ${DST_DIR} -${DST_DIR}/%.o: %.cu common.h +TEST_VERIFIABLE_SRCDIR := ../verifiable +TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable +include ../verifiable/verifiable.mk + +${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS) @printf "Compiling %-35s > %s\n" $< $@ @mkdir -p ${DST_DIR} $(NVCC) -o $@ $(NVCUFLAGS) -c $< -${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o $(TEST_VERIFIABLE_OBJS) @printf "Linking %-35s > %s\n" $< $@ @mkdir -p ${DST_DIR} $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} diff --git a/src/all_gather.cu b/src/all_gather.cu index 0b9e0cc939..1eaafddfab 100644 --- a/src/all_gather.cu +++ b/src/all_gather.cu @@ -7,18 +7,6 @@ #include "cuda_runtime.h" #include "common.h" -void print_header() { - PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", ""); - PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %8s %7s 
%6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); -} - -void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %8s", size, count, typeName); -} - void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { *sendcount = count/nranks; *recvcount = (count/nranks)*nranks; @@ -38,9 +26,9 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); for (int j=0; jexpected[i])+args->sendBytes*j, sendcount, type, rep, j)); + TESTCHECK(InitData((char*)args->expected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0)); } CUDACHECK(cudaDeviceSynchronize()); } diff --git a/src/all_reduce.cu b/src/all_reduce.cu index 9b6b7f02b9..9c65f25aba 100644 --- a/src/all_reduce.cu +++ b/src/all_reduce.cu @@ -7,18 +7,6 @@ #include "cuda_runtime.h" #include "common.h" -void print_header() { - PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); -} - -void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %8s %6s", size, 
count, typeName, opName); -} - void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { *sendcount = count; *recvcount = count; @@ -38,7 +26,7 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank)); TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); CUDACHECK(cudaDeviceSynchronize()); } diff --git a/src/alltoall.cu b/src/alltoall.cu index 865099743d..0eae1b07c9 100644 --- a/src/alltoall.cu +++ b/src/alltoall.cu @@ -7,18 +7,6 @@ #include "cuda_runtime.h" #include "common.h" -void print_header() { - PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); -} - -void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %8s %6s", size, count, typeName, opName); -} - void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { *sendcount = (count/nranks)*nranks; *recvcount = (count/nranks)*nranks; @@ -39,9 +27,10 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, nccl int rank = ((args->proc*args->nThreads + 
args->thread)*args->nGpus + i); CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); for (int j=0; jexpected[i])+args->sendBytes/nranks*j, sendcount/nranks, type, rep+rank*sendcount/nranks, j)); + size_t partcount = sendcount/nranks; + TESTCHECK(InitData((char*)args->expected[i] + j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); } CUDACHECK(cudaDeviceSynchronize()); } diff --git a/src/broadcast.cu b/src/broadcast.cu index e2b4421ac5..40dcb5d885 100644 --- a/src/broadcast.cu +++ b/src/broadcast.cu @@ -7,18 +7,6 @@ #include "cuda_runtime.h" #include "common.h" -void print_header() { - PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); -} - -void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %8s %6i", size, count, typeName, root); -} - void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { *sendcount = count; *recvcount = count; @@ -37,8 +25,8 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; - if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank)); - TESTCHECK(InitData(args->expected[i], recvcount, type, rep, root)); + if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0)); + TESTCHECK(InitData(args->expected[i], recvcount, 0, type, ncclSum, rep, 1, 0)); CUDACHECK(cudaDeviceSynchronize()); } return testSuccess; diff --git a/src/common.cu b/src/common.cu index 05f814d923..eaa3318f34 100644 --- a/src/common.cu +++ b/src/common.cu @@ -7,10 +7,13 @@ #include "common.h" #include #include +#include #include #include #include "cuda.h" +#include "../verifiable/verifiable.h" + int test_ncclVersion = 0; // init'd with ncclGetVersion() #if NCCL_MAJOR >= 2 @@ -107,362 +110,154 @@ static double parsesize(const char *value) { return size * units; } -double DeltaMaxValue(ncclDataType_t type) { - switch(type) { - case ncclHalf: return 1e-2; -#if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: return 1e-2; -#endif - case ncclFloat: return 1e-5; - case ncclDouble: return 1e-12; - case ncclInt: -#if NCCL_MAJOR >= 2 - case ncclUint8: - //case ncclInt32: - case ncclUint32: -#endif - case ncclInt64: - case ncclUint64: return 1e-200; - } - return 1e-200; -} - -template __device__ -double absDiff(T a, T b) { - return fabs((double)(b - a)); -} - -template<> __device__ -double absDiff(half a, half b) { - float x = __half2float(a); - float y = __half2float(b); - return fabs((double)(y-x)); -} - -template __device__ -float toFloat(T a) { - return (float)a; -} -template<> __device__ -float toFloat(half a) { - return __half2float(a); -} -#if defined(__CUDA_BF16_TYPES_EXIST__) -template<> __device__ -float toFloat(__nv_bfloat16 a) { - return __bfloat162float(a); -} -#endif - -template __global__ -void deltaKern(void* A_, void* B_, size_t count, double* max) { - const T* A = (const T*)A_; - const T* B = (const T*)B_; - __shared__ double temp[BSIZE]; - int tid = blockIdx.x*blockDim.x + 
threadIdx.x; - double locmax = 0.0; - for(size_t i=tid; i locmax ) { - locmax = delta; -#ifdef DEBUG_PRINT - if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i])); -#endif - } - } - - tid = threadIdx.x; - temp[tid] = locmax; - for(int stride = BSIZE/2; stride > 1; stride>>=1) { - __syncthreads(); - if( tid < stride ) - temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride]; - } - __syncthreads(); - if( threadIdx.x == 0) - max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1]; -} - -testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) { - switch (type) { -#if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: - deltaKern<__nv_bfloat16, 512><<>>(results, expected, count, devmax); break; -#endif - case ncclHalf: - deltaKern<<>>(results, expected, count, devmax); break; - case ncclFloat: - deltaKern<<>>(results, expected, count, devmax); break; - case ncclDouble: - deltaKern<<>>(results, expected, count, devmax); break; - - case ncclChar: -#if NCCL_MAJOR >= 2 - case ncclUint8: -#endif - deltaKern<<>>(results, expected, count, devmax); break; - case ncclInt: -#if NCCL_MAJOR >= 2 - case ncclUint32: -#endif - deltaKern<<>>(results, expected, count, devmax); break; - case ncclInt64: - case ncclUint64: - deltaKern<<>>(results, expected, count, devmax); break; - } +testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) { + ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault); CUDACHECK(cudaDeviceSynchronize()); - for (int i=1; i -__device__ T testValue(const size_t offset, const int rep, const int rank) { - uint8_t v = (rep+rank+offset) % 256; - return (T)v; +testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t 
op, uint64_t seed, int nranks) { + ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault); + return testSuccess; } -// For floating point datatype, we use values between 0 and 1 otherwise the -// Product operation will produce NaNs. -template<> -__device__ double testValue(const size_t offset, const int rep, const int rank) { - return 1.0/(1.0+(double)testValue(offset, rep, rank)); -} -template<> -__device__ float testValue(const size_t offset, const int rep, const int rank) { - return 1.0/(1.0+(float)testValue(offset, rep, rank)); -} -template<> -__device__ half testValue(const size_t offset, const int rep, const int rank) { - return __float2half(testValue(offset, rep, rank)); -} -#if defined(__CUDA_BF16_TYPES_EXIST__) -template<> -__device__ __nv_bfloat16 testValue<__nv_bfloat16>(const size_t offset, const int rep, const int rank) { - return __float2bfloat16(testValue(offset, rep, rank)); -} -#endif - -// Operations -template -__device__ T ncclOpSum(T a, T b) { return a+b; } -template -__device__ T ncclOpProd(T a, T b) { return a*b; } -template -__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; } -template -__device__ T ncclOpMin(T a, T b) { return a -__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); } -template<> -__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); } -template<> -__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; } -template<> -__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? 
a : b; } - -template -__device__ T ncclPPOpIdent(T x, int arg) { return x; } -template -__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); } -template -__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); } -template<> -__device__ half ncclPPOpMul(half x, int arg) { - return __float2half(__half2float(x)*float(arg)); -} -template<> -__device__ half ncclPPOpDiv(half x, int n) { - return __float2half(__half2float(x)/n); -} -#if defined(__CUDA_BF16_TYPES_EXIST__) -template<> -__device__ __nv_bfloat16 ncclPPOpMul(__nv_bfloat16 x, int arg) { - return __float2bfloat16(__bfloat162float(x)*float(arg)); -} -template<> -__device__ __nv_bfloat16 ncclPPOpDiv(__nv_bfloat16 x, int n) { - return __float2bfloat16(__bfloat162float(x)/n); -} -#endif - -__host__ __device__ int preMulScalar(int rank) { - return 1 + rank%2; +testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) { + ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault); + return testSuccess; } -template -__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) { - for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o+offset, rep, 0); - val = PreOp(val, preMulScalar(0)); - for (int i=1; i(o+offset, rep, i); - val1 = PreOp(val1, preMulScalar(i)); - val = Op(val, val1); - } - data[o] = PostOp(val, nranks); +void Barrier(struct threadArgs *args) { + thread_local int epoch = 0; + static pthread_mutex_t lock[2] = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}; + static pthread_cond_t cond[2] = {PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER}; + static int counter[2] = {0, 0}; + + pthread_mutex_lock(&lock[epoch]); + if(++counter[epoch] == args->nThreads) + pthread_cond_broadcast(&cond[epoch]); + + if(args->thread+1 == args->nThreads) { + while(counter[epoch] != args->nThreads) + 
pthread_cond_wait(&cond[epoch], &lock[epoch]); + #ifdef MPI_SUPPORT + MPI_Barrier(MPI_COMM_WORLD); + #endif + counter[epoch] = 0; + pthread_cond_broadcast(&cond[epoch]); } + else { + while(counter[epoch] != 0) + pthread_cond_wait(&cond[epoch], &lock[epoch]); + } + pthread_mutex_unlock(&lock[epoch]); + epoch ^= 1; } -#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel, preop, postop > -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \ - KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent) -#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv) -#else - #define OPS(type) \ - KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \ - KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent) -#endif - -static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = { - OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double), -#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - OPS(__nv_bfloat16) -#endif -}; - -testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) { - dim3 grid = { 32, 1, 1 }; - dim3 block = { 256, 1, 1 }; - void* args[5] = { 
(void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks }; - CUDACHECK(cudaLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, cudaStreamDefault)); - return testSuccess; -} - +// Inter-thread/process barrier+allreduce. The quality of the return value +// for average=0 (which means broadcast from rank=0) is dubious. The returned +// value will actually be the result of process-local broadcast from the local thread=0. template -__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) { - for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o(o, rep, rank); -} +void Allreduce(struct threadArgs* args, T* value, int average) { + thread_local int epoch = 0; + static pthread_mutex_t lock[2] = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}; + static pthread_cond_t cond[2] = {PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER}; + static T accumulator[2]; + static int counter[2] = {0, 0}; -static void* const initDataKerns[ncclNumTypes] = { - (void*)InitDataKernel< int8_t>, - (void*)InitDataKernel< uint8_t>, - (void*)InitDataKernel< int32_t>, - (void*)InitDataKernel, - (void*)InitDataKernel< int64_t>, - (void*)InitDataKernel, - (void*)InitDataKernel< half>, - (void*)InitDataKernel< float>, - (void*)InitDataKernel< double>, -#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) - (void*)InitDataKernel<__nv_bfloat16> -#endif -}; - -template -testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) { - T* ptr = (T*)dest; - InitDataKernel<<<16, 512>>>(ptr, N, rep, rank); - return testSuccess; -} - -testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) { - dim3 grid = { 32, 1, 1 }; - dim3 block = { 256, 1, 1 }; - void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank }; - CUDACHECK(cudaLaunchKernel(initDataKerns[type], grid, block, args, 0, cudaStreamDefault)); - return 
testSuccess; -} - -void Barrier(struct threadArgs* args) { - while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); - args->barrier[args->barrier_idx] = args->thread + 1; - if (args->thread+1 == args->nThreads) { -#ifdef MPI_SUPPORT - MPI_Barrier(MPI_COMM_WORLD); -#endif - args->barrier[args->barrier_idx] = 0; + pthread_mutex_lock(&lock[epoch]); + if(counter[epoch] == 0) { + if(average != 0 || args->thread == 0) accumulator[epoch] = *value; } else { - while (args->barrier[args->barrier_idx]) pthread_yield(); - } - args->barrier_idx=!args->barrier_idx; -} - -// Inter-thread/process barrier+allreduce -void Allreduce(struct threadArgs* args, double* value, int average) { - while (args->barrier[args->barrier_idx] != args->thread) pthread_yield(); - double val = *value; - if (args->thread > 0) { - double val2 = args->reduce[args->barrier_idx]; - if (average == 1) val += val2; - if (average == 2) val = std::min(val, val2); - if (average == 3) val = std::max(val, val2); - } - if (average || args->thread == 0) args->reduce[args->barrier_idx] = val; - args->barrier[args->barrier_idx] = args->thread + 1; - if (args->thread+1 == args->nThreads) { -#ifdef MPI_SUPPORT - if (average != 0) { - MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? 
MPI_MIN : MPI_MAX; - MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD); + switch(average) { + case /*r0*/ 0: if(args->thread == 0) accumulator[epoch] = *value; break; + case /*avg*/1: accumulator[epoch] += *value; break; + case /*min*/2: accumulator[epoch] = std::min(accumulator[epoch], *value); break; + case /*max*/3: accumulator[epoch] = std::max(accumulator[epoch], *value); break; + case /*sum*/4: accumulator[epoch] += *value; break; } -#endif - if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads; - args->reduce[1-args->barrier_idx] = 0; - args->barrier[args->barrier_idx] = 0; - } else { - while (args->barrier[args->barrier_idx]) pthread_yield(); } - *value = args->reduce[args->barrier_idx]; - args->barrier_idx=!args->barrier_idx; + + if(++counter[epoch] == args->nThreads) + pthread_cond_broadcast(&cond[epoch]); + + if(args->thread+1 == args->nThreads) { + while(counter[epoch] != args->nThreads) + pthread_cond_wait(&cond[epoch], &lock[epoch]); + + #ifdef MPI_SUPPORT + if(average != 0) { + static_assert(std::is_same::value || std::is_same::value, "Allreduce only for T in {long long, double}"); + MPI_Datatype ty = std::is_same::value ? MPI_LONG_LONG : + std::is_same::value ? MPI_DOUBLE : + MPI_Datatype(); + MPI_Op op = average == 1 ? MPI_SUM : + average == 2 ? MPI_MIN : + average == 3 ? MPI_MAX : + average == 4 ? 
MPI_SUM : MPI_Op(); + MPI_Allreduce(MPI_IN_PLACE, (void*)&accumulator[epoch], 1, ty, op, MPI_COMM_WORLD); + } + #endif + + if(average == 1) accumulator[epoch] /= args->nProcs*args->nThreads; + counter[epoch] = 0; + pthread_cond_broadcast(&cond[epoch]); + } + else { + while(counter[epoch] != 0) + pthread_cond_wait(&cond[epoch], &lock[epoch]); + } + pthread_mutex_unlock(&lock[epoch]); + + *value = accumulator[epoch]; + epoch ^= 1; } -testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) { +testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int64_t *wrongElts) { + int nranks = args->nProcs*args->nGpus*args->nThreads; size_t count = args->expectedBytes/wordSize(type); - double maxDelta = 0.0; + + int64_t *wrongPerGpu = nullptr; + CUDACHECK(cudaHostAlloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), cudaHostAllocMapped)); + for (int i=0; inGpus; i++) { int device; int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); CUDACHECK(cudaSetDevice(device)); void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; - TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost)); - maxDelta = std::max(*(args->deltaHost), maxDelta); -#ifdef DEBUG_PRINT - if (rank == 0) { - int *expectedHost = (int *)malloc(args->expectedBytes); - int *dataHost = (int *)malloc(args->expectedBytes); + TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i)); - cudaMemcpy(expectedHost, args->expected[0], args->expectedBytes, cudaMemcpyDeviceToHost); - printf("\n Expected: "); - for(int j=0; jexpectedBytes/sizeof(int); j++) { - printf("%d:%d ", j, expectedHost[j]); - } - printf("\n"); +#if 1 && DEBUG_PRINT + if (args->reportErrors && wrongPerGpu[i] != 0) { + printf("rank=%d #wrong=%d\n", rank, (int)wrongPerGpu[i]); + char *expectedHost = (char*)malloc(args->expectedBytes); + char *dataHost = (char*)malloc(args->expectedBytes); + int eltsz = wordSize(type); + cudaMemcpy(expectedHost, args->expected[i], args->expectedBytes, cudaMemcpyDeviceToHost); + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); - cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); - printf("\n Actual: "); - for (int j=0; jexpectedBytes/sizeof(int); j++) { - printf("%d:%d ", j, dataHost[j]); - } - printf("\n"); - free(expectedHost); - free(dataHost); + for(int j=0; jexpectedBytes/eltsz; j++) { + unsigned long long want, got; + want = 0; + memcpy(&want, expectedHost + j*eltsz, eltsz); + got = 0; + memcpy(&got, dataHost + j*eltsz, eltsz); + if(want != got) { + printf(" rank=%d elt[%d]: want=0x%llx got=0x%llx\n", rank, j, want, got); + } + } + free(expectedHost); + free(dataHost); } #endif } - double nranks = args->nProcs*args->nThreads*args->nGpus; - if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++; - *delta = maxDelta; + + *wrongElts = 0; + for (int i=0; i < args->nGpus; i++) *wrongElts += 
wrongPerGpu[i]; + cudaFree(wrongPerGpu); + + if (args->reportErrors && *wrongElts) args->errors[0]++; return testSuccess; } @@ -503,7 +298,7 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* } // We might want to let other threads (including NCCL threads) use the CPU. - if (idle) pthread_yield(); + if (idle) sched_yield(); } free(done); return testSuccess; @@ -541,19 +336,18 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t __nv_bfloat16 bf16; #endif }; - int scalar = preMulScalar(rank); switch(type) { - case ncclInt8: i8 = int8_t(scalar); break; - case ncclUint8: u8 = uint8_t(scalar); break; - case ncclInt32: i32 = int32_t(scalar); break; - case ncclUint32: u32 = uint32_t(scalar); break; - case ncclInt64: i64 = int32_t(scalar); break; - case ncclUint64: u64 = uint32_t(scalar); break; - case ncclFloat16: f16 = __float2half(float(scalar)); break; - case ncclFloat32: f32 = float(scalar); break; - case ncclFloat64: f64 = double(scalar); break; + case ncclInt8: i8 = ncclVerifiablePremulScalar(rank); break; + case ncclUint8: u8 = ncclVerifiablePremulScalar(rank); break; + case ncclInt32: i32 = ncclVerifiablePremulScalar(rank); break; + case ncclUint32: u32 = ncclVerifiablePremulScalar(rank); break; + case ncclInt64: i64 = ncclVerifiablePremulScalar(rank); break; + case ncclUint64: u64 = ncclVerifiablePremulScalar(rank); break; + case ncclFloat16: f16 = ncclVerifiablePremulScalar(rank); break; + case ncclFloat32: f32 = ncclVerifiablePremulScalar(rank); break; + case ncclFloat64: f64 = ncclVerifiablePremulScalar(rank); break; #if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: bf16 = __float2bfloat16(float(scalar)); break; + case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<__nv_bfloat16>(rank); break; #endif } NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); @@ -607,9 +401,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, 
ncclRedOp_t if (cudaGraphLaunches >= 1) { // Begin cuda graph capture for (int i=0; inGpus; i++) { - // Thread local mode is needed for: - // - Multi-thread mode - // - P2P pre-connect + // Thread local mdoe is needed for: + // - Multi-thread mode: where graph capture and instantiation can happen concurrently across threads + // - P2P pre-connect: when there is no warm-up, P2P pre-connect is done during graph capture. + // Since pre-connect calls cudaMalloc, we cannot use global capture mode CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); } } @@ -669,7 +464,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t Barrier(args); - double maxDelta = 0; + int64_t wrongElts = 0; static __thread int rep = 0; rep++; if (datacheck) { @@ -717,10 +512,12 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } #endif - TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta)); + TESTCHECK(CheckData(args, type, op, root, in_place, &wrongElts)); //aggregate delta from all threads and procs - Allreduce(args, &maxDelta, 3); + long long wrongElts1 = wrongElts; + Allreduce(args, &wrongElts1, /*sum*/4); + wrongElts = wrongElts1; } double timeUsec = deltaSec*1.0E6; @@ -733,9 +530,9 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t sprintf(timeStr, "%7.2f", timeUsec); } if (datacheck) { - PRINT(" %7s %6.2f %6.2f %5.0le", timeStr, algBw, busBw, maxDelta); + PRINT(" %7s %6.2f %6.2f %5g", timeStr, algBw, busBw, (double)wrongElts); } else { - PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); + PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); } args->bw[0] += busBw; @@ -775,7 +572,9 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* // Benchmark for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? 
size*args->stepfactor : size+args->stepbytes)) { setupArgs(size, type, args); - print_line_header(max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + char rootName[100]; + sprintf(rootName, "%6i", root); + PRINT("%12li %12li %8s %6s %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName); TESTCHECK(BenchTime(args, type, op, root, 0)); TESTCHECK(BenchTime(args, type, op, root, 1)); PRINT("\n"); @@ -828,7 +627,7 @@ testResult_t threadLaunch(struct testThread* thread) { return testSuccess; } -testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) { CUDACHECK(cudaMalloc(sendbuff, nbytes)); CUDACHECK(cudaMalloc(recvbuff, nbytes)); if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); @@ -1027,8 +826,10 @@ testResult_t run() { #endif is_main_thread = (proc == 0) ? 
1 : 0; - PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d validation: %d \n", nThreads, nGpus, minBytes, maxBytes, - (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck); + PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n", + nThreads, nGpus, minBytes, maxBytes, + (stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", + warmup_iters, iters, agg_iters, datacheck, cudaGraphLaunches); if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n"); if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n"); PRINT("#\n"); @@ -1087,7 +888,7 @@ testResult_t run() { for (int i=0; i= NCCL_VERSION(2,12,10) +#define NCCLCHECK(cmd) do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + char hostname[1024]; \ + getHostName(hostname, 1024); \ + printf("%s: Test NCCL failure %s:%d " \ + "'%s / %s'\n", \ + hostname,__FILE__,__LINE__, \ + ncclGetErrorString(res), \ + ncclGetLastError(NULL)); \ + return testNcclError; \ + } \ +} while(0) +#else #define NCCLCHECK(cmd) do { \ ncclResult_t res = cmd; \ if (res != ncclSuccess) { \ @@ -39,6 +54,7 @@ return testNcclError; \ } \ } while(0) +#endif typedef enum { testSuccess = 0, @@ -111,14 +127,6 @@ struct threadArgs { void** expected; size_t expectedBytes; - volatile int* sync; - int sync_idx; - volatile int* barrier; - int barrier_idx; - volatile double* reduce; - int syncRank; - int syncNranks; - double* deltaHost; int* errors; double* bw; int* bw_count; @@ -141,8 +149,8 @@ struct testThread { // Provided by common.cu extern void Barrier(struct threadArgs* args); extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root); -extern testResult_t 
InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks); -extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank); +extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks); +extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank); extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks); // Provided by each coll @@ -228,7 +236,7 @@ static size_t wordSize(ncclDataType_t type) { case ncclInt64: case ncclUint64: case ncclDouble: - //case ncclFloat64: + //case ncclFloat64: return 8; default: return 0; } diff --git a/src/gather.cu b/src/gather.cu index d0cfa5dabb..99088528d3 100644 --- a/src/gather.cu +++ b/src/gather.cu @@ -7,18 +7,6 @@ #include "cuda_runtime.h" #include "common.h" -void print_header() { - PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); -} - -void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %8s %6i", size, count, typeName, root); -} - void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { *sendcount = count/nranks; *recvcount = (count/nranks)*nranks; @@ -38,12 +26,10 @@ testResult_t 
GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRe int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0)); CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); if (rank == root) { - for (int j=0; jexpected[i])+args->sendBytes*j, sendcount, type, rep, j)); - } + TESTCHECK(InitData(args->expected[i], nranks*sendcount, 0, type, ncclSum, rep, 1, 0)); } CUDACHECK(cudaDeviceSynchronize()); } diff --git a/src/hypercube.cu b/src/hypercube.cu index 142f1a6359..ae9fbd0ad5 100644 --- a/src/hypercube.cu +++ b/src/hypercube.cu @@ -9,18 +9,6 @@ #define ALIGN 4 -void print_header() { - PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", ""); - PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); -} - -void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %8s", size, count, typeName); -} - void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { size_t base = (count/(ALIGN*nranks))*ALIGN; *sendcount = base; @@ -41,9 +29,9 @@ testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncc int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place 
? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); for (int j=0; jexpected[i])+args->sendBytes*j, sendcount, type, rep, j)); + TESTCHECK(InitData((char*)args->expected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0)); } CUDACHECK(cudaDeviceSynchronize()); } @@ -110,9 +98,16 @@ testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t run_typenames = test_typenames; } - for (int i=0; inProcs*args->nThreads*args->nGpus; + if (nRanks && !(nRanks & (nRanks - 1))) { + for (int i=0; iproc*args->nThreads + args->thread)*args->nGpus + i); CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank)); CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); CUDACHECK(cudaDeviceSynchronize()); diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu index b0c4fab52e..e4a59dc20e 100644 --- a/src/reduce_scatter.cu +++ b/src/reduce_scatter.cu @@ -7,18 +7,6 @@ #include "cuda_runtime.h" #include "common.h" -void print_header() { - PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); -} - -void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - 
PRINT("%12li %12li %8s %6s", size, count, typeName, opName); -} - void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { *sendcount = (count/nranks)*nranks; *recvcount = count/nranks; @@ -38,7 +26,7 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank)); CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks)); CUDACHECK(cudaDeviceSynchronize()); diff --git a/src/scatter.cu b/src/scatter.cu index 93ab2e694a..d244b2b8bc 100644 --- a/src/scatter.cu +++ b/src/scatter.cu @@ -7,18 +7,6 @@ #include "cuda_runtime.h" #include "common.h" -void print_header() { - PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); -} - -void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %8s %6i", size, count, typeName, root); -} - void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { *sendcount = (count/nranks)*nranks; *recvcount = count/nranks; @@ 
-37,8 +25,8 @@ testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclR int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; - if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank)); - TESTCHECK(InitData(args->expected[i], recvcount, type, rep+rank*recvcount, root)); + if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0)); + TESTCHECK(InitData(args->expected[i], recvcount, rank*recvcount, type, ncclSum, rep, 1, 0)); CUDACHECK(cudaDeviceSynchronize()); } return testSuccess; diff --git a/src/sendrecv.cu b/src/sendrecv.cu index 8bebc48e3d..e73a92b2d5 100644 --- a/src/sendrecv.cu +++ b/src/sendrecv.cu @@ -7,18 +7,6 @@ #include "cuda_runtime.h" #include "common.h" -void print_header() { - PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", ""); - PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); -} - -void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %8s", size, count, typeName); -} - void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { *sendcount = count; *recvcount = count; @@ -38,9 +26,9 @@ testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, nccl int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0)); int peer = (rank-1+nranks)%nranks; - TESTCHECK(InitData(args->expected[i], recvcount, type, rep, peer)); + TESTCHECK(InitData(args->expected[i], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0)); CUDACHECK(cudaDeviceSynchronize()); } // We don't support in-place sendrecv diff --git a/verifiable/Makefile b/verifiable/Makefile new file mode 100644 index 0000000000..b141a2a7c5 --- /dev/null +++ b/verifiable/Makefile @@ -0,0 +1,24 @@ +include ../../makefiles/common.mk + +.PHONY: all clean + +BUILDDIR := $(abspath ../../build) +NCCLDIR := $(BUILDDIR) +NVCUFLAGS += -I$(NCCLDIR)/include/ -I../include +DST_DIR := $(BUILDDIR)/test/verifiable + +all: $(DST_DIR)/self_test $(DST_DIR)/verifiable.o + +clean: + rm -rf $(DST_DIR) + +TEST_VERIFIABLE_SRCDIR := . +TEST_VERIFIABLE_BUILDDIR := $(DST_DIR) +include verifiable.mk + +self_test: $(DST_DIR)/self_test + +$(DST_DIR)/self_test: verifiable.cu verifiable.h + @printf "Linking %s\n" $@ + @mkdir -p $(DST_DIR) + $(NVCC) -o $@ $(NVCUFLAGS) -DSELF_TEST=1 verifiable.cu $(NVLDFLAGS) diff --git a/verifiable/inexact_regress.cu b/verifiable/inexact_regress.cu new file mode 100644 index 0000000000..d7bd545f62 --- /dev/null +++ b/verifiable/inexact_regress.cu @@ -0,0 +1,177 @@ +/* Generate parameters for our error bound model of floating point average + * (sum of scaled values) by sampling sums of random sequences for each + * floating point type. + * + * The model has parameters "coef" and "power", where for two floats a & b, + * they are close enough if and only if: + * abs(intBits(a) - intBits(b)) <= 1 + coef*pow(rank_n, power); + * + * Where intBits(x) is the reinterpretation of the float bitpattern as an integer. 
+ * + * Compile with: + * nvcc -gencode=arch=compute_80,code=sm_80 + */ + +#include +#include +#include +#include +#include +#include + +using std::uint64_t; +using std::uint32_t; +using bfloat16 = __nv_bfloat16; + +template +struct float_traits; + +template<> +struct float_traits { + static constexpr int mantissa_bits = 23; + static constexpr int exponent_bits = 8; + using uint_t = uint32_t; + __device__ static float make(double x) { return (float)x; } + __device__ static float make(uint64_t x) { return (float)x; } + __device__ static double todouble(float x) { return x; } + __device__ static float add(float a, float b) { return a+b; } + __device__ static float mul(float a, float b) { return a*b; } +}; +template<> +struct float_traits { + static constexpr int mantissa_bits = 52; + static constexpr int exponent_bits = 11; + using uint_t = uint64_t; + __device__ static double make(double x) { return x; } + __device__ static double make(uint64_t x) { return (double)x; } + __device__ static double todouble(double x) { return x; } + __device__ static double add(double a, double b) { return a+b; } + __device__ static double mul(double a, double b) { return a*b; } +}; +template<> +struct float_traits { + static constexpr int mantissa_bits = 10; + static constexpr int exponent_bits = 5; + using uint_t = uint16_t; + __device__ static half make(double x) { return __double2half(x); } + __device__ static half make(uint64_t x) { return __int2half_rn(x); } + __device__ static double todouble(half x) { return __half2float(x); } + __device__ static half add(half a, half b) { return __hadd(a, b); } + __device__ static half mul(half a, half b) { return __hmul(a, b); } +}; +template<> +struct float_traits { + static constexpr int mantissa_bits = 7; + static constexpr int exponent_bits = 8; + using uint_t = uint16_t; + __device__ static bfloat16 make(double x) { return __double2bfloat16(x); } + __device__ static bfloat16 make(uint64_t x) { return __int2bfloat16_rn(x); } + __device__ 
static double todouble(bfloat16 x) { return __bfloat162float(x); } + __device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return __hadd(a, b); } + __device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return __hmul(a, b); } +}; + +template +__device__ int compare(F a, F b) { + union { typename float_traits::uint_t ua; F fa; }; + union { typename float_traits::uint_t ub; F fb; }; + ua=0; ub=0; + fa=a; fb=b; + //std::printf("bits(%1.10f)=%x bits(%1.10f)=%x\n", fa, ua, fb, ub); + return ua < ub ? ub-ua : ua-ub; +} + +struct xoshiro256ss { + uint64_t s[4]; + __device__ xoshiro256ss(int seed) { + constexpr uint64_t src[4] = {0xbb99e851d1f545cc, 0xbfc4022389ca40cb, 0xe84aff5cb1914af5, 0x845999858284de77}; + for(int i=0; i < 4; i++) + s[i] = src[i] + (seed + i)*0xb45de8a52fdb65d3; + } + __device__ uint64_t operator()() { + auto rol64 = [](uint64_t x, int k) { + return (x << k) | (x >> (64 - k)); + }; + uint64_t const result = rol64(s[1] * 5, 7) * 9; + uint64_t const t = s[1] << 17; + s[2] ^= s[0]; + s[3] ^= s[1]; + s[1] ^= s[2]; + s[0] ^= s[3]; + s[2] ^= t; + s[3] = rol64(s[3], 45); + return result; + } +}; + +template +__global__ void kernel() { + using traits = float_traits; + constexpr int samps = 4<<10; + __shared__ F accf[samps]; + __shared__ double accd[samps]; + + xoshiro256ss rng(threadIdx.x); + float expo_avg = 1; + for(int pass=0; pass < 2; pass++) { + F scalar = traits::make(1.0/(3.14159 + .5*threadIdx.x)); + int err_max = 0; + float coef = 0; + double expo_sum = 0; + int expo_n = 0; + int max_ranks = std::is_same::value ? 16<<10 : 1<::value ? double(rng() & m) : 1.0; + F f = traits::make(d); + accf[i] = traits::add(accf[i], traits::mul(scalar, f)); + accd[i] += traits::todouble(f); + //if(threadIdx.x==0 && std::is_same::value) std::printf(" r=%d f=%f\n", r, traits::todouble(accf[i])); + int e = compare(accf[i], traits::mul(scalar, traits::make(accd[i]))); + err = err > e ? err : e; + } + err = __reduce_max_sync(-1u, err); + err_max = err_max > err ? 
err_max : err; + if (r >= 2) { + // err = 1 + coef*pow(r,expo) + float c = float(err-1)/powf(float(r), expo_avg); + coef = coef > c ? coef : c; + } + if (r >= 2) { + double expo = log2f(1+err_max)/log2f(r); + expo_sum += expo; + expo_n++; + //if(threadIdx.x==0 && std::is_same::value) std::printf(" r=%d err=%d errmax=%d expo=%f sum=%f n=%d\n", r, err, err_max, expo, expo_sum, expo_n); + } + } + } + if(pass==0) + expo_avg = expo_sum/expo_n; + else if(threadIdx.x == 0) + std::printf(" coef=%1.10f expo=%1.10f\n", coef, expo_avg); + } +} + +int main() { + std::printf("type=float:\n"); + kernel<<<1,32>>>(); + cudaDeviceSynchronize(); + + std::printf("\ntype=half:\n"); + kernel<<<1,32>>>(); + cudaDeviceSynchronize(); + + std::printf("\ntype=bfloat16:\n"); + kernel<<<1,32>>>(); + cudaDeviceSynchronize(); + return 0; +} diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu new file mode 100644 index 0000000000..5f617ee188 --- /dev/null +++ b/verifiable/verifiable.cu @@ -0,0 +1,1227 @@ +#pragma nv_diag_suppress declared_but_not_referenced + +#include "verifiable.h" +#include + +#include +#include +#if CUDART_VERSION >= 11000 +#include +#endif + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && defined(__CUDA_BF16_TYPES_EXIST__) + #define HAVE_ncclBfloat16 1 +#else + #define HAVE_ncclBfloat16 0 +#endif + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) + #define HAVE_ncclAvg 1 +#else + #define HAVE_ncclAvg 0 +#endif + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) + #define HAVE_ncclPreMulSum 1 +#else + #define HAVE_ncclPreMulSum 0 +#endif + +#include +#include +#include +#include +#include +#include + +using std::size_t; +using std::int8_t; +using std::int16_t; +using std::int32_t; +using std::int64_t; +using std::uint8_t; +using std::uint16_t; +using std::uint32_t; +using std::uint64_t; + +//////////////////////////////////////////////////////////////////////////////// + +namespace { +template +__device__ unsigned long long bitsOf(T x) { + union { unsigned long 
long ull; T val; } u; + u.ull = 0; + u.val = x; + return u.ull; +} + +__host__ __device__ uint64_t mixBits(uint64_t x) { + union { uint32_t u32[2]; uint64_t u64; }; + u64 = x; + u32[1] += 1; + u32[0] ^= u32[1]; + u64 *= 0x9e3779b97f4a7c13u; + u32[0] ^= u32[1]<<16 ^ u32[1]>>16; + return u64; +} + +__host__ __device__ uint64_t hashOf(uint64_t a, uint64_t b=0) { + a += uint64_t(1)<<32; + a += b; + a ^= a>>32; + a *= 0x9e3779b97f4a7c13u; + a += b>>16 ^ b<<48; + a ^= a>>32; + a *= 0xc4ceb9fe1a85ec53u; + return a; +} +} + +//////////////////////////////////////////////////////////////////////////////// + +namespace { +template +struct IsIntegral: std::is_integral {}; +template<> +struct IsIntegral: std::false_type {}; +#ifdef __CUDA_BF16_TYPES_EXIST__ +template<> +struct IsIntegral<__nv_bfloat16>: std::false_type {}; +#endif +} + +//////////////////////////////////////////////////////////////////////////////// + +// Hide a value from arithmetic optimizations. Hopefully compiler cannot detect +// that this is equivalent to the identity function. 
+template +__host__ __device__ T inhibit(T x) { + union { uint64_t u64; T val; }; + u64 = 0; + val = x; + u64 *= 0x0000000100000001u; + u64 *= 0xffffffff00000001u; + return val; +} + +//////////////////////////////////////////////////////////////////////////////// + +namespace { + template + __host__ __device__ Y castTo(X x) { + return Y(x); + } + template + __host__ __device__ Y castTo(float x) { + return Y(x); + } + template<> + __host__ __device__ half castTo(float x) { + return __float2half(x); + } + #ifdef __CUDA_BF16_TYPES_EXIST__ + template<> + __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(float x) { + return __float2bfloat16(x); + } + #endif +} + +//////////////////////////////////////////////////////////////////////////////// +// The reduction functions + +namespace { +struct ReduceNil { + template + __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; } + template + __host__ __device__ T operator()(T a, T /*b*/) const { return a; } + template + __host__ __device__ T postOp(T x) const { return x; } +}; +struct ReduceSum { + template + __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; } + template + __host__ __device__ T operator()(T a, T b) const { return a + b; } + __host__ __device__ half operator()(half a, half b) const { + #if __CUDA_ARCH__ >= 530 + return __hadd(a, b); + #else + return __float2half(__half2float(a) + __half2float(b)); + #endif + } + #ifdef __CUDA_BF16_TYPES_EXIST__ + __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const { + #if __CUDA_ARCH__ >= 800 + return __hadd(a, b); + #else + return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b)); + #endif + } + #endif + template + __host__ __device__ T postOp(T x) const { return x; } +}; +struct ReduceProd { + template + __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; } + template + __host__ __device__ T operator()(T a, T b) const { return a * b; } + __host__ __device__ half operator()(half 
a, half b) const { + #if __CUDA_ARCH__ >= 530 + return __hmul(a, b); + #else + return __float2half(__half2float(a) * __half2float(b)); + #endif + } + #ifdef __CUDA_BF16_TYPES_EXIST__ + __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const { + #if __CUDA_ARCH__ >= 800 + return __hmul(a, b); + #else + return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b)); + #endif + } + #endif + template + __host__ __device__ T postOp(T x) const { return x; } +}; +struct ReduceMin { + template + __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; } + template + __host__ __device__ T operator()(T a, T b) const { return a < b ? a : b; } + __host__ __device__ half operator()(half a, half b) const { + #if __CUDA_ARCH__ >= 800 + return __hmin(a, b); + #elif __CUDA_ARCH__ >= 530 + return __hlt(a, b) ? a : b; + #else + return __half2float(a) < __half2float(b) ? a : b; + #endif + } + #ifdef __CUDA_BF16_TYPES_EXIST__ + __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const { + #if __CUDA_ARCH__ >= 800 + return __hmin(a, b); + //#elif __CUDA_ARCH__ >= 530 + // return __hlt(a, b) ? a : b; + #else + return __bfloat162float(a) < __bfloat162float(b) ? a : b; + #endif + } + #endif + template + __host__ __device__ T postOp(T x) const { return x; } +}; +struct ReduceMax { + template + __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; } + templateT())> + __host__ __device__ T operator()(T a, T b) const { return a > b ? a : b; } + __host__ __device__ half operator()(half a, half b) const { + #if __CUDA_ARCH__ >= 800 + return __hmax(a, b); + #elif __CUDA_ARCH__ >= 530 + return __hgt(a, b) ? a : b; + #else + return __half2float(a) > __half2float(b) ? a : b; + #endif + } + #ifdef __CUDA_BF16_TYPES_EXIST__ + __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const { + #if __CUDA_ARCH__ >= 800 + return __hmax(a, b); + //#elif __CUDA_ARCH__ >= 530 + // return __hgt(a, b) ? 
a : b; + #else + return __bfloat162float(a) > __bfloat162float(b) ? a : b; + #endif + } + #endif + template + __host__ __device__ T postOp(T x) const { return x; } +}; +struct ReducePreMulSum { + template + __host__ __device__ T preOp(T x, int rank_me) const { + return ReduceProd()(x, ncclVerifiablePremulScalar(rank_me)); + } + template + __host__ __device__ T operator()(T a, T b) const { return ReduceSum()(a, b); } + template + __host__ __device__ T postOp(T x) const { return x; } +}; + +template::value> +struct ReduceAvg_Base; + +template +struct ReduceAvg_Base { + int rank_n; + __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; } + __host__ __device__ T operator()(T a, T b) const { return ReduceSum()(a, b); } + __host__ __device__ T postOp(T x) const { return x/rank_n; } +}; + +template +struct ReduceAvg_Base { + int rank_n; + __host__ __device__ T preOp(T x, int /*rank_me*/) const { + using T1 = typename std::conditional<(sizeof(T)::type; + return ReduceProd()(inhibit(castTo(T1(1)/T1(rank_n))), inhibit(x)); + } + __host__ __device__ T operator()(T a, T b) const { return ReduceSum()(a, b); } + __host__ __device__ T postOp(T x) const { return x; } +}; + +struct ReduceAvg { + int rank_n; + template + __host__ __device__ T preOp(T x, int rank_me) const { + return ReduceAvg_Base{rank_n}.preOp(x, rank_me); + } + template + __host__ __device__ T operator()(T a, T b) const { + return ReduceAvg_Base{rank_n}(a, b); + } + template + __host__ __device__ T postOp(T x) const { + return ReduceAvg_Base{rank_n}.postOp(x); + } +}; +} + +//////////////////////////////////////////////////////////////////////////////// + +namespace { +template +struct FloatLayout; +template<> +struct FloatLayout { + static constexpr int exponent_bits = 8, mantissa_bits = 23; + static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; +}; +template<> +struct FloatLayout { + static constexpr int exponent_bits = 11, mantissa_bits = 52; + static constexpr int exponent_bias = 
(1<<(exponent_bits-1))-1; +}; +template<> +struct FloatLayout { + static constexpr int exponent_bits = 5, mantissa_bits = 10; + static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; +}; +#ifdef __CUDA_BF16_TYPES_EXIST__ +template<> +struct FloatLayout<__nv_bfloat16> { + static constexpr int exponent_bits = 8, mantissa_bits = 7; + static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; +}; +#endif + +template +__host__ __device__ T makeFloat(int sign, int exp, uint64_t mant) { + union { T ans; uint64_t bits; }; + bits = sign; + bits <<= FloatLayout::exponent_bits; + bits |= exp; + bits <<= FloatLayout::mantissa_bits; + bits |= mant; + return ans; +} +} + +//////////////////////////////////////////////////////////////////////////////// + +namespace { +// High bits of multiplication are useful for generating bounded random values +// from unbounded random values. For instance, given X a totally random 32-bit +// integer, `umul32hi(X,n)` will be totally random within [0,n). +__host__ __device__ uint64_t umul32hi(uint32_t a, uint32_t b) { +#ifdef __CUDA_ARCH__ + return __umulhi(a, b); +#else + return uint64_t(a)*b >> 32; +#endif +} +__host__ __device__ uint64_t umul64hi(uint64_t a, uint64_t b) { +#ifdef __CUDA_ARCH__ + return __umul64hi(a, b); +#else + return uint64_t(__uint128_t(a)*__uint128_t(b) >> 64); +#endif +} + +__host__ __device__ int clz32(int x) { +#ifdef __CUDA_ARCH__ + return __clz(x); +#else + return x==0 ? 32 : __builtin_clz(x); +#endif +} +__host__ __device__ int clz64(long long x) { +#ifdef __CUDA_ARCH__ + return __clzll(x); +#else + return x==0 ? 64 : __builtin_clzll(x); +#endif +} +} + +//////////////////////////////////////////////////////////////////////////////// + +namespace { +// Returns a wildly permuted rank index. Useful when we know we want exactly N +// random ranks to exhibit some behavior, we can just test if: +// `shuffleRank(rank_n, rank_me, rng) < N`. Note that rank_n > 0 must be true +// for well defined results. 
This mixes the bits of rng. +__host__ __device__ int shuffleRank(int rank_n, int rank_me, uint64_t &rng) { + uint32_t a = uint32_t(rng); + uint32_t b = uint32_t(rng>>32); + rng = mixBits(rng); + + uint32_t r = rank_me; + // round down rank_n to largest pow2, then subtract 1 + uint32_t n2 = (~uint32_t(0)>>1) >> clz32(rank_n); + + // These are 1:1 functions modulo 2^n: + // f(x) = x*a + b : for odd a, any b + // f(x) = (x*x + x)/2 + // So we apply both to the bottom n2+1 ranks, then rotate the top + // (rank_n-n2-1) to the bottom and apply both again. + + if(r <= n2) { + // shuffle bottom n2+1 ranks + r = (r*(a|1) + b) & n2; + r = (r*r + r)/2 & n2; + // rotate top to bottom + r += rank_n - (n2+1); + } + else + r -= n2+1; // rotate top to bottom + + if(r <= n2) { + // shuffle bottom n2+1 again + r = (r*(b|1) + a) & n2; + r = (r*r + r)/2 & n2; + } + return r; +} +} + +namespace { +// Generate wild integers x and y such that if every rank submits its x into a +// summation the result will be y with y <= y_max. Ranks should be shuffled +// before calling. +template +__host__ __device__ void genSumXY( + int rank_n, int rank_me, uint64_t &rng, Uint y_max, Uint &x, Uint &y, + bool avoid_y=false // if true then returned y will not equal given y + ) { + static_assert(std::is_unsigned::value, "Type must be unsigned integral."); + + { // Pick y as a random value in [y_max/2, y_max] + Uint d, y_min = (y_max+1)/2; + if(8*sizeof(Uint) > 32) + d = umul64hi(rng, y_max/2 + (avoid_y ? 0 : 1)); + else + d = umul32hi(uint32_t(rng), y_max/2 + (avoid_y ? 0 : 1)); + Uint y1 = (avoid_y ? y+1 : y_min) + d; + y = y1 - (avoid_y && (y1 < y_min || y_max < y1) ? y_max/2 : 0); + } + rng = mixBits(rng); + + unsigned r = unsigned(rank_me); + unsigned rn = unsigned(rank_n); + // Partition our rn ranks into pn distinct subsets each of size rn/pn. 
If each + // rank submits 1+p (where p is 0-based partition index) then the sum will be: + // (rn/pn) * pn*(pn+1)/2 + // So set this equal to our desired sum y and solve for pn. + // (rn/pn) * pn*(pn+1)/2 = y + // rn*(pn+1)/2 = y + // pn = 2*(y/rn)-1 + Uint pn = rn == 1 ? 1 : 2*(y/rn) - 1; + // In the case where rn is huge (compared to y) use only one partition meaning + // that all rn ranks will submit 1 (since p=0). + pn = pn == 0 ? 1 : pn; + // Can't have more partitions than ranks. + pn = rn < pn ? rn : pn; + // Compute sum of contribution from pn partitions where each submits p+1. + Uint p_sum; + if(y_max <= ~uint32_t(0)>>1) // compile time known + p_sum = Uint(uint32_t(pn)*uint32_t(pn+1)/2); + else + p_sum = Uint(uint64_t(pn)*uint64_t(pn+1)/2); + // Let s be the number of ranks per partition. This is either rn/pn as we + // intended, or y/p_sum if that's smaller to prevent overshooting our target y. + uint32_t s = y/p_sum < rn/pn ? y/p_sum : rn/pn; + x = r/s < pn ? 1 + r/s : 0; // First s*pn ranks contribute partition index +1. + x += r == rn-1 ? y - s*p_sum : 0; // Last rank contributes discrepancy. +} +} + +namespace { +template +__host__ __device__ T genInOutFloatSum( + bool input_not_output, int rank_n, int rank_me, uint64_t seed, intptr_t index, + bool same_sign + ) { + constexpr int exp_lo = 1 + FloatLayout::mantissa_bits; + constexpr int exp_hi = (1<::exponent_bits)-1; + using uintmant_t = typename std::conditional<(8*sizeof(T) > 32), uint64_t, uint32_t>::type; + constexpr uintmant_t mant_mask = (uintmant_t(1) << FloatLayout::mantissa_bits)-1; + constexpr uintmant_t max_mant = 2*mant_mask + 1; // add implicit leading 1 + uint64_t rng = hashOf(seed, index); + + int y_sign = rng & 1; + int x_sign = y_sign; + int xy_exp = exp_lo + umul32hi(uint32_t(rng>>32), exp_hi-exp_lo); + rng = mixBits(rng); + rank_me = shuffleRank(rank_n, rank_me, rng); + + // If we're using mixed signs then partition into evens and odds. + int subrank_n = same_sign ? 
rank_n : (rank_n+1)/2; + int subrank_me = same_sign ? rank_me : rank_me/2; + uintmant_t x0_mant, y0_mant; + genSumXY(subrank_n, subrank_me, rng, max_mant, x0_mant, y0_mant); + + if (!same_sign && (rank_n+0)/2 != 0) { + uintmant_t x1_mant, y1_mant = y0_mant; + // Avoid generating y1_mant == y0_mant so we don't have to worry about + // signed zero as the result. + genSumXY((rank_n+0)/2, rank_me/2, rng, max_mant, x1_mant, y1_mant, /*avoid_y=*/true); + y_sign ^= y0_mant < y1_mant ? 1 : 0; + y0_mant = (y0_mant < y1_mant ? -1 : 1)*(y0_mant - y1_mant); + x_sign ^= rank_me%2; + x0_mant = rank_me%2 == 0 ? x0_mant : x1_mant; + } + + uintmant_t ans_mant = input_not_output ? x0_mant : y0_mant; + if(ans_mant == 0) + return T(0.0f); + else { + int shift = clz64(ans_mant) - (64-FloatLayout::mantissa_bits-1); + int ans_sign = input_not_output ? x_sign : y_sign; + int ans_exp = xy_exp - shift; + ans_mant <<= shift; + return makeFloat(ans_sign, ans_exp, ans_mant & mant_mask); + } +} +} + +namespace { +template +__host__ __device__ T genInOutFloatPreMulSum( + bool input_not_output, int rank_n, int rank_me, uint64_t seed, intptr_t index + ) { + constexpr int exp_lo = 1 + FloatLayout::mantissa_bits; + constexpr int exp_hi = (1<::exponent_bits)-1; + using uintmant_t = typename std::conditional<(8*sizeof(T) > 32), uint64_t, uint32_t>::type; + constexpr uintmant_t mant_mask = (uintmant_t(1) << FloatLayout::mantissa_bits)-1; + constexpr uintmant_t max_mant = 2*mant_mask + 1; // add implicit leading 1 + uint64_t rng = hashOf(seed, index); + + int y_sign = rng & 1; + int y_exp = exp_lo + umul32hi(uint32_t(rng>>32), exp_hi-exp_lo); + rng = mixBits(rng); + int subrank_me0 = shuffleRank((rank_n+1)/2, rank_me/2, rng); + int subrank_me1 = shuffleRank((rank_n+0)/2, rank_me/2, rng); + + // when ncclVerifiablePremulScalar() = 1.0 (rank_me%2 == 0) + uintmant_t x0_mant, y0_mant; + genSumXY((rank_n+1)/2, subrank_me0, rng, max_mant>>1, x0_mant, y0_mant); + + // when ncclVerifiablePremulScalar() = 2.0 
(rank_me%2 == 1) + uintmant_t x1_mant=0, y1_mant=0; + if((rank_n+0)/2 != 0) + genSumXY((rank_n+0)/2, subrank_me1, rng, max_mant>>2, x1_mant, y1_mant); + + uintmant_t x_mant = rank_me%2 == 0 ? x0_mant : x1_mant; + uintmant_t y_mant = y0_mant + 2*y1_mant; + uintmant_t ans_mant = input_not_output ? x_mant : y_mant; + + if(ans_mant == 0) + return T(0.0f); + else { + int shift = clz64(ans_mant) - (64-FloatLayout::mantissa_bits-1); + int ans_sign = y_sign; + int ans_exp = y_exp - shift; + ans_mant <<= shift; + return makeFloat(ans_sign, ans_exp, ans_mant & mant_mask); + } +} +} + +namespace { +template +__host__ __device__ T genInOutFloatProd( + bool input_not_output, int rank_n, int rank_me, uint64_t seed, intptr_t index + ) { + // Three kinds of contributions (values for x): + // 1) x = random value: only one rank does this + // 2) x = 2^n: random positive n + // 3) x = 1 + // Since only one rank submits a random value, the result of the product + // will have the same mantissa as that value but with an exponent incorporating + // the sum of the exponents from case (2) + + uint64_t rng = hashOf(seed, index); + rank_me = shuffleRank(rank_n, rank_me, rng); + int y_sign = (rank_n/2)%2; + int x_sign = rank_me%2; + + constexpr unsigned max_exp = -1 + (1<<(FloatLayout::exponent_bits-1)); + unsigned x_exp=0, y_exp=0; + genSumXY(rank_n, rank_me, rng, max_exp, x_exp, y_exp); + x_exp += FloatLayout::exponent_bias; + y_exp += FloatLayout::exponent_bias; + + constexpr uint64_t mant_mask = (uint64_t(1)<::mantissa_bits)-1; + uint64_t y_mant = rng & mant_mask; + if (y_mant == 0) y_mant = 1; + + return makeFloat( + input_not_output ? x_sign : y_sign, + input_not_output ? x_exp : y_exp, + !input_not_output || rank_me==0 ? 
y_mant : 0 + ); +} +} + +//////////////////////////////////////////////////////////////////////////////// +// What follows is lots of overloads for genInput/genOutput to generate data + +namespace { +// General case for integral data for all ops but ReduceNil/premulsum +template::value + >::type> +__host__ __device__ void genInput( + T &ans, ReduceFn, int rank_n, int rank_me, uint64_t seed, intptr_t index, + std::true_type /*integral*/ + ) { + (void)rank_n; // silence unused warnings + union { uint64_t bits; T tmp; }; + bits = uint64_t(-1)>>(64 - 8*sizeof(T)); + bits &= hashOf(index ^ index<<16 ^ rank_me, seed); + // make sure we never return 0 in products + ans = std::is_same::value && bits == 0 ? T(1) : tmp; +} +} + +//////////////////////////////////////////////////////////////////////////////// +// Dumb/generic case for genOutput just reduces results of genInput + +namespace { +template +__host__ __device__ void genOutput( + T &ans, ReduceFn op, int rank_n, uint64_t seed, intptr_t index, + std::integral_constant + ) { + T acc = genInput(op, rank_n, 0, seed, index); + acc = op.preOp(acc, 0); + for(int r=1; r < rank_n; r++) + acc = op(acc, op.preOp(genInput(op, rank_n, r, seed, index), r)); + ans = op.postOp(acc); +} +} + +//////////////////////////////////////////////////////////////////////////////// +// Nil reduction (byte copy functions). 
Optimized to assume rank_n=1 + +namespace { +template +__host__ __device__ void genInput( + T &ans, ReduceNil, int rank_n, int rank_me, uint64_t seed, intptr_t index, + std::integral_constant + ) { + (void)rank_n, (void)rank_me; // silence unused warnings + union { uint64_t bits; T tmp; }; + bits = mixBits(seed ^ index); + bits >>= 64 - 8*sizeof(T); + bits &= uint64_t(-1)>>(64 - 8*sizeof(T)); + ans = tmp; +} + +template +__host__ __device__ void genOutput( + T &ans, ReduceNil op, int rank_n, uint64_t seed, intptr_t index, + std::integral_constant + ) { + ans = genInput(op, rank_n, 0, seed, index); +} +} + +//////////////////////////////////////////////////////////////////////////////// +// Sum of float + +namespace { +template +__host__ __device__ void genInput( + T &ans, ReduceSum, int rank_n, int rank_me, uint64_t seed, intptr_t index, + std::false_type /*integral*/ + ) { + ans = genInOutFloatSum(/*input_not_output=*/true, rank_n, rank_me, seed, index, /*same_sign=*/false); +} + +template +__host__ __device__ void genOutput( + T &ans, ReduceSum, int rank_n, uint64_t seed, intptr_t index, + std::false_type /*integral*/ + ) { + ans = genInOutFloatSum(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/false); +} +} + +//////////////////////////////////////////////////////////////////////////////// +// Product of float + +namespace { +template +__host__ __device__ void genInput( + T &ans, ReduceProd, int rank_n, int rank_me, uint64_t seed, intptr_t index, + std::false_type /*integral*/ + ) { + ans = genInOutFloatProd(/*input_not_output=*/true, rank_n, rank_me, seed, index); +} + +template +__host__ __device__ void genOutput( + T &ans, ReduceProd, int rank_n, uint64_t seed, intptr_t index, + std::false_type /*integral*/ + ) { + ans = genInOutFloatProd(/*input_not_output=*/false, rank_n, 0, seed, index); +} +} + +//////////////////////////////////////////////////////////////////////////////// +// PreMulSum of int/float + +namespace { +template +__host__ 
__device__ void genInput( + T &ans, ReducePreMulSum, int rank_n, int rank_me, uint64_t seed, intptr_t index, + std::true_type integral + ) { + genInput(ans, ReduceSum(), rank_n, rank_me, seed, index, integral); +} + +// No genOutput overload specific to premulsum(int), just use generic case. + +template +__host__ __device__ void genInput( + T &ans, ReducePreMulSum, int rank_n, int rank_me, uint64_t seed, intptr_t index, + std::false_type /*integral*/ + ) { + ans = genInOutFloatPreMulSum(/*input_not_output=*/true, rank_n, rank_me, seed, index); +} + +template +__host__ __device__ void genOutput( + T &ans, ReducePreMulSum, int rank_n, uint64_t seed, intptr_t index, + std::false_type /*integral*/ + ) { + ans = genInOutFloatPreMulSum(/*input_not_output=*/false, rank_n, 0, seed, index); +} +} + +///////////////////////////////////////////////////////////////////////////////// +// Average of float + +namespace { +template +__host__ __device__ void genInput( + T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t seed, intptr_t index, + std::false_type /*integral*/ + ) { + ans = genInOutFloatSum(/*input_not_output=*/true, rank_n, rank_me, seed, index, /*same_sign=*/true); +} + +template +__host__ __device__ void genOutput( + T &ans, ReduceAvg, int rank_n, uint64_t seed, intptr_t index, + std::false_type /*integral*/ + ) { + ans = genInOutFloatSum(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/true); + using T1 = typename std::conditional<(sizeof(T)::type; + ans = ReduceProd()(ans, T1(1)/T1(rank_n)); +} +} + +///////////////////////////////////////////////////////////////////////////////// +// min/max of float + +namespace { +template +__host__ __device__ void genInput( + T &ans, ReduceMin, int rank_n, int rank_me, uint64_t seed, intptr_t index, + std::false_type integral + ) { + genInput(ans, ReduceMax(), rank_n, rank_me, seed, index, integral); +} +template +__host__ __device__ void genInput( + T &ans, ReduceMax, int rank_n, int rank_me, uint64_t seed, 
intptr_t index, + std::false_type /*integral*/ + ) { + (void)rank_n; // silence unused warnings + constexpr uint64_t mant_mask = (uint64_t(1) << FloatLayout::mantissa_bits)-1; + uint64_t rng = hashOf(index ^ index<<16 ^ rank_me, seed); + int sign = rng & 1; + rng ^= rng>>1; + int exp = rng & ((1<<(FloatLayout::exponent_bits-1))-1); + exp += 1<<(FloatLayout::exponent_bits-2); + rng ^= rng >> FloatLayout::exponent_bits; + uint64_t mant = rng & mant_mask; + ans = makeFloat(sign, exp, mant); +} + +// No genOutput overload specific to floating point min/max, just use generic case. +} + +/////////////////////////////////////////////////////////////////////////////// +// Entry API for genInput/genOutput + +namespace { +template +__host__ __device__ T genInput( + ReduceFn op, int rank_n, int rank_me, uint64_t seed, intptr_t index + ) { + T ans; + genInput(ans, op, rank_n, rank_me, seed, index, + std::integral_constant::value>()); + return ans; +} + +template +__host__ __device__ T genOutput( + ReduceFn op, int rank_n, uint64_t seed, intptr_t index + ) { + T ans; + genOutput(ans, op, rank_n, seed, index, + std::integral_constant::value>()); + return ans; +} +} + +//////////////////////////////////////////////////////////////////////////////// + +#if !SELF_TEST +namespace { +template +__global__ void prepareInput2( + T *elts, intptr_t elt_n, ReduceFn op, int rank_n, int rank_me, + uint64_t seed, intptr_t elt_ix0 + ) { + intptr_t i0 = blockIdx.x*(elt_n/gridDim.x); + i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x; + intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x); + i1 += blockIdx.x+1 < elt_n%gridDim.x ? 
blockIdx.x+1 : elt_n%gridDim.x; + intptr_t i = i0 + threadIdx.x; + while(i < i1) { + elts[i] = genInput(op, rank_n, rank_me, seed, elt_ix0+i); + #if 0 + T output = genOutput(op, rank_n, seed, elt_ix0+i); + printf("prepareInput2 T=%d seed=0x%llx r=%d ix=%lld x=%g output=%g elts=%p\n", + std::is_same::value, (long long)seed, int(rank_me), (long long)i, (float)elts[i], (float)output, elts); + #endif + i += blockDim.x; + } +} + +template +void prepareInput1( + void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me, + uint64_t seed, intptr_t elt_ix0, cudaStream_t stream + ) { + int block_n = std::min(32, (elt_n + 4*512-1)/(4*512)); + #define CASE_TY(T) prepareInput2<<>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break; + switch(elt_ty) { + case ncclInt8: CASE_TY(int8_t) + case ncclUint8: CASE_TY(uint8_t) + case ncclInt32: CASE_TY(int32_t) + case ncclUint32: CASE_TY(uint32_t) + case ncclInt64: CASE_TY(int64_t) + case ncclUint64: CASE_TY(uint64_t) + case ncclFloat16: CASE_TY(half) + #if HAVE_ncclBfloat16 + case ncclBfloat16: CASE_TY(__nv_bfloat16) + #endif + case ncclFloat32: CASE_TY(float) + case ncclFloat64: CASE_TY(double) + default: assert(0); + } + #undef CASE_TY +} +} + +void ncclVerifiablePrepareInput( + void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me, + uint64_t seed, intptr_t elt_ix0, cudaStream_t stream + ) { + #define CASE_OP(op) \ + if(rank_n == 1) \ + prepareInput1(elts, elt_n, elt_ty, ReduceNil(), rank_n, rank_me, seed, elt_ix0, stream); \ + else \ + prepareInput1(elts, elt_n, elt_ty, op, rank_n, rank_me, seed, elt_ix0, stream); \ + break; + switch(red_op) { + case ncclSum: CASE_OP(ReduceSum()) + case ncclMin: CASE_OP(ReduceMin()) + case ncclMax: CASE_OP(ReduceMax()) + case ncclProd: CASE_OP(ReduceProd()) + #if HAVE_ncclAvg + case ncclAvg: CASE_OP(ReduceAvg{rank_n}) + #endif + #if HAVE_ncclPreMulSum + default: CASE_OP(ReducePreMulSum()) + #endif + } + #undef CASE_OP +} +#endif + 
+//////////////////////////////////////////////////////////////////////////////// + +#if !SELF_TEST +namespace { +template +__global__ void prepareExpected2( + T *elts, intptr_t elt_n, ReduceFn op, int rank_n, + uint64_t seed, intptr_t elt_ix0 + ) { + intptr_t i0 = blockIdx.x*(elt_n/gridDim.x); + i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x; + intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x); + i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x; + intptr_t i = i0 + threadIdx.x; + while(i < i1) { + elts[i] = genOutput(op, rank_n, seed, elt_ix0+i); + #if 0 + printf("prepareExpected2 seed=0x%llx ix=%lld x=%g elts=%p\n", + (long long)seed, (long long)(elt_ix0+i), (float)elts[i], elts); + #endif + i += blockDim.x; + } +} + +template +void prepareExpected1( + void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, + uint64_t seed, intptr_t elt_ix0, cudaStream_t stream + ) { + int block_n = std::min(32, (elt_n + 4*512-1)/(4*512)); + #define CASE_TY(T) prepareExpected2<<>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break; + switch(elt_ty) { + case ncclInt8: CASE_TY(int8_t) + case ncclUint8: CASE_TY(uint8_t) + case ncclInt32: CASE_TY(int32_t) + case ncclUint32: CASE_TY(uint32_t) + case ncclInt64: CASE_TY(int64_t) + case ncclUint64: CASE_TY(uint64_t) + case ncclFloat16: CASE_TY(half) + #if HAVE_ncclBfloat16 + case ncclBfloat16: CASE_TY(__nv_bfloat16) + #endif + case ncclFloat32: CASE_TY(float) + case ncclFloat64: CASE_TY(double) + default: assert(0); + } + #undef CASE_TY +} +} + +void ncclVerifiablePrepareExpected( + void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, + uint64_t seed, intptr_t elt_ix0, cudaStream_t stream + ) { + #define CASE_OP(op) \ + if(rank_n == 1) \ + prepareExpected1(elts, elt_n, elt_ty, ReduceNil(), rank_n, seed, elt_ix0, stream); \ + else \ + prepareExpected1(elts, elt_n, elt_ty, op, rank_n, seed, elt_ix0, stream); \ + break; + switch(red_op) { + case ncclSum: CASE_OP(ReduceSum()) + case 
ncclMin: CASE_OP(ReduceMin()) + case ncclMax: CASE_OP(ReduceMax()) + case ncclProd: CASE_OP(ReduceProd()) + #if HAVE_ncclAvg + case ncclAvg: CASE_OP(ReduceAvg{rank_n}) + #endif + #if HAVE_ncclPreMulSum + default: CASE_OP(ReducePreMulSum()) + #endif + } + #undef CASE_OP +} +#endif + +//////////////////////////////////////////////////////////////////////////////// + +namespace { +/* How we compare floating point values when exactness is impossible is interesting. + * First, we take note that simply reinterpreting integer bits as floating point + * gives us a monotonic mapping which exponentially spaces out floats. Thus + * consecutive integers encode consecutive floats. In general, using integer + * subtraction on the bitpatterns of two floats gives us an integer which is the + * logarithm of their relative difference. But, if the floats always have similar + * exponents, then the integer difference is actually proportional to the + * relative error (this is because we are counting hops in the mantissa bits only, + * not the exponent bits). So a cheap way to compare if two floats are relatively + * close is: abs(intBits(a) - intBits(b)) <= tolerance. The following formula + * calculates such a tolerance for a summation of n floats. This formula + * was derived by inspecting the maximum observed integer difference over many + * random runs of summation. The parameter values were computed by the + * companion program "inexact_regress.cu". 
+ */ +__host__ __device__ unsigned calcSumFloatTolerance(int rank_n, int elt_ty) { + float power, coef; + switch(elt_ty) { + case ncclFloat32: + case ncclFloat64: + power = .51f; + coef = 1.25f; + break; + case ncclFloat16: + power = .91f; + coef = .75f; + break; + #if HAVE_ncclBfloat16 + case ncclBfloat16: + power = .91f; + coef = .66f; + break; + #endif + } + #if __CUDA_ARCH__ + return 1 + unsigned(coef*powf(float(rank_n), power)); + #else + return 1 + unsigned(coef*std::pow(float(rank_n), power)); + #endif +} + +template +__host__ __device__ uint64_t calcDelta(T a, T b) { + union { T t; uint8_t i1; uint16_t i2; uint32_t i4; uint64_t i8; } x, y; + x.t = a; + y.t = b; + switch(sizeof(T)) { + case 1: return x.i1 < y.i1 ? y.i1 - x.i1 : x.i1 - y.i1; + case 2: return x.i2 < y.i2 ? y.i2 - x.i2 : x.i2 - y.i2; + case 4: return x.i4 < y.i4 ? y.i4 - x.i4 : x.i4 - y.i4; + default: return x.i8 < y.i8 ? y.i8 - x.i8 : x.i8 - y.i8; + } +} +} + +//////////////////////////////////////////////////////////////////////////////// + +#if !SELF_TEST +namespace { +template +__global__ void verifyPrepared( + T const *results, T const *expected, intptr_t elt_n, unsigned tolerance, int64_t *bad_elt_n + ) { + intptr_t i0 = blockIdx.x*(elt_n/gridDim.x); + i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x; + intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x); + i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x; + intptr_t i = i0 + threadIdx.x; + int64_t bad = 0; + + while(i < i1) { + T a = results[i], b = expected[i]; + T delta = a < b ? b - a : a - b; + bad += tolerance < delta ? 
1 : 0; + #if 0 + if(tolerance < delta) { + printf("verifyPrepared ix=%lld got=%g exp=%g\n", (long long)i, (float)results[i], (float)expected[i]); + } + #endif + i += blockDim.x; + } + asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad)); +} + +template +__global__ void verifyInline2( + T const *results, intptr_t elt_n, ReduceFn op, int rank_n, uint64_t seed, + intptr_t elt_ix0, unsigned tolerance, int64_t *bad_elt_n + ) { + intptr_t i0 = blockIdx.x*(elt_n/gridDim.x); + i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x; + intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x); + i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x; + intptr_t i = i0 + threadIdx.x; + int64_t bad = 0; + + while(i < i1) { + union { T t; Uint u; } a, b; + a.t = results[i]; + b.t = genOutput(op, rank_n, seed, elt_ix0+i); + Uint delta = a.u < b.u ? b.u - a.u : a.u - b.u; + bad += tolerance < delta ? 1 : 0; + #if 0 + T input = genInput(op, rank_n, 0, seed, elt_ix0+i); + if(tolerance < delta) { + printf("verifyInline2 fail T=%d ix=%lld got=%g exp=%g input=%g\n", + std::is_same::value, (long long)i, (float)a.t, (float)b.t, (float)input); + } else { + printf("verifyInline2 pass T=%d ix=%lld got=%g exp=%g input=%g\n", + std::is_same::value, (long long)i, (float)a.t, (float)b.t, (float)input); + } + #endif + i += blockDim.x; + } + asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad)); +} + +template +void verifyInline1( + T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, + unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n + ) { + #define CASE_OP(op) \ + if(rank_n == 1) \ + verifyInline2<<>> \ + ((T const*)results, elt_n, ReduceNil(), rank_n, seed, elt_ix0, tolerance, bad_elt_n); \ + else \ + verifyInline2<<>> \ + ((T const*)results, elt_n, op, rank_n, seed, elt_ix0, tolerance, bad_elt_n); \ + break; + switch(red_op) { + case ncclSum: CASE_OP(ReduceSum()) + case ncclMin: 
CASE_OP(ReduceMin()) + case ncclMax: CASE_OP(ReduceMax()) + case ncclProd: CASE_OP(ReduceProd()) + #if HAVE_ncclAvg + case ncclAvg: CASE_OP(ReduceAvg{rank_n}) + #endif + #if HAVE_ncclPreMulSum + default: CASE_OP(ReducePreMulSum()) + #endif + } + #undef CASE_OP +} +} + +void ncclVerifiableVerify( + void const *results, void const *expected, intptr_t elt_n, int elt_ty, + int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, + int64_t *bad_elt_n, cudaStream_t stream + ) { + bool floating = elt_ty == ncclFloat16 || elt_ty == ncclFloat32 || elt_ty == ncclFloat64; + #if HAVE_ncclBfloat16 + floating |= elt_ty == ncclBfloat16; + #endif + + unsigned tolerance = 0; + #if HAVE_ncclAvg + if (floating && red_op == ncclAvg) + tolerance = calcSumFloatTolerance(rank_n, elt_ty); + #endif + + int block_n = std::min(32, (elt_n + 4*512-1)/(4*512)); + + *bad_elt_n = 0; + #define CASE_TY(T, Uint) { \ + if(expected != nullptr) { \ + verifyPrepared<<>>((Uint const*)results, (Uint const*)expected, elt_n, tolerance, bad_elt_n); \ + } else { \ + verifyInline1((T const*)results, elt_n, red_op, rank_n, seed, elt_ix0, tolerance, bad_elt_n, stream, block_n); \ + } \ + } break; + switch(elt_ty) { + case ncclInt8: CASE_TY(int8_t, uint8_t) + case ncclUint8: CASE_TY(uint8_t, uint8_t) + case ncclInt32: CASE_TY(int32_t, uint32_t) + case ncclUint32: CASE_TY(uint32_t, uint32_t) + case ncclInt64: CASE_TY(int64_t, uint64_t) + case ncclUint64: CASE_TY(uint64_t, uint64_t) + case ncclFloat16: CASE_TY(half, uint16_t) + #if HAVE_ncclBfloat16 + case ncclBfloat16: CASE_TY(__nv_bfloat16, uint16_t) + #endif + case ncclFloat32: CASE_TY(float, uint32_t) + case ncclFloat64: CASE_TY(double, uint64_t) + default: assert(0); + } + #undef CASE_TY +} +#endif + +//////////////////////////////////////////////////////////////////////////////// + +#if SELF_TEST +#include + +template +__device__ void sweep2(int ty, char const *tyname, Op op, char const *opname, int rank_n) { + //if(!std::is_same::value) return; + 
//if(!std::is_same::value) return; + //if(rank_n!=3) return; + + unsigned tolerance = !IsIntegral::value && std::is_same::value ? calcSumFloatTolerance(rank_n, ty) : 0; + uint64_t seed = 0xc8e2bed69766d533; + + for(int ix=threadIdx.x; ix < 10000; ix+=blockDim.x) { + //if(ix!=387) continue; + T y = genOutput(op, rank_n, seed, ix); + T sum; + for(int r=0; r < rank_n; r++) { + T x = genInput(op, rank_n, r, seed, ix); + x = op.preOp(x, r); + sum = r==0 ? x : op(sum, inhibit(x)); + //std::printf("x = %llx, sum = %llx\n", bitsOf(x), bitsOf(sum)); + } + sum = op.postOp(sum); + if(tolerance < calcDelta(sum, y)) { + std::printf( + //"%10g != %10g : T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n", + "%llx != %llx : T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n", + *(long long*)&sum, *(long long*)&y, tyname, opname, rank_n, ix + ); + } + } +} + +template +__device__ void sweep1(int ty, char const *tyname) { + for(int i=0; i < 10; i++) { + int rank_n = (1<(ty, tyname, ReduceSum(), "sum", rank_n); + sweep2(ty, tyname, ReduceProd(), "prod", rank_n); + sweep2(ty, tyname, ReduceMin(), "min", rank_n); + sweep2(ty, tyname, ReduceMax(), "max", rank_n); + sweep2(ty, tyname, ReducePreMulSum(), "premulsum", rank_n); + sweep2(ty, tyname, ReduceAvg{rank_n}, "avg", rank_n); + } +} + +__global__ void sweep() { + sweep1(ncclInt8, "int8"); + sweep1(ncclUint8, "uint8"); + sweep1(ncclInt32, "int32"); + sweep1(ncclUint32, "uint32"); + sweep1(ncclInt64, "int64"); + sweep1(ncclUint64, "uint64"); + sweep1(ncclFloat16, "half"); + #if HAVE_ncclBfloat16 + sweep1<__nv_bfloat16>(ncclBfloat16, "bfloat16"); + #endif + sweep1(ncclFloat32, "float"); + sweep1(ncclFloat64, "double"); +} + +int main(int arg_n, char **args) { + std::cerr<<"You are hoping to see no output beyond this line."<>>(); + cudaDeviceSynchronize(); + return 0; +} +#endif diff --git a/verifiable/verifiable.h b/verifiable/verifiable.h new file mode 100644 index 0000000000..aca0565a6b --- /dev/null +++ b/verifiable/verifiable.h @@ -0,0 +1,59 @@ +#ifndef 
_d41d8cd98f00b204e9800998ecf8427e +#define _d41d8cd98f00b204e9800998ecf8427e + +#include + +#include + +/* Routines for launching kernels that verify reduction results. A significant + * feature of these routines is they carefully craft floating point input + * to produce exactly predictable output. + * + * int elt_ty: actually just a ncclDataType_t + * + * int red_op: mostly just a ncclRedOp_t. Since PreMulSum ops are dynamically + * created, these are encoded as the value ncclNumOps and their scalar is + * assumed to be `ncclVerifiablePremulScalar(rank_me)` + * + * uint64_t seed: arbitrary 64-bits to use in seeding the random values + * + * intptr_t elt_ix0: index of first element pointed to by elts when generating + * random values. This makes it possible to generate subsequences independently + * as well as in aggregate. + * + * int rank_n: Number of contributions into the reduction. Non-reduction + * collectives like broadcast, gather, etc will always set this to one. + * + * int rank_me: Index of this contribution + */ + +// Use this as the local scalar for PreMulSum ops +template +__host__ __device__ T ncclVerifiablePremulScalar(int rank_me) { + return T(rank_me%2 == 0 ? 1.0f : 2.0f); +} + +// Enqueue kernel to generate data which is to be reduced. +void ncclVerifiablePrepareInput( + void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me, + uint64_t seed, intptr_t elt_ix0, cudaStream_t stream +); + +// Enqueue kernel to generate expected results of reduction. +void ncclVerifiablePrepareExpected( + void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, + uint64_t seed, intptr_t elt_ix0, cudaStream_t stream +); + +// Enqueue kernel to verify reduced data matches expectation. The number of +// failed elements is written to bad_elt_n which must be in cudaHost memory. +// If `expected == nullptr` then the expected results are generated on-the-fly +// which can be costly. 
Thus if you plan to run the same reduction multiple +// times it is advantageous to precompute the expected values with +// ncclVerifiablePrepareExpected and pass them as `expected` here. +void ncclVerifiableVerify( + void const *results, void const *expected, intptr_t elt_n, int elt_ty, + int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, + int64_t *bad_elt_n, cudaStream_t stream +); +#endif diff --git a/verifiable/verifiable.mk b/verifiable/verifiable.mk new file mode 100644 index 0000000000..225c32a3c3 --- /dev/null +++ b/verifiable/verifiable.mk @@ -0,0 +1,11 @@ +# We require both of the following paths to be set upon including this makefile +# TEST_VERIFIABLE_SRCDIR = +# TEST_VERIFIABLE_BUILDDIR = + +TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h +TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o + +$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFY_REDUCE_HDRS) + @printf "Compiling %s\n" $@ + @mkdir -p $(TEST_VERIFIABLE_BUILDDIR) + $(NVCC) -o $@ $(NVCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu From bc5f7cfb0aad52af4388f5b4cc6214baf1e1a8ed Mon Sep 17 00:00:00 2001 From: John Bachan Date: Thu, 7 Jul 2022 11:42:21 +0200 Subject: [PATCH 106/233] Changed top-level Makefile behavior so that BUILDDIR is interpreted as relative to top-level directory. This is done by abspath'ing it before passing it to subdirectory Makefile's. The old behavior had two cases: with and without BUILDDIR being set by the user. With BUILDDIR not set, the build dir would be named "build" in the top-level directory. If BUILDDIR was set, then the build dir would be placed at "src/${BUILDDIR}". The new behavior is simpler, if BUILDDIR is not set then it defaults to "build", and the directory holding the final build is always at just "${BUILDDIR}" in the top level. 
--- Makefile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 29409a8422..43729f897a 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,9 @@ # See LICENCE.txt for license information # +BUILDDIR ?= build +override BUILDDIR := $(abspath $(BUILDDIR)) + .PHONY : all clean default : src.build @@ -14,7 +17,7 @@ all: ${TARGETS:%=%.build} clean: ${TARGETS:%=%.clean} %.build: - ${MAKE} -C $* build + ${MAKE} -C $* build BUILDDIR=${BUILDDIR} %.clean: - ${MAKE} -C $* clean + ${MAKE} -C $* clean BUILDDIR=${BUILDDIR} From a0a14911ee5405353a85a7e345c188514410e10e Mon Sep 17 00:00:00 2001 From: David Addison Date: Tue, 6 Sep 2022 13:17:15 -0700 Subject: [PATCH 107/233] Display N/A for error count in AlltoAll in-place test AlltoAll does not support in-place buffers --- src/common.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common.cu b/src/common.cu index eaa3318f34..0bc047c4f1 100644 --- a/src/common.cu +++ b/src/common.cu @@ -467,7 +467,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t int64_t wrongElts = 0; static __thread int rep = 0; rep++; - if (datacheck) { + if (args->reportErrors) { // Initialize sendbuffs, recvbuffs and expected TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); @@ -529,7 +529,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } else { sprintf(timeStr, "%7.2f", timeUsec); } - if (datacheck) { + if (args->reportErrors) { PRINT(" %7s %6.2f %6.2f %5g", timeStr, algBw, busBw, (double)wrongElts); } else { PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A"); From afa4c56b6aeae3d198dfc30d9d8f26cc5ee75dba Mon Sep 17 00:00:00 2001 From: David Addison Date: Wed, 7 Sep 2022 11:23:49 -0700 Subject: [PATCH 108/233] Fix an issue with the last commit when data checking is disabled --- src/common.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common.cu 
b/src/common.cu index 0bc047c4f1..8fe9258164 100644 --- a/src/common.cu +++ b/src/common.cu @@ -467,7 +467,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t int64_t wrongElts = 0; static __thread int rep = 0; rep++; - if (args->reportErrors) { + if (datacheck) { // Initialize sendbuffs, recvbuffs and expected TESTCHECK(args->collTest->initData(args, type, op, root, rep, in_place)); @@ -952,7 +952,7 @@ testResult_t run() { threads[t].args.bw=bw+t; threads[t].args.bw_count=bw_count+t; - threads[t].args.reportErrors = 1; + threads[t].args.reportErrors = datacheck; threads[t].func = parallel_init ? threadInit : threadRunTests; if (t) From 749573f2d65027859c8ace9d41fabf4b81eda491 Mon Sep 17 00:00:00 2001 From: David Addison Date: Wed, 7 Sep 2022 16:10:41 -0700 Subject: [PATCH 109/233] Fix preprocessor version check for ncclGetLastError() ncclGetLastError() was added in NCCL 2.13.0 --- src/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common.h b/src/common.h index 51cf9da276..84967ed6a1 100644 --- a/src/common.h +++ b/src/common.h @@ -28,7 +28,7 @@ } \ } while(0) -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,12,10) +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,13,0) #define NCCLCHECK(cmd) do { \ ncclResult_t res = cmd; \ if (res != ncclSuccess) { \ From d313d20a2695b7a9be9b22bd9417fe2e201fef3f Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Tue, 20 Sep 2022 02:21:36 -0700 Subject: [PATCH 110/233] Update NCCL tests --- src/Makefile | 9 ++- src/all_gather.cu | 18 ++--- src/all_reduce.cu | 5 +- src/alltoall.cu | 9 +-- src/broadcast.cu | 7 +- src/common.cu | 151 ++++++++++++++++++++++++++++-------------- src/common.h | 56 ++++------------ src/gather.cu | 7 +- src/hypercube.cu | 5 +- src/reduce.cu | 5 +- src/reduce_scatter.cu | 16 +++-- src/scatter.cu | 7 +- src/sendrecv.cu | 5 +- src/timer.cc | 28 ++++++++ src/timer.h | 15 +++++ 15 files changed, 206 insertions(+), 137 deletions(-) create mode 100644 
src/timer.cc create mode 100644 src/timer.h diff --git a/src/Makefile b/src/Makefile index 137b9d7925..6d8b1ef40f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # @@ -92,7 +92,12 @@ ${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS) @mkdir -p ${DST_DIR} $(NVCC) -o $@ $(NVCUFLAGS) -c $< -${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o $(TEST_VERIFIABLE_OBJS) +${DST_DIR}/timer.o: timer.cc timer.h + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${DST_DIR} + $(CXX) $(CXXFLAGS) -o $@ -c timer.cc + +${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) @printf "Linking %-35s > %s\n" $< $@ @mkdir -p ${DST_DIR} $(NVCC) -o $@ $(NVCUFLAGS) $^ ${NVLDFLAGS} diff --git a/src/all_gather.cu b/src/all_gather.cu index 1eaafddfab..0831207433 100644 --- a/src/all_gather.cu +++ b/src/all_gather.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -7,12 +7,15 @@ #include "cuda_runtime.h" #include "common.h" +#define ALIGN 4 + void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { - *sendcount = count/nranks; - *recvcount = (count/nranks)*nranks; - *sendInplaceOffset = count/nranks; + size_t base = (count/(ALIGN*nranks))*ALIGN; + *sendcount = base; + *recvcount = base*nranks; + *sendInplaceOffset = base; *recvInplaceOffset = 0; - *paramcount = *sendcount; + *paramcount = base; } testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) { @@ -21,8 +24,7 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc int nranks = args->nProcs*args->nThreads*args->nGpus; for (int i=0; inGpus; i++) { - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - CUDACHECK(cudaSetDevice(gpuid)); + CUDACHECK(cudaSetDevice(args->gpus[i])); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; @@ -78,7 +80,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t } for (int i=0; inProcs*args->nThreads*args->nGpus; for (int i=0; inGpus; i++) { - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - CUDACHECK(cudaSetDevice(gpuid)); + CUDACHECK(cudaSetDevice(args->gpus[i])); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? 
args->recvbuffs[i] : args->sendbuffs[i]; diff --git a/src/alltoall.cu b/src/alltoall.cu index 0eae1b07c9..41c7c4ae33 100644 --- a/src/alltoall.cu +++ b/src/alltoall.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -21,9 +21,7 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, nccl int nranks = args->nProcs*args->nThreads*args->nGpus; for (int i=0; inGpus; i++) { - char* str = getenv("NCCL_TESTS_DEVICE"); - int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - CUDACHECK(cudaSetDevice(gpuid)); + CUDACHECK(cudaSetDevice(args->gpus[i])); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; @@ -51,7 +49,6 @@ testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclD int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); size_t rankOffset = count * wordSize(type); - if (count == 0) return testSuccess; #if NCCL_MAJOR < 2 || NCCL_MINOR < 7 printf("NCCL 2.7 or later is needed for alltoall. 
This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR); @@ -97,7 +94,7 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t } for (int i=0; iexpectedBytes / wordSize(type); for (int i=0; inGpus; i++) { - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - CUDACHECK(cudaSetDevice(gpuid)); + CUDACHECK(cudaSetDevice(args->gpus[i])); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; @@ -94,7 +93,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t for (int i=0; inProcs*args->nThreads; + if(average == 1) accumulator[epoch] /= args->totalProcs*args->nThreads; counter[epoch] = 0; pthread_cond_broadcast(&cond[epoch]); } @@ -220,10 +229,8 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t CUDACHECK(cudaHostAlloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), cudaHostAllocMapped)); for (int i=0; inGpus; i++) { - int device; int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - NCCLCHECK(ncclCommCuDevice(args->comms[i], &device)); - CUDACHECK(cudaSetDevice(device)); + CUDACHECK(cudaSetDevice(args->gpus[i])); void *data = in_place ? 
((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i)); @@ -266,6 +273,8 @@ testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* int remaining = ngpus; int* done = (int*)malloc(sizeof(int)*ngpus); memset(done, 0, sizeof(int)*ngpus); + timer tim; + while (remaining) { int idle = 1; for (int i=0; i timeout && timeout > 0) { + for (int i=0; inGpus > 1) NCCLCHECK(ncclGroupStart()); for (int i = 0; i < args->nGpus; i++) { #ifndef NCCL_MAJOR - int cudaDev; - NCCLCHECK(ncclCommCuDevice(args->comms[i], &cudaDev)); - CUDACHECK(cudaSetDevice(cudaDev)); + CUDACHECK(cudaSetDevice(args->gpus[i])); #endif int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); char* recvBuff = ((char*)args->recvbuffs[i]) + shift; @@ -411,7 +431,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t #endif // Performance Benchmark - auto start = std::chrono::high_resolution_clock::now(); + timer tim; for (int iter = 0; iter < iters; iter++) { if (agg_iters>1) NCCLCHECK(ncclGroupStart()); for (int aiter = 0; aiter < agg_iters; aiter++) { @@ -432,7 +452,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } // Resync CPU, restart timing, launch cuda graph Barrier(args); - start = std::chrono::high_resolution_clock::now(); + tim.reset(); for (int l=0; lnGpus; i++) { CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); @@ -441,10 +461,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t } #endif + double cputimeSec = tim.elapsed()/(iters*agg_iters); TESTCHECK(completeColl(args)); - auto delta = std::chrono::high_resolution_clock::now() - start; - double deltaSec = std::chrono::duration_cast>(delta).count(); + double deltaSec = tim.elapsed(); deltaSec = deltaSec/(iters*agg_iters); if (cudaGraphLaunches >= 1) deltaSec = 
deltaSec/cudaGraphLaunches; Allreduce(args, &deltaSec, average); @@ -520,7 +540,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t wrongElts = wrongElts1; } - double timeUsec = deltaSec*1.0E6; + double timeUsec = (report_cputime ? cputimeSec : deltaSec)*1.0E6; char timeStr[100]; if (timeUsec >= 10000.0) { sprintf(timeStr, "%7.0f", timeUsec); @@ -555,6 +575,9 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) { } testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) { + // Sync to avoid first-call timeout + Barrier(args); + // Warm-up for large size setupArgs(args->maxbytes, type, args); for (int iter = 0; iter < warmup_iters; iter++) { @@ -586,8 +609,7 @@ testResult_t threadRunTests(struct threadArgs* args) { // Set device to the first of our GPUs. If we don't do that, some operations // will be done on the current GPU (by default : 0) and if the GPUs are in // exclusive mode those operations will fail. - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; - CUDACHECK(cudaSetDevice(gpuid)); + CUDACHECK(cudaSetDevice(args->gpus[0])); TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); return testSuccess; } @@ -598,13 +620,12 @@ testResult_t threadInit(struct threadArgs* args) { int nranks = args->nProcs*args->nThreads*args->nGpus; //set main thread again - is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0; + is_main_thread = (is_main_proc && args->thread == 0) ? 
1 : 0; NCCLCHECK(ncclGroupStart()); for (int i=0; inGpus; i++) { int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - CUDACHECK(cudaSetDevice(gpuid)); + CUDACHECK(cudaSetDevice(args->gpus[i])); NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); } NCCLCHECK(ncclGroupEnd()); @@ -679,7 +700,10 @@ int main(int argc, char* argv[]) { {"datatype", required_argument, 0, 'd'}, {"root", required_argument, 0, 'r'}, {"blocking", required_argument, 0, 'z'}, + {"stream_null", required_argument, 0, 'y'}, + {"timeout", required_argument, 0, 'T'}, {"cudagraph", required_argument, 0, 'G'}, + {"report_cputime", required_argument, 0, 'C'}, {"average", required_argument, 0, 'a'}, {"help", no_argument, 0, 'h'}, {} @@ -687,7 +711,7 @@ int main(int argc, char* argv[]) { while(1) { int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:hG:a:", longopts, &longindex); + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:y:T:hG:C:a:", longopts, &longindex); if (c == -1) break; @@ -752,6 +776,12 @@ int main(int argc, char* argv[]) { case 'z': blocking_coll = strtol(optarg, NULL, 0); break; + case 'y': + streamnull = strtol(optarg, NULL, 0); + break; + case 'T': + timeout = strtol(optarg, NULL, 0); + break; case 'G': #if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && CUDART_VERSION >= 11030 cudaGraphLaunches = strtol(optarg, NULL, 0); @@ -759,6 +789,9 @@ int main(int argc, char* argv[]) { printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n"); #endif break; + case 'C': + report_cputime = strtol(optarg, NULL, 0); + break; case 'a': average = (int)strtol(optarg, NULL, 0); break; @@ -787,11 +820,14 @@ int main(int argc, char* argv[]) { "[-d,--datatype ] \n\t" "[-r,--root ] \n\t" "[-z,--blocking <0/1>] \n\t" + "[-y,--stream_null <0/1>] \n\t" + "[-T,--timeout