From fab39367ae3e804f3142baf91969933b8eff684c Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Mon, 17 Oct 2022 14:13:48 +0000 Subject: [PATCH] make rccl-test compile again. all files compile now. mpi tests also pass [ROCm/rccl-tests commit: 641e93e99ccb38920154741e84cf9a10ac3da25b] --- projects/rccl-tests/CMakeLists.txt | 1 + projects/rccl-tests/src/Makefile | 5 +- projects/rccl-tests/src/alltoallv.cu | 48 ++--- projects/rccl-tests/src/common.cu | 103 +++++------ projects/rccl-tests/src/common.h | 12 +- projects/rccl-tests/verifiable/Makefile | 59 +++++- .../rccl-tests/verifiable/inexact_regress.cu | 56 ++++-- projects/rccl-tests/verifiable/verifiable.cu | 171 ++++++++---------- projects/rccl-tests/verifiable/verifiable.h | 15 +- projects/rccl-tests/verifiable/verifiable.mk | 9 +- 10 files changed, 251 insertions(+), 228 deletions(-) diff --git a/projects/rccl-tests/CMakeLists.txt b/projects/rccl-tests/CMakeLists.txt index 539a1eae2b..1e22365515 100644 --- a/projects/rccl-tests/CMakeLists.txt +++ b/projects/rccl-tests/CMakeLists.txt @@ -51,6 +51,7 @@ endif() set(ROCM_USE_DEV_COMPONENT OFF) # This repo doesn't have a dev component # Add all of the tests +add_subdirectory(verifiable) add_subdirectory(src) # Create ROCm standard packages diff --git a/projects/rccl-tests/src/Makefile b/projects/rccl-tests/src/Makefile index 0c3c424616..dd01c484f9 100644 --- a/projects/rccl-tests/src/Makefile +++ b/projects/rccl-tests/src/Makefile @@ -20,12 +20,11 @@ LDFLAGS := HIPLDFLAGS := ifneq ($(NCCL_HOME), "") -HIPCUFLAGS += -I$(NCCL_HOME) -I$(NCCL_HOME)/rccl/include +HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) endif HIPCUFLAGS += -I$(ROCM_PATH)/include -HIPCUFLAGS += -I$(ROCM_PATH)/include/rccl -HIPCUFLAGS += -I$(ROCM_PATH)/hip/include/hip +HIPCUFLAGS += -I$(ROCM_PATH)/include/hip LDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt diff --git a/projects/rccl-tests/src/alltoallv.cu b/projects/rccl-tests/src/alltoallv.cu index cb8fcaff0d..c5818d9ded 100644 --- a/projects/rccl-tests/src/alltoallv.cu +++ b/projects/rccl-tests/src/alltoallv.cu @@ -10,18 +10,6 @@ #define USE_RCCL_GATHER_SCATTER -void print_header() { - PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", ""); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", - "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error"); - PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); -} - -void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) { - PRINT("%12li %12li %6s %6s", size, count, typeName, opName); -} - void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { if (count < nranks*nranks/2) { *sendcount = 0; @@ -45,17 +33,14 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc int k=0; for (int i=0; inGpus; i++) { - char* str = getenv("NCCL_TESTS_DEVICE"); - int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - if (args->enable_multiranks) - gpuid = gpuid % args->localNumDevices; - HIPCHECK(hipSetDevice(gpuid)); + HIPCHECK(hipSetDevice(args->gpus[i])); for (int l=0; lnRanks; l++) { int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l); HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes)); void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k]; - TESTCHECK(InitData(data, sendcount, type, rep, rank)); + TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0)); + #if 0 int *dataHost = (int *)malloc(args->sendBytes); hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost); @@ -66,24 +51,25 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc printf("\n"); free(dataHost); #endif + size_t rdisp = 0; size_t data_count = sendcount*2/nranks; size_t chunksize = data_count/nranks; for (int j=0; jexpected[k])+rdisp*wordSize(type), rcount, type, rep+sdisp, j)); - rdisp += rcount; + size_t sdisp = 0; + for (int kk=0; kkexpected[k])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0)); + rdisp += rcount; } k++; } diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu index 5f8d7f58fe..4f80115869 100644 --- a/projects/rccl-tests/src/common.cu +++ b/projects/rccl-tests/src/common.cu @@ -165,18 +165,18 @@ static bool minReqVersion(int rmajor, int rminor, int rpatch) } testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) { - ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault); - CUDACHECK(cudaDeviceSynchronize()); + ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, hipStreamDefault); + HIPCHECK(hipDeviceSynchronize()); return testSuccess; } testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) { - ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault); + ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, hipStreamDefault); return testSuccess; } testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) { - ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault); + ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, hipStreamDefault); return testSuccess; } @@ -271,7 +271,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t size_t count = args->expectedBytes/wordSize(type); int64_t *wrongPerGpu = nullptr; - CUDACHECK(hipHostAlloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), hipHostAllocMapped)); + HIPCHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), hipHostMallocMapped)); for (int i=0; inGpus*args->nRanks; i++) { int device; @@ -352,7 +352,7 @@ testResult_t testStreamSynchronize(int nStreams, hipStream_t* streams, ncclComm_ } double delta = tim.elapsed(); if (delta > timeout && timeout > 0) { - for (int i=0; icomms[i], &hipDev)); HIPCHECK(hipSetDevice(hipDev)); - //CUDACHECK(cudaSetDevice(args->gpus[i])); EDGAR CHECK LATER #endif int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i); char* recvBuff = ((char*)args->recvbuffs[i]) + shift; @@ -417,7 +416,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t case ncclFloat32: f32 = ncclVerifiablePremulScalar(rank); break; case ncclFloat64: f64 = ncclVerifiablePremulScalar(rank); break; #if defined(RCCL_BFLOAT16) - case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<__nv_bfloat16>(rank); break; + case ncclBfloat16: bf16 = ncclVerifiablePremulScalar(rank); break; #endif } NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i])); @@ -452,7 +451,7 @@ testResult_t completeColl(struct threadArgs* args) { return testSuccess; } -//EDGAR: Revisit because of cudaGraphLaunches +//RCCL: Revisit because of cudaGraphLaunches testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) { size_t count = args->nbytes / wordSize(type); if (datacheck) { @@ -648,7 +647,9 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* // Benchmark for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { setupArgs(size, type, args); - print_line_header(std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root); + char rootName[100]; + sprintf(rootName, "%6i", root); + PRINT("%12li %12li %8s %6s %6s", (size_t)max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName); TESTCHECK(BenchTime(args, type, op, root, 0)); TESTCHECK(BenchTime(args, type, op, root, 1)); PRINT("\n"); @@ -661,10 +662,7 @@ testResult_t threadRunTests(struct threadArgs* args) { // Set device to the first of our GPUs. If we don't do that, some operations // will be done on the current GPU (by default : 0) and if the GPUs are in // exclusive mode those operations will fail. - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus; - if (enable_multiranks) - gpuid = gpuid % numDevices; - HIPCHECK(hipSetDevice(gpuid)); + HIPCHECK(hipSetDevice(args->gpus[0])); TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); return testSuccess; } @@ -679,11 +677,7 @@ testResult_t threadInit(struct threadArgs* args) { NCCLCHECK(ncclGroupStart()); for (int i=0; inGpus; i++) { - int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - if (enable_multiranks) - gpuid = gpuid % numDevices; - HIPCHECK(hipSetDevice(gpuid)); - //CUDACHECK(cudaSetDevice(args->gpus[i])); + HIPCHECK(hipSetDevice(args->gpus[i])); for (int j=0; jnRanks; j++) { int rank = (args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + j; @@ -715,7 +709,7 @@ testResult_t threadLaunch(struct testThread* thread) { return testSuccess; } -testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) { +testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) { if (memorytype == ncclFine) { HIPCHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained)); HIPCHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained)); @@ -807,12 +801,10 @@ int main(int argc, char* argv[]) { while(1) { int c; - // EDGAR NOTE: y is used by 'memory_type' (a RCCL argument) and 'stream_null' (a NCCL argument) - // also not sure about G vs. hG (we had G, they have hG) #ifdef RCCL_MULTIRANKPERGPU - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z :y :T:G:C:a :y :s:u:h:R:x:", longopts, &longindex); + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:R:x:", longopts, &longindex); #else - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z :y :T:G:C:a :y :s:u:h:", longopts, &longindex); + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:", longopts, &longindex); #endif if (c == -1) @@ -878,7 +870,7 @@ int main(int argc, char* argv[]) { case 'z': blocking_coll = strtol(optarg, NULL, 0); break; - case 'y': + case 'Y': memorytype = ncclstringtomtype(optarg); break; case 's': @@ -946,7 +938,7 @@ int main(int argc, char* argv[]) { "[-d,--datatype ] \n\t" "[-r,--root ] \n\t" "[-z,--blocking <0/1>] \n\t" - "[-y,--memory_type ] \n\t" + "[-Y,--memory_type ] \n\t" "[-s,--stress_cycles ] \n\t" "[-u,--cumask ] \n\t" "[-y,--stream_null <0/1>] \n\t" @@ -1084,15 +1076,15 @@ testResult_t run() { #ifdef MPI_SUPPORT MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, mpi_comm); #endif -<<<<<<< HEAD - int gpus[nGpus*nThreads*ranksPerGpu]; + + int gpus[nGpus*nThreads]; hipStream_t streams[nGpus*nThreads*ranksPerGpu]; void* sendbuffs[nGpus*nThreads*ranksPerGpu]; void* recvbuffs[nGpus*nThreads*ranksPerGpu]; void* expected[nGpus*nThreads*ranksPerGpu]; size_t sendBytes, recvBytes; - ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads*ranksPerGpu); + ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)ncclProcs*nGpus*nThreads*ranksPerGpu); envstr = getenv("NCCL_TESTS_DEVICE"); gpu0 = envstr ? atoi(envstr) : -1; @@ -1101,53 +1093,44 @@ testResult_t run() { if (enable_multiranks) gpuid = gpuid % numDevices; + gpus[ii] = gpu0 != -1 ? gpu0+ii : gpuid; + HIPCHECK(hipSetDevice(gpus[ii])); + for (int j=0; j #include #include @@ -21,14 +21,14 @@ // For nccl.h < 2.13 since we define a weak fallback extern "C" char const* ncclGetLastError(ncclComm_t comm); -#define HIPCHECK(cmd) do { \ - hipError_t e = cmd; \ - if( e != hipSuccess ) { \ +#define HIPCHECK(cmd) do { \ + hipError_t e = cmd; \ + if( e != hipSuccess ) { \ char hostname[1024]; \ getHostName(hostname, 1024); \ - printf("%s: Test HIP failure %s:%d '%s'\n", \ + printf("%s: Test HIP failure %s:%d '%s'\n", \ hostname, \ - __FILE__,__LINE__,hipGetErrorString(e)); \ + __FILE__,__LINE__,hipGetErrorString(e)); \ return testCudaError; \ } \ } while(0) diff --git a/projects/rccl-tests/verifiable/Makefile b/projects/rccl-tests/verifiable/Makefile index b141a2a7c5..182d44e727 100644 --- a/projects/rccl-tests/verifiable/Makefile +++ b/projects/rccl-tests/verifiable/Makefile @@ -1,13 +1,62 @@ -include ../../makefiles/common.mk +# +# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. +# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE.txt for license information +# + +#include ../../makefiles/common.mk .PHONY: all clean BUILDDIR := $(abspath ../../build) -NCCLDIR := $(BUILDDIR) -NVCUFLAGS += -I$(NCCLDIR)/include/ -I../include DST_DIR := $(BUILDDIR)/test/verifiable -all: $(DST_DIR)/self_test $(DST_DIR)/verifiable.o +ROCM_PATH ?= /opt/rocm +MPI_HOME ?= /usr/lib/openmpi +PREFIX ?= /usr/local +VERBOSE ?= 0 +DEBUG ?= 0 +NCCL_HOME ?= "" + +HIPCC = $(ROCM_PATH)/bin/hipcc +CXX = $(HIPCC) + +HIPCUFLAGS := -std=c++14 +LDFLAGS := +HIPLDFLAGS := + +ifneq ($(NCCL_HOME), "") +HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include +HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) +endif +HIPCUFLAGS += -I$(ROCM_PATH)/include +HIPCUFLAGS += -I$(ROCM_PATH)/include/hip +LDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt +HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt + +ifeq ($(DEBUG), 0) +HIPCUFLAGS += -O3 +else +HIPCUFLAGS += -O0 -g -ggdb3 +endif + +ifeq ($(VERBOSE), 0) +.SILENT: +endif + +ifeq ($(MPI), 1) +HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi +HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi +else ifeq ($(MPICH), 1) +HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -I/usr/include/x86_64-linux-gnu/mpich +HIPLDFLAGS += -L/usr/lib -lmpich +endif + +LIBRARIES += rccl +HIPLDFLAGS += $(LIBRARIES:%=-l%) + +all: $(DST_DIR)/verifiable.o $(DST_DIR)/self_test clean: rm -rf $(DST_DIR) @@ -21,4 +70,4 @@ self_test: $(DST_DIR)/self_test $(DST_DIR)/self_test: verifiable.cu verifiable.h @printf "Linking %s\n" $@ @mkdir -p $(DST_DIR) - $(NVCC) -o $@ $(NVCUFLAGS) -DSELF_TEST=1 verifiable.cu $(NVLDFLAGS) + $(HIPCC) -o $@ $(HIPCUFLAGS) -DSELF_TEST=1 verifiable.cu $(HIPLDFLAGS) diff --git a/projects/rccl-tests/verifiable/inexact_regress.cu b/projects/rccl-tests/verifiable/inexact_regress.cu index d7bd545f62..973b965412 100644 --- a/projects/rccl-tests/verifiable/inexact_regress.cu +++ b/projects/rccl-tests/verifiable/inexact_regress.cu @@ -1,3 +1,10 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + /* Generate parameters for our error bound model of floating point average * (sum of scaled values) by sampling sums of random sequences for each * floating point type. @@ -16,12 +23,12 @@ #include #include #include -#include -#include +#include +#include using std::uint64_t; using std::uint32_t; -using bfloat16 = __nv_bfloat16; +using bfloat16 = hip_bfloat16; template struct float_traits; @@ -49,26 +56,26 @@ struct float_traits { __device__ static double mul(double a, double b) { return a*b; } }; template<> -struct float_traits { +struct float_traits<__half> { static constexpr int mantissa_bits = 10; static constexpr int exponent_bits = 5; using uint_t = uint16_t; - __device__ static half make(double x) { return __double2half(x); } - __device__ static half make(uint64_t x) { return __int2half_rn(x); } - __device__ static double todouble(half x) { return __half2float(x); } - __device__ static half add(half a, half b) { return __hadd(a, b); } - __device__ static half mul(half a, half b) { return __hmul(a, b); } + __device__ static __half make(double x) { return __float2half((float)x); } + __device__ static __half make(uint64_t x) { return __int2half_rn(x); } + __device__ static double todouble(__half x) { return __half2float(x); } + __device__ static __half add(__half a, __half b) { return __hadd(a, b); } + __device__ static __half mul(__half a, __half b) { return __hmul(a, b); } }; template<> struct float_traits { static constexpr int mantissa_bits = 7; static constexpr int exponent_bits = 8; using uint_t = uint16_t; - __device__ static bfloat16 make(double x) { return __double2bfloat16(x); } - __device__ static bfloat16 make(uint64_t x) { return __int2bfloat16_rn(x); } - __device__ static double todouble(bfloat16 x) { return __bfloat162float(x); } - __device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return __hadd(a, b); } - __device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return __hmul(a, b); } + __device__ static bfloat16 make(double x) { return bfloat16(x); } + __device__ static bfloat16 make(uint64_t x) { return bfloat16(x); } + __device__ static double todouble(bfloat16 x) { return double(x); } + __device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return bfloat16(__hadd((float)a, (float)b)); } + __device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return bfloat16(__hmul((float)a, (float)b)); } }; template @@ -104,6 +111,17 @@ struct xoshiro256ss { } }; +static __device__ int __reduce_max_sync(unsigned int mask, int value) +{ + //We ignore mask, since all bits are set when calling them in the + //test code below. + int width = warpSize; + for (unsigned int i = warpSize; i; i >>= 1) { + value = max(__shfl_down(value, i, width), value); + } + return value; +} + template __global__ void kernel() { using traits = float_traits; @@ -123,7 +141,7 @@ __global__ void kernel() { for(int round=0; round < 1 + (16<<10)/max_ranks; round++) { //for(int round=0; round < 2; round++) { for(int i=threadIdx.x; i < samps; i += blockDim.x) { - accf[i] = 0; + accf[i] = (F)0; accd[i] = 0; } __syncthreads(); @@ -157,21 +175,21 @@ __global__ void kernel() { if(pass==0) expo_avg = expo_sum/expo_n; else if(threadIdx.x == 0) - std::printf(" coef=%1.10f expo=%1.10f\n", coef, expo_avg); + printf(" coef=%1.10f expo=%1.10f\n", coef, expo_avg); } } int main() { std::printf("type=float:\n"); kernel<<<1,32>>>(); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); std::printf("\ntype=half:\n"); kernel<<<1,32>>>(); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); std::printf("\ntype=bfloat16:\n"); kernel<<<1,32>>>(); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); return 0; } diff --git a/projects/rccl-tests/verifiable/verifiable.cu b/projects/rccl-tests/verifiable/verifiable.cu index 5f617ee188..9d8e56aba9 100644 --- a/projects/rccl-tests/verifiable/verifiable.cu +++ b/projects/rccl-tests/verifiable/verifiable.cu @@ -1,15 +1,23 @@ -#pragma nv_diag_suppress declared_but_not_referenced +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +//#pragma nv_diag_suppress declared_but_not_referenced #include "verifiable.h" -#include +#include +#include +#include -#include -#include -#if CUDART_VERSION >= 11000 -#include -#endif +#include "rccl/rccl.h" -#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && defined(__CUDA_BF16_TYPES_EXIST__) + +#define RCCL_BFLOAT 1 + +#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && RCCL_BFLOAT16 ==1 #define HAVE_ncclBfloat16 1 #else #define HAVE_ncclBfloat16 0 @@ -83,10 +91,10 @@ namespace { template struct IsIntegral: std::is_integral {}; template<> -struct IsIntegral: std::false_type {}; -#ifdef __CUDA_BF16_TYPES_EXIST__ +struct IsIntegral<__half>: std::false_type {}; +#if RCCL_BFLOAT16 == 1 template<> -struct IsIntegral<__nv_bfloat16>: std::false_type {}; +struct IsIntegral: std::false_type {}; #endif } @@ -116,13 +124,13 @@ namespace { return Y(x); } template<> - __host__ __device__ half castTo(float x) { + __host__ __device__ half castTo<__half>(float x) { return __float2half(x); } - #ifdef __CUDA_BF16_TYPES_EXIST__ + #if RCCL_BFLOAT16 == 1 template<> - __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(float x) { - return __float2bfloat16(x); + __host__ __device__ hip_bfloat16 castTo(float x) { + return hip_bfloat16(x); } #endif } @@ -144,20 +152,12 @@ struct ReduceSum { __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; } template __host__ __device__ T operator()(T a, T b) const { return a + b; } - __host__ __device__ half operator()(half a, half b) const { - #if __CUDA_ARCH__ >= 530 - return __hadd(a, b); - #else + __host__ __device__ __half operator()(__half a, __half b) const { return __float2half(__half2float(a) + __half2float(b)); - #endif } - #ifdef __CUDA_BF16_TYPES_EXIST__ - __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const { - #if __CUDA_ARCH__ >= 800 - return __hadd(a, b); - #else - return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b)); - #endif + #if RCCL_BFLOAT16 == 1 + __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const { + return hip_bfloat16(static_cast(a) + static_cast(b)); } #endif template @@ -168,20 +168,12 @@ struct ReduceProd { __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; } template __host__ __device__ T operator()(T a, T b) const { return a * b; } - __host__ __device__ half operator()(half a, half b) const { - #if __CUDA_ARCH__ >= 530 - return __hmul(a, b); - #else + __host__ __device__ __half operator()(__half a, __half b) const { return __float2half(__half2float(a) * __half2float(b)); - #endif } - #ifdef __CUDA_BF16_TYPES_EXIST__ - __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const { - #if __CUDA_ARCH__ >= 800 - return __hmul(a, b); - #else - return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b)); - #endif + #if RCCL_BFLOAT16 == 1 + __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const { + return hip_bfloat16(static_cast(a) * static_cast(b)); } #endif template @@ -192,24 +184,12 @@ struct ReduceMin { __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; } template __host__ __device__ T operator()(T a, T b) const { return a < b ? a : b; } - __host__ __device__ half operator()(half a, half b) const { - #if __CUDA_ARCH__ >= 800 - return __hmin(a, b); - #elif __CUDA_ARCH__ >= 530 - return __hlt(a, b) ? a : b; - #else - return __half2float(a) < __half2float(b) ? a : b; - #endif + __host__ __device__ __half operator()(__half a, __half b) const { + return __half2float(a) < __half2float(b) ? a : b; } - #ifdef __CUDA_BF16_TYPES_EXIST__ - __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const { - #if __CUDA_ARCH__ >= 800 - return __hmin(a, b); - //#elif __CUDA_ARCH__ >= 530 - // return __hlt(a, b) ? a : b; - #else - return __bfloat162float(a) < __bfloat162float(b) ? a : b; - #endif + #if RCCL_BFLOAT16 == 1 + __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const { + return static_cast(a) < static_cast(b) ? a : b; } #endif template @@ -220,24 +200,12 @@ struct ReduceMax { __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; } templateT())> __host__ __device__ T operator()(T a, T b) const { return a > b ? a : b; } - __host__ __device__ half operator()(half a, half b) const { - #if __CUDA_ARCH__ >= 800 - return __hmax(a, b); - #elif __CUDA_ARCH__ >= 530 - return __hgt(a, b) ? a : b; - #else + __host__ __device__ __half operator()(__half a, __half b) const { return __half2float(a) > __half2float(b) ? a : b; - #endif } - #ifdef __CUDA_BF16_TYPES_EXIST__ - __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const { - #if __CUDA_ARCH__ >= 800 - return __hmax(a, b); - //#elif __CUDA_ARCH__ >= 530 - // return __hgt(a, b) ? a : b; - #else - return __bfloat162float(a) > __bfloat162float(b) ? a : b; - #endif + #if RCCL_BFLOAT16 == 1 + __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const { + return static_cast(a) > static_cast(b) ? a : b; } #endif template @@ -309,13 +277,13 @@ struct FloatLayout { static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; }; template<> -struct FloatLayout { +struct FloatLayout<__half> { static constexpr int exponent_bits = 5, mantissa_bits = 10; static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; }; -#ifdef __CUDA_BF16_TYPES_EXIST__ +#if RCCL_BFLOAT16 == 1 template<> -struct FloatLayout<__nv_bfloat16> { +struct FloatLayout { static constexpr int exponent_bits = 8, mantissa_bits = 7; static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; }; @@ -340,14 +308,14 @@ namespace { // from unbounded random values. For instance, given X a totally random 32-bit // integer, `umul32hi(X,n)` will be totally random within [0,n). __host__ __device__ uint64_t umul32hi(uint32_t a, uint32_t b) { -#ifdef __CUDA_ARCH__ +#if HIP_VERSION > 50200000 return __umulhi(a, b); #else return uint64_t(a)*b >> 32; #endif } __host__ __device__ uint64_t umul64hi(uint64_t a, uint64_t b) { -#ifdef __CUDA_ARCH__ +#if HIP_VERSION > 50200000 return __umul64hi(a, b); #else return uint64_t(__uint128_t(a)*__uint128_t(b) >> 64); @@ -355,14 +323,14 @@ __host__ __device__ uint64_t umul64hi(uint64_t a, uint64_t b) { } __host__ __device__ int clz32(int x) { -#ifdef __CUDA_ARCH__ +#if HIP_VERSION > 50200000 return __clz(x); #else return x==0 ? 32 : __builtin_clz(x); #endif } __host__ __device__ int clz64(long long x) { -#ifdef __CUDA_ARCH__ +#if HIP_VERSION > 50200000 return __clzll(x); #else return x==0 ? 64 : __builtin_clzll(x); @@ -747,8 +715,9 @@ __host__ __device__ void genOutput( ) { ans = genInOutFloatSum(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/true); using T1 = typename std::conditional<(sizeof(T)::type; - ans = ReduceProd()(ans, T1(1)/T1(rank_n)); -} + //ans = ReduceProd()(ans, T1(1)/T1(rank_n)); + ans = ReduceProd()(ans, inhibit(castTo(T1(1)/T1(rank_n)))); + } } ///////////////////////////////////////////////////////////////////////////////// @@ -835,7 +804,7 @@ __global__ void prepareInput2( template void prepareInput1( void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me, - uint64_t seed, intptr_t elt_ix0, cudaStream_t stream + uint64_t seed, intptr_t elt_ix0, hipStream_t stream ) { int block_n = std::min(32, (elt_n + 4*512-1)/(4*512)); #define CASE_TY(T) prepareInput2<<>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break; @@ -846,9 +815,9 @@ void prepareInput1( case ncclUint32: CASE_TY(uint32_t) case ncclInt64: CASE_TY(int64_t) case ncclUint64: CASE_TY(uint64_t) - case ncclFloat16: CASE_TY(half) + case ncclFloat16: CASE_TY(__half) #if HAVE_ncclBfloat16 - case ncclBfloat16: CASE_TY(__nv_bfloat16) + case ncclBfloat16: CASE_TY(hip_bfloat16) #endif case ncclFloat32: CASE_TY(float) case ncclFloat64: CASE_TY(double) @@ -860,7 +829,7 @@ void prepareInput1( void ncclVerifiablePrepareInput( void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me, - uint64_t seed, intptr_t elt_ix0, cudaStream_t stream + uint64_t seed, intptr_t elt_ix0, hipStream_t stream ) { #define CASE_OP(op) \ if(rank_n == 1) \ @@ -911,7 +880,7 @@ __global__ void prepareExpected2( template void prepareExpected1( void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, - uint64_t seed, intptr_t elt_ix0, cudaStream_t stream + uint64_t seed, intptr_t elt_ix0, hipStream_t stream ) { int block_n = std::min(32, (elt_n + 4*512-1)/(4*512)); #define CASE_TY(T) prepareExpected2<<>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break; @@ -922,9 +891,9 @@ void prepareExpected1( case ncclUint32: CASE_TY(uint32_t) case ncclInt64: CASE_TY(int64_t) case ncclUint64: CASE_TY(uint64_t) - case ncclFloat16: CASE_TY(half) + case ncclFloat16: CASE_TY(__half) #if HAVE_ncclBfloat16 - case ncclBfloat16: CASE_TY(__nv_bfloat16) + case ncclBfloat16: CASE_TY(hip_bfloat16) #endif case ncclFloat32: CASE_TY(float) case ncclFloat64: CASE_TY(double) @@ -936,7 +905,7 @@ void prepareExpected1( void ncclVerifiablePrepareExpected( void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, - uint64_t seed, intptr_t elt_ix0, cudaStream_t stream + uint64_t seed, intptr_t elt_ix0, hipStream_t stream ) { #define CASE_OP(op) \ if(rank_n == 1) \ @@ -1044,7 +1013,8 @@ __global__ void verifyPrepared( #endif i += blockDim.x; } - asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad)); + //asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad)); + atomicAdd((unsigned long *)bad_elt_n, (unsigned long)bad); } template @@ -1077,13 +1047,14 @@ __global__ void verifyInline2( #endif i += blockDim.x; } - asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad)); + //asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad)); + atomicAdd((unsigned long*)bad_elt_n, (unsigned long)bad); } template void verifyInline1( T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, - unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n + unsigned tolerance, int64_t *bad_elt_n, hipStream_t stream, int block_n ) { #define CASE_OP(op) \ if(rank_n == 1) \ @@ -1112,7 +1083,7 @@ void verifyInline1( void ncclVerifiableVerify( void const *results, void const *expected, intptr_t elt_n, int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, - int64_t *bad_elt_n, cudaStream_t stream + int64_t *bad_elt_n, hipStream_t stream ) { bool floating = elt_ty == ncclFloat16 || elt_ty == ncclFloat32 || elt_ty == ncclFloat64; #if HAVE_ncclBfloat16 @@ -1142,9 +1113,9 @@ void ncclVerifiableVerify( case ncclUint32: CASE_TY(uint32_t, uint32_t) case ncclInt64: CASE_TY(int64_t, uint64_t) case ncclUint64: CASE_TY(uint64_t, uint64_t) - case ncclFloat16: CASE_TY(half, uint16_t) + case ncclFloat16: CASE_TY(__half, uint16_t) #if HAVE_ncclBfloat16 - case ncclBfloat16: CASE_TY(__nv_bfloat16, uint16_t) + case ncclBfloat16: CASE_TY(hip_bfloat16, uint16_t) #endif case ncclFloat32: CASE_TY(float, uint32_t) case ncclFloat64: CASE_TY(double, uint64_t) @@ -1180,7 +1151,7 @@ __device__ void sweep2(int ty, char const *tyname, Op op, char const *opname, in } sum = op.postOp(sum); if(tolerance < calcDelta(sum, y)) { - std::printf( + printf( //"%10g != %10g : T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n", "%llx != %llx : T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n", *(long long*)&sum, *(long long*)&y, tyname, opname, rank_n, ix @@ -1209,9 +1180,9 @@ __global__ void sweep() { sweep1(ncclUint32, "uint32"); sweep1(ncclInt64, "int64"); sweep1(ncclUint64, "uint64"); - sweep1(ncclFloat16, "half"); + sweep1<__half>(ncclFloat16, "half"); #if HAVE_ncclBfloat16 - sweep1<__nv_bfloat16>(ncclBfloat16, "bfloat16"); + sweep1(ncclBfloat16, "bfloat16"); #endif sweep1(ncclFloat32, "float"); sweep1(ncclFloat64, "double"); @@ -1219,9 +1190,9 @@ __global__ void sweep() { int main(int arg_n, char **args) { std::cerr<<"You are hoping to see no output beyond this line."<>>(); - cudaDeviceSynchronize(); + hipDeviceSynchronize(); return 0; } #endif diff --git a/projects/rccl-tests/verifiable/verifiable.h b/projects/rccl-tests/verifiable/verifiable.h index aca0565a6b..b41ef1ad12 100644 --- a/projects/rccl-tests/verifiable/verifiable.h +++ b/projects/rccl-tests/verifiable/verifiable.h @@ -1,7 +1,14 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + #ifndef _d41d8cd98f00b204e9800998ecf8427e #define _d41d8cd98f00b204e9800998ecf8427e -#include +#include #include @@ -36,13 +43,13 @@ __host__ __device__ T ncclVerifiablePremulScalar(int rank_me) { // Enqueue kernel to generate data which is to be reduced. void ncclVerifiablePrepareInput( void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me, - uint64_t seed, intptr_t elt_ix0, cudaStream_t stream + uint64_t seed, intptr_t elt_ix0, hipStream_t stream ); // Enqueue kernel to generate expected results of reduction. void ncclVerifiablePrepareExpected( void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, - uint64_t seed, intptr_t elt_ix0, cudaStream_t stream + uint64_t seed, intptr_t elt_ix0, hipStream_t stream ); // Enqueue kernel to verify reduced data matches expectation. The number of @@ -54,6 +61,6 @@ void ncclVerifiablePrepareExpected( void ncclVerifiableVerify( void const *results, void const *expected, intptr_t elt_n, int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, - int64_t *bad_elt_n, cudaStream_t stream + int64_t *bad_elt_n, hipStream_t stream ); #endif diff --git a/projects/rccl-tests/verifiable/verifiable.mk b/projects/rccl-tests/verifiable/verifiable.mk index 225c32a3c3..fba1fbf35c 100644 --- a/projects/rccl-tests/verifiable/verifiable.mk +++ b/projects/rccl-tests/verifiable/verifiable.mk @@ -1,3 +1,9 @@ +# Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. +# Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE.txt for license information + + # We requires both of the following paths to be set upon including this makefile # TEST_VERIFIABLE_SRCDIR = # TEST_VERIFIABLE_BUILDDIR = @@ -8,4 +14,5 @@ TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFY_REDUCE_HDRS) @printf "Compiling %s\n" $@ @mkdir -p $(TEST_VERIFIABLE_BUILDDIR) - $(NVCC) -o $@ $(NVCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu + echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu" + $(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu