make rccl-test compile again.
all files compile now. mpi tests also pass
Этот коммит содержится в:
@@ -51,6 +51,7 @@ endif()
|
||||
set(ROCM_USE_DEV_COMPONENT OFF) # This repo doesn't have a dev component
|
||||
|
||||
# Add all of the tests
|
||||
add_subdirectory(verifiable)
|
||||
add_subdirectory(src)
|
||||
|
||||
# Create ROCm standard packages
|
||||
|
||||
+2
-3
@@ -20,12 +20,11 @@ LDFLAGS :=
|
||||
HIPLDFLAGS :=
|
||||
|
||||
ifneq ($(NCCL_HOME), "")
|
||||
HIPCUFLAGS += -I$(NCCL_HOME) -I$(NCCL_HOME)/rccl/include
|
||||
HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
|
||||
HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
|
||||
endif
|
||||
HIPCUFLAGS += -I$(ROCM_PATH)/include
|
||||
HIPCUFLAGS += -I$(ROCM_PATH)/include/rccl
|
||||
HIPCUFLAGS += -I$(ROCM_PATH)/hip/include/hip
|
||||
HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
|
||||
LDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
|
||||
HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
|
||||
|
||||
|
||||
+17
-31
@@ -10,18 +10,6 @@
|
||||
|
||||
#define USE_RCCL_GATHER_SCATTER
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %6s %6s", size, count, typeName, opName);
|
||||
}
|
||||
|
||||
void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
if (count < nranks*nranks/2) {
|
||||
*sendcount = 0;
|
||||
@@ -45,17 +33,14 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
char* str = getenv("NCCL_TESTS_DEVICE");
|
||||
int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
|
||||
TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0));
|
||||
|
||||
#if 0
|
||||
int *dataHost = (int *)malloc(args->sendBytes);
|
||||
hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost);
|
||||
@@ -66,24 +51,25 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
printf("\n");
|
||||
free(dataHost);
|
||||
#endif
|
||||
|
||||
size_t rdisp = 0;
|
||||
size_t data_count = sendcount*2/nranks;
|
||||
size_t chunksize = data_count/nranks;
|
||||
for (int j=0; j<nranks; j++) {
|
||||
size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
|
||||
if ((j+rank)%nranks == 0)
|
||||
size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
|
||||
if ((j+rank)%nranks == 0)
|
||||
rcount += (sendcount-chunksize*(nranks-1)*nranks/2);
|
||||
size_t sdisp = 0;
|
||||
for (int k=0; k<nranks; k++) {
|
||||
scount = ((k+j)%nranks)*chunksize;
|
||||
if ((k+j)%nranks == 0)
|
||||
scount += (sendcount-chunksize*(nranks-1)*nranks/2);
|
||||
if (k == rank)
|
||||
break;
|
||||
sdisp += scount;
|
||||
}
|
||||
TESTCHECK(InitData(((char*)args->expected[k])+rdisp*wordSize(type), rcount, type, rep+sdisp, j));
|
||||
rdisp += rcount;
|
||||
size_t sdisp = 0;
|
||||
for (int kk=0; kk<nranks; kk++) {
|
||||
scount = ((kk+j)%nranks)*chunksize;
|
||||
if ((kk+j)%nranks == 0)
|
||||
scount += (sendcount-chunksize*(nranks-1)*nranks/2);
|
||||
if (kk == rank)
|
||||
break;
|
||||
sdisp += scount;
|
||||
}
|
||||
TESTCHECK(InitData(((char*)args->expected[k])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0));
|
||||
rdisp += rcount;
|
||||
}
|
||||
k++;
|
||||
}
|
||||
|
||||
+44
-59
@@ -165,18 +165,18 @@ static bool minReqVersion(int rmajor, int rminor, int rpatch)
|
||||
}
|
||||
|
||||
testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) {
|
||||
ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault);
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, hipStreamDefault);
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) {
|
||||
ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault);
|
||||
ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, hipStreamDefault);
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) {
|
||||
ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault);
|
||||
ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, hipStreamDefault);
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
@@ -271,7 +271,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
size_t count = args->expectedBytes/wordSize(type);
|
||||
|
||||
int64_t *wrongPerGpu = nullptr;
|
||||
CUDACHECK(hipHostAlloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), hipHostAllocMapped));
|
||||
HIPCHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), hipHostMallocMapped));
|
||||
|
||||
for (int i=0; i<args->nGpus*args->nRanks; i++) {
|
||||
int device;
|
||||
@@ -352,7 +352,7 @@ testResult_t testStreamSynchronize(int nStreams, hipStream_t* streams, ncclComm_
|
||||
}
|
||||
double delta = tim.elapsed();
|
||||
if (delta > timeout && timeout > 0) {
|
||||
for (int i=0; i<ngpus; i++)
|
||||
for (int i=0; i<nStreams; i++)
|
||||
NCCLCHECK(ncclCommAbort(comms[i]));
|
||||
char hostname[1024];
|
||||
getHostName(hostname, 1024);
|
||||
@@ -387,7 +387,6 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
int hipDev;
|
||||
NCCLCHECK(ncclCommCuDevice(args->comms[i], &hipDev));
|
||||
HIPCHECK(hipSetDevice(hipDev));
|
||||
//CUDACHECK(cudaSetDevice(args->gpus[i])); EDGAR CHECK LATER
|
||||
#endif
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i);
|
||||
char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
|
||||
@@ -417,7 +416,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
case ncclFloat32: f32 = ncclVerifiablePremulScalar<float>(rank); break;
|
||||
case ncclFloat64: f64 = ncclVerifiablePremulScalar<double>(rank); break;
|
||||
#if defined(RCCL_BFLOAT16)
|
||||
case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<__nv_bfloat16>(rank); break;
|
||||
case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<rccl_bfloat16>(rank); break;
|
||||
#endif
|
||||
}
|
||||
NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i]));
|
||||
@@ -452,7 +451,7 @@ testResult_t completeColl(struct threadArgs* args) {
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
//EDGAR: Revisit because of cudaGraphLaunches
|
||||
//RCCL: Revisit because of cudaGraphLaunches
|
||||
testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
|
||||
size_t count = args->nbytes / wordSize(type);
|
||||
if (datacheck) {
|
||||
@@ -648,7 +647,9 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
|
||||
// Benchmark
|
||||
for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
|
||||
setupArgs(size, type, args);
|
||||
print_line_header(std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root);
|
||||
char rootName[100];
|
||||
sprintf(rootName, "%6i", root);
|
||||
PRINT("%12li %12li %8s %6s %6s", (size_t)max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
|
||||
TESTCHECK(BenchTime(args, type, op, root, 0));
|
||||
TESTCHECK(BenchTime(args, type, op, root, 1));
|
||||
PRINT("\n");
|
||||
@@ -661,10 +662,7 @@ testResult_t threadRunTests(struct threadArgs* args) {
|
||||
// Set device to the first of our GPUs. If we don't do that, some operations
|
||||
// will be done on the current GPU (by default : 0) and if the GPUs are in
|
||||
// exclusive mode those operations will fail.
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus;
|
||||
if (enable_multiranks)
|
||||
gpuid = gpuid % numDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[0]));
|
||||
TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
|
||||
return testSuccess;
|
||||
}
|
||||
@@ -679,11 +677,7 @@ testResult_t threadInit(struct threadArgs* args) {
|
||||
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (enable_multiranks)
|
||||
gpuid = gpuid % numDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
//CUDACHECK(cudaSetDevice(args->gpus[i]));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int j=0; j<args->nRanks; j++) {
|
||||
int rank = (args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + j;
|
||||
@@ -715,7 +709,7 @@ testResult_t threadLaunch(struct testThread* thread) {
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) {
|
||||
testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
|
||||
if (memorytype == ncclFine) {
|
||||
HIPCHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
|
||||
HIPCHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
|
||||
@@ -807,12 +801,10 @@ int main(int argc, char* argv[]) {
|
||||
|
||||
while(1) {
|
||||
int c;
|
||||
// EDGAR NOTE: y is used by 'memory_type' (a RCCL argument) and 'stream_null' (a NCCL argument)
|
||||
// also not sure about G vs. hG (we had G, they have hG)
|
||||
#ifdef RCCL_MULTIRANKPERGPU
|
||||
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z :y :T:G:C:a :y :s:u:h:R:x:", longopts, &longindex);
|
||||
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:R:x:", longopts, &longindex);
|
||||
#else
|
||||
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z :y :T:G:C:a :y :s:u:h:", longopts, &longindex);
|
||||
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:", longopts, &longindex);
|
||||
#endif
|
||||
|
||||
if (c == -1)
|
||||
@@ -878,7 +870,7 @@ int main(int argc, char* argv[]) {
|
||||
case 'z':
|
||||
blocking_coll = strtol(optarg, NULL, 0);
|
||||
break;
|
||||
case 'y':
|
||||
case 'Y':
|
||||
memorytype = ncclstringtomtype(optarg);
|
||||
break;
|
||||
case 's':
|
||||
@@ -946,7 +938,7 @@ int main(int argc, char* argv[]) {
|
||||
"[-d,--datatype <nccltype/all>] \n\t"
|
||||
"[-r,--root <root>] \n\t"
|
||||
"[-z,--blocking <0/1>] \n\t"
|
||||
"[-y,--memory_type <coarse/fine/host/managed>] \n\t"
|
||||
"[-Y,--memory_type <coarse/fine/host/managed>] \n\t"
|
||||
"[-s,--stress_cycles <number of cycles>] \n\t"
|
||||
"[-u,--cumask <d0,d1,d2,d3>] \n\t"
|
||||
"[-y,--stream_null <0/1>] \n\t"
|
||||
@@ -1084,15 +1076,15 @@ testResult_t run() {
|
||||
#ifdef MPI_SUPPORT
|
||||
MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, mpi_comm);
|
||||
#endif
|
||||
<<<<<<< HEAD
|
||||
int gpus[nGpus*nThreads*ranksPerGpu];
|
||||
|
||||
int gpus[nGpus*nThreads];
|
||||
hipStream_t streams[nGpus*nThreads*ranksPerGpu];
|
||||
void* sendbuffs[nGpus*nThreads*ranksPerGpu];
|
||||
void* recvbuffs[nGpus*nThreads*ranksPerGpu];
|
||||
void* expected[nGpus*nThreads*ranksPerGpu];
|
||||
size_t sendBytes, recvBytes;
|
||||
|
||||
ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads*ranksPerGpu);
|
||||
ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)ncclProcs*nGpus*nThreads*ranksPerGpu);
|
||||
|
||||
envstr = getenv("NCCL_TESTS_DEVICE");
|
||||
gpu0 = envstr ? atoi(envstr) : -1;
|
||||
@@ -1101,53 +1093,44 @@ testResult_t run() {
|
||||
if (enable_multiranks)
|
||||
gpuid = gpuid % numDevices;
|
||||
|
||||
gpus[ii] = gpu0 != -1 ? gpu0+ii : gpuid;
|
||||
HIPCHECK(hipSetDevice(gpus[ii]));
|
||||
|
||||
for (int j=0; j<ranksPerGpu; j++) {
|
||||
int i = ii*ranksPerGpu+j;
|
||||
gpus[i] = gpu0 != -1 ? gpu0+ii : gpuid;
|
||||
HIPCHECK(hipSetDevice(gpus[i]));
|
||||
|
||||
TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus*ranksPerGpu));
|
||||
//PRINT("sendbuffs[%d]=%p(size=%lu) recvbuffs[%d]=%p(size=%lu)\n", i, sendbuffs[i], sendBytes, i, recvbuffs[i], recvBytes);
|
||||
if (streamnull)
|
||||
streams[i] = NULL;
|
||||
else {
|
||||
if (cumask[0] || cumask[1] || cumask[2] || cumask[3]) {
|
||||
PRINT("cumask: ");
|
||||
for (int i = 0; i < 4 ; i++) PRINT("%x,", cumask[i]);
|
||||
PRINT("\n");
|
||||
HIPCHECK(hipExtStreamCreateWithCUMask(streams+i, 4, cumask));
|
||||
} else
|
||||
HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
|
||||
TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
|
||||
if (streamnull)
|
||||
streams[i] = NULL;
|
||||
else {
|
||||
if (cumask[0] || cumask[1] || cumask[2] || cumask[3]) {
|
||||
PRINT("cumask: ");
|
||||
for (int i = 0; i < 4 ; i++) PRINT("%x,", cumask[i]);
|
||||
PRINT("\n");
|
||||
HIPCHECK(hipExtStreamCreateWithCUMask(streams+i, 4, cumask));
|
||||
} else
|
||||
HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
|
||||
}
|
||||
}
|
||||
#if 0 //EDGAR
|
||||
// initialize data buffer to avoid all zero data
|
||||
TESTCHECK(InitData(sendbuffs[i], sendBytes, ncclUint8, 0, i));
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
#endif //EDGAR
|
||||
}
|
||||
|
||||
//if parallel init is not selected, use main thread to initialize NCCL
|
||||
ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus*ranksPerGpu);
|
||||
if (!parallel_init) {
|
||||
if (nProcs == 1 && !enable_multiranks) {
|
||||
if (ncclProcs == 1 && !enable_multiranks) {
|
||||
NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpus));
|
||||
} else {
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
for (int ii=0; ii<nGpus*nThreads; ii++) {
|
||||
int gpuid = localRank*nThreads*nGpus+ii;
|
||||
if (enable_multiranks) {
|
||||
gpuid = gpuid % numDevices;
|
||||
}
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(gpus[ii]));
|
||||
if (!enable_multiranks) {
|
||||
NCCLCHECK(ncclCommInitRank(comms+ii, nProcs*nThreads*nGpus, ncclId, proc*nThreads*nGpus+ii));
|
||||
NCCLCHECK(ncclCommInitRank(comms+ii, ncclProcs*nThreads*nGpus, ncclId, proc*nThreads*nGpus+ii));
|
||||
}
|
||||
#ifdef RCCL_MULTIRANKPERGPU
|
||||
else
|
||||
for (int j=0; j<ranksPerGpu; j++) {
|
||||
int i = ii*ranksPerGpu+j;
|
||||
NCCLCHECK(ncclCommInitRankMulti(comms+i, nProcs*nThreads*nGpus*ranksPerGpu, ncclId, proc*nThreads*nGpus*ranksPerGpu+i, proc*nThreads*nGpus*ranksPerGpu+i));
|
||||
NCCLCHECK(ncclCommInitRankMulti(comms+i, ncclProcs*nThreads*nGpus*ranksPerGpu, ncclId,
|
||||
proc*nThreads*nGpus*ranksPerGpu+i, proc*nThreads*nGpus*ranksPerGpu+i));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -1182,6 +1165,8 @@ testResult_t run() {
|
||||
threads[t].args.stepbytes=stepBytes;
|
||||
threads[t].args.stepfactor=stepFactor;
|
||||
threads[t].args.localRank = localRank;
|
||||
|
||||
threads[t].args.totalProcs = totalProcs;
|
||||
threads[t].args.localNumDevices = numDevices;
|
||||
threads[t].args.enable_multiranks = enable_multiranks;
|
||||
threads[t].args.nRanks = ranksPerGpu;
|
||||
@@ -1190,7 +1175,7 @@ testResult_t run() {
|
||||
threads[t].args.nThreads=nThreads;
|
||||
threads[t].args.thread=t;
|
||||
threads[t].args.nGpus=nGpus;
|
||||
threads[t].args.gpus=gpus+t*nGpus*ranksPerGpu;
|
||||
threads[t].args.gpus=gpus+t*nGpus;
|
||||
threads[t].args.sendbuffs = sendbuffs+t*nGpus*ranksPerGpu;
|
||||
threads[t].args.recvbuffs = recvbuffs+t*nGpus*ranksPerGpu;
|
||||
threads[t].args.expected = expected+t*nGpus*ranksPerGpu;
|
||||
|
||||
+6
-6
@@ -7,7 +7,7 @@
|
||||
#ifndef __COMMON_H__
|
||||
#define __COMMON_H__
|
||||
|
||||
#include "rccl.h"
|
||||
#include "rccl/rccl.h"
|
||||
#include <stdio.h>
|
||||
#include <cstdint>
|
||||
#include <algorithm>
|
||||
@@ -21,14 +21,14 @@
|
||||
// For nccl.h < 2.13 since we define a weak fallback
|
||||
extern "C" char const* ncclGetLastError(ncclComm_t comm);
|
||||
|
||||
#define HIPCHECK(cmd) do { \
|
||||
hipError_t e = cmd; \
|
||||
if( e != hipSuccess ) { \
|
||||
#define HIPCHECK(cmd) do { \
|
||||
hipError_t e = cmd; \
|
||||
if( e != hipSuccess ) { \
|
||||
char hostname[1024]; \
|
||||
getHostName(hostname, 1024); \
|
||||
printf("%s: Test HIP failure %s:%d '%s'\n", \
|
||||
printf("%s: Test HIP failure %s:%d '%s'\n", \
|
||||
hostname, \
|
||||
__FILE__,__LINE__,hipGetErrorString(e)); \
|
||||
__FILE__,__LINE__,hipGetErrorString(e)); \
|
||||
return testCudaError; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
@@ -1,13 +1,62 @@
|
||||
include ../../makefiles/common.mk
|
||||
#
|
||||
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
|
||||
#include ../../makefiles/common.mk
|
||||
|
||||
.PHONY: all clean
|
||||
|
||||
BUILDDIR := $(abspath ../../build)
|
||||
NCCLDIR := $(BUILDDIR)
|
||||
NVCUFLAGS += -I$(NCCLDIR)/include/ -I../include
|
||||
DST_DIR := $(BUILDDIR)/test/verifiable
|
||||
|
||||
all: $(DST_DIR)/self_test $(DST_DIR)/verifiable.o
|
||||
ROCM_PATH ?= /opt/rocm
|
||||
MPI_HOME ?= /usr/lib/openmpi
|
||||
PREFIX ?= /usr/local
|
||||
VERBOSE ?= 0
|
||||
DEBUG ?= 0
|
||||
NCCL_HOME ?= ""
|
||||
|
||||
HIPCC = $(ROCM_PATH)/bin/hipcc
|
||||
CXX = $(HIPCC)
|
||||
|
||||
HIPCUFLAGS := -std=c++14
|
||||
LDFLAGS :=
|
||||
HIPLDFLAGS :=
|
||||
|
||||
ifneq ($(NCCL_HOME), "")
|
||||
HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
|
||||
HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
|
||||
endif
|
||||
HIPCUFLAGS += -I$(ROCM_PATH)/include
|
||||
HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
|
||||
LDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
|
||||
HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
|
||||
|
||||
ifeq ($(DEBUG), 0)
|
||||
HIPCUFLAGS += -O3
|
||||
else
|
||||
HIPCUFLAGS += -O0 -g -ggdb3
|
||||
endif
|
||||
|
||||
ifeq ($(VERBOSE), 0)
|
||||
.SILENT:
|
||||
endif
|
||||
|
||||
ifeq ($(MPI), 1)
|
||||
HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
|
||||
HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
|
||||
else ifeq ($(MPICH), 1)
|
||||
HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -I/usr/include/x86_64-linux-gnu/mpich
|
||||
HIPLDFLAGS += -L/usr/lib -lmpich
|
||||
endif
|
||||
|
||||
LIBRARIES += rccl
|
||||
HIPLDFLAGS += $(LIBRARIES:%=-l%)
|
||||
|
||||
all: $(DST_DIR)/verifiable.o $(DST_DIR)/self_test
|
||||
|
||||
clean:
|
||||
rm -rf $(DST_DIR)
|
||||
@@ -21,4 +70,4 @@ self_test: $(DST_DIR)/self_test
|
||||
$(DST_DIR)/self_test: verifiable.cu verifiable.h
|
||||
@printf "Linking %s\n" $@
|
||||
@mkdir -p $(DST_DIR)
|
||||
$(NVCC) -o $@ $(NVCUFLAGS) -DSELF_TEST=1 verifiable.cu $(NVLDFLAGS)
|
||||
$(HIPCC) -o $@ $(HIPCUFLAGS) -DSELF_TEST=1 verifiable.cu $(HIPLDFLAGS)
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
/* Generate parameters for our error bound model of floating point average
|
||||
* (sum of scaled values) by sampling sums of random sequences for each
|
||||
* floating point type.
|
||||
@@ -16,12 +23,12 @@
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstdint>
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
#include <hip/hip_bfloat16.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
|
||||
using std::uint64_t;
|
||||
using std::uint32_t;
|
||||
using bfloat16 = __nv_bfloat16;
|
||||
using bfloat16 = hip_bfloat16;
|
||||
|
||||
template<typename T>
|
||||
struct float_traits;
|
||||
@@ -49,26 +56,26 @@ struct float_traits<double> {
|
||||
__device__ static double mul(double a, double b) { return a*b; }
|
||||
};
|
||||
template<>
|
||||
struct float_traits<half> {
|
||||
struct float_traits<__half> {
|
||||
static constexpr int mantissa_bits = 10;
|
||||
static constexpr int exponent_bits = 5;
|
||||
using uint_t = uint16_t;
|
||||
__device__ static half make(double x) { return __double2half(x); }
|
||||
__device__ static half make(uint64_t x) { return __int2half_rn(x); }
|
||||
__device__ static double todouble(half x) { return __half2float(x); }
|
||||
__device__ static half add(half a, half b) { return __hadd(a, b); }
|
||||
__device__ static half mul(half a, half b) { return __hmul(a, b); }
|
||||
__device__ static __half make(double x) { return __float2half((float)x); }
|
||||
__device__ static __half make(uint64_t x) { return __int2half_rn(x); }
|
||||
__device__ static double todouble(__half x) { return __half2float(x); }
|
||||
__device__ static __half add(__half a, __half b) { return __hadd(a, b); }
|
||||
__device__ static __half mul(__half a, __half b) { return __hmul(a, b); }
|
||||
};
|
||||
template<>
|
||||
struct float_traits<bfloat16> {
|
||||
static constexpr int mantissa_bits = 7;
|
||||
static constexpr int exponent_bits = 8;
|
||||
using uint_t = uint16_t;
|
||||
__device__ static bfloat16 make(double x) { return __double2bfloat16(x); }
|
||||
__device__ static bfloat16 make(uint64_t x) { return __int2bfloat16_rn(x); }
|
||||
__device__ static double todouble(bfloat16 x) { return __bfloat162float(x); }
|
||||
__device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return __hadd(a, b); }
|
||||
__device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return __hmul(a, b); }
|
||||
__device__ static bfloat16 make(double x) { return bfloat16(x); }
|
||||
__device__ static bfloat16 make(uint64_t x) { return bfloat16(x); }
|
||||
__device__ static double todouble(bfloat16 x) { return double(x); }
|
||||
__device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return bfloat16(__hadd((float)a, (float)b)); }
|
||||
__device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return bfloat16(__hmul((float)a, (float)b)); }
|
||||
};
|
||||
|
||||
template<typename F>
|
||||
@@ -104,6 +111,17 @@ struct xoshiro256ss {
|
||||
}
|
||||
};
|
||||
|
||||
static __device__ int __reduce_max_sync(unsigned int mask, int value)
|
||||
{
|
||||
//We ignore mask, since all bits are set when calling them in the
|
||||
//test code below.
|
||||
int width = warpSize;
|
||||
for (unsigned int i = warpSize; i; i >>= 1) {
|
||||
value = max(__shfl_down(value, i, width), value);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
template<typename F>
|
||||
__global__ void kernel() {
|
||||
using traits = float_traits<F>;
|
||||
@@ -123,7 +141,7 @@ __global__ void kernel() {
|
||||
for(int round=0; round < 1 + (16<<10)/max_ranks; round++) {
|
||||
//for(int round=0; round < 2; round++) {
|
||||
for(int i=threadIdx.x; i < samps; i += blockDim.x) {
|
||||
accf[i] = 0;
|
||||
accf[i] = (F)0;
|
||||
accd[i] = 0;
|
||||
}
|
||||
__syncthreads();
|
||||
@@ -157,21 +175,21 @@ __global__ void kernel() {
|
||||
if(pass==0)
|
||||
expo_avg = expo_sum/expo_n;
|
||||
else if(threadIdx.x == 0)
|
||||
std::printf(" coef=%1.10f expo=%1.10f\n", coef, expo_avg);
|
||||
printf(" coef=%1.10f expo=%1.10f\n", coef, expo_avg);
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
std::printf("type=float:\n");
|
||||
kernel<float><<<1,32>>>();
|
||||
cudaDeviceSynchronize();
|
||||
hipDeviceSynchronize();
|
||||
|
||||
std::printf("\ntype=half:\n");
|
||||
kernel<half><<<1,32>>>();
|
||||
cudaDeviceSynchronize();
|
||||
hipDeviceSynchronize();
|
||||
|
||||
std::printf("\ntype=bfloat16:\n");
|
||||
kernel<bfloat16><<<1,32>>>();
|
||||
cudaDeviceSynchronize();
|
||||
hipDeviceSynchronize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
+71
-100
@@ -1,15 +1,23 @@
|
||||
#pragma nv_diag_suppress declared_but_not_referenced
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
//#pragma nv_diag_suppress declared_but_not_referenced
|
||||
|
||||
#include "verifiable.h"
|
||||
#include <nccl.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
#include <hip/hip_bfloat16.h>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
#if CUDART_VERSION >= 11000
|
||||
#include <cuda_bf16.h>
|
||||
#endif
|
||||
#include "rccl/rccl.h"
|
||||
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && defined(__CUDA_BF16_TYPES_EXIST__)
|
||||
|
||||
#define RCCL_BFLOAT 1
|
||||
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && RCCL_BFLOAT16 ==1
|
||||
#define HAVE_ncclBfloat16 1
|
||||
#else
|
||||
#define HAVE_ncclBfloat16 0
|
||||
@@ -83,10 +91,10 @@ namespace {
|
||||
template<typename T>
|
||||
struct IsIntegral: std::is_integral<T> {};
|
||||
template<>
|
||||
struct IsIntegral<half>: std::false_type {};
|
||||
#ifdef __CUDA_BF16_TYPES_EXIST__
|
||||
struct IsIntegral<__half>: std::false_type {};
|
||||
#if RCCL_BFLOAT16 == 1
|
||||
template<>
|
||||
struct IsIntegral<__nv_bfloat16>: std::false_type {};
|
||||
struct IsIntegral<hip_bfloat16>: std::false_type {};
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -116,13 +124,13 @@ namespace {
|
||||
return Y(x);
|
||||
}
|
||||
template<>
|
||||
__host__ __device__ half castTo<half>(float x) {
|
||||
__host__ __device__ half castTo<__half>(float x) {
|
||||
return __float2half(x);
|
||||
}
|
||||
#ifdef __CUDA_BF16_TYPES_EXIST__
|
||||
#if RCCL_BFLOAT16 == 1
|
||||
template<>
|
||||
__host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(float x) {
|
||||
return __float2bfloat16(x);
|
||||
__host__ __device__ hip_bfloat16 castTo<hip_bfloat16>(float x) {
|
||||
return hip_bfloat16(x);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -144,20 +152,12 @@ struct ReduceSum {
|
||||
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
|
||||
template<typename T, typename=decltype(T()+T())>
|
||||
__host__ __device__ T operator()(T a, T b) const { return a + b; }
|
||||
__host__ __device__ half operator()(half a, half b) const {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
return __hadd(a, b);
|
||||
#else
|
||||
__host__ __device__ __half operator()(__half a, __half b) const {
|
||||
return __float2half(__half2float(a) + __half2float(b));
|
||||
#endif
|
||||
}
|
||||
#ifdef __CUDA_BF16_TYPES_EXIST__
|
||||
__host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
|
||||
#if __CUDA_ARCH__ >= 800
|
||||
return __hadd(a, b);
|
||||
#else
|
||||
return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b));
|
||||
#endif
|
||||
#if RCCL_BFLOAT16 == 1
|
||||
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
|
||||
return hip_bfloat16(static_cast<float>(a) + static_cast<float>(b));
|
||||
}
|
||||
#endif
|
||||
template<typename T>
|
||||
@@ -168,20 +168,12 @@ struct ReduceProd {
|
||||
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
|
||||
template<typename T, typename=decltype(T()*T())>
|
||||
__host__ __device__ T operator()(T a, T b) const { return a * b; }
|
||||
__host__ __device__ half operator()(half a, half b) const {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
return __hmul(a, b);
|
||||
#else
|
||||
__host__ __device__ __half operator()(__half a, __half b) const {
|
||||
return __float2half(__half2float(a) * __half2float(b));
|
||||
#endif
|
||||
}
|
||||
#ifdef __CUDA_BF16_TYPES_EXIST__
|
||||
__host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
|
||||
#if __CUDA_ARCH__ >= 800
|
||||
return __hmul(a, b);
|
||||
#else
|
||||
return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b));
|
||||
#endif
|
||||
#if RCCL_BFLOAT16 == 1
|
||||
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
|
||||
return hip_bfloat16(static_cast<float>(a) * static_cast<float>(b));
|
||||
}
|
||||
#endif
|
||||
template<typename T>
|
||||
@@ -192,24 +184,12 @@ struct ReduceMin {
|
||||
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
|
||||
template<typename T, typename=decltype(T()<T())>
|
||||
__host__ __device__ T operator()(T a, T b) const { return a < b ? a : b; }
|
||||
__host__ __device__ half operator()(half a, half b) const {
|
||||
#if __CUDA_ARCH__ >= 800
|
||||
return __hmin(a, b);
|
||||
#elif __CUDA_ARCH__ >= 530
|
||||
return __hlt(a, b) ? a : b;
|
||||
#else
|
||||
return __half2float(a) < __half2float(b) ? a : b;
|
||||
#endif
|
||||
__host__ __device__ __half operator()(__half a, __half b) const {
|
||||
return __half2float(a) < __half2float(b) ? a : b;
|
||||
}
|
||||
#ifdef __CUDA_BF16_TYPES_EXIST__
|
||||
__host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
|
||||
#if __CUDA_ARCH__ >= 800
|
||||
return __hmin(a, b);
|
||||
//#elif __CUDA_ARCH__ >= 530
|
||||
// return __hlt(a, b) ? a : b;
|
||||
#else
|
||||
return __bfloat162float(a) < __bfloat162float(b) ? a : b;
|
||||
#endif
|
||||
#if RCCL_BFLOAT16 == 1
|
||||
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
|
||||
return static_cast<float>(a) < static_cast<float>(b) ? a : b;
|
||||
}
|
||||
#endif
|
||||
template<typename T>
|
||||
@@ -220,24 +200,12 @@ struct ReduceMax {
|
||||
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
|
||||
template<typename T, typename=decltype(T()>T())>
|
||||
__host__ __device__ T operator()(T a, T b) const { return a > b ? a : b; }
|
||||
__host__ __device__ half operator()(half a, half b) const {
|
||||
#if __CUDA_ARCH__ >= 800
|
||||
return __hmax(a, b);
|
||||
#elif __CUDA_ARCH__ >= 530
|
||||
return __hgt(a, b) ? a : b;
|
||||
#else
|
||||
__host__ __device__ __half operator()(__half a, __half b) const {
|
||||
return __half2float(a) > __half2float(b) ? a : b;
|
||||
#endif
|
||||
}
|
||||
#ifdef __CUDA_BF16_TYPES_EXIST__
|
||||
__host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
|
||||
#if __CUDA_ARCH__ >= 800
|
||||
return __hmax(a, b);
|
||||
//#elif __CUDA_ARCH__ >= 530
|
||||
// return __hgt(a, b) ? a : b;
|
||||
#else
|
||||
return __bfloat162float(a) > __bfloat162float(b) ? a : b;
|
||||
#endif
|
||||
#if RCCL_BFLOAT16 == 1
|
||||
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
|
||||
return static_cast<float>(a) > static_cast<float>(b) ? a : b;
|
||||
}
|
||||
#endif
|
||||
template<typename T>
|
||||
@@ -309,13 +277,13 @@ struct FloatLayout<double> {
|
||||
static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
|
||||
};
|
||||
template<>
|
||||
struct FloatLayout<half> {
|
||||
struct FloatLayout<__half> {
|
||||
static constexpr int exponent_bits = 5, mantissa_bits = 10;
|
||||
static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
|
||||
};
|
||||
#ifdef __CUDA_BF16_TYPES_EXIST__
|
||||
#if RCCL_BFLOAT16 == 1
|
||||
template<>
|
||||
struct FloatLayout<__nv_bfloat16> {
|
||||
struct FloatLayout<hip_bfloat16> {
|
||||
static constexpr int exponent_bits = 8, mantissa_bits = 7;
|
||||
static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
|
||||
};
|
||||
@@ -340,14 +308,14 @@ namespace {
|
||||
// from unbounded random values. For instance, given X a totally random 32-bit
|
||||
// integer, `umul32hi(X,n)` will be totally random within [0,n).
|
||||
__host__ __device__ uint64_t umul32hi(uint32_t a, uint32_t b) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
#if HIP_VERSION > 50200000
|
||||
return __umulhi(a, b);
|
||||
#else
|
||||
return uint64_t(a)*b >> 32;
|
||||
#endif
|
||||
}
|
||||
__host__ __device__ uint64_t umul64hi(uint64_t a, uint64_t b) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
#if HIP_VERSION > 50200000
|
||||
return __umul64hi(a, b);
|
||||
#else
|
||||
return uint64_t(__uint128_t(a)*__uint128_t(b) >> 64);
|
||||
@@ -355,14 +323,14 @@ __host__ __device__ uint64_t umul64hi(uint64_t a, uint64_t b) {
|
||||
}
|
||||
|
||||
__host__ __device__ int clz32(int x) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
#if HIP_VERSION > 50200000
|
||||
return __clz(x);
|
||||
#else
|
||||
return x==0 ? 32 : __builtin_clz(x);
|
||||
#endif
|
||||
}
|
||||
__host__ __device__ int clz64(long long x) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
#if HIP_VERSION > 50200000
|
||||
return __clzll(x);
|
||||
#else
|
||||
return x==0 ? 64 : __builtin_clzll(x);
|
||||
@@ -747,8 +715,9 @@ __host__ __device__ void genOutput(
|
||||
) {
|
||||
ans = genInOutFloatSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/true);
|
||||
using T1 = typename std::conditional<(sizeof(T)<sizeof(double)), float, double>::type;
|
||||
ans = ReduceProd()(ans, T1(1)/T1(rank_n));
|
||||
}
|
||||
//ans = ReduceProd()(ans, T1(1)/T1(rank_n));
|
||||
ans = ReduceProd()(ans, inhibit(castTo<T>(T1(1)/T1(rank_n))));
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -835,7 +804,7 @@ __global__ void prepareInput2(
|
||||
template<typename ReduceOp>
|
||||
void prepareInput1(
|
||||
void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me,
|
||||
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
|
||||
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
|
||||
) {
|
||||
int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
|
||||
#define CASE_TY(T) prepareInput2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break;
|
||||
@@ -846,9 +815,9 @@ void prepareInput1(
|
||||
case ncclUint32: CASE_TY(uint32_t)
|
||||
case ncclInt64: CASE_TY(int64_t)
|
||||
case ncclUint64: CASE_TY(uint64_t)
|
||||
case ncclFloat16: CASE_TY(half)
|
||||
case ncclFloat16: CASE_TY(__half)
|
||||
#if HAVE_ncclBfloat16
|
||||
case ncclBfloat16: CASE_TY(__nv_bfloat16)
|
||||
case ncclBfloat16: CASE_TY(hip_bfloat16)
|
||||
#endif
|
||||
case ncclFloat32: CASE_TY(float)
|
||||
case ncclFloat64: CASE_TY(double)
|
||||
@@ -860,7 +829,7 @@ void prepareInput1(
|
||||
|
||||
void ncclVerifiablePrepareInput(
|
||||
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
|
||||
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
|
||||
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
|
||||
) {
|
||||
#define CASE_OP(op) \
|
||||
if(rank_n == 1) \
|
||||
@@ -911,7 +880,7 @@ __global__ void prepareExpected2(
|
||||
template<typename ReduceOp>
|
||||
void prepareExpected1(
|
||||
void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
|
||||
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
|
||||
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
|
||||
) {
|
||||
int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
|
||||
#define CASE_TY(T) prepareExpected2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break;
|
||||
@@ -922,9 +891,9 @@ void prepareExpected1(
|
||||
case ncclUint32: CASE_TY(uint32_t)
|
||||
case ncclInt64: CASE_TY(int64_t)
|
||||
case ncclUint64: CASE_TY(uint64_t)
|
||||
case ncclFloat16: CASE_TY(half)
|
||||
case ncclFloat16: CASE_TY(__half)
|
||||
#if HAVE_ncclBfloat16
|
||||
case ncclBfloat16: CASE_TY(__nv_bfloat16)
|
||||
case ncclBfloat16: CASE_TY(hip_bfloat16)
|
||||
#endif
|
||||
case ncclFloat32: CASE_TY(float)
|
||||
case ncclFloat64: CASE_TY(double)
|
||||
@@ -936,7 +905,7 @@ void prepareExpected1(
|
||||
|
||||
void ncclVerifiablePrepareExpected(
|
||||
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
|
||||
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
|
||||
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
|
||||
) {
|
||||
#define CASE_OP(op) \
|
||||
if(rank_n == 1) \
|
||||
@@ -1044,7 +1013,8 @@ __global__ void verifyPrepared(
|
||||
#endif
|
||||
i += blockDim.x;
|
||||
}
|
||||
asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
|
||||
//asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
|
||||
atomicAdd((unsigned long *)bad_elt_n, (unsigned long)bad);
|
||||
}
|
||||
|
||||
template<typename T, typename Uint, typename ReduceFn>
|
||||
@@ -1077,13 +1047,14 @@ __global__ void verifyInline2(
|
||||
#endif
|
||||
i += blockDim.x;
|
||||
}
|
||||
asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
|
||||
//asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
|
||||
atomicAdd((unsigned long*)bad_elt_n, (unsigned long)bad);
|
||||
}
|
||||
|
||||
template<typename T, typename Uint>
|
||||
void verifyInline1(
|
||||
T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
|
||||
unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n
|
||||
unsigned tolerance, int64_t *bad_elt_n, hipStream_t stream, int block_n
|
||||
) {
|
||||
#define CASE_OP(op) \
|
||||
if(rank_n == 1) \
|
||||
@@ -1112,7 +1083,7 @@ void verifyInline1(
|
||||
void ncclVerifiableVerify(
|
||||
void const *results, void const *expected, intptr_t elt_n, int elt_ty,
|
||||
int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
|
||||
int64_t *bad_elt_n, cudaStream_t stream
|
||||
int64_t *bad_elt_n, hipStream_t stream
|
||||
) {
|
||||
bool floating = elt_ty == ncclFloat16 || elt_ty == ncclFloat32 || elt_ty == ncclFloat64;
|
||||
#if HAVE_ncclBfloat16
|
||||
@@ -1142,9 +1113,9 @@ void ncclVerifiableVerify(
|
||||
case ncclUint32: CASE_TY(uint32_t, uint32_t)
|
||||
case ncclInt64: CASE_TY(int64_t, uint64_t)
|
||||
case ncclUint64: CASE_TY(uint64_t, uint64_t)
|
||||
case ncclFloat16: CASE_TY(half, uint16_t)
|
||||
case ncclFloat16: CASE_TY(__half, uint16_t)
|
||||
#if HAVE_ncclBfloat16
|
||||
case ncclBfloat16: CASE_TY(__nv_bfloat16, uint16_t)
|
||||
case ncclBfloat16: CASE_TY(hip_bfloat16, uint16_t)
|
||||
#endif
|
||||
case ncclFloat32: CASE_TY(float, uint32_t)
|
||||
case ncclFloat64: CASE_TY(double, uint64_t)
|
||||
@@ -1180,7 +1151,7 @@ __device__ void sweep2(int ty, char const *tyname, Op op, char const *opname, in
|
||||
}
|
||||
sum = op.postOp(sum);
|
||||
if(tolerance < calcDelta(sum, y)) {
|
||||
std::printf(
|
||||
printf(
|
||||
//"%10g != %10g : T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n",
|
||||
"%llx != %llx : T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n",
|
||||
*(long long*)&sum, *(long long*)&y, tyname, opname, rank_n, ix
|
||||
@@ -1209,9 +1180,9 @@ __global__ void sweep() {
|
||||
sweep1<uint32_t>(ncclUint32, "uint32");
|
||||
sweep1<int64_t>(ncclInt64, "int64");
|
||||
sweep1<uint64_t>(ncclUint64, "uint64");
|
||||
sweep1<half>(ncclFloat16, "half");
|
||||
sweep1<__half>(ncclFloat16, "half");
|
||||
#if HAVE_ncclBfloat16
|
||||
sweep1<__nv_bfloat16>(ncclBfloat16, "bfloat16");
|
||||
sweep1<hip_bfloat16>(ncclBfloat16, "bfloat16");
|
||||
#endif
|
||||
sweep1<float>(ncclFloat32, "float");
|
||||
sweep1<double>(ncclFloat64, "double");
|
||||
@@ -1219,9 +1190,9 @@ __global__ void sweep() {
|
||||
|
||||
int main(int arg_n, char **args) {
|
||||
std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
|
||||
cudaSetDevice(0);
|
||||
hipSetDevice(0);
|
||||
sweep<<<1,512>>>();
|
||||
cudaDeviceSynchronize();
|
||||
hipDeviceSynchronize();
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1,7 +1,14 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef _d41d8cd98f00b204e9800998ecf8427e
|
||||
#define _d41d8cd98f00b204e9800998ecf8427e
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
@@ -36,13 +43,13 @@ __host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
|
||||
// Enqueue kernel to generate data which is to be reduced.
|
||||
void ncclVerifiablePrepareInput(
|
||||
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
|
||||
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
|
||||
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
|
||||
);
|
||||
|
||||
// Enqueue kernel to generate expected results of reduction.
|
||||
void ncclVerifiablePrepareExpected(
|
||||
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
|
||||
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
|
||||
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
|
||||
);
|
||||
|
||||
// Enqueue kernel to verify reduced data matches expectation. The number of
|
||||
@@ -54,6 +61,6 @@ void ncclVerifiablePrepareExpected(
|
||||
void ncclVerifiableVerify(
|
||||
void const *results, void const *expected, intptr_t elt_n, int elt_ty,
|
||||
int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
|
||||
int64_t *bad_elt_n, cudaStream_t stream
|
||||
int64_t *bad_elt_n, hipStream_t stream
|
||||
);
|
||||
#endif
|
||||
|
||||
@@ -1,3 +1,9 @@
|
||||
# Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
# Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
|
||||
|
||||
# We requires both of the following paths to be set upon including this makefile
|
||||
# TEST_VERIFIABLE_SRCDIR = <points to this directory>
|
||||
# TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
|
||||
@@ -8,4 +14,5 @@ TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
|
||||
$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFY_REDUCE_HDRS)
|
||||
@printf "Compiling %s\n" $@
|
||||
@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
|
||||
$(NVCC) -o $@ $(NVCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
|
||||
echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu"
|
||||
$(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
|
||||
|
||||
Ссылка в новой задаче
Block a user