Resync with changes in gitilab-master code

[ROCm/rccl-tests commit: 1ae8cdc315]
This commit is contained in:
David Addison
2021-06-28 18:23:12 -07:00
parent d30e35f150
commit 1044cd1f32
2 changed files with 40 additions and 46 deletions
+37 -44
View File
@@ -67,6 +67,8 @@ static int parallel_init = 0;
static int blocking_coll = 0;
static int cudaGraphLaunches = 0;
#define NUM_BLOCKS 32
double parsesize(char *value) {
long long int units;
double size;
@@ -137,9 +139,9 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
const T* A = (const T*)A_;
const T* B = (const T*)B_;
__shared__ double temp[BSIZE];
int tid = threadIdx.x;
int tid = blockIdx.x*blockDim.x + threadIdx.x;
double locmax = 0.0;
for(int i=tid; i<count; i+=blockDim.x) {
for(size_t i=tid; i<count; i+=blockDim.x*gridDim.x) {
double delta = absDiff(A[i], B[i]);
if( delta > locmax ) {
@@ -150,6 +152,7 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
}
}
tid = threadIdx.x;
temp[tid] = locmax;
for(int stride = BSIZE/2; stride > 1; stride>>=1) {
__syncthreads();
@@ -158,38 +161,38 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
}
__syncthreads();
if( threadIdx.x == 0)
*max = temp[0] > temp[1] ? temp[0] : temp[1];
max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1];
}
testResult_t CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) {
testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) {
switch (type) {
#if defined(__CUDA_BF16_TYPES_EXIST__)
case ncclBfloat16:
deltaKern<__nv_bfloat16, 512><<<1, 512>>>(results, expected, count, devmax); break;
deltaKern<__nv_bfloat16, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
#endif
case ncclHalf:
deltaKern<half, 512><<<1, 512>>>(results, expected, count, devmax); break;
deltaKern<half, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
case ncclFloat:
deltaKern<float, 512><<<1, 512>>>(results, expected, count, devmax); break;
deltaKern<float, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
case ncclDouble:
deltaKern<double, 512><<<1, 512>>>(results, expected, count, devmax); break;
deltaKern<double, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
case ncclChar:
#if NCCL_MAJOR >= 2
case ncclUint8:
#endif
deltaKern<uint8_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
deltaKern<uint8_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
case ncclInt:
#if NCCL_MAJOR >= 2
case ncclUint32:
#endif
deltaKern<uint32_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
deltaKern<uint32_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
case ncclInt64:
case ncclUint64:
deltaKern<uint64_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
deltaKern<uint64_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
}
CUDACHECK(cudaDeviceSynchronize());
for (int i=1; i<NUM_BLOCKS; i++) devmax[0] = std::max(devmax[0], devmax[i]);
return testSuccess;
}
@@ -438,8 +441,8 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
// Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange
size_t totalnbytes = max(args->sendBytes, args->expectedBytes);
size_t shift = (totalnbytes * iter) % args->maxbytes;
if (shift + totalnbytes > args->maxbytes) shift = 0;
size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
size_t shift = totalnbytes * (iter % steps);
if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
for (int i = 0; i < args->nGpus; i++) {
@@ -475,6 +478,10 @@ testResult_t completeColl(struct threadArgs* args) {
testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
size_t count = args->nbytes / wordSize(type);
if (datacheck) {
// Initialize sendbuffs, recvbuffs and expected
TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place));
}
// Sync
TESTCHECK(startColl(args, type, op, root, in_place, 0));
@@ -598,10 +605,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
}
double timeUsec = deltaSec*1.0E6;
char timeStr[10];
char timeStr[100];
if (timeUsec > 10000.0) {
sprintf(timeStr, "%7.0f", timeUsec);
} else if (timeUsec > 100.0) {
} else if (timeUsec >= 100.0) {
sprintf(timeStr, "%7.1f", timeUsec);
} else {
sprintf(timeStr, "%7.2f", timeUsec);
@@ -812,31 +819,6 @@ int main(int argc, char* argv[]) {
printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n");
#endif
break;
case 'h':
printf("USAGE: %s \n\t"
"[-t,--nthreads <num threads>] \n\t"
"[-g,--ngpus <gpus per thread>] \n\t"
"[-b,--minbytes <min size in bytes>] \n\t"
"[-e,--maxbytes <max size in bytes>] \n\t"
"[-i,--stepbytes <increment size>] \n\t"
"[-f,--stepfactor <increment factor>] \n\t"
"[-n,--iters <iteration count>] \n\t"
"[-m,--agg_iters <aggregated iteration count>] \n\t"
"[-w,--warmup_iters <warmup iteration count>] \n\t"
"[-p,--parallel_init <0/1>] \n\t"
"[-c,--check <0/1>] \n\t"
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
"[-o,--op <sum/prod/min/max/avg/all>] \n\t"
#else
"[-o,--op <sum/prod/min/max/all>] \n\t"
#endif
"[-d,--datatype <nccltype/all>] \n\t"
"[-r,--root <root>] \n\t"
"[-z,--blocking <0/1>] \n\t"
"[-G,--cudagraph <num graph launches>] \n\t"
"[-h,--help]\n",
basename(argv[0]));
return 0;
default:
if (c != 'h') printf("invalid option '%c'\n", c);
printf("USAGE: %s \n\t"
@@ -868,7 +850,8 @@ int main(int argc, char* argv[]) {
#ifdef MPI_SUPPORT
MPI_Init(&argc, &argv);
#endif
return run();
TESTCHECK(run());
return 0;
}
testResult_t run() {
@@ -900,6 +883,7 @@ testResult_t run() {
#define MAX_LINE 2048
char line[MAX_LINE];
int len = 0;
size_t maxMem = ~0;
for (int i=0; i<nThreads*nGpus; i++) {
int cudaDev = localRank*nThreads*nGpus+i;
int rank = proc*nThreads*nGpus+i;
@@ -907,6 +891,7 @@ testResult_t run() {
CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
len += snprintf(line+len, MAX_LINE-len, "# Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n",
rank, getpid(), hostname, cudaDev, prop.pciBusID, prop.name);
maxMem = std::min(maxMem, prop.totalGlobalMem);
}
#if MPI_SUPPORT
@@ -918,10 +903,18 @@ testResult_t run() {
PRINT("%s", lines+MAX_LINE*p);
free(lines);
}
MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);
#else
PRINT("%s", line);
#endif
// We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for the rest.
size_t memMaxBytes = (maxMem - (1<<30)) / (datacheck ? 3 : 2);
if (maxBytes > memMaxBytes) {
maxBytes = memMaxBytes;
if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes);
}
ncclUniqueId ncclId;
if (proc == 0) {
NCCLCHECK(ncclGetUniqueId(&ncclId));
@@ -963,7 +956,7 @@ testResult_t run() {
int errors[nThreads];
double bw[nThreads];
double* delta;
CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads, cudaHostAllocPortable | cudaHostAllocMapped));
CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, cudaHostAllocPortable | cudaHostAllocMapped));
int bw_count[nThreads];
for (int t=0; t<nThreads; t++) {
bw[t] = 0.0;
@@ -1003,7 +996,7 @@ testResult_t run() {
threads[t].args.sync = (volatile int*)sync;
threads[t].args.sync_idx = 0;
threads[t].args.deltaThreads = delta;
threads[t].args.deltaHost = (delta + t);
threads[t].args.deltaHost = (delta + t*NUM_BLOCKS);
threads[t].args.delta = delta;
threads[t].args.errors=errors+t;
threads[t].args.bw=bw+t;
+3 -2
View File
@@ -54,8 +54,8 @@ typedef enum {
if (r!= testSuccess) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
printf(" .. %s: Test failure %s:%d\n", \
hostname, \
printf(" .. %s pid %d: Test failure %s:%d\n", \
hostname, getpid(), \
__FILE__,__LINE__); \
return r; \
} \
@@ -78,6 +78,7 @@ extern struct testColl allGatherTest;
extern struct testColl reduceScatterTest;
extern struct testColl broadcastTest;
extern struct testColl reduceTest;
extern struct testColl alltoAllTest;
struct testEngine {
void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks);