Resync with changes in gitilab-master code
[ROCm/rccl-tests commit: 1ae8cdc315]
This commit is contained in:
@@ -67,6 +67,8 @@ static int parallel_init = 0;
|
||||
static int blocking_coll = 0;
|
||||
static int cudaGraphLaunches = 0;
|
||||
|
||||
#define NUM_BLOCKS 32
|
||||
|
||||
double parsesize(char *value) {
|
||||
long long int units;
|
||||
double size;
|
||||
@@ -137,9 +139,9 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
|
||||
const T* A = (const T*)A_;
|
||||
const T* B = (const T*)B_;
|
||||
__shared__ double temp[BSIZE];
|
||||
int tid = threadIdx.x;
|
||||
int tid = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
double locmax = 0.0;
|
||||
for(int i=tid; i<count; i+=blockDim.x) {
|
||||
for(size_t i=tid; i<count; i+=blockDim.x*gridDim.x) {
|
||||
|
||||
double delta = absDiff(A[i], B[i]);
|
||||
if( delta > locmax ) {
|
||||
@@ -150,6 +152,7 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
|
||||
}
|
||||
}
|
||||
|
||||
tid = threadIdx.x;
|
||||
temp[tid] = locmax;
|
||||
for(int stride = BSIZE/2; stride > 1; stride>>=1) {
|
||||
__syncthreads();
|
||||
@@ -158,38 +161,38 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
|
||||
}
|
||||
__syncthreads();
|
||||
if( threadIdx.x == 0)
|
||||
*max = temp[0] > temp[1] ? temp[0] : temp[1];
|
||||
max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1];
|
||||
}
|
||||
|
||||
|
||||
testResult_t CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) {
|
||||
testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) {
|
||||
switch (type) {
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__)
|
||||
case ncclBfloat16:
|
||||
deltaKern<__nv_bfloat16, 512><<<1, 512>>>(results, expected, count, devmax); break;
|
||||
deltaKern<__nv_bfloat16, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
|
||||
#endif
|
||||
case ncclHalf:
|
||||
deltaKern<half, 512><<<1, 512>>>(results, expected, count, devmax); break;
|
||||
deltaKern<half, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
|
||||
case ncclFloat:
|
||||
deltaKern<float, 512><<<1, 512>>>(results, expected, count, devmax); break;
|
||||
deltaKern<float, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
|
||||
case ncclDouble:
|
||||
deltaKern<double, 512><<<1, 512>>>(results, expected, count, devmax); break;
|
||||
deltaKern<double, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
|
||||
|
||||
case ncclChar:
|
||||
#if NCCL_MAJOR >= 2
|
||||
case ncclUint8:
|
||||
#endif
|
||||
deltaKern<uint8_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
|
||||
deltaKern<uint8_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
|
||||
case ncclInt:
|
||||
#if NCCL_MAJOR >= 2
|
||||
case ncclUint32:
|
||||
#endif
|
||||
deltaKern<uint32_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
|
||||
deltaKern<uint32_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
|
||||
case ncclInt64:
|
||||
case ncclUint64:
|
||||
deltaKern<uint64_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
|
||||
deltaKern<uint64_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
|
||||
}
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
for (int i=1; i<NUM_BLOCKS; i++) devmax[0] = std::max(devmax[0], devmax[i]);
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
@@ -438,8 +441,8 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
|
||||
// Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange
|
||||
size_t totalnbytes = max(args->sendBytes, args->expectedBytes);
|
||||
size_t shift = (totalnbytes * iter) % args->maxbytes;
|
||||
if (shift + totalnbytes > args->maxbytes) shift = 0;
|
||||
size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
|
||||
size_t shift = totalnbytes * (iter % steps);
|
||||
|
||||
if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
|
||||
for (int i = 0; i < args->nGpus; i++) {
|
||||
@@ -475,6 +478,10 @@ testResult_t completeColl(struct threadArgs* args) {
|
||||
|
||||
testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
|
||||
size_t count = args->nbytes / wordSize(type);
|
||||
if (datacheck) {
|
||||
// Initialize sendbuffs, recvbuffs and expected
|
||||
TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place));
|
||||
}
|
||||
|
||||
// Sync
|
||||
TESTCHECK(startColl(args, type, op, root, in_place, 0));
|
||||
@@ -598,10 +605,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
}
|
||||
|
||||
double timeUsec = deltaSec*1.0E6;
|
||||
char timeStr[10];
|
||||
char timeStr[100];
|
||||
if (timeUsec > 10000.0) {
|
||||
sprintf(timeStr, "%7.0f", timeUsec);
|
||||
} else if (timeUsec > 100.0) {
|
||||
} else if (timeUsec >= 100.0) {
|
||||
sprintf(timeStr, "%7.1f", timeUsec);
|
||||
} else {
|
||||
sprintf(timeStr, "%7.2f", timeUsec);
|
||||
@@ -812,31 +819,6 @@ int main(int argc, char* argv[]) {
|
||||
printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n");
|
||||
#endif
|
||||
break;
|
||||
case 'h':
|
||||
printf("USAGE: %s \n\t"
|
||||
"[-t,--nthreads <num threads>] \n\t"
|
||||
"[-g,--ngpus <gpus per thread>] \n\t"
|
||||
"[-b,--minbytes <min size in bytes>] \n\t"
|
||||
"[-e,--maxbytes <max size in bytes>] \n\t"
|
||||
"[-i,--stepbytes <increment size>] \n\t"
|
||||
"[-f,--stepfactor <increment factor>] \n\t"
|
||||
"[-n,--iters <iteration count>] \n\t"
|
||||
"[-m,--agg_iters <aggregated iteration count>] \n\t"
|
||||
"[-w,--warmup_iters <warmup iteration count>] \n\t"
|
||||
"[-p,--parallel_init <0/1>] \n\t"
|
||||
"[-c,--check <0/1>] \n\t"
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
|
||||
"[-o,--op <sum/prod/min/max/avg/all>] \n\t"
|
||||
#else
|
||||
"[-o,--op <sum/prod/min/max/all>] \n\t"
|
||||
#endif
|
||||
"[-d,--datatype <nccltype/all>] \n\t"
|
||||
"[-r,--root <root>] \n\t"
|
||||
"[-z,--blocking <0/1>] \n\t"
|
||||
"[-G,--cudagraph <num graph launches>] \n\t"
|
||||
"[-h,--help]\n",
|
||||
basename(argv[0]));
|
||||
return 0;
|
||||
default:
|
||||
if (c != 'h') printf("invalid option '%c'\n", c);
|
||||
printf("USAGE: %s \n\t"
|
||||
@@ -868,7 +850,8 @@ int main(int argc, char* argv[]) {
|
||||
#ifdef MPI_SUPPORT
|
||||
MPI_Init(&argc, &argv);
|
||||
#endif
|
||||
return run();
|
||||
TESTCHECK(run());
|
||||
return 0;
|
||||
}
|
||||
|
||||
testResult_t run() {
|
||||
@@ -900,6 +883,7 @@ testResult_t run() {
|
||||
#define MAX_LINE 2048
|
||||
char line[MAX_LINE];
|
||||
int len = 0;
|
||||
size_t maxMem = ~0;
|
||||
for (int i=0; i<nThreads*nGpus; i++) {
|
||||
int cudaDev = localRank*nThreads*nGpus+i;
|
||||
int rank = proc*nThreads*nGpus+i;
|
||||
@@ -907,6 +891,7 @@ testResult_t run() {
|
||||
CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
|
||||
len += snprintf(line+len, MAX_LINE-len, "# Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n",
|
||||
rank, getpid(), hostname, cudaDev, prop.pciBusID, prop.name);
|
||||
maxMem = std::min(maxMem, prop.totalGlobalMem);
|
||||
}
|
||||
|
||||
#if MPI_SUPPORT
|
||||
@@ -918,10 +903,18 @@ testResult_t run() {
|
||||
PRINT("%s", lines+MAX_LINE*p);
|
||||
free(lines);
|
||||
}
|
||||
MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);
|
||||
#else
|
||||
PRINT("%s", line);
|
||||
#endif
|
||||
|
||||
// We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for the rest.
|
||||
size_t memMaxBytes = (maxMem - (1<<30)) / (datacheck ? 3 : 2);
|
||||
if (maxBytes > memMaxBytes) {
|
||||
maxBytes = memMaxBytes;
|
||||
if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes);
|
||||
}
|
||||
|
||||
ncclUniqueId ncclId;
|
||||
if (proc == 0) {
|
||||
NCCLCHECK(ncclGetUniqueId(&ncclId));
|
||||
@@ -963,7 +956,7 @@ testResult_t run() {
|
||||
int errors[nThreads];
|
||||
double bw[nThreads];
|
||||
double* delta;
|
||||
CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads, cudaHostAllocPortable | cudaHostAllocMapped));
|
||||
CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, cudaHostAllocPortable | cudaHostAllocMapped));
|
||||
int bw_count[nThreads];
|
||||
for (int t=0; t<nThreads; t++) {
|
||||
bw[t] = 0.0;
|
||||
@@ -1003,7 +996,7 @@ testResult_t run() {
|
||||
threads[t].args.sync = (volatile int*)sync;
|
||||
threads[t].args.sync_idx = 0;
|
||||
threads[t].args.deltaThreads = delta;
|
||||
threads[t].args.deltaHost = (delta + t);
|
||||
threads[t].args.deltaHost = (delta + t*NUM_BLOCKS);
|
||||
threads[t].args.delta = delta;
|
||||
threads[t].args.errors=errors+t;
|
||||
threads[t].args.bw=bw+t;
|
||||
|
||||
@@ -54,8 +54,8 @@ typedef enum {
|
||||
if (r!= testSuccess) { \
|
||||
char hostname[1024]; \
|
||||
getHostName(hostname, 1024); \
|
||||
printf(" .. %s: Test failure %s:%d\n", \
|
||||
hostname, \
|
||||
printf(" .. %s pid %d: Test failure %s:%d\n", \
|
||||
hostname, getpid(), \
|
||||
__FILE__,__LINE__); \
|
||||
return r; \
|
||||
} \
|
||||
@@ -78,6 +78,7 @@ extern struct testColl allGatherTest;
|
||||
extern struct testColl reduceScatterTest;
|
||||
extern struct testColl broadcastTest;
|
||||
extern struct testColl reduceTest;
|
||||
extern struct testColl alltoAllTest;
|
||||
|
||||
struct testEngine {
|
||||
void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks);
|
||||
|
||||
Reference in New Issue
Block a user