Resync with changes in gitilab-master code

[ROCm/rccl-tests commit: 1ae8cdc315]
2021-06-28 18:23:12 -07:00
parent d30e35f150
commit 1044cd1f32
2 changed files with 40 additions and 46 deletions
@@ -67,6 +67,8 @@ static int parallel_init = 0;
 static int blocking_coll = 0;
 static int cudaGraphLaunches = 0;

+#define NUM_BLOCKS 32
+
 double parsesize(char *value) {
    long long int units;
    double size;
@@ -137,9 +139,9 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
  const T* A = (const T*)A_;
  const T* B = (const T*)B_;
  __shared__ double temp[BSIZE];
-  int tid = threadIdx.x;
+  int tid = blockIdx.x*blockDim.x + threadIdx.x;
  double locmax = 0.0;
-  for(int i=tid; i<count; i+=blockDim.x) {
+  for(size_t i=tid; i<count; i+=blockDim.x*gridDim.x) {

    double delta = absDiff(A[i], B[i]);
    if( delta > locmax ) {
@@ -150,6 +152,7 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
    }
  }

+  tid = threadIdx.x;
  temp[tid] = locmax;
  for(int stride = BSIZE/2; stride > 1; stride>>=1) {
    __syncthreads();
@@ -158,38 +161,38 @@ void deltaKern(void* A_, void* B_, size_t count, double* max) {
  }
  __syncthreads();
  if( threadIdx.x == 0)
-    *max = temp[0] > temp[1] ? temp[0] : temp[1];
+    max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1];
 }

-
-testResult_t CheckDelta(void* expected, void* results, size_t count, ncclDataType_t type, double* devmax) {
+testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) {
  switch (type) {
 #if defined(__CUDA_BF16_TYPES_EXIST__)
    case ncclBfloat16:
-      deltaKern<__nv_bfloat16, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<__nv_bfloat16, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
 #endif
    case ncclHalf:
-      deltaKern<half, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<half, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
    case ncclFloat:
-      deltaKern<float, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<float, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
    case ncclDouble:
-      deltaKern<double, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<double, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;

    case ncclChar:
 #if NCCL_MAJOR >= 2
    case ncclUint8:
 #endif
-      deltaKern<uint8_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<uint8_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
    case ncclInt:
 #if NCCL_MAJOR >= 2
    case ncclUint32:
 #endif
-      deltaKern<uint32_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<uint32_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
    case ncclInt64:
    case ncclUint64:
-      deltaKern<uint64_t, 512><<<1, 512>>>(results, expected, count, devmax); break;
+      deltaKern<uint64_t, 512><<<NUM_BLOCKS, 512>>>(results, expected, count, devmax); break;
  }
  CUDACHECK(cudaDeviceSynchronize());
+  for (int i=1; i<NUM_BLOCKS; i++) devmax[0] = std::max(devmax[0], devmax[i]);
  return testSuccess;
 }

@@ -438,8 +441,8 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t

  // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange
  size_t totalnbytes = max(args->sendBytes, args->expectedBytes);
-  size_t shift = (totalnbytes * iter) % args->maxbytes;
-  if (shift + totalnbytes > args->maxbytes) shift = 0;
+  size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
+  size_t shift = totalnbytes * (iter % steps);

  if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
  for (int i = 0; i < args->nGpus; i++) {
@@ -475,6 +478,10 @@ testResult_t completeColl(struct threadArgs* args) {

 testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
  size_t count = args->nbytes / wordSize(type);
+  if (datacheck) {
+    // Initialize sendbuffs, recvbuffs and expected
+    TESTCHECK(args->collTest->initData(args, type, op, root, 99, in_place));
+  }

  // Sync
  TESTCHECK(startColl(args, type, op, root, in_place, 0));
@@ -598,10 +605,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
  }

  double timeUsec = deltaSec*1.0E6;
-  char timeStr[10];
+  char timeStr[100];
  if (timeUsec > 10000.0) {
    sprintf(timeStr, "%7.0f", timeUsec);
-  } else if (timeUsec > 100.0) {
+  } else if (timeUsec >= 100.0) {
    sprintf(timeStr, "%7.1f", timeUsec);
  } else {
    sprintf(timeStr, "%7.2f", timeUsec);
@@ -812,31 +819,6 @@ int main(int argc, char* argv[]) {
        printf("Option -G (CUDA graph) not supported before NCCL 2.9 + CUDA 11.3. Ignoring\n");
 #endif
        break;
-      case 'h':
-	printf("USAGE: %s \n\t"
-            "[-t,--nthreads <num threads>] \n\t"
-            "[-g,--ngpus <gpus per thread>] \n\t"
-            "[-b,--minbytes <min size in bytes>] \n\t"
-            "[-e,--maxbytes <max size in bytes>] \n\t"
-            "[-i,--stepbytes <increment size>] \n\t"
-            "[-f,--stepfactor <increment factor>] \n\t"
-            "[-n,--iters <iteration count>] \n\t"
-            "[-m,--agg_iters <aggregated iteration count>] \n\t"
-            "[-w,--warmup_iters <warmup iteration count>] \n\t"
-            "[-p,--parallel_init <0/1>] \n\t"
-            "[-c,--check <0/1>] \n\t"
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
-            "[-o,--op <sum/prod/min/max/avg/all>] \n\t"
-#else
-            "[-o,--op <sum/prod/min/max/all>] \n\t"
-#endif
-            "[-d,--datatype <nccltype/all>] \n\t"
-            "[-r,--root <root>] \n\t"
-            "[-z,--blocking <0/1>] \n\t"
-            "[-G,--cudagraph <num graph launches>] \n\t"
-            "[-h,--help]\n",
-	    basename(argv[0]));
-	return 0;
      default:
        if (c != 'h') printf("invalid option '%c'\n", c);
        printf("USAGE: %s \n\t"
@@ -868,7 +850,8 @@ int main(int argc, char* argv[]) {
 #ifdef MPI_SUPPORT
  MPI_Init(&argc, &argv);
 #endif
-  return run();
+  TESTCHECK(run());
+  return 0;
 }

 testResult_t run() {
@@ -900,6 +883,7 @@ testResult_t run() {
 #define MAX_LINE 2048
  char line[MAX_LINE];
  int len = 0;
+  size_t maxMem = ~0;
  for (int i=0; i<nThreads*nGpus; i++) {
    int cudaDev = localRank*nThreads*nGpus+i;
    int rank = proc*nThreads*nGpus+i;
@@ -907,6 +891,7 @@ testResult_t run() {
    CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
    len += snprintf(line+len, MAX_LINE-len, "#   Rank %2d Pid %6d on %10s device %2d [0x%02x] %s\n",
                    rank, getpid(), hostname, cudaDev, prop.pciBusID, prop.name);
+    maxMem = std::min(maxMem, prop.totalGlobalMem);
  }

 #if MPI_SUPPORT
@@ -918,10 +903,18 @@ testResult_t run() {
      PRINT("%s", lines+MAX_LINE*p);
    free(lines);
  }
+  MPI_Allreduce(MPI_IN_PLACE, &maxMem, 1, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);
 #else
  PRINT("%s", line);
 #endif

+  // We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for the rest.
+  size_t memMaxBytes = (maxMem - (1<<30)) / (datacheck ? 3 : 2);
+  if (maxBytes > memMaxBytes) {
+    maxBytes = memMaxBytes;
+    if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes);
+  }
+
  ncclUniqueId ncclId;
  if (proc == 0) {
    NCCLCHECK(ncclGetUniqueId(&ncclId));
@@ -963,7 +956,7 @@ testResult_t run() {
  int errors[nThreads];
  double bw[nThreads];
  double* delta;
-  CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads, cudaHostAllocPortable | cudaHostAllocMapped));
+  CUDACHECK(cudaHostAlloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, cudaHostAllocPortable | cudaHostAllocMapped));
  int bw_count[nThreads];
  for (int t=0; t<nThreads; t++) {
    bw[t] = 0.0;
@@ -1003,7 +996,7 @@ testResult_t run() {
    threads[t].args.sync = (volatile int*)sync;
    threads[t].args.sync_idx = 0;
    threads[t].args.deltaThreads = delta;
-    threads[t].args.deltaHost = (delta + t);
+    threads[t].args.deltaHost = (delta + t*NUM_BLOCKS);
    threads[t].args.delta = delta;
    threads[t].args.errors=errors+t;
    threads[t].args.bw=bw+t;
@@ -54,8 +54,8 @@ typedef enum {
  if (r!= testSuccess) {                            \
    char hostname[1024];                            \
    getHostName(hostname, 1024);                    \
-    printf(" .. %s: Test failure %s:%d\n",          \
-         hostname,                                  \
+    printf(" .. %s pid %d: Test failure %s:%d\n",   \
+         hostname, getpid(),                        \
        __FILE__,__LINE__);                         \
    return r;                                       \
  }                                                 \
@@ -78,6 +78,7 @@ extern struct testColl allGatherTest;
 extern struct testColl reduceScatterTest;
 extern struct testColl broadcastTest;
 extern struct testColl reduceTest;
+extern struct testColl alltoAllTest;

 struct testEngine {
  void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks);