From fab39367ae3e804f3142baf91969933b8eff684c Mon Sep 17 00:00:00 2001
From: Edgar Gabriel <Edgar.Gabriel@amd.com>
Date: Mon, 17 Oct 2022 14:13:48 +0000
Subject: [PATCH] make rccl-test compile again.

all files compile now.
mpi tests also pass


[ROCm/rccl-tests commit: 641e93e99ccb38920154741e84cf9a10ac3da25b]
---
 projects/rccl-tests/CMakeLists.txt            |   1 +
 projects/rccl-tests/src/Makefile              |   5 +-
 projects/rccl-tests/src/alltoallv.cu          |  48 ++---
 projects/rccl-tests/src/common.cu             | 103 +++++------
 projects/rccl-tests/src/common.h              |  12 +-
 projects/rccl-tests/verifiable/Makefile       |  59 +++++-
 .../rccl-tests/verifiable/inexact_regress.cu  |  56 ++++--
 projects/rccl-tests/verifiable/verifiable.cu  | 171 ++++++++----------
 projects/rccl-tests/verifiable/verifiable.h   |  15 +-
 projects/rccl-tests/verifiable/verifiable.mk  |   9 +-
 10 files changed, 251 insertions(+), 228 deletions(-)
diff --git a/projects/rccl-tests/CMakeLists.txt b/projects/rccl-tests/CMakeLists.txt
index 539a1eae2b..1e22365515 100644
--- a/projects/rccl-tests/CMakeLists.txt
+++ b/projects/rccl-tests/CMakeLists.txt
@@ -51,6 +51,7 @@ endif()
 set(ROCM_USE_DEV_COMPONENT OFF)  # This repo doesn't have a dev component
 
 # Add all of the tests
+add_subdirectory(verifiable)
 add_subdirectory(src)
 
 # Create ROCm standard packages
diff --git a/projects/rccl-tests/src/Makefile b/projects/rccl-tests/src/Makefile
index 0c3c424616..dd01c484f9 100644
--- a/projects/rccl-tests/src/Makefile
+++ b/projects/rccl-tests/src/Makefile
@@ -20,12 +20,11 @@ LDFLAGS    :=
 HIPLDFLAGS :=
 
 ifneq ($(NCCL_HOME), "")
-HIPCUFLAGS += -I$(NCCL_HOME) -I$(NCCL_HOME)/rccl/include
+HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
 HIPLDFLAGS   += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
 endif
 HIPCUFLAGS += -I$(ROCM_PATH)/include
-HIPCUFLAGS += -I$(ROCM_PATH)/include/rccl
-HIPCUFLAGS += -I$(ROCM_PATH)/hip/include/hip
+HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
 LDFLAGS    += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
 HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
 
diff --git a/projects/rccl-tests/src/alltoallv.cu b/projects/rccl-tests/src/alltoallv.cu
index cb8fcaff0d..c5818d9ded 100644
--- a/projects/rccl-tests/src/alltoallv.cu
+++ b/projects/rccl-tests/src/alltoallv.cu
@@ -10,18 +10,6 @@
 
 #define USE_RCCL_GATHER_SCATTER
 
-void print_header() {
-  PRINT("# %10s  %12s  %6s  %6s            out-of-place                       in-place          \n", "", "", "", "");
-  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop",
-        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
-  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
-        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
-}
-
-void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
-  PRINT("%12li  %12li  %6s  %6s", size, count, typeName, opName);
-}
-
 void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
   if (count < nranks*nranks/2) {
     *sendcount = 0;
@@ -45,17 +33,14 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
 
   int k=0;
   for (int i=0; i<args->nGpus; i++) {
-    char* str = getenv("NCCL_TESTS_DEVICE");
-    int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    if (args->enable_multiranks)
-      gpuid = gpuid % args->localNumDevices;
-    HIPCHECK(hipSetDevice(gpuid));
+    HIPCHECK(hipSetDevice(args->gpus[i]));
 
     for (int l=0; l<args->nRanks; l++) {
       int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
       HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
       void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
-      TESTCHECK(InitData(data, sendcount, type, rep, rank));
+      TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0));
+
 #if 0
       int *dataHost = (int *)malloc(args->sendBytes);
       hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost);
@@ -66,24 +51,25 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
       printf("\n");
       free(dataHost);
 #endif
+
       size_t rdisp = 0;
       size_t data_count = sendcount*2/nranks;
       size_t chunksize = data_count/nranks;
       for (int j=0; j<nranks; j++) {
-	size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
-	if ((j+rank)%nranks == 0)
+        size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
+        if ((j+rank)%nranks == 0)
           rcount += (sendcount-chunksize*(nranks-1)*nranks/2);
-	size_t sdisp = 0;
-	for (int k=0; k<nranks; k++) {
-	  scount = ((k+j)%nranks)*chunksize;
-	  if ((k+j)%nranks == 0)
-	    scount += (sendcount-chunksize*(nranks-1)*nranks/2);
-	  if (k == rank)
-	    break;
-	  sdisp += scount;
-	}
-	TESTCHECK(InitData(((char*)args->expected[k])+rdisp*wordSize(type), rcount, type, rep+sdisp, j));
-	rdisp += rcount;
+        size_t sdisp = 0;
+        for (int kk=0; kk<nranks; kk++) {
+          scount = ((kk+j)%nranks)*chunksize;
+          if ((kk+j)%nranks == 0)
+            scount += (sendcount-chunksize*(nranks-1)*nranks/2);
+          if (kk == rank)
+            break;
+          sdisp += scount;
+        }
+        TESTCHECK(InitData(((char*)args->expected[k])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0));
+        rdisp += rcount;
       }
       k++;
     }
diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu
index 5f8d7f58fe..4f80115869 100644
--- a/projects/rccl-tests/src/common.cu
+++ b/projects/rccl-tests/src/common.cu
@@ -165,18 +165,18 @@ static bool minReqVersion(int rmajor, int rminor, int rpatch)
 }
 
 testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) {
-  ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault);
-  CUDACHECK(cudaDeviceSynchronize());
+  ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, hipStreamDefault);
+  HIPCHECK(hipDeviceSynchronize());
   return testSuccess;
 }
 
 testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) {
-  ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault);
+  ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, hipStreamDefault);
   return testSuccess;
 }
 
 testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) {
-  ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault);
+  ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, hipStreamDefault);
   return testSuccess;
 }
 
@@ -271,7 +271,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   size_t count = args->expectedBytes/wordSize(type);
 
   int64_t *wrongPerGpu = nullptr;
-  CUDACHECK(hipHostAlloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), hipHostAllocMapped));
+  HIPCHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), hipHostMallocMapped));
   
   for (int i=0; i<args->nGpus*args->nRanks; i++) {
     int device;
@@ -352,7 +352,7 @@ testResult_t testStreamSynchronize(int nStreams, hipStream_t* streams, ncclComm_
      }
      double delta = tim.elapsed();
      if (delta > timeout && timeout > 0) {
-       for (int i=0; i<ngpus; i++)
+       for (int i=0; i<nStreams; i++)
          NCCLCHECK(ncclCommAbort(comms[i]));
        char hostname[1024];
        getHostName(hostname, 1024);
@@ -387,7 +387,6 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
     int hipDev;
     NCCLCHECK(ncclCommCuDevice(args->comms[i], &hipDev));
     HIPCHECK(hipSetDevice(hipDev));
-    //CUDACHECK(cudaSetDevice(args->gpus[i])); EDGAR CHECK LATER
 #endif
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i);
     char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
@@ -417,7 +416,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       case ncclFloat32: f32 = ncclVerifiablePremulScalar<float>(rank); break;
       case ncclFloat64: f64 = ncclVerifiablePremulScalar<double>(rank); break;
       #if defined(RCCL_BFLOAT16)
-      case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<__nv_bfloat16>(rank); break;
+      case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<rccl_bfloat16>(rank); break;
       #endif
       }
       NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i]));
@@ -452,7 +451,7 @@ testResult_t completeColl(struct threadArgs* args) {
   return testSuccess;
 }
 
-//EDGAR: Revisit because of cudaGraphLaunches
+//RCCL: Revisit because of cudaGraphLaunches
 testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
   size_t count = args->nbytes / wordSize(type);
   if (datacheck) {
@@ -648,7 +647,9 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
     // Benchmark
     for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
         setupArgs(size, type, args);
-        print_line_header(std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root);
+	char rootName[100];
+	sprintf(rootName, "%6i", root);	
+	PRINT("%12li  %12li  %8s  %6s  %6s", (size_t)max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
         TESTCHECK(BenchTime(args, type, op, root, 0));
         TESTCHECK(BenchTime(args, type, op, root, 1));
         PRINT("\n");
@@ -661,10 +662,7 @@ testResult_t threadRunTests(struct threadArgs* args) {
   // Set device to the first of our GPUs. If we don't do that, some operations
   // will be done on the current GPU (by default : 0) and if the GPUs are in
   // exclusive mode those operations will fail.
-  int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus;
-  if (enable_multiranks)
-    gpuid = gpuid % numDevices;
-  HIPCHECK(hipSetDevice(gpuid));
+  HIPCHECK(hipSetDevice(args->gpus[0]));
   TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
   return testSuccess;
 }
@@ -679,11 +677,7 @@ testResult_t threadInit(struct threadArgs* args) {
 
   NCCLCHECK(ncclGroupStart());
   for (int i=0; i<args->nGpus; i++) {
-    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    if (enable_multiranks)
-      gpuid = gpuid % numDevices;
-    HIPCHECK(hipSetDevice(gpuid));
-    //CUDACHECK(cudaSetDevice(args->gpus[i]));
+    HIPCHECK(hipSetDevice(args->gpus[i]));
 
     for (int j=0; j<args->nRanks; j++) {
       int rank = (args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + j;
@@ -715,7 +709,7 @@ testResult_t threadLaunch(struct testThread* thread) {
   return testSuccess;
 }
 
-testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) {
+testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
   if (memorytype == ncclFine) {
     HIPCHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
     HIPCHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
@@ -807,12 +801,10 @@ int main(int argc, char* argv[]) {
 
   while(1) {
     int c;
-    // EDGAR NOTE: y is used by 'memory_type' (a RCCL argument) and 'stream_null' (a NCCL argument)
-    // also not sure about G vs. hG (we had G, they have hG)
 #ifdef RCCL_MULTIRANKPERGPU    
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z :y :T:G:C:a :y :s:u:h:R:x:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:R:x:", longopts, &longindex);
 #else
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z :y :T:G:C:a :y :s:u:h:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:", longopts, &longindex);
 #endif
 
     if (c == -1)
@@ -878,7 +870,7 @@ int main(int argc, char* argv[]) {
       case 'z':
         blocking_coll = strtol(optarg, NULL, 0);
         break;
-      case 'y':
+      case 'Y':
         memorytype = ncclstringtomtype(optarg);
         break;
       case 's':
@@ -946,7 +938,7 @@ int main(int argc, char* argv[]) {
             "[-d,--datatype <nccltype/all>] \n\t"
             "[-r,--root <root>] \n\t"
             "[-z,--blocking <0/1>] \n\t"
-            "[-y,--memory_type <coarse/fine/host/managed>] \n\t"
+            "[-Y,--memory_type <coarse/fine/host/managed>] \n\t"
             "[-s,--stress_cycles <number of cycles>] \n\t"
             "[-u,--cumask <d0,d1,d2,d3>] \n\t"
             "[-y,--stream_null <0/1>] \n\t"
@@ -1084,15 +1076,15 @@ testResult_t run() {
 #ifdef MPI_SUPPORT
   MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, mpi_comm);
 #endif
-<<<<<<< HEAD
-  int gpus[nGpus*nThreads*ranksPerGpu];
+
+  int gpus[nGpus*nThreads];
   hipStream_t streams[nGpus*nThreads*ranksPerGpu];
   void* sendbuffs[nGpus*nThreads*ranksPerGpu];
   void* recvbuffs[nGpus*nThreads*ranksPerGpu];
   void* expected[nGpus*nThreads*ranksPerGpu];
   size_t sendBytes, recvBytes;
 
-  ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads*ranksPerGpu);
+  ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)ncclProcs*nGpus*nThreads*ranksPerGpu);
 
   envstr = getenv("NCCL_TESTS_DEVICE");
   gpu0 = envstr ? atoi(envstr) : -1;
@@ -1101,53 +1093,44 @@ testResult_t run() {
     if (enable_multiranks)
       gpuid = gpuid % numDevices;
 
+    gpus[ii] = gpu0 != -1 ? gpu0+ii : gpuid;
+    HIPCHECK(hipSetDevice(gpus[ii]));
+
     for (int j=0; j<ranksPerGpu; j++) {
       int i = ii*ranksPerGpu+j;
-      gpus[i] = gpu0 != -1 ? gpu0+ii : gpuid;
-      HIPCHECK(hipSetDevice(gpus[i]));
-
-      TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus*ranksPerGpu));
-      //PRINT("sendbuffs[%d]=%p(size=%lu) recvbuffs[%d]=%p(size=%lu)\n", i, sendbuffs[i], sendBytes, i, recvbuffs[i], recvBytes);
-    if (streamnull)
-      streams[i] = NULL;
-    else {
-      if (cumask[0] || cumask[1] || cumask[2] || cumask[3]) {
-	PRINT("cumask: ");
-	for (int i = 0; i < 4 ; i++) PRINT("%x,", cumask[i]);
-	PRINT("\n");
-	HIPCHECK(hipExtStreamCreateWithCUMask(streams+i, 4, cumask));
-      } else
-	HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
+      TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
+      if (streamnull)
+	streams[i] = NULL;
+      else {
+	if (cumask[0] || cumask[1] || cumask[2] || cumask[3]) {
+	  PRINT("cumask: ");
+	  for (int i = 0; i < 4 ; i++) PRINT("%x,", cumask[i]);
+	  PRINT("\n");
+	  HIPCHECK(hipExtStreamCreateWithCUMask(streams+i, 4, cumask));
+	} else
+	  HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
+      }
     }
-#if 0 //EDGAR
-    // initialize data buffer to avoid all zero data
-      TESTCHECK(InitData(sendbuffs[i], sendBytes, ncclUint8, 0, i));
-    }
-    HIPCHECK(hipDeviceSynchronize());
-#endif //EDGAR
   }
 
   //if parallel init is not selected, use main thread to initialize NCCL
   ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus*ranksPerGpu);
   if (!parallel_init) {
-     if (nProcs == 1 && !enable_multiranks) {
+     if (ncclProcs == 1 && !enable_multiranks) {
        NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpus));
      } else {
        NCCLCHECK(ncclGroupStart());
        for (int ii=0; ii<nGpus*nThreads; ii++) {
-	 int gpuid = localRank*nThreads*nGpus+ii;
-         if (enable_multiranks) {
-	   gpuid = gpuid % numDevices;
-	 }
-         HIPCHECK(hipSetDevice(gpuid));
+         HIPCHECK(hipSetDevice(gpus[ii]));
 	 if (!enable_multiranks) {
-	   NCCLCHECK(ncclCommInitRank(comms+ii, nProcs*nThreads*nGpus, ncclId, proc*nThreads*nGpus+ii));
+	   NCCLCHECK(ncclCommInitRank(comms+ii, ncclProcs*nThreads*nGpus, ncclId, proc*nThreads*nGpus+ii));
 	 }
 #ifdef RCCL_MULTIRANKPERGPU
 	 else
 	   for (int j=0; j<ranksPerGpu; j++) {
 	     int i = ii*ranksPerGpu+j;
-	     NCCLCHECK(ncclCommInitRankMulti(comms+i, nProcs*nThreads*nGpus*ranksPerGpu, ncclId, proc*nThreads*nGpus*ranksPerGpu+i, proc*nThreads*nGpus*ranksPerGpu+i));
+	     NCCLCHECK(ncclCommInitRankMulti(comms+i, ncclProcs*nThreads*nGpus*ranksPerGpu, ncclId,
+					     proc*nThreads*nGpus*ranksPerGpu+i, proc*nThreads*nGpus*ranksPerGpu+i));
 	   }
 #endif
        }
@@ -1182,6 +1165,8 @@ testResult_t run() {
     threads[t].args.stepbytes=stepBytes;
     threads[t].args.stepfactor=stepFactor;
     threads[t].args.localRank = localRank;
+
+    threads[t].args.totalProcs = totalProcs;
     threads[t].args.localNumDevices = numDevices;
     threads[t].args.enable_multiranks = enable_multiranks;
     threads[t].args.nRanks = ranksPerGpu;
@@ -1190,7 +1175,7 @@ testResult_t run() {
     threads[t].args.nThreads=nThreads;
     threads[t].args.thread=t;
     threads[t].args.nGpus=nGpus;
-    threads[t].args.gpus=gpus+t*nGpus*ranksPerGpu;
+    threads[t].args.gpus=gpus+t*nGpus;
     threads[t].args.sendbuffs = sendbuffs+t*nGpus*ranksPerGpu;
     threads[t].args.recvbuffs = recvbuffs+t*nGpus*ranksPerGpu;
     threads[t].args.expected = expected+t*nGpus*ranksPerGpu;
diff --git a/projects/rccl-tests/src/common.h b/projects/rccl-tests/src/common.h
index 1d687528b3..cb3bd3f3c2 100644
--- a/projects/rccl-tests/src/common.h
+++ b/projects/rccl-tests/src/common.h
@@ -7,7 +7,7 @@
 #ifndef __COMMON_H__
 #define __COMMON_H__
 
-#include "rccl.h"
+#include "rccl/rccl.h"
 #include <stdio.h>
 #include <cstdint>
 #include <algorithm>
@@ -21,14 +21,14 @@
 // For nccl.h < 2.13 since we define a weak fallback
 extern "C" char const* ncclGetLastError(ncclComm_t comm);
 
-#define HIPCHECK(cmd) do {                         \
-  hipError_t e = cmd;                              \
-  if( e != hipSuccess ) {                          \
+#define HIPCHECK(cmd) do {                          \
+  hipError_t e = cmd;                               \
+  if( e != hipSuccess ) {                           \
     char hostname[1024];                            \
     getHostName(hostname, 1024);                    \
-    printf("%s: Test HIP failure %s:%d '%s'\n",    \
+    printf("%s: Test HIP failure %s:%d '%s'\n",     \
          hostname,                                  \
-        __FILE__,__LINE__,hipGetErrorString(e));   \
+        __FILE__,__LINE__,hipGetErrorString(e));    \
     return testCudaError;                           \
   }                                                 \
 } while(0)
diff --git a/projects/rccl-tests/verifiable/Makefile b/projects/rccl-tests/verifiable/Makefile
index b141a2a7c5..182d44e727 100644
--- a/projects/rccl-tests/verifiable/Makefile
+++ b/projects/rccl-tests/verifiable/Makefile
@@ -1,13 +1,62 @@
-include ../../makefiles/common.mk
+#
+# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+#include ../../makefiles/common.mk
 
 .PHONY: all clean
 
 BUILDDIR := $(abspath ../../build)
-NCCLDIR := $(BUILDDIR)
-NVCUFLAGS += -I$(NCCLDIR)/include/ -I../include
 DST_DIR := $(BUILDDIR)/test/verifiable
 
-all: $(DST_DIR)/self_test $(DST_DIR)/verifiable.o
+ROCM_PATH ?= /opt/rocm
+MPI_HOME ?= /usr/lib/openmpi
+PREFIX ?= /usr/local
+VERBOSE ?= 0
+DEBUG ?= 0
+NCCL_HOME ?= ""
+
+HIPCC = $(ROCM_PATH)/bin/hipcc
+CXX = $(HIPCC)
+
+HIPCUFLAGS := -std=c++14
+LDFLAGS    :=
+HIPLDFLAGS :=
+
+ifneq ($(NCCL_HOME), "")
+HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
+HIPLDFLAGS   += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
+endif
+HIPCUFLAGS += -I$(ROCM_PATH)/include
+HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
+LDFLAGS    += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
+HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
+
+ifeq ($(DEBUG), 0)
+HIPCUFLAGS += -O3
+else
+HIPCUFLAGS += -O0 -g -ggdb3
+endif
+
+ifeq ($(VERBOSE), 0)
+.SILENT:
+endif
+
+ifeq ($(MPI), 1)
+HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
+HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
+else ifeq ($(MPICH), 1)
+HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -I/usr/include/x86_64-linux-gnu/mpich
+HIPLDFLAGS += -L/usr/lib -lmpich
+endif
+
+LIBRARIES += rccl
+HIPLDFLAGS   += $(LIBRARIES:%=-l%)
+
+all: $(DST_DIR)/verifiable.o $(DST_DIR)/self_test 
 
 clean:
 	rm -rf $(DST_DIR)
@@ -21,4 +70,4 @@ self_test: $(DST_DIR)/self_test
 $(DST_DIR)/self_test: verifiable.cu verifiable.h
 	@printf "Linking  %s\n" $@
 	@mkdir -p $(DST_DIR)
-	$(NVCC) -o $@ $(NVCUFLAGS) -DSELF_TEST=1 verifiable.cu $(NVLDFLAGS)
+	$(HIPCC) -o $@ $(HIPCUFLAGS) -DSELF_TEST=1 verifiable.cu $(HIPLDFLAGS)
diff --git a/projects/rccl-tests/verifiable/inexact_regress.cu b/projects/rccl-tests/verifiable/inexact_regress.cu
index d7bd545f62..973b965412 100644
--- a/projects/rccl-tests/verifiable/inexact_regress.cu
+++ b/projects/rccl-tests/verifiable/inexact_regress.cu
@@ -1,3 +1,10 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
 /* Generate parameters for our error bound model of floating point average
  * (sum of scaled values) by sampling sums of random sequences for each
  * floating point type.
@@ -16,12 +23,12 @@
 #include <cmath>
 #include <cstdio>
 #include <cstdint>
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
+#include <hip/hip_bfloat16.h>
+#include <hip/hip_fp16.h>
 
 using std::uint64_t;
 using std::uint32_t;
-using bfloat16 = __nv_bfloat16;
+using bfloat16 = hip_bfloat16;
 
 template<typename T>
 struct float_traits;
@@ -49,26 +56,26 @@ struct float_traits<double> {
   __device__ static double mul(double a, double b) { return a*b; }
 };
 template<>
-struct float_traits<half> {
+struct float_traits<__half> {
   static constexpr int mantissa_bits = 10;
   static constexpr int exponent_bits = 5;
   using uint_t = uint16_t;
-  __device__ static half make(double x) { return __double2half(x); }
-  __device__ static half make(uint64_t x) { return __int2half_rn(x); }
-  __device__ static double todouble(half x) { return __half2float(x); }
-  __device__ static half add(half a, half b) { return __hadd(a, b); }
-  __device__ static half mul(half a, half b) { return __hmul(a, b); }
+  __device__ static __half make(double x) { return __float2half((float)x); }
+  __device__ static __half make(uint64_t x) { return __int2half_rn(x); }
+  __device__ static double todouble(__half x) { return __half2float(x); }
+  __device__ static __half add(__half a, __half b) { return __hadd(a, b); }
+  __device__ static __half mul(__half a, __half b) { return __hmul(a, b); }
 };
 template<>
 struct float_traits<bfloat16> {
   static constexpr int mantissa_bits = 7;
   static constexpr int exponent_bits = 8;
   using uint_t = uint16_t;
-  __device__ static bfloat16 make(double x) { return __double2bfloat16(x); }
-  __device__ static bfloat16 make(uint64_t x) { return __int2bfloat16_rn(x); }
-  __device__ static double todouble(bfloat16 x) { return __bfloat162float(x); }
-  __device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return __hadd(a, b); }
-  __device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return __hmul(a, b); }
+  __device__ static bfloat16 make(double x) { return bfloat16(x); }
+  __device__ static bfloat16 make(uint64_t x) { return bfloat16(x); }
+  __device__ static double todouble(bfloat16 x) { return double(x); }
+  __device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return bfloat16(__hadd((float)a, (float)b)); }
+  __device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return bfloat16(__hmul((float)a, (float)b)); }
 };
 
 template<typename F>
@@ -104,6 +111,17 @@ struct xoshiro256ss {
   }
 };
 
+static __device__ int __reduce_max_sync(unsigned int mask, int value)
+{
+  //We ignore mask, since all bits are set when calling them in the
+  //test code below.
+  int width = warpSize;
+  for (unsigned int i = warpSize; i; i >>= 1) {
+    value = max(__shfl_down(value, i, width), value);
+  }
+  return value;
+}
+
 template<typename F>
 __global__ void kernel() {
   using traits = float_traits<F>;
@@ -123,7 +141,7 @@ __global__ void kernel() {
     for(int round=0; round < 1 + (16<<10)/max_ranks; round++) {
     //for(int round=0; round < 2; round++) {
       for(int i=threadIdx.x; i < samps; i += blockDim.x) {
-        accf[i] = 0;
+        accf[i] = (F)0;
         accd[i] = 0;
       }
       __syncthreads();
@@ -157,21 +175,21 @@ __global__ void kernel() {
     if(pass==0)
       expo_avg = expo_sum/expo_n;
     else if(threadIdx.x == 0)
-      std::printf("  coef=%1.10f expo=%1.10f\n", coef, expo_avg);
+      printf("  coef=%1.10f expo=%1.10f\n", coef, expo_avg);
   }
 }
 
 int main() {
   std::printf("type=float:\n");
   kernel<float><<<1,32>>>();
-  cudaDeviceSynchronize();
+  hipDeviceSynchronize();
 
   std::printf("\ntype=half:\n");
   kernel<half><<<1,32>>>();
-  cudaDeviceSynchronize();
+  hipDeviceSynchronize();
 
   std::printf("\ntype=bfloat16:\n");
   kernel<bfloat16><<<1,32>>>();
-  cudaDeviceSynchronize();
+  hipDeviceSynchronize();
   return 0;
 }
diff --git a/projects/rccl-tests/verifiable/verifiable.cu b/projects/rccl-tests/verifiable/verifiable.cu
index 5f617ee188..9d8e56aba9 100644
--- a/projects/rccl-tests/verifiable/verifiable.cu
+++ b/projects/rccl-tests/verifiable/verifiable.cu
@@ -1,15 +1,23 @@
-#pragma nv_diag_suppress declared_but_not_referenced
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+//#pragma nv_diag_suppress declared_but_not_referenced
 
 #include "verifiable.h"
-#include <nccl.h>
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <hip/hip_bfloat16.h>
 
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#if CUDART_VERSION >= 11000
-#include <cuda_bf16.h>
-#endif
+#include "rccl/rccl.h"
 
-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && defined(__CUDA_BF16_TYPES_EXIST__)
+
+#define RCCL_BFLOAT 1
+
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && RCCL_BFLOAT16 ==1
   #define HAVE_ncclBfloat16 1
 #else
   #define HAVE_ncclBfloat16 0
@@ -83,10 +91,10 @@ namespace {
 template<typename T>
 struct IsIntegral: std::is_integral<T> {};
 template<>
-struct IsIntegral<half>: std::false_type {};
-#ifdef __CUDA_BF16_TYPES_EXIST__
+struct IsIntegral<__half>: std::false_type {};
+#if RCCL_BFLOAT16 == 1
 template<>
-struct IsIntegral<__nv_bfloat16>: std::false_type {};
+struct IsIntegral<hip_bfloat16>: std::false_type {};
 #endif
 }
 
@@ -116,13 +124,13 @@ namespace {
     return Y(x);
   }
   template<>
-  __host__ __device__ half castTo<half>(float x) {
+  __host__ __device__ half castTo<__half>(float x) {
     return __float2half(x);
   }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
+  #if RCCL_BFLOAT16 == 1
   template<>
-  __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(float x) {
-    return __float2bfloat16(x);
+  __host__ __device__ hip_bfloat16 castTo<hip_bfloat16>(float x) {
+    return hip_bfloat16(x);
   }
   #endif
 }
@@ -144,20 +152,12 @@ struct ReduceSum {
   __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
   template<typename T, typename=decltype(T()+T())>
   __host__ __device__ T operator()(T a, T b) const { return a + b; }
-  __host__ __device__ half operator()(half a, half b) const {
-    #if __CUDA_ARCH__ >= 530
-      return __hadd(a, b);
-    #else
+  __host__ __device__ __half operator()(__half a, __half b) const {
       return __float2half(__half2float(a) + __half2float(b));
-    #endif
   }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
-  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
-    #if __CUDA_ARCH__ >= 800
-      return __hadd(a, b);
-    #else
-      return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b));
-    #endif
+  #if RCCL_BFLOAT16 == 1
+  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+      return hip_bfloat16(static_cast<float>(a) + static_cast<float>(b));
   }
   #endif
   template<typename T>
@@ -168,20 +168,12 @@ struct ReduceProd {
   __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
   template<typename T, typename=decltype(T()*T())>
   __host__ __device__ T operator()(T a, T b) const { return a * b; }
-  __host__ __device__ half operator()(half a, half b) const {
-    #if __CUDA_ARCH__ >= 530
-      return __hmul(a, b);
-    #else
+  __host__ __device__ __half operator()(__half a, __half b) const {
       return __float2half(__half2float(a) * __half2float(b));
-    #endif
   }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
-  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
-    #if __CUDA_ARCH__ >= 800
-      return __hmul(a, b);
-    #else
-      return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b));
-    #endif
+  #if RCCL_BFLOAT16 == 1
+  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+      return hip_bfloat16(static_cast<float>(a) * static_cast<float>(b));
   }
   #endif
   template<typename T>
@@ -192,24 +184,12 @@ struct ReduceMin {
   __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
   template<typename T, typename=decltype(T()<T())>
   __host__ __device__ T operator()(T a, T b) const { return a < b ? a : b; }
-  __host__ __device__ half operator()(half a, half b) const {
-    #if __CUDA_ARCH__ >= 800
-      return __hmin(a, b);
-    #elif __CUDA_ARCH__ >= 530
-      return __hlt(a, b) ? a : b;
-    #else
-      return __half2float(a) < __half2float(b) ? a : b;
-    #endif
+  __host__ __device__ __half operator()(__half a, __half b) const {
+    return __half2float(a) < __half2float(b) ? a : b;
   }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
-  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
-    #if __CUDA_ARCH__ >= 800
-      return __hmin(a, b);
-    //#elif __CUDA_ARCH__ >= 530
-    //  return __hlt(a, b) ? a : b;
-    #else
-      return __bfloat162float(a) < __bfloat162float(b) ? a : b;
-    #endif
+  #if RCCL_BFLOAT16 == 1
+  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+      return static_cast<float>(a) < static_cast<float>(b) ? a : b;
   }
   #endif
   template<typename T>
@@ -220,24 +200,12 @@ struct ReduceMax {
   __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
   template<typename T, typename=decltype(T()>T())>
   __host__ __device__ T operator()(T a, T b) const { return a > b ? a : b; }
-  __host__ __device__ half operator()(half a, half b) const {
-    #if __CUDA_ARCH__ >= 800
-      return __hmax(a, b);
-    #elif __CUDA_ARCH__ >= 530
-      return __hgt(a, b) ? a : b;
-    #else
+  __host__ __device__ __half operator()(__half a, __half b) const {
       return __half2float(a) > __half2float(b) ? a : b;
-    #endif
   }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
-  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
-    #if __CUDA_ARCH__ >= 800
-      return __hmax(a, b);
-    //#elif __CUDA_ARCH__ >= 530
-    //  return __hgt(a, b) ? a : b;
-    #else
-      return __bfloat162float(a) > __bfloat162float(b) ? a : b;
-    #endif
+  #if RCCL_BFLOAT16 == 1
+  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+      return static_cast<float>(a) > static_cast<float>(b) ? a : b;
   }
   #endif
   template<typename T>
@@ -309,13 +277,13 @@ struct FloatLayout<double> {
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
 template<>
-struct FloatLayout<half> {
+struct FloatLayout<__half> {
   static constexpr int exponent_bits = 5, mantissa_bits = 10;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
-#ifdef __CUDA_BF16_TYPES_EXIST__
+#if RCCL_BFLOAT16 == 1
 template<>
-struct FloatLayout<__nv_bfloat16> {
+struct FloatLayout<hip_bfloat16> {
   static constexpr int exponent_bits = 8, mantissa_bits = 7;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
@@ -340,14 +308,14 @@ namespace {
 // from unbounded random values. For instance, given X a totally random 32-bit
 // integer, `umul32hi(X,n)` will be totally random within [0,n).
 __host__ __device__ uint64_t umul32hi(uint32_t a, uint32_t b) {
-#ifdef __CUDA_ARCH__
+#if HIP_VERSION > 50200000
   return __umulhi(a, b);
 #else
   return uint64_t(a)*b >> 32;
 #endif
 }
 __host__ __device__ uint64_t umul64hi(uint64_t a, uint64_t b) {
-#ifdef __CUDA_ARCH__
+#if HIP_VERSION > 50200000
   return __umul64hi(a, b);
 #else
   return uint64_t(__uint128_t(a)*__uint128_t(b) >> 64);
@@ -355,14 +323,14 @@ __host__ __device__ uint64_t umul64hi(uint64_t a, uint64_t b) {
 }
 
 __host__ __device__ int clz32(int x) {
-#ifdef __CUDA_ARCH__
+#if HIP_VERSION > 50200000
   return __clz(x);
 #else
   return x==0 ? 32 : __builtin_clz(x);
 #endif
 }
 __host__ __device__ int clz64(long long x) {
-#ifdef __CUDA_ARCH__
+#if HIP_VERSION > 50200000
   return __clzll(x);
 #else
   return x==0 ? 64 : __builtin_clzll(x);
@@ -747,8 +715,9 @@ __host__ __device__ void genOutput(
   ) {
   ans = genInOutFloatSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/true);
   using T1 = typename std::conditional<(sizeof(T)<sizeof(double)), float, double>::type;
-  ans = ReduceProd()(ans, T1(1)/T1(rank_n));
-}
+  //ans = ReduceProd()(ans, T1(1)/T1(rank_n));
+  ans = ReduceProd()(ans, inhibit(castTo<T>(T1(1)/T1(rank_n))));
+ }
 }
 
 /////////////////////////////////////////////////////////////////////////////////
@@ -835,7 +804,7 @@ __global__ void prepareInput2(
 template<typename ReduceOp>
 void prepareInput1(
     void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me,
-    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
   ) {
   int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
   #define CASE_TY(T) prepareInput2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break;
@@ -846,9 +815,9 @@ void prepareInput1(
   case ncclUint32: CASE_TY(uint32_t)
   case ncclInt64: CASE_TY(int64_t)
   case ncclUint64: CASE_TY(uint64_t)
-  case ncclFloat16: CASE_TY(half)
+  case ncclFloat16: CASE_TY(__half)
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(__nv_bfloat16)
+  case ncclBfloat16: CASE_TY(hip_bfloat16)
   #endif
   case ncclFloat32: CASE_TY(float)
   case ncclFloat64: CASE_TY(double)
@@ -860,7 +829,7 @@ void prepareInput1(
 
 void ncclVerifiablePrepareInput(
     void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
-    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
   ) {
   #define CASE_OP(op) \
     if(rank_n == 1) \
@@ -911,7 +880,7 @@ __global__ void prepareExpected2(
 template<typename ReduceOp>
 void prepareExpected1(
     void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
-    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
   ) {
   int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
   #define CASE_TY(T) prepareExpected2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break;
@@ -922,9 +891,9 @@ void prepareExpected1(
   case ncclUint32: CASE_TY(uint32_t)
   case ncclInt64: CASE_TY(int64_t)
   case ncclUint64: CASE_TY(uint64_t)
-  case ncclFloat16: CASE_TY(half)
+  case ncclFloat16: CASE_TY(__half)
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(__nv_bfloat16)
+  case ncclBfloat16: CASE_TY(hip_bfloat16)
   #endif
   case ncclFloat32: CASE_TY(float)
   case ncclFloat64: CASE_TY(double)
@@ -936,7 +905,7 @@ void prepareExpected1(
 
 void ncclVerifiablePrepareExpected(
     void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
-    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
   ) {
   #define CASE_OP(op) \
     if(rank_n == 1) \
@@ -1044,7 +1013,8 @@ __global__ void verifyPrepared(
     #endif
     i += blockDim.x;
   }
-  asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
+  //asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
+  atomicAdd((unsigned long *)bad_elt_n, (unsigned long)bad);
 }
 
 template<typename T, typename Uint, typename ReduceFn>
@@ -1077,13 +1047,14 @@ __global__ void verifyInline2(
     #endif
     i += blockDim.x;
   }
-  asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
+  //asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
+  atomicAdd((unsigned long*)bad_elt_n, (unsigned long)bad);
 }
 
 template<typename T, typename Uint>
 void verifyInline1(
     T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
-    unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n
+    unsigned tolerance, int64_t *bad_elt_n, hipStream_t stream, int block_n
   ) {
   #define CASE_OP(op) \
     if(rank_n == 1) \
@@ -1112,7 +1083,7 @@ void verifyInline1(
 void ncclVerifiableVerify(
     void const *results, void const *expected, intptr_t elt_n, int elt_ty,
     int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
-    int64_t *bad_elt_n, cudaStream_t stream
+    int64_t *bad_elt_n, hipStream_t stream
   ) {
   bool floating = elt_ty == ncclFloat16 || elt_ty == ncclFloat32 || elt_ty == ncclFloat64;
   #if HAVE_ncclBfloat16
@@ -1142,9 +1113,9 @@ void ncclVerifiableVerify(
   case ncclUint32: CASE_TY(uint32_t, uint32_t)
   case ncclInt64: CASE_TY(int64_t, uint64_t)
   case ncclUint64: CASE_TY(uint64_t, uint64_t)
-  case ncclFloat16: CASE_TY(half, uint16_t)
+  case ncclFloat16: CASE_TY(__half, uint16_t)
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(__nv_bfloat16, uint16_t)
+  case ncclBfloat16: CASE_TY(hip_bfloat16, uint16_t)
   #endif
   case ncclFloat32: CASE_TY(float, uint32_t)
   case ncclFloat64: CASE_TY(double, uint64_t)
@@ -1180,7 +1151,7 @@ __device__ void sweep2(int ty, char const *tyname, Op op, char const *opname, in
     }
     sum = op.postOp(sum);
     if(tolerance < calcDelta(sum, y)) {
-      std::printf(
+      printf(
         //"%10g != %10g  :  T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n",
         "%llx != %llx  :  T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n",
         *(long long*)&sum, *(long long*)&y, tyname, opname, rank_n, ix
@@ -1209,9 +1180,9 @@ __global__ void sweep() {
   sweep1<uint32_t>(ncclUint32, "uint32");
   sweep1<int64_t>(ncclInt64, "int64");
   sweep1<uint64_t>(ncclUint64, "uint64");
-  sweep1<half>(ncclFloat16, "half");
+  sweep1<__half>(ncclFloat16, "half");
   #if HAVE_ncclBfloat16
-    sweep1<__nv_bfloat16>(ncclBfloat16, "bfloat16");
+    sweep1<hip_bfloat16>(ncclBfloat16, "bfloat16");
   #endif
   sweep1<float>(ncclFloat32, "float");
   sweep1<double>(ncclFloat64, "double");
@@ -1219,9 +1190,9 @@ __global__ void sweep() {
 
 int main(int arg_n, char **args) {
   std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
-  cudaSetDevice(0);
+  hipSetDevice(0);
   sweep<<<1,512>>>();
-  cudaDeviceSynchronize();
+  hipDeviceSynchronize();
   return 0;
 }
 #endif
diff --git a/projects/rccl-tests/verifiable/verifiable.h b/projects/rccl-tests/verifiable/verifiable.h
index aca0565a6b..b41ef1ad12 100644
--- a/projects/rccl-tests/verifiable/verifiable.h
+++ b/projects/rccl-tests/verifiable/verifiable.h
@@ -1,7 +1,14 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
 #ifndef _d41d8cd98f00b204e9800998ecf8427e
 #define _d41d8cd98f00b204e9800998ecf8427e
 
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 
 #include <stdint.h>
 
@@ -36,13 +43,13 @@ __host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
 // Enqueue kernel to generate data which is to be reduced.
 void ncclVerifiablePrepareInput(
   void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
-  uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+  uint64_t seed, intptr_t elt_ix0, hipStream_t stream
 );
 
 // Enqueue kernel to generate expected results of reduction.
 void ncclVerifiablePrepareExpected(
   void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
-  uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+  uint64_t seed, intptr_t elt_ix0, hipStream_t stream
 );
 
 // Enqueue kernel to verify reduced data matches expectation. The number of
@@ -54,6 +61,6 @@ void ncclVerifiablePrepareExpected(
 void ncclVerifiableVerify(
   void const *results, void const *expected, intptr_t elt_n, int elt_ty,
   int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
-  int64_t *bad_elt_n, cudaStream_t stream
+  int64_t *bad_elt_n, hipStream_t stream
 );
 #endif
diff --git a/projects/rccl-tests/verifiable/verifiable.mk b/projects/rccl-tests/verifiable/verifiable.mk
index 225c32a3c3..fba1fbf35c 100644
--- a/projects/rccl-tests/verifiable/verifiable.mk
+++ b/projects/rccl-tests/verifiable/verifiable.mk
@@ -1,3 +1,9 @@
+# Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+# Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE.txt for license information
+
+
 # We requires both of the following paths to be set upon including this makefile
 # TEST_VERIFIABLE_SRCDIR = <points to this directory>
 # TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
@@ -8,4 +14,5 @@ TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
 $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFY_REDUCE_HDRS)
 	@printf "Compiling %s\n" $@
 	@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
-	$(NVCC) -o $@ $(NVCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
+	echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu"
+	$(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu