make rccl-test compile again.

all files compile now. mpi tests also pass
2022-10-17 14:13:48 +00:00
@@ -51,6 +51,7 @@ endif()
 set(ROCM_USE_DEV_COMPONENT OFF)  # This repo doesn't have a dev component

 # Add all of the tests
+add_subdirectory(verifiable)
 add_subdirectory(src)

 # Create ROCm standard packages
@@ -20,12 +20,11 @@ LDFLAGS    :=
 HIPLDFLAGS :=

 ifneq ($(NCCL_HOME), "")
-HIPCUFLAGS += -I$(NCCL_HOME) -I$(NCCL_HOME)/rccl/include
+HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
 HIPLDFLAGS   += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
 endif
 HIPCUFLAGS += -I$(ROCM_PATH)/include
-HIPCUFLAGS += -I$(ROCM_PATH)/include/rccl
-HIPCUFLAGS += -I$(ROCM_PATH)/hip/include/hip
+HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
 LDFLAGS    += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
 HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt

@@ -10,18 +10,6 @@

 #define USE_RCCL_GATHER_SCATTER

-void print_header() {
-  PRINT("# %10s  %12s  %6s  %6s            out-of-place                       in-place          \n", "", "", "", "");
-  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "size", "count", "type", "redop",
-        "time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
-  PRINT("# %10s  %12s  %6s  %6s  %7s  %6s  %6s  %5s  %7s  %6s  %6s  %5s\n", "(B)", "(elements)", "", "",
-        "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
-}
-
-void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
-  PRINT("%12li  %12li  %6s  %6s", size, count, typeName, opName);
-}
-
 void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  if (count < nranks*nranks/2) {
    *sendcount = 0;
@@ -45,17 +33,14 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc

  int k=0;
  for (int i=0; i<args->nGpus; i++) {
-    char* str = getenv("NCCL_TESTS_DEVICE");
-    int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    if (args->enable_multiranks)
-      gpuid = gpuid % args->localNumDevices;
-    HIPCHECK(hipSetDevice(gpuid));
+    HIPCHECK(hipSetDevice(args->gpus[i]));

    for (int l=0; l<args->nRanks; l++) {
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
-      TESTCHECK(InitData(data, sendcount, type, rep, rank));
+      TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0));
+
 #if 0
      int *dataHost = (int *)malloc(args->sendBytes);
      hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost);
@@ -66,24 +51,25 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
      printf("\n");
      free(dataHost);
 #endif
+
      size_t rdisp = 0;
      size_t data_count = sendcount*2/nranks;
      size_t chunksize = data_count/nranks;
      for (int j=0; j<nranks; j++) {
-	size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
-	if ((j+rank)%nranks == 0)
+        size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
+        if ((j+rank)%nranks == 0)
          rcount += (sendcount-chunksize*(nranks-1)*nranks/2);
-	size_t sdisp = 0;
-	for (int k=0; k<nranks; k++) {
-	  scount = ((k+j)%nranks)*chunksize;
-	  if ((k+j)%nranks == 0)
-	    scount += (sendcount-chunksize*(nranks-1)*nranks/2);
-	  if (k == rank)
-	    break;
-	  sdisp += scount;
-	}
-	TESTCHECK(InitData(((char*)args->expected[k])+rdisp*wordSize(type), rcount, type, rep+sdisp, j));
-	rdisp += rcount;
+        size_t sdisp = 0;
+        for (int kk=0; kk<nranks; kk++) {
+          scount = ((kk+j)%nranks)*chunksize;
+          if ((kk+j)%nranks == 0)
+            scount += (sendcount-chunksize*(nranks-1)*nranks/2);
+          if (kk == rank)
+            break;
+          sdisp += scount;
+        }
+        TESTCHECK(InitData(((char*)args->expected[k])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0));
+        rdisp += rcount;
      }
      k++;
    }
@@ -165,18 +165,18 @@ static bool minReqVersion(int rmajor, int rminor, int rpatch)
 }

 testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) {
-  ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault);
-  CUDACHECK(cudaDeviceSynchronize());
+  ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, hipStreamDefault);
+  HIPCHECK(hipDeviceSynchronize());
  return testSuccess;
 }

 testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) {
-  ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault);
+  ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, hipStreamDefault);
  return testSuccess;
 }

 testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) {
-  ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault);
+  ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, hipStreamDefault);
  return testSuccess;
 }

@@ -271,7 +271,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
  size_t count = args->expectedBytes/wordSize(type);

  int64_t *wrongPerGpu = nullptr;
-  CUDACHECK(hipHostAlloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), hipHostAllocMapped));
+  HIPCHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), hipHostMallocMapped));
  
  for (int i=0; i<args->nGpus*args->nRanks; i++) {
    int device;
@@ -352,7 +352,7 @@ testResult_t testStreamSynchronize(int nStreams, hipStream_t* streams, ncclComm_
     }
     double delta = tim.elapsed();
     if (delta > timeout && timeout > 0) {
-       for (int i=0; i<ngpus; i++)
+       for (int i=0; i<nStreams; i++)
         NCCLCHECK(ncclCommAbort(comms[i]));
       char hostname[1024];
       getHostName(hostname, 1024);
@@ -387,7 +387,6 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
    int hipDev;
    NCCLCHECK(ncclCommCuDevice(args->comms[i], &hipDev));
    HIPCHECK(hipSetDevice(hipDev));
-    //CUDACHECK(cudaSetDevice(args->gpus[i])); EDGAR CHECK LATER
 #endif
    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i);
    char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
@@ -417,7 +416,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
      case ncclFloat32: f32 = ncclVerifiablePremulScalar<float>(rank); break;
      case ncclFloat64: f64 = ncclVerifiablePremulScalar<double>(rank); break;
      #if defined(RCCL_BFLOAT16)
-      case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<__nv_bfloat16>(rank); break;
+      case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<rccl_bfloat16>(rank); break;
      #endif
      }
      NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i]));
@@ -452,7 +451,7 @@ testResult_t completeColl(struct threadArgs* args) {
  return testSuccess;
 }

-//EDGAR: Revisit because of cudaGraphLaunches
+//RCCL: Revisit because of cudaGraphLaunches
 testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place) {
  size_t count = args->nbytes / wordSize(type);
  if (datacheck) {
@@ -648,7 +647,9 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
    // Benchmark
    for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
        setupArgs(size, type, args);
-        print_line_header(std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, root);
+	char rootName[100];
+	sprintf(rootName, "%6i", root);	
+	PRINT("%12li  %12li  %8s  %6s  %6s", (size_t)max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
        TESTCHECK(BenchTime(args, type, op, root, 0));
        TESTCHECK(BenchTime(args, type, op, root, 1));
        PRINT("\n");
@@ -661,10 +662,7 @@ testResult_t threadRunTests(struct threadArgs* args) {
  // Set device to the first of our GPUs. If we don't do that, some operations
  // will be done on the current GPU (by default : 0) and if the GPUs are in
  // exclusive mode those operations will fail.
-  int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus;
-  if (enable_multiranks)
-    gpuid = gpuid % numDevices;
-  HIPCHECK(hipSetDevice(gpuid));
+  HIPCHECK(hipSetDevice(args->gpus[0]));
  TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
  return testSuccess;
 }
@@ -679,11 +677,7 @@ testResult_t threadInit(struct threadArgs* args) {

  NCCLCHECK(ncclGroupStart());
  for (int i=0; i<args->nGpus; i++) {
-    int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    if (enable_multiranks)
-      gpuid = gpuid % numDevices;
-    HIPCHECK(hipSetDevice(gpuid));
-    //CUDACHECK(cudaSetDevice(args->gpus[i]));
+    HIPCHECK(hipSetDevice(args->gpus[i]));

    for (int j=0; j<args->nRanks; j++) {
      int rank = (args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + j;
@@ -715,7 +709,7 @@ testResult_t threadLaunch(struct testThread* thread) {
  return testSuccess;
 }

-testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, int nranks) {
+testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
  if (memorytype == ncclFine) {
    HIPCHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
    HIPCHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
@@ -807,12 +801,10 @@ int main(int argc, char* argv[]) {

  while(1) {
    int c;
-    // EDGAR NOTE: y is used by 'memory_type' (a RCCL argument) and 'stream_null' (a NCCL argument)
-    // also not sure about G vs. hG (we had G, they have hG)
 #ifdef RCCL_MULTIRANKPERGPU    
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z :y :T:G:C:a :y :s:u:h:R:x:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:R:x:", longopts, &longindex);
 #else
-    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z :y :T:G:C:a :y :s:u:h:", longopts, &longindex);
+    c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:Y:T:G:C:a:y:s:u:h:", longopts, &longindex);
 #endif

    if (c == -1)
@@ -878,7 +870,7 @@ int main(int argc, char* argv[]) {
      case 'z':
        blocking_coll = strtol(optarg, NULL, 0);
        break;
-      case 'y':
+      case 'Y':
        memorytype = ncclstringtomtype(optarg);
        break;
      case 's':
@@ -946,7 +938,7 @@ int main(int argc, char* argv[]) {
            "[-d,--datatype <nccltype/all>] \n\t"
            "[-r,--root <root>] \n\t"
            "[-z,--blocking <0/1>] \n\t"
-            "[-y,--memory_type <coarse/fine/host/managed>] \n\t"
+            "[-Y,--memory_type <coarse/fine/host/managed>] \n\t"
            "[-s,--stress_cycles <number of cycles>] \n\t"
            "[-u,--cumask <d0,d1,d2,d3>] \n\t"
            "[-y,--stream_null <0/1>] \n\t"
@@ -1084,15 +1076,15 @@ testResult_t run() {
 #ifdef MPI_SUPPORT
  MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, mpi_comm);
 #endif
-<<<<<<< HEAD
-  int gpus[nGpus*nThreads*ranksPerGpu];
+
+  int gpus[nGpus*nThreads];
  hipStream_t streams[nGpus*nThreads*ranksPerGpu];
  void* sendbuffs[nGpus*nThreads*ranksPerGpu];
  void* recvbuffs[nGpus*nThreads*ranksPerGpu];
  void* expected[nGpus*nThreads*ranksPerGpu];
  size_t sendBytes, recvBytes;

-  ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads*ranksPerGpu);
+  ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)ncclProcs*nGpus*nThreads*ranksPerGpu);

  envstr = getenv("NCCL_TESTS_DEVICE");
  gpu0 = envstr ? atoi(envstr) : -1;
@@ -1101,53 +1093,44 @@ testResult_t run() {
    if (enable_multiranks)
      gpuid = gpuid % numDevices;

+    gpus[ii] = gpu0 != -1 ? gpu0+ii : gpuid;
+    HIPCHECK(hipSetDevice(gpus[ii]));
+
    for (int j=0; j<ranksPerGpu; j++) {
      int i = ii*ranksPerGpu+j;
-      gpus[i] = gpu0 != -1 ? gpu0+ii : gpuid;
-      HIPCHECK(hipSetDevice(gpus[i]));
-
-      TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus*ranksPerGpu));
-      //PRINT("sendbuffs[%d]=%p(size=%lu) recvbuffs[%d]=%p(size=%lu)\n", i, sendbuffs[i], sendBytes, i, recvbuffs[i], recvBytes);
-    if (streamnull)
-      streams[i] = NULL;
-    else {
-      if (cumask[0] || cumask[1] || cumask[2] || cumask[3]) {
-	PRINT("cumask: ");
-	for (int i = 0; i < 4 ; i++) PRINT("%x,", cumask[i]);
-	PRINT("\n");
-	HIPCHECK(hipExtStreamCreateWithCUMask(streams+i, 4, cumask));
-      } else
-	HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
+      TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
+      if (streamnull)
+	streams[i] = NULL;
+      else {
+	if (cumask[0] || cumask[1] || cumask[2] || cumask[3]) {
+	  PRINT("cumask: ");
+	  for (int i = 0; i < 4 ; i++) PRINT("%x,", cumask[i]);
+	  PRINT("\n");
+	  HIPCHECK(hipExtStreamCreateWithCUMask(streams+i, 4, cumask));
+	} else
+	  HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
+      }
    }
-#if 0 //EDGAR
-    // initialize data buffer to avoid all zero data
-      TESTCHECK(InitData(sendbuffs[i], sendBytes, ncclUint8, 0, i));
-    }
-    HIPCHECK(hipDeviceSynchronize());
-#endif //EDGAR
  }

  //if parallel init is not selected, use main thread to initialize NCCL
  ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus*ranksPerGpu);
  if (!parallel_init) {
-     if (nProcs == 1 && !enable_multiranks) {
+     if (ncclProcs == 1 && !enable_multiranks) {
       NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpus));
     } else {
       NCCLCHECK(ncclGroupStart());
       for (int ii=0; ii<nGpus*nThreads; ii++) {
-	 int gpuid = localRank*nThreads*nGpus+ii;
-         if (enable_multiranks) {
-	   gpuid = gpuid % numDevices;
-	 }
-         HIPCHECK(hipSetDevice(gpuid));
+         HIPCHECK(hipSetDevice(gpus[ii]));
 	 if (!enable_multiranks) {
-	   NCCLCHECK(ncclCommInitRank(comms+ii, nProcs*nThreads*nGpus, ncclId, proc*nThreads*nGpus+ii));
+	   NCCLCHECK(ncclCommInitRank(comms+ii, ncclProcs*nThreads*nGpus, ncclId, proc*nThreads*nGpus+ii));
 	 }
 #ifdef RCCL_MULTIRANKPERGPU
 	 else
 	   for (int j=0; j<ranksPerGpu; j++) {
 	     int i = ii*ranksPerGpu+j;
-	     NCCLCHECK(ncclCommInitRankMulti(comms+i, nProcs*nThreads*nGpus*ranksPerGpu, ncclId, proc*nThreads*nGpus*ranksPerGpu+i, proc*nThreads*nGpus*ranksPerGpu+i));
+	     NCCLCHECK(ncclCommInitRankMulti(comms+i, ncclProcs*nThreads*nGpus*ranksPerGpu, ncclId,
+					     proc*nThreads*nGpus*ranksPerGpu+i, proc*nThreads*nGpus*ranksPerGpu+i));
 	   }
 #endif
       }
@@ -1182,6 +1165,8 @@ testResult_t run() {
    threads[t].args.stepbytes=stepBytes;
    threads[t].args.stepfactor=stepFactor;
    threads[t].args.localRank = localRank;
+
+    threads[t].args.totalProcs = totalProcs;
    threads[t].args.localNumDevices = numDevices;
    threads[t].args.enable_multiranks = enable_multiranks;
    threads[t].args.nRanks = ranksPerGpu;
@@ -1190,7 +1175,7 @@ testResult_t run() {
    threads[t].args.nThreads=nThreads;
    threads[t].args.thread=t;
    threads[t].args.nGpus=nGpus;
-    threads[t].args.gpus=gpus+t*nGpus*ranksPerGpu;
+    threads[t].args.gpus=gpus+t*nGpus;
    threads[t].args.sendbuffs = sendbuffs+t*nGpus*ranksPerGpu;
    threads[t].args.recvbuffs = recvbuffs+t*nGpus*ranksPerGpu;
    threads[t].args.expected = expected+t*nGpus*ranksPerGpu;
@@ -7,7 +7,7 @@
 #ifndef __COMMON_H__
 #define __COMMON_H__

-#include "rccl.h"
+#include "rccl/rccl.h"
 #include <stdio.h>
 #include <cstdint>
 #include <algorithm>
@@ -21,14 +21,14 @@
 // For nccl.h < 2.13 since we define a weak fallback
 extern "C" char const* ncclGetLastError(ncclComm_t comm);

-#define HIPCHECK(cmd) do {                         \
-  hipError_t e = cmd;                              \
-  if( e != hipSuccess ) {                          \
+#define HIPCHECK(cmd) do {                          \
+  hipError_t e = cmd;                               \
+  if( e != hipSuccess ) {                           \
    char hostname[1024];                            \
    getHostName(hostname, 1024);                    \
-    printf("%s: Test HIP failure %s:%d '%s'\n",    \
+    printf("%s: Test HIP failure %s:%d '%s'\n",     \
         hostname,                                  \
-        __FILE__,__LINE__,hipGetErrorString(e));   \
+        __FILE__,__LINE__,hipGetErrorString(e));    \
    return testCudaError;                           \
  }                                                 \
 } while(0)
@@ -1,13 +1,62 @@
-include ../../makefiles/common.mk
+#
+# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+#include ../../makefiles/common.mk

 .PHONY: all clean

 BUILDDIR := $(abspath ../../build)
-NCCLDIR := $(BUILDDIR)
-NVCUFLAGS += -I$(NCCLDIR)/include/ -I../include
 DST_DIR := $(BUILDDIR)/test/verifiable

-all: $(DST_DIR)/self_test $(DST_DIR)/verifiable.o
+ROCM_PATH ?= /opt/rocm
+MPI_HOME ?= /usr/lib/openmpi
+PREFIX ?= /usr/local
+VERBOSE ?= 0
+DEBUG ?= 0
+NCCL_HOME ?= ""
+
+HIPCC = $(ROCM_PATH)/bin/hipcc
+CXX = $(HIPCC)
+
+HIPCUFLAGS := -std=c++14
+LDFLAGS    :=
+HIPLDFLAGS :=
+
+ifneq ($(NCCL_HOME), "")
+HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
+HIPLDFLAGS   += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
+endif
+HIPCUFLAGS += -I$(ROCM_PATH)/include
+HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
+LDFLAGS    += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
+HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
+
+ifeq ($(DEBUG), 0)
+HIPCUFLAGS += -O3
+else
+HIPCUFLAGS += -O0 -g -ggdb3
+endif
+
+ifeq ($(VERBOSE), 0)
+.SILENT:
+endif
+
+ifeq ($(MPI), 1)
+HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
+HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
+else ifeq ($(MPICH), 1)
+HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -I/usr/include/x86_64-linux-gnu/mpich
+HIPLDFLAGS += -L/usr/lib -lmpich
+endif
+
+LIBRARIES += rccl
+HIPLDFLAGS   += $(LIBRARIES:%=-l%)
+
+all: $(DST_DIR)/verifiable.o $(DST_DIR)/self_test 

 clean:
 	rm -rf $(DST_DIR)
@@ -21,4 +70,4 @@ self_test: $(DST_DIR)/self_test
 $(DST_DIR)/self_test: verifiable.cu verifiable.h
 	@printf "Linking  %s\n" $@
 	@mkdir -p $(DST_DIR)
-	$(NVCC) -o $@ $(NVCUFLAGS) -DSELF_TEST=1 verifiable.cu $(NVLDFLAGS)
+	$(HIPCC) -o $@ $(HIPCUFLAGS) -DSELF_TEST=1 verifiable.cu $(HIPLDFLAGS)
@@ -1,3 +1,10 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
 /* Generate parameters for our error bound model of floating point average
 * (sum of scaled values) by sampling sums of random sequences for each
 * floating point type.
@@ -16,12 +23,12 @@
 #include <cmath>
 #include <cstdio>
 #include <cstdint>
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
+#include <hip/hip_bfloat16.h>
+#include <hip/hip_fp16.h>

 using std::uint64_t;
 using std::uint32_t;
-using bfloat16 = __nv_bfloat16;
+using bfloat16 = hip_bfloat16;

 template<typename T>
 struct float_traits;
@@ -49,26 +56,26 @@ struct float_traits<double> {
  __device__ static double mul(double a, double b) { return a*b; }
 };
 template<>
-struct float_traits<half> {
+struct float_traits<__half> {
  static constexpr int mantissa_bits = 10;
  static constexpr int exponent_bits = 5;
  using uint_t = uint16_t;
-  __device__ static half make(double x) { return __double2half(x); }
-  __device__ static half make(uint64_t x) { return __int2half_rn(x); }
-  __device__ static double todouble(half x) { return __half2float(x); }
-  __device__ static half add(half a, half b) { return __hadd(a, b); }
-  __device__ static half mul(half a, half b) { return __hmul(a, b); }
+  __device__ static __half make(double x) { return __float2half((float)x); }
+  __device__ static __half make(uint64_t x) { return __int2half_rn(x); }
+  __device__ static double todouble(__half x) { return __half2float(x); }
+  __device__ static __half add(__half a, __half b) { return __hadd(a, b); }
+  __device__ static __half mul(__half a, __half b) { return __hmul(a, b); }
 };
 template<>
 struct float_traits<bfloat16> {
  static constexpr int mantissa_bits = 7;
  static constexpr int exponent_bits = 8;
  using uint_t = uint16_t;
-  __device__ static bfloat16 make(double x) { return __double2bfloat16(x); }
-  __device__ static bfloat16 make(uint64_t x) { return __int2bfloat16_rn(x); }
-  __device__ static double todouble(bfloat16 x) { return __bfloat162float(x); }
-  __device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return __hadd(a, b); }
-  __device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return __hmul(a, b); }
+  __device__ static bfloat16 make(double x) { return bfloat16(x); }
+  __device__ static bfloat16 make(uint64_t x) { return bfloat16(x); }
+  __device__ static double todouble(bfloat16 x) { return double(x); }
+  __device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return bfloat16(__hadd((float)a, (float)b)); }
+  __device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return bfloat16(__hmul((float)a, (float)b)); }
 };

 template<typename F>
@@ -104,6 +111,17 @@ struct xoshiro256ss {
  }
 };

+static __device__ int __reduce_max_sync(unsigned int mask, int value)
+{
+  //We ignore mask, since all bits are set when calling them in the
+  //test code below.
+  int width = warpSize;
+  for (unsigned int i = warpSize; i; i >>= 1) {
+    value = max(__shfl_down(value, i, width), value);
+  }
+  return value;
+}
+
 template<typename F>
 __global__ void kernel() {
  using traits = float_traits<F>;
@@ -123,7 +141,7 @@ __global__ void kernel() {
    for(int round=0; round < 1 + (16<<10)/max_ranks; round++) {
    //for(int round=0; round < 2; round++) {
      for(int i=threadIdx.x; i < samps; i += blockDim.x) {
-        accf[i] = 0;
+        accf[i] = (F)0;
        accd[i] = 0;
      }
      __syncthreads();
@@ -157,21 +175,21 @@ __global__ void kernel() {
    if(pass==0)
      expo_avg = expo_sum/expo_n;
    else if(threadIdx.x == 0)
-      std::printf("  coef=%1.10f expo=%1.10f\n", coef, expo_avg);
+      printf("  coef=%1.10f expo=%1.10f\n", coef, expo_avg);
  }
 }

 int main() {
  std::printf("type=float:\n");
  kernel<float><<<1,32>>>();
-  cudaDeviceSynchronize();
+  hipDeviceSynchronize();

  std::printf("\ntype=half:\n");
  kernel<half><<<1,32>>>();
-  cudaDeviceSynchronize();
+  hipDeviceSynchronize();

  std::printf("\ntype=bfloat16:\n");
  kernel<bfloat16><<<1,32>>>();
-  cudaDeviceSynchronize();
+  hipDeviceSynchronize();
  return 0;
 }
@@ -1,15 +1,23 @@
-#pragma nv_diag_suppress declared_but_not_referenced
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+//#pragma nv_diag_suppress declared_but_not_referenced

 #include "verifiable.h"
-#include <nccl.h>
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <hip/hip_bfloat16.h>

-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#if CUDART_VERSION >= 11000
-#include <cuda_bf16.h>
-#endif
+#include "rccl/rccl.h"

-#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && defined(__CUDA_BF16_TYPES_EXIST__)
+
+#define RCCL_BFLOAT 1
+
+#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && RCCL_BFLOAT16 ==1
  #define HAVE_ncclBfloat16 1
 #else
  #define HAVE_ncclBfloat16 0
@@ -83,10 +91,10 @@ namespace {
 template<typename T>
 struct IsIntegral: std::is_integral<T> {};
 template<>
-struct IsIntegral<half>: std::false_type {};
-#ifdef __CUDA_BF16_TYPES_EXIST__
+struct IsIntegral<__half>: std::false_type {};
+#if RCCL_BFLOAT16 == 1
 template<>
-struct IsIntegral<__nv_bfloat16>: std::false_type {};
+struct IsIntegral<hip_bfloat16>: std::false_type {};
 #endif
 }

@@ -116,13 +124,13 @@ namespace {
    return Y(x);
  }
  template<>
-  __host__ __device__ half castTo<half>(float x) {
+  __host__ __device__ half castTo<__half>(float x) {
    return __float2half(x);
  }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
+  #if RCCL_BFLOAT16 == 1
  template<>
-  __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(float x) {
-    return __float2bfloat16(x);
+  __host__ __device__ hip_bfloat16 castTo<hip_bfloat16>(float x) {
+    return hip_bfloat16(x);
  }
  #endif
 }
@@ -144,20 +152,12 @@ struct ReduceSum {
  __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
  template<typename T, typename=decltype(T()+T())>
  __host__ __device__ T operator()(T a, T b) const { return a + b; }
-  __host__ __device__ half operator()(half a, half b) const {
-    #if __CUDA_ARCH__ >= 530
-      return __hadd(a, b);
-    #else
+  __host__ __device__ __half operator()(__half a, __half b) const {
      return __float2half(__half2float(a) + __half2float(b));
-    #endif
  }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
-  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
-    #if __CUDA_ARCH__ >= 800
-      return __hadd(a, b);
-    #else
-      return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b));
-    #endif
+  #if RCCL_BFLOAT16 == 1
+  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+      return hip_bfloat16(static_cast<float>(a) + static_cast<float>(b));
  }
  #endif
  template<typename T>
@@ -168,20 +168,12 @@ struct ReduceProd {
  __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
  template<typename T, typename=decltype(T()*T())>
  __host__ __device__ T operator()(T a, T b) const { return a * b; }
-  __host__ __device__ half operator()(half a, half b) const {
-    #if __CUDA_ARCH__ >= 530
-      return __hmul(a, b);
-    #else
+  __host__ __device__ __half operator()(__half a, __half b) const {
      return __float2half(__half2float(a) * __half2float(b));
-    #endif
  }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
-  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
-    #if __CUDA_ARCH__ >= 800
-      return __hmul(a, b);
-    #else
-      return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b));
-    #endif
+  #if RCCL_BFLOAT16 == 1
+  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+      return hip_bfloat16(static_cast<float>(a) * static_cast<float>(b));
  }
  #endif
  template<typename T>
@@ -192,24 +184,12 @@ struct ReduceMin {
  __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
  template<typename T, typename=decltype(T()<T())>
  __host__ __device__ T operator()(T a, T b) const { return a < b ? a : b; }
-  __host__ __device__ half operator()(half a, half b) const {
-    #if __CUDA_ARCH__ >= 800
-      return __hmin(a, b);
-    #elif __CUDA_ARCH__ >= 530
-      return __hlt(a, b) ? a : b;
-    #else
-      return __half2float(a) < __half2float(b) ? a : b;
-    #endif
+  __host__ __device__ __half operator()(__half a, __half b) const {
+    return __half2float(a) < __half2float(b) ? a : b;
  }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
-  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
-    #if __CUDA_ARCH__ >= 800
-      return __hmin(a, b);
-    //#elif __CUDA_ARCH__ >= 530
-    //  return __hlt(a, b) ? a : b;
-    #else
-      return __bfloat162float(a) < __bfloat162float(b) ? a : b;
-    #endif
+  #if RCCL_BFLOAT16 == 1
+  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+      return static_cast<float>(a) < static_cast<float>(b) ? a : b;
  }
  #endif
  template<typename T>
@@ -220,24 +200,12 @@ struct ReduceMax {
  __host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
  template<typename T, typename=decltype(T()>T())>
  __host__ __device__ T operator()(T a, T b) const { return a > b ? a : b; }
-  __host__ __device__ half operator()(half a, half b) const {
-    #if __CUDA_ARCH__ >= 800
-      return __hmax(a, b);
-    #elif __CUDA_ARCH__ >= 530
-      return __hgt(a, b) ? a : b;
-    #else
+  __host__ __device__ __half operator()(__half a, __half b) const {
      return __half2float(a) > __half2float(b) ? a : b;
-    #endif
  }
-  #ifdef __CUDA_BF16_TYPES_EXIST__
-  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
-    #if __CUDA_ARCH__ >= 800
-      return __hmax(a, b);
-    //#elif __CUDA_ARCH__ >= 530
-    //  return __hgt(a, b) ? a : b;
-    #else
-      return __bfloat162float(a) > __bfloat162float(b) ? a : b;
-    #endif
+  #if RCCL_BFLOAT16 == 1
+  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+      return static_cast<float>(a) > static_cast<float>(b) ? a : b;
  }
  #endif
  template<typename T>
@@ -309,13 +277,13 @@ struct FloatLayout<double> {
  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
 template<>
-struct FloatLayout<half> {
+struct FloatLayout<__half> {
  static constexpr int exponent_bits = 5, mantissa_bits = 10;
  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
-#ifdef __CUDA_BF16_TYPES_EXIST__
+#if RCCL_BFLOAT16 == 1
 template<>
-struct FloatLayout<__nv_bfloat16> {
+struct FloatLayout<hip_bfloat16> {
  static constexpr int exponent_bits = 8, mantissa_bits = 7;
  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
@@ -340,14 +308,14 @@ namespace {
 // from unbounded random values. For instance, given X a totally random 32-bit
 // integer, `umul32hi(X,n)` will be totally random within [0,n).
 __host__ __device__ uint64_t umul32hi(uint32_t a, uint32_t b) {
-#ifdef __CUDA_ARCH__
+#if HIP_VERSION > 50200000
  return __umulhi(a, b);
 #else
  return uint64_t(a)*b >> 32;
 #endif
 }
 __host__ __device__ uint64_t umul64hi(uint64_t a, uint64_t b) {
-#ifdef __CUDA_ARCH__
+#if HIP_VERSION > 50200000
  return __umul64hi(a, b);
 #else
  return uint64_t(__uint128_t(a)*__uint128_t(b) >> 64);
@@ -355,14 +323,14 @@ __host__ __device__ uint64_t umul64hi(uint64_t a, uint64_t b) {
 }

 __host__ __device__ int clz32(int x) {
-#ifdef __CUDA_ARCH__
+#if HIP_VERSION > 50200000
  return __clz(x);
 #else
  return x==0 ? 32 : __builtin_clz(x);
 #endif
 }
 __host__ __device__ int clz64(long long x) {
-#ifdef __CUDA_ARCH__
+#if HIP_VERSION > 50200000
  return __clzll(x);
 #else
  return x==0 ? 64 : __builtin_clzll(x);
@@ -747,8 +715,9 @@ __host__ __device__ void genOutput(
  ) {
  ans = genInOutFloatSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/true);
  using T1 = typename std::conditional<(sizeof(T)<sizeof(double)), float, double>::type;
-  ans = ReduceProd()(ans, T1(1)/T1(rank_n));
-}
+  //ans = ReduceProd()(ans, T1(1)/T1(rank_n));
+  ans = ReduceProd()(ans, inhibit(castTo<T>(T1(1)/T1(rank_n))));
+ }
 }

 /////////////////////////////////////////////////////////////////////////////////
@@ -835,7 +804,7 @@ __global__ void prepareInput2(
 template<typename ReduceOp>
 void prepareInput1(
    void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me,
-    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
  ) {
  int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
  #define CASE_TY(T) prepareInput2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break;
@@ -846,9 +815,9 @@ void prepareInput1(
  case ncclUint32: CASE_TY(uint32_t)
  case ncclInt64: CASE_TY(int64_t)
  case ncclUint64: CASE_TY(uint64_t)
-  case ncclFloat16: CASE_TY(half)
+  case ncclFloat16: CASE_TY(__half)
  #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(__nv_bfloat16)
+  case ncclBfloat16: CASE_TY(hip_bfloat16)
  #endif
  case ncclFloat32: CASE_TY(float)
  case ncclFloat64: CASE_TY(double)
@@ -860,7 +829,7 @@ void prepareInput1(

 void ncclVerifiablePrepareInput(
    void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
-    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
  ) {
  #define CASE_OP(op) \
    if(rank_n == 1) \
@@ -911,7 +880,7 @@ __global__ void prepareExpected2(
 template<typename ReduceOp>
 void prepareExpected1(
    void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
-    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
  ) {
  int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
  #define CASE_TY(T) prepareExpected2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break;
@@ -922,9 +891,9 @@ void prepareExpected1(
  case ncclUint32: CASE_TY(uint32_t)
  case ncclInt64: CASE_TY(int64_t)
  case ncclUint64: CASE_TY(uint64_t)
-  case ncclFloat16: CASE_TY(half)
+  case ncclFloat16: CASE_TY(__half)
  #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(__nv_bfloat16)
+  case ncclBfloat16: CASE_TY(hip_bfloat16)
  #endif
  case ncclFloat32: CASE_TY(float)
  case ncclFloat64: CASE_TY(double)
@@ -936,7 +905,7 @@ void prepareExpected1(

 void ncclVerifiablePrepareExpected(
    void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
-    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
  ) {
  #define CASE_OP(op) \
    if(rank_n == 1) \
@@ -1044,7 +1013,8 @@ __global__ void verifyPrepared(
    #endif
    i += blockDim.x;
  }
-  asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
+  //asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
+  atomicAdd((unsigned long *)bad_elt_n, (unsigned long)bad);
 }

 template<typename T, typename Uint, typename ReduceFn>
@@ -1077,13 +1047,14 @@ __global__ void verifyInline2(
    #endif
    i += blockDim.x;
  }
-  asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
+  //asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
+  atomicAdd((unsigned long*)bad_elt_n, (unsigned long)bad);
 }

 template<typename T, typename Uint>
 void verifyInline1(
    T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
-    unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n
+    unsigned tolerance, int64_t *bad_elt_n, hipStream_t stream, int block_n
  ) {
  #define CASE_OP(op) \
    if(rank_n == 1) \
@@ -1112,7 +1083,7 @@ void verifyInline1(
 void ncclVerifiableVerify(
    void const *results, void const *expected, intptr_t elt_n, int elt_ty,
    int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
-    int64_t *bad_elt_n, cudaStream_t stream
+    int64_t *bad_elt_n, hipStream_t stream
  ) {
  bool floating = elt_ty == ncclFloat16 || elt_ty == ncclFloat32 || elt_ty == ncclFloat64;
  #if HAVE_ncclBfloat16
@@ -1142,9 +1113,9 @@ void ncclVerifiableVerify(
  case ncclUint32: CASE_TY(uint32_t, uint32_t)
  case ncclInt64: CASE_TY(int64_t, uint64_t)
  case ncclUint64: CASE_TY(uint64_t, uint64_t)
-  case ncclFloat16: CASE_TY(half, uint16_t)
+  case ncclFloat16: CASE_TY(__half, uint16_t)
  #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(__nv_bfloat16, uint16_t)
+  case ncclBfloat16: CASE_TY(hip_bfloat16, uint16_t)
  #endif
  case ncclFloat32: CASE_TY(float, uint32_t)
  case ncclFloat64: CASE_TY(double, uint64_t)
@@ -1180,7 +1151,7 @@ __device__ void sweep2(int ty, char const *tyname, Op op, char const *opname, in
    }
    sum = op.postOp(sum);
    if(tolerance < calcDelta(sum, y)) {
-      std::printf(
+      printf(
        //"%10g != %10g  :  T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n",
        "%llx != %llx  :  T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n",
        *(long long*)&sum, *(long long*)&y, tyname, opname, rank_n, ix
@@ -1209,9 +1180,9 @@ __global__ void sweep() {
  sweep1<uint32_t>(ncclUint32, "uint32");
  sweep1<int64_t>(ncclInt64, "int64");
  sweep1<uint64_t>(ncclUint64, "uint64");
-  sweep1<half>(ncclFloat16, "half");
+  sweep1<__half>(ncclFloat16, "half");
  #if HAVE_ncclBfloat16
-    sweep1<__nv_bfloat16>(ncclBfloat16, "bfloat16");
+    sweep1<hip_bfloat16>(ncclBfloat16, "bfloat16");
  #endif
  sweep1<float>(ncclFloat32, "float");
  sweep1<double>(ncclFloat64, "double");
@@ -1219,9 +1190,9 @@ __global__ void sweep() {

 int main(int arg_n, char **args) {
  std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
-  cudaSetDevice(0);
+  hipSetDevice(0);
  sweep<<<1,512>>>();
-  cudaDeviceSynchronize();
+  hipDeviceSynchronize();
  return 0;
 }
 #endif
@@ -1,7 +1,14 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
 #ifndef _d41d8cd98f00b204e9800998ecf8427e
 #define _d41d8cd98f00b204e9800998ecf8427e

-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>

 #include <stdint.h>

@@ -36,13 +43,13 @@ __host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
 // Enqueue kernel to generate data which is to be reduced.
 void ncclVerifiablePrepareInput(
  void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
-  uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+  uint64_t seed, intptr_t elt_ix0, hipStream_t stream
 );

 // Enqueue kernel to generate expected results of reduction.
 void ncclVerifiablePrepareExpected(
  void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
-  uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
+  uint64_t seed, intptr_t elt_ix0, hipStream_t stream
 );

 // Enqueue kernel to verify reduced data matches expectation. The number of
@@ -54,6 +61,6 @@ void ncclVerifiablePrepareExpected(
 void ncclVerifiableVerify(
  void const *results, void const *expected, intptr_t elt_n, int elt_ty,
  int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
-  int64_t *bad_elt_n, cudaStream_t stream
+  int64_t *bad_elt_n, hipStream_t stream
 );
 #endif
@@ -1,3 +1,9 @@
+# Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+# Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE.txt for license information
+
+
 # We requires both of the following paths to be set upon including this makefile
 # TEST_VERIFIABLE_SRCDIR = <points to this directory>
 # TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
@@ -8,4 +14,5 @@ TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
 $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFY_REDUCE_HDRS)
 	@printf "Compiling %s\n" $@
 	@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
-	$(NVCC) -o $@ $(NVCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
+	echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu"
+	$(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu