diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 41d312855f..fb4dc7d5b7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,37 +1,33 @@
 # ########################################################################
-# Copyright 2022 Advanced Micro Devices, Inc.
+# Copyright 2022-2024 Advanced Micro Devices, Inc.
 # ########################################################################
 
-# Compile common object library
-set_property(SOURCE common.cu timer.cc ../verifiable/verifiable.cu PROPERTY LANGUAGE CXX)
-add_library(rccl_common OBJECT common.cu timer.cc ../verifiable/verifiable.cu)
-target_link_libraries(rccl_common roc::rccl hip::device)
-if(USE_MPI)
-    target_link_libraries(rccl_common MPI::MPI_CXX)
-endif()
-
-function(add_relative_test test_name test_target)
-    get_target_property(EXE_PATH ${test_target} RUNTIME_OUTPUT_DIRECTORY)
-    if(EXE_PATH STREQUAL "EXE_PATH-NOTFOUND")
-        set(EXE_PATH ".")
-    endif()
-    get_filename_component(EXE_PATH "${EXE_PATH}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
-    get_target_property(EXE_NAME ${test_target} RUNTIME_OUTPUT_NAME)
-    if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
-        get_target_property(EXE_NAME ${test_target} OUTPUT_NAME)
-        if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
-            set(EXE_NAME "${test_target}")
-        endif()
-    endif()
-    file(RELATIVE_PATH rel_path "${CMAKE_CURRENT_BINARY_DIR}" "${EXE_PATH}/${EXE_NAME}")
-    add_test(NAME "${test_name}" COMMAND "./${rel_path}")
-endfunction()
-
 function(add_rccl_test TEST)
     set(TEST_SOURCE "${TEST}.cu")
     set_property(SOURCE ${TEST_SOURCE} PROPERTY LANGUAGE CXX)
+
+    # Check that file exists
+    if (NOT EXISTS ${SOURCE_DIR}/${TEST_SOURCE})
+        message(FATAL_ERROR "Unable to find file listed in CMakeLists.txt: ${SOURCE_DIR}/${TEST_SOURCE}")
+    endif()
+
+    # Establish hipified copy of the source file
+    set(HIP_FILE "${HIPIFY_DIR}/${TEST_SOURCE}")
+    get_filename_component(HIP_FILE_DIR ${HIP_FILE} DIRECTORY)
+
+    # Convert .cu files to .cpp so that they get processed properly
+    string(REPLACE "\.cu" "\.cu.cpp" HIP_FILE ${HIP_FILE})
+
+    # Create a custom command to create hipified source code
+    add_custom_command(
+        OUTPUT ${HIP_FILE}
+        COMMAND mkdir -p ${HIP_FILE_DIR} && $ ${hipify-perl_executable} -quiet-warnings ${SOURCE_DIR}/${TEST_SOURCE} -o ${HIP_FILE}
+        MAIN_DEPENDENCY ${TEST_SOURCE}
+        COMMENT "Hipifying ${TEST_SOURCE} -> ${HIP_FILE}"
+    )
+
     set(TEST_TARGET "${TEST}_perf")
-    add_executable(${TEST_TARGET} ${TEST_SOURCE})
+    add_executable(${TEST_TARGET} ${HIP_FILE})
     target_link_libraries(
         ${TEST_TARGET}
         PRIVATE
@@ -52,6 +48,78 @@ function(add_rccl_test TEST)
     )
 endfunction()
 
+function(add_relative_test test_name test_target)
+    get_target_property(EXE_PATH ${test_target} RUNTIME_OUTPUT_DIRECTORY)
+    if(EXE_PATH STREQUAL "EXE_PATH-NOTFOUND")
+        set(EXE_PATH ".")
+    endif()
+    get_filename_component(EXE_PATH "${EXE_PATH}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+    get_target_property(EXE_NAME ${test_target} RUNTIME_OUTPUT_NAME)
+    if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
+        get_target_property(EXE_NAME ${test_target} OUTPUT_NAME)
+        if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
+            set(EXE_NAME "${test_target}")
+        endif()
+    endif()
+    file(RELATIVE_PATH rel_path "${CMAKE_CURRENT_BINARY_DIR}" "${EXE_PATH}/${EXE_NAME}")
+    add_test(NAME "${test_name}" COMMAND "./${rel_path}")
+endfunction()
+
+# Collect list of common source files
+#==================================================================================================
+set(COMMON_FILES
+  common.h
+  common.cu
+  nccl1_compat.h
+  rccl_bfloat16.h
+  timer.h
+  timer.cc
+  ../verifiable/verifiable.h
+  ../verifiable/verifiable.cu
+)
+
+# Hipify common files (copy of source generated into hipify directory)
+#==================================================================================================
+find_program(hipify-perl_executable hipify-perl)
+set(HIPIFY_DIR "${CMAKE_CURRENT_BINARY_DIR}/hipify")
+set(SOURCE_DIR "${CMAKE_SOURCE_DIR}/src")
+
+## Loop over each common file to hipify
+foreach(COMMON_FILE ${COMMON_FILES})
+  # Check that file exists
+  if (NOT EXISTS ${SOURCE_DIR}/${COMMON_FILE})
+    message(FATAL_ERROR "Unable to find file listed in CMakeLists.txt: ${SOURCE_DIR}/${COMMON_FILE}")
+  endif()
+
+  # Establish hipified copy of the common file
+  get_filename_component(HIP_FILE_NAME ${HIPIFY_DIR}/${COMMON_FILE} NAME)
+  set(HIP_FILE "${HIPIFY_DIR}/${HIP_FILE_NAME}")
+
+  # Convert .cu files to .cpp so that they get processed properly
+  string(REPLACE "\.cu" "\.cu.cpp" HIP_FILE ${HIP_FILE})
+  list(APPEND HIP_COMMON_SOURCES ${HIP_FILE})
+
+  # Create a custom command to create hipified source code
+  add_custom_command(
+    OUTPUT ${HIP_FILE}
+    COMMAND mkdir -p ${HIPIFY_DIR} && $ ${hipify-perl_executable} -quiet-warnings ${SOURCE_DIR}/${COMMON_FILE} -o ${HIP_FILE}
+    MAIN_DEPENDENCY ${COMMON_FILE}
+    COMMENT "Hipifying ${COMMON_FILE} -> ${HIP_FILE}"
+  )
+endforeach()
+
+# Compile common object library
+#==================================================================================================
+add_custom_target(hipify DEPENDS ${HIP_COMMON_SOURCES})
+add_library(rccl_common OBJECT ${HIP_COMMON_SOURCES})
+add_dependencies(rccl_common hipify)
+target_link_libraries(rccl_common roc::rccl hip::device)
+if(USE_MPI)
+    target_link_libraries(rccl_common MPI::MPI_CXX)
+endif()
+
+# Compile tests
+#==================================================================================================
 add_rccl_test(all_gather)
 add_rccl_test(all_reduce)
 add_rccl_test(alltoall)
diff --git a/src/Makefile b/src/Makefile
index 42daba2706..00a17b56a8 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,6 +1,6 @@
 #
 # Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
-# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+# Modifications are Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -41,6 +41,9 @@ endif
 .PHONY: build clean
 
 BUILDDIR ?= ../build
+HIPIFY_DIR ?= $(BUILDDIR)/hipify
+
+.PRECIOUS: $(HIPIFY_DIR)/%.cu.cpp $(HIPIFY_DIR)/%.h
 
 ifeq ($(MPI), 1)
 HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
@@ -68,11 +71,21 @@ TEST_VERIFIABLE_SRCDIR := ../verifiable
 TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable
 include ../verifiable/verifiable.mk
 
-${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
+${HIPIFY_DIR}/%.cu.cpp: %.cu
+	@printf "Hipifying  %-35s > %s\n" $< $@
+	@mkdir -p ${HIPIFY_DIR}
+	hipify-perl -quiet-warnings $< > $@
+
+${HIPIFY_DIR}/%.h: %.h
+	@printf "Hipifying  %-35s > %s\n" $< $@
+	@mkdir -p ${HIPIFY_DIR}
+	hipify-perl -quiet-warnings $< > $@
+
+${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cu.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS)
 	@printf "Compiling  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
-	echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
-	$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<
+	echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -I. -c $<"
+	$(HIPCC) -o $@ $(HIPCUFLAGS) -I. -c $<
 
 ${DST_DIR}/timer.o: timer.cc timer.h
 	@printf "Compiling  %-35s > %s\n" $< $@
diff --git a/src/all_gather.cu b/src/all_gather.cu
index f18ce0cb65..7efc8f2c5e 100644
--- a/src/all_gather.cu
+++ b/src/all_gather.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"
 
 #define ALIGN 4
@@ -25,15 +25,15 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
     for (int j=0; j<nranks; j++) {
       TESTCHECK(InitData((char*)args->expected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
     }
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -46,7 +46,7 @@ void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
   return testSuccess;
 }
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
index de03a206ff..21e3ce3a9e 100644
--- a/src/all_reduce.cu
+++ b/src/all_reduce.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"
 
 void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -22,13 +22,13 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
     TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -41,7 +41,7 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
   return testSuccess;
 }
diff --git a/src/alltoall.cu b/src/alltoall.cu
index acfeb7d8ee..2773223dc9 100644
--- a/src/alltoall.cu
+++ b/src/alltoall.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"
 
 void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -22,16 +22,16 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, nccl
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
     for (int j=0; j<nranks; j++) {
       size_t partcount = sendcount/nranks;
       TESTCHECK(InitData((char*)args->expected[i] + j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0));
     }
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   // We don't support in-place alltoall
   args->reportErrors = in_place ? 0 : 1;
@@ -46,7 +46,7 @@ void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double
   *busBw = baseBw * factor;
 }
 
-testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream));
   return testSuccess;
 }
diff --git a/src/alltoallv.cu b/src/alltoallv.cu
index 73b53d20c3..5bab3071b1 100644
--- a/src/alltoallv.cu
+++ b/src/alltoallv.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"
 
 #define USE_RCCL_GATHER_SCATTER
@@ -32,15 +32,15 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0));
 
 #if 0
     int *dataHost = (int *)malloc(args->sendBytes);
-    hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost);
+    cudaMemcpy(dataHost, data, args->sendBytes, cudaMemcpyDeviceToHost);
     printf(" Rank [%d] Original: ", rank);
     for(int j=0; j<sendcount; j++) {
 	    printf("%d:%d ", j, dataHost[j]);
@@ -68,7 +68,7 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
       TESTCHECK(InitData(((char*)args->expected[i])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0));
       rdisp += rcount;
     }
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   // We don't support in-place alltoall
   args->reportErrors = in_place ? 0 : 1;
@@ -83,7 +83,7 @@ void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   int nranks;
   NCCLCHECK(ncclCommCount(comm, &nranks));
   int rank;
diff --git a/src/broadcast.cu b/src/broadcast.cu
index 5cd6147f10..9157c4c0c0 100644
--- a/src/broadcast.cu
+++ b/src/broadcast.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"
 
 void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -21,13 +21,13 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc
   size_t recvcount = args->expectedBytes / wordSize(type);
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
     if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
     TESTCHECK(InitData(args->expected[i], recvcount, 0, type, ncclSum, rep, 1, 0));
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -40,7 +40,7 @@ void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   int rank;
   NCCLCHECK(ncclCommUserRank(comm, &rank));
 #if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
diff --git a/src/common.cu b/src/common.cu
index 0096ecb729..0979e6992d 100644
--- a/src/common.cu
+++ b/src/common.cu
@@ -6,7 +6,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "hip/hip_runtime.h"
+#include "cuda_runtime.h"
 #include "rccl_bfloat16.h"
 #include "common.h"
 #include <pthread.h>
@@ -14,10 +14,11 @@
 #include <type_traits>
 #include <getopt.h>
 #include <libgen.h>
+#include "cuda.h"
 
 //#define DEBUG_PRINT
 
-#include "../verifiable/verifiable.h"
+#include "verifiable.h"
 
 int test_ncclVersion = 0; // init'd with ncclGetVersion()
 
@@ -165,18 +166,18 @@ static bool minReqVersion(int rmajor, int rminor, int rpatch)
 }
 
 testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) {
-  ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, hipStreamDefault);
-  HIPCHECK(hipDeviceSynchronize());
+  ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault);
+  CUDACHECK(cudaDeviceSynchronize());
   return testSuccess;
 }
 
 testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) {
-  ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, hipStreamDefault);
+  ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault);
   return testSuccess;
 }
 
 testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) {
-  ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, hipStreamDefault);
+  ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault);
   return testSuccess;
 }
 
@@ -271,11 +272,11 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   size_t count = args->expectedBytes/wordSize(type);
 
   int64_t *wrongPerGpu = nullptr;
-  HIPCHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), hipHostMallocMapped));
+  CUDACHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), cudaHostAllocMapped));
   
   for (int i=0; i<args->nGpus; i++) {
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
 
     TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i));
@@ -286,8 +287,8 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       char *expectedHost = (char*)malloc(args->expectedBytes);
       char *dataHost = (char*)malloc(args->expectedBytes);
       int eltsz = wordSize(type);
-      hipMemcpy(expectedHost, args->expected[i], args->expectedBytes, hipMemcpyDeviceToHost);
-      hipMemcpy(dataHost, data, args->expectedBytes, hipMemcpyDeviceToHost);
+      cudaMemcpy(expectedHost, args->expected[i], args->expectedBytes, cudaMemcpyDeviceToHost);
+      cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost);
 
       for(int j=0; j<args->expectedBytes/eltsz; j++) {
         unsigned long long want, got;
@@ -307,14 +308,14 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
 
   *wrongElts = 0;
   for (int i=0; i < args->nGpus; i++) *wrongElts += wrongPerGpu[i];
-  hipHostFree(wrongPerGpu);
+  cudaFreeHost(wrongPerGpu);
 
   if (args->reportErrors && *wrongElts) args->errors[0]++;
   return testSuccess;
 }
     
-testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* comms) {
-  hipError_t hipErr;
+testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) {
+  cudaError_t cudaErr;
   int remaining = ngpus;
   int* done = (int*)malloc(sizeof(int)*ngpus);
   memset(done, 0, sizeof(int)*ngpus);
@@ -325,15 +326,15 @@ testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t*
    for (int i=0; i<ngpus; i++) {
      if (done[i]) continue;
 
-     hipErr = hipStreamQuery(streams[i]);
-     if (hipErr == hipSuccess) {
+     cudaErr = cudaStreamQuery(streams[i]);
+     if (cudaErr == cudaSuccess) {
        done[i] = 1;
        remaining--;
        idle = 0;
        continue;
      }
 
-     if (hipErr != hipErrorNotReady) HIPCHECK(hipErr);
+     if (cudaErr != cudaErrorNotReady) CUDACHECK(cudaErr);
 
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
      if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) {
@@ -382,7 +383,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
   for (int i = 0; i < args->nGpus; i++) {
 #ifndef NCCL_MAJOR
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
 #endif
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
     char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
@@ -463,16 +464,16 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   Barrier(args);
 
 #if HIP_VERSION >= 50221310
-  hipGraph_t graphs[args->nGpus];
-  hipGraphExec_t graphExec[args->nGpus];
+  cudaGraph_t graphs[args->nGpus];
+  cudaGraphExec_t graphExec[args->nGpus];
   if (cudaGraphLaunches >= 1) {
     // Begin cuda graph capture
     for (int i=0; i<args->nGpus; i++) {
       // Thread local mdoe is needed for:
       // - Multi-thread mode: where graph capture and instantiation can happen concurrently across threads
       // - P2P pre-connect: when there is no warm-up, P2P pre-connect is done during graph capture.
-      //   Since pre-connect calls hipMalloc, we cannot use global capture mode
-      HIPCHECK(hipStreamBeginCapture(args->streams[i], hipStreamCaptureModeThreadLocal));
+      //   Since pre-connect calls cudaMalloc, we cannot use global capture mode
+      CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal));
     }
   }
 #endif
@@ -491,18 +492,18 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   if (cudaGraphLaunches >= 1) {
     // End cuda graph capture
     for (int i=0; i<args->nGpus; i++) {
-      HIPCHECK(hipStreamEndCapture(args->streams[i], graphs+i));
+      CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
     }
     // Instantiate cuda graph
     for (int i=0; i<args->nGpus; i++) {
-      HIPCHECK(hipGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
+      CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
     }
     // Resync CPU, restart timing, launch cuda graph
     Barrier(args);
     tim.reset();
     for (int l=0; l<cudaGraphLaunches; l++) {
       for (int i=0; i<args->nGpus; i++) {
-        HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i]));
+        CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
       }
     }
   }
@@ -520,8 +521,8 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
   if (cudaGraphLaunches >= 1) {
     //destroy cuda graph
     for (int i=0; i<args->nGpus; i++) {
-      HIPCHECK(hipGraphExecDestroy(graphExec[i]));
-      HIPCHECK(hipGraphDestroy(graphs[i]));
+      CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
+      CUDACHECK(cudaGraphDestroy(graphs[i]));
     }
   }
 #endif
@@ -542,7 +543,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       if (cudaGraphLaunches >= 1) {
         // Begin cuda graph capture for data check
         for (int i=0; i<args->nGpus; i++) {
-          HIPCHECK(hipStreamBeginCapture(args->streams[i], args->nThreads > 1 ? hipStreamCaptureModeThreadLocal : hipStreamCaptureModeGlobal));
+          CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal));
         }
       }
 #endif
@@ -554,15 +555,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       if (cudaGraphLaunches >= 1) {
         // End cuda graph capture
         for (int i=0; i<args->nGpus; i++) {
-          HIPCHECK(hipStreamEndCapture(args->streams[i], graphs+i));
+          CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
         }
         // Instantiate cuda graph
         for (int i=0; i<args->nGpus; i++) {
-          HIPCHECK(hipGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
+          CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
         }
         // Launch cuda graph
         for (int i=0; i<args->nGpus; i++) {
-          HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i]));
+          CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
         }
       }
 #endif
@@ -573,8 +574,8 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
       if (cudaGraphLaunches >= 1) {
         //destroy cuda graph
         for (int i=0; i<args->nGpus; i++) {
-          HIPCHECK(hipGraphExecDestroy(graphExec[i]));
-          HIPCHECK(hipGraphDestroy(graphs[i]));
+          CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
+          CUDACHECK(cudaGraphDestroy(graphs[i]));
         }
       }
 #endif
@@ -664,7 +665,7 @@ testResult_t threadRunTests(struct threadArgs* args) {
   // Set device to the first of our GPUs. If we don't do that, some operations
   // will be done on the current GPU (by default : 0) and if the GPUs are in
   // exclusive mode those operations will fail.
-  HIPCHECK(hipSetDevice(args->gpus[0]));
+  CUDACHECK(cudaSetDevice(args->gpus[0]));
   TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
   return testSuccess;
 }
@@ -680,7 +681,7 @@ testResult_t threadInit(struct threadArgs* args) {
   NCCLCHECK(ncclGroupStart());
   for (int i=0; i<args->nGpus; i++) {
     int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
   }
   NCCLCHECK(ncclGroupEnd());
@@ -705,29 +706,29 @@ testResult_t threadLaunch(struct testThread* thread) {
 
 testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
   if (memorytype == ncclFine) {
-    HIPCHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
-    HIPCHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
-    if (datacheck) HIPCHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained));
+    CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
+    CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
+    if (datacheck) CUDACHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained));
   }
   else if (memorytype == ncclHost) {
-    HIPCHECK(hipHostMalloc(sendbuff, nbytes));
-    HIPCHECK(hipHostMalloc(recvbuff, nbytes));
-    if (datacheck) HIPCHECK(hipHostMalloc(expected, recvBytes));
+    CUDACHECK(hipHostMalloc(sendbuff, nbytes));
+    CUDACHECK(hipHostMalloc(recvbuff, nbytes));
+    if (datacheck) CUDACHECK(hipHostMalloc(expected, recvBytes));
   }
   else if (memorytype == ncclManaged) {
-    HIPCHECK(hipMallocManaged(sendbuff, nbytes));
-    HIPCHECK(hipMallocManaged(recvbuff, nbytes));
-    if (datacheck) HIPCHECK(hipMallocManaged(expected, recvBytes));
+    CUDACHECK(cudaMallocManaged(sendbuff, nbytes));
+    CUDACHECK(cudaMallocManaged(recvbuff, nbytes));
+    if (datacheck) CUDACHECK(cudaMallocManaged(expected, recvBytes));
 #if 0
-    HIPCHECK(hipMemset(*sendbuff, 0, nbytes));
-    HIPCHECK(hipMemset(*recvbuff, 0, nbytes));
-    if (datacheck) HIPCHECK(hipMemset(*expected, 0, recvBytes));
+    CUDACHECK(cudaMemset(*sendbuff, 0, nbytes));
+    CUDACHECK(cudaMemset(*recvbuff, 0, nbytes));
+    if (datacheck) CUDACHECK(cudaMemset(*expected, 0, recvBytes));
 #endif
   }
   else {
-    HIPCHECK(hipMalloc(sendbuff, nbytes));
-    HIPCHECK(hipMalloc(recvbuff, nbytes));
-    if (datacheck) HIPCHECK(hipMalloc(expected, recvBytes));
+    CUDACHECK(cudaMalloc(sendbuff, nbytes));
+    CUDACHECK(cudaMalloc(recvbuff, nbytes));
+    if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
   }
   return testSuccess;
 }
@@ -940,7 +941,7 @@ int main(int argc, char* argv[]) {
     }
   }
 
-  HIPCHECK(hipGetDeviceCount(&numDevices));
+  CUDACHECK(cudaGetDeviceCount(&numDevices));
 #ifndef MPI_SUPPORT
   if (nGpus > numDevices)
   {
@@ -1016,10 +1017,10 @@ testResult_t run() {
   for (int i=0; i<nThreads*nGpus; i++) {
     int cudaDev = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
     int rank = proc*nThreads*nGpus+i;
-    hipDeviceProp_t prop;
-    HIPCHECK(hipGetDeviceProperties(&prop, cudaDev));
+    cudaDeviceProp prop;
+    CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
     char busIdStr[] = "00000000:00:00.0";
-    HIPCHECK(hipDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev));
+    CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev));
     len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "#   Rank %2d Pid %6d on %10s device %2d [%s] %s\n",
       rank, getpid(), hostname, cudaDev, busIdStr, prop.name);
 	  maxMem = std::min(maxMem, prop.totalGlobalMem);
@@ -1055,7 +1056,7 @@ testResult_t run() {
 #endif
 
   int gpus[nGpus*nThreads];
-  hipStream_t streams[nGpus*nThreads];
+  cudaStream_t streams[nGpus*nThreads];
   void* sendbuffs[nGpus*nThreads];
   void* recvbuffs[nGpus*nThreads];
   void* expected[nGpus*nThreads];
@@ -1067,12 +1068,12 @@ testResult_t run() {
   gpu0 = envstr ? atoi(envstr) : -1;
   for (int i=0; i<nGpus*nThreads; i++) {
     gpus[i] = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
-    HIPCHECK(hipSetDevice(gpus[i]));
+    CUDACHECK(cudaSetDevice(gpus[i]));
     TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
     if (streamnull)
       streams[i] = NULL;
     else
-      HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
+      CUDACHECK(cudaStreamCreateWithFlags(streams+i, cudaStreamNonBlocking));
   }
 
   //if parallel init is not selected, use main thread to initialize NCCL
@@ -1083,7 +1084,7 @@ testResult_t run() {
      } else {
        NCCLCHECK(ncclGroupStart());
        for (int i=0; i<nGpus*nThreads; i++) {
-         HIPCHECK(hipSetDevice(gpus[i]));
+         CUDACHECK(cudaSetDevice(gpus[i]));
          NCCLCHECK(ncclCommInitRank(comms+i, ncclProcs*nThreads*nGpus, ncclId, ncclProc*nThreads*nGpus+i));
        }
        NCCLCHECK(ncclGroupEnd());
@@ -1093,7 +1094,7 @@ testResult_t run() {
   int errors[nThreads];
   double bw[nThreads];
   double* delta;
-  HIPCHECK(hipHostMalloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, hipHostMallocPortable | hipHostMallocMapped));
+  CUDACHECK(hipHostMalloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, cudaHostAllocPortable | cudaHostAllocMapped));
   int bw_count[nThreads];
   for (int t=0; t<nThreads; t++) {
     bw[t] = 0.0;
@@ -1178,11 +1179,11 @@ testResult_t run() {
 
   // Free off CUDA allocated memory
   for (int i=0; i<nGpus*nThreads; i++) {
-    if (sendbuffs[i]) HIPCHECK(hipFree((char*)sendbuffs[i]));
-    if (recvbuffs[i]) HIPCHECK(hipFree((char*)recvbuffs[i]));
-    if (datacheck) HIPCHECK(hipFree(expected[i]));
+    if (sendbuffs[i]) CUDACHECK(cudaFree((char*)sendbuffs[i]));
+    if (recvbuffs[i]) CUDACHECK(cudaFree((char*)recvbuffs[i]));
+    if (datacheck) CUDACHECK(cudaFree(expected[i]));
   }
-  HIPCHECK(hipHostFree(delta));
+  CUDACHECK(cudaFreeHost(delta));
 
   envstr = getenv("NCCL_TESTS_MIN_BW");
   double check_avg_bw = envstr ? atof(envstr) : -1;
@@ -1197,9 +1198,9 @@ testResult_t run() {
   MPI_Finalize();
 #endif
 
-  // 'hip-memcheck --leak-check full' requires this
+  // 'cuda-memcheck --leak-check full' requires this
   PRINT("%s\n", ncclGetLastError(NULL));
-  hipDeviceReset();
+  cudaDeviceReset();
 
   if (errors[0] || bw[0] < check_avg_bw*(0.9))
     exit(EXIT_FAILURE);
diff --git a/src/common.h b/src/common.h
index efba238755..cabc6a6657 100644
--- a/src/common.h
+++ b/src/common.h
@@ -21,14 +21,14 @@
 // For nccl.h < 2.13 since we define a weak fallback
 extern "C" char const* ncclGetLastError(ncclComm_t comm);
 
-#define HIPCHECK(cmd) do {                          \
-  hipError_t e = cmd;                               \
-  if( e != hipSuccess ) {                           \
+#define CUDACHECK(cmd) do {                         \
+  cudaError_t err = cmd;                            \
+  if( err != cudaSuccess ) {                        \
     char hostname[1024];                            \
     getHostName(hostname, 1024);                    \
-    printf("%s: Test HIP failure %s:%d '%s'\n",     \
+    printf("%s: Test CUDA failure %s:%d '%s'\n",    \
          hostname,                                  \
-        __FILE__,__LINE__,hipGetErrorString(e));    \
+        __FILE__,__LINE__,cudaGetErrorString(err)); \
     return testCudaError;                           \
   }                                                 \
 } while(0)
@@ -93,7 +93,7 @@ struct testColl {
       ncclRedOp_t op, int root, int rep, int in_place);
   void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
   testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
-      ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
+      ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
 };
 extern struct testColl allReduceTest;
 extern struct testColl allGatherTest;
@@ -133,7 +133,7 @@ struct threadArgs {
   size_t recvInplaceOffset;
   ncclUniqueId ncclId;
   ncclComm_t* comms;
-  hipStream_t* streams;
+  cudaStream_t* streams;
 
   void** expected;
   size_t expectedBytes;
diff --git a/src/gather.cu b/src/gather.cu
index 3ac9cfb6b4..784d637c84 100644
--- a/src/gather.cu
+++ b/src/gather.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "hip/hip_runtime.h"
+#include "cuda_runtime.h"
 #include "common.h"
 
 void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -22,16 +22,16 @@ testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
-    HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault));
+    CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
     if (rank == root) {
       TESTCHECK(InitData(args->expected[i], nranks*sendcount, 0, type, ncclSum, rep, 1, 0));
     }
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -44,7 +44,7 @@ void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double*
   *busBw = baseBw * factor;
 }
 
-testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   int nRanks;
   NCCLCHECK(ncclCommCount(comm, &nRanks));
   int rank;
diff --git a/src/hypercube.cu b/src/hypercube.cu
index 2058de1dd3..9c49cd7984 100644
--- a/src/hypercube.cu
+++ b/src/hypercube.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "hip/hip_runtime.h"
+#include "cuda_runtime.h"
 #include "common.h"
 
 #define ALIGN 4
@@ -25,15 +25,15 @@ testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncc
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
     for (int j=0; j<nranks; j++) {
       TESTCHECK(InitData((char*)args->expected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
     }
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -46,7 +46,7 @@ void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, doubl
   *busBw = baseBw * factor;
 }
 
-testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   char* sbuff = (char*)sendbuff;
   char* rbuff = (char*)recvbuff;
   int nRanks;
@@ -54,7 +54,7 @@ testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
   int rank;
   NCCLCHECK(ncclCommUserRank(comm, &rank));
   size_t rankSize = count * wordSize(type);
-  if (rbuff+rank*rankSize != sbuff) HIPCHECK(hipMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, hipMemcpyDeviceToDevice, stream));
+  if (rbuff+rank*rankSize != sbuff) CUDACHECK(cudaMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, cudaMemcpyDeviceToDevice, stream));
 
   // Hypercube AllGather
   for (int mask=1; mask<nRanks; mask<<=1) {
diff --git a/src/nccl1_compat.h b/src/nccl1_compat.h
index 3c241d3d14..020a4bc36f 100644
--- a/src/nccl1_compat.h
+++ b/src/nccl1_compat.h
@@ -1,6 +1,5 @@
 /*************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -21,28 +20,28 @@ static ncclResult_t ncclGroupEnd() { return ncclSuccess; }
 #define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument;
 
 static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
-    ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   CHECKCOUNT(count);
   return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream);
 }
 static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream) {
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
   CHECKCOUNT(count);
   return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream);
 }
 static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, hipStream_t stream) {
+    ncclComm_t comm, cudaStream_t stream) {
   CHECKCOUNT(count);
   return ncclBcast(buff, (int)count, datatype, root, comm, stream);
 }
 static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
     size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
-    hipStream_t stream) {
+    cudaStream_t stream) {
   CHECKCOUNT(recvcount);
   return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream);
 }
 static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
   CHECKCOUNT(sendcount);
   return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream);
 }
diff --git a/src/reduce.cu b/src/reduce.cu
index 62850f8212..dd90c25bf4 100644
--- a/src/reduce.cu
+++ b/src/reduce.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"
 
 void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -22,14 +22,14 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
-    HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault));
+    CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
     if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -40,7 +40,7 @@ void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double*
   *busBw = baseBw;
 }
 
-testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));
   return testSuccess;
 }
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
index a58d2578af..2f6c8c56d6 100644
--- a/src/reduce_scatter.cu
+++ b/src/reduce_scatter.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"
 
 #define ALIGN 4
@@ -25,14 +25,14 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type,
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
-    HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault));
+    CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
     TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks));
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -45,7 +45,7 @@ void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, d
   *busBw = baseBw * factor;
 }
 
-testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
   return testSuccess;
 }
diff --git a/src/scatter.cu b/src/scatter.cu
index 7445624b71..993289203c 100644
--- a/src/scatter.cu
+++ b/src/scatter.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"
 
 void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -21,13 +21,13 @@ testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclR
   size_t recvcount = args->expectedBytes / wordSize(type);
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
     if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
     TESTCHECK(InitData(args->expected[i], recvcount, rank*recvcount, type, ncclSum, rep, 1, 0));
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   return testSuccess;
 }
@@ -40,7 +40,7 @@ void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double*
   *busBw = baseBw * factor;
 }
 
-testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   int nRanks;
   NCCLCHECK(ncclCommCount(comm, &nRanks));
   int rank;
diff --git a/src/sendrecv.cu b/src/sendrecv.cu
index d5b0300cdf..cda6d699ca 100644
--- a/src/sendrecv.cu
+++ b/src/sendrecv.cu
@@ -5,7 +5,7 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"
 
 void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -22,14 +22,14 @@ testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, nccl
   int nranks = args->nProcs*args->nThreads*args->nGpus;
 
   for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
     int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
     void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
     TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
     int peer = (rank-1+nranks)%nranks;
     TESTCHECK(InitData(args->expected[i], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0));
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
   }
   // We don't support in-place sendrecv
   args->reportErrors = in_place ? 0 : 1;
@@ -44,7 +44,7 @@ void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double
   *busBw = baseBw * factor;
 }
 
-testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   int nRanks;
   NCCLCHECK(ncclCommCount(comm, &nRanks));
   int rank;
diff --git a/verifiable/inexact_regress.cu b/verifiable/inexact_regress.cu
index 973b965412..3200ff3918 100644
--- a/verifiable/inexact_regress.cu
+++ b/verifiable/inexact_regress.cu
@@ -24,11 +24,11 @@
 #include <cstdio>
 #include <cstdint>
 #include <hip/hip_bfloat16.h>
-#include <hip/hip_fp16.h>
+#include <cuda_fp16.h>
 
 using std::uint64_t;
 using std::uint32_t;
-using bfloat16 = hip_bfloat16;
+using bfloat16 = __nv_bfloat16;
 
 template<typename T>
 struct float_traits;
@@ -182,14 +182,14 @@ __global__ void kernel() {
 int main() {
   std::printf("type=float:\n");
   kernel<float><<<1,32>>>();
-  hipDeviceSynchronize();
+  cudaDeviceSynchronize();
 
   std::printf("\ntype=half:\n");
   kernel<half><<<1,32>>>();
-  hipDeviceSynchronize();
+  cudaDeviceSynchronize();
 
   std::printf("\ntype=bfloat16:\n");
   kernel<bfloat16><<<1,32>>>();
-  hipDeviceSynchronize();
+  cudaDeviceSynchronize();
   return 0;
 }
diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu
index a375809bcf..31fdfe10c8 100644
--- a/verifiable/verifiable.cu
+++ b/verifiable/verifiable.cu
@@ -8,8 +8,8 @@
 //#pragma nv_diag_suppress declared_but_not_referenced
 
 #include "verifiable.h"
-#include <hip/hip_runtime.h>
-#include <hip/hip_fp16.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
 #include <hip/hip_bfloat16.h>
 
 #include "rccl/rccl.h"
@@ -91,7 +91,7 @@ template<>
 struct IsIntegral<__half>: std::false_type {};
 #if RCCL_BFLOAT16 == 1
 template<>
-struct IsIntegral<hip_bfloat16>: std::false_type {};
+struct IsIntegral<__nv_bfloat16>: std::false_type {};
 #endif
 }
 
@@ -126,7 +126,7 @@ namespace {
   }
   #if RCCL_BFLOAT16 == 1
   template<>
-  __host__ __device__ hip_bfloat16 castTo<hip_bfloat16>(float x) {
+  __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(float x) {
     return hip_bfloat16(x);
   }
   #endif
@@ -153,7 +153,7 @@ struct ReduceSum {
       return __float2half(__half2float(a) + __half2float(b));
   }
   #if RCCL_BFLOAT16 == 1
-  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
       return hip_bfloat16(static_cast<float>(a) + static_cast<float>(b));
   }
   #endif
@@ -169,7 +169,7 @@ struct ReduceProd {
       return __float2half(__half2float(a) * __half2float(b));
   }
   #if RCCL_BFLOAT16 == 1
-  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
       return hip_bfloat16(static_cast<float>(a) * static_cast<float>(b));
   }
   #endif
@@ -185,7 +185,7 @@ struct ReduceMin {
     return __half2float(a) < __half2float(b) ? a : b;
   }
   #if RCCL_BFLOAT16 == 1
-  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
       return static_cast<float>(a) < static_cast<float>(b) ? a : b;
   }
   #endif
@@ -201,7 +201,7 @@ struct ReduceMax {
       return __half2float(a) > __half2float(b) ? a : b;
   }
   #if RCCL_BFLOAT16 == 1
-  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
       return static_cast<float>(a) > static_cast<float>(b) ? a : b;
   }
   #endif
@@ -280,7 +280,7 @@ struct FloatLayout<__half> {
 };
 #if RCCL_BFLOAT16 == 1
 template<>
-struct FloatLayout<hip_bfloat16> {
+struct FloatLayout<__nv_bfloat16> {
   static constexpr int exponent_bits = 8, mantissa_bits = 7;
   static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
@@ -801,7 +801,7 @@ __global__ void prepareInput2(
 template<typename ReduceOp>
 void prepareInput1(
     void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me,
-    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
+    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
   ) {
   int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
   #define CASE_TY(T) prepareInput2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break;
@@ -814,7 +814,7 @@ void prepareInput1(
   case ncclUint64: CASE_TY(uint64_t)
   case ncclFloat16: CASE_TY(__half)
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(hip_bfloat16)
+  case ncclBfloat16: CASE_TY(__nv_bfloat16)
   #endif
   case ncclFloat32: CASE_TY(float)
   case ncclFloat64: CASE_TY(double)
@@ -826,7 +826,7 @@ void prepareInput1(
 
 void ncclVerifiablePrepareInput(
     void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
-    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
+    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
   ) {
   #define CASE_OP(op) \
     if(rank_n == 1) \
@@ -877,7 +877,7 @@ __global__ void prepareExpected2(
 template<typename ReduceOp>
 void prepareExpected1(
     void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
-    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
+    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
   ) {
   int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
   #define CASE_TY(T) prepareExpected2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break;
@@ -890,7 +890,7 @@ void prepareExpected1(
   case ncclUint64: CASE_TY(uint64_t)
   case ncclFloat16: CASE_TY(__half)
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(hip_bfloat16)
+  case ncclBfloat16: CASE_TY(__nv_bfloat16)
   #endif
   case ncclFloat32: CASE_TY(float)
   case ncclFloat64: CASE_TY(double)
@@ -902,7 +902,7 @@ void prepareExpected1(
 
 void ncclVerifiablePrepareExpected(
     void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
-    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
+    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
   ) {
   #define CASE_OP(op) \
     if(rank_n == 1) \
@@ -1051,7 +1051,7 @@ __global__ void verifyInline2(
 template<typename T, typename Uint>
 void verifyInline1(
     T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
-    unsigned tolerance, int64_t *bad_elt_n, hipStream_t stream, int block_n
+    unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n
   ) {
   #define CASE_OP(op) \
     if(rank_n == 1) \
@@ -1080,7 +1080,7 @@ void verifyInline1(
 void ncclVerifiableVerify(
     void const *results, void const *expected, intptr_t elt_n, int elt_ty,
     int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
-    int64_t *bad_elt_n, hipStream_t stream
+    int64_t *bad_elt_n, cudaStream_t stream
   ) {
   bool floating = elt_ty == ncclFloat16 || elt_ty == ncclFloat32 || elt_ty == ncclFloat64;
   #if HAVE_ncclBfloat16
@@ -1112,7 +1112,7 @@ void ncclVerifiableVerify(
   case ncclUint64: CASE_TY(uint64_t, uint64_t)
   case ncclFloat16: CASE_TY(__half, uint16_t)
   #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(hip_bfloat16, uint16_t)
+  case ncclBfloat16: CASE_TY(__nv_bfloat16, uint16_t)
   #endif
   case ncclFloat32: CASE_TY(float, uint32_t)
   case ncclFloat64: CASE_TY(double, uint64_t)
@@ -1179,7 +1179,7 @@ __global__ void sweep() {
   sweep1<uint64_t>(ncclUint64, "uint64");
   sweep1<__half>(ncclFloat16, "half");
   #if HAVE_ncclBfloat16
-    sweep1<hip_bfloat16>(ncclBfloat16, "bfloat16");
+    sweep1<__nv_bfloat16>(ncclBfloat16, "bfloat16");
   #endif
   sweep1<float>(ncclFloat32, "float");
   sweep1<double>(ncclFloat64, "double");
@@ -1187,9 +1187,9 @@ __global__ void sweep() {
 
 int main(int arg_n, char **args) {
   std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
-  hipSetDevice(0);
+  cudaSetDevice(0);
   sweep<<<1,512>>>();
-  hipDeviceSynchronize();
+  cudaDeviceSynchronize();
   return 0;
 }
 #endif
diff --git a/verifiable/verifiable.h b/verifiable/verifiable.h
index b41ef1ad12..da54778a6f 100644
--- a/verifiable/verifiable.h
+++ b/verifiable/verifiable.h
@@ -8,7 +8,7 @@
 #ifndef _d41d8cd98f00b204e9800998ecf8427e
 #define _d41d8cd98f00b204e9800998ecf8427e
 
-#include <hip/hip_runtime.h>
+#include <cuda_runtime.h>
 
 #include <stdint.h>
 
@@ -43,13 +43,13 @@ __host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
 // Enqueue kernel to generate data which is to be reduced.
 void ncclVerifiablePrepareInput(
   void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
-  uint64_t seed, intptr_t elt_ix0, hipStream_t stream
+  uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
 );
 
 // Enqueue kernel to generate expected results of reduction.
 void ncclVerifiablePrepareExpected(
   void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
-  uint64_t seed, intptr_t elt_ix0, hipStream_t stream
+  uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
 );
 
 // Enqueue kernel to verify reduced data matches expectation. The number of
@@ -61,6 +61,6 @@ void ncclVerifiablePrepareExpected(
 void ncclVerifiableVerify(
   void const *results, void const *expected, intptr_t elt_n, int elt_ty,
   int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
-  int64_t *bad_elt_n, hipStream_t stream
+  int64_t *bad_elt_n, cudaStream_t stream
 );
 #endif
diff --git a/verifiable/verifiable.mk b/verifiable/verifiable.mk
index fba1fbf35c..c526ffb720 100644
--- a/verifiable/verifiable.mk
+++ b/verifiable/verifiable.mk
@@ -1,5 +1,5 @@
 # Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
-# Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+# Modifications Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved.
 #
 # See LICENSE.txt for license information
 
@@ -8,11 +8,21 @@
 # TEST_VERIFIABLE_SRCDIR = <points to this directory>
 # TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
 
-TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
-TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
+TEST_VERIFIABLE_HDRS      = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
+TEST_VERIFIABLE_OBJS      = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
 
-$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFY_REDUCE_HDRS)
+${HIPIFY_DIR}/verifiable.cu.cpp: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
+	@printf "Hipifying  %-35s > %s\n" $< $@
+	@mkdir -p ${HIPIFY_DIR}
+	hipify-perl -quiet-warnings $< > $@
+
+${HIPIFY_DIR}/verifiable.h: $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
+	@printf "Hipifying  %-35s > %s\n" $< $@
+	@mkdir -p ${HIPIFY_DIR}
+	hipify-perl -quiet-warnings $< > $@
+
+$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(HIPIFY_DIR)/verifiable.cu.cpp $(HIPIFY_DIR)/verifiable.h
 	@printf "Compiling %s\n" $@
 	@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
-	echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu"
-	$(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
+	echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
+	$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<