Add hipify steps prior to build (#62)

* Add hipify steps prior to build
2024-03-05 09:47:18 -07:00
@@ -1,37 +1,33 @@
 # ########################################################################
-# Copyright 2022 Advanced Micro Devices, Inc.
+# Copyright 2022-2024 Advanced Micro Devices, Inc.
 # ########################################################################

-# Compile common object library
-set_property(SOURCE common.cu timer.cc ../verifiable/verifiable.cu PROPERTY LANGUAGE CXX)
-add_library(rccl_common OBJECT common.cu timer.cc ../verifiable/verifiable.cu)
-target_link_libraries(rccl_common roc::rccl hip::device)
-if(USE_MPI)
-    target_link_libraries(rccl_common MPI::MPI_CXX)
-endif()
-
-function(add_relative_test test_name test_target)
-    get_target_property(EXE_PATH ${test_target} RUNTIME_OUTPUT_DIRECTORY)
-    if(EXE_PATH STREQUAL "EXE_PATH-NOTFOUND")
-        set(EXE_PATH ".")
-    endif()
-    get_filename_component(EXE_PATH "${EXE_PATH}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
-    get_target_property(EXE_NAME ${test_target} RUNTIME_OUTPUT_NAME)
-    if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
-        get_target_property(EXE_NAME ${test_target} OUTPUT_NAME)
-        if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
-            set(EXE_NAME "${test_target}")
-        endif()
-    endif()
-    file(RELATIVE_PATH rel_path "${CMAKE_CURRENT_BINARY_DIR}" "${EXE_PATH}/${EXE_NAME}")
-    add_test(NAME "${test_name}" COMMAND "./${rel_path}")
-endfunction()
-
 function(add_rccl_test TEST)
    set(TEST_SOURCE "${TEST}.cu")
    set_property(SOURCE ${TEST_SOURCE} PROPERTY LANGUAGE CXX)
+
+    # Check that file exists
+    if (NOT EXISTS ${SOURCE_DIR}/${TEST_SOURCE})
+        message(FATAL_ERROR "Unable to find file listed in CMakeLists.txt: ${SOURCE_DIR}/${TEST_SOURCE}")
+    endif()
+
+    # Establish hipified copy of the source file
+    set(HIP_FILE "${HIPIFY_DIR}/${TEST_SOURCE}")
+    get_filename_component(HIP_FILE_DIR ${HIP_FILE} DIRECTORY)
+
+    # Convert .cu files to .cpp so that they get processed properly
+    string(REPLACE "\.cu" "\.cu.cpp" HIP_FILE ${HIP_FILE})
+
+    # Create a custom command to create hipified source code
+    add_custom_command(
+        OUTPUT ${HIP_FILE}
+        COMMAND mkdir -p ${HIP_FILE_DIR} && $ ${hipify-perl_executable} -quiet-warnings ${SOURCE_DIR}/${TEST_SOURCE} -o ${HIP_FILE}
+        MAIN_DEPENDENCY ${TEST_SOURCE}
+        COMMENT "Hipifying ${TEST_SOURCE} -> ${HIP_FILE}"
+    )
+
    set(TEST_TARGET "${TEST}_perf")
-    add_executable(${TEST_TARGET} ${TEST_SOURCE})
+    add_executable(${TEST_TARGET} ${HIP_FILE})
    target_link_libraries(
        ${TEST_TARGET}
        PRIVATE
@@ -52,6 +48,78 @@ function(add_rccl_test TEST)
    )
 endfunction()

+function(add_relative_test test_name test_target)
+    get_target_property(EXE_PATH ${test_target} RUNTIME_OUTPUT_DIRECTORY)
+    if(EXE_PATH STREQUAL "EXE_PATH-NOTFOUND")
+        set(EXE_PATH ".")
+    endif()
+    get_filename_component(EXE_PATH "${EXE_PATH}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+    get_target_property(EXE_NAME ${test_target} RUNTIME_OUTPUT_NAME)
+    if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
+        get_target_property(EXE_NAME ${test_target} OUTPUT_NAME)
+        if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
+            set(EXE_NAME "${test_target}")
+        endif()
+    endif()
+    file(RELATIVE_PATH rel_path "${CMAKE_CURRENT_BINARY_DIR}" "${EXE_PATH}/${EXE_NAME}")
+    add_test(NAME "${test_name}" COMMAND "./${rel_path}")
+endfunction()
+
+# Collect list of common source files
+#==================================================================================================
+set(COMMON_FILES
+  common.h
+  common.cu
+  nccl1_compat.h
+  rccl_bfloat16.h
+  timer.h
+  timer.cc
+  ../verifiable/verifiable.h
+  ../verifiable/verifiable.cu
+)
+
+# Hipify common files (copy of source generated into hipify directory)
+#==================================================================================================
+find_program(hipify-perl_executable hipify-perl)
+set(HIPIFY_DIR "${CMAKE_CURRENT_BINARY_DIR}/hipify")
+set(SOURCE_DIR "${CMAKE_SOURCE_DIR}/src")
+
+## Loop over each common file to hipify
+foreach(COMMON_FILE ${COMMON_FILES})
+  # Check that file exists
+  if (NOT EXISTS ${SOURCE_DIR}/${COMMON_FILE})
+    message(FATAL_ERROR "Unable to find file listed in CMakeLists.txt: ${SOURCE_DIR}/${COMMON_FILE}")
+  endif()
+
+  # Establish hipified copy of the common file
+  get_filename_component(HIP_FILE_NAME ${HIPIFY_DIR}/${COMMON_FILE} NAME)
+  set(HIP_FILE "${HIPIFY_DIR}/${HIP_FILE_NAME}")
+
+  # Convert .cu files to .cpp so that they get processed properly
+  string(REPLACE "\.cu" "\.cu.cpp" HIP_FILE ${HIP_FILE})
+  list(APPEND HIP_COMMON_SOURCES ${HIP_FILE})
+
+  # Create a custom command to create hipified source code
+  add_custom_command(
+    OUTPUT ${HIP_FILE}
+    COMMAND mkdir -p ${HIPIFY_DIR} && $ ${hipify-perl_executable} -quiet-warnings ${SOURCE_DIR}/${COMMON_FILE} -o ${HIP_FILE}
+    MAIN_DEPENDENCY ${COMMON_FILE}
+    COMMENT "Hipifying ${COMMON_FILE} -> ${HIP_FILE}"
+  )
+endforeach()
+
+# Compile common object library
+#==================================================================================================
+add_custom_target(hipify DEPENDS ${HIP_COMMON_SOURCES})
+add_library(rccl_common OBJECT ${HIP_COMMON_SOURCES})
+add_dependencies(rccl_common hipify)
+target_link_libraries(rccl_common roc::rccl hip::device)
+if(USE_MPI)
+    target_link_libraries(rccl_common MPI::MPI_CXX)
+endif()
+
+# Compile tests
+#==================================================================================================
 add_rccl_test(all_gather)
 add_rccl_test(all_reduce)
 add_rccl_test(alltoall)
@@ -1,6 +1,6 @@
 #
 # Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
-# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+# Modifications are Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -41,6 +41,9 @@ endif
 .PHONY: build clean

 BUILDDIR ?= ../build
+HIPIFY_DIR ?= $(BUILDDIR)/hipify
+
+.PRECIOUS: $(HIPIFY_DIR)/%.cu.cpp $(HIPIFY_DIR)/%.h

 ifeq ($(MPI), 1)
 HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
@@ -68,11 +71,21 @@ TEST_VERIFIABLE_SRCDIR := ../verifiable
 TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable
 include ../verifiable/verifiable.mk

-${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
+${HIPIFY_DIR}/%.cu.cpp: %.cu
+	@printf "Hipifying  %-35s > %s\n" $< $@
+	@mkdir -p ${HIPIFY_DIR}
+	hipify-perl -quiet-warnings $< > $@
+
+${HIPIFY_DIR}/%.h: %.h
+	@printf "Hipifying  %-35s > %s\n" $< $@
+	@mkdir -p ${HIPIFY_DIR}
+	hipify-perl -quiet-warnings $< > $@
+
+${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cu.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS)
 	@printf "Compiling  %-35s > %s\n" $< $@
 	@mkdir -p ${DST_DIR}
-	echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
-	$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<
+	echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -I. -c $<"
+	$(HIPCC) -o $@ $(HIPCUFLAGS) -I. -c $<

 ${DST_DIR}/timer.o: timer.cc timer.h
 	@printf "Compiling  %-35s > %s\n" $< $@
@@ -5,7 +5,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"

 #define ALIGN 4
@@ -25,15 +25,15 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
  int nranks = args->nProcs*args->nThreads*args->nGpus;

  for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
    void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
    TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
    for (int j=0; j<nranks; j++) {
      TESTCHECK(InitData((char*)args->expected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
    }
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
  }
  return testSuccess;
 }
@@ -46,7 +46,7 @@ void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, doubl
  *busBw = baseBw * factor;
 }

-testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
  return testSuccess;
 }
@@ -5,7 +5,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"

 void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -22,13 +22,13 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc
  int nranks = args->nProcs*args->nThreads*args->nGpus;

  for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
    TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
    TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
  }
  return testSuccess;
 }
@@ -41,7 +41,7 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl
  *busBw = baseBw * factor;
 }

-testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
  return testSuccess;
 }
@@ -5,7 +5,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"

 void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -22,16 +22,16 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, nccl
  int nranks = args->nProcs*args->nThreads*args->nGpus;

  for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
    TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
    for (int j=0; j<nranks; j++) {
      size_t partcount = sendcount/nranks;
      TESTCHECK(InitData((char*)args->expected[i] + j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0));
    }
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
  }
  // We don't support in-place alltoall
  args->reportErrors = in_place ? 0 : 1;
@@ -46,7 +46,7 @@ void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double
  *busBw = baseBw * factor;
 }

-testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream));
  return testSuccess;
 }
@@ -5,7 +5,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"

 #define USE_RCCL_GATHER_SCATTER
@@ -32,15 +32,15 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
  int nranks = args->nProcs*args->nThreads*args->nGpus;

  for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
    TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0));

 #if 0
    int *dataHost = (int *)malloc(args->sendBytes);
-    hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost);
+    cudaMemcpy(dataHost, data, args->sendBytes, cudaMemcpyDeviceToHost);
    printf(" Rank [%d] Original: ", rank);
    for(int j=0; j<sendcount; j++) {
 	    printf("%d:%d ", j, dataHost[j]);
@@ -68,7 +68,7 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
      TESTCHECK(InitData(((char*)args->expected[i])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0));
      rdisp += rcount;
    }
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
  }
  // We don't support in-place alltoall
  args->reportErrors = in_place ? 0 : 1;
@@ -83,7 +83,7 @@ void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, doubl
  *busBw = baseBw * factor;
 }

-testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  int nranks;
  NCCLCHECK(ncclCommCount(comm, &nranks));
  int rank;
@@ -5,7 +5,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"

 void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -21,13 +21,13 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc
  size_t recvcount = args->expectedBytes / wordSize(type);

  for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
    if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
    TESTCHECK(InitData(args->expected[i], recvcount, 0, type, ncclSum, rep, 1, 0));
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
  }
  return testSuccess;
 }
@@ -40,7 +40,7 @@ void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, doubl
  *busBw = baseBw * factor;
 }

-testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  int rank;
  NCCLCHECK(ncclCommUserRank(comm, &rank));
 #if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
@@ -6,7 +6,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "hip/hip_runtime.h"
+#include "cuda_runtime.h"
 #include "rccl_bfloat16.h"
 #include "common.h"
 #include <pthread.h>
@@ -14,10 +14,11 @@
 #include <type_traits>
 #include <getopt.h>
 #include <libgen.h>
+#include "cuda.h"

 //#define DEBUG_PRINT

-#include "../verifiable/verifiable.h"
+#include "verifiable.h"

 int test_ncclVersion = 0; // init'd with ncclGetVersion()

@@ -165,18 +166,18 @@ static bool minReqVersion(int rmajor, int rminor, int rpatch)
 }

 testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) {
-  ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, hipStreamDefault);
-  HIPCHECK(hipDeviceSynchronize());
+  ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault);
+  CUDACHECK(cudaDeviceSynchronize());
  return testSuccess;
 }

 testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) {
-  ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, hipStreamDefault);
+  ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault);
  return testSuccess;
 }

 testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) {
-  ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, hipStreamDefault);
+  ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault);
  return testSuccess;
 }

@@ -271,11 +272,11 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
  size_t count = args->expectedBytes/wordSize(type);

  int64_t *wrongPerGpu = nullptr;
-  HIPCHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), hipHostMallocMapped));
+  CUDACHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), cudaHostAllocMapped));
  
  for (int i=0; i<args->nGpus; i++) {
    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
    void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];

    TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i));
@@ -286,8 +287,8 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
      char *expectedHost = (char*)malloc(args->expectedBytes);
      char *dataHost = (char*)malloc(args->expectedBytes);
      int eltsz = wordSize(type);
-      hipMemcpy(expectedHost, args->expected[i], args->expectedBytes, hipMemcpyDeviceToHost);
-      hipMemcpy(dataHost, data, args->expectedBytes, hipMemcpyDeviceToHost);
+      cudaMemcpy(expectedHost, args->expected[i], args->expectedBytes, cudaMemcpyDeviceToHost);
+      cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost);

      for(int j=0; j<args->expectedBytes/eltsz; j++) {
        unsigned long long want, got;
@@ -307,14 +308,14 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t

  *wrongElts = 0;
  for (int i=0; i < args->nGpus; i++) *wrongElts += wrongPerGpu[i];
-  hipHostFree(wrongPerGpu);
+  cudaFreeHost(wrongPerGpu);

  if (args->reportErrors && *wrongElts) args->errors[0]++;
  return testSuccess;
 }
    
-testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* comms) {
-  hipError_t hipErr;
+testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) {
+  cudaError_t cudaErr;
  int remaining = ngpus;
  int* done = (int*)malloc(sizeof(int)*ngpus);
  memset(done, 0, sizeof(int)*ngpus);
@@ -325,15 +326,15 @@ testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t*
   for (int i=0; i<ngpus; i++) {
     if (done[i]) continue;

-     hipErr = hipStreamQuery(streams[i]);
-     if (hipErr == hipSuccess) {
+     cudaErr = cudaStreamQuery(streams[i]);
+     if (cudaErr == cudaSuccess) {
       done[i] = 1;
       remaining--;
       idle = 0;
       continue;
     }

-     if (hipErr != hipErrorNotReady) HIPCHECK(hipErr);
+     if (cudaErr != cudaErrorNotReady) CUDACHECK(cudaErr);

 #if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
     if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) {
@@ -382,7 +383,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
  if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
  for (int i = 0; i < args->nGpus; i++) {
 #ifndef NCCL_MAJOR
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
 #endif
    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
    char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
@@ -463,16 +464,16 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
  Barrier(args);

 #if HIP_VERSION >= 50221310
-  hipGraph_t graphs[args->nGpus];
-  hipGraphExec_t graphExec[args->nGpus];
+  cudaGraph_t graphs[args->nGpus];
+  cudaGraphExec_t graphExec[args->nGpus];
  if (cudaGraphLaunches >= 1) {
    // Begin cuda graph capture
    for (int i=0; i<args->nGpus; i++) {
      // Thread local mdoe is needed for:
      // - Multi-thread mode: where graph capture and instantiation can happen concurrently across threads
      // - P2P pre-connect: when there is no warm-up, P2P pre-connect is done during graph capture.
-      //   Since pre-connect calls hipMalloc, we cannot use global capture mode
-      HIPCHECK(hipStreamBeginCapture(args->streams[i], hipStreamCaptureModeThreadLocal));
+      //   Since pre-connect calls cudaMalloc, we cannot use global capture mode
+      CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal));
    }
  }
 #endif
@@ -491,18 +492,18 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
  if (cudaGraphLaunches >= 1) {
    // End cuda graph capture
    for (int i=0; i<args->nGpus; i++) {
-      HIPCHECK(hipStreamEndCapture(args->streams[i], graphs+i));
+      CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
    }
    // Instantiate cuda graph
    for (int i=0; i<args->nGpus; i++) {
-      HIPCHECK(hipGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
+      CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
    }
    // Resync CPU, restart timing, launch cuda graph
    Barrier(args);
    tim.reset();
    for (int l=0; l<cudaGraphLaunches; l++) {
      for (int i=0; i<args->nGpus; i++) {
-        HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i]));
+        CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
      }
    }
  }
@@ -520,8 +521,8 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
  if (cudaGraphLaunches >= 1) {
    //destroy cuda graph
    for (int i=0; i<args->nGpus; i++) {
-      HIPCHECK(hipGraphExecDestroy(graphExec[i]));
-      HIPCHECK(hipGraphDestroy(graphs[i]));
+      CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
+      CUDACHECK(cudaGraphDestroy(graphs[i]));
    }
  }
 #endif
@@ -542,7 +543,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
      if (cudaGraphLaunches >= 1) {
        // Begin cuda graph capture for data check
        for (int i=0; i<args->nGpus; i++) {
-          HIPCHECK(hipStreamBeginCapture(args->streams[i], args->nThreads > 1 ? hipStreamCaptureModeThreadLocal : hipStreamCaptureModeGlobal));
+          CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal));
        }
      }
 #endif
@@ -554,15 +555,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
      if (cudaGraphLaunches >= 1) {
        // End cuda graph capture
        for (int i=0; i<args->nGpus; i++) {
-          HIPCHECK(hipStreamEndCapture(args->streams[i], graphs+i));
+          CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
        }
        // Instantiate cuda graph
        for (int i=0; i<args->nGpus; i++) {
-          HIPCHECK(hipGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
+          CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
        }
        // Launch cuda graph
        for (int i=0; i<args->nGpus; i++) {
-          HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i]));
+          CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
        }
      }
 #endif
@@ -573,8 +574,8 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
      if (cudaGraphLaunches >= 1) {
        //destroy cuda graph
        for (int i=0; i<args->nGpus; i++) {
-          HIPCHECK(hipGraphExecDestroy(graphExec[i]));
-          HIPCHECK(hipGraphDestroy(graphs[i]));
+          CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
+          CUDACHECK(cudaGraphDestroy(graphs[i]));
        }
      }
 #endif
@@ -664,7 +665,7 @@ testResult_t threadRunTests(struct threadArgs* args) {
  // Set device to the first of our GPUs. If we don't do that, some operations
  // will be done on the current GPU (by default : 0) and if the GPUs are in
  // exclusive mode those operations will fail.
-  HIPCHECK(hipSetDevice(args->gpus[0]));
+  CUDACHECK(cudaSetDevice(args->gpus[0]));
  TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
  return testSuccess;
 }
@@ -680,7 +681,7 @@ testResult_t threadInit(struct threadArgs* args) {
  NCCLCHECK(ncclGroupStart());
  for (int i=0; i<args->nGpus; i++) {
    int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
    NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
  }
  NCCLCHECK(ncclGroupEnd());
@@ -705,29 +706,29 @@ testResult_t threadLaunch(struct testThread* thread) {

 testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
  if (memorytype == ncclFine) {
-    HIPCHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
-    HIPCHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
-    if (datacheck) HIPCHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained));
+    CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
+    CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
+    if (datacheck) CUDACHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained));
  }
  else if (memorytype == ncclHost) {
-    HIPCHECK(hipHostMalloc(sendbuff, nbytes));
-    HIPCHECK(hipHostMalloc(recvbuff, nbytes));
-    if (datacheck) HIPCHECK(hipHostMalloc(expected, recvBytes));
+    CUDACHECK(hipHostMalloc(sendbuff, nbytes));
+    CUDACHECK(hipHostMalloc(recvbuff, nbytes));
+    if (datacheck) CUDACHECK(hipHostMalloc(expected, recvBytes));
  }
  else if (memorytype == ncclManaged) {
-    HIPCHECK(hipMallocManaged(sendbuff, nbytes));
-    HIPCHECK(hipMallocManaged(recvbuff, nbytes));
-    if (datacheck) HIPCHECK(hipMallocManaged(expected, recvBytes));
+    CUDACHECK(cudaMallocManaged(sendbuff, nbytes));
+    CUDACHECK(cudaMallocManaged(recvbuff, nbytes));
+    if (datacheck) CUDACHECK(cudaMallocManaged(expected, recvBytes));
 #if 0
-    HIPCHECK(hipMemset(*sendbuff, 0, nbytes));
-    HIPCHECK(hipMemset(*recvbuff, 0, nbytes));
-    if (datacheck) HIPCHECK(hipMemset(*expected, 0, recvBytes));
+    CUDACHECK(cudaMemset(*sendbuff, 0, nbytes));
+    CUDACHECK(cudaMemset(*recvbuff, 0, nbytes));
+    if (datacheck) CUDACHECK(cudaMemset(*expected, 0, recvBytes));
 #endif
  }
  else {
-    HIPCHECK(hipMalloc(sendbuff, nbytes));
-    HIPCHECK(hipMalloc(recvbuff, nbytes));
-    if (datacheck) HIPCHECK(hipMalloc(expected, recvBytes));
+    CUDACHECK(cudaMalloc(sendbuff, nbytes));
+    CUDACHECK(cudaMalloc(recvbuff, nbytes));
+    if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
  }
  return testSuccess;
 }
@@ -940,7 +941,7 @@ int main(int argc, char* argv[]) {
    }
  }

-  HIPCHECK(hipGetDeviceCount(&numDevices));
+  CUDACHECK(cudaGetDeviceCount(&numDevices));
 #ifndef MPI_SUPPORT
  if (nGpus > numDevices)
  {
@@ -1016,10 +1017,10 @@ testResult_t run() {
  for (int i=0; i<nThreads*nGpus; i++) {
    int cudaDev = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
    int rank = proc*nThreads*nGpus+i;
-    hipDeviceProp_t prop;
-    HIPCHECK(hipGetDeviceProperties(&prop, cudaDev));
+    cudaDeviceProp prop;
+    CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
    char busIdStr[] = "00000000:00:00.0";
-    HIPCHECK(hipDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev));
+    CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev));
    len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "#   Rank %2d Pid %6d on %10s device %2d [%s] %s\n",
      rank, getpid(), hostname, cudaDev, busIdStr, prop.name);
 	  maxMem = std::min(maxMem, prop.totalGlobalMem);
@@ -1055,7 +1056,7 @@ testResult_t run() {
 #endif

  int gpus[nGpus*nThreads];
-  hipStream_t streams[nGpus*nThreads];
+  cudaStream_t streams[nGpus*nThreads];
  void* sendbuffs[nGpus*nThreads];
  void* recvbuffs[nGpus*nThreads];
  void* expected[nGpus*nThreads];
@@ -1067,12 +1068,12 @@ testResult_t run() {
  gpu0 = envstr ? atoi(envstr) : -1;
  for (int i=0; i<nGpus*nThreads; i++) {
    gpus[i] = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
-    HIPCHECK(hipSetDevice(gpus[i]));
+    CUDACHECK(cudaSetDevice(gpus[i]));
    TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
    if (streamnull)
      streams[i] = NULL;
    else
-      HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
+      CUDACHECK(cudaStreamCreateWithFlags(streams+i, cudaStreamNonBlocking));
  }

  //if parallel init is not selected, use main thread to initialize NCCL
@@ -1083,7 +1084,7 @@ testResult_t run() {
     } else {
       NCCLCHECK(ncclGroupStart());
       for (int i=0; i<nGpus*nThreads; i++) {
-         HIPCHECK(hipSetDevice(gpus[i]));
+         CUDACHECK(cudaSetDevice(gpus[i]));
         NCCLCHECK(ncclCommInitRank(comms+i, ncclProcs*nThreads*nGpus, ncclId, ncclProc*nThreads*nGpus+i));
       }
       NCCLCHECK(ncclGroupEnd());
@@ -1093,7 +1094,7 @@ testResult_t run() {
  int errors[nThreads];
  double bw[nThreads];
  double* delta;
-  HIPCHECK(hipHostMalloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, hipHostMallocPortable | hipHostMallocMapped));
+  CUDACHECK(hipHostMalloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, cudaHostAllocPortable | cudaHostAllocMapped));
  int bw_count[nThreads];
  for (int t=0; t<nThreads; t++) {
    bw[t] = 0.0;
@@ -1178,11 +1179,11 @@ testResult_t run() {

  // Free off CUDA allocated memory
  for (int i=0; i<nGpus*nThreads; i++) {
-    if (sendbuffs[i]) HIPCHECK(hipFree((char*)sendbuffs[i]));
-    if (recvbuffs[i]) HIPCHECK(hipFree((char*)recvbuffs[i]));
-    if (datacheck) HIPCHECK(hipFree(expected[i]));
+    if (sendbuffs[i]) CUDACHECK(cudaFree((char*)sendbuffs[i]));
+    if (recvbuffs[i]) CUDACHECK(cudaFree((char*)recvbuffs[i]));
+    if (datacheck) CUDACHECK(cudaFree(expected[i]));
  }
-  HIPCHECK(hipHostFree(delta));
+  CUDACHECK(cudaFreeHost(delta));

  envstr = getenv("NCCL_TESTS_MIN_BW");
  double check_avg_bw = envstr ? atof(envstr) : -1;
@@ -1197,9 +1198,9 @@ testResult_t run() {
  MPI_Finalize();
 #endif

-  // 'hip-memcheck --leak-check full' requires this
+  // 'cuda-memcheck --leak-check full' requires this
  PRINT("%s\n", ncclGetLastError(NULL));
-  hipDeviceReset();
+  cudaDeviceReset();

  if (errors[0] || bw[0] < check_avg_bw*(0.9))
    exit(EXIT_FAILURE);
@@ -21,14 +21,14 @@
 // For nccl.h < 2.13 since we define a weak fallback
 extern "C" char const* ncclGetLastError(ncclComm_t comm);

-#define HIPCHECK(cmd) do {                          \
-  hipError_t e = cmd;                               \
-  if( e != hipSuccess ) {                           \
+#define CUDACHECK(cmd) do {                         \
+  cudaError_t err = cmd;                            \
+  if( err != cudaSuccess ) {                        \
    char hostname[1024];                            \
    getHostName(hostname, 1024);                    \
-    printf("%s: Test HIP failure %s:%d '%s'\n",     \
+    printf("%s: Test CUDA failure %s:%d '%s'\n",    \
         hostname,                                  \
-        __FILE__,__LINE__,hipGetErrorString(e));    \
+        __FILE__,__LINE__,cudaGetErrorString(err)); \
    return testCudaError;                           \
  }                                                 \
 } while(0)
@@ -93,7 +93,7 @@ struct testColl {
      ncclRedOp_t op, int root, int rep, int in_place);
  void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
  testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
-      ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
+      ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
 };
 extern struct testColl allReduceTest;
 extern struct testColl allGatherTest;
@@ -133,7 +133,7 @@ struct threadArgs {
  size_t recvInplaceOffset;
  ncclUniqueId ncclId;
  ncclComm_t* comms;
-  hipStream_t* streams;
+  cudaStream_t* streams;

  void** expected;
  size_t expectedBytes;
@@ -5,7 +5,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "hip/hip_runtime.h"
+#include "cuda_runtime.h"
 #include "common.h"

 void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -22,16 +22,16 @@ testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
  int nranks = args->nProcs*args->nThreads*args->nGpus;

  for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
    void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
    TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
-    HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault));
+    CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
    if (rank == root) {
      TESTCHECK(InitData(args->expected[i], nranks*sendcount, 0, type, ncclSum, rep, 1, 0));
    }
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
  }
  return testSuccess;
 }
@@ -44,7 +44,7 @@ void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double*
  *busBw = baseBw * factor;
 }

-testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  int nRanks;
  NCCLCHECK(ncclCommCount(comm, &nRanks));
  int rank;
@@ -5,7 +5,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include "hip/hip_runtime.h"
+#include "cuda_runtime.h"
 #include "common.h"

 #define ALIGN 4
@@ -25,15 +25,15 @@ testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncc
  int nranks = args->nProcs*args->nThreads*args->nGpus;

  for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
    void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
    TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
    for (int j=0; j<nranks; j++) {
      TESTCHECK(InitData((char*)args->expected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
    }
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
  }
  return testSuccess;
 }
@@ -46,7 +46,7 @@ void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, doubl
  *busBw = baseBw * factor;
 }

-testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  char* sbuff = (char*)sendbuff;
  char* rbuff = (char*)recvbuff;
  int nRanks;
@@ -54,7 +54,7 @@ testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
  int rank;
  NCCLCHECK(ncclCommUserRank(comm, &rank));
  size_t rankSize = count * wordSize(type);
-  if (rbuff+rank*rankSize != sbuff) HIPCHECK(hipMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, hipMemcpyDeviceToDevice, stream));
+  if (rbuff+rank*rankSize != sbuff) CUDACHECK(cudaMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, cudaMemcpyDeviceToDevice, stream));

  // Hypercube AllGather
  for (int mask=1; mask<nRanks; mask<<=1) {
@@ -1,6 +1,5 @@
 /*************************************************************************
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -21,28 +20,28 @@ static ncclResult_t ncclGroupEnd() { return ncclSuccess; }
 #define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument;

 static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
-    ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  CHECKCOUNT(count);
  return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream);
 }
 static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream) {
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
  CHECKCOUNT(count);
  return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream);
 }
 static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, hipStream_t stream) {
+    ncclComm_t comm, cudaStream_t stream) {
  CHECKCOUNT(count);
  return ncclBcast(buff, (int)count, datatype, root, comm, stream);
 }
 static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
-    hipStream_t stream) {
+    cudaStream_t stream) {
  CHECKCOUNT(recvcount);
  return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream);
 }
 static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
-    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
  CHECKCOUNT(sendcount);
  return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream);
 }
@@ -5,7 +5,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"

 void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -22,14 +22,14 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
  int nranks = args->nProcs*args->nThreads*args->nGpus;

  for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
    TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
-    HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault));
+    CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
    if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
  }
  return testSuccess;
 }
@@ -40,7 +40,7 @@ void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double*
  *busBw = baseBw;
 }

-testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));
  return testSuccess;
 }
@@ -5,7 +5,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"

 #define ALIGN 4
@@ -25,14 +25,14 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type,
  int nranks = args->nProcs*args->nThreads*args->nGpus;

  for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
    TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
-    HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault));
+    CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
    TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks));
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
  }
  return testSuccess;
 }
@@ -45,7 +45,7 @@ void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, d
  *busBw = baseBw * factor;
 }

-testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
  return testSuccess;
 }
@@ -5,7 +5,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"

 void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -21,13 +21,13 @@ testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclR
  size_t recvcount = args->expectedBytes / wordSize(type);

  for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
    if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
    TESTCHECK(InitData(args->expected[i], recvcount, rank*recvcount, type, ncclSum, rep, 1, 0));
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
  }
  return testSuccess;
 }
@@ -40,7 +40,7 @@ void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double*
  *busBw = baseBw * factor;
 }

-testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  int nRanks;
  NCCLCHECK(ncclCommCount(comm, &nRanks));
  int rank;
@@ -5,7 +5,7 @@
 * See LICENSE.txt for license information
 ************************************************************************/

-#include <hip/hip_runtime.h>
+#include "cuda_runtime.h"
 #include "common.h"

 void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -22,14 +22,14 @@ testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, nccl
  int nranks = args->nProcs*args->nThreads*args->nGpus;

  for (int i=0; i<args->nGpus; i++) {
-    HIPCHECK(hipSetDevice(args->gpus[i]));
+    CUDACHECK(cudaSetDevice(args->gpus[i]));
    int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
-    HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
+    CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
    void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
    TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
    int peer = (rank-1+nranks)%nranks;
    TESTCHECK(InitData(args->expected[i], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0));
-    HIPCHECK(hipDeviceSynchronize());
+    CUDACHECK(cudaDeviceSynchronize());
  }
  // We don't support in-place sendrecv
  args->reportErrors = in_place ? 0 : 1;
@@ -44,7 +44,7 @@ void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double
  *busBw = baseBw * factor;
 }

-testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
+testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
  int nRanks;
  NCCLCHECK(ncclCommCount(comm, &nRanks));
  int rank;
@@ -24,11 +24,11 @@
 #include <cstdio>
 #include <cstdint>
 #include <hip/hip_bfloat16.h>
-#include <hip/hip_fp16.h>
+#include <cuda_fp16.h>

 using std::uint64_t;
 using std::uint32_t;
-using bfloat16 = hip_bfloat16;
+using bfloat16 = __nv_bfloat16;

 template<typename T>
 struct float_traits;
@@ -182,14 +182,14 @@ __global__ void kernel() {
 int main() {
  std::printf("type=float:\n");
  kernel<float><<<1,32>>>();
-  hipDeviceSynchronize();
+  cudaDeviceSynchronize();

  std::printf("\ntype=half:\n");
  kernel<half><<<1,32>>>();
-  hipDeviceSynchronize();
+  cudaDeviceSynchronize();

  std::printf("\ntype=bfloat16:\n");
  kernel<bfloat16><<<1,32>>>();
-  hipDeviceSynchronize();
+  cudaDeviceSynchronize();
  return 0;
 }
@@ -8,8 +8,8 @@
 //#pragma nv_diag_suppress declared_but_not_referenced

 #include "verifiable.h"
-#include <hip/hip_runtime.h>
-#include <hip/hip_fp16.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
 #include <hip/hip_bfloat16.h>

 #include "rccl/rccl.h"
@@ -91,7 +91,7 @@ template<>
 struct IsIntegral<__half>: std::false_type {};
 #if RCCL_BFLOAT16 == 1
 template<>
-struct IsIntegral<hip_bfloat16>: std::false_type {};
+struct IsIntegral<__nv_bfloat16>: std::false_type {};
 #endif
 }

@@ -126,7 +126,7 @@ namespace {
  }
  #if RCCL_BFLOAT16 == 1
  template<>
-  __host__ __device__ hip_bfloat16 castTo<hip_bfloat16>(float x) {
+  __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(float x) {
    return hip_bfloat16(x);
  }
  #endif
@@ -153,7 +153,7 @@ struct ReduceSum {
      return __float2half(__half2float(a) + __half2float(b));
  }
  #if RCCL_BFLOAT16 == 1
-  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
      return hip_bfloat16(static_cast<float>(a) + static_cast<float>(b));
  }
  #endif
@@ -169,7 +169,7 @@ struct ReduceProd {
      return __float2half(__half2float(a) * __half2float(b));
  }
  #if RCCL_BFLOAT16 == 1
-  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
      return hip_bfloat16(static_cast<float>(a) * static_cast<float>(b));
  }
  #endif
@@ -185,7 +185,7 @@ struct ReduceMin {
    return __half2float(a) < __half2float(b) ? a : b;
  }
  #if RCCL_BFLOAT16 == 1
-  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
      return static_cast<float>(a) < static_cast<float>(b) ? a : b;
  }
  #endif
@@ -201,7 +201,7 @@ struct ReduceMax {
      return __half2float(a) > __half2float(b) ? a : b;
  }
  #if RCCL_BFLOAT16 == 1
-  __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
+  __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
      return static_cast<float>(a) > static_cast<float>(b) ? a : b;
  }
  #endif
@@ -280,7 +280,7 @@ struct FloatLayout<__half> {
 };
 #if RCCL_BFLOAT16 == 1
 template<>
-struct FloatLayout<hip_bfloat16> {
+struct FloatLayout<__nv_bfloat16> {
  static constexpr int exponent_bits = 8, mantissa_bits = 7;
  static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
 };
@@ -801,7 +801,7 @@ __global__ void prepareInput2(
 template<typename ReduceOp>
 void prepareInput1(
    void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me,
-    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
+    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
  ) {
  int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
  #define CASE_TY(T) prepareInput2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break;
@@ -814,7 +814,7 @@ void prepareInput1(
  case ncclUint64: CASE_TY(uint64_t)
  case ncclFloat16: CASE_TY(__half)
  #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(hip_bfloat16)
+  case ncclBfloat16: CASE_TY(__nv_bfloat16)
  #endif
  case ncclFloat32: CASE_TY(float)
  case ncclFloat64: CASE_TY(double)
@@ -826,7 +826,7 @@ void prepareInput1(

 void ncclVerifiablePrepareInput(
    void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
-    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
+    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
  ) {
  #define CASE_OP(op) \
    if(rank_n == 1) \
@@ -877,7 +877,7 @@ __global__ void prepareExpected2(
 template<typename ReduceOp>
 void prepareExpected1(
    void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
-    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
+    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
  ) {
  int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
  #define CASE_TY(T) prepareExpected2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break;
@@ -890,7 +890,7 @@ void prepareExpected1(
  case ncclUint64: CASE_TY(uint64_t)
  case ncclFloat16: CASE_TY(__half)
  #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(hip_bfloat16)
+  case ncclBfloat16: CASE_TY(__nv_bfloat16)
  #endif
  case ncclFloat32: CASE_TY(float)
  case ncclFloat64: CASE_TY(double)
@@ -902,7 +902,7 @@ void prepareExpected1(

 void ncclVerifiablePrepareExpected(
    void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
-    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
+    uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
  ) {
  #define CASE_OP(op) \
    if(rank_n == 1) \
@@ -1051,7 +1051,7 @@ __global__ void verifyInline2(
 template<typename T, typename Uint>
 void verifyInline1(
    T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
-    unsigned tolerance, int64_t *bad_elt_n, hipStream_t stream, int block_n
+    unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n
  ) {
  #define CASE_OP(op) \
    if(rank_n == 1) \
@@ -1080,7 +1080,7 @@ void verifyInline1(
 void ncclVerifiableVerify(
    void const *results, void const *expected, intptr_t elt_n, int elt_ty,
    int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
-    int64_t *bad_elt_n, hipStream_t stream
+    int64_t *bad_elt_n, cudaStream_t stream
  ) {
  bool floating = elt_ty == ncclFloat16 || elt_ty == ncclFloat32 || elt_ty == ncclFloat64;
  #if HAVE_ncclBfloat16
@@ -1112,7 +1112,7 @@ void ncclVerifiableVerify(
  case ncclUint64: CASE_TY(uint64_t, uint64_t)
  case ncclFloat16: CASE_TY(__half, uint16_t)
  #if HAVE_ncclBfloat16
-  case ncclBfloat16: CASE_TY(hip_bfloat16, uint16_t)
+  case ncclBfloat16: CASE_TY(__nv_bfloat16, uint16_t)
  #endif
  case ncclFloat32: CASE_TY(float, uint32_t)
  case ncclFloat64: CASE_TY(double, uint64_t)
@@ -1179,7 +1179,7 @@ __global__ void sweep() {
  sweep1<uint64_t>(ncclUint64, "uint64");
  sweep1<__half>(ncclFloat16, "half");
  #if HAVE_ncclBfloat16
-    sweep1<hip_bfloat16>(ncclBfloat16, "bfloat16");
+    sweep1<__nv_bfloat16>(ncclBfloat16, "bfloat16");
  #endif
  sweep1<float>(ncclFloat32, "float");
  sweep1<double>(ncclFloat64, "double");
@@ -1187,9 +1187,9 @@ __global__ void sweep() {

 int main(int arg_n, char **args) {
  std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
-  hipSetDevice(0);
+  cudaSetDevice(0);
  sweep<<<1,512>>>();
-  hipDeviceSynchronize();
+  cudaDeviceSynchronize();
  return 0;
 }
 #endif
@@ -8,7 +8,7 @@
 #ifndef _d41d8cd98f00b204e9800998ecf8427e
 #define _d41d8cd98f00b204e9800998ecf8427e

-#include <hip/hip_runtime.h>
+#include <cuda_runtime.h>

 #include <stdint.h>

@@ -43,13 +43,13 @@ __host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
 // Enqueue kernel to generate data which is to be reduced.
 void ncclVerifiablePrepareInput(
  void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
-  uint64_t seed, intptr_t elt_ix0, hipStream_t stream
+  uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
 );

 // Enqueue kernel to generate expected results of reduction.
 void ncclVerifiablePrepareExpected(
  void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
-  uint64_t seed, intptr_t elt_ix0, hipStream_t stream
+  uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
 );

 // Enqueue kernel to verify reduced data matches expectation. The number of
@@ -61,6 +61,6 @@ void ncclVerifiablePrepareExpected(
 void ncclVerifiableVerify(
  void const *results, void const *expected, intptr_t elt_n, int elt_ty,
  int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
-  int64_t *bad_elt_n, hipStream_t stream
+  int64_t *bad_elt_n, cudaStream_t stream
 );
 #endif
@@ -1,5 +1,5 @@
 # Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
-# Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
+# Modifications Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved.
 #
 # See LICENSE.txt for license information

@@ -8,11 +8,21 @@
 # TEST_VERIFIABLE_SRCDIR = <points to this directory>
 # TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>

-TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
-TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
+TEST_VERIFIABLE_HDRS      = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
+TEST_VERIFIABLE_OBJS      = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o

-$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFY_REDUCE_HDRS)
+${HIPIFY_DIR}/verifiable.cu.cpp: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
+	@printf "Hipifying  %-35s > %s\n" $< $@
+	@mkdir -p ${HIPIFY_DIR}
+	hipify-perl -quiet-warnings $< > $@
+
+${HIPIFY_DIR}/verifiable.h: $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
+	@printf "Hipifying  %-35s > %s\n" $< $@
+	@mkdir -p ${HIPIFY_DIR}
+	hipify-perl -quiet-warnings $< > $@
+
+$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(HIPIFY_DIR)/verifiable.cu.cpp $(HIPIFY_DIR)/verifiable.h
 	@printf "Compiling %s\n" $@
 	@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
-	echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu"
-	$(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
+	echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
+	$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<