diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 41d312855f..fb4dc7d5b7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,37 +1,33 @@ # ######################################################################## -# Copyright 2022 Advanced Micro Devices, Inc. +# Copyright 2022-2024 Advanced Micro Devices, Inc. # ######################################################################## -# Compile common object library -set_property(SOURCE common.cu timer.cc ../verifiable/verifiable.cu PROPERTY LANGUAGE CXX) -add_library(rccl_common OBJECT common.cu timer.cc ../verifiable/verifiable.cu) -target_link_libraries(rccl_common roc::rccl hip::device) -if(USE_MPI) - target_link_libraries(rccl_common MPI::MPI_CXX) -endif() - -function(add_relative_test test_name test_target) - get_target_property(EXE_PATH ${test_target} RUNTIME_OUTPUT_DIRECTORY) - if(EXE_PATH STREQUAL "EXE_PATH-NOTFOUND") - set(EXE_PATH ".") - endif() - get_filename_component(EXE_PATH "${EXE_PATH}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}") - get_target_property(EXE_NAME ${test_target} RUNTIME_OUTPUT_NAME) - if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND") - get_target_property(EXE_NAME ${test_target} OUTPUT_NAME) - if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND") - set(EXE_NAME "${test_target}") - endif() - endif() - file(RELATIVE_PATH rel_path "${CMAKE_CURRENT_BINARY_DIR}" "${EXE_PATH}/${EXE_NAME}") - add_test(NAME "${test_name}" COMMAND "./${rel_path}") -endfunction() - function(add_rccl_test TEST) set(TEST_SOURCE "${TEST}.cu") set_property(SOURCE ${TEST_SOURCE} PROPERTY LANGUAGE CXX) + + # Check that file exists + if (NOT EXISTS ${SOURCE_DIR}/${TEST_SOURCE}) + message(FATAL_ERROR "Unable to find file listed in CMakeLists.txt: ${SOURCE_DIR}/${TEST_SOURCE}") + endif() + + # Establish hipified copy of the source file + set(HIP_FILE "${HIPIFY_DIR}/${TEST_SOURCE}") + get_filename_component(HIP_FILE_DIR ${HIP_FILE} DIRECTORY) + + # Convert .cu files to .cpp so that they get processed properly + string(REPLACE "\.cu" "\.cu.cpp" HIP_FILE ${HIP_FILE}) + + # Create a custom command to create hipified source code + add_custom_command( + OUTPUT ${HIP_FILE} + COMMAND mkdir -p ${HIP_FILE_DIR} && $ ${hipify-perl_executable} -quiet-warnings ${SOURCE_DIR}/${TEST_SOURCE} -o ${HIP_FILE} + MAIN_DEPENDENCY ${TEST_SOURCE} + COMMENT "Hipifying ${TEST_SOURCE} -> ${HIP_FILE}" + ) + set(TEST_TARGET "${TEST}_perf") - add_executable(${TEST_TARGET} ${TEST_SOURCE}) + add_executable(${TEST_TARGET} ${HIP_FILE}) target_link_libraries( ${TEST_TARGET} PRIVATE @@ -52,6 +48,78 @@ function(add_rccl_test TEST) ) endfunction() +function(add_relative_test test_name test_target) + get_target_property(EXE_PATH ${test_target} RUNTIME_OUTPUT_DIRECTORY) + if(EXE_PATH STREQUAL "EXE_PATH-NOTFOUND") + set(EXE_PATH ".") + endif() + get_filename_component(EXE_PATH "${EXE_PATH}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}") + get_target_property(EXE_NAME ${test_target} RUNTIME_OUTPUT_NAME) + if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND") + get_target_property(EXE_NAME ${test_target} OUTPUT_NAME) + if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND") + set(EXE_NAME "${test_target}") + endif() + endif() + file(RELATIVE_PATH rel_path "${CMAKE_CURRENT_BINARY_DIR}" "${EXE_PATH}/${EXE_NAME}") + add_test(NAME "${test_name}" COMMAND "./${rel_path}") +endfunction() + +# Collect list of common source files +#================================================================================================== +set(COMMON_FILES + common.h + common.cu + nccl1_compat.h + rccl_bfloat16.h + timer.h + timer.cc + ../verifiable/verifiable.h + ../verifiable/verifiable.cu +) + +# Hipify common files (copy of source generated into hipify directory) +#================================================================================================== +find_program(hipify-perl_executable hipify-perl) +set(HIPIFY_DIR "${CMAKE_CURRENT_BINARY_DIR}/hipify") +set(SOURCE_DIR "${CMAKE_SOURCE_DIR}/src") + +## Loop over each common file to hipify +foreach(COMMON_FILE ${COMMON_FILES}) + # Check that file exists + if (NOT EXISTS ${SOURCE_DIR}/${COMMON_FILE}) + message(FATAL_ERROR "Unable to find file listed in CMakeLists.txt: ${SOURCE_DIR}/${COMMON_FILE}") + endif() + + # Establish hipified copy of the common file + get_filename_component(HIP_FILE_NAME ${HIPIFY_DIR}/${COMMON_FILE} NAME) + set(HIP_FILE "${HIPIFY_DIR}/${HIP_FILE_NAME}") + + # Convert .cu files to .cpp so that they get processed properly + string(REPLACE "\.cu" "\.cu.cpp" HIP_FILE ${HIP_FILE}) + list(APPEND HIP_COMMON_SOURCES ${HIP_FILE}) + + # Create a custom command to create hipified source code + add_custom_command( + OUTPUT ${HIP_FILE} + COMMAND mkdir -p ${HIPIFY_DIR} && $ ${hipify-perl_executable} -quiet-warnings ${SOURCE_DIR}/${COMMON_FILE} -o ${HIP_FILE} + MAIN_DEPENDENCY ${COMMON_FILE} + COMMENT "Hipifying ${COMMON_FILE} -> ${HIP_FILE}" + ) +endforeach() + +# Compile common object library +#================================================================================================== +add_custom_target(hipify DEPENDS ${HIP_COMMON_SOURCES}) +add_library(rccl_common OBJECT ${HIP_COMMON_SOURCES}) +add_dependencies(rccl_common hipify) +target_link_libraries(rccl_common roc::rccl hip::device) +if(USE_MPI) + target_link_libraries(rccl_common MPI::MPI_CXX) +endif() + +# Compile tests +#================================================================================================== add_rccl_test(all_gather) add_rccl_test(all_reduce) add_rccl_test(alltoall) diff --git a/src/Makefile b/src/Makefile index 42daba2706..00a17b56a8 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,6 @@ # # Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. -# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. +# Modifications are Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. # # See LICENSE.txt for license information # @@ -41,6 +41,9 @@ endif .PHONY: build clean BUILDDIR ?= ../build +HIPIFY_DIR ?= $(BUILDDIR)/hipify + +.PRECIOUS: $(HIPIFY_DIR)/%.cu.cpp $(HIPIFY_DIR)/%.h ifeq ($(MPI), 1) HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi @@ -68,11 +71,21 @@ TEST_VERIFIABLE_SRCDIR := ../verifiable TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable include ../verifiable/verifiable.mk -${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS) +${HIPIFY_DIR}/%.cu.cpp: %.cu + @printf "Hipifying %-35s > %s\n" $< $@ + @mkdir -p ${HIPIFY_DIR} + hipify-perl -quiet-warnings $< > $@ + +${HIPIFY_DIR}/%.h: %.h + @printf "Hipifying %-35s > %s\n" $< $@ + @mkdir -p ${HIPIFY_DIR} + hipify-perl -quiet-warnings $< > $@ + +${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cu.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS) @printf "Compiling %-35s > %s\n" $< $@ @mkdir -p ${DST_DIR} - echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<" - $(HIPCC) -o $@ $(HIPCUFLAGS) -c $< + echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -I. -c $<" + $(HIPCC) -o $@ $(HIPCUFLAGS) -I. -c $< ${DST_DIR}/timer.o: timer.cc timer.h @printf "Compiling %-35s > %s\n" $< $@ diff --git a/src/all_gather.cu b/src/all_gather.cu index f18ce0cb65..7efc8f2c5e 100644 --- a/src/all_gather.cu +++ b/src/all_gather.cu @@ -5,7 +5,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include +#include "cuda_runtime.h" #include "common.h" #define ALIGN 4 @@ -25,15 +25,15 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc int nranks = args->nProcs*args->nThreads*args->nGpus; for (int i=0; inGpus; i++) { - HIPCHECK(hipSetDevice(args->gpus[i])); + CUDACHECK(cudaSetDevice(args->gpus[i])); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); for (int j=0; jexpected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0)); } - HIPCHECK(hipDeviceSynchronize()); + CUDACHECK(cudaDeviceSynchronize()); } return testSuccess; } @@ -46,7 +46,7 @@ void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { +testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream)); return testSuccess; } diff --git a/src/all_reduce.cu b/src/all_reduce.cu index de03a206ff..21e3ce3a9e 100644 --- a/src/all_reduce.cu +++ b/src/all_reduce.cu @@ -5,7 +5,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include +#include "cuda_runtime.h" #include "common.h" void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { @@ -22,13 +22,13 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc int nranks = args->nProcs*args->nThreads*args->nGpus; for (int i=0; inGpus; i++) { - HIPCHECK(hipSetDevice(args->gpus[i])); + CUDACHECK(cudaSetDevice(args->gpus[i])); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank)); TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); - HIPCHECK(hipDeviceSynchronize()); + CUDACHECK(cudaDeviceSynchronize()); } return testSuccess; } @@ -41,7 +41,7 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { +testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); return testSuccess; } diff --git a/src/alltoall.cu b/src/alltoall.cu index acfeb7d8ee..2773223dc9 100644 --- a/src/alltoall.cu +++ b/src/alltoall.cu @@ -5,7 +5,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include +#include "cuda_runtime.h" #include "common.h" void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { @@ -22,16 +22,16 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, nccl int nranks = args->nProcs*args->nThreads*args->nGpus; for (int i=0; inGpus; i++) { - HIPCHECK(hipSetDevice(args->gpus[i])); + CUDACHECK(cudaSetDevice(args->gpus[i])); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); for (int j=0; jexpected[i] + j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0)); } - HIPCHECK(hipDeviceSynchronize()); + CUDACHECK(cudaDeviceSynchronize()); } // We don't support in-place alltoall args->reportErrors = in_place ? 0 : 1; @@ -46,7 +46,7 @@ void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double *busBw = baseBw * factor; } -testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { +testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream)); return testSuccess; } diff --git a/src/alltoallv.cu b/src/alltoallv.cu index 73b53d20c3..5bab3071b1 100644 --- a/src/alltoallv.cu +++ b/src/alltoallv.cu @@ -5,7 +5,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include +#include "cuda_runtime.h" #include "common.h" #define USE_RCCL_GATHER_SCATTER @@ -32,15 +32,15 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc int nranks = args->nProcs*args->nThreads*args->nGpus; for (int i=0; inGpus; i++) { - HIPCHECK(hipSetDevice(args->gpus[i])); + CUDACHECK(cudaSetDevice(args->gpus[i])); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0)); #if 0 int *dataHost = (int *)malloc(args->sendBytes); - hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost); + cudaMemcpy(dataHost, data, args->sendBytes, cudaMemcpyDeviceToHost); printf(" Rank [%d] Original: ", rank); for(int j=0; jexpected[i])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0)); rdisp += rcount; } - HIPCHECK(hipDeviceSynchronize()); + CUDACHECK(cudaDeviceSynchronize()); } // We don't support in-place alltoall args->reportErrors = in_place ? 0 : 1; @@ -83,7 +83,7 @@ void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { +testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { int nranks; NCCLCHECK(ncclCommCount(comm, &nranks)); int rank; diff --git a/src/broadcast.cu b/src/broadcast.cu index 5cd6147f10..9157c4c0c0 100644 --- a/src/broadcast.cu +++ b/src/broadcast.cu @@ -5,7 +5,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include +#include "cuda_runtime.h" #include "common.h" void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { @@ -21,13 +21,13 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc size_t recvcount = args->expectedBytes / wordSize(type); for (int i=0; inGpus; i++) { - HIPCHECK(hipSetDevice(args->gpus[i])); + CUDACHECK(cudaSetDevice(args->gpus[i])); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0)); TESTCHECK(InitData(args->expected[i], recvcount, 0, type, ncclSum, rep, 1, 0)); - HIPCHECK(hipDeviceSynchronize()); + CUDACHECK(cudaDeviceSynchronize()); } return testSuccess; } @@ -40,7 +40,7 @@ void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { +testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { int rank; NCCLCHECK(ncclCommUserRank(comm, &rank)); #if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2 diff --git a/src/common.cu b/src/common.cu index 0096ecb729..0979e6992d 100644 --- a/src/common.cu +++ b/src/common.cu @@ -6,7 +6,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "hip/hip_runtime.h" +#include "cuda_runtime.h" #include "rccl_bfloat16.h" #include "common.h" #include @@ -14,10 +14,11 @@ #include #include #include +#include "cuda.h" //#define DEBUG_PRINT -#include "../verifiable/verifiable.h" +#include "verifiable.h" int test_ncclVersion = 0; // init'd with ncclGetVersion() @@ -165,18 +166,18 @@ static bool minReqVersion(int rmajor, int rminor, int rpatch) } testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) { - ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, hipStreamDefault); - HIPCHECK(hipDeviceSynchronize()); + ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault); + CUDACHECK(cudaDeviceSynchronize()); return testSuccess; } testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) { - ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, hipStreamDefault); + ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault); return testSuccess; } testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) { - ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, hipStreamDefault); + ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault); return testSuccess; } @@ -271,11 +272,11 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t size_t count = args->expectedBytes/wordSize(type); int64_t *wrongPerGpu = nullptr; - HIPCHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), hipHostMallocMapped)); + CUDACHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), cudaHostAllocMapped)); for (int i=0; inGpus; i++) { int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipSetDevice(args->gpus[i])); + CUDACHECK(cudaSetDevice(args->gpus[i])); void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i]; TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i)); @@ -286,8 +287,8 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t char *expectedHost = (char*)malloc(args->expectedBytes); char *dataHost = (char*)malloc(args->expectedBytes); int eltsz = wordSize(type); - hipMemcpy(expectedHost, args->expected[i], args->expectedBytes, hipMemcpyDeviceToHost); - hipMemcpy(dataHost, data, args->expectedBytes, hipMemcpyDeviceToHost); + cudaMemcpy(expectedHost, args->expected[i], args->expectedBytes, cudaMemcpyDeviceToHost); + cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost); for(int j=0; jexpectedBytes/eltsz; j++) { unsigned long long want, got; @@ -307,14 +308,14 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t *wrongElts = 0; for (int i=0; i < args->nGpus; i++) *wrongElts += wrongPerGpu[i]; - hipHostFree(wrongPerGpu); + cudaFreeHost(wrongPerGpu); if (args->reportErrors && *wrongElts) args->errors[0]++; return testSuccess; } -testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* comms) { - hipError_t hipErr; +testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) { + cudaError_t cudaErr; int remaining = ngpus; int* done = (int*)malloc(sizeof(int)*ngpus); memset(done, 0, sizeof(int)*ngpus); @@ -325,15 +326,15 @@ testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* for (int i=0; i= NCCL_VERSION(2,4,0) if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) { @@ -382,7 +383,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (args->nGpus > 1) NCCLCHECK(ncclGroupStart()); for (int i = 0; i < args->nGpus; i++) { #ifndef NCCL_MAJOR - HIPCHECK(hipSetDevice(args->gpus[i])); + CUDACHECK(cudaSetDevice(args->gpus[i])); #endif int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); char* recvBuff = ((char*)args->recvbuffs[i]) + shift; @@ -463,16 +464,16 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t Barrier(args); #if HIP_VERSION >= 50221310 - hipGraph_t graphs[args->nGpus]; - hipGraphExec_t graphExec[args->nGpus]; + cudaGraph_t graphs[args->nGpus]; + cudaGraphExec_t graphExec[args->nGpus]; if (cudaGraphLaunches >= 1) { // Begin cuda graph capture for (int i=0; inGpus; i++) { // Thread local mdoe is needed for: // - Multi-thread mode: where graph capture and instantiation can happen concurrently across threads // - P2P pre-connect: when there is no warm-up, P2P pre-connect is done during graph capture. - // Since pre-connect calls hipMalloc, we cannot use global capture mode - HIPCHECK(hipStreamBeginCapture(args->streams[i], hipStreamCaptureModeThreadLocal)); + // Since pre-connect calls cudaMalloc, we cannot use global capture mode + CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal)); } } #endif @@ -491,18 +492,18 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (cudaGraphLaunches >= 1) { // End cuda graph capture for (int i=0; inGpus; i++) { - HIPCHECK(hipStreamEndCapture(args->streams[i], graphs+i)); + CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); } // Instantiate cuda graph for (int i=0; inGpus; i++) { - HIPCHECK(hipGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); + CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); } // Resync CPU, restart timing, launch cuda graph Barrier(args); tim.reset(); for (int l=0; lnGpus; i++) { - HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i])); + CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); } } } @@ -520,8 +521,8 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (cudaGraphLaunches >= 1) { //destroy cuda graph for (int i=0; inGpus; i++) { - HIPCHECK(hipGraphExecDestroy(graphExec[i])); - HIPCHECK(hipGraphDestroy(graphs[i])); + CUDACHECK(cudaGraphExecDestroy(graphExec[i])); + CUDACHECK(cudaGraphDestroy(graphs[i])); } } #endif @@ -542,7 +543,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (cudaGraphLaunches >= 1) { // Begin cuda graph capture for data check for (int i=0; inGpus; i++) { - HIPCHECK(hipStreamBeginCapture(args->streams[i], args->nThreads > 1 ? hipStreamCaptureModeThreadLocal : hipStreamCaptureModeGlobal)); + CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal)); } } #endif @@ -554,15 +555,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (cudaGraphLaunches >= 1) { // End cuda graph capture for (int i=0; inGpus; i++) { - HIPCHECK(hipStreamEndCapture(args->streams[i], graphs+i)); + CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i)); } // Instantiate cuda graph for (int i=0; inGpus; i++) { - HIPCHECK(hipGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); + CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0)); } // Launch cuda graph for (int i=0; inGpus; i++) { - HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i])); + CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i])); } } #endif @@ -573,8 +574,8 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t if (cudaGraphLaunches >= 1) { //destroy cuda graph for (int i=0; inGpus; i++) { - HIPCHECK(hipGraphExecDestroy(graphExec[i])); - HIPCHECK(hipGraphDestroy(graphs[i])); + CUDACHECK(cudaGraphExecDestroy(graphExec[i])); + CUDACHECK(cudaGraphDestroy(graphs[i])); } } #endif @@ -664,7 +665,7 @@ testResult_t threadRunTests(struct threadArgs* args) { // Set device to the first of our GPUs. If we don't do that, some operations // will be done on the current GPU (by default : 0) and if the GPUs are in // exclusive mode those operations will fail. - HIPCHECK(hipSetDevice(args->gpus[0])); + CUDACHECK(cudaSetDevice(args->gpus[0])); TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop])); return testSuccess; } @@ -680,7 +681,7 @@ testResult_t threadInit(struct threadArgs* args) { NCCLCHECK(ncclGroupStart()); for (int i=0; inGpus; i++) { int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i; - HIPCHECK(hipSetDevice(args->gpus[i])); + CUDACHECK(cudaSetDevice(args->gpus[i])); NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank)); } NCCLCHECK(ncclGroupEnd()); @@ -705,29 +706,29 @@ testResult_t threadLaunch(struct testThread* thread) { testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) { if (memorytype == ncclFine) { - HIPCHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained)); - HIPCHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained)); - if (datacheck) HIPCHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained)); + CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained)); + CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained)); + if (datacheck) CUDACHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained)); } else if (memorytype == ncclHost) { - HIPCHECK(hipHostMalloc(sendbuff, nbytes)); - HIPCHECK(hipHostMalloc(recvbuff, nbytes)); - if (datacheck) HIPCHECK(hipHostMalloc(expected, recvBytes)); + CUDACHECK(hipHostMalloc(sendbuff, nbytes)); + CUDACHECK(hipHostMalloc(recvbuff, nbytes)); + if (datacheck) CUDACHECK(hipHostMalloc(expected, recvBytes)); } else if (memorytype == ncclManaged) { - HIPCHECK(hipMallocManaged(sendbuff, nbytes)); - HIPCHECK(hipMallocManaged(recvbuff, nbytes)); - if (datacheck) HIPCHECK(hipMallocManaged(expected, recvBytes)); + CUDACHECK(cudaMallocManaged(sendbuff, nbytes)); + CUDACHECK(cudaMallocManaged(recvbuff, nbytes)); + if (datacheck) CUDACHECK(cudaMallocManaged(expected, recvBytes)); #if 0 - HIPCHECK(hipMemset(*sendbuff, 0, nbytes)); - HIPCHECK(hipMemset(*recvbuff, 0, nbytes)); - if (datacheck) HIPCHECK(hipMemset(*expected, 0, recvBytes)); + CUDACHECK(cudaMemset(*sendbuff, 0, nbytes)); + CUDACHECK(cudaMemset(*recvbuff, 0, nbytes)); + if (datacheck) CUDACHECK(cudaMemset(*expected, 0, recvBytes)); #endif } else { - HIPCHECK(hipMalloc(sendbuff, nbytes)); - HIPCHECK(hipMalloc(recvbuff, nbytes)); - if (datacheck) HIPCHECK(hipMalloc(expected, recvBytes)); + CUDACHECK(cudaMalloc(sendbuff, nbytes)); + CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); } return testSuccess; } @@ -940,7 +941,7 @@ int main(int argc, char* argv[]) { } } - HIPCHECK(hipGetDeviceCount(&numDevices)); + CUDACHECK(cudaGetDeviceCount(&numDevices)); #ifndef MPI_SUPPORT if (nGpus > numDevices) { @@ -1016,10 +1017,10 @@ testResult_t run() { for (int i=0; ilen ? MAX_LINE-len : 0, "# Rank %2d Pid %6d on %10s device %2d [%s] %s\n", rank, getpid(), hostname, cudaDev, busIdStr, prop.name); maxMem = std::min(maxMem, prop.totalGlobalMem); @@ -1055,7 +1056,7 @@ testResult_t run() { #endif int gpus[nGpus*nThreads]; - hipStream_t streams[nGpus*nThreads]; + cudaStream_t streams[nGpus*nThreads]; void* sendbuffs[nGpus*nThreads]; void* recvbuffs[nGpus*nThreads]; void* expected[nGpus*nThreads]; @@ -1067,12 +1068,12 @@ testResult_t run() { gpu0 = envstr ? atoi(envstr) : -1; for (int i=0; inProcs*args->nThreads*args->nGpus; for (int i=0; inGpus; i++) { - HIPCHECK(hipSetDevice(args->gpus[i])); + CUDACHECK(cudaSetDevice(args->gpus[i])); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0)); - HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault)); + CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); if (rank == root) { TESTCHECK(InitData(args->expected[i], nranks*sendcount, 0, type, ncclSum, rep, 1, 0)); } - HIPCHECK(hipDeviceSynchronize()); + CUDACHECK(cudaDeviceSynchronize()); } return testSuccess; } @@ -44,7 +44,7 @@ void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* *busBw = baseBw * factor; } -testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { +testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); int rank; diff --git a/src/hypercube.cu b/src/hypercube.cu index 2058de1dd3..9c49cd7984 100644 --- a/src/hypercube.cu +++ b/src/hypercube.cu @@ -5,7 +5,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "hip/hip_runtime.h" +#include "cuda_runtime.h" #include "common.h" #define ALIGN 4 @@ -25,15 +25,15 @@ testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncc int nranks = args->nProcs*args->nThreads*args->nGpus; for (int i=0; inGpus; i++) { - HIPCHECK(hipSetDevice(args->gpus[i])); + CUDACHECK(cudaSetDevice(args->gpus[i])); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0)); for (int j=0; jexpected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0)); } - HIPCHECK(hipDeviceSynchronize()); + CUDACHECK(cudaDeviceSynchronize()); } return testSuccess; } @@ -46,7 +46,7 @@ void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, doubl *busBw = baseBw * factor; } -testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { +testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { char* sbuff = (char*)sendbuff; char* rbuff = (char*)recvbuff; int nRanks; @@ -54,7 +54,7 @@ testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, nccl int rank; NCCLCHECK(ncclCommUserRank(comm, &rank)); size_t rankSize = count * wordSize(type); - if (rbuff+rank*rankSize != sbuff) HIPCHECK(hipMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, hipMemcpyDeviceToDevice, stream)); + if (rbuff+rank*rankSize != sbuff) CUDACHECK(cudaMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, cudaMemcpyDeviceToDevice, stream)); // Hypercube AllGather for (int mask=1; mask INT_MAX) return ncclInvalidArgument; static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, - ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { CHECKCOUNT(count); return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); } static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream) { + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { CHECKCOUNT(count); return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); } static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, hipStream_t stream) { + ncclComm_t comm, cudaStream_t stream) { CHECKCOUNT(count); return ncclBcast(buff, (int)count, datatype, root, comm, stream); } static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, - hipStream_t stream) { + cudaStream_t stream) { CHECKCOUNT(recvcount); return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); } static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) { + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { CHECKCOUNT(sendcount); return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); } diff --git a/src/reduce.cu b/src/reduce.cu index 62850f8212..dd90c25bf4 100644 --- a/src/reduce.cu +++ b/src/reduce.cu @@ -5,7 +5,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include +#include "cuda_runtime.h" #include "common.h" void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { @@ -22,14 +22,14 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRe int nranks = args->nProcs*args->nThreads*args->nGpus; for (int i=0; inGpus; i++) { - HIPCHECK(hipSetDevice(args->gpus[i])); + CUDACHECK(cudaSetDevice(args->gpus[i])); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank)); - HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault)); + CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks)); - HIPCHECK(hipDeviceSynchronize()); + CUDACHECK(cudaDeviceSynchronize()); } return testSuccess; } @@ -40,7 +40,7 @@ void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* *busBw = baseBw; } -testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { +testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream)); return testSuccess; } diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu index a58d2578af..2f6c8c56d6 100644 --- a/src/reduce_scatter.cu +++ b/src/reduce_scatter.cu @@ -5,7 +5,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include +#include "cuda_runtime.h" #include "common.h" #define ALIGN 4 @@ -25,14 +25,14 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, int nranks = args->nProcs*args->nThreads*args->nGpus; for (int i=0; inGpus; i++) { - HIPCHECK(hipSetDevice(args->gpus[i])); + CUDACHECK(cudaSetDevice(args->gpus[i])); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank)); - HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault)); + CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault)); TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks)); - HIPCHECK(hipDeviceSynchronize()); + CUDACHECK(cudaDeviceSynchronize()); } return testSuccess; } @@ -45,7 +45,7 @@ void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, d *busBw = baseBw * factor; } -testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { +testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream)); return testSuccess; } diff --git a/src/scatter.cu b/src/scatter.cu index 7445624b71..993289203c 100644 --- a/src/scatter.cu +++ b/src/scatter.cu @@ -5,7 +5,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include +#include "cuda_runtime.h" #include "common.h" void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { @@ -21,13 +21,13 @@ testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclR size_t recvcount = args->expectedBytes / wordSize(type); for (int i=0; inGpus; i++) { - HIPCHECK(hipSetDevice(args->gpus[i])); + CUDACHECK(cudaSetDevice(args->gpus[i])); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0)); TESTCHECK(InitData(args->expected[i], recvcount, rank*recvcount, type, ncclSum, rep, 1, 0)); - HIPCHECK(hipDeviceSynchronize()); + CUDACHECK(cudaDeviceSynchronize()); } return testSuccess; } @@ -40,7 +40,7 @@ void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* *busBw = baseBw * factor; } -testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { +testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); int rank; diff --git a/src/sendrecv.cu b/src/sendrecv.cu index d5b0300cdf..cda6d699ca 100644 --- a/src/sendrecv.cu +++ b/src/sendrecv.cu @@ -5,7 +5,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include +#include "cuda_runtime.h" #include "common.h" void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) { @@ -22,14 +22,14 @@ testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, nccl int nranks = args->nProcs*args->nThreads*args->nGpus; for (int i=0; inGpus; i++) { - HIPCHECK(hipSetDevice(args->gpus[i])); + CUDACHECK(cudaSetDevice(args->gpus[i])); int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i); - HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes)); + CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes)); void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i]; TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0)); int peer = (rank-1+nranks)%nranks; TESTCHECK(InitData(args->expected[i], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0)); - HIPCHECK(hipDeviceSynchronize()); + CUDACHECK(cudaDeviceSynchronize()); } // We don't support in-place sendrecv args->reportErrors = in_place ? 0 : 1; @@ -44,7 +44,7 @@ void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double *busBw = baseBw * factor; } -testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) { +testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); int rank; diff --git a/verifiable/inexact_regress.cu b/verifiable/inexact_regress.cu index 973b965412..3200ff3918 100644 --- a/verifiable/inexact_regress.cu +++ b/verifiable/inexact_regress.cu @@ -24,11 +24,11 @@ #include #include #include -#include +#include using std::uint64_t; using std::uint32_t; -using bfloat16 = hip_bfloat16; +using bfloat16 = __nv_bfloat16; template struct float_traits; @@ -182,14 +182,14 @@ __global__ void kernel() { int main() { std::printf("type=float:\n"); kernel<<<1,32>>>(); - hipDeviceSynchronize(); + cudaDeviceSynchronize(); std::printf("\ntype=half:\n"); kernel<<<1,32>>>(); - hipDeviceSynchronize(); + cudaDeviceSynchronize(); std::printf("\ntype=bfloat16:\n"); kernel<<<1,32>>>(); - hipDeviceSynchronize(); + cudaDeviceSynchronize(); return 0; } diff --git a/verifiable/verifiable.cu b/verifiable/verifiable.cu index a375809bcf..31fdfe10c8 100644 --- a/verifiable/verifiable.cu +++ b/verifiable/verifiable.cu @@ -8,8 +8,8 @@ //#pragma nv_diag_suppress declared_but_not_referenced #include "verifiable.h" -#include -#include +#include +#include #include #include "rccl/rccl.h" @@ -91,7 +91,7 @@ template<> struct IsIntegral<__half>: std::false_type {}; #if RCCL_BFLOAT16 == 1 template<> -struct IsIntegral: std::false_type {}; +struct IsIntegral<__nv_bfloat16>: std::false_type {}; #endif } @@ -126,7 +126,7 @@ namespace { } #if RCCL_BFLOAT16 == 1 template<> - __host__ __device__ hip_bfloat16 castTo(float x) { + __host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(float x) { return hip_bfloat16(x); } #endif @@ -153,7 +153,7 @@ struct ReduceSum { return __float2half(__half2float(a) + __half2float(b)); } #if RCCL_BFLOAT16 == 1 - __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const { + __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const { return hip_bfloat16(static_cast(a) + static_cast(b)); } #endif @@ -169,7 +169,7 @@ struct ReduceProd { return __float2half(__half2float(a) * __half2float(b)); } #if RCCL_BFLOAT16 == 1 - __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const { + __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const { return hip_bfloat16(static_cast(a) * static_cast(b)); } #endif @@ -185,7 +185,7 @@ struct ReduceMin { return __half2float(a) < __half2float(b) ? a : b; } #if RCCL_BFLOAT16 == 1 - __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const { + __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const { return static_cast(a) < static_cast(b) ? a : b; } #endif @@ -201,7 +201,7 @@ struct ReduceMax { return __half2float(a) > __half2float(b) ? a : b; } #if RCCL_BFLOAT16 == 1 - __host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const { + __host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const { return static_cast(a) > static_cast(b) ? a : b; } #endif @@ -280,7 +280,7 @@ struct FloatLayout<__half> { }; #if RCCL_BFLOAT16 == 1 template<> -struct FloatLayout { +struct FloatLayout<__nv_bfloat16> { static constexpr int exponent_bits = 8, mantissa_bits = 7; static constexpr int exponent_bias = (1<<(exponent_bits-1))-1; }; @@ -801,7 +801,7 @@ __global__ void prepareInput2( template void prepareInput1( void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me, - uint64_t seed, intptr_t elt_ix0, hipStream_t stream + uint64_t seed, intptr_t elt_ix0, cudaStream_t stream ) { int block_n = std::min(32, (elt_n + 4*512-1)/(4*512)); #define CASE_TY(T) prepareInput2<<>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break; @@ -814,7 +814,7 @@ void prepareInput1( case ncclUint64: CASE_TY(uint64_t) case ncclFloat16: CASE_TY(__half) #if HAVE_ncclBfloat16 - case ncclBfloat16: CASE_TY(hip_bfloat16) + case ncclBfloat16: CASE_TY(__nv_bfloat16) #endif case ncclFloat32: CASE_TY(float) case ncclFloat64: CASE_TY(double) @@ -826,7 +826,7 @@ void prepareInput1( void ncclVerifiablePrepareInput( void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me, - uint64_t seed, intptr_t elt_ix0, hipStream_t stream + uint64_t seed, intptr_t elt_ix0, cudaStream_t stream ) { #define CASE_OP(op) \ if(rank_n == 1) \ @@ -877,7 +877,7 @@ __global__ void prepareExpected2( template void prepareExpected1( void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, - uint64_t seed, intptr_t elt_ix0, hipStream_t stream + uint64_t seed, intptr_t elt_ix0, cudaStream_t stream ) { int block_n = std::min(32, (elt_n + 4*512-1)/(4*512)); #define CASE_TY(T) prepareExpected2<<>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break; @@ -890,7 +890,7 @@ void prepareExpected1( case ncclUint64: CASE_TY(uint64_t) case ncclFloat16: CASE_TY(__half) #if HAVE_ncclBfloat16 - case ncclBfloat16: CASE_TY(hip_bfloat16) + case ncclBfloat16: CASE_TY(__nv_bfloat16) #endif case ncclFloat32: CASE_TY(float) case ncclFloat64: CASE_TY(double) @@ -902,7 +902,7 @@ void prepareExpected1( void ncclVerifiablePrepareExpected( void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, - uint64_t seed, intptr_t elt_ix0, hipStream_t stream + uint64_t seed, intptr_t elt_ix0, cudaStream_t stream ) { #define CASE_OP(op) \ if(rank_n == 1) \ @@ -1051,7 +1051,7 @@ __global__ void verifyInline2( template void verifyInline1( T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, - unsigned tolerance, int64_t *bad_elt_n, hipStream_t stream, int block_n + unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n ) { #define CASE_OP(op) \ if(rank_n == 1) \ @@ -1080,7 +1080,7 @@ void verifyInline1( void ncclVerifiableVerify( void const *results, void const *expected, intptr_t elt_n, int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, - int64_t *bad_elt_n, hipStream_t stream + int64_t *bad_elt_n, cudaStream_t stream ) { bool floating = elt_ty == ncclFloat16 || elt_ty == ncclFloat32 || elt_ty == ncclFloat64; #if HAVE_ncclBfloat16 @@ -1112,7 +1112,7 @@ void ncclVerifiableVerify( case ncclUint64: CASE_TY(uint64_t, uint64_t) case ncclFloat16: CASE_TY(__half, uint16_t) #if HAVE_ncclBfloat16 - case ncclBfloat16: CASE_TY(hip_bfloat16, uint16_t) + case ncclBfloat16: CASE_TY(__nv_bfloat16, uint16_t) #endif case ncclFloat32: CASE_TY(float, uint32_t) case ncclFloat64: CASE_TY(double, uint64_t) @@ -1179,7 +1179,7 @@ __global__ void sweep() { sweep1(ncclUint64, "uint64"); sweep1<__half>(ncclFloat16, "half"); #if HAVE_ncclBfloat16 - sweep1(ncclBfloat16, "bfloat16"); + sweep1<__nv_bfloat16>(ncclBfloat16, "bfloat16"); #endif sweep1(ncclFloat32, "float"); sweep1(ncclFloat64, "double"); @@ -1187,9 +1187,9 @@ __global__ void sweep() { int main(int arg_n, char **args) { std::cerr<<"You are hoping to see no output beyond this line."<>>(); - hipDeviceSynchronize(); + cudaDeviceSynchronize(); return 0; } #endif diff --git a/verifiable/verifiable.h b/verifiable/verifiable.h index b41ef1ad12..da54778a6f 100644 --- a/verifiable/verifiable.h +++ b/verifiable/verifiable.h @@ -8,7 +8,7 @@ #ifndef _d41d8cd98f00b204e9800998ecf8427e #define _d41d8cd98f00b204e9800998ecf8427e -#include +#include #include @@ -43,13 +43,13 @@ __host__ __device__ T ncclVerifiablePremulScalar(int rank_me) { // Enqueue kernel to generate data which is to be reduced. void ncclVerifiablePrepareInput( void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me, - uint64_t seed, intptr_t elt_ix0, hipStream_t stream + uint64_t seed, intptr_t elt_ix0, cudaStream_t stream ); // Enqueue kernel to generate expected results of reduction. void ncclVerifiablePrepareExpected( void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, - uint64_t seed, intptr_t elt_ix0, hipStream_t stream + uint64_t seed, intptr_t elt_ix0, cudaStream_t stream ); // Enqueue kernel to verify reduced data matches expectation. The number of @@ -61,6 +61,6 @@ void ncclVerifiablePrepareExpected( void ncclVerifiableVerify( void const *results, void const *expected, intptr_t elt_n, int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, - int64_t *bad_elt_n, hipStream_t stream + int64_t *bad_elt_n, cudaStream_t stream ); #endif diff --git a/verifiable/verifiable.mk b/verifiable/verifiable.mk index fba1fbf35c..c526ffb720 100644 --- a/verifiable/verifiable.mk +++ b/verifiable/verifiable.mk @@ -1,5 +1,5 @@ # Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. -# Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved. +# Modifications Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. # # See LICENSE.txt for license information @@ -8,11 +8,21 @@ # TEST_VERIFIABLE_SRCDIR = # TEST_VERIFIABLE_BUILDDIR = -TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h -TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o +TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h +TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o -$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFY_REDUCE_HDRS) +${HIPIFY_DIR}/verifiable.cu.cpp: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu + @printf "Hipifying %-35s > %s\n" $< $@ + @mkdir -p ${HIPIFY_DIR} + hipify-perl -quiet-warnings $< > $@ + +${HIPIFY_DIR}/verifiable.h: $(TEST_VERIFIABLE_SRCDIR)/verifiable.h + @printf "Hipifying %-35s > %s\n" $< $@ + @mkdir -p ${HIPIFY_DIR} + hipify-perl -quiet-warnings $< > $@ + +$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(HIPIFY_DIR)/verifiable.cu.cpp $(HIPIFY_DIR)/verifiable.h @printf "Compiling %s\n" $@ @mkdir -p $(TEST_VERIFIABLE_BUILDDIR) - echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu" - $(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu + echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $<" + $(HIPCC) -o $@ $(HIPCUFLAGS) -c $<