Add hipify steps prior to build (#62)

* Add hipify steps prior to build
Этот коммит содержится в:
Bertan Dogancay
2024-03-05 09:47:18 -07:00
коммит произвёл GitHub
родитель b9b73002da
Коммит 88cf7dbf45
20 изменённых файлов: 295 добавлений и 204 удалений
+95 -27
Просмотреть файл
@@ -1,37 +1,33 @@
# ########################################################################
# Copyright 2022 Advanced Micro Devices, Inc.
# Copyright 2022-2024 Advanced Micro Devices, Inc.
# ########################################################################
# Compile common object library
set_property(SOURCE common.cu timer.cc ../verifiable/verifiable.cu PROPERTY LANGUAGE CXX)
add_library(rccl_common OBJECT common.cu timer.cc ../verifiable/verifiable.cu)
target_link_libraries(rccl_common roc::rccl hip::device)
if(USE_MPI)
target_link_libraries(rccl_common MPI::MPI_CXX)
endif()
function(add_relative_test test_name test_target)
get_target_property(EXE_PATH ${test_target} RUNTIME_OUTPUT_DIRECTORY)
if(EXE_PATH STREQUAL "EXE_PATH-NOTFOUND")
set(EXE_PATH ".")
endif()
get_filename_component(EXE_PATH "${EXE_PATH}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
get_target_property(EXE_NAME ${test_target} RUNTIME_OUTPUT_NAME)
if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
get_target_property(EXE_NAME ${test_target} OUTPUT_NAME)
if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
set(EXE_NAME "${test_target}")
endif()
endif()
file(RELATIVE_PATH rel_path "${CMAKE_CURRENT_BINARY_DIR}" "${EXE_PATH}/${EXE_NAME}")
add_test(NAME "${test_name}" COMMAND "./${rel_path}")
endfunction()
function(add_rccl_test TEST)
set(TEST_SOURCE "${TEST}.cu")
set_property(SOURCE ${TEST_SOURCE} PROPERTY LANGUAGE CXX)
# Check that file exists
if (NOT EXISTS ${SOURCE_DIR}/${TEST_SOURCE})
message(FATAL_ERROR "Unable to find file listed in CMakeLists.txt: ${SOURCE_DIR}/${TEST_SOURCE}")
endif()
# Establish hipified copy of the source file
set(HIP_FILE "${HIPIFY_DIR}/${TEST_SOURCE}")
get_filename_component(HIP_FILE_DIR ${HIP_FILE} DIRECTORY)
# Convert .cu files to .cpp so that they get processed properly
string(REPLACE "\.cu" "\.cu.cpp" HIP_FILE ${HIP_FILE})
# Create a custom command to create hipified source code
add_custom_command(
OUTPUT ${HIP_FILE}
COMMAND mkdir -p ${HIP_FILE_DIR} && $ ${hipify-perl_executable} -quiet-warnings ${SOURCE_DIR}/${TEST_SOURCE} -o ${HIP_FILE}
MAIN_DEPENDENCY ${TEST_SOURCE}
COMMENT "Hipifying ${TEST_SOURCE} -> ${HIP_FILE}"
)
set(TEST_TARGET "${TEST}_perf")
add_executable(${TEST_TARGET} ${TEST_SOURCE})
add_executable(${TEST_TARGET} ${HIP_FILE})
target_link_libraries(
${TEST_TARGET}
PRIVATE
@@ -52,6 +48,78 @@ function(add_rccl_test TEST)
)
endfunction()
function(add_relative_test test_name test_target)
get_target_property(EXE_PATH ${test_target} RUNTIME_OUTPUT_DIRECTORY)
if(EXE_PATH STREQUAL "EXE_PATH-NOTFOUND")
set(EXE_PATH ".")
endif()
get_filename_component(EXE_PATH "${EXE_PATH}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
get_target_property(EXE_NAME ${test_target} RUNTIME_OUTPUT_NAME)
if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
get_target_property(EXE_NAME ${test_target} OUTPUT_NAME)
if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
set(EXE_NAME "${test_target}")
endif()
endif()
file(RELATIVE_PATH rel_path "${CMAKE_CURRENT_BINARY_DIR}" "${EXE_PATH}/${EXE_NAME}")
add_test(NAME "${test_name}" COMMAND "./${rel_path}")
endfunction()
# Collect list of common source files
#==================================================================================================
set(COMMON_FILES
common.h
common.cu
nccl1_compat.h
rccl_bfloat16.h
timer.h
timer.cc
../verifiable/verifiable.h
../verifiable/verifiable.cu
)
# Hipify common files (copy of source generated into hipify directory)
#==================================================================================================
find_program(hipify-perl_executable hipify-perl)
set(HIPIFY_DIR "${CMAKE_CURRENT_BINARY_DIR}/hipify")
set(SOURCE_DIR "${CMAKE_SOURCE_DIR}/src")
## Loop over each common file to hipify
foreach(COMMON_FILE ${COMMON_FILES})
# Check that file exists
if (NOT EXISTS ${SOURCE_DIR}/${COMMON_FILE})
message(FATAL_ERROR "Unable to find file listed in CMakeLists.txt: ${SOURCE_DIR}/${COMMON_FILE}")
endif()
# Establish hipified copy of the common file
get_filename_component(HIP_FILE_NAME ${HIPIFY_DIR}/${COMMON_FILE} NAME)
set(HIP_FILE "${HIPIFY_DIR}/${HIP_FILE_NAME}")
# Convert .cu files to .cpp so that they get processed properly
string(REPLACE "\.cu" "\.cu.cpp" HIP_FILE ${HIP_FILE})
list(APPEND HIP_COMMON_SOURCES ${HIP_FILE})
# Create a custom command to create hipified source code
add_custom_command(
OUTPUT ${HIP_FILE}
COMMAND mkdir -p ${HIPIFY_DIR} && $ ${hipify-perl_executable} -quiet-warnings ${SOURCE_DIR}/${COMMON_FILE} -o ${HIP_FILE}
MAIN_DEPENDENCY ${COMMON_FILE}
COMMENT "Hipifying ${COMMON_FILE} -> ${HIP_FILE}"
)
endforeach()
# Compile common object library
#==================================================================================================
add_custom_target(hipify DEPENDS ${HIP_COMMON_SOURCES})
add_library(rccl_common OBJECT ${HIP_COMMON_SOURCES})
add_dependencies(rccl_common hipify)
target_link_libraries(rccl_common roc::rccl hip::device)
if(USE_MPI)
target_link_libraries(rccl_common MPI::MPI_CXX)
endif()
# Compile tests
#==================================================================================================
add_rccl_test(all_gather)
add_rccl_test(all_reduce)
add_rccl_test(alltoall)
+17 -4
Просмотреть файл
@@ -1,6 +1,6 @@
#
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
# Modifications are Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -41,6 +41,9 @@ endif
.PHONY: build clean
BUILDDIR ?= ../build
HIPIFY_DIR ?= $(BUILDDIR)/hipify
.PRECIOUS: $(HIPIFY_DIR)/%.cu.cpp $(HIPIFY_DIR)/%.h
ifeq ($(MPI), 1)
HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
@@ -68,11 +71,21 @@ TEST_VERIFIABLE_SRCDIR := ../verifiable
TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable
include ../verifiable/verifiable.mk
${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
${HIPIFY_DIR}/%.cu.cpp: %.cu
@printf "Hipifying %-35s > %s\n" $< $@
@mkdir -p ${HIPIFY_DIR}
hipify-perl -quiet-warnings $< > $@
${HIPIFY_DIR}/%.h: %.h
@printf "Hipifying %-35s > %s\n" $< $@
@mkdir -p ${HIPIFY_DIR}
hipify-perl -quiet-warnings $< > $@
${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cu.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS)
@printf "Compiling %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<
echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -I. -c $<"
$(HIPCC) -o $@ $(HIPCUFLAGS) -I. -c $<
${DST_DIR}/timer.o: timer.cc timer.h
@printf "Compiling %-35s > %s\n" $< $@
+5 -5
Просмотреть файл
@@ -5,7 +5,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "cuda_runtime.h"
#include "common.h"
#define ALIGN 4
@@ -25,15 +25,15 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
int nranks = args->nProcs*args->nThreads*args->nGpus;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
CUDACHECK(cudaSetDevice(args->gpus[i]));
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
for (int j=0; j<nranks; j++) {
TESTCHECK(InitData((char*)args->expected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
}
HIPCHECK(hipDeviceSynchronize());
CUDACHECK(cudaDeviceSynchronize());
}
return testSuccess;
}
@@ -46,7 +46,7 @@ void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, doubl
*busBw = baseBw * factor;
}
testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
return testSuccess;
}
+5 -5
Просмотреть файл
@@ -5,7 +5,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "cuda_runtime.h"
#include "common.h"
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -22,13 +22,13 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc
int nranks = args->nProcs*args->nThreads*args->nGpus;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
CUDACHECK(cudaSetDevice(args->gpus[i]));
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
HIPCHECK(hipDeviceSynchronize());
CUDACHECK(cudaDeviceSynchronize());
}
return testSuccess;
}
@@ -41,7 +41,7 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl
*busBw = baseBw * factor;
}
testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
return testSuccess;
}
+5 -5
Просмотреть файл
@@ -5,7 +5,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "cuda_runtime.h"
#include "common.h"
void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -22,16 +22,16 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, nccl
int nranks = args->nProcs*args->nThreads*args->nGpus;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
CUDACHECK(cudaSetDevice(args->gpus[i]));
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
for (int j=0; j<nranks; j++) {
size_t partcount = sendcount/nranks;
TESTCHECK(InitData((char*)args->expected[i] + j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0));
}
HIPCHECK(hipDeviceSynchronize());
CUDACHECK(cudaDeviceSynchronize());
}
// We don't support in-place alltoall
args->reportErrors = in_place ? 0 : 1;
@@ -46,7 +46,7 @@ void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double
*busBw = baseBw * factor;
}
testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream));
return testSuccess;
}
+6 -6
Просмотреть файл
@@ -5,7 +5,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "cuda_runtime.h"
#include "common.h"
#define USE_RCCL_GATHER_SCATTER
@@ -32,15 +32,15 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
int nranks = args->nProcs*args->nThreads*args->nGpus;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
CUDACHECK(cudaSetDevice(args->gpus[i]));
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0));
#if 0
int *dataHost = (int *)malloc(args->sendBytes);
hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost);
cudaMemcpy(dataHost, data, args->sendBytes, cudaMemcpyDeviceToHost);
printf(" Rank [%d] Original: ", rank);
for(int j=0; j<sendcount; j++) {
printf("%d:%d ", j, dataHost[j]);
@@ -68,7 +68,7 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
TESTCHECK(InitData(((char*)args->expected[i])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0));
rdisp += rcount;
}
HIPCHECK(hipDeviceSynchronize());
CUDACHECK(cudaDeviceSynchronize());
}
// We don't support in-place alltoall
args->reportErrors = in_place ? 0 : 1;
@@ -83,7 +83,7 @@ void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, doubl
*busBw = baseBw * factor;
}
testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
int nranks;
NCCLCHECK(ncclCommCount(comm, &nranks));
int rank;
+5 -5
Просмотреть файл
@@ -5,7 +5,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "cuda_runtime.h"
#include "common.h"
void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -21,13 +21,13 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc
size_t recvcount = args->expectedBytes / wordSize(type);
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
CUDACHECK(cudaSetDevice(args->gpus[i]));
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
TESTCHECK(InitData(args->expected[i], recvcount, 0, type, ncclSum, rep, 1, 0));
HIPCHECK(hipDeviceSynchronize());
CUDACHECK(cudaDeviceSynchronize());
}
return testSuccess;
}
@@ -40,7 +40,7 @@ void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, doubl
*busBw = baseBw * factor;
}
testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
+65 -64
Просмотреть файл
@@ -6,7 +6,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include "hip/hip_runtime.h"
#include "cuda_runtime.h"
#include "rccl_bfloat16.h"
#include "common.h"
#include <pthread.h>
@@ -14,10 +14,11 @@
#include <type_traits>
#include <getopt.h>
#include <libgen.h>
#include "cuda.h"
//#define DEBUG_PRINT
#include "../verifiable/verifiable.h"
#include "verifiable.h"
int test_ncclVersion = 0; // init'd with ncclGetVersion()
@@ -165,18 +166,18 @@ static bool minReqVersion(int rmajor, int rminor, int rpatch)
}
testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) {
ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, hipStreamDefault);
HIPCHECK(hipDeviceSynchronize());
ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault);
CUDACHECK(cudaDeviceSynchronize());
return testSuccess;
}
testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) {
ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, hipStreamDefault);
ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault);
return testSuccess;
}
testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) {
ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, hipStreamDefault);
ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault);
return testSuccess;
}
@@ -271,11 +272,11 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
size_t count = args->expectedBytes/wordSize(type);
int64_t *wrongPerGpu = nullptr;
HIPCHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), hipHostMallocMapped));
CUDACHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), cudaHostAllocMapped));
for (int i=0; i<args->nGpus; i++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
HIPCHECK(hipSetDevice(args->gpus[i]));
CUDACHECK(cudaSetDevice(args->gpus[i]));
void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i));
@@ -286,8 +287,8 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
char *expectedHost = (char*)malloc(args->expectedBytes);
char *dataHost = (char*)malloc(args->expectedBytes);
int eltsz = wordSize(type);
hipMemcpy(expectedHost, args->expected[i], args->expectedBytes, hipMemcpyDeviceToHost);
hipMemcpy(dataHost, data, args->expectedBytes, hipMemcpyDeviceToHost);
cudaMemcpy(expectedHost, args->expected[i], args->expectedBytes, cudaMemcpyDeviceToHost);
cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost);
for(int j=0; j<args->expectedBytes/eltsz; j++) {
unsigned long long want, got;
@@ -307,14 +308,14 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
*wrongElts = 0;
for (int i=0; i < args->nGpus; i++) *wrongElts += wrongPerGpu[i];
hipHostFree(wrongPerGpu);
cudaFreeHost(wrongPerGpu);
if (args->reportErrors && *wrongElts) args->errors[0]++;
return testSuccess;
}
testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* comms) {
hipError_t hipErr;
testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) {
cudaError_t cudaErr;
int remaining = ngpus;
int* done = (int*)malloc(sizeof(int)*ngpus);
memset(done, 0, sizeof(int)*ngpus);
@@ -325,15 +326,15 @@ testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t*
for (int i=0; i<ngpus; i++) {
if (done[i]) continue;
hipErr = hipStreamQuery(streams[i]);
if (hipErr == hipSuccess) {
cudaErr = cudaStreamQuery(streams[i]);
if (cudaErr == cudaSuccess) {
done[i] = 1;
remaining--;
idle = 0;
continue;
}
if (hipErr != hipErrorNotReady) HIPCHECK(hipErr);
if (cudaErr != cudaErrorNotReady) CUDACHECK(cudaErr);
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) {
@@ -382,7 +383,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
for (int i = 0; i < args->nGpus; i++) {
#ifndef NCCL_MAJOR
HIPCHECK(hipSetDevice(args->gpus[i]));
CUDACHECK(cudaSetDevice(args->gpus[i]));
#endif
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
@@ -463,16 +464,16 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
Barrier(args);
#if HIP_VERSION >= 50221310
hipGraph_t graphs[args->nGpus];
hipGraphExec_t graphExec[args->nGpus];
cudaGraph_t graphs[args->nGpus];
cudaGraphExec_t graphExec[args->nGpus];
if (cudaGraphLaunches >= 1) {
// Begin cuda graph capture
for (int i=0; i<args->nGpus; i++) {
// Thread local mdoe is needed for:
// - Multi-thread mode: where graph capture and instantiation can happen concurrently across threads
// - P2P pre-connect: when there is no warm-up, P2P pre-connect is done during graph capture.
// Since pre-connect calls hipMalloc, we cannot use global capture mode
HIPCHECK(hipStreamBeginCapture(args->streams[i], hipStreamCaptureModeThreadLocal));
// Since pre-connect calls cudaMalloc, we cannot use global capture mode
CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal));
}
}
#endif
@@ -491,18 +492,18 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
if (cudaGraphLaunches >= 1) {
// End cuda graph capture
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipStreamEndCapture(args->streams[i], graphs+i));
CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
}
// Instantiate cuda graph
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
}
// Resync CPU, restart timing, launch cuda graph
Barrier(args);
tim.reset();
for (int l=0; l<cudaGraphLaunches; l++) {
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i]));
CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
}
}
}
@@ -520,8 +521,8 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
if (cudaGraphLaunches >= 1) {
//destroy cuda graph
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipGraphExecDestroy(graphExec[i]));
HIPCHECK(hipGraphDestroy(graphs[i]));
CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
CUDACHECK(cudaGraphDestroy(graphs[i]));
}
}
#endif
@@ -542,7 +543,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
if (cudaGraphLaunches >= 1) {
// Begin cuda graph capture for data check
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipStreamBeginCapture(args->streams[i], args->nThreads > 1 ? hipStreamCaptureModeThreadLocal : hipStreamCaptureModeGlobal));
CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal));
}
}
#endif
@@ -554,15 +555,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
if (cudaGraphLaunches >= 1) {
// End cuda graph capture
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipStreamEndCapture(args->streams[i], graphs+i));
CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
}
// Instantiate cuda graph
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
}
// Launch cuda graph
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i]));
CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
}
}
#endif
@@ -573,8 +574,8 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
if (cudaGraphLaunches >= 1) {
//destroy cuda graph
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipGraphExecDestroy(graphExec[i]));
HIPCHECK(hipGraphDestroy(graphs[i]));
CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
CUDACHECK(cudaGraphDestroy(graphs[i]));
}
}
#endif
@@ -664,7 +665,7 @@ testResult_t threadRunTests(struct threadArgs* args) {
// Set device to the first of our GPUs. If we don't do that, some operations
// will be done on the current GPU (by default : 0) and if the GPUs are in
// exclusive mode those operations will fail.
HIPCHECK(hipSetDevice(args->gpus[0]));
CUDACHECK(cudaSetDevice(args->gpus[0]));
TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
return testSuccess;
}
@@ -680,7 +681,7 @@ testResult_t threadInit(struct threadArgs* args) {
NCCLCHECK(ncclGroupStart());
for (int i=0; i<args->nGpus; i++) {
int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
HIPCHECK(hipSetDevice(args->gpus[i]));
CUDACHECK(cudaSetDevice(args->gpus[i]));
NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
}
NCCLCHECK(ncclGroupEnd());
@@ -705,29 +706,29 @@ testResult_t threadLaunch(struct testThread* thread) {
testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
if (memorytype == ncclFine) {
HIPCHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
HIPCHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
if (datacheck) HIPCHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained));
CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
if (datacheck) CUDACHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained));
}
else if (memorytype == ncclHost) {
HIPCHECK(hipHostMalloc(sendbuff, nbytes));
HIPCHECK(hipHostMalloc(recvbuff, nbytes));
if (datacheck) HIPCHECK(hipHostMalloc(expected, recvBytes));
CUDACHECK(hipHostMalloc(sendbuff, nbytes));
CUDACHECK(hipHostMalloc(recvbuff, nbytes));
if (datacheck) CUDACHECK(hipHostMalloc(expected, recvBytes));
}
else if (memorytype == ncclManaged) {
HIPCHECK(hipMallocManaged(sendbuff, nbytes));
HIPCHECK(hipMallocManaged(recvbuff, nbytes));
if (datacheck) HIPCHECK(hipMallocManaged(expected, recvBytes));
CUDACHECK(cudaMallocManaged(sendbuff, nbytes));
CUDACHECK(cudaMallocManaged(recvbuff, nbytes));
if (datacheck) CUDACHECK(cudaMallocManaged(expected, recvBytes));
#if 0
HIPCHECK(hipMemset(*sendbuff, 0, nbytes));
HIPCHECK(hipMemset(*recvbuff, 0, nbytes));
if (datacheck) HIPCHECK(hipMemset(*expected, 0, recvBytes));
CUDACHECK(cudaMemset(*sendbuff, 0, nbytes));
CUDACHECK(cudaMemset(*recvbuff, 0, nbytes));
if (datacheck) CUDACHECK(cudaMemset(*expected, 0, recvBytes));
#endif
}
else {
HIPCHECK(hipMalloc(sendbuff, nbytes));
HIPCHECK(hipMalloc(recvbuff, nbytes));
if (datacheck) HIPCHECK(hipMalloc(expected, recvBytes));
CUDACHECK(cudaMalloc(sendbuff, nbytes));
CUDACHECK(cudaMalloc(recvbuff, nbytes));
if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
}
return testSuccess;
}
@@ -940,7 +941,7 @@ int main(int argc, char* argv[]) {
}
}
HIPCHECK(hipGetDeviceCount(&numDevices));
CUDACHECK(cudaGetDeviceCount(&numDevices));
#ifndef MPI_SUPPORT
if (nGpus > numDevices)
{
@@ -1016,10 +1017,10 @@ testResult_t run() {
for (int i=0; i<nThreads*nGpus; i++) {
int cudaDev = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
int rank = proc*nThreads*nGpus+i;
hipDeviceProp_t prop;
HIPCHECK(hipGetDeviceProperties(&prop, cudaDev));
cudaDeviceProp prop;
CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
char busIdStr[] = "00000000:00:00.0";
HIPCHECK(hipDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev));
CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev));
len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "# Rank %2d Pid %6d on %10s device %2d [%s] %s\n",
rank, getpid(), hostname, cudaDev, busIdStr, prop.name);
maxMem = std::min(maxMem, prop.totalGlobalMem);
@@ -1055,7 +1056,7 @@ testResult_t run() {
#endif
int gpus[nGpus*nThreads];
hipStream_t streams[nGpus*nThreads];
cudaStream_t streams[nGpus*nThreads];
void* sendbuffs[nGpus*nThreads];
void* recvbuffs[nGpus*nThreads];
void* expected[nGpus*nThreads];
@@ -1067,12 +1068,12 @@ testResult_t run() {
gpu0 = envstr ? atoi(envstr) : -1;
for (int i=0; i<nGpus*nThreads; i++) {
gpus[i] = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
HIPCHECK(hipSetDevice(gpus[i]));
CUDACHECK(cudaSetDevice(gpus[i]));
TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
if (streamnull)
streams[i] = NULL;
else
HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
CUDACHECK(cudaStreamCreateWithFlags(streams+i, cudaStreamNonBlocking));
}
//if parallel init is not selected, use main thread to initialize NCCL
@@ -1083,7 +1084,7 @@ testResult_t run() {
} else {
NCCLCHECK(ncclGroupStart());
for (int i=0; i<nGpus*nThreads; i++) {
HIPCHECK(hipSetDevice(gpus[i]));
CUDACHECK(cudaSetDevice(gpus[i]));
NCCLCHECK(ncclCommInitRank(comms+i, ncclProcs*nThreads*nGpus, ncclId, ncclProc*nThreads*nGpus+i));
}
NCCLCHECK(ncclGroupEnd());
@@ -1093,7 +1094,7 @@ testResult_t run() {
int errors[nThreads];
double bw[nThreads];
double* delta;
HIPCHECK(hipHostMalloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, hipHostMallocPortable | hipHostMallocMapped));
CUDACHECK(hipHostMalloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, cudaHostAllocPortable | cudaHostAllocMapped));
int bw_count[nThreads];
for (int t=0; t<nThreads; t++) {
bw[t] = 0.0;
@@ -1178,11 +1179,11 @@ testResult_t run() {
// Free off CUDA allocated memory
for (int i=0; i<nGpus*nThreads; i++) {
if (sendbuffs[i]) HIPCHECK(hipFree((char*)sendbuffs[i]));
if (recvbuffs[i]) HIPCHECK(hipFree((char*)recvbuffs[i]));
if (datacheck) HIPCHECK(hipFree(expected[i]));
if (sendbuffs[i]) CUDACHECK(cudaFree((char*)sendbuffs[i]));
if (recvbuffs[i]) CUDACHECK(cudaFree((char*)recvbuffs[i]));
if (datacheck) CUDACHECK(cudaFree(expected[i]));
}
HIPCHECK(hipHostFree(delta));
CUDACHECK(cudaFreeHost(delta));
envstr = getenv("NCCL_TESTS_MIN_BW");
double check_avg_bw = envstr ? atof(envstr) : -1;
@@ -1197,9 +1198,9 @@ testResult_t run() {
MPI_Finalize();
#endif
// 'hip-memcheck --leak-check full' requires this
// 'cuda-memcheck --leak-check full' requires this
PRINT("%s\n", ncclGetLastError(NULL));
hipDeviceReset();
cudaDeviceReset();
if (errors[0] || bw[0] < check_avg_bw*(0.9))
exit(EXIT_FAILURE);
+7 -7
Просмотреть файл
@@ -21,14 +21,14 @@
// For nccl.h < 2.13 since we define a weak fallback
extern "C" char const* ncclGetLastError(ncclComm_t comm);
#define HIPCHECK(cmd) do { \
hipError_t e = cmd; \
if( e != hipSuccess ) { \
#define CUDACHECK(cmd) do { \
cudaError_t err = cmd; \
if( err != cudaSuccess ) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
printf("%s: Test HIP failure %s:%d '%s'\n", \
printf("%s: Test CUDA failure %s:%d '%s'\n", \
hostname, \
__FILE__,__LINE__,hipGetErrorString(e)); \
__FILE__,__LINE__,cudaGetErrorString(err)); \
return testCudaError; \
} \
} while(0)
@@ -93,7 +93,7 @@ struct testColl {
ncclRedOp_t op, int root, int rep, int in_place);
void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
};
extern struct testColl allReduceTest;
extern struct testColl allGatherTest;
@@ -133,7 +133,7 @@ struct threadArgs {
size_t recvInplaceOffset;
ncclUniqueId ncclId;
ncclComm_t* comms;
hipStream_t* streams;
cudaStream_t* streams;
void** expected;
size_t expectedBytes;
+6 -6
Просмотреть файл
@@ -5,7 +5,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include "hip/hip_runtime.h"
#include "cuda_runtime.h"
#include "common.h"
void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -22,16 +22,16 @@ testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
int nranks = args->nProcs*args->nThreads*args->nGpus;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
CUDACHECK(cudaSetDevice(args->gpus[i]));
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault));
CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
if (rank == root) {
TESTCHECK(InitData(args->expected[i], nranks*sendcount, 0, type, ncclSum, rep, 1, 0));
}
HIPCHECK(hipDeviceSynchronize());
CUDACHECK(cudaDeviceSynchronize());
}
return testSuccess;
}
@@ -44,7 +44,7 @@ void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double*
*busBw = baseBw * factor;
}
testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
int rank;
+6 -6
Просмотреть файл
@@ -5,7 +5,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include "hip/hip_runtime.h"
#include "cuda_runtime.h"
#include "common.h"
#define ALIGN 4
@@ -25,15 +25,15 @@ testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncc
int nranks = args->nProcs*args->nThreads*args->nGpus;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
CUDACHECK(cudaSetDevice(args->gpus[i]));
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
for (int j=0; j<nranks; j++) {
TESTCHECK(InitData((char*)args->expected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
}
HIPCHECK(hipDeviceSynchronize());
CUDACHECK(cudaDeviceSynchronize());
}
return testSuccess;
}
@@ -46,7 +46,7 @@ void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, doubl
*busBw = baseBw * factor;
}
testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
char* sbuff = (char*)sendbuff;
char* rbuff = (char*)recvbuff;
int nRanks;
@@ -54,7 +54,7 @@ testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
size_t rankSize = count * wordSize(type);
if (rbuff+rank*rankSize != sbuff) HIPCHECK(hipMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, hipMemcpyDeviceToDevice, stream));
if (rbuff+rank*rankSize != sbuff) CUDACHECK(cudaMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, cudaMemcpyDeviceToDevice, stream));
// Hypercube AllGather
for (int mask=1; mask<nRanks; mask<<=1) {
+5 -6
Просмотреть файл
@@ -1,6 +1,5 @@
/*************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -21,28 +20,28 @@ static ncclResult_t ncclGroupEnd() { return ncclSuccess; }
#define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument;
static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
CHECKCOUNT(count);
return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream);
}
static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream) {
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
CHECKCOUNT(count);
return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream);
}
static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream) {
ncclComm_t comm, cudaStream_t stream) {
CHECKCOUNT(count);
return ncclBcast(buff, (int)count, datatype, root, comm, stream);
}
static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
hipStream_t stream) {
cudaStream_t stream) {
CHECKCOUNT(recvcount);
return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream);
}
static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
CHECKCOUNT(sendcount);
return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream);
}
+6 -6
Просмотреть файл
@@ -5,7 +5,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "cuda_runtime.h"
#include "common.h"
void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -22,14 +22,14 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
int nranks = args->nProcs*args->nThreads*args->nGpus;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
CUDACHECK(cudaSetDevice(args->gpus[i]));
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault));
CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
HIPCHECK(hipDeviceSynchronize());
CUDACHECK(cudaDeviceSynchronize());
}
return testSuccess;
}
@@ -40,7 +40,7 @@ void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double*
*busBw = baseBw;
}
testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));
return testSuccess;
}
+6 -6
Просмотреть файл
@@ -5,7 +5,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "cuda_runtime.h"
#include "common.h"
#define ALIGN 4
@@ -25,14 +25,14 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type,
int nranks = args->nProcs*args->nThreads*args->nGpus;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
CUDACHECK(cudaSetDevice(args->gpus[i]));
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault));
CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks));
HIPCHECK(hipDeviceSynchronize());
CUDACHECK(cudaDeviceSynchronize());
}
return testSuccess;
}
@@ -45,7 +45,7 @@ void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, d
*busBw = baseBw * factor;
}
testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
return testSuccess;
}
+5 -5
Просмотреть файл
@@ -5,7 +5,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "cuda_runtime.h"
#include "common.h"
void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -21,13 +21,13 @@ testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclR
size_t recvcount = args->expectedBytes / wordSize(type);
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
CUDACHECK(cudaSetDevice(args->gpus[i]));
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
TESTCHECK(InitData(args->expected[i], recvcount, rank*recvcount, type, ncclSum, rep, 1, 0));
HIPCHECK(hipDeviceSynchronize());
CUDACHECK(cudaDeviceSynchronize());
}
return testSuccess;
}
@@ -40,7 +40,7 @@ void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double*
*busBw = baseBw * factor;
}
testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
int rank;
+5 -5
Просмотреть файл
@@ -5,7 +5,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "cuda_runtime.h"
#include "common.h"
void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
@@ -22,14 +22,14 @@ testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, nccl
int nranks = args->nProcs*args->nThreads*args->nGpus;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
CUDACHECK(cudaSetDevice(args->gpus[i]));
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
int peer = (rank-1+nranks)%nranks;
TESTCHECK(InitData(args->expected[i], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0));
HIPCHECK(hipDeviceSynchronize());
CUDACHECK(cudaDeviceSynchronize());
}
// We don't support in-place sendrecv
args->reportErrors = in_place ? 0 : 1;
@@ -44,7 +44,7 @@ void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double
*busBw = baseBw * factor;
}
testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
int rank;
+5 -5
Просмотреть файл
@@ -24,11 +24,11 @@
#include <cstdio>
#include <cstdint>
#include <hip/hip_bfloat16.h>
#include <hip/hip_fp16.h>
#include <cuda_fp16.h>
using std::uint64_t;
using std::uint32_t;
using bfloat16 = hip_bfloat16;
using bfloat16 = __nv_bfloat16;
template<typename T>
struct float_traits;
@@ -182,14 +182,14 @@ __global__ void kernel() {
int main() {
std::printf("type=float:\n");
kernel<float><<<1,32>>>();
hipDeviceSynchronize();
cudaDeviceSynchronize();
std::printf("\ntype=half:\n");
kernel<half><<<1,32>>>();
hipDeviceSynchronize();
cudaDeviceSynchronize();
std::printf("\ntype=bfloat16:\n");
kernel<bfloat16><<<1,32>>>();
hipDeviceSynchronize();
cudaDeviceSynchronize();
return 0;
}
+21 -21
Просмотреть файл
@@ -8,8 +8,8 @@
//#pragma nv_diag_suppress declared_but_not_referenced
#include "verifiable.h"
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <hip/hip_bfloat16.h>
#include "rccl/rccl.h"
@@ -91,7 +91,7 @@ template<>
struct IsIntegral<__half>: std::false_type {};
#if RCCL_BFLOAT16 == 1
template<>
struct IsIntegral<hip_bfloat16>: std::false_type {};
struct IsIntegral<__nv_bfloat16>: std::false_type {};
#endif
}
@@ -126,7 +126,7 @@ namespace {
}
#if RCCL_BFLOAT16 == 1
template<>
__host__ __device__ hip_bfloat16 castTo<hip_bfloat16>(float x) {
__host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(float x) {
return hip_bfloat16(x);
}
#endif
@@ -153,7 +153,7 @@ struct ReduceSum {
return __float2half(__half2float(a) + __half2float(b));
}
#if RCCL_BFLOAT16 == 1
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
__host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
return hip_bfloat16(static_cast<float>(a) + static_cast<float>(b));
}
#endif
@@ -169,7 +169,7 @@ struct ReduceProd {
return __float2half(__half2float(a) * __half2float(b));
}
#if RCCL_BFLOAT16 == 1
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
__host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
return hip_bfloat16(static_cast<float>(a) * static_cast<float>(b));
}
#endif
@@ -185,7 +185,7 @@ struct ReduceMin {
return __half2float(a) < __half2float(b) ? a : b;
}
#if RCCL_BFLOAT16 == 1
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
__host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
return static_cast<float>(a) < static_cast<float>(b) ? a : b;
}
#endif
@@ -201,7 +201,7 @@ struct ReduceMax {
return __half2float(a) > __half2float(b) ? a : b;
}
#if RCCL_BFLOAT16 == 1
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
__host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
return static_cast<float>(a) > static_cast<float>(b) ? a : b;
}
#endif
@@ -280,7 +280,7 @@ struct FloatLayout<__half> {
};
#if RCCL_BFLOAT16 == 1
template<>
struct FloatLayout<hip_bfloat16> {
struct FloatLayout<__nv_bfloat16> {
static constexpr int exponent_bits = 8, mantissa_bits = 7;
static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
};
@@ -801,7 +801,7 @@ __global__ void prepareInput2(
template<typename ReduceOp>
void prepareInput1(
void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
) {
int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
#define CASE_TY(T) prepareInput2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break;
@@ -814,7 +814,7 @@ void prepareInput1(
case ncclUint64: CASE_TY(uint64_t)
case ncclFloat16: CASE_TY(__half)
#if HAVE_ncclBfloat16
case ncclBfloat16: CASE_TY(hip_bfloat16)
case ncclBfloat16: CASE_TY(__nv_bfloat16)
#endif
case ncclFloat32: CASE_TY(float)
case ncclFloat64: CASE_TY(double)
@@ -826,7 +826,7 @@ void prepareInput1(
void ncclVerifiablePrepareInput(
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
) {
#define CASE_OP(op) \
if(rank_n == 1) \
@@ -877,7 +877,7 @@ __global__ void prepareExpected2(
template<typename ReduceOp>
void prepareExpected1(
void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
) {
int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
#define CASE_TY(T) prepareExpected2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break;
@@ -890,7 +890,7 @@ void prepareExpected1(
case ncclUint64: CASE_TY(uint64_t)
case ncclFloat16: CASE_TY(__half)
#if HAVE_ncclBfloat16
case ncclBfloat16: CASE_TY(hip_bfloat16)
case ncclBfloat16: CASE_TY(__nv_bfloat16)
#endif
case ncclFloat32: CASE_TY(float)
case ncclFloat64: CASE_TY(double)
@@ -902,7 +902,7 @@ void prepareExpected1(
void ncclVerifiablePrepareExpected(
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
) {
#define CASE_OP(op) \
if(rank_n == 1) \
@@ -1051,7 +1051,7 @@ __global__ void verifyInline2(
template<typename T, typename Uint>
void verifyInline1(
T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
unsigned tolerance, int64_t *bad_elt_n, hipStream_t stream, int block_n
unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n
) {
#define CASE_OP(op) \
if(rank_n == 1) \
@@ -1080,7 +1080,7 @@ void verifyInline1(
void ncclVerifiableVerify(
void const *results, void const *expected, intptr_t elt_n, int elt_ty,
int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
int64_t *bad_elt_n, hipStream_t stream
int64_t *bad_elt_n, cudaStream_t stream
) {
bool floating = elt_ty == ncclFloat16 || elt_ty == ncclFloat32 || elt_ty == ncclFloat64;
#if HAVE_ncclBfloat16
@@ -1112,7 +1112,7 @@ void ncclVerifiableVerify(
case ncclUint64: CASE_TY(uint64_t, uint64_t)
case ncclFloat16: CASE_TY(__half, uint16_t)
#if HAVE_ncclBfloat16
case ncclBfloat16: CASE_TY(hip_bfloat16, uint16_t)
case ncclBfloat16: CASE_TY(__nv_bfloat16, uint16_t)
#endif
case ncclFloat32: CASE_TY(float, uint32_t)
case ncclFloat64: CASE_TY(double, uint64_t)
@@ -1179,7 +1179,7 @@ __global__ void sweep() {
sweep1<uint64_t>(ncclUint64, "uint64");
sweep1<__half>(ncclFloat16, "half");
#if HAVE_ncclBfloat16
sweep1<hip_bfloat16>(ncclBfloat16, "bfloat16");
sweep1<__nv_bfloat16>(ncclBfloat16, "bfloat16");
#endif
sweep1<float>(ncclFloat32, "float");
sweep1<double>(ncclFloat64, "double");
@@ -1187,9 +1187,9 @@ __global__ void sweep() {
int main(int arg_n, char **args) {
std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
hipSetDevice(0);
cudaSetDevice(0);
sweep<<<1,512>>>();
hipDeviceSynchronize();
cudaDeviceSynchronize();
return 0;
}
#endif
+4 -4
Просмотреть файл
@@ -8,7 +8,7 @@
#ifndef _d41d8cd98f00b204e9800998ecf8427e
#define _d41d8cd98f00b204e9800998ecf8427e
#include <hip/hip_runtime.h>
#include <cuda_runtime.h>
#include <stdint.h>
@@ -43,13 +43,13 @@ __host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
// Enqueue kernel to generate data which is to be reduced.
void ncclVerifiablePrepareInput(
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
);
// Enqueue kernel to generate expected results of reduction.
void ncclVerifiablePrepareExpected(
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
);
// Enqueue kernel to verify reduced data matches expectation. The number of
@@ -61,6 +61,6 @@ void ncclVerifiablePrepareExpected(
void ncclVerifiableVerify(
void const *results, void const *expected, intptr_t elt_n, int elt_ty,
int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
int64_t *bad_elt_n, hipStream_t stream
int64_t *bad_elt_n, cudaStream_t stream
);
#endif
+16 -6
Просмотреть файл
@@ -1,5 +1,5 @@
# Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
# Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
# Modifications Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
@@ -8,11 +8,21 @@
# TEST_VERIFIABLE_SRCDIR = <points to this directory>
# TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFY_REDUCE_HDRS)
${HIPIFY_DIR}/verifiable.cu.cpp: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
@printf "Hipifying %-35s > %s\n" $< $@
@mkdir -p ${HIPIFY_DIR}
hipify-perl -quiet-warnings $< > $@
${HIPIFY_DIR}/verifiable.h: $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
@printf "Hipifying %-35s > %s\n" $< $@
@mkdir -p ${HIPIFY_DIR}
hipify-perl -quiet-warnings $< > $@
$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(HIPIFY_DIR)/verifiable.cu.cpp $(HIPIFY_DIR)/verifiable.h
@printf "Compiling %s\n" $@
@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu"
$(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<