коммит произвёл
GitHub
родитель
b9b73002da
Коммит
88cf7dbf45
+95
-27
@@ -1,37 +1,33 @@
|
||||
# ########################################################################
|
||||
# Copyright 2022 Advanced Micro Devices, Inc.
|
||||
# Copyright 2022-2024 Advanced Micro Devices, Inc.
|
||||
# ########################################################################
|
||||
|
||||
# Compile common object library
|
||||
set_property(SOURCE common.cu timer.cc ../verifiable/verifiable.cu PROPERTY LANGUAGE CXX)
|
||||
add_library(rccl_common OBJECT common.cu timer.cc ../verifiable/verifiable.cu)
|
||||
target_link_libraries(rccl_common roc::rccl hip::device)
|
||||
if(USE_MPI)
|
||||
target_link_libraries(rccl_common MPI::MPI_CXX)
|
||||
endif()
|
||||
|
||||
function(add_relative_test test_name test_target)
|
||||
get_target_property(EXE_PATH ${test_target} RUNTIME_OUTPUT_DIRECTORY)
|
||||
if(EXE_PATH STREQUAL "EXE_PATH-NOTFOUND")
|
||||
set(EXE_PATH ".")
|
||||
endif()
|
||||
get_filename_component(EXE_PATH "${EXE_PATH}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
|
||||
get_target_property(EXE_NAME ${test_target} RUNTIME_OUTPUT_NAME)
|
||||
if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
|
||||
get_target_property(EXE_NAME ${test_target} OUTPUT_NAME)
|
||||
if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
|
||||
set(EXE_NAME "${test_target}")
|
||||
endif()
|
||||
endif()
|
||||
file(RELATIVE_PATH rel_path "${CMAKE_CURRENT_BINARY_DIR}" "${EXE_PATH}/${EXE_NAME}")
|
||||
add_test(NAME "${test_name}" COMMAND "./${rel_path}")
|
||||
endfunction()
|
||||
|
||||
function(add_rccl_test TEST)
|
||||
set(TEST_SOURCE "${TEST}.cu")
|
||||
set_property(SOURCE ${TEST_SOURCE} PROPERTY LANGUAGE CXX)
|
||||
|
||||
# Check that file exists
|
||||
if (NOT EXISTS ${SOURCE_DIR}/${TEST_SOURCE})
|
||||
message(FATAL_ERROR "Unable to find file listed in CMakeLists.txt: ${SOURCE_DIR}/${TEST_SOURCE}")
|
||||
endif()
|
||||
|
||||
# Establish hipified copy of the source file
|
||||
set(HIP_FILE "${HIPIFY_DIR}/${TEST_SOURCE}")
|
||||
get_filename_component(HIP_FILE_DIR ${HIP_FILE} DIRECTORY)
|
||||
|
||||
# Convert .cu files to .cpp so that they get processed properly
|
||||
string(REPLACE "\.cu" "\.cu.cpp" HIP_FILE ${HIP_FILE})
|
||||
|
||||
# Create a custom command to create hipified source code
|
||||
add_custom_command(
|
||||
OUTPUT ${HIP_FILE}
|
||||
COMMAND mkdir -p ${HIP_FILE_DIR} && $ ${hipify-perl_executable} -quiet-warnings ${SOURCE_DIR}/${TEST_SOURCE} -o ${HIP_FILE}
|
||||
MAIN_DEPENDENCY ${TEST_SOURCE}
|
||||
COMMENT "Hipifying ${TEST_SOURCE} -> ${HIP_FILE}"
|
||||
)
|
||||
|
||||
set(TEST_TARGET "${TEST}_perf")
|
||||
add_executable(${TEST_TARGET} ${TEST_SOURCE})
|
||||
add_executable(${TEST_TARGET} ${HIP_FILE})
|
||||
target_link_libraries(
|
||||
${TEST_TARGET}
|
||||
PRIVATE
|
||||
@@ -52,6 +48,78 @@ function(add_rccl_test TEST)
|
||||
)
|
||||
endfunction()
|
||||
|
||||
function(add_relative_test test_name test_target)
|
||||
get_target_property(EXE_PATH ${test_target} RUNTIME_OUTPUT_DIRECTORY)
|
||||
if(EXE_PATH STREQUAL "EXE_PATH-NOTFOUND")
|
||||
set(EXE_PATH ".")
|
||||
endif()
|
||||
get_filename_component(EXE_PATH "${EXE_PATH}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
|
||||
get_target_property(EXE_NAME ${test_target} RUNTIME_OUTPUT_NAME)
|
||||
if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
|
||||
get_target_property(EXE_NAME ${test_target} OUTPUT_NAME)
|
||||
if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
|
||||
set(EXE_NAME "${test_target}")
|
||||
endif()
|
||||
endif()
|
||||
file(RELATIVE_PATH rel_path "${CMAKE_CURRENT_BINARY_DIR}" "${EXE_PATH}/${EXE_NAME}")
|
||||
add_test(NAME "${test_name}" COMMAND "./${rel_path}")
|
||||
endfunction()
|
||||
|
||||
# Collect list of common source files
|
||||
#==================================================================================================
|
||||
set(COMMON_FILES
|
||||
common.h
|
||||
common.cu
|
||||
nccl1_compat.h
|
||||
rccl_bfloat16.h
|
||||
timer.h
|
||||
timer.cc
|
||||
../verifiable/verifiable.h
|
||||
../verifiable/verifiable.cu
|
||||
)
|
||||
|
||||
# Hipify common files (copy of source generated into hipify directory)
|
||||
#==================================================================================================
|
||||
find_program(hipify-perl_executable hipify-perl)
|
||||
set(HIPIFY_DIR "${CMAKE_CURRENT_BINARY_DIR}/hipify")
|
||||
set(SOURCE_DIR "${CMAKE_SOURCE_DIR}/src")
|
||||
|
||||
## Loop over each common file to hipify
|
||||
foreach(COMMON_FILE ${COMMON_FILES})
|
||||
# Check that file exists
|
||||
if (NOT EXISTS ${SOURCE_DIR}/${COMMON_FILE})
|
||||
message(FATAL_ERROR "Unable to find file listed in CMakeLists.txt: ${SOURCE_DIR}/${COMMON_FILE}")
|
||||
endif()
|
||||
|
||||
# Establish hipified copy of the common file
|
||||
get_filename_component(HIP_FILE_NAME ${HIPIFY_DIR}/${COMMON_FILE} NAME)
|
||||
set(HIP_FILE "${HIPIFY_DIR}/${HIP_FILE_NAME}")
|
||||
|
||||
# Convert .cu files to .cpp so that they get processed properly
|
||||
string(REPLACE "\.cu" "\.cu.cpp" HIP_FILE ${HIP_FILE})
|
||||
list(APPEND HIP_COMMON_SOURCES ${HIP_FILE})
|
||||
|
||||
# Create a custom command to create hipified source code
|
||||
add_custom_command(
|
||||
OUTPUT ${HIP_FILE}
|
||||
COMMAND mkdir -p ${HIPIFY_DIR} && $ ${hipify-perl_executable} -quiet-warnings ${SOURCE_DIR}/${COMMON_FILE} -o ${HIP_FILE}
|
||||
MAIN_DEPENDENCY ${COMMON_FILE}
|
||||
COMMENT "Hipifying ${COMMON_FILE} -> ${HIP_FILE}"
|
||||
)
|
||||
endforeach()
|
||||
|
||||
# Compile common object library
|
||||
#==================================================================================================
|
||||
add_custom_target(hipify DEPENDS ${HIP_COMMON_SOURCES})
|
||||
add_library(rccl_common OBJECT ${HIP_COMMON_SOURCES})
|
||||
add_dependencies(rccl_common hipify)
|
||||
target_link_libraries(rccl_common roc::rccl hip::device)
|
||||
if(USE_MPI)
|
||||
target_link_libraries(rccl_common MPI::MPI_CXX)
|
||||
endif()
|
||||
|
||||
# Compile tests
|
||||
#==================================================================================================
|
||||
add_rccl_test(all_gather)
|
||||
add_rccl_test(all_reduce)
|
||||
add_rccl_test(alltoall)
|
||||
|
||||
+17
-4
@@ -1,6 +1,6 @@
|
||||
#
|
||||
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Modifications are Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
@@ -41,6 +41,9 @@ endif
|
||||
.PHONY: build clean
|
||||
|
||||
BUILDDIR ?= ../build
|
||||
HIPIFY_DIR ?= $(BUILDDIR)/hipify
|
||||
|
||||
.PRECIOUS: $(HIPIFY_DIR)/%.cu.cpp $(HIPIFY_DIR)/%.h
|
||||
|
||||
ifeq ($(MPI), 1)
|
||||
HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
|
||||
@@ -68,11 +71,21 @@ TEST_VERIFIABLE_SRCDIR := ../verifiable
|
||||
TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable
|
||||
include ../verifiable/verifiable.mk
|
||||
|
||||
${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
|
||||
${HIPIFY_DIR}/%.cu.cpp: %.cu
|
||||
@printf "Hipifying %-35s > %s\n" $< $@
|
||||
@mkdir -p ${HIPIFY_DIR}
|
||||
hipify-perl -quiet-warnings $< > $@
|
||||
|
||||
${HIPIFY_DIR}/%.h: %.h
|
||||
@printf "Hipifying %-35s > %s\n" $< $@
|
||||
@mkdir -p ${HIPIFY_DIR}
|
||||
hipify-perl -quiet-warnings $< > $@
|
||||
|
||||
${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cu.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS)
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
@mkdir -p ${DST_DIR}
|
||||
echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
|
||||
$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<
|
||||
echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -I. -c $<"
|
||||
$(HIPCC) -o $@ $(HIPCUFLAGS) -I. -c $<
|
||||
|
||||
${DST_DIR}/timer.o: timer.cc timer.h
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
#define ALIGN 4
|
||||
@@ -25,15 +25,15 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
int nranks = args->nProcs*args->nThreads*args->nGpus;
|
||||
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
CUDACHECK(cudaSetDevice(args->gpus[i]));
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
|
||||
for (int j=0; j<nranks; j++) {
|
||||
TESTCHECK(InitData((char*)args->expected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
@@ -46,7 +46,7 @@ void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, doubl
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
@@ -22,13 +22,13 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
int nranks = args->nProcs*args->nThreads*args->nGpus;
|
||||
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
CUDACHECK(cudaSetDevice(args->gpus[i]));
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
|
||||
TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
@@ -41,7 +41,7 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
@@ -22,16 +22,16 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, nccl
|
||||
int nranks = args->nProcs*args->nThreads*args->nGpus;
|
||||
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
CUDACHECK(cudaSetDevice(args->gpus[i]));
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
|
||||
for (int j=0; j<nranks; j++) {
|
||||
size_t partcount = sendcount/nranks;
|
||||
TESTCHECK(InitData((char*)args->expected[i] + j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0));
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
// We don't support in-place alltoall
|
||||
args->reportErrors = in_place ? 0 : 1;
|
||||
@@ -46,7 +46,7 @@ void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
#define USE_RCCL_GATHER_SCATTER
|
||||
@@ -32,15 +32,15 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
int nranks = args->nProcs*args->nThreads*args->nGpus;
|
||||
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
CUDACHECK(cudaSetDevice(args->gpus[i]));
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0));
|
||||
|
||||
#if 0
|
||||
int *dataHost = (int *)malloc(args->sendBytes);
|
||||
hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost);
|
||||
cudaMemcpy(dataHost, data, args->sendBytes, cudaMemcpyDeviceToHost);
|
||||
printf(" Rank [%d] Original: ", rank);
|
||||
for(int j=0; j<sendcount; j++) {
|
||||
printf("%d:%d ", j, dataHost[j]);
|
||||
@@ -68,7 +68,7 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
TESTCHECK(InitData(((char*)args->expected[i])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0));
|
||||
rdisp += rcount;
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
// We don't support in-place alltoall
|
||||
args->reportErrors = in_place ? 0 : 1;
|
||||
@@ -83,7 +83,7 @@ void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, doubl
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
int nranks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nranks));
|
||||
int rank;
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
@@ -21,13 +21,13 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
size_t recvcount = args->expectedBytes / wordSize(type);
|
||||
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
CUDACHECK(cudaSetDevice(args->gpus[i]));
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
|
||||
if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
|
||||
TESTCHECK(InitData(args->expected[i], recvcount, 0, type, ncclSum, rep, 1, 0));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
@@ -40,7 +40,7 @@ void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, doubl
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
int rank;
|
||||
NCCLCHECK(ncclCommUserRank(comm, &rank));
|
||||
#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
|
||||
|
||||
+65
-64
@@ -6,7 +6,7 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "cuda_runtime.h"
|
||||
#include "rccl_bfloat16.h"
|
||||
#include "common.h"
|
||||
#include <pthread.h>
|
||||
@@ -14,10 +14,11 @@
|
||||
#include <type_traits>
|
||||
#include <getopt.h>
|
||||
#include <libgen.h>
|
||||
#include "cuda.h"
|
||||
|
||||
//#define DEBUG_PRINT
|
||||
|
||||
#include "../verifiable/verifiable.h"
|
||||
#include "verifiable.h"
|
||||
|
||||
int test_ncclVersion = 0; // init'd with ncclGetVersion()
|
||||
|
||||
@@ -165,18 +166,18 @@ static bool minReqVersion(int rmajor, int rminor, int rpatch)
|
||||
}
|
||||
|
||||
testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) {
|
||||
ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, hipStreamDefault);
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault);
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) {
|
||||
ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, hipStreamDefault);
|
||||
ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault);
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) {
|
||||
ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, hipStreamDefault);
|
||||
ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault);
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
@@ -271,11 +272,11 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
size_t count = args->expectedBytes/wordSize(type);
|
||||
|
||||
int64_t *wrongPerGpu = nullptr;
|
||||
HIPCHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), hipHostMallocMapped));
|
||||
CUDACHECK(hipHostMalloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), cudaHostAllocMapped));
|
||||
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
CUDACHECK(cudaSetDevice(args->gpus[i]));
|
||||
void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
|
||||
|
||||
TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i));
|
||||
@@ -286,8 +287,8 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
char *expectedHost = (char*)malloc(args->expectedBytes);
|
||||
char *dataHost = (char*)malloc(args->expectedBytes);
|
||||
int eltsz = wordSize(type);
|
||||
hipMemcpy(expectedHost, args->expected[i], args->expectedBytes, hipMemcpyDeviceToHost);
|
||||
hipMemcpy(dataHost, data, args->expectedBytes, hipMemcpyDeviceToHost);
|
||||
cudaMemcpy(expectedHost, args->expected[i], args->expectedBytes, cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(dataHost, data, args->expectedBytes, cudaMemcpyDeviceToHost);
|
||||
|
||||
for(int j=0; j<args->expectedBytes/eltsz; j++) {
|
||||
unsigned long long want, got;
|
||||
@@ -307,14 +308,14 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
|
||||
*wrongElts = 0;
|
||||
for (int i=0; i < args->nGpus; i++) *wrongElts += wrongPerGpu[i];
|
||||
hipHostFree(wrongPerGpu);
|
||||
cudaFreeHost(wrongPerGpu);
|
||||
|
||||
if (args->reportErrors && *wrongElts) args->errors[0]++;
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t* comms) {
|
||||
hipError_t hipErr;
|
||||
testResult_t testStreamSynchronize(int ngpus, cudaStream_t* streams, ncclComm_t* comms) {
|
||||
cudaError_t cudaErr;
|
||||
int remaining = ngpus;
|
||||
int* done = (int*)malloc(sizeof(int)*ngpus);
|
||||
memset(done, 0, sizeof(int)*ngpus);
|
||||
@@ -325,15 +326,15 @@ testResult_t testStreamSynchronize(int ngpus, hipStream_t* streams, ncclComm_t*
|
||||
for (int i=0; i<ngpus; i++) {
|
||||
if (done[i]) continue;
|
||||
|
||||
hipErr = hipStreamQuery(streams[i]);
|
||||
if (hipErr == hipSuccess) {
|
||||
cudaErr = cudaStreamQuery(streams[i]);
|
||||
if (cudaErr == cudaSuccess) {
|
||||
done[i] = 1;
|
||||
remaining--;
|
||||
idle = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (hipErr != hipErrorNotReady) HIPCHECK(hipErr);
|
||||
if (cudaErr != cudaErrorNotReady) CUDACHECK(cudaErr);
|
||||
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,4,0)
|
||||
if (test_ncclVersion >= NCCL_VERSION(2,4,0) && comms) {
|
||||
@@ -382,7 +383,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
if (args->nGpus > 1) NCCLCHECK(ncclGroupStart());
|
||||
for (int i = 0; i < args->nGpus; i++) {
|
||||
#ifndef NCCL_MAJOR
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
CUDACHECK(cudaSetDevice(args->gpus[i]));
|
||||
#endif
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
|
||||
char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
|
||||
@@ -463,16 +464,16 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
Barrier(args);
|
||||
|
||||
#if HIP_VERSION >= 50221310
|
||||
hipGraph_t graphs[args->nGpus];
|
||||
hipGraphExec_t graphExec[args->nGpus];
|
||||
cudaGraph_t graphs[args->nGpus];
|
||||
cudaGraphExec_t graphExec[args->nGpus];
|
||||
if (cudaGraphLaunches >= 1) {
|
||||
// Begin cuda graph capture
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
// Thread local mdoe is needed for:
|
||||
// - Multi-thread mode: where graph capture and instantiation can happen concurrently across threads
|
||||
// - P2P pre-connect: when there is no warm-up, P2P pre-connect is done during graph capture.
|
||||
// Since pre-connect calls hipMalloc, we cannot use global capture mode
|
||||
HIPCHECK(hipStreamBeginCapture(args->streams[i], hipStreamCaptureModeThreadLocal));
|
||||
// Since pre-connect calls cudaMalloc, we cannot use global capture mode
|
||||
CUDACHECK(cudaStreamBeginCapture(args->streams[i], cudaStreamCaptureModeThreadLocal));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -491,18 +492,18 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
if (cudaGraphLaunches >= 1) {
|
||||
// End cuda graph capture
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
HIPCHECK(hipStreamEndCapture(args->streams[i], graphs+i));
|
||||
CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
|
||||
}
|
||||
// Instantiate cuda graph
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
HIPCHECK(hipGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
|
||||
CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
|
||||
}
|
||||
// Resync CPU, restart timing, launch cuda graph
|
||||
Barrier(args);
|
||||
tim.reset();
|
||||
for (int l=0; l<cudaGraphLaunches; l++) {
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i]));
|
||||
CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -520,8 +521,8 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
if (cudaGraphLaunches >= 1) {
|
||||
//destroy cuda graph
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
HIPCHECK(hipGraphExecDestroy(graphExec[i]));
|
||||
HIPCHECK(hipGraphDestroy(graphs[i]));
|
||||
CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
|
||||
CUDACHECK(cudaGraphDestroy(graphs[i]));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -542,7 +543,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
if (cudaGraphLaunches >= 1) {
|
||||
// Begin cuda graph capture for data check
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
HIPCHECK(hipStreamBeginCapture(args->streams[i], args->nThreads > 1 ? hipStreamCaptureModeThreadLocal : hipStreamCaptureModeGlobal));
|
||||
CUDACHECK(cudaStreamBeginCapture(args->streams[i], args->nThreads > 1 ? cudaStreamCaptureModeThreadLocal : cudaStreamCaptureModeGlobal));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -554,15 +555,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
if (cudaGraphLaunches >= 1) {
|
||||
// End cuda graph capture
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
HIPCHECK(hipStreamEndCapture(args->streams[i], graphs+i));
|
||||
CUDACHECK(cudaStreamEndCapture(args->streams[i], graphs+i));
|
||||
}
|
||||
// Instantiate cuda graph
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
HIPCHECK(hipGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
|
||||
CUDACHECK(cudaGraphInstantiate(graphExec+i, graphs[i], NULL, NULL, 0));
|
||||
}
|
||||
// Launch cuda graph
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i]));
|
||||
CUDACHECK(cudaGraphLaunch(graphExec[i], args->streams[i]));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -573,8 +574,8 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
if (cudaGraphLaunches >= 1) {
|
||||
//destroy cuda graph
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
HIPCHECK(hipGraphExecDestroy(graphExec[i]));
|
||||
HIPCHECK(hipGraphDestroy(graphs[i]));
|
||||
CUDACHECK(cudaGraphExecDestroy(graphExec[i]));
|
||||
CUDACHECK(cudaGraphDestroy(graphs[i]));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -664,7 +665,7 @@ testResult_t threadRunTests(struct threadArgs* args) {
|
||||
// Set device to the first of our GPUs. If we don't do that, some operations
|
||||
// will be done on the current GPU (by default : 0) and if the GPUs are in
|
||||
// exclusive mode those operations will fail.
|
||||
HIPCHECK(hipSetDevice(args->gpus[0]));
|
||||
CUDACHECK(cudaSetDevice(args->gpus[0]));
|
||||
TESTCHECK(ncclTestEngine.runTest(args, ncclroot, (ncclDataType_t)nccltype, test_typenames[nccltype], (ncclRedOp_t)ncclop, test_opnames[ncclop]));
|
||||
return testSuccess;
|
||||
}
|
||||
@@ -680,7 +681,7 @@ testResult_t threadInit(struct threadArgs* args) {
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int rank = args->proc*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
CUDACHECK(cudaSetDevice(args->gpus[i]));
|
||||
NCCLCHECK(ncclCommInitRank(args->comms+i, nranks, args->ncclId, rank));
|
||||
}
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
@@ -705,29 +706,29 @@ testResult_t threadLaunch(struct testThread* thread) {
|
||||
|
||||
testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
|
||||
if (memorytype == ncclFine) {
|
||||
HIPCHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
|
||||
HIPCHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
|
||||
if (datacheck) HIPCHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained));
|
||||
CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
|
||||
CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
|
||||
if (datacheck) CUDACHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained));
|
||||
}
|
||||
else if (memorytype == ncclHost) {
|
||||
HIPCHECK(hipHostMalloc(sendbuff, nbytes));
|
||||
HIPCHECK(hipHostMalloc(recvbuff, nbytes));
|
||||
if (datacheck) HIPCHECK(hipHostMalloc(expected, recvBytes));
|
||||
CUDACHECK(hipHostMalloc(sendbuff, nbytes));
|
||||
CUDACHECK(hipHostMalloc(recvbuff, nbytes));
|
||||
if (datacheck) CUDACHECK(hipHostMalloc(expected, recvBytes));
|
||||
}
|
||||
else if (memorytype == ncclManaged) {
|
||||
HIPCHECK(hipMallocManaged(sendbuff, nbytes));
|
||||
HIPCHECK(hipMallocManaged(recvbuff, nbytes));
|
||||
if (datacheck) HIPCHECK(hipMallocManaged(expected, recvBytes));
|
||||
CUDACHECK(cudaMallocManaged(sendbuff, nbytes));
|
||||
CUDACHECK(cudaMallocManaged(recvbuff, nbytes));
|
||||
if (datacheck) CUDACHECK(cudaMallocManaged(expected, recvBytes));
|
||||
#if 0
|
||||
HIPCHECK(hipMemset(*sendbuff, 0, nbytes));
|
||||
HIPCHECK(hipMemset(*recvbuff, 0, nbytes));
|
||||
if (datacheck) HIPCHECK(hipMemset(*expected, 0, recvBytes));
|
||||
CUDACHECK(cudaMemset(*sendbuff, 0, nbytes));
|
||||
CUDACHECK(cudaMemset(*recvbuff, 0, nbytes));
|
||||
if (datacheck) CUDACHECK(cudaMemset(*expected, 0, recvBytes));
|
||||
#endif
|
||||
}
|
||||
else {
|
||||
HIPCHECK(hipMalloc(sendbuff, nbytes));
|
||||
HIPCHECK(hipMalloc(recvbuff, nbytes));
|
||||
if (datacheck) HIPCHECK(hipMalloc(expected, recvBytes));
|
||||
CUDACHECK(cudaMalloc(sendbuff, nbytes));
|
||||
CUDACHECK(cudaMalloc(recvbuff, nbytes));
|
||||
if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
@@ -940,7 +941,7 @@ int main(int argc, char* argv[]) {
|
||||
}
|
||||
}
|
||||
|
||||
HIPCHECK(hipGetDeviceCount(&numDevices));
|
||||
CUDACHECK(cudaGetDeviceCount(&numDevices));
|
||||
#ifndef MPI_SUPPORT
|
||||
if (nGpus > numDevices)
|
||||
{
|
||||
@@ -1016,10 +1017,10 @@ testResult_t run() {
|
||||
for (int i=0; i<nThreads*nGpus; i++) {
|
||||
int cudaDev = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
|
||||
int rank = proc*nThreads*nGpus+i;
|
||||
hipDeviceProp_t prop;
|
||||
HIPCHECK(hipGetDeviceProperties(&prop, cudaDev));
|
||||
cudaDeviceProp prop;
|
||||
CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
|
||||
char busIdStr[] = "00000000:00:00.0";
|
||||
HIPCHECK(hipDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev));
|
||||
CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev));
|
||||
len += snprintf(line+len, MAX_LINE>len ? MAX_LINE-len : 0, "# Rank %2d Pid %6d on %10s device %2d [%s] %s\n",
|
||||
rank, getpid(), hostname, cudaDev, busIdStr, prop.name);
|
||||
maxMem = std::min(maxMem, prop.totalGlobalMem);
|
||||
@@ -1055,7 +1056,7 @@ testResult_t run() {
|
||||
#endif
|
||||
|
||||
int gpus[nGpus*nThreads];
|
||||
hipStream_t streams[nGpus*nThreads];
|
||||
cudaStream_t streams[nGpus*nThreads];
|
||||
void* sendbuffs[nGpus*nThreads];
|
||||
void* recvbuffs[nGpus*nThreads];
|
||||
void* expected[nGpus*nThreads];
|
||||
@@ -1067,12 +1068,12 @@ testResult_t run() {
|
||||
gpu0 = envstr ? atoi(envstr) : -1;
|
||||
for (int i=0; i<nGpus*nThreads; i++) {
|
||||
gpus[i] = (gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i;
|
||||
HIPCHECK(hipSetDevice(gpus[i]));
|
||||
CUDACHECK(cudaSetDevice(gpus[i]));
|
||||
TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes));
|
||||
if (streamnull)
|
||||
streams[i] = NULL;
|
||||
else
|
||||
HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
|
||||
CUDACHECK(cudaStreamCreateWithFlags(streams+i, cudaStreamNonBlocking));
|
||||
}
|
||||
|
||||
//if parallel init is not selected, use main thread to initialize NCCL
|
||||
@@ -1083,7 +1084,7 @@ testResult_t run() {
|
||||
} else {
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
for (int i=0; i<nGpus*nThreads; i++) {
|
||||
HIPCHECK(hipSetDevice(gpus[i]));
|
||||
CUDACHECK(cudaSetDevice(gpus[i]));
|
||||
NCCLCHECK(ncclCommInitRank(comms+i, ncclProcs*nThreads*nGpus, ncclId, ncclProc*nThreads*nGpus+i));
|
||||
}
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
@@ -1093,7 +1094,7 @@ testResult_t run() {
|
||||
int errors[nThreads];
|
||||
double bw[nThreads];
|
||||
double* delta;
|
||||
HIPCHECK(hipHostMalloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, hipHostMallocPortable | hipHostMallocMapped));
|
||||
CUDACHECK(hipHostMalloc(&delta, sizeof(double)*nThreads*NUM_BLOCKS, cudaHostAllocPortable | cudaHostAllocMapped));
|
||||
int bw_count[nThreads];
|
||||
for (int t=0; t<nThreads; t++) {
|
||||
bw[t] = 0.0;
|
||||
@@ -1178,11 +1179,11 @@ testResult_t run() {
|
||||
|
||||
// Free off CUDA allocated memory
|
||||
for (int i=0; i<nGpus*nThreads; i++) {
|
||||
if (sendbuffs[i]) HIPCHECK(hipFree((char*)sendbuffs[i]));
|
||||
if (recvbuffs[i]) HIPCHECK(hipFree((char*)recvbuffs[i]));
|
||||
if (datacheck) HIPCHECK(hipFree(expected[i]));
|
||||
if (sendbuffs[i]) CUDACHECK(cudaFree((char*)sendbuffs[i]));
|
||||
if (recvbuffs[i]) CUDACHECK(cudaFree((char*)recvbuffs[i]));
|
||||
if (datacheck) CUDACHECK(cudaFree(expected[i]));
|
||||
}
|
||||
HIPCHECK(hipHostFree(delta));
|
||||
CUDACHECK(cudaFreeHost(delta));
|
||||
|
||||
envstr = getenv("NCCL_TESTS_MIN_BW");
|
||||
double check_avg_bw = envstr ? atof(envstr) : -1;
|
||||
@@ -1197,9 +1198,9 @@ testResult_t run() {
|
||||
MPI_Finalize();
|
||||
#endif
|
||||
|
||||
// 'hip-memcheck --leak-check full' requires this
|
||||
// 'cuda-memcheck --leak-check full' requires this
|
||||
PRINT("%s\n", ncclGetLastError(NULL));
|
||||
hipDeviceReset();
|
||||
cudaDeviceReset();
|
||||
|
||||
if (errors[0] || bw[0] < check_avg_bw*(0.9))
|
||||
exit(EXIT_FAILURE);
|
||||
|
||||
+7
-7
@@ -21,14 +21,14 @@
|
||||
// For nccl.h < 2.13 since we define a weak fallback
|
||||
extern "C" char const* ncclGetLastError(ncclComm_t comm);
|
||||
|
||||
#define HIPCHECK(cmd) do { \
|
||||
hipError_t e = cmd; \
|
||||
if( e != hipSuccess ) { \
|
||||
#define CUDACHECK(cmd) do { \
|
||||
cudaError_t err = cmd; \
|
||||
if( err != cudaSuccess ) { \
|
||||
char hostname[1024]; \
|
||||
getHostName(hostname, 1024); \
|
||||
printf("%s: Test HIP failure %s:%d '%s'\n", \
|
||||
printf("%s: Test CUDA failure %s:%d '%s'\n", \
|
||||
hostname, \
|
||||
__FILE__,__LINE__,hipGetErrorString(e)); \
|
||||
__FILE__,__LINE__,cudaGetErrorString(err)); \
|
||||
return testCudaError; \
|
||||
} \
|
||||
} while(0)
|
||||
@@ -93,7 +93,7 @@ struct testColl {
|
||||
ncclRedOp_t op, int root, int rep, int in_place);
|
||||
void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
|
||||
testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
|
||||
};
|
||||
extern struct testColl allReduceTest;
|
||||
extern struct testColl allGatherTest;
|
||||
@@ -133,7 +133,7 @@ struct threadArgs {
|
||||
size_t recvInplaceOffset;
|
||||
ncclUniqueId ncclId;
|
||||
ncclComm_t* comms;
|
||||
hipStream_t* streams;
|
||||
cudaStream_t* streams;
|
||||
|
||||
void** expected;
|
||||
size_t expectedBytes;
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
@@ -22,16 +22,16 @@ testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
|
||||
int nranks = args->nProcs*args->nThreads*args->nGpus;
|
||||
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
CUDACHECK(cudaSetDevice(args->gpus[i]));
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
|
||||
TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
|
||||
HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault));
|
||||
CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
|
||||
if (rank == root) {
|
||||
TESTCHECK(InitData(args->expected[i], nranks*sendcount, 0, type, ncclSum, rep, 1, 0));
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
@@ -44,7 +44,7 @@ void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double*
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
int nRanks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
int rank;
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
#define ALIGN 4
|
||||
@@ -25,15 +25,15 @@ testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
int nranks = args->nProcs*args->nThreads*args->nGpus;
|
||||
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
CUDACHECK(cudaSetDevice(args->gpus[i]));
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
void* data = in_place ? ((char*)args->recvbuffs[i])+rank*args->sendBytes : args->sendbuffs[i];
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
|
||||
for (int j=0; j<nranks; j++) {
|
||||
TESTCHECK(InitData((char*)args->expected[i] + args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
@@ -46,7 +46,7 @@ void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, doubl
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
char* sbuff = (char*)sendbuff;
|
||||
char* rbuff = (char*)recvbuff;
|
||||
int nRanks;
|
||||
@@ -54,7 +54,7 @@ testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
|
||||
int rank;
|
||||
NCCLCHECK(ncclCommUserRank(comm, &rank));
|
||||
size_t rankSize = count * wordSize(type);
|
||||
if (rbuff+rank*rankSize != sbuff) HIPCHECK(hipMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, hipMemcpyDeviceToDevice, stream));
|
||||
if (rbuff+rank*rankSize != sbuff) CUDACHECK(cudaMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, cudaMemcpyDeviceToDevice, stream));
|
||||
|
||||
// Hypercube AllGather
|
||||
for (int mask=1; mask<nRanks; mask<<=1) {
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -21,28 +20,28 @@ static ncclResult_t ncclGroupEnd() { return ncclSuccess; }
|
||||
#define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument;
|
||||
|
||||
static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
CHECKCOUNT(count);
|
||||
return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream);
|
||||
}
|
||||
static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream) {
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
|
||||
CHECKCOUNT(count);
|
||||
return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream);
|
||||
}
|
||||
static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
|
||||
ncclComm_t comm, hipStream_t stream) {
|
||||
ncclComm_t comm, cudaStream_t stream) {
|
||||
CHECKCOUNT(count);
|
||||
return ncclBcast(buff, (int)count, datatype, root, comm, stream);
|
||||
}
|
||||
static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
|
||||
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
|
||||
hipStream_t stream) {
|
||||
cudaStream_t stream) {
|
||||
CHECKCOUNT(recvcount);
|
||||
return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream);
|
||||
}
|
||||
static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
|
||||
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
|
||||
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
|
||||
CHECKCOUNT(sendcount);
|
||||
return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream);
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
@@ -22,14 +22,14 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
|
||||
int nranks = args->nProcs*args->nThreads*args->nGpus;
|
||||
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
CUDACHECK(cudaSetDevice(args->gpus[i]));
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
|
||||
HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault));
|
||||
CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
|
||||
if (rank == root) TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
@@ -40,7 +40,7 @@ void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double*
|
||||
*busBw = baseBw;
|
||||
}
|
||||
|
||||
testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
#define ALIGN 4
|
||||
@@ -25,14 +25,14 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type,
|
||||
int nranks = args->nProcs*args->nThreads*args->nGpus;
|
||||
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
CUDACHECK(cudaSetDevice(args->gpus[i]));
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
|
||||
HIPCHECK(hipMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, hipMemcpyDefault));
|
||||
CUDACHECK(cudaMemcpy(args->expected[i], args->recvbuffs[i], args->expectedBytes, cudaMemcpyDefault));
|
||||
TESTCHECK(InitDataReduce(args->expected[i], recvcount, rank*recvcount, type, op, rep, nranks));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
@@ -45,7 +45,7 @@ void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, d
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
@@ -21,13 +21,13 @@ testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclR
|
||||
size_t recvcount = args->expectedBytes / wordSize(type);
|
||||
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
CUDACHECK(cudaSetDevice(args->gpus[i]));
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
|
||||
if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
|
||||
TESTCHECK(InitData(args->expected[i], recvcount, rank*recvcount, type, ncclSum, rep, 1, 0));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
@@ -40,7 +40,7 @@ void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double*
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
int nRanks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
int rank;
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
@@ -22,14 +22,14 @@ testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, nccl
|
||||
int nranks = args->nProcs*args->nThreads*args->nGpus;
|
||||
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
CUDACHECK(cudaSetDevice(args->gpus[i]));
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
|
||||
TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
|
||||
int peer = (rank-1+nranks)%nranks;
|
||||
TESTCHECK(InitData(args->expected[i], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0));
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
// We don't support in-place sendrecv
|
||||
args->reportErrors = in_place ? 0 : 1;
|
||||
@@ -44,7 +44,7 @@ void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
|
||||
testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
int nRanks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
int rank;
|
||||
|
||||
@@ -24,11 +24,11 @@
|
||||
#include <cstdio>
|
||||
#include <cstdint>
|
||||
#include <hip/hip_bfloat16.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
using std::uint64_t;
|
||||
using std::uint32_t;
|
||||
using bfloat16 = hip_bfloat16;
|
||||
using bfloat16 = __nv_bfloat16;
|
||||
|
||||
template<typename T>
|
||||
struct float_traits;
|
||||
@@ -182,14 +182,14 @@ __global__ void kernel() {
|
||||
int main() {
|
||||
std::printf("type=float:\n");
|
||||
kernel<float><<<1,32>>>();
|
||||
hipDeviceSynchronize();
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
std::printf("\ntype=half:\n");
|
||||
kernel<half><<<1,32>>>();
|
||||
hipDeviceSynchronize();
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
std::printf("\ntype=bfloat16:\n");
|
||||
kernel<bfloat16><<<1,32>>>();
|
||||
hipDeviceSynchronize();
|
||||
cudaDeviceSynchronize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -8,8 +8,8 @@
|
||||
//#pragma nv_diag_suppress declared_but_not_referenced
|
||||
|
||||
#include "verifiable.h"
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp16.h>
|
||||
#include <hip/hip_bfloat16.h>
|
||||
|
||||
#include "rccl/rccl.h"
|
||||
@@ -91,7 +91,7 @@ template<>
|
||||
struct IsIntegral<__half>: std::false_type {};
|
||||
#if RCCL_BFLOAT16 == 1
|
||||
template<>
|
||||
struct IsIntegral<hip_bfloat16>: std::false_type {};
|
||||
struct IsIntegral<__nv_bfloat16>: std::false_type {};
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -126,7 +126,7 @@ namespace {
|
||||
}
|
||||
#if RCCL_BFLOAT16 == 1
|
||||
template<>
|
||||
__host__ __device__ hip_bfloat16 castTo<hip_bfloat16>(float x) {
|
||||
__host__ __device__ __nv_bfloat16 castTo<__nv_bfloat16>(float x) {
|
||||
return hip_bfloat16(x);
|
||||
}
|
||||
#endif
|
||||
@@ -153,7 +153,7 @@ struct ReduceSum {
|
||||
return __float2half(__half2float(a) + __half2float(b));
|
||||
}
|
||||
#if RCCL_BFLOAT16 == 1
|
||||
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
|
||||
__host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
|
||||
return hip_bfloat16(static_cast<float>(a) + static_cast<float>(b));
|
||||
}
|
||||
#endif
|
||||
@@ -169,7 +169,7 @@ struct ReduceProd {
|
||||
return __float2half(__half2float(a) * __half2float(b));
|
||||
}
|
||||
#if RCCL_BFLOAT16 == 1
|
||||
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
|
||||
__host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
|
||||
return hip_bfloat16(static_cast<float>(a) * static_cast<float>(b));
|
||||
}
|
||||
#endif
|
||||
@@ -185,7 +185,7 @@ struct ReduceMin {
|
||||
return __half2float(a) < __half2float(b) ? a : b;
|
||||
}
|
||||
#if RCCL_BFLOAT16 == 1
|
||||
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
|
||||
__host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
|
||||
return static_cast<float>(a) < static_cast<float>(b) ? a : b;
|
||||
}
|
||||
#endif
|
||||
@@ -201,7 +201,7 @@ struct ReduceMax {
|
||||
return __half2float(a) > __half2float(b) ? a : b;
|
||||
}
|
||||
#if RCCL_BFLOAT16 == 1
|
||||
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
|
||||
__host__ __device__ __nv_bfloat16 operator()(__nv_bfloat16 a, __nv_bfloat16 b) const {
|
||||
return static_cast<float>(a) > static_cast<float>(b) ? a : b;
|
||||
}
|
||||
#endif
|
||||
@@ -280,7 +280,7 @@ struct FloatLayout<__half> {
|
||||
};
|
||||
#if RCCL_BFLOAT16 == 1
|
||||
template<>
|
||||
struct FloatLayout<hip_bfloat16> {
|
||||
struct FloatLayout<__nv_bfloat16> {
|
||||
static constexpr int exponent_bits = 8, mantissa_bits = 7;
|
||||
static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
|
||||
};
|
||||
@@ -801,7 +801,7 @@ __global__ void prepareInput2(
|
||||
template<typename ReduceOp>
|
||||
void prepareInput1(
|
||||
void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me,
|
||||
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
|
||||
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
|
||||
) {
|
||||
int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
|
||||
#define CASE_TY(T) prepareInput2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break;
|
||||
@@ -814,7 +814,7 @@ void prepareInput1(
|
||||
case ncclUint64: CASE_TY(uint64_t)
|
||||
case ncclFloat16: CASE_TY(__half)
|
||||
#if HAVE_ncclBfloat16
|
||||
case ncclBfloat16: CASE_TY(hip_bfloat16)
|
||||
case ncclBfloat16: CASE_TY(__nv_bfloat16)
|
||||
#endif
|
||||
case ncclFloat32: CASE_TY(float)
|
||||
case ncclFloat64: CASE_TY(double)
|
||||
@@ -826,7 +826,7 @@ void prepareInput1(
|
||||
|
||||
void ncclVerifiablePrepareInput(
|
||||
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
|
||||
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
|
||||
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
|
||||
) {
|
||||
#define CASE_OP(op) \
|
||||
if(rank_n == 1) \
|
||||
@@ -877,7 +877,7 @@ __global__ void prepareExpected2(
|
||||
template<typename ReduceOp>
|
||||
void prepareExpected1(
|
||||
void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
|
||||
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
|
||||
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
|
||||
) {
|
||||
int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
|
||||
#define CASE_TY(T) prepareExpected2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break;
|
||||
@@ -890,7 +890,7 @@ void prepareExpected1(
|
||||
case ncclUint64: CASE_TY(uint64_t)
|
||||
case ncclFloat16: CASE_TY(__half)
|
||||
#if HAVE_ncclBfloat16
|
||||
case ncclBfloat16: CASE_TY(hip_bfloat16)
|
||||
case ncclBfloat16: CASE_TY(__nv_bfloat16)
|
||||
#endif
|
||||
case ncclFloat32: CASE_TY(float)
|
||||
case ncclFloat64: CASE_TY(double)
|
||||
@@ -902,7 +902,7 @@ void prepareExpected1(
|
||||
|
||||
void ncclVerifiablePrepareExpected(
|
||||
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
|
||||
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
|
||||
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
|
||||
) {
|
||||
#define CASE_OP(op) \
|
||||
if(rank_n == 1) \
|
||||
@@ -1051,7 +1051,7 @@ __global__ void verifyInline2(
|
||||
template<typename T, typename Uint>
|
||||
void verifyInline1(
|
||||
T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
|
||||
unsigned tolerance, int64_t *bad_elt_n, hipStream_t stream, int block_n
|
||||
unsigned tolerance, int64_t *bad_elt_n, cudaStream_t stream, int block_n
|
||||
) {
|
||||
#define CASE_OP(op) \
|
||||
if(rank_n == 1) \
|
||||
@@ -1080,7 +1080,7 @@ void verifyInline1(
|
||||
void ncclVerifiableVerify(
|
||||
void const *results, void const *expected, intptr_t elt_n, int elt_ty,
|
||||
int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
|
||||
int64_t *bad_elt_n, hipStream_t stream
|
||||
int64_t *bad_elt_n, cudaStream_t stream
|
||||
) {
|
||||
bool floating = elt_ty == ncclFloat16 || elt_ty == ncclFloat32 || elt_ty == ncclFloat64;
|
||||
#if HAVE_ncclBfloat16
|
||||
@@ -1112,7 +1112,7 @@ void ncclVerifiableVerify(
|
||||
case ncclUint64: CASE_TY(uint64_t, uint64_t)
|
||||
case ncclFloat16: CASE_TY(__half, uint16_t)
|
||||
#if HAVE_ncclBfloat16
|
||||
case ncclBfloat16: CASE_TY(hip_bfloat16, uint16_t)
|
||||
case ncclBfloat16: CASE_TY(__nv_bfloat16, uint16_t)
|
||||
#endif
|
||||
case ncclFloat32: CASE_TY(float, uint32_t)
|
||||
case ncclFloat64: CASE_TY(double, uint64_t)
|
||||
@@ -1179,7 +1179,7 @@ __global__ void sweep() {
|
||||
sweep1<uint64_t>(ncclUint64, "uint64");
|
||||
sweep1<__half>(ncclFloat16, "half");
|
||||
#if HAVE_ncclBfloat16
|
||||
sweep1<hip_bfloat16>(ncclBfloat16, "bfloat16");
|
||||
sweep1<__nv_bfloat16>(ncclBfloat16, "bfloat16");
|
||||
#endif
|
||||
sweep1<float>(ncclFloat32, "float");
|
||||
sweep1<double>(ncclFloat64, "double");
|
||||
@@ -1187,9 +1187,9 @@ __global__ void sweep() {
|
||||
|
||||
int main(int arg_n, char **args) {
|
||||
std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
|
||||
hipSetDevice(0);
|
||||
cudaSetDevice(0);
|
||||
sweep<<<1,512>>>();
|
||||
hipDeviceSynchronize();
|
||||
cudaDeviceSynchronize();
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
#ifndef _d41d8cd98f00b204e9800998ecf8427e
|
||||
#define _d41d8cd98f00b204e9800998ecf8427e
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
@@ -43,13 +43,13 @@ __host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
|
||||
// Enqueue kernel to generate data which is to be reduced.
|
||||
void ncclVerifiablePrepareInput(
|
||||
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
|
||||
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
|
||||
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
|
||||
);
|
||||
|
||||
// Enqueue kernel to generate expected results of reduction.
|
||||
void ncclVerifiablePrepareExpected(
|
||||
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
|
||||
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
|
||||
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
|
||||
);
|
||||
|
||||
// Enqueue kernel to verify reduced data matches expectation. The number of
|
||||
@@ -61,6 +61,6 @@ void ncclVerifiablePrepareExpected(
|
||||
void ncclVerifiableVerify(
|
||||
void const *results, void const *expected, intptr_t elt_n, int elt_ty,
|
||||
int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
|
||||
int64_t *bad_elt_n, hipStream_t stream
|
||||
int64_t *bad_elt_n, cudaStream_t stream
|
||||
);
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
# Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Modifications Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
|
||||
@@ -8,11 +8,21 @@
|
||||
# TEST_VERIFIABLE_SRCDIR = <points to this directory>
|
||||
# TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
|
||||
|
||||
TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
|
||||
TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
|
||||
TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
|
||||
TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
|
||||
|
||||
$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFY_REDUCE_HDRS)
|
||||
${HIPIFY_DIR}/verifiable.cu.cpp: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
|
||||
@printf "Hipifying %-35s > %s\n" $< $@
|
||||
@mkdir -p ${HIPIFY_DIR}
|
||||
hipify-perl -quiet-warnings $< > $@
|
||||
|
||||
${HIPIFY_DIR}/verifiable.h: $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
|
||||
@printf "Hipifying %-35s > %s\n" $< $@
|
||||
@mkdir -p ${HIPIFY_DIR}
|
||||
hipify-perl -quiet-warnings $< > $@
|
||||
|
||||
$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(HIPIFY_DIR)/verifiable.cu.cpp $(HIPIFY_DIR)/verifiable.h
|
||||
@printf "Compiling %s\n" $@
|
||||
@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
|
||||
echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu"
|
||||
$(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
|
||||
echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
|
||||
$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<
|
||||
|
||||
Ссылка в новой задаче
Block a user