update build rccl-tests infrastructure and add getAlgoProtoChannels support (#2212)
Tento commit je obsažen v:
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
|
||||
# Modifications are Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
@@ -15,6 +15,8 @@ CUSTOM_RCCL_LIB ?= ""
|
||||
|
||||
HIPCC ?= $(ROCM_PATH)/bin/amdclang++
|
||||
HIPCONFIG = $(ROCM_PATH)/bin/hipconfig
|
||||
HIPIFY_PL_EXE = $(ROCM_PATH)/bin/hipify-perl
|
||||
HIPIFY_PL_FLAGS = -quiet-warnings
|
||||
CXX = $(HIPCC)
|
||||
|
||||
HIPCUFLAGS := -std=c++14
|
||||
@@ -30,7 +32,10 @@ HIP_MINOR = $(shell echo $(HIP_VERSION) | cut -d "." -f 2)
|
||||
# Currently, supports gfx906,gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201
|
||||
ifndef GPU_TARGETS
|
||||
GPU_TARGETS = gfx906 gfx908 gfx90a
|
||||
ifeq ($(shell test "0$(HIP_MAJOR)" -eq 6; echo $$?),0)
|
||||
ifeq ($(shell test "0$(HIP_MAJOR)" -ge 7; echo $$?),0)
|
||||
# Include gfx942 and gfx950 support if we're using ROCm 7.0 or above
|
||||
GPU_TARGETS += gfx942 gfx950
|
||||
else ifeq ($(shell test "0$(HIP_MAJOR)" -eq 6; echo $$?),0)
|
||||
# Include gfx942 support if we're using ROCm 6.0 or above
|
||||
GPU_TARGETS += gfx942
|
||||
ifeq ($(shell test "0$(HIP_MINOR)" -ge 5; echo $$?),0)
|
||||
@@ -43,41 +48,6 @@ endif
|
||||
|
||||
GPU_TARGETS_FLAGS = $(foreach target,$(GPU_TARGETS),"--offload-arch=$(target)")
|
||||
|
||||
#CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
|
||||
#CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
|
||||
#CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
|
||||
#
|
||||
## Better define NVCC_GENCODE in your environment to the minimal set
|
||||
## of archs to reduce compile time.
|
||||
#ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
|
||||
## Include Blackwell support if we're using CUDA12.8 or above
|
||||
#NVCC_GENCODE ?= -gencode=arch=compute_80,code=sm_80 \
|
||||
# -gencode=arch=compute_90,code=sm_90 \
|
||||
# -gencode=arch=compute_100,code=sm_100 \
|
||||
# -gencode=arch=compute_120,code=sm_120 \
|
||||
# -gencode=arch=compute_120,code=compute_120
|
||||
#else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0)
|
||||
#NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
|
||||
# -gencode=arch=compute_61,code=sm_61 \
|
||||
# -gencode=arch=compute_70,code=sm_70 \
|
||||
# -gencode=arch=compute_80,code=sm_80 \
|
||||
# -gencode=arch=compute_90,code=sm_90 \
|
||||
# -gencode=arch=compute_90,code=compute_90
|
||||
#else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
|
||||
#NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
|
||||
# -gencode=arch=compute_61,code=sm_61 \
|
||||
# -gencode=arch=compute_70,code=sm_70 \
|
||||
# -gencode=arch=compute_80,code=sm_80 \
|
||||
# -gencode=arch=compute_80,code=compute_80
|
||||
#else
|
||||
#NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
|
||||
# -gencode=arch=compute_50,code=sm_50 \
|
||||
# -gencode=arch=compute_60,code=sm_60 \
|
||||
# -gencode=arch=compute_61,code=sm_61 \
|
||||
# -gencode=arch=compute_70,code=sm_70 \
|
||||
# -gencode=arch=compute_70,code=compute_70
|
||||
#endif
|
||||
|
||||
ifneq ($(NCCL_HOME), "")
|
||||
HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
|
||||
HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) -L$(NCCL_HOME)/lib
|
||||
@@ -117,7 +87,7 @@ HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/mpich/include -I
|
||||
HIPLDFLAGS += -L${MPI_HOME}/lib -L${MPI_HOME}/mpich/lib -lmpich
|
||||
endif
|
||||
|
||||
LIBRARIES += rccl
|
||||
LIBRARIES += rccl dl
|
||||
HIPLDFLAGS += $(LIBRARIES:%=-l%)
|
||||
|
||||
DST_DIR := $(BUILDDIR)
|
||||
@@ -148,14 +118,14 @@ $(GIT_VERSION_FILE):
|
||||
${HIPIFY_DIR}/%.cpp: %.cpp
|
||||
@printf "Hipifying %-35s > %s\n" $< $@
|
||||
@mkdir -p ${HIPIFY_DIR}
|
||||
hipify-perl -quiet-warnings $< > $@
|
||||
${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@
|
||||
|
||||
${HIPIFY_DIR}/%.h: %.h
|
||||
@printf "Hipifying %-35s > %s\n" $< $@
|
||||
@mkdir -p ${HIPIFY_DIR}
|
||||
hipify-perl -quiet-warnings $< > $@
|
||||
${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@
|
||||
|
||||
${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS) $(GIT_VERSION_FILE)
|
||||
${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cpp ${HIPIFY_DIR}/common.h ${HIPIFY_DIR}/rccl_compat.h $(TEST_VERIFIABLE_HDRS) $(GIT_VERSION_FILE)
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
@mkdir -p ${DST_DIR}
|
||||
echo "$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<"
|
||||
@@ -166,9 +136,8 @@ ${DST_DIR}/timer.o: timer.cc timer.h
|
||||
@mkdir -p ${DST_DIR}
|
||||
$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
|
||||
|
||||
${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) $(DST_DIR)/src/git_version.cpp
|
||||
${DST_DIR}/%_perf: ${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) $(DST_DIR)/src/git_version.cpp
|
||||
@printf "Linking %-35s > %s\n" $< $@
|
||||
@mkdir -p ${DST_DIR}
|
||||
echo "$(HIPCC) -o $@ $^ $(HIPLDFLAGS)"
|
||||
$(HIPCC) -o $@ $^ $(HIPLDFLAGS)
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "common.h"
|
||||
#include "cuda_runtime.h"
|
||||
#include "rccl_compat.h"
|
||||
|
||||
void
|
||||
AllGatherGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
|
||||
@@ -47,6 +48,16 @@ AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op,
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
testResult_t
|
||||
AllGatherGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type,
|
||||
int* algo, int* proto, int* nchannels)
|
||||
{
|
||||
if(rcclTestsGetAlgoInfo == NULL) return testInternalError;
|
||||
NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncAllGather, count, type, 0, 0, 1, algo,
|
||||
proto, nchannels));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
void
|
||||
AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
|
||||
int nranks)
|
||||
@@ -60,14 +71,17 @@ AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* bu
|
||||
|
||||
testResult_t
|
||||
AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
|
||||
void* bias = nullptr)
|
||||
{
|
||||
(void) bias;
|
||||
NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testColl allGatherTest = { "AllGather", AllGatherGetCollByteCount,
|
||||
AllGatherInitData, AllGatherGetBw, AllGatherRunColl };
|
||||
struct testColl allGatherTest = { "AllGather", AllGatherGetCollByteCount,
|
||||
AllGatherInitData, AllGatherGetBw,
|
||||
AllGatherRunColl, AllGatherGetAlgoProtoChannels };
|
||||
|
||||
void
|
||||
AllGatherGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "common.h"
|
||||
#include "cuda_runtime.h"
|
||||
#include "rccl_compat.h"
|
||||
|
||||
void
|
||||
AllReduceGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
|
||||
@@ -41,6 +42,16 @@ AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op,
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
testResult_t
|
||||
AllReduceGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type,
|
||||
int* algo, int* proto, int* nchannels)
|
||||
{
|
||||
if(rcclTestsGetAlgoInfo == NULL) return testInternalError;
|
||||
NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncAllReduce, count, type, 0, 0, 1, algo,
|
||||
proto, nchannels));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
void
|
||||
AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
|
||||
int nranks)
|
||||
@@ -54,14 +65,17 @@ AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* bu
|
||||
|
||||
testResult_t
|
||||
AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
|
||||
void* bias = nullptr)
|
||||
{
|
||||
(void) bias;
|
||||
NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testColl allReduceTest = { "AllReduce", AllReduceGetCollByteCount,
|
||||
AllReduceInitData, AllReduceGetBw, AllReduceRunColl };
|
||||
struct testColl allReduceTest = { "AllReduce", AllReduceGetCollByteCount,
|
||||
AllReduceInitData, AllReduceGetBw,
|
||||
AllReduceRunColl, AllReduceGetAlgoProtoChannels };
|
||||
|
||||
void
|
||||
AllReduceGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "common.h"
|
||||
#include "cuda_runtime.h"
|
||||
#include "rccl_compat.h"
|
||||
|
||||
void
|
||||
AlltoAllGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
|
||||
@@ -62,14 +63,17 @@ AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double* bus
|
||||
|
||||
testResult_t
|
||||
AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
|
||||
void* bias = nullptr)
|
||||
{
|
||||
(void) bias;
|
||||
NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testColl alltoAllTest = { "AlltoAll", AlltoAllGetCollByteCount, AlltoAllInitData,
|
||||
AlltoAllGetBw, AlltoAllRunColl };
|
||||
struct testColl alltoAllTest = { "AlltoAll", AlltoAllGetCollByteCount,
|
||||
AlltoAllInitData, AlltoAllGetBw,
|
||||
AlltoAllRunColl, nullptr };
|
||||
|
||||
void
|
||||
AlltoAllGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "common.h"
|
||||
#include "cuda_runtime.h"
|
||||
#include "rccl_compat.h"
|
||||
|
||||
#define USE_RCCL_GATHER_SCATTER
|
||||
|
||||
@@ -101,8 +102,10 @@ AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, double* bu
|
||||
|
||||
testResult_t
|
||||
AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
|
||||
void* bias = nullptr)
|
||||
{
|
||||
(void) bias;
|
||||
int nranks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nranks));
|
||||
int rank;
|
||||
@@ -174,8 +177,9 @@ AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t ty
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testColl alltoAllTest = { "AlltoAllv", AlltoAllvGetCollByteCount,
|
||||
AlltoAllvInitData, AlltoAllvGetBw, AlltoAllvRunColl };
|
||||
struct testColl alltoAllTest = { "AlltoAllv", AlltoAllvGetCollByteCount,
|
||||
AlltoAllvInitData, AlltoAllvGetBw,
|
||||
AlltoAllvRunColl, nullptr };
|
||||
|
||||
void
|
||||
AlltoAllvGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "common.h"
|
||||
#include "cuda_runtime.h"
|
||||
#include "rccl_compat.h"
|
||||
|
||||
void
|
||||
BroadcastGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
|
||||
@@ -41,6 +42,16 @@ BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op,
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
testResult_t
|
||||
BroadcastGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type,
|
||||
int* algo, int* proto, int* nchannels)
|
||||
{
|
||||
if(rcclTestsGetAlgoInfo == NULL) return testInternalError;
|
||||
NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncBroadcast, count, type, 0, 0, 1, algo,
|
||||
proto, nchannels));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
void
|
||||
BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
|
||||
int nranks)
|
||||
@@ -54,8 +65,10 @@ BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* bu
|
||||
|
||||
testResult_t
|
||||
BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
|
||||
void* bias = nullptr)
|
||||
{
|
||||
(void) bias;
|
||||
int rank;
|
||||
NCCLCHECK(ncclCommUserRank(comm, &rank));
|
||||
#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
|
||||
@@ -73,8 +86,9 @@ BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t ty
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testColl broadcastTest = { "Broadcast", BroadcastGetCollByteCount,
|
||||
BroadcastInitData, BroadcastGetBw, BroadcastRunColl };
|
||||
struct testColl broadcastTest = { "Broadcast", BroadcastGetCollByteCount,
|
||||
BroadcastInitData, BroadcastGetBw,
|
||||
BroadcastRunColl, BroadcastGetAlgoProtoChannels };
|
||||
|
||||
void
|
||||
BroadcastGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
|
||||
|
||||
@@ -1,18 +1,20 @@
|
||||
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "cuda.h"
|
||||
#include "cuda_runtime.h"
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip/hip_runtime_api.h"
|
||||
#include "rccl_float8.h"
|
||||
#include <cstdio>
|
||||
#include <ctype.h>
|
||||
#include <dlfcn.h>
|
||||
#include <errno.h>
|
||||
#include <getopt.h>
|
||||
#include <hip/hip_bfloat16.h>
|
||||
#include <libgen.h>
|
||||
@@ -27,10 +29,46 @@
|
||||
#include "git_version.h"
|
||||
#include "verifiable.h"
|
||||
|
||||
#define DIVUP(x, y) (((x) + (y) - 1) / (y))
|
||||
|
||||
int test_ncclVersion = 0; // init'd with ncclGetVersion()
|
||||
int32_t gpu_block3;
|
||||
size_t cache_bytes = 192 * 1024 * 1024; // Use 192MB
|
||||
|
||||
rcclTestsGetAlgoInfo_t rcclTestsGetAlgoInfo = NULL;
|
||||
rcclTestsGetProtocolName_t rcclTestsGetProtocolName = NULL;
|
||||
rcclTestsGetAlgoName_t rcclTestsGetAlgoName = NULL;
|
||||
|
||||
static void
|
||||
loadRcclSyms()
|
||||
{
|
||||
static void* handle = NULL;
|
||||
const char* libname = "librccl.so";
|
||||
if(!handle)
|
||||
{
|
||||
handle = dlopen(libname, RTLD_LAZY | RTLD_LOCAL);
|
||||
if(!handle)
|
||||
{
|
||||
fprintf(stderr, "dlopen failed: %s\n", dlerror());
|
||||
return;
|
||||
}
|
||||
}
|
||||
rcclTestsGetAlgoInfo = (rcclTestsGetAlgoInfo_t) dlsym(handle, "rcclGetAlgoInfo");
|
||||
rcclTestsGetAlgoName = (rcclTestsGetAlgoName_t) dlsym(handle, "rcclGetAlgoName");
|
||||
rcclTestsGetProtocolName =
|
||||
(rcclTestsGetProtocolName_t) dlsym(handle, "rcclGetProtocolName");
|
||||
}
|
||||
|
||||
// RCCL_FLOAT8 support
|
||||
bool rccl_float8_useFnuz = false;
|
||||
|
||||
bool
|
||||
IsArchMatch(char const* arch, char const* target)
|
||||
{
|
||||
// helper function to reduce clutter in code elsewhere. Returns true on match.
|
||||
return (strncmp(arch, target, strlen(target)) == 0);
|
||||
}
|
||||
|
||||
#if NCCL_MAJOR >= 2
|
||||
ncclDataType_t test_types[ncclNumTypes] = { ncclInt8,
|
||||
ncclUint8,
|
||||
@@ -41,14 +79,14 @@ ncclDataType_t test_types[ncclNumTypes] = { ncclInt8,
|
||||
ncclHalf,
|
||||
ncclFloat,
|
||||
ncclDouble
|
||||
# if RCCL_BFLOAT16 == 1
|
||||
# if HAVE_BF16
|
||||
,
|
||||
ncclBfloat16
|
||||
# endif
|
||||
# if RCCL_FLOAT8 == 1
|
||||
# if HAVE_FP8
|
||||
,
|
||||
ncclFp8E4M3,
|
||||
ncclFp8E5M2
|
||||
ncclFloat8e4m3,
|
||||
ncclFloat8e5m2
|
||||
# endif
|
||||
};
|
||||
const char* test_typenames[ncclNumTypes] = { "int8",
|
||||
@@ -60,11 +98,11 @@ const char* test_typenames[ncclNumTypes] = { "int8",
|
||||
"half",
|
||||
"float",
|
||||
"double"
|
||||
# if RCCL_BFLOAT16 == 1
|
||||
# if HAVE_BF16
|
||||
,
|
||||
"bfloat16"
|
||||
# endif
|
||||
# if RCCL_FLOAT8 == 1
|
||||
# if HAVE_FP8
|
||||
,
|
||||
"fp8_e4m3",
|
||||
"fp8_e5m2"
|
||||
@@ -374,6 +412,15 @@ InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
testResult_t
|
||||
InitDataApplyBias(void* expected, void* bias, const size_t count, const size_t offset,
|
||||
ncclDataType_t type, ncclRedOp_t op)
|
||||
{
|
||||
ncclVerifiableApplyBias(expected, bias, count, (int) type, (int) op, offset,
|
||||
cudaStreamDefault);
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
testResult_t
|
||||
InitData(void* data, const size_t count, size_t offset, ncclDataType_t type,
|
||||
ncclRedOp_t op, uint64_t seed, int nranks, int rank)
|
||||
@@ -716,7 +763,7 @@ startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t opIndex, int
|
||||
TESTCHECK(args->collTest->runColl(
|
||||
(void*) (in_place ? recvBuff + args->sendInplaceOffset * rank : sendBuff),
|
||||
(void*) (in_place ? recvBuff + args->recvInplaceOffset * rank : recvBuff),
|
||||
count, type, op, root, args->comms[i], args->streams[i]));
|
||||
count, type, op, root, args->comms[i], args->streams[i], args->bias[i]));
|
||||
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0)
|
||||
if(opIndex >= ncclNumOps)
|
||||
@@ -1239,7 +1286,7 @@ threadLaunch(struct testThread* thread)
|
||||
|
||||
testResult_t
|
||||
AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvBytes,
|
||||
void** expected, size_t nbytes)
|
||||
void** expected, size_t nbytes, void** bias)
|
||||
{
|
||||
if(enable_rotating_tensor)
|
||||
{
|
||||
@@ -1252,6 +1299,8 @@ AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvByt
|
||||
{
|
||||
CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocUncached));
|
||||
CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocUncached));
|
||||
if(bias)
|
||||
CUDACHECK(hipExtMallocWithFlags(bias, nbytes, hipDeviceMallocUncached));
|
||||
if(datacheck)
|
||||
CUDACHECK(
|
||||
hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocUncached));
|
||||
@@ -1262,6 +1311,9 @@ AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvByt
|
||||
hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
|
||||
CUDACHECK(
|
||||
hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
|
||||
if(bias)
|
||||
CUDACHECK(
|
||||
hipExtMallocWithFlags(bias, nbytes, hipDeviceMallocFinegrained));
|
||||
if(datacheck)
|
||||
CUDACHECK(hipExtMallocWithFlags(expected, recvBytes,
|
||||
hipDeviceMallocFinegrained));
|
||||
@@ -1271,12 +1323,14 @@ AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvByt
|
||||
{
|
||||
CUDACHECK(hipHostMalloc(sendbuff, nbytes));
|
||||
CUDACHECK(hipHostMalloc(recvbuff, nbytes));
|
||||
if(bias) CUDACHECK(hipHostMalloc(bias, nbytes));
|
||||
if(datacheck) CUDACHECK(hipHostMalloc(expected, recvBytes));
|
||||
}
|
||||
else if(memorytype == ncclManaged)
|
||||
{
|
||||
CUDACHECK(cudaMallocManaged(sendbuff, nbytes));
|
||||
CUDACHECK(cudaMallocManaged(recvbuff, nbytes));
|
||||
if(bias) CUDACHECK(cudaMallocManaged(bias, nbytes));
|
||||
if(datacheck) CUDACHECK(cudaMallocManaged(expected, recvBytes));
|
||||
#if 0
|
||||
CUDACHECK(cudaMemset(*sendbuff, 0, nbytes));
|
||||
@@ -1288,6 +1342,7 @@ AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvByt
|
||||
{
|
||||
CUDACHECK(cudaMalloc(sendbuff, nbytes));
|
||||
CUDACHECK(cudaMalloc(recvbuff, nbytes));
|
||||
if(bias) CUDACHECK(cudaMalloc(bias, nbytes));
|
||||
if(datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
|
||||
}
|
||||
CUDACHECK(hipMemset(*sendbuff, 1, nbytes));
|
||||
@@ -1333,6 +1388,7 @@ main(int argc, char* argv[])
|
||||
test_opnum++; // PreMulSum
|
||||
}
|
||||
#endif
|
||||
loadRcclSyms();
|
||||
|
||||
// Parse args
|
||||
// Replace getopt_long with manual argument parsing
|
||||
@@ -1764,6 +1820,7 @@ run()
|
||||
std::vector<cudaStream_t> streams(nGpus * nThreads);
|
||||
std::vector<void*> sendbuffs(nGpus * nThreads);
|
||||
std::vector<void*> recvbuffs(nGpus * nThreads);
|
||||
std::vector<void*> bias(nGpus * nThreads);
|
||||
std::vector<void*> expected(nGpus * nThreads);
|
||||
size_t sendBytes, recvBytes;
|
||||
|
||||
@@ -1777,7 +1834,8 @@ run()
|
||||
gpus[i] = ((gpu0 != -1 ? gpu0 : localRank * nThreads * nGpus) + i) % numDevices;
|
||||
CUDACHECK(cudaSetDevice(gpus[i]));
|
||||
TESTCHECK(AllocateBuffs(sendbuffs.data() + i, sendBytes, recvbuffs.data() + i,
|
||||
recvBytes, expected.data() + i, (size_t) maxBytes));
|
||||
recvBytes, expected.data() + i, (size_t) maxBytes,
|
||||
bias.data() + i));
|
||||
if(streamnull)
|
||||
streams[i] = NULL;
|
||||
else
|
||||
@@ -1895,6 +1953,7 @@ run()
|
||||
threads[t].args.gpus = gpus.data() + t * nGpus;
|
||||
threads[t].args.sendbuffs = sendbuffs.data() + t * nGpus;
|
||||
threads[t].args.recvbuffs = recvbuffs.data() + t * nGpus;
|
||||
threads[t].args.bias = bias.data() + t * nGpus;
|
||||
threads[t].args.expected = expected.data() + t * nGpus;
|
||||
threads[t].args.ncclId = ncclId;
|
||||
threads[t].args.comms = comms + t * nGpus;
|
||||
@@ -1952,6 +2011,7 @@ run()
|
||||
{
|
||||
if(sendbuffs[i]) CUDACHECK(cudaFree((char*) sendbuffs[i]));
|
||||
if(recvbuffs[i]) CUDACHECK(cudaFree((char*) recvbuffs[i]));
|
||||
if(bias[i]) CUDACHECK(cudaFree((char*) bias[i]));
|
||||
if(datacheck) CUDACHECK(cudaFree(expected[i]));
|
||||
}
|
||||
CUDACHECK(cudaFreeHost(delta));
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
@@ -11,18 +11,26 @@
|
||||
#include "rccl/rccl.h"
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <pthread.h>
|
||||
#include <stdio.h>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#ifdef MPI_SUPPORT
|
||||
# include "mpi.h"
|
||||
#endif
|
||||
#include "nccl1_compat.h"
|
||||
#include "timer.h"
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <pthread.h>
|
||||
// Ensures backward compatibility for FP8 types in RCCL 2.24.3 and later
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 24, 3)
|
||||
|
||||
// Ensures backward compatibility for FP8 datatypes
|
||||
#if NCCL_VERSION_CODE < NCCL_VERSION(2, 24, 3)
|
||||
# define ncclFloat8e4m3 ncclFp8E4M3
|
||||
# define ncclFloat8e5m2 ncclFp8E5M2
|
||||
#else
|
||||
// For newer RCCL versions, define old names in terms of new names
|
||||
# define ncclFp8E4M3 ncclFloat8e4m3
|
||||
# define ncclFp8E5M2 ncclFloat8e5m2
|
||||
#endif
|
||||
@@ -114,7 +122,10 @@ struct testColl
|
||||
int nranks);
|
||||
testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t type, ncclRedOp_t op, int root,
|
||||
ncclComm_t comm, cudaStream_t stream);
|
||||
ncclComm_t comm, cudaStream_t stream, void* bias);
|
||||
testResult_t (*getAlgoProtoChannels)(ncclComm_t comm, size_t count,
|
||||
ncclDataType_t type, int* algo, int* proto,
|
||||
int* nchannels);
|
||||
};
|
||||
extern struct testColl allReduceTest;
|
||||
extern struct testColl allGatherTest;
|
||||
@@ -139,6 +150,7 @@ public:
|
||||
void addResult(int gpusPerRank, int ranksPerNode, int totalRanks, size_t numBytes,
|
||||
int inPlace, double timeUsec, double algBw, double busBw,
|
||||
int64_t wrongElts = -1);
|
||||
void writeFile();
|
||||
|
||||
private:
|
||||
bool isMainThread();
|
||||
@@ -154,13 +166,14 @@ private:
|
||||
return std::make_pair("\"" + v + "\"", k);
|
||||
};
|
||||
|
||||
bool _outputValid = false;
|
||||
std::ofstream _out;
|
||||
std::string _outputFormat;
|
||||
size_t _numCycle = 0;
|
||||
std::string _collectiveName;
|
||||
std::string _typeName;
|
||||
std::string _opName;
|
||||
bool _outputValid = false;
|
||||
std::ofstream _out;
|
||||
std::string _outputFormat;
|
||||
size_t _numCycle = 0;
|
||||
std::string _collectiveName;
|
||||
std::string _typeName;
|
||||
std::string _opName;
|
||||
std::vector<std::vector<std::pair<std::string, std::string>>> _outputData;
|
||||
};
|
||||
|
||||
struct testEngine
|
||||
@@ -200,6 +213,7 @@ struct threadArgs
|
||||
ncclUniqueId ncclId;
|
||||
ncclComm_t* comms;
|
||||
cudaStream_t* streams;
|
||||
void** bias;
|
||||
|
||||
void** expected;
|
||||
size_t expectedBytes;
|
||||
@@ -233,11 +247,14 @@ extern testResult_t
|
||||
InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type,
|
||||
ncclRedOp_t op, const uint64_t seed, const int nranks);
|
||||
extern testResult_t
|
||||
InitDataApplyBias(void* expected, void* bias, const size_t count, const size_t offset,
|
||||
ncclDataType_t type, ncclRedOp_t op);
|
||||
extern testResult_t
|
||||
InitData(void* data, const size_t count, size_t offset, ncclDataType_t type,
|
||||
ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
|
||||
extern void
|
||||
AllocateBuffs(void** sendbuff, void** recvbuff, void** expected, void** expectedHost,
|
||||
size_t nbytes, int nranks);
|
||||
extern testResult_t
|
||||
AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvBytes,
|
||||
void** expected, size_t nbytes, void** bias);
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
@@ -304,6 +321,29 @@ getHostHash(const char* hostname)
|
||||
return getHash(hostHash, strlen(hostHash));
|
||||
}
|
||||
|
||||
#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
|
||||
# define HAVE_BF16 1
|
||||
#else
|
||||
# define HAVE_BF16 0
|
||||
#endif
|
||||
#if NCCL_MAJOR >= 2 && RCCL_FLOAT8 == 1
|
||||
# define HAVE_FP8 1
|
||||
#else
|
||||
# define HAVE_FP8 0
|
||||
#endif
|
||||
|
||||
#if NCCL_MAJOR >= 2
|
||||
# if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0)
|
||||
# undef HAVE_BF16
|
||||
# define HAVE_BF16 1
|
||||
# if defined(__CUDA_FP8_TYPES_EXIST__) && \
|
||||
NCCL_VERSION_CODE >= NCCL_VERSION(2, 24, 0)
|
||||
# undef HAVE_FP8
|
||||
# define HAVE_FP8 1
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
static size_t
|
||||
wordSize(ncclDataType_t type)
|
||||
{
|
||||
@@ -313,14 +353,14 @@ wordSize(ncclDataType_t type)
|
||||
#if NCCL_MAJOR >= 2
|
||||
// case ncclInt8:
|
||||
case ncclUint8:
|
||||
# if NCCL_MAJOR >= 2 && RCCL_FLOAT8 == 1
|
||||
case ncclFp8E4M3:
|
||||
case ncclFp8E5M2:
|
||||
# if HAVE_FP8
|
||||
case ncclFloat8e4m3:
|
||||
case ncclFloat8e5m2:
|
||||
# endif
|
||||
#endif
|
||||
return 1;
|
||||
case ncclHalf:
|
||||
#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
|
||||
#if HAVE_BF16
|
||||
case ncclBfloat16:
|
||||
#endif
|
||||
// case ncclFloat16:
|
||||
@@ -427,4 +467,30 @@ extern thread_local int is_main_thread;
|
||||
#define PRINT \
|
||||
if(is_main_thread) printf
|
||||
|
||||
typedef enum
|
||||
{
|
||||
ncclFuncBroadcast = 0,
|
||||
ncclFuncReduce = 1,
|
||||
ncclFuncAllGather = 2,
|
||||
ncclFuncReduceScatter = 3,
|
||||
ncclFuncAllReduce = 4,
|
||||
ncclFuncAllReduceWithBias = 5,
|
||||
ncclFuncSendRecv = 6,
|
||||
ncclFuncSend = 7,
|
||||
ncclFuncRecv = 8,
|
||||
ncclFuncAllToAllPivot = 9,
|
||||
ncclNumFuncs = 10
|
||||
} ncclFunc_t;
|
||||
|
||||
typedef ncclResult_t (*rcclTestsGetAlgoInfo_t)(struct ncclComm* comm, ncclFunc_t coll,
|
||||
uint64_t count, ncclDataType_t dataType,
|
||||
int collNetSupport, int nvlsSupport,
|
||||
int numPipeOps, int* algo, int* protocol,
|
||||
int* maxChannels);
|
||||
typedef ncclResult_t (*rcclTestsGetAlgoName_t)(int algo, const char** algoName);
|
||||
typedef ncclResult_t (*rcclTestsGetProtocolName_t)(int protocol,
|
||||
const char** protocolName);
|
||||
|
||||
#include "rccl_compat.h"
|
||||
|
||||
#endif
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "common.h"
|
||||
#include "cuda_runtime.h"
|
||||
#include "rccl_compat.h"
|
||||
|
||||
void
|
||||
GatherGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
|
||||
@@ -61,8 +62,10 @@ GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw
|
||||
|
||||
testResult_t
|
||||
GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
|
||||
void* bias = nullptr)
|
||||
{
|
||||
(void) bias;
|
||||
int nRanks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
int rank;
|
||||
@@ -85,8 +88,8 @@ GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testColl gatherTest = { "Gather", GatherGetCollByteCount, GatherInitData,
|
||||
GatherGetBw, GatherRunColl };
|
||||
struct testColl gatherTest = { "Gather", GatherGetCollByteCount, GatherInitData,
|
||||
GatherGetBw, GatherRunColl, nullptr };
|
||||
|
||||
void
|
||||
GatherGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
/* ************************************************************************
|
||||
* Copyright (C) 2016-2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
|
||||
* ies of the Software, and to permit persons to whom the Software is furnished
|
||||
* to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
|
||||
* PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
|
||||
* CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ************************************************************************ */
|
||||
|
||||
#ifndef RCCL_COMPAT_H
|
||||
#define RCCL_COMPAT_H
|
||||
|
||||
extern rcclTestsGetAlgoInfo_t rcclTestsGetAlgoInfo;
|
||||
extern rcclTestsGetProtocolName_t rcclTestsGetProtocolName;
|
||||
extern rcclTestsGetAlgoName_t rcclTestsGetAlgoName;
|
||||
|
||||
#endif
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "common.h"
|
||||
#include "cuda_runtime.h"
|
||||
#include "rccl_compat.h"
|
||||
|
||||
void
|
||||
ReduceGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
|
||||
@@ -45,6 +46,16 @@ ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
testResult_t
|
||||
ReduceGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo,
|
||||
int* proto, int* nchannels)
|
||||
{
|
||||
if(rcclTestsGetAlgoInfo == NULL) return testInternalError;
|
||||
NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncReduce, count, type, 0, 0, 1, algo,
|
||||
proto, nchannels));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
void
|
||||
ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
|
||||
int nranks)
|
||||
@@ -56,14 +67,17 @@ ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw
|
||||
|
||||
testResult_t
|
||||
ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
|
||||
void* bias = nullptr)
|
||||
{
|
||||
(void) bias;
|
||||
NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testColl reduceTest = { "Reduce", ReduceGetCollByteCount, ReduceInitData,
|
||||
ReduceGetBw, ReduceRunColl };
|
||||
struct testColl reduceTest = { "Reduce", ReduceGetCollByteCount,
|
||||
ReduceInitData, ReduceGetBw,
|
||||
ReduceRunColl, ReduceGetAlgoProtoChannels };
|
||||
|
||||
void
|
||||
ReduceGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "common.h"
|
||||
#include "cuda_runtime.h"
|
||||
#include "rccl_compat.h"
|
||||
|
||||
void
|
||||
ReduceScatterGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
|
||||
@@ -45,6 +46,16 @@ ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
testResult_t
|
||||
ReduceScatterGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type,
|
||||
int* algo, int* proto, int* nchannels)
|
||||
{
|
||||
if(rcclTestsGetAlgoInfo == NULL) return testInternalError;
|
||||
NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncReduceScatter, count, type, 0, 0, 1,
|
||||
algo, proto, nchannels));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
void
|
||||
ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
|
||||
int nranks)
|
||||
@@ -58,15 +69,18 @@ ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double
|
||||
|
||||
testResult_t
|
||||
ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
|
||||
void* bias = nullptr)
|
||||
{
|
||||
(void) bias;
|
||||
NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testColl reduceScatterTest = { "ReduceScatter", ReduceScatterGetCollByteCount,
|
||||
ReduceScatterInitData, ReduceScatterGetBw,
|
||||
ReduceScatterRunColl };
|
||||
struct testColl reduceScatterTest = {
|
||||
"ReduceScatter", ReduceScatterGetCollByteCount, ReduceScatterInitData,
|
||||
ReduceScatterGetBw, ReduceScatterRunColl, ReduceScatterGetAlgoProtoChannels
|
||||
};
|
||||
|
||||
void
|
||||
ReduceScatterGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "common.h"
|
||||
#include "cuda_runtime.h"
|
||||
#include "rccl_compat.h"
|
||||
|
||||
void
|
||||
ScatterGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
|
||||
@@ -55,8 +56,10 @@ ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busB
|
||||
|
||||
testResult_t
|
||||
ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
|
||||
void* bias = nullptr)
|
||||
{
|
||||
(void) bias;
|
||||
int nRanks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
int rank;
|
||||
@@ -79,8 +82,8 @@ ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testColl scatterTest = { "Scatter", ScatterGetCollByteCount, ScatterInitData,
|
||||
ScatterGetBw, ScatterRunColl };
|
||||
struct testColl scatterTest = { "Scatter", ScatterGetCollByteCount, ScatterInitData,
|
||||
ScatterGetBw, ScatterRunColl, nullptr };
|
||||
|
||||
void
|
||||
ScatterGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "common.h"
|
||||
#include "cuda_runtime.h"
|
||||
#include "rccl_compat.h"
|
||||
|
||||
void
|
||||
SendRecvGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
|
||||
@@ -58,8 +59,10 @@ SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double* bus
|
||||
|
||||
testResult_t
|
||||
SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
|
||||
void* bias = nullptr)
|
||||
{
|
||||
(void) bias;
|
||||
int nRanks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
int rank;
|
||||
@@ -74,8 +77,9 @@ SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t typ
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testColl sendRecvTest = { "SendRecv", SendRecvGetCollByteCount, SendRecvInitData,
|
||||
SendRecvGetBw, SendRecvRunColl };
|
||||
struct testColl sendRecvTest = { "SendRecv", SendRecvGetCollByteCount,
|
||||
SendRecvInitData, SendRecvGetBw,
|
||||
SendRecvRunColl, nullptr };
|
||||
|
||||
void
|
||||
SendRecvGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
|
||||
|
||||
+68
-4
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Modifications Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -1108,7 +1108,7 @@ prepareInput1(void* elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void
|
||||
hipError_t
|
||||
ncclVerifiablePrepareInput(void* elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
|
||||
int rank_me, uint64_t seed, intptr_t elt_ix0,
|
||||
cudaStream_t stream)
|
||||
@@ -1135,6 +1135,7 @@ ncclVerifiablePrepareInput(void* elts, intptr_t elt_n, int elt_ty, int red_op, i
|
||||
# endif
|
||||
}
|
||||
# undef CASE_OP
|
||||
return hipSuccess;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1198,7 +1199,7 @@ prepareExpected1(void* elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void
|
||||
hipError_t
|
||||
ncclVerifiablePrepareExpected(void* elts, intptr_t elt_n, int elt_ty, int red_op,
|
||||
int rank_n, uint64_t seed, intptr_t elt_ix0,
|
||||
cudaStream_t stream)
|
||||
@@ -1224,6 +1225,7 @@ ncclVerifiablePrepareExpected(void* elts, intptr_t elt_n, int elt_ty, int red_op
|
||||
# endif
|
||||
}
|
||||
# undef CASE_OP
|
||||
return hipSuccess;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1411,7 +1413,7 @@ verifyInline1(T const* results, intptr_t elt_n, int red_op, int rank_n, uint64_t
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void
|
||||
hipError_t
|
||||
ncclVerifiableVerify(void const* results, void const* expected, intptr_t elt_n,
|
||||
int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
|
||||
int64_t* bad_elt_n, cudaStream_t stream)
|
||||
@@ -1470,6 +1472,68 @@ ncclVerifiableVerify(void const* results, void const* expected, intptr_t elt_n,
|
||||
default: assert(0);
|
||||
}
|
||||
# undef CASE_TY
|
||||
return hipSuccess;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Apply bias to expected results
|
||||
|
||||
namespace
|
||||
{
|
||||
template <typename T>
|
||||
__global__ void
|
||||
applyBias2(T* elts, T* bias, intptr_t elt_n)
|
||||
{
|
||||
intptr_t i0 = blockIdx.x * (elt_n / gridDim.x);
|
||||
i0 += blockIdx.x < elt_n % gridDim.x ? blockIdx.x : elt_n % gridDim.x;
|
||||
intptr_t i1 = (blockIdx.x + 1) * (elt_n / gridDim.x);
|
||||
i1 += blockIdx.x + 1 < elt_n % gridDim.x ? blockIdx.x + 1 : elt_n % gridDim.x;
|
||||
intptr_t i = i0 + threadIdx.x;
|
||||
while(i < i1)
|
||||
{
|
||||
elts[i] = ReduceSum()(elts[i], bias[i]);
|
||||
i += blockDim.x;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
applyBias1(void* elts, void* bias, intptr_t elt_n, int elt_ty, cudaStream_t stream)
|
||||
{
|
||||
int block_n = std::min<intptr_t>(32, (elt_n + 4 * 512 - 1) / (4 * 512));
|
||||
# define CASE_TY(T) \
|
||||
applyBias2<<<block_n, 512, 0, stream>>>((T*) elts, (T*) bias, elt_n); \
|
||||
break;
|
||||
switch(elt_ty)
|
||||
{
|
||||
case ncclInt8: CASE_TY(int8_t)
|
||||
case ncclUint8: CASE_TY(uint8_t)
|
||||
case ncclInt32: CASE_TY(int32_t)
|
||||
case ncclUint32: CASE_TY(uint32_t)
|
||||
case ncclInt64: CASE_TY(int64_t)
|
||||
case ncclUint64: CASE_TY(uint64_t)
|
||||
case ncclFloat16: CASE_TY(__half)
|
||||
# if HAVE_ncclBfloat16
|
||||
case ncclBfloat16: CASE_TY(hip_bfloat16)
|
||||
# endif
|
||||
# if HAVE_ncclfp8
|
||||
case ncclFp8E4M3: CASE_TY(rccl_float8)
|
||||
case ncclFp8E5M2: CASE_TY(rccl_bfloat8)
|
||||
# endif
|
||||
case ncclFloat32: CASE_TY(float)
|
||||
case ncclFloat64: CASE_TY(double)
|
||||
default: assert(0);
|
||||
}
|
||||
# undef CASE_TY
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void
|
||||
ncclVerifiableApplyBias(void* elts, void* bias, intptr_t elt_n, int elt_ty, int red_op,
|
||||
intptr_t elt_ix0, cudaStream_t stream)
|
||||
{
|
||||
(void) red_op;
|
||||
(void) elt_ix0;
|
||||
applyBias1(elts, bias, elt_n, elt_ty, stream);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
+15
-4
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Modifications Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -43,13 +43,13 @@ ncclVerifiablePremulScalar(int rank_me)
|
||||
}
|
||||
|
||||
// Enqueue kernel to generate data which is to be reduced.
|
||||
void
|
||||
hipError_t
|
||||
ncclVerifiablePrepareInput(void* elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
|
||||
int rank_me, uint64_t seed, intptr_t elt_ix0,
|
||||
cudaStream_t stream);
|
||||
|
||||
// Enqueue kernel to generate expected results of reduction.
|
||||
void
|
||||
hipError_t
|
||||
ncclVerifiablePrepareExpected(void* elts, intptr_t elt_n, int elt_ty, int red_op,
|
||||
int rank_n, uint64_t seed, intptr_t elt_ix0,
|
||||
cudaStream_t stream);
|
||||
@@ -60,8 +60,19 @@ ncclVerifiablePrepareExpected(void* elts, intptr_t elt_n, int elt_ty, int red_op
|
||||
// which can be costly. Thus if you plan to run the same reduction multiple
|
||||
// times it is advantageous to precompute the expected values with
|
||||
// ncclVerifiablePrepareExpected and pass them as `expected` here.
|
||||
void
|
||||
hipError_t
|
||||
ncclVerifiableVerify(void const* results, void const* expected, intptr_t elt_n,
|
||||
int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
|
||||
int64_t* bad_elt_n, cudaStream_t stream);
|
||||
|
||||
// Enqueue kernel that applies bias to expected results
|
||||
void
|
||||
ncclVerifiableApplyBias(void* elts, void* bias, intptr_t elt_n, int elt_ty, int red_op,
|
||||
intptr_t elt_ix0, cudaStream_t stream);
|
||||
|
||||
#ifdef NCCL_VERIFIABLE_SELF_TEST
|
||||
void
|
||||
ncclVerifiableLaunchSelfTest();
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
+4
-4
@@ -1,5 +1,5 @@
|
||||
# Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
# Modifications Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Modifications Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
|
||||
@@ -14,17 +14,17 @@ TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
|
||||
${HIPIFY_DIR}/verifiable.cpp: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cpp
|
||||
@printf "Hipifying %-35s > %s\n" $< $@
|
||||
@mkdir -p ${HIPIFY_DIR}
|
||||
hipify-perl -quiet-warnings $< > $@
|
||||
${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@
|
||||
|
||||
${HIPIFY_DIR}/verifiable.h: $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
|
||||
@printf "Hipifying %-35s > %s\n" $< $@
|
||||
@mkdir -p ${HIPIFY_DIR}
|
||||
hipify-perl -quiet-warnings $< > $@
|
||||
${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@
|
||||
|
||||
${HIPIFY_DIR}/rccl_float8.h: $(TEST_VERIFIABLE_SRCDIR)/../src/rccl_float8.h
|
||||
@printf "Hipifying %-35s > %s\n" $< $@
|
||||
@mkdir -p ${HIPIFY_DIR}
|
||||
hipify-perl -quiet-warnings $< > $@
|
||||
${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@
|
||||
|
||||
$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(HIPIFY_DIR)/verifiable.cpp $(HIPIFY_DIR)/verifiable.h $(HIPIFY_DIR)/rccl_float8.h
|
||||
@printf "Compiling %s\n" $@
|
||||
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele