From 6b4565749385138377ed387bfc5b6712eacb0b3a Mon Sep 17 00:00:00 2001 From: habajpai-amd Date: Thu, 11 Dec 2025 18:29:06 +0530 Subject: [PATCH] update build rccl-tests infrastructure and add getAlgoProtoChannels support (#2212) --- .../examples/rccl/rccl-tests/src/Makefile | 55 ++------- .../rccl/rccl-tests/src/all_gather.cpp | 20 +++- .../rccl/rccl-tests/src/all_reduce.cpp | 20 +++- .../examples/rccl/rccl-tests/src/alltoall.cpp | 10 +- .../rccl/rccl-tests/src/alltoallv.cpp | 10 +- .../rccl/rccl-tests/src/broadcast.cpp | 20 +++- .../examples/rccl/rccl-tests/src/common.cpp | 84 +++++++++++-- .../examples/rccl/rccl-tests/src/common.h | 110 ++++++++++++++---- .../examples/rccl/rccl-tests/src/gather.cpp | 9 +- .../rccl/rccl-tests/src/rccl_compat.h | 30 +++++ .../examples/rccl/rccl-tests/src/reduce.cpp | 20 +++- .../rccl/rccl-tests/src/reduce_scatter.cpp | 22 +++- .../examples/rccl/rccl-tests/src/scatter.cpp | 9 +- .../examples/rccl/rccl-tests/src/sendrecv.cpp | 10 +- .../rccl/rccl-tests/verifiable/verifiable.cpp | 72 +++++++++++- .../rccl/rccl-tests/verifiable/verifiable.h | 19 ++- .../rccl/rccl-tests/verifiable/verifiable.mk | 8 +- 17 files changed, 408 insertions(+), 120 deletions(-) create mode 100644 projects/rocprofiler-systems/examples/rccl/rccl-tests/src/rccl_compat.h diff --git a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/Makefile b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/Makefile index ad6bf4ab7a..5997a03b36 100644 --- a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/Makefile +++ b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. # Modifications are Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved. # # See LICENSE.txt for license information @@ -15,6 +15,8 @@ CUSTOM_RCCL_LIB ?= "" HIPCC ?= $(ROCM_PATH)/bin/amdclang++ HIPCONFIG = $(ROCM_PATH)/bin/hipconfig +HIPIFY_PL_EXE = $(ROCM_PATH)/bin/hipify-perl +HIPIFY_PL_FLAGS = -quiet-warnings CXX = $(HIPCC) HIPCUFLAGS := -std=c++14 @@ -30,7 +32,10 @@ HIP_MINOR = $(shell echo $(HIP_VERSION) | cut -d "." -f 2) # Currently, supports gfx906,gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201 ifndef GPU_TARGETS GPU_TARGETS = gfx906 gfx908 gfx90a - ifeq ($(shell test "0$(HIP_MAJOR)" -eq 6; echo $$?),0) + ifeq ($(shell test "0$(HIP_MAJOR)" -ge 7; echo $$?),0) + # Include gfx942 and gfx950 support if we're using ROCm 7.0 or above + GPU_TARGETS += gfx942 gfx950 + else ifeq ($(shell test "0$(HIP_MAJOR)" -eq 6; echo $$?),0) # Include gfx942 support if we're using ROCm 6.0 or above GPU_TARGETS += gfx942 ifeq ($(shell test "0$(HIP_MINOR)" -ge 5; echo $$?),0) @@ -43,41 +48,6 @@ endif GPU_TARGETS_FLAGS = $(foreach target,$(GPU_TARGETS),"--offload-arch=$(target)") -#CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) -#CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) -#CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2) -# -## Better define NVCC_GENCODE in your environment to the minimal set -## of archs to reduce compile time. -#ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -ge 13; echo $$?),0) -## Include Blackwell support if we're using CUDA12.8 or above -#NVCC_GENCODE ?= -gencode=arch=compute_80,code=sm_80 \ -# -gencode=arch=compute_90,code=sm_90 \ -# -gencode=arch=compute_100,code=sm_100 \ -# -gencode=arch=compute_120,code=sm_120 \ -# -gencode=arch=compute_120,code=compute_120 -#else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0) -#NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ -# -gencode=arch=compute_61,code=sm_61 \ -# -gencode=arch=compute_70,code=sm_70 \ -# -gencode=arch=compute_80,code=sm_80 \ -# -gencode=arch=compute_90,code=sm_90 \ -# -gencode=arch=compute_90,code=compute_90 -#else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) -#NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \ -# -gencode=arch=compute_61,code=sm_61 \ -# -gencode=arch=compute_70,code=sm_70 \ -# -gencode=arch=compute_80,code=sm_80 \ -# -gencode=arch=compute_80,code=compute_80 -#else -#NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ -# -gencode=arch=compute_50,code=sm_50 \ -# -gencode=arch=compute_60,code=sm_60 \ -# -gencode=arch=compute_61,code=sm_61 \ -# -gencode=arch=compute_70,code=sm_70 \ -# -gencode=arch=compute_70,code=compute_70 -#endif - ifneq ($(NCCL_HOME), "") HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) -L$(NCCL_HOME)/lib @@ -117,7 +87,7 @@ HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/mpich/include -I HIPLDFLAGS += -L${MPI_HOME}/lib -L${MPI_HOME}/mpich/lib -lmpich endif -LIBRARIES += rccl +LIBRARIES += rccl dl HIPLDFLAGS += $(LIBRARIES:%=-l%) DST_DIR := $(BUILDDIR) @@ -148,14 +118,14 @@ $(GIT_VERSION_FILE): ${HIPIFY_DIR}/%.cpp: %.cpp @printf "Hipifying %-35s > %s\n" $< $@ @mkdir -p ${HIPIFY_DIR} - hipify-perl -quiet-warnings $< > $@ + ${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@ ${HIPIFY_DIR}/%.h: %.h @printf "Hipifying %-35s > %s\n" $< $@ @mkdir -p ${HIPIFY_DIR} - hipify-perl -quiet-warnings $< > $@ + ${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@ -${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS) $(GIT_VERSION_FILE) +${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cpp ${HIPIFY_DIR}/common.h ${HIPIFY_DIR}/rccl_compat.h $(TEST_VERIFIABLE_HDRS) $(GIT_VERSION_FILE) @printf "Compiling %-35s > %s\n" $< $@ @mkdir -p ${DST_DIR} echo "$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<" @@ -166,9 +136,8 @@ ${DST_DIR}/timer.o: timer.cc timer.h @mkdir -p ${DST_DIR} $(CXX) $(CXXFLAGS) -o $@ -c timer.cc -${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) $(DST_DIR)/src/git_version.cpp +${DST_DIR}/%_perf: ${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) $(DST_DIR)/src/git_version.cpp @printf "Linking %-35s > %s\n" $< $@ @mkdir -p ${DST_DIR} echo "$(HIPCC) -o $@ $^ $(HIPLDFLAGS)" $(HIPCC) -o $@ $^ $(HIPLDFLAGS) - diff --git a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/all_gather.cpp b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/all_gather.cpp index a8cc9a1614..09d2d6933f 100644 --- a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/all_gather.cpp +++ b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/all_gather.cpp @@ -7,6 +7,7 @@ #include "common.h" #include "cuda_runtime.h" +#include "rccl_compat.h" void AllGatherGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount, @@ -47,6 +48,16 @@ AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, return testSuccess; } +testResult_t +AllGatherGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, + int* algo, int* proto, int* nchannels) +{ + if(rcclTestsGetAlgoInfo == NULL) return testInternalError; + NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncAllGather, count, type, 0, 0, 1, algo, + proto, nchannels)); + return testSuccess; +} + void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) @@ -60,14 +71,17 @@ AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* bu testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, + void* bias = nullptr) { + (void) bias; NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream)); return testSuccess; } -struct testColl allGatherTest = { "AllGather", AllGatherGetCollByteCount, - AllGatherInitData, AllGatherGetBw, AllGatherRunColl }; +struct testColl allGatherTest = { "AllGather", AllGatherGetCollByteCount, + AllGatherInitData, AllGatherGetBw, + AllGatherRunColl, AllGatherGetAlgoProtoChannels }; void AllGatherGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks) diff --git a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/all_reduce.cpp b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/all_reduce.cpp index 551d2eb8b6..3ccce880df 100644 --- a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/all_reduce.cpp +++ b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/all_reduce.cpp @@ -7,6 +7,7 @@ #include "common.h" #include "cuda_runtime.h" +#include "rccl_compat.h" void AllReduceGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount, @@ -41,6 +42,16 @@ AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, return testSuccess; } +testResult_t +AllReduceGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, + int* algo, int* proto, int* nchannels) +{ + if(rcclTestsGetAlgoInfo == NULL) return testInternalError; + NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncAllReduce, count, type, 0, 0, 1, algo, + proto, nchannels)); + return testSuccess; +} + void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) @@ -54,14 +65,17 @@ AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* bu testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, + void* bias = nullptr) { + (void) bias; NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream)); return testSuccess; } -struct testColl allReduceTest = { "AllReduce", AllReduceGetCollByteCount, - AllReduceInitData, AllReduceGetBw, AllReduceRunColl }; +struct testColl allReduceTest = { "AllReduce", AllReduceGetCollByteCount, + AllReduceInitData, AllReduceGetBw, + AllReduceRunColl, AllReduceGetAlgoProtoChannels }; void AllReduceGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks) diff --git a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/alltoall.cpp b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/alltoall.cpp index 37c9d19052..39fb9ec0b7 100644 --- a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/alltoall.cpp +++ b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/alltoall.cpp @@ -7,6 +7,7 @@ #include "common.h" #include "cuda_runtime.h" +#include "rccl_compat.h" void AlltoAllGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount, @@ -62,14 +63,17 @@ AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double* bus testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, + void* bias = nullptr) { + (void) bias; NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream)); return testSuccess; } -struct testColl alltoAllTest = { "AlltoAll", AlltoAllGetCollByteCount, AlltoAllInitData, - AlltoAllGetBw, AlltoAllRunColl }; +struct testColl alltoAllTest = { "AlltoAll", AlltoAllGetCollByteCount, + AlltoAllInitData, AlltoAllGetBw, + AlltoAllRunColl, nullptr }; void AlltoAllGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks) diff --git a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/alltoallv.cpp b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/alltoallv.cpp index 78c90775b9..ca326ca1f4 100644 --- a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/alltoallv.cpp +++ b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/alltoallv.cpp @@ -7,6 +7,7 @@ #include "common.h" #include "cuda_runtime.h" +#include "rccl_compat.h" #define USE_RCCL_GATHER_SCATTER @@ -101,8 +102,10 @@ AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, double* bu testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, + void* bias = nullptr) { + (void) bias; int nranks; NCCLCHECK(ncclCommCount(comm, &nranks)); int rank; @@ -174,8 +177,9 @@ AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t ty return testSuccess; } -struct testColl alltoAllTest = { "AlltoAllv", AlltoAllvGetCollByteCount, - AlltoAllvInitData, AlltoAllvGetBw, AlltoAllvRunColl }; +struct testColl alltoAllTest = { "AlltoAllv", AlltoAllvGetCollByteCount, + AlltoAllvInitData, AlltoAllvGetBw, + AlltoAllvRunColl, nullptr }; void AlltoAllvGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks) diff --git a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/broadcast.cpp b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/broadcast.cpp index b98c3a8fd7..e0d2e1b1cf 100644 --- a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/broadcast.cpp +++ b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/broadcast.cpp @@ -7,6 +7,7 @@ #include "common.h" #include "cuda_runtime.h" +#include "rccl_compat.h" void BroadcastGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount, @@ -41,6 +42,16 @@ BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, return testSuccess; } +testResult_t +BroadcastGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, + int* algo, int* proto, int* nchannels) +{ + if(rcclTestsGetAlgoInfo == NULL) return testInternalError; + NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncBroadcast, count, type, 0, 0, 1, algo, + proto, nchannels)); + return testSuccess; +} + void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) @@ -54,8 +65,10 @@ BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* bu testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, + void* bias = nullptr) { + (void) bias; int rank; NCCLCHECK(ncclCommUserRank(comm, &rank)); #if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2 @@ -73,8 +86,9 @@ BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t ty return testSuccess; } -struct testColl broadcastTest = { "Broadcast", BroadcastGetCollByteCount, - BroadcastInitData, BroadcastGetBw, BroadcastRunColl }; +struct testColl broadcastTest = { "Broadcast", BroadcastGetCollByteCount, + BroadcastInitData, BroadcastGetBw, + BroadcastRunColl, BroadcastGetAlgoProtoChannels }; void BroadcastGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks) diff --git a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/common.cpp b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/common.cpp index 1e7658b975..1670e1ea80 100644 --- a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/common.cpp +++ b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/common.cpp @@ -1,18 +1,20 @@ /************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved. * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License. * * See LICENSE.txt for license information ************************************************************************/ #include "common.h" -#include "cuda.h" -#include "cuda_runtime.h" +#include "hip/hip_runtime.h" +#include "hip/hip_runtime_api.h" #include "rccl_float8.h" #include #include +#include +#include #include #include #include @@ -27,10 +29,46 @@ #include "git_version.h" #include "verifiable.h" +#define DIVUP(x, y) (((x) + (y) - 1) / (y)) + int test_ncclVersion = 0; // init'd with ncclGetVersion() int32_t gpu_block3; size_t cache_bytes = 192 * 1024 * 1024; // Use 192MB +rcclTestsGetAlgoInfo_t rcclTestsGetAlgoInfo = NULL; +rcclTestsGetProtocolName_t rcclTestsGetProtocolName = NULL; +rcclTestsGetAlgoName_t rcclTestsGetAlgoName = NULL; + +static void +loadRcclSyms() +{ + static void* handle = NULL; + const char* libname = "librccl.so"; + if(!handle) + { + handle = dlopen(libname, RTLD_LAZY | RTLD_LOCAL); + if(!handle) + { + fprintf(stderr, "dlopen failed: %s\n", dlerror()); + return; + } + } + rcclTestsGetAlgoInfo = (rcclTestsGetAlgoInfo_t) dlsym(handle, "rcclGetAlgoInfo"); + rcclTestsGetAlgoName = (rcclTestsGetAlgoName_t) dlsym(handle, "rcclGetAlgoName"); + rcclTestsGetProtocolName = + (rcclTestsGetProtocolName_t) dlsym(handle, "rcclGetProtocolName"); +} + +// RCCL_FLOAT8 support +bool rccl_float8_useFnuz = false; + +bool +IsArchMatch(char const* arch, char const* target) +{ + // helper function to reduce clutter in code elsewhere. Returns true on match. + return (strncmp(arch, target, strlen(target)) == 0); +} + #if NCCL_MAJOR >= 2 ncclDataType_t test_types[ncclNumTypes] = { ncclInt8, ncclUint8, @@ -41,14 +79,14 @@ ncclDataType_t test_types[ncclNumTypes] = { ncclInt8, ncclHalf, ncclFloat, ncclDouble -# if RCCL_BFLOAT16 == 1 +# if HAVE_BF16 , ncclBfloat16 # endif -# if RCCL_FLOAT8 == 1 +# if HAVE_FP8 , - ncclFp8E4M3, - ncclFp8E5M2 + ncclFloat8e4m3, + ncclFloat8e5m2 # endif }; const char* test_typenames[ncclNumTypes] = { "int8", @@ -60,11 +98,11 @@ const char* test_typenames[ncclNumTypes] = { "int8", "half", "float", "double" -# if RCCL_BFLOAT16 == 1 +# if HAVE_BF16 , "bfloat16" # endif -# if RCCL_FLOAT8 == 1 +# if HAVE_FP8 , "fp8_e4m3", "fp8_e5m2" @@ -374,6 +412,15 @@ InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType return testSuccess; } +testResult_t +InitDataApplyBias(void* expected, void* bias, const size_t count, const size_t offset, + ncclDataType_t type, ncclRedOp_t op) +{ + ncclVerifiableApplyBias(expected, bias, count, (int) type, (int) op, offset, + cudaStreamDefault); + return testSuccess; +} + testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) @@ -716,7 +763,7 @@ startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t opIndex, int TESTCHECK(args->collTest->runColl( (void*) (in_place ? recvBuff + args->sendInplaceOffset * rank : sendBuff), (void*) (in_place ? recvBuff + args->recvInplaceOffset * rank : recvBuff), - count, type, op, root, args->comms[i], args->streams[i])); + count, type, op, root, args->comms[i], args->streams[i], args->bias[i])); #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0) if(opIndex >= ncclNumOps) @@ -1239,7 +1286,7 @@ threadLaunch(struct testThread* thread) testResult_t AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvBytes, - void** expected, size_t nbytes) + void** expected, size_t nbytes, void** bias) { if(enable_rotating_tensor) { @@ -1252,6 +1299,8 @@ AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvByt { CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocUncached)); CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocUncached)); + if(bias) + CUDACHECK(hipExtMallocWithFlags(bias, nbytes, hipDeviceMallocUncached)); if(datacheck) CUDACHECK( hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocUncached)); @@ -1262,6 +1311,9 @@ AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvByt hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained)); CUDACHECK( hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained)); + if(bias) + CUDACHECK( + hipExtMallocWithFlags(bias, nbytes, hipDeviceMallocFinegrained)); if(datacheck) CUDACHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained)); @@ -1271,12 +1323,14 @@ AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvByt { CUDACHECK(hipHostMalloc(sendbuff, nbytes)); CUDACHECK(hipHostMalloc(recvbuff, nbytes)); + if(bias) CUDACHECK(hipHostMalloc(bias, nbytes)); if(datacheck) CUDACHECK(hipHostMalloc(expected, recvBytes)); } else if(memorytype == ncclManaged) { CUDACHECK(cudaMallocManaged(sendbuff, nbytes)); CUDACHECK(cudaMallocManaged(recvbuff, nbytes)); + if(bias) CUDACHECK(cudaMallocManaged(bias, nbytes)); if(datacheck) CUDACHECK(cudaMallocManaged(expected, recvBytes)); #if 0 CUDACHECK(cudaMemset(*sendbuff, 0, nbytes)); @@ -1288,6 +1342,7 @@ AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvByt { CUDACHECK(cudaMalloc(sendbuff, nbytes)); CUDACHECK(cudaMalloc(recvbuff, nbytes)); + if(bias) CUDACHECK(cudaMalloc(bias, nbytes)); if(datacheck) CUDACHECK(cudaMalloc(expected, recvBytes)); } CUDACHECK(hipMemset(*sendbuff, 1, nbytes)); @@ -1333,6 +1388,7 @@ main(int argc, char* argv[]) test_opnum++; // PreMulSum } #endif + loadRcclSyms(); // Parse args // Replace getopt_long with manual argument parsing @@ -1764,6 +1820,7 @@ run() std::vector streams(nGpus * nThreads); std::vector sendbuffs(nGpus * nThreads); std::vector recvbuffs(nGpus * nThreads); + std::vector bias(nGpus * nThreads); std::vector expected(nGpus * nThreads); size_t sendBytes, recvBytes; @@ -1777,7 +1834,8 @@ run() gpus[i] = ((gpu0 != -1 ? gpu0 : localRank * nThreads * nGpus) + i) % numDevices; CUDACHECK(cudaSetDevice(gpus[i])); TESTCHECK(AllocateBuffs(sendbuffs.data() + i, sendBytes, recvbuffs.data() + i, - recvBytes, expected.data() + i, (size_t) maxBytes)); + recvBytes, expected.data() + i, (size_t) maxBytes, + bias.data() + i)); if(streamnull) streams[i] = NULL; else @@ -1895,6 +1953,7 @@ run() threads[t].args.gpus = gpus.data() + t * nGpus; threads[t].args.sendbuffs = sendbuffs.data() + t * nGpus; threads[t].args.recvbuffs = recvbuffs.data() + t * nGpus; + threads[t].args.bias = bias.data() + t * nGpus; threads[t].args.expected = expected.data() + t * nGpus; threads[t].args.ncclId = ncclId; threads[t].args.comms = comms + t * nGpus; @@ -1952,6 +2011,7 @@ run() { if(sendbuffs[i]) CUDACHECK(cudaFree((char*) sendbuffs[i])); if(recvbuffs[i]) CUDACHECK(cudaFree((char*) recvbuffs[i])); + if(bias[i]) CUDACHECK(cudaFree((char*) bias[i])); if(datacheck) CUDACHECK(cudaFree(expected[i])); } CUDACHECK(cudaFreeHost(delta)); diff --git a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/common.h b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/common.h index 2194e4e7af..eebf71a6b6 100644 --- a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/common.h +++ b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/common.h @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved. * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License. * * See LICENSE.txt for license information @@ -11,18 +11,26 @@ #include "rccl/rccl.h" #include #include +#include +#include +#include +#include #include +#include +#include +#include #ifdef MPI_SUPPORT # include "mpi.h" #endif #include "nccl1_compat.h" #include "timer.h" -#include -#include -#include -#include -// Ensures backward compatibility for FP8 types in RCCL 2.24.3 and later -#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 24, 3) + +// Ensures backward compatibility for FP8 datatypes +#if NCCL_VERSION_CODE < NCCL_VERSION(2, 24, 3) +# define ncclFloat8e4m3 ncclFp8E4M3 +# define ncclFloat8e5m2 ncclFp8E5M2 +#else +// For newer RCCL versions, define old names in terms of new names # define ncclFp8E4M3 ncclFloat8e4m3 # define ncclFp8E5M2 ncclFloat8e5m2 #endif @@ -114,7 +122,10 @@ struct testColl int nranks); testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, - ncclComm_t comm, cudaStream_t stream); + ncclComm_t comm, cudaStream_t stream, void* bias); + testResult_t (*getAlgoProtoChannels)(ncclComm_t comm, size_t count, + ncclDataType_t type, int* algo, int* proto, + int* nchannels); }; extern struct testColl allReduceTest; extern struct testColl allGatherTest; @@ -139,6 +150,7 @@ public: void addResult(int gpusPerRank, int ranksPerNode, int totalRanks, size_t numBytes, int inPlace, double timeUsec, double algBw, double busBw, int64_t wrongElts = -1); + void writeFile(); private: bool isMainThread(); @@ -154,13 +166,14 @@ private: return std::make_pair("\"" + v + "\"", k); }; - bool _outputValid = false; - std::ofstream _out; - std::string _outputFormat; - size_t _numCycle = 0; - std::string _collectiveName; - std::string _typeName; - std::string _opName; + bool _outputValid = false; + std::ofstream _out; + std::string _outputFormat; + size_t _numCycle = 0; + std::string _collectiveName; + std::string _typeName; + std::string _opName; + std::vector>> _outputData; }; struct testEngine @@ -200,6 +213,7 @@ struct threadArgs ncclUniqueId ncclId; ncclComm_t* comms; cudaStream_t* streams; + void** bias; void** expected; size_t expectedBytes; @@ -233,11 +247,14 @@ extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks); extern testResult_t +InitDataApplyBias(void* expected, void* bias, const size_t count, const size_t offset, + ncclDataType_t type, ncclRedOp_t op); +extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank); -extern void -AllocateBuffs(void** sendbuff, void** recvbuff, void** expected, void** expectedHost, - size_t nbytes, int nranks); +extern testResult_t +AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvBytes, + void** expected, size_t nbytes, void** bias); #include @@ -304,6 +321,29 @@ getHostHash(const char* hostname) return getHash(hostHash, strlen(hostHash)); } +#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1 +# define HAVE_BF16 1 +#else +# define HAVE_BF16 0 +#endif +#if NCCL_MAJOR >= 2 && RCCL_FLOAT8 == 1 +# define HAVE_FP8 1 +#else +# define HAVE_FP8 0 +#endif + +#if NCCL_MAJOR >= 2 +# if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0) +# undef HAVE_BF16 +# define HAVE_BF16 1 +# if defined(__CUDA_FP8_TYPES_EXIST__) && \ + NCCL_VERSION_CODE >= NCCL_VERSION(2, 24, 0) +# undef HAVE_FP8 +# define HAVE_FP8 1 +# endif +# endif +#endif + static size_t wordSize(ncclDataType_t type) { @@ -313,14 +353,14 @@ wordSize(ncclDataType_t type) #if NCCL_MAJOR >= 2 // case ncclInt8: case ncclUint8: -# if NCCL_MAJOR >= 2 && RCCL_FLOAT8 == 1 - case ncclFp8E4M3: - case ncclFp8E5M2: +# if HAVE_FP8 + case ncclFloat8e4m3: + case ncclFloat8e5m2: # endif #endif return 1; case ncclHalf: -#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1 +#if HAVE_BF16 case ncclBfloat16: #endif // case ncclFloat16: @@ -427,4 +467,30 @@ extern thread_local int is_main_thread; #define PRINT \ if(is_main_thread) printf +typedef enum +{ + ncclFuncBroadcast = 0, + ncclFuncReduce = 1, + ncclFuncAllGather = 2, + ncclFuncReduceScatter = 3, + ncclFuncAllReduce = 4, + ncclFuncAllReduceWithBias = 5, + ncclFuncSendRecv = 6, + ncclFuncSend = 7, + ncclFuncRecv = 8, + ncclFuncAllToAllPivot = 9, + ncclNumFuncs = 10 +} ncclFunc_t; + +typedef ncclResult_t (*rcclTestsGetAlgoInfo_t)(struct ncclComm* comm, ncclFunc_t coll, + uint64_t count, ncclDataType_t dataType, + int collNetSupport, int nvlsSupport, + int numPipeOps, int* algo, int* protocol, + int* maxChannels); +typedef ncclResult_t (*rcclTestsGetAlgoName_t)(int algo, const char** algoName); +typedef ncclResult_t (*rcclTestsGetProtocolName_t)(int protocol, + const char** protocolName); + +#include "rccl_compat.h" + #endif diff --git a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/gather.cpp b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/gather.cpp index fc670f02ab..d38c18f577 100644 --- a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/gather.cpp +++ b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/gather.cpp @@ -7,6 +7,7 @@ #include "common.h" #include "cuda_runtime.h" +#include "rccl_compat.h" void GatherGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount, @@ -61,8 +62,10 @@ GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, + void* bias = nullptr) { + (void) bias; int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); int rank; @@ -85,8 +88,8 @@ GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, return testSuccess; } -struct testColl gatherTest = { "Gather", GatherGetCollByteCount, GatherInitData, - GatherGetBw, GatherRunColl }; +struct testColl gatherTest = { "Gather", GatherGetCollByteCount, GatherInitData, + GatherGetBw, GatherRunColl, nullptr }; void GatherGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks) diff --git a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/rccl_compat.h b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/rccl_compat.h new file mode 100644 index 0000000000..f1b404fbe3 --- /dev/null +++ b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/rccl_compat.h @@ -0,0 +1,30 @@ +/* ************************************************************************ + * Copyright (C) 2016-2025 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- + * ies of the Software, and to permit persons to whom the Software is furnished + * to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- + * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- + * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ************************************************************************ */ + +#ifndef RCCL_COMPAT_H +#define RCCL_COMPAT_H + +extern rcclTestsGetAlgoInfo_t rcclTestsGetAlgoInfo; +extern rcclTestsGetProtocolName_t rcclTestsGetProtocolName; +extern rcclTestsGetAlgoName_t rcclTestsGetAlgoName; + +#endif \ No newline at end of file diff --git a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/reduce.cpp b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/reduce.cpp index a4799ffe1e..3aa52173c0 100644 --- a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/reduce.cpp +++ b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/reduce.cpp @@ -7,6 +7,7 @@ #include "common.h" #include "cuda_runtime.h" +#include "rccl_compat.h" void ReduceGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount, @@ -45,6 +46,16 @@ ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int return testSuccess; } +testResult_t +ReduceGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, + int* proto, int* nchannels) +{ + if(rcclTestsGetAlgoInfo == NULL) return testInternalError; + NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncReduce, count, type, 0, 0, 1, algo, + proto, nchannels)); + return testSuccess; +} + void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) @@ -56,14 +67,17 @@ ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, + void* bias = nullptr) { + (void) bias; NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream)); return testSuccess; } -struct testColl reduceTest = { "Reduce", ReduceGetCollByteCount, ReduceInitData, - ReduceGetBw, ReduceRunColl }; +struct testColl reduceTest = { "Reduce", ReduceGetCollByteCount, + ReduceInitData, ReduceGetBw, + ReduceRunColl, ReduceGetAlgoProtoChannels }; void ReduceGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks) diff --git a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/reduce_scatter.cpp b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/reduce_scatter.cpp index d05d930a61..a0877acf5d 100644 --- a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/reduce_scatter.cpp +++ b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/reduce_scatter.cpp @@ -7,6 +7,7 @@ #include "common.h" #include "cuda_runtime.h" +#include "rccl_compat.h" void ReduceScatterGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount, @@ -45,6 +46,16 @@ ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t return testSuccess; } +testResult_t +ReduceScatterGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, + int* algo, int* proto, int* nchannels) +{ + if(rcclTestsGetAlgoInfo == NULL) return testInternalError; + NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncReduceScatter, count, type, 0, 0, 1, + algo, proto, nchannels)); + return testSuccess; +} + void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) @@ -58,15 +69,18 @@ ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, + void* bias = nullptr) { + (void) bias; NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream)); return testSuccess; } -struct testColl reduceScatterTest = { "ReduceScatter", ReduceScatterGetCollByteCount, - ReduceScatterInitData, ReduceScatterGetBw, - ReduceScatterRunColl }; +struct testColl reduceScatterTest = { + "ReduceScatter", ReduceScatterGetCollByteCount, ReduceScatterInitData, + ReduceScatterGetBw, ReduceScatterRunColl, ReduceScatterGetAlgoProtoChannels +}; void ReduceScatterGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks) diff --git a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/scatter.cpp b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/scatter.cpp index de9f0fbb07..99642ecab2 100644 --- a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/scatter.cpp +++ b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/scatter.cpp @@ -7,6 +7,7 @@ #include "common.h" #include "cuda_runtime.h" +#include "rccl_compat.h" void ScatterGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount, @@ -55,8 +56,10 @@ ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busB testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, + void* bias = nullptr) { + (void) bias; int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); int rank; @@ -79,8 +82,8 @@ ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type return testSuccess; } -struct testColl scatterTest = { "Scatter", ScatterGetCollByteCount, ScatterInitData, - ScatterGetBw, ScatterRunColl }; +struct testColl scatterTest = { "Scatter", ScatterGetCollByteCount, ScatterInitData, + ScatterGetBw, ScatterRunColl, nullptr }; void ScatterGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks) diff --git a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/sendrecv.cpp b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/sendrecv.cpp index acbf058b8c..d3cf3fa4d8 100644 --- a/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/sendrecv.cpp +++ b/projects/rocprofiler-systems/examples/rccl/rccl-tests/src/sendrecv.cpp @@ -7,6 +7,7 @@ #include "common.h" #include "cuda_runtime.h" +#include "rccl_compat.h" void SendRecvGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount, @@ -58,8 +59,10 @@ SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double* bus testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, + void* bias = nullptr) { + (void) bias; int nRanks; NCCLCHECK(ncclCommCount(comm, &nRanks)); int rank; @@ -74,8 +77,9 @@ SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t typ return testSuccess; } -struct testColl sendRecvTest = { "SendRecv", SendRecvGetCollByteCount, SendRecvInitData, - SendRecvGetBw, SendRecvRunColl }; +struct testColl sendRecvTest = { "SendRecv", SendRecvGetCollByteCount, + SendRecvInitData, SendRecvGetBw, + SendRecvRunColl, nullptr }; void SendRecvGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks) diff --git a/projects/rocprofiler-systems/examples/rccl/rccl-tests/verifiable/verifiable.cpp b/projects/rocprofiler-systems/examples/rccl/rccl-tests/verifiable/verifiable.cpp index fbf515dfee..65a4e8b3db 100644 --- a/projects/rocprofiler-systems/examples/rccl/rccl-tests/verifiable/verifiable.cpp +++ b/projects/rocprofiler-systems/examples/rccl/rccl-tests/verifiable/verifiable.cpp @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License. * See LICENSE.txt for license information ************************************************************************/ @@ -1108,7 +1108,7 @@ prepareInput1(void* elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, } } // namespace -void +hipError_t ncclVerifiablePrepareInput(void* elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me, uint64_t seed, intptr_t elt_ix0, cudaStream_t stream) @@ -1135,6 +1135,7 @@ ncclVerifiablePrepareInput(void* elts, intptr_t elt_n, int elt_ty, int red_op, i # endif } # undef CASE_OP + return hipSuccess; } #endif @@ -1198,7 +1199,7 @@ prepareExpected1(void* elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n } } // namespace -void +hipError_t ncclVerifiablePrepareExpected(void* elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, cudaStream_t stream) @@ -1224,6 +1225,7 @@ ncclVerifiablePrepareExpected(void* elts, intptr_t elt_n, int elt_ty, int red_op # endif } # undef CASE_OP + return hipSuccess; } #endif @@ -1411,7 +1413,7 @@ verifyInline1(T const* results, intptr_t elt_n, int red_op, int rank_n, uint64_t } } // namespace -void +hipError_t ncclVerifiableVerify(void const* results, void const* expected, intptr_t elt_n, int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, int64_t* bad_elt_n, cudaStream_t stream) @@ -1470,6 +1472,68 @@ ncclVerifiableVerify(void const* results, void const* expected, intptr_t elt_n, default: assert(0); } # undef CASE_TY + return hipSuccess; +} + +//////////////////////////////////////////////////////////////////////////////// +// Apply bias to expected results + +namespace +{ +template +__global__ void +applyBias2(T* elts, T* bias, intptr_t elt_n) +{ + intptr_t i0 = blockIdx.x * (elt_n / gridDim.x); + i0 += blockIdx.x < elt_n % gridDim.x ? blockIdx.x : elt_n % gridDim.x; + intptr_t i1 = (blockIdx.x + 1) * (elt_n / gridDim.x); + i1 += blockIdx.x + 1 < elt_n % gridDim.x ? blockIdx.x + 1 : elt_n % gridDim.x; + intptr_t i = i0 + threadIdx.x; + while(i < i1) + { + elts[i] = ReduceSum()(elts[i], bias[i]); + i += blockDim.x; + } +} + +void +applyBias1(void* elts, void* bias, intptr_t elt_n, int elt_ty, cudaStream_t stream) +{ + int block_n = std::min(32, (elt_n + 4 * 512 - 1) / (4 * 512)); +# define CASE_TY(T) \ + applyBias2<<>>((T*) elts, (T*) bias, elt_n); \ + break; + switch(elt_ty) + { + case ncclInt8: CASE_TY(int8_t) + case ncclUint8: CASE_TY(uint8_t) + case ncclInt32: CASE_TY(int32_t) + case ncclUint32: CASE_TY(uint32_t) + case ncclInt64: CASE_TY(int64_t) + case ncclUint64: CASE_TY(uint64_t) + case ncclFloat16: CASE_TY(__half) +# if HAVE_ncclBfloat16 + case ncclBfloat16: CASE_TY(hip_bfloat16) +# endif +# if HAVE_ncclfp8 + case ncclFp8E4M3: CASE_TY(rccl_float8) + case ncclFp8E5M2: CASE_TY(rccl_bfloat8) +# endif + case ncclFloat32: CASE_TY(float) + case ncclFloat64: CASE_TY(double) + default: assert(0); + } +# undef CASE_TY +} +} // namespace + +void +ncclVerifiableApplyBias(void* elts, void* bias, intptr_t elt_n, int elt_ty, int red_op, + intptr_t elt_ix0, cudaStream_t stream) +{ + (void) red_op; + (void) elt_ix0; + applyBias1(elts, bias, elt_n, elt_ty, stream); } #endif diff --git a/projects/rocprofiler-systems/examples/rccl/rccl-tests/verifiable/verifiable.h b/projects/rocprofiler-systems/examples/rccl/rccl-tests/verifiable/verifiable.h index a6d0d1c8c7..8515647785 100644 --- a/projects/rocprofiler-systems/examples/rccl/rccl-tests/verifiable/verifiable.h +++ b/projects/rocprofiler-systems/examples/rccl/rccl-tests/verifiable/verifiable.h @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -43,13 +43,13 @@ ncclVerifiablePremulScalar(int rank_me) } // Enqueue kernel to generate data which is to be reduced. -void +hipError_t ncclVerifiablePrepareInput(void* elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me, uint64_t seed, intptr_t elt_ix0, cudaStream_t stream); // Enqueue kernel to generate expected results of reduction. -void +hipError_t ncclVerifiablePrepareExpected(void* elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, cudaStream_t stream); @@ -60,8 +60,19 @@ ncclVerifiablePrepareExpected(void* elts, intptr_t elt_n, int elt_ty, int red_op // which can be costly. Thus if you plan to run the same reduction multiple // times it is advantageous to precompute the expected values with // ncclVerifiablePrepareExpected and pass them as `expected` here. -void +hipError_t ncclVerifiableVerify(void const* results, void const* expected, intptr_t elt_n, int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, int64_t* bad_elt_n, cudaStream_t stream); + +// Enqueue kernel that applies bias to expected results +void +ncclVerifiableApplyBias(void* elts, void* bias, intptr_t elt_n, int elt_ty, int red_op, + intptr_t elt_ix0, cudaStream_t stream); + +#ifdef NCCL_VERIFIABLE_SELF_TEST +void +ncclVerifiableLaunchSelfTest(); +#endif + #endif diff --git a/projects/rocprofiler-systems/examples/rccl/rccl-tests/verifiable/verifiable.mk b/projects/rocprofiler-systems/examples/rccl/rccl-tests/verifiable/verifiable.mk index b0e00d9a5b..b5b7476a3d 100644 --- a/projects/rocprofiler-systems/examples/rccl/rccl-tests/verifiable/verifiable.mk +++ b/projects/rocprofiler-systems/examples/rccl/rccl-tests/verifiable/verifiable.mk @@ -1,5 +1,5 @@ # Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. -# Modifications Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. +# Modifications Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved. # # See LICENSE.txt for license information @@ -14,17 +14,17 @@ TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o ${HIPIFY_DIR}/verifiable.cpp: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cpp @printf "Hipifying %-35s > %s\n" $< $@ @mkdir -p ${HIPIFY_DIR} - hipify-perl -quiet-warnings $< > $@ + ${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@ ${HIPIFY_DIR}/verifiable.h: $(TEST_VERIFIABLE_SRCDIR)/verifiable.h @printf "Hipifying %-35s > %s\n" $< $@ @mkdir -p ${HIPIFY_DIR} - hipify-perl -quiet-warnings $< > $@ + ${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@ ${HIPIFY_DIR}/rccl_float8.h: $(TEST_VERIFIABLE_SRCDIR)/../src/rccl_float8.h @printf "Hipifying %-35s > %s\n" $< $@ @mkdir -p ${HIPIFY_DIR} - hipify-perl -quiet-warnings $< > $@ + ${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@ $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(HIPIFY_DIR)/verifiable.cpp $(HIPIFY_DIR)/verifiable.h $(HIPIFY_DIR)/rccl_float8.h @printf "Compiling %s\n" $@