update build rccl-tests infrastructure and add getAlgoProtoChannels support (#2212)

Tento commit je obsažen v:
habajpai-amd
2025-12-11 18:29:06 +05:30
odevzdal GitHub
rodič 10635483ad
revize 6b45657493
17 změnil soubory, kde provedl 408 přidání a 120 odebrání
+12 -43
Zobrazit soubor
@@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
# Modifications are Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
@@ -15,6 +15,8 @@ CUSTOM_RCCL_LIB ?= ""
HIPCC ?= $(ROCM_PATH)/bin/amdclang++
HIPCONFIG = $(ROCM_PATH)/bin/hipconfig
HIPIFY_PL_EXE = $(ROCM_PATH)/bin/hipify-perl
HIPIFY_PL_FLAGS = -quiet-warnings
CXX = $(HIPCC)
HIPCUFLAGS := -std=c++14
@@ -30,7 +32,10 @@ HIP_MINOR = $(shell echo $(HIP_VERSION) | cut -d "." -f 2)
# Currently, supports gfx906,gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201
ifndef GPU_TARGETS
GPU_TARGETS = gfx906 gfx908 gfx90a
ifeq ($(shell test "0$(HIP_MAJOR)" -eq 6; echo $$?),0)
ifeq ($(shell test "0$(HIP_MAJOR)" -ge 7; echo $$?),0)
# Include gfx942 and gfx950 support if we're using ROCm 7.0 or above
GPU_TARGETS += gfx942 gfx950
else ifeq ($(shell test "0$(HIP_MAJOR)" -eq 6; echo $$?),0)
# Include gfx942 support if we're using ROCm 6.0 or above
GPU_TARGETS += gfx942
ifeq ($(shell test "0$(HIP_MINOR)" -ge 5; echo $$?),0)
@@ -43,41 +48,6 @@ endif
GPU_TARGETS_FLAGS = $(foreach target,$(GPU_TARGETS),"--offload-arch=$(target)")
#CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
#CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
#CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
#
## Better define NVCC_GENCODE in your environment to the minimal set
## of archs to reduce compile time.
#ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
## Include Blackwell support if we're using CUDA12.8 or above
#NVCC_GENCODE ?= -gencode=arch=compute_80,code=sm_80 \
# -gencode=arch=compute_90,code=sm_90 \
# -gencode=arch=compute_100,code=sm_100 \
# -gencode=arch=compute_120,code=sm_120 \
# -gencode=arch=compute_120,code=compute_120
#else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0)
#NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
# -gencode=arch=compute_61,code=sm_61 \
# -gencode=arch=compute_70,code=sm_70 \
# -gencode=arch=compute_80,code=sm_80 \
# -gencode=arch=compute_90,code=sm_90 \
# -gencode=arch=compute_90,code=compute_90
#else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
#NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
# -gencode=arch=compute_61,code=sm_61 \
# -gencode=arch=compute_70,code=sm_70 \
# -gencode=arch=compute_80,code=sm_80 \
# -gencode=arch=compute_80,code=compute_80
#else
#NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
# -gencode=arch=compute_50,code=sm_50 \
# -gencode=arch=compute_60,code=sm_60 \
# -gencode=arch=compute_61,code=sm_61 \
# -gencode=arch=compute_70,code=sm_70 \
# -gencode=arch=compute_70,code=compute_70
#endif
ifneq ($(NCCL_HOME), "")
HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) -L$(NCCL_HOME)/lib
@@ -117,7 +87,7 @@ HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/mpich/include -I
HIPLDFLAGS += -L${MPI_HOME}/lib -L${MPI_HOME}/mpich/lib -lmpich
endif
LIBRARIES += rccl
LIBRARIES += rccl dl
HIPLDFLAGS += $(LIBRARIES:%=-l%)
DST_DIR := $(BUILDDIR)
@@ -148,14 +118,14 @@ $(GIT_VERSION_FILE):
${HIPIFY_DIR}/%.cpp: %.cpp
@printf "Hipifying %-35s > %s\n" $< $@
@mkdir -p ${HIPIFY_DIR}
hipify-perl -quiet-warnings $< > $@
${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@
${HIPIFY_DIR}/%.h: %.h
@printf "Hipifying %-35s > %s\n" $< $@
@mkdir -p ${HIPIFY_DIR}
hipify-perl -quiet-warnings $< > $@
${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@
${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS) $(GIT_VERSION_FILE)
${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cpp ${HIPIFY_DIR}/common.h ${HIPIFY_DIR}/rccl_compat.h $(TEST_VERIFIABLE_HDRS) $(GIT_VERSION_FILE)
@printf "Compiling %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
echo "$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<"
@@ -166,9 +136,8 @@ ${DST_DIR}/timer.o: timer.cc timer.h
@mkdir -p ${DST_DIR}
$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) $(DST_DIR)/src/git_version.cpp
${DST_DIR}/%_perf: ${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) $(DST_DIR)/src/git_version.cpp
@printf "Linking %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
echo "$(HIPCC) -o $@ $^ $(HIPLDFLAGS)"
$(HIPCC) -o $@ $^ $(HIPLDFLAGS)
@@ -7,6 +7,7 @@
#include "common.h"
#include "cuda_runtime.h"
#include "rccl_compat.h"
void
AllGatherGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
@@ -47,6 +48,16 @@ AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op,
return testSuccess;
}
testResult_t
AllGatherGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type,
int* algo, int* proto, int* nchannels)
{
if(rcclTestsGetAlgoInfo == NULL) return testInternalError;
NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncAllGather, count, type, 0, 0, 1, algo,
proto, nchannels));
return testSuccess;
}
void
AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
int nranks)
@@ -60,14 +71,17 @@ AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* bu
testResult_t
AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
void* bias = nullptr)
{
(void) bias;
NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
return testSuccess;
}
struct testColl allGatherTest = { "AllGather", AllGatherGetCollByteCount,
AllGatherInitData, AllGatherGetBw, AllGatherRunColl };
struct testColl allGatherTest = { "AllGather", AllGatherGetCollByteCount,
AllGatherInitData, AllGatherGetBw,
AllGatherRunColl, AllGatherGetAlgoProtoChannels };
void
AllGatherGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
@@ -7,6 +7,7 @@
#include "common.h"
#include "cuda_runtime.h"
#include "rccl_compat.h"
void
AllReduceGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
@@ -41,6 +42,16 @@ AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op,
return testSuccess;
}
testResult_t
AllReduceGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type,
int* algo, int* proto, int* nchannels)
{
if(rcclTestsGetAlgoInfo == NULL) return testInternalError;
NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncAllReduce, count, type, 0, 0, 1, algo,
proto, nchannels));
return testSuccess;
}
void
AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
int nranks)
@@ -54,14 +65,17 @@ AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* bu
testResult_t
AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
void* bias = nullptr)
{
(void) bias;
NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
return testSuccess;
}
struct testColl allReduceTest = { "AllReduce", AllReduceGetCollByteCount,
AllReduceInitData, AllReduceGetBw, AllReduceRunColl };
struct testColl allReduceTest = { "AllReduce", AllReduceGetCollByteCount,
AllReduceInitData, AllReduceGetBw,
AllReduceRunColl, AllReduceGetAlgoProtoChannels };
void
AllReduceGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
+7 -3
Zobrazit soubor
@@ -7,6 +7,7 @@
#include "common.h"
#include "cuda_runtime.h"
#include "rccl_compat.h"
void
AlltoAllGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
@@ -62,14 +63,17 @@ AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double* bus
testResult_t
AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
void* bias = nullptr)
{
(void) bias;
NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream));
return testSuccess;
}
struct testColl alltoAllTest = { "AlltoAll", AlltoAllGetCollByteCount, AlltoAllInitData,
AlltoAllGetBw, AlltoAllRunColl };
struct testColl alltoAllTest = { "AlltoAll", AlltoAllGetCollByteCount,
AlltoAllInitData, AlltoAllGetBw,
AlltoAllRunColl, nullptr };
void
AlltoAllGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
@@ -7,6 +7,7 @@
#include "common.h"
#include "cuda_runtime.h"
#include "rccl_compat.h"
#define USE_RCCL_GATHER_SCATTER
@@ -101,8 +102,10 @@ AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, double* bu
testResult_t
AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
void* bias = nullptr)
{
(void) bias;
int nranks;
NCCLCHECK(ncclCommCount(comm, &nranks));
int rank;
@@ -174,8 +177,9 @@ AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t ty
return testSuccess;
}
struct testColl alltoAllTest = { "AlltoAllv", AlltoAllvGetCollByteCount,
AlltoAllvInitData, AlltoAllvGetBw, AlltoAllvRunColl };
struct testColl alltoAllTest = { "AlltoAllv", AlltoAllvGetCollByteCount,
AlltoAllvInitData, AlltoAllvGetBw,
AlltoAllvRunColl, nullptr };
void
AlltoAllvGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
+17 -3
Zobrazit soubor
@@ -7,6 +7,7 @@
#include "common.h"
#include "cuda_runtime.h"
#include "rccl_compat.h"
void
BroadcastGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
@@ -41,6 +42,16 @@ BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op,
return testSuccess;
}
testResult_t
BroadcastGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type,
int* algo, int* proto, int* nchannels)
{
if(rcclTestsGetAlgoInfo == NULL) return testInternalError;
NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncBroadcast, count, type, 0, 0, 1, algo,
proto, nchannels));
return testSuccess;
}
void
BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
int nranks)
@@ -54,8 +65,10 @@ BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* bu
testResult_t
BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
void* bias = nullptr)
{
(void) bias;
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
@@ -73,8 +86,9 @@ BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t ty
return testSuccess;
}
struct testColl broadcastTest = { "Broadcast", BroadcastGetCollByteCount,
BroadcastInitData, BroadcastGetBw, BroadcastRunColl };
struct testColl broadcastTest = { "Broadcast", BroadcastGetCollByteCount,
BroadcastInitData, BroadcastGetBw,
BroadcastRunColl, BroadcastGetAlgoProtoChannels };
void
BroadcastGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
+72 -12
Zobrazit soubor
@@ -1,18 +1,20 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#include "common.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "hip/hip_runtime.h"
#include "hip/hip_runtime_api.h"
#include "rccl_float8.h"
#include <cstdio>
#include <ctype.h>
#include <dlfcn.h>
#include <errno.h>
#include <getopt.h>
#include <hip/hip_bfloat16.h>
#include <libgen.h>
@@ -27,10 +29,46 @@
#include "git_version.h"
#include "verifiable.h"
#define DIVUP(x, y) (((x) + (y) - 1) / (y))
int test_ncclVersion = 0; // init'd with ncclGetVersion()
int32_t gpu_block3;
size_t cache_bytes = 192 * 1024 * 1024; // Use 192MB
rcclTestsGetAlgoInfo_t rcclTestsGetAlgoInfo = NULL;
rcclTestsGetProtocolName_t rcclTestsGetProtocolName = NULL;
rcclTestsGetAlgoName_t rcclTestsGetAlgoName = NULL;
static void
loadRcclSyms()
{
static void* handle = NULL;
const char* libname = "librccl.so";
if(!handle)
{
handle = dlopen(libname, RTLD_LAZY | RTLD_LOCAL);
if(!handle)
{
fprintf(stderr, "dlopen failed: %s\n", dlerror());
return;
}
}
rcclTestsGetAlgoInfo = (rcclTestsGetAlgoInfo_t) dlsym(handle, "rcclGetAlgoInfo");
rcclTestsGetAlgoName = (rcclTestsGetAlgoName_t) dlsym(handle, "rcclGetAlgoName");
rcclTestsGetProtocolName =
(rcclTestsGetProtocolName_t) dlsym(handle, "rcclGetProtocolName");
}
// RCCL_FLOAT8 support
bool rccl_float8_useFnuz = false;
bool
IsArchMatch(char const* arch, char const* target)
{
// helper function to reduce clutter in code elsewhere. Returns true on match.
return (strncmp(arch, target, strlen(target)) == 0);
}
#if NCCL_MAJOR >= 2
ncclDataType_t test_types[ncclNumTypes] = { ncclInt8,
ncclUint8,
@@ -41,14 +79,14 @@ ncclDataType_t test_types[ncclNumTypes] = { ncclInt8,
ncclHalf,
ncclFloat,
ncclDouble
# if RCCL_BFLOAT16 == 1
# if HAVE_BF16
,
ncclBfloat16
# endif
# if RCCL_FLOAT8 == 1
# if HAVE_FP8
,
ncclFp8E4M3,
ncclFp8E5M2
ncclFloat8e4m3,
ncclFloat8e5m2
# endif
};
const char* test_typenames[ncclNumTypes] = { "int8",
@@ -60,11 +98,11 @@ const char* test_typenames[ncclNumTypes] = { "int8",
"half",
"float",
"double"
# if RCCL_BFLOAT16 == 1
# if HAVE_BF16
,
"bfloat16"
# endif
# if RCCL_FLOAT8 == 1
# if HAVE_FP8
,
"fp8_e4m3",
"fp8_e5m2"
@@ -374,6 +412,15 @@ InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType
return testSuccess;
}
testResult_t
InitDataApplyBias(void* expected, void* bias, const size_t count, const size_t offset,
ncclDataType_t type, ncclRedOp_t op)
{
ncclVerifiableApplyBias(expected, bias, count, (int) type, (int) op, offset,
cudaStreamDefault);
return testSuccess;
}
testResult_t
InitData(void* data, const size_t count, size_t offset, ncclDataType_t type,
ncclRedOp_t op, uint64_t seed, int nranks, int rank)
@@ -716,7 +763,7 @@ startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t opIndex, int
TESTCHECK(args->collTest->runColl(
(void*) (in_place ? recvBuff + args->sendInplaceOffset * rank : sendBuff),
(void*) (in_place ? recvBuff + args->recvInplaceOffset * rank : recvBuff),
count, type, op, root, args->comms[i], args->streams[i]));
count, type, op, root, args->comms[i], args->streams[i], args->bias[i]));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 11, 0)
if(opIndex >= ncclNumOps)
@@ -1239,7 +1286,7 @@ threadLaunch(struct testThread* thread)
testResult_t
AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvBytes,
void** expected, size_t nbytes)
void** expected, size_t nbytes, void** bias)
{
if(enable_rotating_tensor)
{
@@ -1252,6 +1299,8 @@ AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvByt
{
CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocUncached));
CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocUncached));
if(bias)
CUDACHECK(hipExtMallocWithFlags(bias, nbytes, hipDeviceMallocUncached));
if(datacheck)
CUDACHECK(
hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocUncached));
@@ -1262,6 +1311,9 @@ AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvByt
hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
CUDACHECK(
hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
if(bias)
CUDACHECK(
hipExtMallocWithFlags(bias, nbytes, hipDeviceMallocFinegrained));
if(datacheck)
CUDACHECK(hipExtMallocWithFlags(expected, recvBytes,
hipDeviceMallocFinegrained));
@@ -1271,12 +1323,14 @@ AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvByt
{
CUDACHECK(hipHostMalloc(sendbuff, nbytes));
CUDACHECK(hipHostMalloc(recvbuff, nbytes));
if(bias) CUDACHECK(hipHostMalloc(bias, nbytes));
if(datacheck) CUDACHECK(hipHostMalloc(expected, recvBytes));
}
else if(memorytype == ncclManaged)
{
CUDACHECK(cudaMallocManaged(sendbuff, nbytes));
CUDACHECK(cudaMallocManaged(recvbuff, nbytes));
if(bias) CUDACHECK(cudaMallocManaged(bias, nbytes));
if(datacheck) CUDACHECK(cudaMallocManaged(expected, recvBytes));
#if 0
CUDACHECK(cudaMemset(*sendbuff, 0, nbytes));
@@ -1288,6 +1342,7 @@ AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvByt
{
CUDACHECK(cudaMalloc(sendbuff, nbytes));
CUDACHECK(cudaMalloc(recvbuff, nbytes));
if(bias) CUDACHECK(cudaMalloc(bias, nbytes));
if(datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
}
CUDACHECK(hipMemset(*sendbuff, 1, nbytes));
@@ -1333,6 +1388,7 @@ main(int argc, char* argv[])
test_opnum++; // PreMulSum
}
#endif
loadRcclSyms();
// Parse args
// Replace getopt_long with manual argument parsing
@@ -1764,6 +1820,7 @@ run()
std::vector<cudaStream_t> streams(nGpus * nThreads);
std::vector<void*> sendbuffs(nGpus * nThreads);
std::vector<void*> recvbuffs(nGpus * nThreads);
std::vector<void*> bias(nGpus * nThreads);
std::vector<void*> expected(nGpus * nThreads);
size_t sendBytes, recvBytes;
@@ -1777,7 +1834,8 @@ run()
gpus[i] = ((gpu0 != -1 ? gpu0 : localRank * nThreads * nGpus) + i) % numDevices;
CUDACHECK(cudaSetDevice(gpus[i]));
TESTCHECK(AllocateBuffs(sendbuffs.data() + i, sendBytes, recvbuffs.data() + i,
recvBytes, expected.data() + i, (size_t) maxBytes));
recvBytes, expected.data() + i, (size_t) maxBytes,
bias.data() + i));
if(streamnull)
streams[i] = NULL;
else
@@ -1895,6 +1953,7 @@ run()
threads[t].args.gpus = gpus.data() + t * nGpus;
threads[t].args.sendbuffs = sendbuffs.data() + t * nGpus;
threads[t].args.recvbuffs = recvbuffs.data() + t * nGpus;
threads[t].args.bias = bias.data() + t * nGpus;
threads[t].args.expected = expected.data() + t * nGpus;
threads[t].args.ncclId = ncclId;
threads[t].args.comms = comms + t * nGpus;
@@ -1952,6 +2011,7 @@ run()
{
if(sendbuffs[i]) CUDACHECK(cudaFree((char*) sendbuffs[i]));
if(recvbuffs[i]) CUDACHECK(cudaFree((char*) recvbuffs[i]));
if(bias[i]) CUDACHECK(cudaFree((char*) bias[i]));
if(datacheck) CUDACHECK(cudaFree(expected[i]));
}
CUDACHECK(cudaFreeHost(delta));
+88 -22
Zobrazit soubor
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
@@ -11,18 +11,26 @@
#include "rccl/rccl.h"
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <pthread.h>
#include <stdio.h>
#include <string>
#include <utility>
#include <vector>
#ifdef MPI_SUPPORT
# include "mpi.h"
#endif
#include "nccl1_compat.h"
#include "timer.h"
#include <cstring>
#include <fstream>
#include <iostream>
#include <pthread.h>
// Ensures backward compatibility for FP8 types in RCCL 2.24.3 and later
#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 24, 3)
// Ensures backward compatibility for FP8 datatypes
#if NCCL_VERSION_CODE < NCCL_VERSION(2, 24, 3)
# define ncclFloat8e4m3 ncclFp8E4M3
# define ncclFloat8e5m2 ncclFp8E5M2
#else
// For newer RCCL versions, define old names in terms of new names
# define ncclFp8E4M3 ncclFloat8e4m3
# define ncclFp8E5M2 ncclFloat8e5m2
#endif
@@ -114,7 +122,10 @@ struct testColl
int nranks);
testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t type, ncclRedOp_t op, int root,
ncclComm_t comm, cudaStream_t stream);
ncclComm_t comm, cudaStream_t stream, void* bias);
testResult_t (*getAlgoProtoChannels)(ncclComm_t comm, size_t count,
ncclDataType_t type, int* algo, int* proto,
int* nchannels);
};
extern struct testColl allReduceTest;
extern struct testColl allGatherTest;
@@ -139,6 +150,7 @@ public:
void addResult(int gpusPerRank, int ranksPerNode, int totalRanks, size_t numBytes,
int inPlace, double timeUsec, double algBw, double busBw,
int64_t wrongElts = -1);
void writeFile();
private:
bool isMainThread();
@@ -154,13 +166,14 @@ private:
return std::make_pair("\"" + v + "\"", k);
};
bool _outputValid = false;
std::ofstream _out;
std::string _outputFormat;
size_t _numCycle = 0;
std::string _collectiveName;
std::string _typeName;
std::string _opName;
bool _outputValid = false;
std::ofstream _out;
std::string _outputFormat;
size_t _numCycle = 0;
std::string _collectiveName;
std::string _typeName;
std::string _opName;
std::vector<std::vector<std::pair<std::string, std::string>>> _outputData;
};
struct testEngine
@@ -200,6 +213,7 @@ struct threadArgs
ncclUniqueId ncclId;
ncclComm_t* comms;
cudaStream_t* streams;
void** bias;
void** expected;
size_t expectedBytes;
@@ -233,11 +247,14 @@ extern testResult_t
InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type,
ncclRedOp_t op, const uint64_t seed, const int nranks);
extern testResult_t
InitDataApplyBias(void* expected, void* bias, const size_t count, const size_t offset,
ncclDataType_t type, ncclRedOp_t op);
extern testResult_t
InitData(void* data, const size_t count, size_t offset, ncclDataType_t type,
ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
extern void
AllocateBuffs(void** sendbuff, void** recvbuff, void** expected, void** expectedHost,
size_t nbytes, int nranks);
extern testResult_t
AllocateBuffs(void** sendbuff, size_t sendBytes, void** recvbuff, size_t recvBytes,
void** expected, size_t nbytes, void** bias);
#include <unistd.h>
@@ -304,6 +321,29 @@ getHostHash(const char* hostname)
return getHash(hostHash, strlen(hostHash));
}
#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
# define HAVE_BF16 1
#else
# define HAVE_BF16 0
#endif
#if NCCL_MAJOR >= 2 && RCCL_FLOAT8 == 1
# define HAVE_FP8 1
#else
# define HAVE_FP8 0
#endif
#if NCCL_MAJOR >= 2
# if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2, 10, 0)
# undef HAVE_BF16
# define HAVE_BF16 1
# if defined(__CUDA_FP8_TYPES_EXIST__) && \
NCCL_VERSION_CODE >= NCCL_VERSION(2, 24, 0)
# undef HAVE_FP8
# define HAVE_FP8 1
# endif
# endif
#endif
static size_t
wordSize(ncclDataType_t type)
{
@@ -313,14 +353,14 @@ wordSize(ncclDataType_t type)
#if NCCL_MAJOR >= 2
// case ncclInt8:
case ncclUint8:
# if NCCL_MAJOR >= 2 && RCCL_FLOAT8 == 1
case ncclFp8E4M3:
case ncclFp8E5M2:
# if HAVE_FP8
case ncclFloat8e4m3:
case ncclFloat8e5m2:
# endif
#endif
return 1;
case ncclHalf:
#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
#if HAVE_BF16
case ncclBfloat16:
#endif
// case ncclFloat16:
@@ -427,4 +467,30 @@ extern thread_local int is_main_thread;
#define PRINT \
if(is_main_thread) printf
typedef enum
{
ncclFuncBroadcast = 0,
ncclFuncReduce = 1,
ncclFuncAllGather = 2,
ncclFuncReduceScatter = 3,
ncclFuncAllReduce = 4,
ncclFuncAllReduceWithBias = 5,
ncclFuncSendRecv = 6,
ncclFuncSend = 7,
ncclFuncRecv = 8,
ncclFuncAllToAllPivot = 9,
ncclNumFuncs = 10
} ncclFunc_t;
typedef ncclResult_t (*rcclTestsGetAlgoInfo_t)(struct ncclComm* comm, ncclFunc_t coll,
uint64_t count, ncclDataType_t dataType,
int collNetSupport, int nvlsSupport,
int numPipeOps, int* algo, int* protocol,
int* maxChannels);
typedef ncclResult_t (*rcclTestsGetAlgoName_t)(int algo, const char** algoName);
typedef ncclResult_t (*rcclTestsGetProtocolName_t)(int protocol,
const char** protocolName);
#include "rccl_compat.h"
#endif
+6 -3
Zobrazit soubor
@@ -7,6 +7,7 @@
#include "common.h"
#include "cuda_runtime.h"
#include "rccl_compat.h"
void
GatherGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
@@ -61,8 +62,10 @@ GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw
testResult_t
GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
void* bias = nullptr)
{
(void) bias;
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
int rank;
@@ -85,8 +88,8 @@ GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
return testSuccess;
}
struct testColl gatherTest = { "Gather", GatherGetCollByteCount, GatherInitData,
GatherGetBw, GatherRunColl };
struct testColl gatherTest = { "Gather", GatherGetCollByteCount, GatherInitData,
GatherGetBw, GatherRunColl, nullptr };
void
GatherGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
@@ -0,0 +1,30 @@
/* ************************************************************************
* Copyright (C) 2016-2025 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
* ies of the Software, and to permit persons to whom the Software is furnished
* to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
* PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
* CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ************************************************************************ */
#ifndef RCCL_COMPAT_H
#define RCCL_COMPAT_H
extern rcclTestsGetAlgoInfo_t rcclTestsGetAlgoInfo;
extern rcclTestsGetProtocolName_t rcclTestsGetProtocolName;
extern rcclTestsGetAlgoName_t rcclTestsGetAlgoName;
#endif
+17 -3
Zobrazit soubor
@@ -7,6 +7,7 @@
#include "common.h"
#include "cuda_runtime.h"
#include "rccl_compat.h"
void
ReduceGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
@@ -45,6 +46,16 @@ ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int
return testSuccess;
}
testResult_t
ReduceGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo,
int* proto, int* nchannels)
{
if(rcclTestsGetAlgoInfo == NULL) return testInternalError;
NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncReduce, count, type, 0, 0, 1, algo,
proto, nchannels));
return testSuccess;
}
void
ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
int nranks)
@@ -56,14 +67,17 @@ ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw
testResult_t
ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
void* bias = nullptr)
{
(void) bias;
NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));
return testSuccess;
}
struct testColl reduceTest = { "Reduce", ReduceGetCollByteCount, ReduceInitData,
ReduceGetBw, ReduceRunColl };
struct testColl reduceTest = { "Reduce", ReduceGetCollByteCount,
ReduceInitData, ReduceGetBw,
ReduceRunColl, ReduceGetAlgoProtoChannels };
void
ReduceGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
@@ -7,6 +7,7 @@
#include "common.h"
#include "cuda_runtime.h"
#include "rccl_compat.h"
void
ReduceScatterGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
@@ -45,6 +46,16 @@ ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
return testSuccess;
}
testResult_t
ReduceScatterGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type,
int* algo, int* proto, int* nchannels)
{
if(rcclTestsGetAlgoInfo == NULL) return testInternalError;
NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncReduceScatter, count, type, 0, 0, 1,
algo, proto, nchannels));
return testSuccess;
}
void
ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw,
int nranks)
@@ -58,15 +69,18 @@ ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double
testResult_t
ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
void* bias = nullptr)
{
(void) bias;
NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
return testSuccess;
}
struct testColl reduceScatterTest = { "ReduceScatter", ReduceScatterGetCollByteCount,
ReduceScatterInitData, ReduceScatterGetBw,
ReduceScatterRunColl };
struct testColl reduceScatterTest = {
"ReduceScatter", ReduceScatterGetCollByteCount, ReduceScatterInitData,
ReduceScatterGetBw, ReduceScatterRunColl, ReduceScatterGetAlgoProtoChannels
};
void
ReduceScatterGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
+6 -3
Zobrazit soubor
@@ -7,6 +7,7 @@
#include "common.h"
#include "cuda_runtime.h"
#include "rccl_compat.h"
void
ScatterGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
@@ -55,8 +56,10 @@ ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busB
testResult_t
ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
void* bias = nullptr)
{
(void) bias;
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
int rank;
@@ -79,8 +82,8 @@ ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type
return testSuccess;
}
struct testColl scatterTest = { "Scatter", ScatterGetCollByteCount, ScatterInitData,
ScatterGetBw, ScatterRunColl };
struct testColl scatterTest = { "Scatter", ScatterGetCollByteCount, ScatterInitData,
ScatterGetBw, ScatterRunColl, nullptr };
void
ScatterGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
+7 -3
Zobrazit soubor
@@ -7,6 +7,7 @@
#include "common.h"
#include "cuda_runtime.h"
#include "rccl_compat.h"
void
SendRecvGetCollByteCount(size_t* sendcount, size_t* recvcount, size_t* paramcount,
@@ -58,8 +59,10 @@ SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double* bus
testResult_t
SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream)
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream,
void* bias = nullptr)
{
(void) bias;
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
int rank;
@@ -74,8 +77,9 @@ SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t typ
return testSuccess;
}
struct testColl sendRecvTest = { "SendRecv", SendRecvGetCollByteCount, SendRecvInitData,
SendRecvGetBw, SendRecvRunColl };
struct testColl sendRecvTest = { "SendRecv", SendRecvGetCollByteCount,
SendRecvInitData, SendRecvGetBw,
SendRecvRunColl, nullptr };
void
SendRecvGetBuffSize(size_t* sendcount, size_t* recvcount, size_t count, int nranks)
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
* See LICENSE.txt for license information
************************************************************************/
@@ -1108,7 +1108,7 @@ prepareInput1(void* elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
}
} // namespace
void
hipError_t
ncclVerifiablePrepareInput(void* elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
int rank_me, uint64_t seed, intptr_t elt_ix0,
cudaStream_t stream)
@@ -1135,6 +1135,7 @@ ncclVerifiablePrepareInput(void* elts, intptr_t elt_n, int elt_ty, int red_op, i
# endif
}
# undef CASE_OP
return hipSuccess;
}
#endif
@@ -1198,7 +1199,7 @@ prepareExpected1(void* elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n
}
} // namespace
void
hipError_t
ncclVerifiablePrepareExpected(void* elts, intptr_t elt_n, int elt_ty, int red_op,
int rank_n, uint64_t seed, intptr_t elt_ix0,
cudaStream_t stream)
@@ -1224,6 +1225,7 @@ ncclVerifiablePrepareExpected(void* elts, intptr_t elt_n, int elt_ty, int red_op
# endif
}
# undef CASE_OP
return hipSuccess;
}
#endif
@@ -1411,7 +1413,7 @@ verifyInline1(T const* results, intptr_t elt_n, int red_op, int rank_n, uint64_t
}
} // namespace
void
hipError_t
ncclVerifiableVerify(void const* results, void const* expected, intptr_t elt_n,
int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
int64_t* bad_elt_n, cudaStream_t stream)
@@ -1470,6 +1472,68 @@ ncclVerifiableVerify(void const* results, void const* expected, intptr_t elt_n,
default: assert(0);
}
# undef CASE_TY
return hipSuccess;
}
////////////////////////////////////////////////////////////////////////////////
// Apply bias to expected results
namespace
{
template <typename T>
__global__ void
applyBias2(T* elts, T* bias, intptr_t elt_n)
{
intptr_t i0 = blockIdx.x * (elt_n / gridDim.x);
i0 += blockIdx.x < elt_n % gridDim.x ? blockIdx.x : elt_n % gridDim.x;
intptr_t i1 = (blockIdx.x + 1) * (elt_n / gridDim.x);
i1 += blockIdx.x + 1 < elt_n % gridDim.x ? blockIdx.x + 1 : elt_n % gridDim.x;
intptr_t i = i0 + threadIdx.x;
while(i < i1)
{
elts[i] = ReduceSum()(elts[i], bias[i]);
i += blockDim.x;
}
}
void
applyBias1(void* elts, void* bias, intptr_t elt_n, int elt_ty, cudaStream_t stream)
{
int block_n = std::min<intptr_t>(32, (elt_n + 4 * 512 - 1) / (4 * 512));
# define CASE_TY(T) \
applyBias2<<<block_n, 512, 0, stream>>>((T*) elts, (T*) bias, elt_n); \
break;
switch(elt_ty)
{
case ncclInt8: CASE_TY(int8_t)
case ncclUint8: CASE_TY(uint8_t)
case ncclInt32: CASE_TY(int32_t)
case ncclUint32: CASE_TY(uint32_t)
case ncclInt64: CASE_TY(int64_t)
case ncclUint64: CASE_TY(uint64_t)
case ncclFloat16: CASE_TY(__half)
# if HAVE_ncclBfloat16
case ncclBfloat16: CASE_TY(hip_bfloat16)
# endif
# if HAVE_ncclfp8
case ncclFp8E4M3: CASE_TY(rccl_float8)
case ncclFp8E5M2: CASE_TY(rccl_bfloat8)
# endif
case ncclFloat32: CASE_TY(float)
case ncclFloat64: CASE_TY(double)
default: assert(0);
}
# undef CASE_TY
}
} // namespace
void
ncclVerifiableApplyBias(void* elts, void* bias, intptr_t elt_n, int elt_ty, int red_op,
intptr_t elt_ix0, cudaStream_t stream)
{
(void) red_op;
(void) elt_ix0;
applyBias1(elts, bias, elt_n, elt_ty, stream);
}
#endif
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -43,13 +43,13 @@ ncclVerifiablePremulScalar(int rank_me)
}
// Enqueue kernel to generate data which is to be reduced.
void
hipError_t
ncclVerifiablePrepareInput(void* elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
int rank_me, uint64_t seed, intptr_t elt_ix0,
cudaStream_t stream);
// Enqueue kernel to generate expected results of reduction.
void
hipError_t
ncclVerifiablePrepareExpected(void* elts, intptr_t elt_n, int elt_ty, int red_op,
int rank_n, uint64_t seed, intptr_t elt_ix0,
cudaStream_t stream);
@@ -60,8 +60,19 @@ ncclVerifiablePrepareExpected(void* elts, intptr_t elt_n, int elt_ty, int red_op
// which can be costly. Thus if you plan to run the same reduction multiple
// times it is advantageous to precompute the expected values with
// ncclVerifiablePrepareExpected and pass them as `expected` here.
void
hipError_t
ncclVerifiableVerify(void const* results, void const* expected, intptr_t elt_n,
int elt_ty, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
int64_t* bad_elt_n, cudaStream_t stream);
// Enqueue kernel that applies bias to expected results
void
ncclVerifiableApplyBias(void* elts, void* bias, intptr_t elt_n, int elt_ty, int red_op,
intptr_t elt_ix0, cudaStream_t stream);
#ifdef NCCL_VERIFIABLE_SELF_TEST
void
ncclVerifiableLaunchSelfTest();
#endif
#endif
@@ -1,5 +1,5 @@
# Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
# Modifications Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved.
# Modifications Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
@@ -14,17 +14,17 @@ TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
${HIPIFY_DIR}/verifiable.cpp: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cpp
@printf "Hipifying %-35s > %s\n" $< $@
@mkdir -p ${HIPIFY_DIR}
hipify-perl -quiet-warnings $< > $@
${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@
${HIPIFY_DIR}/verifiable.h: $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
@printf "Hipifying %-35s > %s\n" $< $@
@mkdir -p ${HIPIFY_DIR}
hipify-perl -quiet-warnings $< > $@
${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@
${HIPIFY_DIR}/rccl_float8.h: $(TEST_VERIFIABLE_SRCDIR)/../src/rccl_float8.h
@printf "Hipifying %-35s > %s\n" $< $@
@mkdir -p ${HIPIFY_DIR}
hipify-perl -quiet-warnings $< > $@
${HIPIFY_PL_EXE} ${HIPIFY_PL_FLAGS} $< > $@
$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(HIPIFY_DIR)/verifiable.cpp $(HIPIFY_DIR)/verifiable.h $(HIPIFY_DIR)/rccl_float8.h
@printf "Compiling %s\n" $@