Add all_reduce_bias_perf to support All Reduce with Bias (#130)

Use dynamic symbol loading of ncclAllReduceWithBias

Co-authored-by: mberenjk <146776561+mberenjk@users.noreply.github.com>
このコミットが含まれているのは:
Wenkai Du
2025-10-13 14:09:10 -07:00
committed by GitHub
コミット db6ea5a594
18個のファイルの変更237行の追加19行の削除
+1
ファイルの表示
@@ -145,3 +145,4 @@ add_rccl_test(reduce_scatter)
add_rccl_test(reduce)
add_rccl_test(scatter)
add_rccl_test(sendrecv)
add_rccl_test(all_reduce_bias)
+1 -1
ファイルの表示
@@ -135,7 +135,7 @@ HIPLDFLAGS += $(LIBRARIES:%=-l%)
DST_DIR := $(BUILDDIR)
SRC_FILES := $(wildcard *.cu)
OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv hypercube
BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv hypercube all_reduce_bias
BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf${NAME_SUFFIX})
GIT_VERSION_FILE := ${DST_DIR}/src/git_version.cpp
+1 -1
ファイルの表示
@@ -52,7 +52,7 @@ void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, doubl
*busBw = baseBw * factor;
}
testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
return testSuccess;
}
+1 -1
ファイルの表示
@@ -48,7 +48,7 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl
*busBw = baseBw * factor;
}
testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
return testSuccess;
}
+123
ファイルの表示
@@ -0,0 +1,123 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "cuda_runtime.h"
#include "common.h"
#include <dlfcn.h>
typedef ncclResult_t (*PFN_ncclAllReduceWithBias)(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream, const void* acc);
#define DECLARE_RCCL_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
DECLARE_RCCL_PFN(ncclAllReduceWithBias);
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
static void initOnceFunc() {
void *librccl = dlopen("librccl.so", RTLD_NOLOAD);
pfn_ncclAllReduceWithBias = (PFN_ncclAllReduceWithBias) dlsym(librccl, "ncclAllReduceWithBias");
}
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
*sendcount = count;
*recvcount = count;
*sendInplaceOffset = 0;
*recvInplaceOffset = 0;
*paramcount = *sendcount;
pthread_once(&initOnceControl, initOnceFunc);
}
testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int nranks = args->nProcs*args->nThreads*args->nGpus;
for (int i=0; i<args->nGpus; i++) {
CUDACHECK(cudaSetDevice(args->gpus[i]));
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
TESTCHECK(InitData(args->bias[i], sendcount, 0, type, op, rep+0x12345678, nranks, rank));
TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
TESTCHECK(InitDataApplyBias(args->expected[i], args->bias[i], recvcount, 0, type, op));
CUDACHECK(cudaDeviceSynchronize());
}
return testSuccess;
}
void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
*algBw = baseBw;
double factor = ((double)(2*(nranks - 1)))/((double)nranks);
*busBw = baseBw * factor;
}
testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
if (pfn_ncclAllReduceWithBias == nullptr) {
fprintf(stderr, "[ERROR] This version of RCCL doesn't support ncclAllReduceWithBias\n");
return testNcclError;
}
NCCLCHECK((*pfn_ncclAllReduceWithBias)(sendbuff, recvbuff, count, type, op, comm, stream, bias));
return testSuccess;
}
struct testColl allReduceTest = {
"AllReduce",
AllReduceGetCollByteCount,
AllReduceInitData,
AllReduceGetBw,
AllReduceRunColl
};
void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
}
testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &allReduceTest;
ncclDataType_t *run_types;
ncclRedOp_t *run_ops;
const char **run_typenames, **run_opnames;
int type_count, op_count;
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
type_count = test_typenum;
run_types = test_types;
run_typenames = test_typenames;
}
if ((int)op != -1) {
op_count = 1;
run_ops = &op;
run_opnames = &opName;
} else {
op_count = test_opnum;
run_ops = test_ops;
run_opnames = test_opnames;
}
for (int i=0; i<type_count; i++) {
for (int j=0; j<op_count; j++) {
#if defined(RCCL_FLOAT8)
if((run_types[i] == ncclFloat8e4m3 || run_types[i] == ncclFloat8e5m2) && (run_ops[j] == ncclProd || run_ops[j] == ncclAvg || strcmp(run_opnames[j],"mulsum") == 0))
continue;
#endif
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
}
}
return testSuccess;
}
struct testEngine ncclTestEngine = {
AllReduceGetBuffSize,
AllReduceRunTest
};
+1 -1
ファイルの表示
@@ -47,7 +47,7 @@ void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double
*busBw = baseBw * factor;
}
testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream));
return testSuccess;
}
+1 -1
ファイルの表示
@@ -84,7 +84,7 @@ void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, doubl
*busBw = baseBw * factor;
}
testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
int nranks;
NCCLCHECK(ncclCommCount(comm, &nranks));
int rank;
+1 -1
ファイルの表示
@@ -47,7 +47,7 @@ void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, doubl
*busBw = baseBw * factor;
}
testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
+20 -4
ファイルの表示
@@ -364,6 +364,11 @@ testResult_t InitDataReduce(void* data, const size_t count, const size_t offset,
return testSuccess;
}
testResult_t InitDataApplyBias(void* expected, void* bias, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op) {
ncclVerifiableApplyBias(expected, bias, count, (int)type, (int)op, offset, cudaStreamDefault);
return testSuccess;
}
testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) {
CUDACHECK(ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault));
return testSuccess;
@@ -469,7 +474,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i));
#if 1 && DEBUG_PRINT
#if 1 && defined(DEBUG_PRINT)
if (args->reportErrors && wrongPerGpu[i] != 0) {
printf("rank=%d #wrong=%d\n", rank, (int)wrongPerGpu[i]);
char *expectedHost = (char*)malloc(args->expectedBytes);
@@ -582,6 +587,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
char* sendBuff = ((char*)args->sendbuffs[i]) + shift;
char* bias = ((char*)args->bias[i]) + shift;
ncclRedOp_t op;
if(opIndex < ncclNumOps) {
@@ -629,7 +635,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
TESTCHECK(args->collTest->runColl(
(void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff),
(void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff),
count, type, op, root, args->comms[i], args->streams[i]));
count, type, op, root, args->comms[i], args->streams[i], bias));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
if(opIndex >= ncclNumOps) {
@@ -1060,7 +1066,7 @@ testResult_t threadLaunch(struct testThread* thread) {
return testSuccess;
}
testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, void **bias) {
if(enable_rotating_tensor) {
recvBytes = recvBytes + cache_bytes;
nbytes = nbytes + cache_bytes;
@@ -1069,22 +1075,26 @@ testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, s
if(HIP_VERSION >= 50700000) {
CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocUncached));
CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocUncached));
if (bias) CUDACHECK(hipExtMallocWithFlags(bias, nbytes, hipDeviceMallocUncached));
if (datacheck) CUDACHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocUncached));
}
else {
CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
if (bias) CUDACHECK(hipExtMallocWithFlags(bias, nbytes, hipDeviceMallocFinegrained));
if (datacheck) CUDACHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained));
}
}
else if (memorytype == ncclHost) {
CUDACHECK(hipHostMalloc(sendbuff, nbytes));
CUDACHECK(hipHostMalloc(recvbuff, nbytes));
if (bias) CUDACHECK(hipHostMalloc(bias, nbytes));
if (datacheck) CUDACHECK(hipHostMalloc(expected, recvBytes));
}
else if (memorytype == ncclManaged) {
CUDACHECK(cudaMallocManaged(sendbuff, nbytes));
CUDACHECK(cudaMallocManaged(recvbuff, nbytes));
if (bias) CUDACHECK(cudaMallocManaged(bias, nbytes));
if (datacheck) CUDACHECK(cudaMallocManaged(expected, recvBytes));
#if 0
CUDACHECK(cudaMemset(*sendbuff, 0, nbytes));
@@ -1096,14 +1106,17 @@ testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, s
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
NCCLCHECK(ncclMemAlloc(sendbuff, nbytes));
NCCLCHECK(ncclMemAlloc(recvbuff, nbytes));
if (bias) CUDACHECK(cudaMalloc(bias, nbytes));
if (datacheck) NCCLCHECK(ncclMemAlloc(expected, recvBytes));
#else
CUDACHECK(cudaMalloc(sendbuff, nbytes));
CUDACHECK(cudaMalloc(recvbuff, nbytes));
if (bias) CUDACHECK(cudaMalloc(bias, nbytes));
if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
#endif
}
CUDACHECK(hipMemset(*sendbuff, 1, nbytes));
if (bias) CUDACHECK(hipMemset(*bias, 1, nbytes));
if (datacheck) CUDACHECK(hipMemset(*expected, 1, recvBytes));
return testSuccess;
}
@@ -1554,6 +1567,7 @@ testResult_t run() {
std::vector<cudaStream_t> streams(nGpus*nThreads);
std::vector<void*> sendbuffs(nGpus*nThreads);
std::vector<void*> recvbuffs(nGpus*nThreads);
std::vector<void*> bias(nGpus*nThreads);
std::vector<void*> expected(nGpus*nThreads);
size_t sendBytes, recvBytes;
@@ -1564,7 +1578,7 @@ testResult_t run() {
for (int i=0; i<nGpus*nThreads; i++) {
gpus[i] = ((gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i)%numDevices;
CUDACHECK(cudaSetDevice(gpus[i]));
TESTCHECK(AllocateBuffs(sendbuffs.data()+i, sendBytes, recvbuffs.data()+i, recvBytes, expected.data()+i, (size_t)maxBytes));
TESTCHECK(AllocateBuffs(sendbuffs.data()+i, sendBytes, recvbuffs.data()+i, recvBytes, expected.data()+i, (size_t)maxBytes, bias.data()+i));
if (streamnull) {
streams[i] = NULL;
}
@@ -1699,6 +1713,7 @@ testResult_t run() {
threads[t].args.gpus=gpus.data()+t*nGpus;
threads[t].args.sendbuffs = sendbuffs.data()+t*nGpus;
threads[t].args.recvbuffs = recvbuffs.data()+t*nGpus;
threads[t].args.bias = bias.data()+t*nGpus;
threads[t].args.expected = expected.data()+t*nGpus;
threads[t].args.ncclId = ncclId;
threads[t].args.comms=comms+t*nGpus;
@@ -1764,6 +1779,7 @@ testResult_t run() {
#else
if (sendbuffs[i]) CUDACHECK(cudaFree((char*)sendbuffs[i]));
if (recvbuffs[i]) CUDACHECK(cudaFree((char*)recvbuffs[i]));
if (bias[i]) CUDACHECK(cudaFree((char*)bias[i]));
if (datacheck) CUDACHECK(cudaFree(expected[i]));
#endif
}
+5 -3
ファイルの表示
@@ -104,9 +104,9 @@ struct testColl {
testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type,
ncclRedOp_t op, int root, int rep, int in_place);
void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias);
testResult_t (*getAlgoProtoChannels)(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels);
};
extern struct testColl allReduceTest;
extern struct testColl allGatherTest;
@@ -173,6 +173,7 @@ struct threadArgs {
ncclUniqueId ncclId;
ncclComm_t* comms;
cudaStream_t* streams;
void** bias;
void** expected;
size_t expectedBytes;
@@ -199,8 +200,9 @@ struct testThread {
extern void Barrier(struct threadArgs* args);
extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root);
extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks);
extern testResult_t InitDataApplyBias(void* expected, void* bias, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op);
extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks);
extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks, void **bias);
#include <unistd.h>
+1 -1
ファイルの表示
@@ -45,7 +45,7 @@ void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double*
*busBw = baseBw * factor;
}
testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
int rank;
+1 -1
ファイルの表示
@@ -46,7 +46,7 @@ void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, doubl
*busBw = baseBw * factor;
}
testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
char* sbuff = (char*)sendbuff;
char* rbuff = (char*)recvbuff;
int nRanks;
+1 -1
ファイルの表示
@@ -48,7 +48,7 @@ void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double*
*busBw = baseBw;
}
testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));
return testSuccess;
}
+1 -1
ファイルの表示
@@ -51,7 +51,7 @@ void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, d
*busBw = baseBw * factor;
}
testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
return testSuccess;
}
+1 -1
ファイルの表示
@@ -41,7 +41,7 @@ void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double*
*busBw = baseBw * factor;
}
testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
int rank;
+1 -1
ファイルの表示
@@ -45,7 +45,7 @@ void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double
*busBw = baseBw * factor;
}
testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
int rank;
+70
ファイルの表示
@@ -1044,6 +1044,76 @@ hipError_t ncclVerifiablePrepareInput(
#undef CASE_OP
}
namespace {
template<typename T, typename ReduceFn>
__global__ void applyBias2(
T *elts, T *bias, intptr_t elt_n, ReduceFn op, intptr_t elt_ix0
) {
intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x;
intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
intptr_t i = i0 + threadIdx.x;
while(i < i1) {
elts[i] = op(elts[i], bias[i]);
#if 0
T output = genOutput<T>(op, rank_n, seed, elt_ix0+i);
printf("prepareInput2 T=%d seed=0x%llx r=%d ix=%lld x=%g output=%g elts=%p\n",
std::is_same<T,int>::value, (long long)seed, int(rank_me), (long long)i, (float)elts[i], (float)output, elts);
#endif
i += blockDim.x;
}
}
template<typename ReduceOp>
void applyBias1(
void *elts, void* bias, intptr_t elt_n, int elt_ty, ReduceOp op,
intptr_t elt_ix0, cudaStream_t stream
) {
int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
#define CASE_TY(T) applyBias2<<<block_n, 512, 0, stream>>>((T*)elts, (T*)bias, elt_n, op, elt_ix0); break;
switch(elt_ty) {
case ncclInt8: CASE_TY(int8_t)
case ncclUint8: CASE_TY(uint8_t)
case ncclInt32: CASE_TY(int32_t)
case ncclUint32: CASE_TY(uint32_t)
case ncclInt64: CASE_TY(int64_t)
case ncclUint64: CASE_TY(uint64_t)
case ncclFloat16: CASE_TY(__half)
#if HAVE_ncclBfloat16
case ncclBfloat16: CASE_TY(hip_bfloat16)
#endif
#if HAVE_ncclfp8
case ncclFp8E4M3: CASE_TY(rccl_float8)
case ncclFp8E5M2: CASE_TY(rccl_bfloat8)
#endif
case ncclFloat32: CASE_TY(float)
case ncclFloat64: CASE_TY(double)
default: assert(0);
}
#undef CASE_TY
}
}
void ncclVerifiableApplyBias(
void *elts, void* bias, intptr_t elt_n, int elt_ty, int red_op, intptr_t elt_ix0,
cudaStream_t stream
) {
#define CASE_OP(op) \
applyBias1(elts, bias, elt_n, elt_ty, op, elt_ix0, stream); \
break;
switch(red_op) {
case ncclSum: CASE_OP(ReduceSum())
case ncclMin: CASE_OP(ReduceMin())
case ncclMax: CASE_OP(ReduceMax())
case ncclProd: CASE_OP(ReduceProd())
#if HAVE_ncclPreMulSum
default: CASE_OP(ReducePreMulSum())
#endif
}
#undef CASE_OP
}
////////////////////////////////////////////////////////////////////////////////
namespace {
+6
ファイルの表示
@@ -64,6 +64,12 @@ hipError_t ncclVerifiableVerify(
int64_t *bad_elt_n, cudaStream_t stream
);
// Enqueue kernel that applies bias to expected results
void ncclVerifiableApplyBias(
void *elts, void* bias, intptr_t elt_n, int elt_ty, int red_op, intptr_t elt_ix0,
cudaStream_t stream
);
#ifdef NCCL_VERIFIABLE_SELF_TEST
void ncclVerifiableLaunchSelfTest();
#endif