Add all_reduce_bias_perf to support All Reduce with Bias (#130)
Use dynamic symbol loading of ncclAllReduceWithBias Co-authored-by: mberenjk <146776561+mberenjk@users.noreply.github.com>
このコミットが含まれているのは:
@@ -145,3 +145,4 @@ add_rccl_test(reduce_scatter)
|
||||
add_rccl_test(reduce)
|
||||
add_rccl_test(scatter)
|
||||
add_rccl_test(sendrecv)
|
||||
add_rccl_test(all_reduce_bias)
|
||||
|
||||
+1
-1
@@ -135,7 +135,7 @@ HIPLDFLAGS += $(LIBRARIES:%=-l%)
|
||||
DST_DIR := $(BUILDDIR)
|
||||
SRC_FILES := $(wildcard *.cu)
|
||||
OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
|
||||
BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv hypercube
|
||||
BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv hypercube all_reduce_bias
|
||||
BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf${NAME_SUFFIX})
|
||||
|
||||
GIT_VERSION_FILE := ${DST_DIR}/src/git_version.cpp
|
||||
|
||||
+1
-1
@@ -52,7 +52,7 @@ void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, doubl
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
|
||||
NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
+1
-1
@@ -48,7 +48,7 @@ void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, doubl
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
|
||||
NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,123 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
#include <dlfcn.h>
|
||||
|
||||
typedef ncclResult_t (*PFN_ncclAllReduceWithBias)(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream, const void* acc);
|
||||
#define DECLARE_RCCL_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr
|
||||
DECLARE_RCCL_PFN(ncclAllReduceWithBias);
|
||||
static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;
|
||||
|
||||
static void initOnceFunc() {
|
||||
void *librccl = dlopen("librccl.so", RTLD_NOLOAD);
|
||||
pfn_ncclAllReduceWithBias = (PFN_ncclAllReduceWithBias) dlsym(librccl, "ncclAllReduceWithBias");
|
||||
}
|
||||
|
||||
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
||||
*sendcount = count;
|
||||
*recvcount = count;
|
||||
*sendInplaceOffset = 0;
|
||||
*recvInplaceOffset = 0;
|
||||
*paramcount = *sendcount;
|
||||
pthread_once(&initOnceControl, initOnceFunc);
|
||||
}
|
||||
|
||||
testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
|
||||
size_t sendcount = args->sendBytes / wordSize(type);
|
||||
size_t recvcount = args->expectedBytes / wordSize(type);
|
||||
int nranks = args->nProcs*args->nThreads*args->nGpus;
|
||||
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
CUDACHECK(cudaSetDevice(args->gpus[i]));
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
|
||||
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
|
||||
TESTCHECK(InitData(args->bias[i], sendcount, 0, type, op, rep+0x12345678, nranks, rank));
|
||||
TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
|
||||
TESTCHECK(InitDataApplyBias(args->expected[i], args->bias[i], recvcount, 0, type, op));
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
|
||||
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
|
||||
|
||||
*algBw = baseBw;
|
||||
double factor = ((double)(2*(nranks - 1)))/((double)nranks);
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
|
||||
if (pfn_ncclAllReduceWithBias == nullptr) {
|
||||
fprintf(stderr, "[ERROR] This version of RCCL doesn't support ncclAllReduceWithBias\n");
|
||||
return testNcclError;
|
||||
}
|
||||
NCCLCHECK((*pfn_ncclAllReduceWithBias)(sendbuff, recvbuff, count, type, op, comm, stream, bias));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testColl allReduceTest = {
|
||||
"AllReduce",
|
||||
AllReduceGetCollByteCount,
|
||||
AllReduceInitData,
|
||||
AllReduceGetBw,
|
||||
AllReduceRunColl
|
||||
};
|
||||
|
||||
void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
|
||||
AllReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
|
||||
}
|
||||
|
||||
testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
|
||||
args->collTest = &allReduceTest;
|
||||
ncclDataType_t *run_types;
|
||||
ncclRedOp_t *run_ops;
|
||||
const char **run_typenames, **run_opnames;
|
||||
int type_count, op_count;
|
||||
|
||||
if ((int)type != -1) {
|
||||
type_count = 1;
|
||||
run_types = &type;
|
||||
run_typenames = &typeName;
|
||||
} else {
|
||||
type_count = test_typenum;
|
||||
run_types = test_types;
|
||||
run_typenames = test_typenames;
|
||||
}
|
||||
|
||||
if ((int)op != -1) {
|
||||
op_count = 1;
|
||||
run_ops = &op;
|
||||
run_opnames = &opName;
|
||||
} else {
|
||||
op_count = test_opnum;
|
||||
run_ops = test_ops;
|
||||
run_opnames = test_opnames;
|
||||
}
|
||||
|
||||
for (int i=0; i<type_count; i++) {
|
||||
for (int j=0; j<op_count; j++) {
|
||||
#if defined(RCCL_FLOAT8)
|
||||
if((run_types[i] == ncclFloat8e4m3 || run_types[i] == ncclFloat8e5m2) && (run_ops[j] == ncclProd || run_ops[j] == ncclAvg || strcmp(run_opnames[j],"mulsum") == 0))
|
||||
continue;
|
||||
#endif
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
|
||||
}
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testEngine ncclTestEngine = {
|
||||
AllReduceGetBuffSize,
|
||||
AllReduceRunTest
|
||||
};
|
||||
+1
-1
@@ -47,7 +47,7 @@ void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
|
||||
NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
+1
-1
@@ -84,7 +84,7 @@ void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, doubl
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
|
||||
int nranks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nranks));
|
||||
int rank;
|
||||
|
||||
+1
-1
@@ -47,7 +47,7 @@ void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, doubl
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
|
||||
int rank;
|
||||
NCCLCHECK(ncclCommUserRank(comm, &rank));
|
||||
#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
|
||||
|
||||
+20
-4
@@ -364,6 +364,11 @@ testResult_t InitDataReduce(void* data, const size_t count, const size_t offset,
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
testResult_t InitDataApplyBias(void* expected, void* bias, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op) {
|
||||
ncclVerifiableApplyBias(expected, bias, count, (int)type, (int)op, offset, cudaStreamDefault);
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) {
|
||||
CUDACHECK(ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault));
|
||||
return testSuccess;
|
||||
@@ -469,7 +474,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
|
||||
TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i));
|
||||
|
||||
#if 1 && DEBUG_PRINT
|
||||
#if 1 && defined(DEBUG_PRINT)
|
||||
if (args->reportErrors && wrongPerGpu[i] != 0) {
|
||||
printf("rank=%d #wrong=%d\n", rank, (int)wrongPerGpu[i]);
|
||||
char *expectedHost = (char*)malloc(args->expectedBytes);
|
||||
@@ -582,6 +587,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
|
||||
char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
|
||||
char* sendBuff = ((char*)args->sendbuffs[i]) + shift;
|
||||
char* bias = ((char*)args->bias[i]) + shift;
|
||||
ncclRedOp_t op;
|
||||
|
||||
if(opIndex < ncclNumOps) {
|
||||
@@ -629,7 +635,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
TESTCHECK(args->collTest->runColl(
|
||||
(void*)(in_place ? recvBuff + args->sendInplaceOffset*rank : sendBuff),
|
||||
(void*)(in_place ? recvBuff + args->recvInplaceOffset*rank : recvBuff),
|
||||
count, type, op, root, args->comms[i], args->streams[i]));
|
||||
count, type, op, root, args->comms[i], args->streams[i], bias));
|
||||
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
|
||||
if(opIndex >= ncclNumOps) {
|
||||
@@ -1060,7 +1066,7 @@ testResult_t threadLaunch(struct testThread* thread) {
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes) {
|
||||
testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, size_t recvBytes, void **expected, size_t nbytes, void **bias) {
|
||||
if(enable_rotating_tensor) {
|
||||
recvBytes = recvBytes + cache_bytes;
|
||||
nbytes = nbytes + cache_bytes;
|
||||
@@ -1069,22 +1075,26 @@ testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, s
|
||||
if(HIP_VERSION >= 50700000) {
|
||||
CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocUncached));
|
||||
CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocUncached));
|
||||
if (bias) CUDACHECK(hipExtMallocWithFlags(bias, nbytes, hipDeviceMallocUncached));
|
||||
if (datacheck) CUDACHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocUncached));
|
||||
}
|
||||
else {
|
||||
CUDACHECK(hipExtMallocWithFlags(sendbuff, nbytes, hipDeviceMallocFinegrained));
|
||||
CUDACHECK(hipExtMallocWithFlags(recvbuff, nbytes, hipDeviceMallocFinegrained));
|
||||
if (bias) CUDACHECK(hipExtMallocWithFlags(bias, nbytes, hipDeviceMallocFinegrained));
|
||||
if (datacheck) CUDACHECK(hipExtMallocWithFlags(expected, recvBytes, hipDeviceMallocFinegrained));
|
||||
}
|
||||
}
|
||||
else if (memorytype == ncclHost) {
|
||||
CUDACHECK(hipHostMalloc(sendbuff, nbytes));
|
||||
CUDACHECK(hipHostMalloc(recvbuff, nbytes));
|
||||
if (bias) CUDACHECK(hipHostMalloc(bias, nbytes));
|
||||
if (datacheck) CUDACHECK(hipHostMalloc(expected, recvBytes));
|
||||
}
|
||||
else if (memorytype == ncclManaged) {
|
||||
CUDACHECK(cudaMallocManaged(sendbuff, nbytes));
|
||||
CUDACHECK(cudaMallocManaged(recvbuff, nbytes));
|
||||
if (bias) CUDACHECK(cudaMallocManaged(bias, nbytes));
|
||||
if (datacheck) CUDACHECK(cudaMallocManaged(expected, recvBytes));
|
||||
#if 0
|
||||
CUDACHECK(cudaMemset(*sendbuff, 0, nbytes));
|
||||
@@ -1096,14 +1106,17 @@ testResult_t AllocateBuffs(void **sendbuff, size_t sendBytes, void **recvbuff, s
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
|
||||
NCCLCHECK(ncclMemAlloc(sendbuff, nbytes));
|
||||
NCCLCHECK(ncclMemAlloc(recvbuff, nbytes));
|
||||
if (bias) CUDACHECK(cudaMalloc(bias, nbytes));
|
||||
if (datacheck) NCCLCHECK(ncclMemAlloc(expected, recvBytes));
|
||||
#else
|
||||
CUDACHECK(cudaMalloc(sendbuff, nbytes));
|
||||
CUDACHECK(cudaMalloc(recvbuff, nbytes));
|
||||
if (bias) CUDACHECK(cudaMalloc(bias, nbytes));
|
||||
if (datacheck) CUDACHECK(cudaMalloc(expected, recvBytes));
|
||||
#endif
|
||||
}
|
||||
CUDACHECK(hipMemset(*sendbuff, 1, nbytes));
|
||||
if (bias) CUDACHECK(hipMemset(*bias, 1, nbytes));
|
||||
if (datacheck) CUDACHECK(hipMemset(*expected, 1, recvBytes));
|
||||
return testSuccess;
|
||||
}
|
||||
@@ -1554,6 +1567,7 @@ testResult_t run() {
|
||||
std::vector<cudaStream_t> streams(nGpus*nThreads);
|
||||
std::vector<void*> sendbuffs(nGpus*nThreads);
|
||||
std::vector<void*> recvbuffs(nGpus*nThreads);
|
||||
std::vector<void*> bias(nGpus*nThreads);
|
||||
std::vector<void*> expected(nGpus*nThreads);
|
||||
size_t sendBytes, recvBytes;
|
||||
|
||||
@@ -1564,7 +1578,7 @@ testResult_t run() {
|
||||
for (int i=0; i<nGpus*nThreads; i++) {
|
||||
gpus[i] = ((gpu0 != -1 ? gpu0 : localRank*nThreads*nGpus) + i)%numDevices;
|
||||
CUDACHECK(cudaSetDevice(gpus[i]));
|
||||
TESTCHECK(AllocateBuffs(sendbuffs.data()+i, sendBytes, recvbuffs.data()+i, recvBytes, expected.data()+i, (size_t)maxBytes));
|
||||
TESTCHECK(AllocateBuffs(sendbuffs.data()+i, sendBytes, recvbuffs.data()+i, recvBytes, expected.data()+i, (size_t)maxBytes, bias.data()+i));
|
||||
if (streamnull) {
|
||||
streams[i] = NULL;
|
||||
}
|
||||
@@ -1699,6 +1713,7 @@ testResult_t run() {
|
||||
threads[t].args.gpus=gpus.data()+t*nGpus;
|
||||
threads[t].args.sendbuffs = sendbuffs.data()+t*nGpus;
|
||||
threads[t].args.recvbuffs = recvbuffs.data()+t*nGpus;
|
||||
threads[t].args.bias = bias.data()+t*nGpus;
|
||||
threads[t].args.expected = expected.data()+t*nGpus;
|
||||
threads[t].args.ncclId = ncclId;
|
||||
threads[t].args.comms=comms+t*nGpus;
|
||||
@@ -1764,6 +1779,7 @@ testResult_t run() {
|
||||
#else
|
||||
if (sendbuffs[i]) CUDACHECK(cudaFree((char*)sendbuffs[i]));
|
||||
if (recvbuffs[i]) CUDACHECK(cudaFree((char*)recvbuffs[i]));
|
||||
if (bias[i]) CUDACHECK(cudaFree((char*)bias[i]));
|
||||
if (datacheck) CUDACHECK(cudaFree(expected[i]));
|
||||
#endif
|
||||
}
|
||||
|
||||
+5
-3
@@ -104,9 +104,9 @@ struct testColl {
|
||||
testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type,
|
||||
ncclRedOp_t op, int root, int rep, int in_place);
|
||||
void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
|
||||
testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
|
||||
testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias);
|
||||
testResult_t (*getAlgoProtoChannels)(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels);
|
||||
|
||||
};
|
||||
extern struct testColl allReduceTest;
|
||||
extern struct testColl allGatherTest;
|
||||
@@ -173,6 +173,7 @@ struct threadArgs {
|
||||
ncclUniqueId ncclId;
|
||||
ncclComm_t* comms;
|
||||
cudaStream_t* streams;
|
||||
void** bias;
|
||||
|
||||
void** expected;
|
||||
size_t expectedBytes;
|
||||
@@ -199,8 +200,9 @@ struct testThread {
|
||||
extern void Barrier(struct threadArgs* args);
|
||||
extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root);
|
||||
extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks);
|
||||
extern testResult_t InitDataApplyBias(void* expected, void* bias, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op);
|
||||
extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
|
||||
extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks);
|
||||
extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks, void **bias);
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
|
||||
+1
-1
@@ -45,7 +45,7 @@ void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double*
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
|
||||
int nRanks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
int rank;
|
||||
|
||||
+1
-1
@@ -46,7 +46,7 @@ void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, doubl
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
|
||||
char* sbuff = (char*)sendbuff;
|
||||
char* rbuff = (char*)recvbuff;
|
||||
int nRanks;
|
||||
|
||||
+1
-1
@@ -48,7 +48,7 @@ void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double*
|
||||
*busBw = baseBw;
|
||||
}
|
||||
|
||||
testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
|
||||
NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
+1
-1
@@ -51,7 +51,7 @@ void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, d
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
|
||||
NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
+1
-1
@@ -41,7 +41,7 @@ void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double*
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
|
||||
int nRanks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
int rank;
|
||||
|
||||
+1
-1
@@ -45,7 +45,7 @@ void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double
|
||||
*busBw = baseBw * factor;
|
||||
}
|
||||
|
||||
testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
||||
testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, void* bias = nullptr) {
|
||||
int nRanks;
|
||||
NCCLCHECK(ncclCommCount(comm, &nRanks));
|
||||
int rank;
|
||||
|
||||
@@ -1044,6 +1044,76 @@ hipError_t ncclVerifiablePrepareInput(
|
||||
#undef CASE_OP
|
||||
}
|
||||
|
||||
namespace {
|
||||
template<typename T, typename ReduceFn>
|
||||
__global__ void applyBias2(
|
||||
T *elts, T *bias, intptr_t elt_n, ReduceFn op, intptr_t elt_ix0
|
||||
) {
|
||||
intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
|
||||
i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x;
|
||||
intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
|
||||
i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
|
||||
intptr_t i = i0 + threadIdx.x;
|
||||
while(i < i1) {
|
||||
elts[i] = op(elts[i], bias[i]);
|
||||
#if 0
|
||||
T output = genOutput<T>(op, rank_n, seed, elt_ix0+i);
|
||||
printf("prepareInput2 T=%d seed=0x%llx r=%d ix=%lld x=%g output=%g elts=%p\n",
|
||||
std::is_same<T,int>::value, (long long)seed, int(rank_me), (long long)i, (float)elts[i], (float)output, elts);
|
||||
#endif
|
||||
i += blockDim.x;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename ReduceOp>
|
||||
void applyBias1(
|
||||
void *elts, void* bias, intptr_t elt_n, int elt_ty, ReduceOp op,
|
||||
intptr_t elt_ix0, cudaStream_t stream
|
||||
) {
|
||||
int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
|
||||
#define CASE_TY(T) applyBias2<<<block_n, 512, 0, stream>>>((T*)elts, (T*)bias, elt_n, op, elt_ix0); break;
|
||||
switch(elt_ty) {
|
||||
case ncclInt8: CASE_TY(int8_t)
|
||||
case ncclUint8: CASE_TY(uint8_t)
|
||||
case ncclInt32: CASE_TY(int32_t)
|
||||
case ncclUint32: CASE_TY(uint32_t)
|
||||
case ncclInt64: CASE_TY(int64_t)
|
||||
case ncclUint64: CASE_TY(uint64_t)
|
||||
case ncclFloat16: CASE_TY(__half)
|
||||
#if HAVE_ncclBfloat16
|
||||
case ncclBfloat16: CASE_TY(hip_bfloat16)
|
||||
#endif
|
||||
#if HAVE_ncclfp8
|
||||
case ncclFp8E4M3: CASE_TY(rccl_float8)
|
||||
case ncclFp8E5M2: CASE_TY(rccl_bfloat8)
|
||||
#endif
|
||||
case ncclFloat32: CASE_TY(float)
|
||||
case ncclFloat64: CASE_TY(double)
|
||||
default: assert(0);
|
||||
}
|
||||
#undef CASE_TY
|
||||
}
|
||||
}
|
||||
|
||||
void ncclVerifiableApplyBias(
|
||||
void *elts, void* bias, intptr_t elt_n, int elt_ty, int red_op, intptr_t elt_ix0,
|
||||
cudaStream_t stream
|
||||
) {
|
||||
#define CASE_OP(op) \
|
||||
applyBias1(elts, bias, elt_n, elt_ty, op, elt_ix0, stream); \
|
||||
break;
|
||||
switch(red_op) {
|
||||
case ncclSum: CASE_OP(ReduceSum())
|
||||
case ncclMin: CASE_OP(ReduceMin())
|
||||
case ncclMax: CASE_OP(ReduceMax())
|
||||
case ncclProd: CASE_OP(ReduceProd())
|
||||
#if HAVE_ncclPreMulSum
|
||||
default: CASE_OP(ReducePreMulSum())
|
||||
#endif
|
||||
}
|
||||
#undef CASE_OP
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace {
|
||||
|
||||
@@ -64,6 +64,12 @@ hipError_t ncclVerifiableVerify(
|
||||
int64_t *bad_elt_n, cudaStream_t stream
|
||||
);
|
||||
|
||||
// Enqueue kernel that applies bias to expected results
|
||||
void ncclVerifiableApplyBias(
|
||||
void *elts, void* bias, intptr_t elt_n, int elt_ty, int red_op, intptr_t elt_ix0,
|
||||
cudaStream_t stream
|
||||
);
|
||||
|
||||
#ifdef NCCL_VERIFIABLE_SELF_TEST
|
||||
void ncclVerifiableLaunchSelfTest();
|
||||
#endif
|
||||
|
||||
新しいイシューから参照
ユーザーをブロックする