8d887aad0d
[ROCm/rccl-tests commit: 5625599dda]
105 wiersze
3.6 KiB
Plaintext
105 wiersze
3.6 KiB
Plaintext
/*************************************************************************
|
|
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
|
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
|
*
|
|
* See LICENSE.txt for license information
|
|
************************************************************************/
|
|
|
|
#include "cuda_runtime.h"
|
|
#include "common.h"
|
|
|
|
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
|
*sendcount = count;
|
|
*recvcount = count;
|
|
*sendInplaceOffset = 0;
|
|
*recvInplaceOffset = 0;
|
|
*paramcount = *sendcount;
|
|
}
|
|
|
|
testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
|
|
size_t sendcount = args->sendBytes / wordSize(type);
|
|
size_t recvcount = args->expectedBytes / wordSize(type);
|
|
int nranks = args->nProcs*args->nThreads*args->nGpus;
|
|
|
|
for (int i=0; i<args->nGpus; i++) {
|
|
CUDACHECK(cudaSetDevice(args->gpus[i]));
|
|
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus + i);
|
|
CUDACHECK(cudaMemset(args->recvbuffs[i], 0, args->expectedBytes));
|
|
void* data = in_place ? args->recvbuffs[i] : args->sendbuffs[i];
|
|
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
|
|
TESTCHECK(InitDataReduce(args->expected[i], recvcount, 0, type, op, rep, nranks));
|
|
CUDACHECK(cudaDeviceSynchronize());
|
|
}
|
|
return testSuccess;
|
|
}
|
|
|
|
void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
|
|
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
|
|
|
|
*algBw = baseBw;
|
|
double factor = ((double)(2*(nranks - 1)))/((double)nranks);
|
|
*busBw = baseBw * factor;
|
|
}
|
|
|
|
testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
|
|
NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
|
|
return testSuccess;
|
|
}
|
|
|
|
struct testColl allReduceTest = {
|
|
"AllReduce",
|
|
AllReduceGetCollByteCount,
|
|
AllReduceInitData,
|
|
AllReduceGetBw,
|
|
AllReduceRunColl
|
|
};
|
|
|
|
void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
|
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
|
|
AllReduceGetCollByteCount(sendcount, recvcount, ¶mcount, &sendInplaceOffset, &recvInplaceOffset, count, /*eltSize=*/1, nranks);
|
|
}
|
|
|
|
testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
|
|
args->collTest = &allReduceTest;
|
|
ncclDataType_t *run_types;
|
|
ncclRedOp_t *run_ops;
|
|
const char **run_typenames, **run_opnames;
|
|
int type_count, op_count;
|
|
if((type == ncclFp8E4M3 || type == ncclFp8E5M2) && op == ncclProd)
|
|
return testSuccess;
|
|
|
|
if ((int)type != -1) {
|
|
type_count = 1;
|
|
run_types = &type;
|
|
run_typenames = &typeName;
|
|
} else {
|
|
type_count = test_typenum;
|
|
run_types = test_types;
|
|
run_typenames = test_typenames;
|
|
}
|
|
|
|
if ((int)op != -1) {
|
|
op_count = 1;
|
|
run_ops = &op;
|
|
run_opnames = &opName;
|
|
} else {
|
|
op_count = test_opnum;
|
|
run_ops = test_ops;
|
|
run_opnames = test_opnames;
|
|
}
|
|
|
|
for (int i=0; i<type_count; i++) {
|
|
for (int j=0; j<op_count; j++) {
|
|
if((i == ncclFp8E4M3 || i == ncclFp8E5M2) && j == ncclProd)
|
|
continue;
|
|
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
|
|
}
|
|
}
|
|
return testSuccess;
|
|
}
|
|
|
|
struct testEngine ncclTestEngine = {
|
|
AllReduceGetBuffSize,
|
|
AllReduceRunTest
|
|
};
|