Enable viewing algo/proto/channels used in rccl-tests output (#151)
* Enable algo/proto/channel viewing
* Use dynamic symbol loading to avoid build/runtime issues with non-compatible RCCL versions
* Reduce code duplication
[ROCm/rccl-tests commit: 0c94d4d2b3]
This commit is contained in:
committed by
GitHub
parent
b07376b9ae
commit
cb4b286d2b
@@ -148,6 +148,7 @@ All tests support the same set of arguments :
|
||||
* Parsing RCCL-Tests output
|
||||
* `-Z,--output_format <csv|json>` Parse RCCL-Tests output as a CSV or JSON. Default : disabled.
|
||||
* `-x,--output_file <output file name>` RCCL-Tests output file name. Default : disabled.
|
||||
* `-M,--output_algo_proto_channels <0/1>` Report Algorithm/Protocol/Channels for each message size. Default : 0.
|
||||
|
||||
### Running multiple operations in parallel
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
#include "rccl_compat.h"
|
||||
|
||||
void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
||||
size_t base = (count/nranks) & -(16/eltSize);
|
||||
@@ -36,6 +37,13 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
// Reports which algorithm, protocol and channel count the loaded RCCL library
// selects for an AllGather of `count` elements of `type` on `comm`.
//
// Results are written through `algo`, `proto` and `nchannels`.
// Returns testInternalError when the rcclTestsGetAlgoInfo function pointer is
// NULL (i.e. the query entry point is unavailable), testSuccess otherwise.
// A failing query is handled by NCCLCHECK.
testResult_t AllGatherGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
  if (rcclTestsGetAlgoInfo == NULL) return testInternalError;
  // The collective id is written as a bare enumerator, matching the other
  // *GetAlgoProtoChannels helpers. The three literals are, in order,
  // collNetSupport = 0, nvlsSupport = 0 and numPipeOps = 1.
  NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncAllGather, count, type, 0, 0, 1, algo, proto, nchannels));
  return testSuccess;
}
|
||||
|
||||
|
||||
void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
|
||||
double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec;
|
||||
|
||||
@@ -54,7 +62,8 @@ struct testColl allGatherTest = {
|
||||
AllGatherGetCollByteCount,
|
||||
AllGatherInitData,
|
||||
AllGatherGetBw,
|
||||
AllGatherRunColl
|
||||
AllGatherRunColl,
|
||||
AllGatherGetAlgoProtoChannels
|
||||
};
|
||||
|
||||
void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
#include "rccl_compat.h"
|
||||
|
||||
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
||||
*sendcount = count;
|
||||
@@ -33,6 +34,12 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
// Asks the loaded RCCL library which algorithm, protocol and channel count it
// would use for an AllReduce of `count` elements of `type` on `comm`, storing
// the answers in `algo`, `proto` and `nchannels`.
// Yields testInternalError if the query function pointer was never set.
testResult_t AllReduceGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
  if (!rcclTestsGetAlgoInfo) {
    return testInternalError;
  }

  // Fixed inputs for the query, named after the parameters they feed.
  const int collNetSupport = 0;
  const int nvlsSupport = 0;
  const int numPipeOps = 1;

  NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncAllReduce, count, type,
                                 collNetSupport, nvlsSupport, numPipeOps,
                                 algo, proto, nchannels));
  return testSuccess;
}
|
||||
|
||||
void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
|
||||
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
|
||||
|
||||
@@ -51,7 +58,8 @@ struct testColl allReduceTest = {
|
||||
AllReduceGetCollByteCount,
|
||||
AllReduceInitData,
|
||||
AllReduceGetBw,
|
||||
AllReduceRunColl
|
||||
AllReduceRunColl,
|
||||
AllReduceGetAlgoProtoChannels
|
||||
};
|
||||
|
||||
void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
#include "rccl_compat.h"
|
||||
|
||||
void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
||||
*paramcount = (count/nranks) & -(16/eltSize);
|
||||
@@ -56,7 +57,8 @@ struct testColl alltoAllTest = {
|
||||
AlltoAllGetCollByteCount,
|
||||
AlltoAllInitData,
|
||||
AlltoAllGetBw,
|
||||
AlltoAllRunColl
|
||||
AlltoAllRunColl,
|
||||
NULL
|
||||
};
|
||||
|
||||
void AlltoAllGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
#include "rccl_compat.h"
|
||||
|
||||
#define USE_RCCL_GATHER_SCATTER
|
||||
|
||||
@@ -156,7 +157,8 @@ struct testColl alltoAllTest = {
|
||||
AlltoAllvGetCollByteCount,
|
||||
AlltoAllvInitData,
|
||||
AlltoAllvGetBw,
|
||||
AlltoAllvRunColl
|
||||
AlltoAllvRunColl,
|
||||
NULL
|
||||
};
|
||||
|
||||
void AlltoAllvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
#include "rccl_compat.h"
|
||||
|
||||
void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
||||
*sendcount = count;
|
||||
@@ -32,6 +33,12 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
// Asks the loaded RCCL library which algorithm, protocol and channel count it
// would use for a Broadcast of `count` elements of `type` on `comm`, storing
// the answers in `algo`, `proto` and `nchannels`.
// Yields testInternalError if the query function pointer was never set.
testResult_t BroadcastGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
  if (!rcclTestsGetAlgoInfo) {
    return testInternalError;
  }

  // Fixed inputs for the query, named after the parameters they feed.
  const int collNetSupport = 0;
  const int nvlsSupport = 0;
  const int numPipeOps = 1;

  NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncBroadcast, count, type,
                                 collNetSupport, nvlsSupport, numPipeOps,
                                 algo, proto, nchannels));
  return testSuccess;
}
|
||||
|
||||
void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
|
||||
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
|
||||
|
||||
@@ -60,7 +67,8 @@ struct testColl broadcastTest = {
|
||||
BroadcastGetCollByteCount,
|
||||
BroadcastInitData,
|
||||
BroadcastGetBw,
|
||||
BroadcastRunColl
|
||||
BroadcastRunColl,
|
||||
BroadcastGetAlgoProtoChannels
|
||||
};
|
||||
|
||||
void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
|
||||
@@ -22,7 +22,7 @@
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <errno.h> /* program_invocation_short_name */
|
||||
|
||||
#include <dlfcn.h>
|
||||
//#define DEBUG_PRINT
|
||||
|
||||
#include "verifiable.h"
|
||||
@@ -35,6 +35,24 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion()
|
||||
int32_t gpu_block3;
|
||||
size_t cache_bytes = 192 * 1024 * 1024; // Use 192MB
|
||||
|
||||
rcclTestsGetAlgoInfo_t rcclTestsGetAlgoInfo = NULL;
|
||||
rcclTestsGetProtocolName_t rcclTestsGetProtocolName = NULL;
|
||||
rcclTestsGetAlgoName_t rcclTestsGetAlgoName= NULL;
|
||||
static void loadRcclSyms() {
|
||||
static void* handle = NULL;
|
||||
const char* libname = "librccl.so";
|
||||
if (!handle) {
|
||||
handle = dlopen(libname, RTLD_LAZY | RTLD_LOCAL);
|
||||
if (!handle) {
|
||||
fprintf(stderr, "dlopen failed: %s\n", dlerror());
|
||||
return;
|
||||
}
|
||||
}
|
||||
rcclTestsGetAlgoInfo = (rcclTestsGetAlgoInfo_t) dlsym(handle, "rcclGetAlgoInfo");
|
||||
rcclTestsGetAlgoName = (rcclTestsGetAlgoName_t) dlsym(handle, "rcclGetAlgoName");
|
||||
rcclTestsGetProtocolName = (rcclTestsGetProtocolName_t) dlsym(handle, "rcclGetProtocolName");
|
||||
}
|
||||
|
||||
// RCCL_FLOAT8 support
|
||||
bool rccl_float8_useFnuz = false;
|
||||
bool IsArchMatch(char const* arch, char const* target) {
|
||||
@@ -109,6 +127,7 @@ static int nccltype = ncclFloat;
|
||||
static int ncclroot = 0;
|
||||
static int parallel_init = 0;
|
||||
static int blocking_coll = 0;
|
||||
static int output_algo_proto_channels = 0;
|
||||
static int memorytype = 0;
|
||||
static uint32_t cumask[4];
|
||||
static int streamnull = 0;
|
||||
@@ -944,8 +963,21 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
|
||||
TESTCHECK(BenchTime(args, type, op, root, 0));
|
||||
usleep(delay_inout_place);
|
||||
}
|
||||
if (enable_in_place)
|
||||
if (enable_in_place)
|
||||
TESTCHECK(BenchTime(args, type, op, root, 1));
|
||||
if(output_algo_proto_channels) {
|
||||
if(args->collTest->getAlgoProtoChannels) {
|
||||
int algo, proto, nchannels;
|
||||
const char* algoName = NULL;
|
||||
const char* protoName = NULL;
|
||||
TESTCHECK(args->collTest->getAlgoProtoChannels(args->comms[0], args->nbytes / wordSize(type), type, &algo, &proto, &nchannels));
|
||||
NCCLCHECK(rcclTestsGetAlgoName(algo, &algoName));
|
||||
NCCLCHECK(rcclTestsGetProtocolName(proto, &protoName));
|
||||
PRINT("%8s %8s %10d", algoName, protoName, nchannels);
|
||||
} else {
|
||||
PRINT("%8s %8s %10s","N/A", "N/A", "N/A");
|
||||
}
|
||||
}
|
||||
PRINT("\n");
|
||||
}
|
||||
--repeat;
|
||||
@@ -1108,7 +1140,7 @@ int main(int argc, char* argv[]) {
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
loadRcclSyms();
|
||||
// Parse args
|
||||
double parsed;
|
||||
int longindex;
|
||||
@@ -1135,14 +1167,15 @@ int main(int argc, char* argv[]) {
|
||||
{"report_cputime", required_argument, 0, 'C'},
|
||||
{"average", required_argument, 0, 'a'},
|
||||
{"local_register", required_argument, 0, 'R'},
|
||||
{"memory_type", required_argument, 0, 'y'}, //RCCL
|
||||
{"cumask", required_argument, 0, 'u'}, //RCCL
|
||||
{"out_of_place", required_argument, 0, 'O'}, //RCCL
|
||||
{"delay_inout_place", required_argument, 0, 'q'}, //RCCL
|
||||
{"cache_flush", required_argument, 0, 'F'}, //RCCL
|
||||
{"rotating_tensor", required_argument, 0, 'E'}, //RCCL
|
||||
{"output_file", required_argument, 0, 'x'}, //RCCL
|
||||
{"output_format", required_argument, 0, 'Z'}, //RCCL
|
||||
{"memory_type", required_argument, 0, 'y'}, //RCCL
|
||||
{"cumask", required_argument, 0, 'u'}, //RCCL
|
||||
{"out_of_place", required_argument, 0, 'O'}, //RCCL
|
||||
{"delay_inout_place", required_argument, 0, 'q'}, //RCCL
|
||||
{"cache_flush", required_argument, 0, 'F'}, //RCCL
|
||||
{"rotating_tensor", required_argument, 0, 'E'}, //RCCL
|
||||
{"output_file", required_argument, 0, 'x'}, //RCCL
|
||||
{"output_format", required_argument, 0, 'Z'}, //RCCL
|
||||
{"output_algo_proto_channels", required_argument, 0, 'M'}, //RCCL
|
||||
{"help", no_argument, 0, 'h'},
|
||||
{}
|
||||
};
|
||||
@@ -1150,7 +1183,7 @@ int main(int argc, char* argv[]) {
|
||||
while(1) {
|
||||
int c;
|
||||
|
||||
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:G:C:a:R:Y:u:O:q:F:E:x:Z:h", longopts, &longindex);
|
||||
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:G:C:a:R:Y:u:O:q:F:E:x:Z:M:h", longopts, &longindex);
|
||||
|
||||
if (c == -1)
|
||||
break;
|
||||
@@ -1290,6 +1323,10 @@ int main(int argc, char* argv[]) {
|
||||
case 'Z':
|
||||
output_format = optarg;
|
||||
break;
|
||||
case 'M':
|
||||
output_algo_proto_channels = strtol(optarg, NULL, 0);
|
||||
if(rcclTestsGetAlgoInfo == NULL || rcclTestsGetAlgoName == NULL || rcclTestsGetProtocolName == NULL) output_algo_proto_channels = 0;
|
||||
break;
|
||||
case 'h':
|
||||
default:
|
||||
if (c != 'h') printf("invalid option '%c'\n", c);
|
||||
@@ -1607,27 +1644,39 @@ testResult_t run() {
|
||||
}
|
||||
|
||||
fflush(stdout);
|
||||
|
||||
const char* extra_col_str[3] = {"", "", ""};
|
||||
if (output_algo_proto_channels) {
|
||||
extra_col_str[0] = "algo";
|
||||
extra_col_str[1] = "proto";
|
||||
extra_col_str[2] = "nchannels";
|
||||
}
|
||||
const char* header_col_str[3] = {" out-of-place in-place ",
|
||||
" out-of-place "," in-place "};
|
||||
int header_index =(enable_out_of_place && enable_in_place) ? 0 : (enable_out_of_place ? 1 : 2);
|
||||
const char* timeStr = report_cputime ? "cputime" : "time";
|
||||
|
||||
PRINT("#\n");
|
||||
PRINT("# %10s %12s %8s %6s %6s%s\n", "", "", "", "", "", header_col_str[header_index]);
|
||||
if (enable_out_of_place && enable_in_place) {
|
||||
PRINT("# %10s %12s %8s %6s %6s out-of-place in-place \n", "", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s %7s %6s %6s %6s\n", "size", "count", "type", "redop", "root",
|
||||
timeStr, "algbw", "busbw", "#wrong", timeStr, "algbw", "busbw", "#wrong");
|
||||
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
} else if (enable_out_of_place) {
|
||||
PRINT("# %10s %12s %8s %6s %6s out-of-place \n", "", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s\n", "size", "count", "type", "redop", "root",
|
||||
timeStr, "algbw", "busbw", "#wrong");
|
||||
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "");
|
||||
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s %7s %6s %6s %6s %8s %8s %10s\n",
|
||||
"size", "count", "type", "redop", "root",
|
||||
timeStr, "algbw", "busbw", "#wrong",
|
||||
timeStr, "algbw", "busbw", "#wrong",
|
||||
extra_col_str[0], extra_col_str[1], extra_col_str[2]);
|
||||
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s %8s %8s %10s\n",
|
||||
"(B)", "(elements)", "", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "",
|
||||
"", "", "");
|
||||
} else {
|
||||
PRINT("# %10s %12s %8s %6s %6s in-place \n", "", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s\n", "size", "count", "type", "redop", "root",
|
||||
timeStr, "algbw", "busbw", "#wrong");
|
||||
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "");
|
||||
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s %8s %8s %10s\n",
|
||||
"size", "count", "type", "redop", "root",
|
||||
timeStr, "algbw", "busbw", "#wrong",
|
||||
extra_col_str[0], extra_col_str[1], extra_col_str[2]);
|
||||
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %8s %8s %10s\n",
|
||||
"(B)", "(elements)", "", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "",
|
||||
"", "", "");
|
||||
}
|
||||
Reporter reporter(output_file, output_format);
|
||||
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
************************************************************************/
|
||||
#ifndef __COMMON_H__
|
||||
#define __COMMON_H__
|
||||
|
||||
#include "rccl/rccl.h"
|
||||
#include <stdio.h>
|
||||
#include <cstdint>
|
||||
@@ -107,6 +106,7 @@ struct testColl {
|
||||
void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
|
||||
testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
|
||||
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
|
||||
testResult_t (*getAlgoProtoChannels)(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels);
|
||||
};
|
||||
extern struct testColl allReduceTest;
|
||||
extern struct testColl allGatherTest;
|
||||
@@ -375,4 +375,24 @@ extern int is_main_proc;
|
||||
extern thread_local int is_main_thread;
|
||||
#define PRINT if (is_main_thread) printf
|
||||
|
||||
typedef enum {
|
||||
ncclFuncBroadcast = 0,
|
||||
ncclFuncReduce = 1,
|
||||
ncclFuncAllGather = 2,
|
||||
ncclFuncReduceScatter = 3,
|
||||
ncclFuncAllReduce = 4,
|
||||
ncclFuncAllReduceWithBias = 5,
|
||||
ncclFuncSendRecv = 6,
|
||||
ncclFuncSend = 7,
|
||||
ncclFuncRecv = 8,
|
||||
ncclFuncAllToAllPivot = 9,
|
||||
ncclNumFuncs = 10
|
||||
} ncclFunc_t;
|
||||
|
||||
typedef ncclResult_t (*rcclTestsGetAlgoInfo_t)(struct ncclComm* comm, ncclFunc_t coll, uint64_t count, ncclDataType_t dataType,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps,
|
||||
int* algo, int* protocol, int* maxChannels);
|
||||
typedef ncclResult_t (*rcclTestsGetAlgoName_t)(int algo, const char** algoName);
|
||||
typedef ncclResult_t (*rcclTestsGetProtocolName_t)(int protocol, const char** protocolName);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
#include "rccl_compat.h"
|
||||
|
||||
void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
||||
*sendcount = (count/nranks) & -(16/eltSize);
|
||||
@@ -69,7 +70,8 @@ struct testColl gatherTest = {
|
||||
GatherGetCollByteCount,
|
||||
GatherInitData,
|
||||
GatherGetBw,
|
||||
GatherRunColl
|
||||
GatherRunColl,
|
||||
NULL
|
||||
};
|
||||
|
||||
void GatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
/* ************************************************************************
|
||||
* Copyright (C) 2016-2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
|
||||
* ies of the Software, and to permit persons to whom the Software is furnished
|
||||
* to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in all
|
||||
* copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
|
||||
* PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
||||
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
||||
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
||||
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
|
||||
* CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ************************************************************************ */
|
||||
|
||||
// Declarations of the function pointers used to reach optional RCCL
// algorithm-introspection entry points.
#ifndef RCCL_COMPAT_H
|
||||
#define RCCL_COMPAT_H
|
||||
|
||||
// The pointer types below are not declared in this header, so the including
// file must make them visible first.
// NOTE(review): they appear to come from common.h - confirm, and consider
// including it here so this header is self-contained.
// Queries algorithm / protocol / channel count for a collective.
extern rcclTestsGetAlgoInfo_t rcclTestsGetAlgoInfo;
|
||||
// Maps a protocol id to its name.
extern rcclTestsGetProtocolName_t rcclTestsGetProtocolName;
|
||||
// Maps an algorithm id to its name.
extern rcclTestsGetAlgoName_t rcclTestsGetAlgoName;
|
||||
|
||||
#endif
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
#include "rccl_compat.h"
|
||||
|
||||
void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
||||
*sendcount = count;
|
||||
@@ -34,6 +35,13 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
// Asks the loaded RCCL library which algorithm, protocol and channel count it
// would use for a Reduce of `count` elements of `type` on `comm`, storing the
// answers in `algo`, `proto` and `nchannels`.
// Yields testInternalError if the query function pointer was never set.
testResult_t ReduceGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
  if (!rcclTestsGetAlgoInfo) {
    return testInternalError;
  }

  // Fixed inputs for the query, named after the parameters they feed.
  const int collNetSupport = 0;
  const int nvlsSupport = 0;
  const int numPipeOps = 1;

  NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncReduce, count, type,
                                 collNetSupport, nvlsSupport, numPipeOps,
                                 algo, proto, nchannels));
  return testSuccess;
}
|
||||
|
||||
|
||||
void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
|
||||
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
|
||||
*algBw = baseBw;
|
||||
@@ -50,7 +58,8 @@ struct testColl reduceTest = {
|
||||
ReduceGetCollByteCount,
|
||||
ReduceInitData,
|
||||
ReduceGetBw,
|
||||
ReduceRunColl
|
||||
ReduceRunColl,
|
||||
ReduceGetAlgoProtoChannels
|
||||
};
|
||||
|
||||
void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
#include "rccl_compat.h"
|
||||
|
||||
void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
||||
size_t base = (count/nranks) & -(16/eltSize);
|
||||
@@ -35,6 +36,13 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type,
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
// Asks the loaded RCCL library which algorithm, protocol and channel count it
// would use for a ReduceScatter of `count` elements of `type` on `comm`,
// storing the answers in `algo`, `proto` and `nchannels`.
// Yields testInternalError if the query function pointer was never set.
testResult_t ReduceScatterGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
  if (!rcclTestsGetAlgoInfo) {
    return testInternalError;
  }

  // Fixed inputs for the query, named after the parameters they feed.
  const int collNetSupport = 0;
  const int nvlsSupport = 0;
  const int numPipeOps = 1;

  NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncReduceScatter, count, type,
                                 collNetSupport, nvlsSupport, numPipeOps,
                                 algo, proto, nchannels));
  return testSuccess;
}
|
||||
|
||||
|
||||
void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
|
||||
double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec;
|
||||
|
||||
@@ -53,7 +61,8 @@ struct testColl reduceScatterTest = {
|
||||
ReduceScatterGetCollByteCount,
|
||||
ReduceScatterInitData,
|
||||
ReduceScatterGetBw,
|
||||
ReduceScatterRunColl
|
||||
ReduceScatterRunColl,
|
||||
ReduceScatterGetAlgoProtoChannels
|
||||
};
|
||||
|
||||
void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
#include "rccl_compat.h"
|
||||
|
||||
void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
||||
*recvcount = (count/nranks) & -(16/eltSize);
|
||||
@@ -65,7 +66,8 @@ struct testColl scatterTest = {
|
||||
ScatterGetCollByteCount,
|
||||
ScatterInitData,
|
||||
ScatterGetBw,
|
||||
ScatterRunColl
|
||||
ScatterRunColl,
|
||||
NULL
|
||||
};
|
||||
|
||||
void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
#include "common.h"
|
||||
#include "rccl_compat.h"
|
||||
|
||||
void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
|
||||
*sendcount = count;
|
||||
@@ -64,7 +65,8 @@ struct testColl sendRecvTest = {
|
||||
SendRecvGetCollByteCount,
|
||||
SendRecvInitData,
|
||||
SendRecvGetBw,
|
||||
SendRecvRunColl
|
||||
SendRecvRunColl,
|
||||
NULL
|
||||
};
|
||||
|
||||
void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
|
||||
|
||||
Reference in New Issue
Block a user