Enable viewing algo/proto/channels used in rccl-tests output (#151)

* Enable algo/proto/channel viewing 

* Use dynamic symbol loading to avoid build and runtime issues with incompatible RCCL versions

* Reduce code duplication

[ROCm/rccl-tests commit: 0c94d4d2b3]
Этот коммит содержится в:
Mustafa Abduljabbar
2025-09-26 18:09:01 -04:00
коммит произвёл GitHub
родитель b07376b9ae
Коммит cb4b286d2b
14 изменённых файлов: 193 добавлений и 40 удалений
+1
Просмотреть файл
@@ -148,6 +148,7 @@ All tests support the same set of arguments :
* Parsing RCCL-Tests output
* `-Z,--output_format <csv|json>` Parse RCCL-Tests output as a CSV or JSON. Default : disabled.
* `-x,--output_file <output file name>` RCCL-Tests output file name. Default : disabled.
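* `-M,--output_algo_proto_channels <0/1>` Report the algorithm, protocol and number of channels for each message size. Collectives that do not support the query print `N/A`, and the option is ignored when the loaded RCCL library does not provide the required symbols. Default : 0.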
### Running multiple operations in parallel
+10 -1
Просмотреть файл
@@ -7,6 +7,7 @@
#include "cuda_runtime.h"
#include "common.h"
#include "rccl_compat.h"
void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
size_t base = (count/nranks) & -(16/eltSize);
@@ -36,6 +37,13 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
return testSuccess;
}
/*
 * Query the algorithm, protocol and channel count reported for an AllGather
 * of `count` elements of `type` on `comm`.
 *
 * The query goes through the rcclTestsGetAlgoInfo function pointer, which may
 * be NULL; in that case nothing is written to the outputs.
 *
 * comm      communicator the query is issued on
 * count     element count forwarded to the query
 * type      element data type forwarded to the query
 * algo      out: algorithm id
 * proto     out: protocol id
 * nchannels out: channel count
 *
 * Returns testInternalError when rcclTestsGetAlgoInfo is NULL, testSuccess
 * after a completed query. A failing query is handled by NCCLCHECK (macro
 * defined outside this block).
 */
testResult_t AllGatherGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
  if (rcclTestsGetAlgoInfo == NULL) return testInternalError;
  // The enumerator is spelled unqualified, like in the other collectives'
  // wrappers. The literals 0, 0, 1 fill the collNetSupport, nvlsSupport and
  // numPipeOps parameters of rcclTestsGetAlgoInfo_t.
  NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncAllGather, count, type, 0, 0, 1, algo, proto, nchannels));
  return testSuccess;
}
void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec;
@@ -54,7 +62,8 @@ struct testColl allGatherTest = {
AllGatherGetCollByteCount,
AllGatherInitData,
AllGatherGetBw,
AllGatherRunColl
AllGatherRunColl,
AllGatherGetAlgoProtoChannels
};
void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+9 -1
Просмотреть файл
@@ -7,6 +7,7 @@
#include "cuda_runtime.h"
#include "common.h"
#include "rccl_compat.h"
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
*sendcount = count;
@@ -33,6 +34,12 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc
return testSuccess;
}
/*
 * Ask the rcclTestsGetAlgoInfo function pointer which algorithm, protocol and
 * channel count apply to an AllReduce of `count` elements of `type` on `comm`.
 * Results are written through `algo`, `proto` and `nchannels`.
 *
 * Returns testInternalError when the function pointer is NULL, otherwise
 * testSuccess once the NCCLCHECK-wrapped query has completed.
 */
testResult_t AllReduceGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
  // Named after the corresponding rcclTestsGetAlgoInfo_t parameters.
  const int collNetSupport = 0;
  const int nvlsSupport = 0;
  const int numPipeOps = 1;

  if (!rcclTestsGetAlgoInfo) return testInternalError;

  NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncAllReduce, count, type,
                                 collNetSupport, nvlsSupport, numPipeOps,
                                 algo, proto, nchannels));
  return testSuccess;
}
void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
@@ -51,7 +58,8 @@ struct testColl allReduceTest = {
AllReduceGetCollByteCount,
AllReduceInitData,
AllReduceGetBw,
AllReduceRunColl
AllReduceRunColl,
AllReduceGetAlgoProtoChannels
};
void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+3 -1
Просмотреть файл
@@ -7,6 +7,7 @@
#include "cuda_runtime.h"
#include "common.h"
#include "rccl_compat.h"
void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
*paramcount = (count/nranks) & -(16/eltSize);
@@ -56,7 +57,8 @@ struct testColl alltoAllTest = {
AlltoAllGetCollByteCount,
AlltoAllInitData,
AlltoAllGetBw,
AlltoAllRunColl
AlltoAllRunColl,
NULL
};
void AlltoAllGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+3 -1
Просмотреть файл
@@ -7,6 +7,7 @@
#include "cuda_runtime.h"
#include "common.h"
#include "rccl_compat.h"
#define USE_RCCL_GATHER_SCATTER
@@ -156,7 +157,8 @@ struct testColl alltoAllTest = {
AlltoAllvGetCollByteCount,
AlltoAllvInitData,
AlltoAllvGetBw,
AlltoAllvRunColl
AlltoAllvRunColl,
NULL
};
void AlltoAllvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+9 -1
Просмотреть файл
@@ -7,6 +7,7 @@
#include "cuda_runtime.h"
#include "common.h"
#include "rccl_compat.h"
void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
*sendcount = count;
@@ -32,6 +33,12 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc
return testSuccess;
}
/*
 * Report the algorithm, protocol and channel count for a Broadcast of `count`
 * elements of `type` on `comm`, as returned by the rcclTestsGetAlgoInfo
 * function pointer. Outputs go to `algo`, `proto` and `nchannels`.
 *
 * Returns testSuccess after a completed query and testInternalError when the
 * function pointer is NULL.
 */
testResult_t BroadcastGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
  if (rcclTestsGetAlgoInfo != NULL) {
    // Arguments 5-7 are collNetSupport, nvlsSupport and numPipeOps
    // (see the rcclTestsGetAlgoInfo_t signature).
    NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncBroadcast, count, type, 0, 0, 1, algo, proto, nchannels));
    return testSuccess;
  }
  // Query entry point unavailable.
  return testInternalError;
}
void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
@@ -60,7 +67,8 @@ struct testColl broadcastTest = {
BroadcastGetCollByteCount,
BroadcastInitData,
BroadcastGetBw,
BroadcastRunColl
BroadcastRunColl,
BroadcastGetAlgoProtoChannels
};
void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+78 -29
Просмотреть файл
@@ -22,7 +22,7 @@
#include <vector>
#include <utility>
#include <errno.h> /* program_invocation_short_name */
#include <dlfcn.h>
//#define DEBUG_PRINT
#include "verifiable.h"
@@ -35,6 +35,24 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion()
int32_t gpu_block3;
size_t cache_bytes = 192 * 1024 * 1024; // Use 192MB
rcclTestsGetAlgoInfo_t rcclTestsGetAlgoInfo = NULL;
rcclTestsGetProtocolName_t rcclTestsGetProtocolName = NULL;
rcclTestsGetAlgoName_t rcclTestsGetAlgoName= NULL;
/*
 * Resolve the optional RCCL query entry points at runtime and store them in
 * rcclTestsGetAlgoInfo, rcclTestsGetAlgoName and rcclTestsGetProtocolName.
 *
 * The symbols are looked up with dlsym instead of being linked directly, so
 * a library that lacks them leaves the pointers NULL rather than failing.
 *
 * On return either all three pointers are non-NULL or all three are NULL, so
 * checking any one of them is enough to know whether the feature is usable.
 * A dlopen failure is reported on stderr; missing symbols are not reported.
 */
static void loadRcclSyms() {
  // The handle is cached across calls and never closed in this function.
  static void* handle = NULL;
  // Try the unversioned name first, then the versioned soname, which is the
  // only one present on installs that lack the development symlink.
  static const char* const libnames[] = {"librccl.so", "librccl.so.1"};

  if (!handle) {
    const size_t numNames = sizeof(libnames) / sizeof(libnames[0]);
    for (size_t i = 0; i < numNames && !handle; ++i) {
      handle = dlopen(libnames[i], RTLD_LAZY | RTLD_LOCAL);
    }
    if (!handle) {
      // dlerror() may return NULL; never pass that to %s.
      const char* err = dlerror();
      fprintf(stderr, "dlopen failed: %s\n", err ? err : "unknown error");
      return;
    }
  }

  rcclTestsGetAlgoInfo = (rcclTestsGetAlgoInfo_t) dlsym(handle, "rcclGetAlgoInfo");
  rcclTestsGetAlgoName = (rcclTestsGetAlgoName_t) dlsym(handle, "rcclGetAlgoName");
  rcclTestsGetProtocolName = (rcclTestsGetProtocolName_t) dlsym(handle, "rcclGetProtocolName");

  // All-or-nothing: callers that test only one pointer must not end up
  // calling through another one that failed to resolve.
  if (rcclTestsGetAlgoInfo == NULL || rcclTestsGetAlgoName == NULL || rcclTestsGetProtocolName == NULL) {
    rcclTestsGetAlgoInfo = NULL;
    rcclTestsGetAlgoName = NULL;
    rcclTestsGetProtocolName = NULL;
  }
}
// RCCL_FLOAT8 support
bool rccl_float8_useFnuz = false;
bool IsArchMatch(char const* arch, char const* target) {
@@ -109,6 +127,7 @@ static int nccltype = ncclFloat;
static int ncclroot = 0;
static int parallel_init = 0;
static int blocking_coll = 0;
static int output_algo_proto_channels = 0;
static int memorytype = 0;
static uint32_t cumask[4];
static int streamnull = 0;
@@ -944,8 +963,21 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
TESTCHECK(BenchTime(args, type, op, root, 0));
usleep(delay_inout_place);
}
if (enable_in_place)
if (enable_in_place)
TESTCHECK(BenchTime(args, type, op, root, 1));
if(output_algo_proto_channels) {
if(args->collTest->getAlgoProtoChannels) {
int algo, proto, nchannels;
const char* algoName = NULL;
const char* protoName = NULL;
TESTCHECK(args->collTest->getAlgoProtoChannels(args->comms[0], args->nbytes / wordSize(type), type, &algo, &proto, &nchannels));
NCCLCHECK(rcclTestsGetAlgoName(algo, &algoName));
NCCLCHECK(rcclTestsGetProtocolName(proto, &protoName));
PRINT("%8s %8s %10d", algoName, protoName, nchannels);
} else {
PRINT("%8s %8s %10s","N/A", "N/A", "N/A");
}
}
PRINT("\n");
}
--repeat;
@@ -1108,7 +1140,7 @@ int main(int argc, char* argv[]) {
}
#endif
#endif
loadRcclSyms();
// Parse args
double parsed;
int longindex;
@@ -1135,14 +1167,15 @@ int main(int argc, char* argv[]) {
{"report_cputime", required_argument, 0, 'C'},
{"average", required_argument, 0, 'a'},
{"local_register", required_argument, 0, 'R'},
{"memory_type", required_argument, 0, 'y'}, //RCCL
{"cumask", required_argument, 0, 'u'}, //RCCL
{"out_of_place", required_argument, 0, 'O'}, //RCCL
{"delay_inout_place", required_argument, 0, 'q'}, //RCCL
{"cache_flush", required_argument, 0, 'F'}, //RCCL
{"rotating_tensor", required_argument, 0, 'E'}, //RCCL
{"output_file", required_argument, 0, 'x'}, //RCCL
{"output_format", required_argument, 0, 'Z'}, //RCCL
{"memory_type", required_argument, 0, 'y'}, //RCCL
{"cumask", required_argument, 0, 'u'}, //RCCL
{"out_of_place", required_argument, 0, 'O'}, //RCCL
{"delay_inout_place", required_argument, 0, 'q'}, //RCCL
{"cache_flush", required_argument, 0, 'F'}, //RCCL
{"rotating_tensor", required_argument, 0, 'E'}, //RCCL
{"output_file", required_argument, 0, 'x'}, //RCCL
{"output_format", required_argument, 0, 'Z'}, //RCCL
{"output_algo_proto_channels", required_argument, 0, 'M'}, //RCCL
{"help", no_argument, 0, 'h'},
{}
};
@@ -1150,7 +1183,7 @@ int main(int argc, char* argv[]) {
while(1) {
int c;
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:G:C:a:R:Y:u:O:q:F:E:x:Z:h", longopts, &longindex);
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:G:C:a:R:Y:u:O:q:F:E:x:Z:M:h", longopts, &longindex);
if (c == -1)
break;
@@ -1290,6 +1323,10 @@ int main(int argc, char* argv[]) {
case 'Z':
output_format = optarg;
break;
case 'M':
output_algo_proto_channels = strtol(optarg, NULL, 0);
if(rcclTestsGetAlgoInfo == NULL || rcclTestsGetAlgoName == NULL || rcclTestsGetProtocolName == NULL) output_algo_proto_channels = 0;
break;
case 'h':
default:
if (c != 'h') printf("invalid option '%c'\n", c);
@@ -1607,27 +1644,39 @@ testResult_t run() {
}
fflush(stdout);
const char* extra_col_str[3] = {"", "", ""};
if (output_algo_proto_channels) {
extra_col_str[0] = "algo";
extra_col_str[1] = "proto";
extra_col_str[2] = "nchannels";
}
const char* header_col_str[3] = {" out-of-place in-place ",
" out-of-place "," in-place "};
int header_index =(enable_out_of_place && enable_in_place) ? 0 : (enable_out_of_place ? 1 : 2);
const char* timeStr = report_cputime ? "cputime" : "time";
PRINT("#\n");
PRINT("# %10s %12s %8s %6s %6s%s\n", "", "", "", "", "", header_col_str[header_index]);
if (enable_out_of_place && enable_in_place) {
PRINT("# %10s %12s %8s %6s %6s out-of-place in-place \n", "", "", "", "", "");
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s %7s %6s %6s %6s\n", "size", "count", "type", "redop", "root",
timeStr, "algbw", "busbw", "#wrong", timeStr, "algbw", "busbw", "#wrong");
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
} else if (enable_out_of_place) {
PRINT("# %10s %12s %8s %6s %6s out-of-place \n", "", "", "", "", "");
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s\n", "size", "count", "type", "redop", "root",
timeStr, "algbw", "busbw", "#wrong");
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
"(us)", "(GB/s)", "(GB/s)", "");
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s %7s %6s %6s %6s %8s %8s %10s\n",
"size", "count", "type", "redop", "root",
timeStr, "algbw", "busbw", "#wrong",
timeStr, "algbw", "busbw", "#wrong",
extra_col_str[0], extra_col_str[1], extra_col_str[2]);
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s %8s %8s %10s\n",
"(B)", "(elements)", "", "", "",
"(us)", "(GB/s)", "(GB/s)", "",
"(us)", "(GB/s)", "(GB/s)", "",
"", "", "");
} else {
PRINT("# %10s %12s %8s %6s %6s in-place \n", "", "", "", "", "");
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s\n", "size", "count", "type", "redop", "root",
timeStr, "algbw", "busbw", "#wrong");
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
"(us)", "(GB/s)", "(GB/s)", "");
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s %8s %8s %10s\n",
"size", "count", "type", "redop", "root",
timeStr, "algbw", "busbw", "#wrong",
extra_col_str[0], extra_col_str[1], extra_col_str[2]);
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %8s %8s %10s\n",
"(B)", "(elements)", "", "", "",
"(us)", "(GB/s)", "(GB/s)", "",
"", "", "");
}
Reporter reporter(output_file, output_format);
+21 -1
Просмотреть файл
@@ -7,7 +7,6 @@
************************************************************************/
#ifndef __COMMON_H__
#define __COMMON_H__
#include "rccl/rccl.h"
#include <stdio.h>
#include <cstdint>
@@ -107,6 +106,7 @@ struct testColl {
void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
testResult_t (*getAlgoProtoChannels)(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels);
};
extern struct testColl allReduceTest;
extern struct testColl allGatherTest;
@@ -375,4 +375,24 @@ extern int is_main_proc;
extern thread_local int is_main_thread;
#define PRINT if (is_main_thread) printf
// Collective identifiers used as the `coll` argument of rcclTestsGetAlgoInfo_t.
// NOTE(review): presumably mirrors an enum inside the RCCL library that the
// query function belongs to, in which case the numeric values must stay in
// sync with that library — confirm before adding or reordering entries.
typedef enum {
ncclFuncBroadcast = 0,
ncclFuncReduce = 1,
ncclFuncAllGather = 2,
ncclFuncReduceScatter = 3,
ncclFuncAllReduce = 4,
ncclFuncAllReduceWithBias = 5,
ncclFuncSendRecv = 6,
ncclFuncSend = 7,
ncclFuncRecv = 8,
ncclFuncAllToAllPivot = 9,
ncclNumFuncs = 10 // equals the number of identifiers listed above (0..9)
} ncclFunc_t;
// Function-pointer type for the algorithm/protocol/channel query.
// Inputs: communicator, collective id, element count, data type, and the
// collNetSupport / nvlsSupport / numPipeOps integers.
// Outputs (through pointers): algo, protocol, maxChannels.
// NOTE(review): the exact meaning of the three integer inputs is defined by
// the library providing the function — confirm against its documentation.
typedef ncclResult_t (*rcclTestsGetAlgoInfo_t)(struct ncclComm* comm, ncclFunc_t coll, uint64_t count, ncclDataType_t dataType,
int collNetSupport, int nvlsSupport, int numPipeOps,
int* algo, int* protocol, int* maxChannels);
// Function-pointer type taking an algorithm id and returning a string for it
// through `algoName`.
typedef ncclResult_t (*rcclTestsGetAlgoName_t)(int algo, const char** algoName);
// Function-pointer type taking a protocol id and returning a string for it
// through `protocolName`.
typedef ncclResult_t (*rcclTestsGetProtocolName_t)(int protocol, const char** protocolName);
#endif
+3 -1
Просмотреть файл
@@ -7,6 +7,7 @@
#include "cuda_runtime.h"
#include "common.h"
#include "rccl_compat.h"
void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
*sendcount = (count/nranks) & -(16/eltSize);
@@ -69,7 +70,8 @@ struct testColl gatherTest = {
GatherGetCollByteCount,
GatherInitData,
GatherGetBw,
GatherRunColl
GatherRunColl,
NULL
};
void GatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+30
Просмотреть файл
@@ -0,0 +1,30 @@
/* ************************************************************************
* Copyright (C) 2016-2025 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
* ies of the Software, and to permit persons to whom the Software is furnished
* to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
* PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
* CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ************************************************************************ */
#ifndef RCCL_COMPAT_H
#define RCCL_COMPAT_H
// Declarations of the function pointers used to query algorithm, protocol
// and channel information. Each pointer may be NULL, so callers must check
// before calling through it.
// NOTE(review): this header includes nothing itself, so the rcclTests*_t
// types must already be declared by the including file — confirm that
// "common.h" is always included before this header.
extern rcclTestsGetAlgoInfo_t rcclTestsGetAlgoInfo;
extern rcclTestsGetProtocolName_t rcclTestsGetProtocolName;
extern rcclTestsGetAlgoName_t rcclTestsGetAlgoName;
#endif
+10 -1
Просмотреть файл
@@ -7,6 +7,7 @@
#include "cuda_runtime.h"
#include "common.h"
#include "rccl_compat.h"
void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
*sendcount = count;
@@ -34,6 +35,13 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
return testSuccess;
}
/*
 * Look up the algorithm, protocol and channel count for a Reduce of `count`
 * elements of `type` on `comm` via the rcclTestsGetAlgoInfo function pointer,
 * storing the answers in `algo`, `proto` and `nchannels`.
 *
 * Returns testInternalError if the function pointer is NULL; testSuccess
 * once the NCCLCHECK-wrapped query has completed.
 */
testResult_t ReduceGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
  // Without the query entry point there is nothing to report.
  if (rcclTestsGetAlgoInfo == nullptr) {
    return testInternalError;
  }

  // Named after the corresponding rcclTestsGetAlgoInfo_t parameters.
  const int collNetSupport = 0;
  const int nvlsSupport = 0;
  const int numPipeOps = 1;
  NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncReduce, count, type, collNetSupport, nvlsSupport, numPipeOps, algo, proto, nchannels));

  return testSuccess;
}
void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
*algBw = baseBw;
@@ -50,7 +58,8 @@ struct testColl reduceTest = {
ReduceGetCollByteCount,
ReduceInitData,
ReduceGetBw,
ReduceRunColl
ReduceRunColl,
ReduceGetAlgoProtoChannels
};
void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+10 -1
Просмотреть файл
@@ -7,6 +7,7 @@
#include "cuda_runtime.h"
#include "common.h"
#include "rccl_compat.h"
void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
size_t base = (count/nranks) & -(16/eltSize);
@@ -35,6 +36,13 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type,
return testSuccess;
}
/*
 * Obtain the algorithm, protocol and channel count for a ReduceScatter of
 * `count` elements of `type` on `comm`. The values come from the
 * rcclTestsGetAlgoInfo function pointer and are written to `algo`, `proto`
 * and `nchannels`.
 *
 * Returns testInternalError when that pointer is NULL and testSuccess after
 * the NCCLCHECK-wrapped query has completed.
 */
testResult_t ReduceScatterGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) {
  // Take a local copy of the pointer so the check and the call use the same value.
  rcclTestsGetAlgoInfo_t const queryAlgoInfo = rcclTestsGetAlgoInfo;
  if (queryAlgoInfo == NULL) return testInternalError;

  // Trailing 0, 0, 1 are collNetSupport, nvlsSupport and numPipeOps.
  NCCLCHECK(queryAlgoInfo(comm, ncclFuncReduceScatter, count, type, 0, 0, 1, algo, proto, nchannels));
  return testSuccess;
}
void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec;
@@ -53,7 +61,8 @@ struct testColl reduceScatterTest = {
ReduceScatterGetCollByteCount,
ReduceScatterInitData,
ReduceScatterGetBw,
ReduceScatterRunColl
ReduceScatterRunColl,
ReduceScatterGetAlgoProtoChannels
};
void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+3 -1
Просмотреть файл
@@ -7,6 +7,7 @@
#include "cuda_runtime.h"
#include "common.h"
#include "rccl_compat.h"
void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
*recvcount = (count/nranks) & -(16/eltSize);
@@ -65,7 +66,8 @@ struct testColl scatterTest = {
ScatterGetCollByteCount,
ScatterInitData,
ScatterGetBw,
ScatterRunColl
ScatterRunColl,
NULL
};
void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
+3 -1
Просмотреть файл
@@ -7,6 +7,7 @@
#include "cuda_runtime.h"
#include "common.h"
#include "rccl_compat.h"
void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) {
*sendcount = count;
@@ -64,7 +65,8 @@ struct testColl sendRecvTest = {
SendRecvGetCollByteCount,
SendRecvInitData,
SendRecvGetBw,
SendRecvRunColl
SendRecvRunColl,
NULL
};
void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {