From 0c94d4d2b3fda463ec30d2b469ae25375dbad2a8 Mon Sep 17 00:00:00 2001 From: Mustafa Abduljabbar Date: Fri, 26 Sep 2025 18:09:01 -0400 Subject: [PATCH] Enable viewing algo/proto/channels used in rccl-tests output (#151) * Enable algo/proto/channel viewing * Use dynamic symbol loading to avoid build/runtime issues with non-compatible RCCL versions * Reduce code duplication --- README.md | 1 + src/all_gather.cu | 11 ++++- src/all_reduce.cu | 10 +++- src/alltoall.cu | 4 +- src/alltoallv.cu | 4 +- src/broadcast.cu | 10 +++- src/common.cu | 107 ++++++++++++++++++++++++++++++------------ src/common.h | 22 ++++++++- src/gather.cu | 4 +- src/rccl_compat.h | 30 ++++++++++++ src/reduce.cu | 11 ++++- src/reduce_scatter.cu | 11 ++++- src/scatter.cu | 4 +- src/sendrecv.cu | 4 +- 14 files changed, 193 insertions(+), 40 deletions(-) create mode 100644 src/rccl_compat.h diff --git a/README.md b/README.md index a3a4336870..6ae8c81db9 100644 --- a/README.md +++ b/README.md @@ -148,6 +148,7 @@ All tests support the same set of arguments : * Parsing RCCL-Tests output * `-Z,--output_format ` Parse RCCL-Tests output as a CSV or JSON. Default : disabled. * `-x,--output_file ` RCCL-Tests output file name. Default : disabled. + * `-M,--output_algo_proto_channels <0/1>` Report Algorithm/Protocol/Channels for each message size. Default : 0. 
### Running multiple operations in parallel diff --git a/src/all_gather.cu b/src/all_gather.cu index 6fe7a9214d..dbbd977ec0 100644 --- a/src/all_gather.cu +++ b/src/all_gather.cu @@ -7,6 +7,7 @@ #include "cuda_runtime.h" #include "common.h" +#include "rccl_compat.h" void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { size_t base = (count/nranks) & -(16/eltSize); @@ -36,6 +37,13 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc return testSuccess; } +testResult_t AllGatherGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) { + if(rcclTestsGetAlgoInfo == NULL) return testInternalError; + NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFunc_t::ncclFuncAllGather , count, type , 0, 0, 1, algo, proto, nchannels)); + return testSuccess; +} + + void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec; @@ -54,7 +62,8 @@ struct testColl allGatherTest = { AllGatherGetCollByteCount, AllGatherInitData, AllGatherGetBw, - AllGatherRunColl + AllGatherRunColl, + AllGatherGetAlgoProtoChannels }; void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { diff --git a/src/all_reduce.cu b/src/all_reduce.cu index d64371bbb2..038188a74e 100644 --- a/src/all_reduce.cu +++ b/src/all_reduce.cu @@ -7,6 +7,7 @@ #include "cuda_runtime.h" #include "common.h" +#include "rccl_compat.h" void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { *sendcount = count; @@ -33,6 +34,12 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc return testSuccess; } +testResult_t 
AllReduceGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) { + if(rcclTestsGetAlgoInfo == NULL) return testInternalError; + NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncAllReduce , count, type , 0, 0, 1, algo, proto, nchannels)); + return testSuccess; +} + void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { double baseBw = (double)(count * typesize) / 1.0E9 / sec; @@ -51,7 +58,8 @@ struct testColl allReduceTest = { AllReduceGetCollByteCount, AllReduceInitData, AllReduceGetBw, - AllReduceRunColl + AllReduceRunColl, + AllReduceGetAlgoProtoChannels }; void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { diff --git a/src/alltoall.cu b/src/alltoall.cu index 02b0ae77f3..eeab700902 100644 --- a/src/alltoall.cu +++ b/src/alltoall.cu @@ -7,6 +7,7 @@ #include "cuda_runtime.h" #include "common.h" +#include "rccl_compat.h" void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { *paramcount = (count/nranks) & -(16/eltSize); @@ -56,7 +57,8 @@ struct testColl alltoAllTest = { AlltoAllGetCollByteCount, AlltoAllInitData, AlltoAllGetBw, - AlltoAllRunColl + AlltoAllRunColl, + NULL }; void AlltoAllGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { diff --git a/src/alltoallv.cu b/src/alltoallv.cu index bb335eda87..8195826f31 100644 --- a/src/alltoallv.cu +++ b/src/alltoallv.cu @@ -7,6 +7,7 @@ #include "cuda_runtime.h" #include "common.h" +#include "rccl_compat.h" #define USE_RCCL_GATHER_SCATTER @@ -156,7 +157,8 @@ struct testColl alltoAllTest = { AlltoAllvGetCollByteCount, AlltoAllvInitData, AlltoAllvGetBw, - AlltoAllvRunColl + AlltoAllvRunColl, + NULL }; void AlltoAllvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { diff --git a/src/broadcast.cu 
b/src/broadcast.cu index bc8c5512ff..18d09b7285 100644 --- a/src/broadcast.cu +++ b/src/broadcast.cu @@ -7,6 +7,7 @@ #include "cuda_runtime.h" #include "common.h" +#include "rccl_compat.h" void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { *sendcount = count; @@ -32,6 +33,12 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc return testSuccess; } +testResult_t BroadcastGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) { + if(rcclTestsGetAlgoInfo == NULL) return testInternalError; + NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncBroadcast , count, type , 0, 0, 1, algo, proto, nchannels)); + return testSuccess; +} + void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { double baseBw = (double)(count * typesize) / 1.0E9 / sec; @@ -60,7 +67,8 @@ struct testColl broadcastTest = { BroadcastGetCollByteCount, BroadcastInitData, BroadcastGetBw, - BroadcastRunColl + BroadcastRunColl, + BroadcastGetAlgoProtoChannels }; void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { diff --git a/src/common.cu b/src/common.cu index 0b16556203..3934d22463 100644 --- a/src/common.cu +++ b/src/common.cu @@ -22,7 +22,7 @@ #include #include #include /* program_invocation_short_name */ - +#include <dlfcn.h> //#define DEBUG_PRINT #include "verifiable.h" @@ -35,6 +35,24 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion() int32_t gpu_block3; size_t cache_bytes = 192 * 1024 * 1024; // Use 192MB +rcclTestsGetAlgoInfo_t rcclTestsGetAlgoInfo = NULL; +rcclTestsGetProtocolName_t rcclTestsGetProtocolName = NULL; +rcclTestsGetAlgoName_t rcclTestsGetAlgoName= NULL; +static void loadRcclSyms() { + /* Look the optional introspection entry points up in the image this process already has loaded, + * so they come from the RCCL instance the tests are linked against whenever it exports them. + * Only when that lookup finds nothing, fall back to opening librccl.so by name (previous behaviour). + * The handle is intentionally never closed: the resolved function pointers are used until exit. */ + void* handle = dlopen(NULL, RTLD_LAZY); + if (handle == NULL || dlsym(handle, "rcclGetAlgoInfo") == NULL) { + handle = dlopen("librccl.so", RTLD_LAZY | RTLD_LOCAL); + if (handle == NULL) { fprintf(stderr, "dlopen failed: %s\n", dlerror()); return; } + } + rcclTestsGetAlgoInfo = (rcclTestsGetAlgoInfo_t) dlsym(handle, "rcclGetAlgoInfo"); + rcclTestsGetAlgoName = (rcclTestsGetAlgoName_t) dlsym(handle, "rcclGetAlgoName"); + rcclTestsGetProtocolName = (rcclTestsGetProtocolName_t) dlsym(handle, "rcclGetProtocolName"); +} + // RCCL_FLOAT8 support bool rccl_float8_useFnuz = false; bool IsArchMatch(char const* arch, char const* target) {
@@ -109,6 +127,7 @@ static int nccltype = ncclFloat; static int ncclroot = 0; static int parallel_init = 0; static int blocking_coll = 0; +static int output_algo_proto_channels = 0; static int memorytype = 0; static uint32_t cumask[4]; static int streamnull = 0; @@ -944,8 +963,21 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* TESTCHECK(BenchTime(args, type, op, root, 0)); usleep(delay_inout_place); } - if (enable_in_place) + if (enable_in_place) TESTCHECK(BenchTime(args, type, op, root, 1)); + if(output_algo_proto_channels) { + if(args->collTest->getAlgoProtoChannels) { + int algo, proto, nchannels; + const char* algoName = NULL; + const char* protoName = NULL; + TESTCHECK(args->collTest->getAlgoProtoChannels(args->comms[0], args->nbytes / wordSize(type), type, &algo, &proto, &nchannels)); + NCCLCHECK(rcclTestsGetAlgoName(algo, &algoName)); + NCCLCHECK(rcclTestsGetProtocolName(proto, &protoName)); + PRINT("%8s %8s %10d", algoName, protoName, nchannels); + } else { + PRINT("%8s %8s %10s","N/A", "N/A", "N/A"); + } + } PRINT("\n"); } --repeat; @@ -1108,7 +1140,7 @@ int main(int argc, char* argv[]) { } #endif #endif - + loadRcclSyms(); // Parse args double parsed; int longindex; @@ -1135,14 +1167,15 @@ int main(int argc, char* argv[]) { {"report_cputime", required_argument, 0, 'C'}, {"average", required_argument, 0, 'a'}, {"local_register", required_argument, 0, 'R'}, - {"memory_type", required_argument, 0, 'y'}, //RCCL -
{"cumask", required_argument, 0, 'u'}, //RCCL - {"out_of_place", required_argument, 0, 'O'}, //RCCL - {"delay_inout_place", required_argument, 0, 'q'}, //RCCL - {"cache_flush", required_argument, 0, 'F'}, //RCCL - {"rotating_tensor", required_argument, 0, 'E'}, //RCCL - {"output_file", required_argument, 0, 'x'}, //RCCL - {"output_format", required_argument, 0, 'Z'}, //RCCL + {"memory_type", required_argument, 0, 'y'}, //RCCL + {"cumask", required_argument, 0, 'u'}, //RCCL + {"out_of_place", required_argument, 0, 'O'}, //RCCL + {"delay_inout_place", required_argument, 0, 'q'}, //RCCL + {"cache_flush", required_argument, 0, 'F'}, //RCCL + {"rotating_tensor", required_argument, 0, 'E'}, //RCCL + {"output_file", required_argument, 0, 'x'}, //RCCL + {"output_format", required_argument, 0, 'Z'}, //RCCL + {"output_algo_proto_channels", required_argument, 0, 'M'}, //RCCL {"help", no_argument, 0, 'h'}, {} }; @@ -1150,7 +1183,7 @@ int main(int argc, char* argv[]) { while(1) { int c; - c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:G:C:a:R:Y:u:O:q:F:E:x:Z:h", longopts, &longindex); + c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:N:p:c:o:d:r:z:y:T:G:C:a:R:Y:u:O:q:F:E:x:Z:M:h", longopts, &longindex); if (c == -1) break; @@ -1290,6 +1323,10 @@ int main(int argc, char* argv[]) { case 'Z': output_format = optarg; break; + case 'M': + output_algo_proto_channels = strtol(optarg, NULL, 0); + if(output_algo_proto_channels && (rcclTestsGetAlgoInfo == NULL || rcclTestsGetAlgoName == NULL || rcclTestsGetProtocolName == NULL)) { /* tell the user why the extra columns will be missing instead of dropping the request silently */ fprintf(stderr, "Warning: -M/--output_algo_proto_channels ignored: rcclGetAlgoInfo/rcclGetAlgoName/rcclGetProtocolName not found in the loaded RCCL\n"); output_algo_proto_channels = 0; } + break; case 'h': default: if (c != 'h') printf("invalid option '%c'\n", c); @@ -1607,27 +1644,39 @@ testResult_t run() { } fflush(stdout); - + const char* extra_col_str[3] = {"", "", ""}; + if (output_algo_proto_channels) { + extra_col_str[0] = "algo"; + extra_col_str[1] = "proto"; + extra_col_str[2] = "nchannels"; + } + const char* header_col_str[3] = {" out-of-place in-place ", + " out-of-place "," in-place "}; + int header_index 
=(enable_out_of_place && enable_in_place) ? 0 : (enable_out_of_place ? 1 : 2); const char* timeStr = report_cputime ? "cputime" : "time"; + PRINT("#\n"); + PRINT("# %10s %12s %8s %6s %6s%s\n", "", "", "", "", "", header_col_str[header_index]); if (enable_out_of_place && enable_in_place) { - PRINT("# %10s %12s %8s %6s %6s out-of-place in-place \n", "", "", "", "", ""); - PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s %7s %6s %6s %6s\n", "size", "count", "type", "redop", "root", - timeStr, "algbw", "busbw", "#wrong", timeStr, "algbw", "busbw", "#wrong"); - PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "", - "(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", ""); - } else if (enable_out_of_place) { - PRINT("# %10s %12s %8s %6s %6s out-of-place \n", "", "", "", "", ""); - PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s\n", "size", "count", "type", "redop", "root", - timeStr, "algbw", "busbw", "#wrong"); - PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "", - "(us)", "(GB/s)", "(GB/s)", ""); + PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s %7s %6s %6s %6s %8s %8s %10s\n", + "size", "count", "type", "redop", "root", + timeStr, "algbw", "busbw", "#wrong", + timeStr, "algbw", "busbw", "#wrong", + extra_col_str[0], extra_col_str[1], extra_col_str[2]); + PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s %8s %8s %10s\n", + "(B)", "(elements)", "", "", "", + "(us)", "(GB/s)", "(GB/s)", "", + "(us)", "(GB/s)", "(GB/s)", "", + "", "", ""); } else { - PRINT("# %10s %12s %8s %6s %6s in-place \n", "", "", "", "", ""); - PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s\n", "size", "count", "type", "redop", "root", - timeStr, "algbw", "busbw", "#wrong"); - PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "", - "(us)", "(GB/s)", "(GB/s)", ""); + PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s %8s %8s %10s\n", + "size", "count", "type", "redop", "root", + 
timeStr, "algbw", "busbw", "#wrong", + extra_col_str[0], extra_col_str[1], extra_col_str[2]); + PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %8s %8s %10s\n", + "(B)", "(elements)", "", "", "", + "(us)", "(GB/s)", "(GB/s)", "", + "", "", ""); } Reporter reporter(output_file, output_format); diff --git a/src/common.h b/src/common.h index 97ea1829f5..645d3a1f0a 100644 --- a/src/common.h +++ b/src/common.h @@ -7,7 +7,6 @@ ************************************************************************/ #ifndef __COMMON_H__ #define __COMMON_H__ - #include "rccl/rccl.h" #include #include @@ -107,6 +106,7 @@ struct testColl { void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks); testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); + testResult_t (*getAlgoProtoChannels)(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels); }; extern struct testColl allReduceTest; extern struct testColl allGatherTest; @@ -375,4 +375,24 @@ extern int is_main_proc; extern thread_local int is_main_thread; #define PRINT if (is_main_thread) printf +typedef enum { /* Collective identifiers handed to rcclGetAlgoInfo as its `coll` argument. NOTE(review): presumably these values must match the enum of the RCCL library loaded at run time - confirm when upgrading RCCL. */ + ncclFuncBroadcast = 0, + ncclFuncReduce = 1, + ncclFuncAllGather = 2, + ncclFuncReduceScatter = 3, + ncclFuncAllReduce = 4, + ncclFuncAllReduceWithBias = 5, + ncclFuncSendRecv = 6, + ncclFuncSend = 7, + ncclFuncRecv = 8, + ncclFuncAllToAllPivot = 9, + ncclNumFuncs = 10 +} ncclFunc_t; +/* Function-pointer types for the optional RCCL entry points (rcclGetAlgoInfo, rcclGetAlgoName, rcclGetProtocolName) that common.cu resolves with dlsym(); the matching global pointers stay NULL when a symbol is not found. */ +typedef ncclResult_t (*rcclTestsGetAlgoInfo_t)(struct ncclComm* comm, ncclFunc_t coll, uint64_t count, ncclDataType_t dataType, + int collNetSupport, int nvlsSupport, int numPipeOps, + int* algo, int* protocol, int* maxChannels); +typedef ncclResult_t (*rcclTestsGetAlgoName_t)(int algo, const char** algoName); +typedef ncclResult_t (*rcclTestsGetProtocolName_t)(int protocol, const char** protocolName); + #endif diff --git a/src/gather.cu b/src/gather.cu index 
662e2d47e8..a0dc00de56 100644 --- a/src/gather.cu +++ b/src/gather.cu @@ -7,6 +7,7 @@ #include "cuda_runtime.h" #include "common.h" +#include "rccl_compat.h" void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { *sendcount = (count/nranks) & -(16/eltSize); @@ -69,7 +70,8 @@ struct testColl gatherTest = { GatherGetCollByteCount, GatherInitData, GatherGetBw, - GatherRunColl + GatherRunColl, + NULL }; void GatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { diff --git a/src/rccl_compat.h b/src/rccl_compat.h new file mode 100644 index 0000000000..4e132774d1 --- /dev/null +++ b/src/rccl_compat.h @@ -0,0 +1,30 @@ +/* ************************************************************************ + * Copyright (C) 2016-2025 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- + * ies of the Software, and to permit persons to whom the Software is furnished + * to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- + * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- + * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ************************************************************************ */ + +#ifndef RCCL_COMPAT_H +#define RCCL_COMPAT_H +#include "common.h" /* declares the rcclTests*_t pointer types used below, so this header no longer depends on include order */ +extern rcclTestsGetAlgoInfo_t rcclTestsGetAlgoInfo; +extern rcclTestsGetProtocolName_t rcclTestsGetProtocolName; +extern rcclTestsGetAlgoName_t rcclTestsGetAlgoName; + +#endif diff --git a/src/reduce.cu b/src/reduce.cu index f8c059e140..c2353c3fc0 100644 --- a/src/reduce.cu +++ b/src/reduce.cu @@ -7,6 +7,7 @@ #include "cuda_runtime.h" #include "common.h" +#include "rccl_compat.h" void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { *sendcount = count; @@ -34,6 +35,13 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRe return testSuccess; } +testResult_t ReduceGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) { + if(rcclTestsGetAlgoInfo == NULL) return testInternalError; + NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncReduce , count, type , 0, 0, 1, algo, proto, nchannels)); + return testSuccess; +} + + void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { double baseBw = (double)(count * typesize) / 1.0E9 / sec; *algBw = baseBw; @@ -50,7 +58,8 @@ struct testColl reduceTest = { ReduceGetCollByteCount, ReduceInitData, ReduceGetBw, - ReduceRunColl + ReduceRunColl, + ReduceGetAlgoProtoChannels }; void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu index 2e04cc7456..fe906ce372 
100644 --- a/src/reduce_scatter.cu +++ b/src/reduce_scatter.cu @@ -7,6 +7,7 @@ #include "cuda_runtime.h" #include "common.h" +#include "rccl_compat.h" void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { size_t base = (count/nranks) & -(16/eltSize); @@ -35,6 +36,13 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, return testSuccess; } +testResult_t ReduceScatterGetAlgoProtoChannels(ncclComm_t comm, size_t count, ncclDataType_t type, int* algo, int* proto, int* nchannels) { + if(rcclTestsGetAlgoInfo == NULL) return testInternalError; + NCCLCHECK(rcclTestsGetAlgoInfo(comm, ncclFuncReduceScatter , count, type , 0, 0, 1, algo, proto, nchannels)); + return testSuccess; +} + + void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) { double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec; @@ -53,7 +61,8 @@ struct testColl reduceScatterTest = { ReduceScatterGetCollByteCount, ReduceScatterInitData, ReduceScatterGetBw, - ReduceScatterRunColl + ReduceScatterRunColl, + ReduceScatterGetAlgoProtoChannels }; void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) { diff --git a/src/scatter.cu b/src/scatter.cu index d93663ced7..d0323fa36d 100644 --- a/src/scatter.cu +++ b/src/scatter.cu @@ -7,6 +7,7 @@ #include "cuda_runtime.h" #include "common.h" +#include "rccl_compat.h" void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { *recvcount = (count/nranks) & -(16/eltSize); @@ -65,7 +66,8 @@ struct testColl scatterTest = { ScatterGetCollByteCount, ScatterInitData, ScatterGetBw, - ScatterRunColl + ScatterRunColl, + NULL }; void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, 
size_t count, int nranks) { diff --git a/src/sendrecv.cu b/src/sendrecv.cu index 3f84dcffc9..4f5f6b8a7b 100644 --- a/src/sendrecv.cu +++ b/src/sendrecv.cu @@ -7,6 +7,7 @@ #include "cuda_runtime.h" #include "common.h" +#include "rccl_compat.h" void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, size_t eltSize, int nranks) { *sendcount = count; @@ -64,7 +65,8 @@ struct testColl sendRecvTest = { SendRecvGetCollByteCount, SendRecvInitData, SendRecvGetBw, - SendRecvRunColl + SendRecvRunColl, + NULL }; void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {