Expose production tuning table in topo_explorer using internal RCCL/NCCL logic (#1628)

* Internal RCCL/NCCL functionality exposed when RCCL_EXPOSE_STATIC is enabled
* Algo/protocol/max channels can be obtained with the new RCCL API
* Introduce rccl_static and rccl_static_inline macros to work around invisible functions in core source files like enqueue.cc
* Add usage example in topo-explorer tool

[ROCm/rccl commit: 82afb2bcfe]
Cette révision appartient à :
Mustafa Abduljabbar
2025-04-23 15:44:56 -04:00
révisé par GitHub
Parent 38f91fa2c8
révision 07620c7efd
8 fichiers modifiés avec 110 ajouts et 12 suppressions
-1
Voir le fichier
@@ -14,7 +14,6 @@
#include "reduce_kernel.h"
#include "device_table.h"
#include "network/unpack/unpack_defs.h"
#define NCCL_MAX_DEV_ARITY (NCCL_MAX_TREE_ARITY-1) // Using balanced tree instead of split tree
#define __syncwarp()
+3 -3
Voir le fichier
@@ -111,7 +111,7 @@ static inline size_t ncclFuncSendCount(ncclFunc_t func, int nRanks, size_t count
static inline size_t ncclFuncRecvCount(ncclFunc_t func, int nRanks, size_t count) {
return func == ncclFuncAllGather ? nRanks*count : count;
}
static inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count) {
rccl_static_inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count) {
return func == ncclFuncAllGather || func == ncclFuncReduceScatter ? nRanks*count : count;
}
@@ -569,7 +569,7 @@ static ncclResult_t registerP2pBuffer(struct ncclComm* comm, void* userbuff, int
}
static ncclResult_t getCollNetSupport(struct ncclComm* comm, struct ncclTaskColl* task, int* collNetSupport);
static ncclResult_t getAlgoInfo(
rccl_static ncclResult_t getAlgoInfo(
struct ncclComm* comm, struct ncclTaskColl* task,
int collNetSupport, int nvlsSupport, int numPipeOps, ncclSimInfo_t* simInfo = NULL
);
@@ -2022,7 +2022,7 @@ static ncclResult_t topoGetAlgoInfo(
// Call the plugin first. Let it set algo+proto, and/or nChannels.
// Then, topoGetAlgoInfo will set algo/proto if not set, then nChannels and nThreads based on algo/proto.
// Finally, nChannels will be overriden by the plugin setting.
static ncclResult_t getAlgoInfo(
rccl_static ncclResult_t getAlgoInfo(
struct ncclComm* comm, struct ncclTaskColl* info,
int collNetSupport, int nvlsSupport, int numPipeOps, ncclSimInfo_t* simInfo/* = NULL*/
) {
+19 -1
Voir le fichier
@@ -22,7 +22,7 @@ THE SOFTWARE.
#ifndef RCCL_COMMON_H_
#define RCCL_COMMON_H_
#include "nccl_common.h"
#include "nccl.h"
typedef enum RcclTunableColls {
RCCL_UNSUPPORTED_TUNABLE = -1,
RCCL_RS_TUNABLE = 0, // reduce_scatter index
@@ -38,6 +38,16 @@ typedef enum RcclTunableColls {
#define RCCL_PROTOCOL_MAX_IDX 1
#define RCCL_PROTOCOL_FACTOR_IDX 2
#ifdef RCCL_EXPOSE_STATIC
#define RCCL_STATIC_EXPOSE_CHECK()
#else
#define RCCL_STATIC_EXPOSE_CHECK() \
do { \
WARN("Attempting to use internal logic while required static functions are not exposed. Rebuild with RCCL_EXPOSE_STATIC enabled"); \
return ncclInvalidUsage; \
} while (0)
#endif
inline rcclTunableIndex_t rcclGetTunableIndex(ncclFunc_t const& func) {
switch (func) {
case ncclFuncReduceScatter:
@@ -61,4 +71,12 @@ inline size_t rcclGetSizePerRank(ncclFunc_t const& func, size_t const& nBytes, i
return (func == ncclFuncReduceScatter || func == ncclFuncAllGather) ? nBytes / nRanks : nBytes;
}
void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, struct ncclTaskColl* info);
ncclResult_t rcclGetAlgoInfo(struct ncclComm* comm, ncclFunc_t coll, uint64_t count, ncclDataType_t dataType,
int collNetSupport, int nvlsSupport, int numPipeOps,
int* algo, int* protocol, int* maxChannels);
ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count, size_t& maxCount);
#endif
+8
Voir le fichier
@@ -27,4 +27,12 @@ THE SOFTWARE.
RCCL_PARAM_DECLARE(EnableHipGraph); // Opt-in environment variable for enabling hipGraph
#ifdef RCCL_EXPOSE_STATIC
#define rccl_static
#define rccl_static_inline
#else
#define rccl_static static
#define rccl_static_inline static inline
#endif
#endif
+28
Voir le fichier
@@ -71,3 +71,31 @@ void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, s
}
}
}
extern size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count);
extern ncclResult_t getAlgoInfo(
struct ncclComm* comm, struct ncclTaskColl* task,
int collNetSupport, int nvlsSupport, int numPipeOps, ncclSimInfo_t* simInfo = NULL
);
ncclResult_t rcclGetAlgoInfo(struct ncclComm* comm, ncclFunc_t coll, uint64_t count, ncclDataType_t dataType,
int collNetSupport, int nvlsSupport, int numPipeOps,
int* algo, int* protocol, int* maxChannels) {
RCCL_STATIC_EXPOSE_CHECK();
struct ncclTaskColl task;
task.func = coll;
task.count = count;
task.datatype = dataType;
NCCLCHECK(getAlgoInfo(comm, &task, collNetSupport, nvlsSupport, numPipeOps));
*algo = task.algorithm;
*protocol = task.protocol;
*maxChannels = task.nMaxChannels;
return ncclSuccess;
}
ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count, size_t& maxCount) {
RCCL_STATIC_EXPOSE_CHECK();
maxCount = ncclFuncMaxSendRecvCount(func, nRanks, count);
return ncclSuccess;
}
+16 -6
Voir le fichier
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
HIP_PATH ?= $(wildcard /opt/rocm)
ifeq (,$(HIP_PATH))
HIP_PATH = ../../..
@@ -6,10 +6,11 @@ endif
HIPCC = $(HIP_PATH)/bin/hipcc
EXE = topo_expl
CXXFLAGS = -g -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL -lpthread
CXXFLAGS = -g -ffunction-sections -fdata-sections -Wl,--gc-sections -fgpu-rdc -Iinclude -Ihipify_rccl/include -Ihipify_rccl/device/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DENABLE_LL128 -DNVTX_NO_IMPL -DRCCL_EXPOSE_STATIC -lpthread
files = $(EXE).cpp model.cpp utils.cpp hipify_rccl/graph/topo.cc hipify_rccl/graph/rings.cc hipify_rccl/graph/paths.cc hipify_rccl/graph/trees.cc ../../src/misc/param.cc \
hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc hipify_rccl/graph/archinfo.cc
hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc hipify_rccl/graph/archinfo.cc \
hipify_rccl/collectives.cc hipify_rccl/enqueue.cc ../../src/rccl_wrap.cc
all: hipify $(EXE)
@@ -18,13 +19,22 @@ $(EXE): $(files)
hipify:
rm -rf hipify_rccl
mkdir -p hipify_rccl
mkdir -p hipify_rccl/device/include hipify_rccl/include/network/unpack
cp -a ../../src/include/ hipify_rccl/
cp -a ../../src/graph/ hipify_rccl/
cp -ar ../../src/misc/archinfo.cc hipify_rccl/graph/
cp -a ../../src/device/*.h hipify_rccl/device/include
cp -a ../../src/device/network/unpack/*.h hipify_rccl/include/network/unpack
cp -a ../../src/enqueue.cc hipify_rccl/
cp -a ../../src/collectives.cc hipify_rccl/
cp -a ../../src/misc/archinfo.cc hipify_rccl/graph/
hipify-perl -inplace -quiet-warnings hipify_rccl/include/*.h
hipify-perl -inplace -quiet-warnings hipify_rccl/device/include/*.h
sed -i "s/template<typename T, typename RedOp>/template<typename T, typename RedOp, int COLL_UNROLL>/g" "hipify_rccl/device/include/common.h"
sed -i "s/\\(struct RunWorkBatch<ncclFunc[^>]*\\)>*/\\1, COLL_UNROLL>/" "hipify_rccl/device/include/common.h"
hipify-perl -inplace -quiet-warnings hipify_rccl/graph/*
hipify-perl -inplace -quiet-warnings hipify_rccl/include/network/unpack/*
hipify-perl -inplace -quiet-warnings hipify_rccl/*.cc
clean:
rm -rf hipify_rccl
rm -f *.o $(EXE)
rm -f *.o $(EXE)
+6
Voir le fichier
@@ -0,0 +1,6 @@
#ifndef DEVICE_TABLE_COMPATIBILITY
#define DEVICE_TABLE_COMPATIBILITY
__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_1(unsigned short funcIndex) noexcept {}
__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_2(unsigned short funcIndex) noexcept {}
__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_4(unsigned short funcIndex) noexcept {}
#endif
+30 -1
Voir le fichier
@@ -47,6 +47,7 @@ THE SOFTWARE.
#include "utils.h"
#include "topo.h"
#include "graph.h"
#include "rccl_common.h"
NodeModel *node_model;
extern ncclNet_t* ncclNet;
@@ -255,7 +256,6 @@ int main(int argc,char* argv[])
assert(node_model!=0);
initTransportsRank_3(&comm[i], allGather3Data, treeGraph[i], ringGraph[i], collNetGraph[i], nvlsGraph[i]);
}
for (uint64_t len = 8; len <= 4294967296L; len *= 2) {
struct ncclInfo info;
float minTime = 3600000000.0;
@@ -284,6 +284,35 @@ int main(int argc,char* argv[])
INFO(NCCL_TUNING, "%10ld %s %s time %f", len, ncclAlgoStr[algorithm], ncclProtoStr[protocol], minTime);
}
// Arrays to store function types for ncclFuncAllReduce, ReduceScatter, and AllGather
std::vector<ncclFunc_t> ncclFuncTypes = {
ncclFuncAllReduce,
ncclFuncReduceScatter,
ncclFuncAllGather
};
std::cout << "Running fp32 production choices for algorithm/protocol/maxChannels" << std::endl;
// RCCL tuning results
printf("| %-15s | %-15s | %-15s | %-10s | %-10s | %-12s |\n", "Max Size(B)", "Count", "Collective", "Algorithm", "Protocol", "Max Channels");
printf("|-----------------|-----------------|-----------------|------------|------------|--------------|\n");
for(int i = 0; i < ncclFuncTypes.size(); ++i) {
for (uint64_t count = 8; count <= 1073741824L; count *= 2) { // Up to 1 gigabyte
int algo, proto, nChannels;
NCCLCHECK(rcclGetAlgoInfo(&comm[0], ncclFuncTypes[i], count, ncclFloat32 , 0, 0, 1, &algo, &proto, &nChannels));
uint64_t maxCount;
NCCLCHECK(rcclFuncMaxSendRecvCount(ncclFuncTypes[i], comm[0].nRanks, count, maxCount));
printf("| %-15ld | %-15ld | %-15s | %-10s | %-10s | %-12d |\n",
maxCount * sizeof(float),
count,
ncclFuncStr[ncclFuncTypes[i]],
ncclAlgoStr[algo],
ncclProtoStr[proto],
nChannels);
}
}
for (int i = 0; i < nranks; i++) {
free(comm[i].connectSend);
free(comm[i].connectRecv);