Expose production tuning table in topo_explorer using internal RCCL/NCCL logic (#1628)
* Internal RCCL/NCCL functionality exposed when RCCL_EXPOSE_STATIC is enabled
* Algo/protocol/max channels can be obtained with the new RCCL API
* Introduce rccl_static and rccl_static_inline macros to work around invisible functions in core source files like enqueue.cc
* Add usage example in topo-explorer tool
[ROCm/rccl commit: 82afb2bcfe]
Cette révision appartient à :
révisé par
GitHub
Parent
38f91fa2c8
révision
07620c7efd
@@ -14,7 +14,6 @@
|
||||
#include "reduce_kernel.h"
|
||||
#include "device_table.h"
|
||||
#include "network/unpack/unpack_defs.h"
|
||||
|
||||
#define NCCL_MAX_DEV_ARITY (NCCL_MAX_TREE_ARITY-1) // Using balanced tree instead of split tree
|
||||
|
||||
#define __syncwarp()
|
||||
|
||||
@@ -111,7 +111,7 @@ static inline size_t ncclFuncSendCount(ncclFunc_t func, int nRanks, size_t count
|
||||
static inline size_t ncclFuncRecvCount(ncclFunc_t func, int nRanks, size_t count) {
|
||||
return func == ncclFuncAllGather ? nRanks*count : count;
|
||||
}
|
||||
static inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count) {
|
||||
rccl_static_inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count) {
|
||||
return func == ncclFuncAllGather || func == ncclFuncReduceScatter ? nRanks*count : count;
|
||||
}
|
||||
|
||||
@@ -569,7 +569,7 @@ static ncclResult_t registerP2pBuffer(struct ncclComm* comm, void* userbuff, int
|
||||
}
|
||||
|
||||
static ncclResult_t getCollNetSupport(struct ncclComm* comm, struct ncclTaskColl* task, int* collNetSupport);
|
||||
static ncclResult_t getAlgoInfo(
|
||||
rccl_static ncclResult_t getAlgoInfo(
|
||||
struct ncclComm* comm, struct ncclTaskColl* task,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps, ncclSimInfo_t* simInfo = NULL
|
||||
);
|
||||
@@ -2022,7 +2022,7 @@ static ncclResult_t topoGetAlgoInfo(
|
||||
// Call the plugin first. Let it set algo+proto, and/or nChannels.
|
||||
// Then, topoGetAlgoInfo will set algo/proto if not set, then nChannels and nThreads based on algo/proto.
|
||||
// Finally, nChannels will be overriden by the plugin setting.
|
||||
static ncclResult_t getAlgoInfo(
|
||||
rccl_static ncclResult_t getAlgoInfo(
|
||||
struct ncclComm* comm, struct ncclTaskColl* info,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps, ncclSimInfo_t* simInfo/* = NULL*/
|
||||
) {
|
||||
|
||||
@@ -22,7 +22,7 @@ THE SOFTWARE.
|
||||
#ifndef RCCL_COMMON_H_
|
||||
#define RCCL_COMMON_H_
|
||||
#include "nccl_common.h"
|
||||
|
||||
#include "nccl.h"
|
||||
typedef enum RcclTunableColls {
|
||||
RCCL_UNSUPPORTED_TUNABLE = -1,
|
||||
RCCL_RS_TUNABLE = 0, // reduce_scatter index
|
||||
@@ -38,6 +38,16 @@ typedef enum RcclTunableColls {
|
||||
#define RCCL_PROTOCOL_MAX_IDX 1
|
||||
#define RCCL_PROTOCOL_FACTOR_IDX 2
|
||||
|
||||
#ifdef RCCL_EXPOSE_STATIC
|
||||
#define RCCL_STATIC_EXPOSE_CHECK()
|
||||
#else
|
||||
#define RCCL_STATIC_EXPOSE_CHECK() \
|
||||
do { \
|
||||
WARN("Attempting to use internal logic while required static functions are not exposed. Rebuild with RCCL_EXPOSE_STATIC enabled"); \
|
||||
return ncclInvalidUsage; \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
inline rcclTunableIndex_t rcclGetTunableIndex(ncclFunc_t const& func) {
|
||||
switch (func) {
|
||||
case ncclFuncReduceScatter:
|
||||
@@ -61,4 +71,12 @@ inline size_t rcclGetSizePerRank(ncclFunc_t const& func, size_t const& nBytes, i
|
||||
return (func == ncclFuncReduceScatter || func == ncclFuncAllGather) ? nBytes / nRanks : nBytes;
|
||||
}
|
||||
void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, struct ncclTaskColl* info);
|
||||
|
||||
|
||||
ncclResult_t rcclGetAlgoInfo(struct ncclComm* comm, ncclFunc_t coll, uint64_t count, ncclDataType_t dataType,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps,
|
||||
int* algo, int* protocol, int* maxChannels);
|
||||
|
||||
ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count, size_t& maxCount);
|
||||
|
||||
#endif
|
||||
@@ -27,4 +27,12 @@ THE SOFTWARE.
|
||||
|
||||
RCCL_PARAM_DECLARE(EnableHipGraph); // Opt-in environment variable for enabling hipGraph
|
||||
|
||||
#ifdef RCCL_EXPOSE_STATIC
|
||||
#define rccl_static
|
||||
#define rccl_static_inline
|
||||
#else
|
||||
#define rccl_static static
|
||||
#define rccl_static_inline static inline
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -71,3 +71,31 @@ void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, s
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extern size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count);
|
||||
extern ncclResult_t getAlgoInfo(
|
||||
struct ncclComm* comm, struct ncclTaskColl* task,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps, ncclSimInfo_t* simInfo = NULL
|
||||
);
|
||||
|
||||
ncclResult_t rcclGetAlgoInfo(struct ncclComm* comm, ncclFunc_t coll, uint64_t count, ncclDataType_t dataType,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps,
|
||||
int* algo, int* protocol, int* maxChannels) {
|
||||
RCCL_STATIC_EXPOSE_CHECK();
|
||||
struct ncclTaskColl task;
|
||||
task.func = coll;
|
||||
task.count = count;
|
||||
task.datatype = dataType;
|
||||
NCCLCHECK(getAlgoInfo(comm, &task, collNetSupport, nvlsSupport, numPipeOps));
|
||||
*algo = task.algorithm;
|
||||
*protocol = task.protocol;
|
||||
*maxChannels = task.nMaxChannels;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count, size_t& maxCount) {
|
||||
RCCL_STATIC_EXPOSE_CHECK();
|
||||
maxCount = ncclFuncMaxSendRecvCount(func, nRanks, count);
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
HIP_PATH ?= $(wildcard /opt/rocm)
|
||||
ifeq (,$(HIP_PATH))
|
||||
HIP_PATH = ../../..
|
||||
@@ -6,10 +6,11 @@ endif
|
||||
HIPCC = $(HIP_PATH)/bin/hipcc
|
||||
|
||||
EXE = topo_expl
|
||||
CXXFLAGS = -g -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL -lpthread
|
||||
CXXFLAGS = -g -ffunction-sections -fdata-sections -Wl,--gc-sections -fgpu-rdc -Iinclude -Ihipify_rccl/include -Ihipify_rccl/device/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DENABLE_LL128 -DNVTX_NO_IMPL -DRCCL_EXPOSE_STATIC -lpthread
|
||||
|
||||
files = $(EXE).cpp model.cpp utils.cpp hipify_rccl/graph/topo.cc hipify_rccl/graph/rings.cc hipify_rccl/graph/paths.cc hipify_rccl/graph/trees.cc ../../src/misc/param.cc \
|
||||
hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc hipify_rccl/graph/archinfo.cc
|
||||
hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc hipify_rccl/graph/archinfo.cc \
|
||||
hipify_rccl/collectives.cc hipify_rccl/enqueue.cc ../../src/rccl_wrap.cc
|
||||
|
||||
all: hipify $(EXE)
|
||||
|
||||
@@ -18,13 +19,22 @@ $(EXE): $(files)
|
||||
|
||||
hipify:
|
||||
rm -rf hipify_rccl
|
||||
mkdir -p hipify_rccl
|
||||
mkdir -p hipify_rccl/device/include hipify_rccl/include/network/unpack
|
||||
cp -a ../../src/include/ hipify_rccl/
|
||||
cp -a ../../src/graph/ hipify_rccl/
|
||||
cp -ar ../../src/misc/archinfo.cc hipify_rccl/graph/
|
||||
cp -a ../../src/device/*.h hipify_rccl/device/include
|
||||
cp -a ../../src/device/network/unpack/*.h hipify_rccl/include/network/unpack
|
||||
cp -a ../../src/enqueue.cc hipify_rccl/
|
||||
cp -a ../../src/collectives.cc hipify_rccl/
|
||||
cp -a ../../src/misc/archinfo.cc hipify_rccl/graph/
|
||||
hipify-perl -inplace -quiet-warnings hipify_rccl/include/*.h
|
||||
hipify-perl -inplace -quiet-warnings hipify_rccl/device/include/*.h
|
||||
sed -i "s/template<typename T, typename RedOp>/template<typename T, typename RedOp, int COLL_UNROLL>/g" "hipify_rccl/device/include/common.h"
|
||||
sed -i "s/\\(struct RunWorkBatch<ncclFunc[^>]*\\)>*/\\1, COLL_UNROLL>/" "hipify_rccl/device/include/common.h"
|
||||
hipify-perl -inplace -quiet-warnings hipify_rccl/graph/*
|
||||
hipify-perl -inplace -quiet-warnings hipify_rccl/include/network/unpack/*
|
||||
hipify-perl -inplace -quiet-warnings hipify_rccl/*.cc
|
||||
|
||||
clean:
|
||||
rm -rf hipify_rccl
|
||||
rm -f *.o $(EXE)
|
||||
rm -f *.o $(EXE)
|
||||
@@ -0,0 +1,6 @@
|
||||
#ifndef DEVICE_TABLE_COMPATIBILITY
|
||||
#define DEVICE_TABLE_COMPATIBILITY
|
||||
__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_1(unsigned short funcIndex) noexcept {}
|
||||
__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_2(unsigned short funcIndex) noexcept {}
|
||||
__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_4(unsigned short funcIndex) noexcept {}
|
||||
#endif
|
||||
@@ -47,6 +47,7 @@ THE SOFTWARE.
|
||||
#include "utils.h"
|
||||
#include "topo.h"
|
||||
#include "graph.h"
|
||||
#include "rccl_common.h"
|
||||
|
||||
NodeModel *node_model;
|
||||
extern ncclNet_t* ncclNet;
|
||||
@@ -255,7 +256,6 @@ int main(int argc,char* argv[])
|
||||
assert(node_model!=0);
|
||||
initTransportsRank_3(&comm[i], allGather3Data, treeGraph[i], ringGraph[i], collNetGraph[i], nvlsGraph[i]);
|
||||
}
|
||||
|
||||
for (uint64_t len = 8; len <= 4294967296L; len *= 2) {
|
||||
struct ncclInfo info;
|
||||
float minTime = 3600000000.0;
|
||||
@@ -284,6 +284,35 @@ int main(int argc,char* argv[])
|
||||
INFO(NCCL_TUNING, "%10ld %s %s time %f", len, ncclAlgoStr[algorithm], ncclProtoStr[protocol], minTime);
|
||||
}
|
||||
|
||||
// Arrays to store function types for ncclFuncAllReduce, ReduceScatter, and AllGather
|
||||
std::vector<ncclFunc_t> ncclFuncTypes = {
|
||||
ncclFuncAllReduce,
|
||||
ncclFuncReduceScatter,
|
||||
ncclFuncAllGather
|
||||
};
|
||||
|
||||
std::cout << "Running fp32 production choices for algorithm/protocol/maxChannels" << std::endl;
|
||||
// RCCL tuning results
|
||||
printf("| %-15s | %-15s | %-15s | %-10s | %-10s | %-12s |\n", "Max Size(B)", "Count", "Collective", "Algorithm", "Protocol", "Max Channels");
|
||||
printf("|-----------------|-----------------|-----------------|------------|------------|--------------|\n");
|
||||
for(int i = 0; i < ncclFuncTypes.size(); ++i) {
|
||||
for (uint64_t count = 8; count <= 1073741824L; count *= 2) { // Up to 1 gigabyte
|
||||
int algo, proto, nChannels;
|
||||
NCCLCHECK(rcclGetAlgoInfo(&comm[0], ncclFuncTypes[i], count, ncclFloat32 , 0, 0, 1, &algo, &proto, &nChannels));
|
||||
uint64_t maxCount;
|
||||
NCCLCHECK(rcclFuncMaxSendRecvCount(ncclFuncTypes[i], comm[0].nRanks, count, maxCount));
|
||||
printf("| %-15ld | %-15ld | %-15s | %-10s | %-10s | %-12d |\n",
|
||||
maxCount * sizeof(float),
|
||||
count,
|
||||
ncclFuncStr[ncclFuncTypes[i]],
|
||||
ncclAlgoStr[algo],
|
||||
ncclProtoStr[proto],
|
||||
nChannels);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
free(comm[i].connectSend);
|
||||
free(comm[i].connectRecv);
|
||||
|
||||
Référencer dans un nouveau ticket
Bloquer un utilisateur