diff --git a/projects/rccl/src/device/common.h b/projects/rccl/src/device/common.h index a564b10d6e..010c0c9cc9 100644 --- a/projects/rccl/src/device/common.h +++ b/projects/rccl/src/device/common.h @@ -14,7 +14,6 @@ #include "reduce_kernel.h" #include "device_table.h" #include "network/unpack/unpack_defs.h" - #define NCCL_MAX_DEV_ARITY (NCCL_MAX_TREE_ARITY-1) // Using balanced tree instead of split tree #define __syncwarp() diff --git a/projects/rccl/src/enqueue.cc b/projects/rccl/src/enqueue.cc index 42af17bc26..93c040dd53 100644 --- a/projects/rccl/src/enqueue.cc +++ b/projects/rccl/src/enqueue.cc @@ -111,7 +111,7 @@ static inline size_t ncclFuncSendCount(ncclFunc_t func, int nRanks, size_t count static inline size_t ncclFuncRecvCount(ncclFunc_t func, int nRanks, size_t count) { return func == ncclFuncAllGather ? nRanks*count : count; } -static inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count) { +rccl_static_inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count) { return func == ncclFuncAllGather || func == ncclFuncReduceScatter ? nRanks*count : count; } @@ -569,7 +569,7 @@ static ncclResult_t registerP2pBuffer(struct ncclComm* comm, void* userbuff, int } static ncclResult_t getCollNetSupport(struct ncclComm* comm, struct ncclTaskColl* task, int* collNetSupport); -static ncclResult_t getAlgoInfo( +rccl_static ncclResult_t getAlgoInfo( struct ncclComm* comm, struct ncclTaskColl* task, int collNetSupport, int nvlsSupport, int numPipeOps, ncclSimInfo_t* simInfo = NULL ); @@ -2022,7 +2022,7 @@ static ncclResult_t topoGetAlgoInfo( // Call the plugin first. Let it set algo+proto, and/or nChannels. // Then, topoGetAlgoInfo will set algo/proto if not set, then nChannels and nThreads based on algo/proto. // Finally, nChannels will be overriden by the plugin setting. -static ncclResult_t getAlgoInfo( +rccl_static ncclResult_t getAlgoInfo( struct ncclComm* comm, struct ncclTaskColl* info, int collNetSupport, int nvlsSupport, int numPipeOps, ncclSimInfo_t* simInfo/* = NULL*/ ) { diff --git a/projects/rccl/src/include/rccl_common.h b/projects/rccl/src/include/rccl_common.h index 2302b5f67b..549df17ef3 100644 --- a/projects/rccl/src/include/rccl_common.h +++ b/projects/rccl/src/include/rccl_common.h @@ -22,7 +22,7 @@ THE SOFTWARE. #ifndef RCCL_COMMON_H_ #define RCCL_COMMON_H_ #include "nccl_common.h" - +#include "nccl.h" typedef enum RcclTunableColls { RCCL_UNSUPPORTED_TUNABLE = -1, RCCL_RS_TUNABLE = 0, // reduce_scatter index @@ -38,6 +38,16 @@ typedef enum RcclTunableColls { #define RCCL_PROTOCOL_MAX_IDX 1 #define RCCL_PROTOCOL_FACTOR_IDX 2 +#ifdef RCCL_EXPOSE_STATIC +#define RCCL_STATIC_EXPOSE_CHECK() +#else +#define RCCL_STATIC_EXPOSE_CHECK() \ + do { \ + WARN("Attempting to use internal logic while required static functions are not exposed. Rebuild with RCCL_EXPOSE_STATIC enabled"); \ + return ncclInvalidUsage; \ + } while (0) +#endif + inline rcclTunableIndex_t rcclGetTunableIndex(ncclFunc_t const& func) { switch (func) { case ncclFuncReduceScatter: @@ -61,4 +71,12 @@ inline size_t rcclGetSizePerRank(ncclFunc_t const& func, size_t const& nBytes, i return (func == ncclFuncReduceScatter || func == ncclFuncAllGather) ? nBytes / nRanks : nBytes; } void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, struct ncclTaskColl* info); + + +ncclResult_t rcclGetAlgoInfo(struct ncclComm* comm, ncclFunc_t coll, uint64_t count, ncclDataType_t dataType, + int collNetSupport, int nvlsSupport, int numPipeOps, + int* algo, int* protocol, int* maxChannels); + +ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count, size_t& maxCount); + #endif \ No newline at end of file diff --git a/projects/rccl/src/include/rccl_vars.h b/projects/rccl/src/include/rccl_vars.h index a9b1b45815..462b6860dd 100644 --- a/projects/rccl/src/include/rccl_vars.h +++ b/projects/rccl/src/include/rccl_vars.h @@ -27,4 +27,12 @@ THE SOFTWARE. RCCL_PARAM_DECLARE(EnableHipGraph); // Opt-in environment variable for enabling hipGraph +#ifdef RCCL_EXPOSE_STATIC +#define rccl_static +#define rccl_static_inline +#else +#define rccl_static static +#define rccl_static_inline static inline +#endif + #endif diff --git a/projects/rccl/src/rccl_wrap.cc b/projects/rccl/src/rccl_wrap.cc index 8b4ebcb0db..33f4f2afb1 100644 --- a/projects/rccl/src/rccl_wrap.cc +++ b/projects/rccl/src/rccl_wrap.cc @@ -71,3 +71,31 @@ void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, s } } } + +extern size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count); +extern ncclResult_t getAlgoInfo( + struct ncclComm* comm, struct ncclTaskColl* task, + int collNetSupport, int nvlsSupport, int numPipeOps, ncclSimInfo_t* simInfo = NULL +); + +ncclResult_t rcclGetAlgoInfo(struct ncclComm* comm, ncclFunc_t coll, uint64_t count, ncclDataType_t dataType, + int collNetSupport, int nvlsSupport, int numPipeOps, + int* algo, int* protocol, int* maxChannels) { + RCCL_STATIC_EXPOSE_CHECK(); + struct ncclTaskColl task; + task.func = coll; + task.count = count; + task.datatype = dataType; + NCCLCHECK(getAlgoInfo(comm, &task, collNetSupport, nvlsSupport, numPipeOps)); + *algo = task.algorithm; + *protocol = task.protocol; + *maxChannels = task.nMaxChannels; + return ncclSuccess; +} + + +ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count, size_t& maxCount) { + RCCL_STATIC_EXPOSE_CHECK(); + maxCount = ncclFuncMaxSendRecvCount(func, nRanks, count); + return ncclSuccess; +} \ No newline at end of file diff --git a/projects/rccl/tools/topo_expl/Makefile b/projects/rccl/tools/topo_expl/Makefile index 7327d4b2a6..d4527e2a9d 100644 --- a/projects/rccl/tools/topo_expl/Makefile +++ b/projects/rccl/tools/topo_expl/Makefile @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved. HIP_PATH ?= $(wildcard /opt/rocm) ifeq (,$(HIP_PATH)) HIP_PATH = ../../.. @@ -6,10 +6,11 @@ endif HIPCC = $(HIP_PATH)/bin/hipcc EXE = topo_expl -CXXFLAGS = -g -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL -lpthread +CXXFLAGS = -g -ffunction-sections -fdata-sections -Wl,--gc-sections -fgpu-rdc -Iinclude -Ihipify_rccl/include -Ihipify_rccl/device/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DENABLE_LL128 -DNVTX_NO_IMPL -DRCCL_EXPOSE_STATIC -lpthread files = $(EXE).cpp model.cpp utils.cpp hipify_rccl/graph/topo.cc hipify_rccl/graph/rings.cc hipify_rccl/graph/paths.cc hipify_rccl/graph/trees.cc ../../src/misc/param.cc \ - hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc hipify_rccl/graph/archinfo.cc + hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc hipify_rccl/graph/archinfo.cc \ + hipify_rccl/collectives.cc hipify_rccl/enqueue.cc ../../src/rccl_wrap.cc all: hipify $(EXE) @@ -18,13 +19,22 @@ $(EXE): $(files) hipify: rm -rf hipify_rccl - mkdir -p hipify_rccl + mkdir -p hipify_rccl/device/include hipify_rccl/include/network/unpack cp -a ../../src/include/ hipify_rccl/ cp -a ../../src/graph/ hipify_rccl/ - cp -ar ../../src/misc/archinfo.cc hipify_rccl/graph/ + cp -a ../../src/device/*.h hipify_rccl/device/include + cp -a ../../src/device/network/unpack/*.h hipify_rccl/include/network/unpack + cp -a ../../src/enqueue.cc hipify_rccl/ + cp -a ../../src/collectives.cc hipify_rccl/ + cp -a ../../src/misc/archinfo.cc hipify_rccl/graph/ hipify-perl -inplace -quiet-warnings hipify_rccl/include/*.h + hipify-perl -inplace -quiet-warnings hipify_rccl/device/include/*.h + sed -i "s/template/template/g" "hipify_rccl/device/include/common.h" + sed -i "s/\\(struct RunWorkBatch]*\\)>*/\\1, COLL_UNROLL>/" "hipify_rccl/device/include/common.h" hipify-perl -inplace -quiet-warnings hipify_rccl/graph/* + hipify-perl -inplace -quiet-warnings hipify_rccl/include/network/unpack/* + hipify-perl -inplace -quiet-warnings hipify_rccl/*.cc clean: rm -rf hipify_rccl - rm -f *.o $(EXE) + rm -f *.o $(EXE) \ No newline at end of file diff --git a/projects/rccl/tools/topo_expl/include/device_table.h b/projects/rccl/tools/topo_expl/include/device_table.h new file mode 100644 index 0000000000..7e96d7abf4 --- /dev/null +++ b/projects/rccl/tools/topo_expl/include/device_table.h @@ -0,0 +1,6 @@ +#ifndef DEVICE_TABLE_COMPATIBILITY +#define DEVICE_TABLE_COMPATIBILITY +__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_1(unsigned short funcIndex) noexcept {} +__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_2(unsigned short funcIndex) noexcept {} +__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_4(unsigned short funcIndex) noexcept {} +#endif diff --git a/projects/rccl/tools/topo_expl/topo_expl.cpp b/projects/rccl/tools/topo_expl/topo_expl.cpp index d57388136c..88cb3a38de 100644 --- a/projects/rccl/tools/topo_expl/topo_expl.cpp +++ b/projects/rccl/tools/topo_expl/topo_expl.cpp @@ -47,6 +47,7 @@ THE SOFTWARE. #include "utils.h" #include "topo.h" #include "graph.h" +#include "rccl_common.h" NodeModel *node_model; extern ncclNet_t* ncclNet; @@ -255,7 +256,6 @@ int main(int argc,char* argv[]) assert(node_model!=0); initTransportsRank_3(&comm[i], allGather3Data, treeGraph[i], ringGraph[i], collNetGraph[i], nvlsGraph[i]); } - for (uint64_t len = 8; len <= 4294967296L; len *= 2) { struct ncclInfo info; float minTime = 3600000000.0; @@ -284,6 +284,35 @@ int main(int argc,char* argv[]) INFO(NCCL_TUNING, "%10ld %s %s time %f", len, ncclAlgoStr[algorithm], ncclProtoStr[protocol], minTime); } + // Arrays to store function types for ncclFuncAllReduce, ReduceScatter, and AllGather + std::vector ncclFuncTypes = { + ncclFuncAllReduce, + ncclFuncReduceScatter, + ncclFuncAllGather + }; + + std::cout << "Running fp32 production choices for algorithm/protocol/maxChannels" << std::endl; + // RCCL tuning results + printf("| %-15s | %-15s | %-15s | %-10s | %-10s | %-12s |\n", "Max Size(B)", "Count", "Collective", "Algorithm", "Protocol", "Max Channels"); + printf("|-----------------|-----------------|-----------------|------------|------------|--------------|\n"); + for(int i = 0; i < ncclFuncTypes.size(); ++i) { + for (uint64_t count = 8; count <= 1073741824L; count *= 2) { // Up to 1 gigabyte + int algo, proto, nChannels; + NCCLCHECK(rcclGetAlgoInfo(&comm[0], ncclFuncTypes[i], count, ncclFloat32 , 0, 0, 1, &algo, &proto, &nChannels)); + uint64_t maxCount; + NCCLCHECK(rcclFuncMaxSendRecvCount(ncclFuncTypes[i], comm[0].nRanks, count, maxCount)); + printf("| %-15ld | %-15ld | %-15s | %-10s | %-10s | %-12d |\n", + maxCount * sizeof(float), + count, + ncclFuncStr[ncclFuncTypes[i]], + ncclAlgoStr[algo], + ncclProtoStr[proto], + nChannels); + } + } + + + for (int i = 0; i < nranks; i++) { free(comm[i].connectSend); free(comm[i].connectRecv);