Support gfx950 in topo_expl and resolve dependency on FMT (#1829)

* Support gfx950 in topo_expl

* Fix dependencies and fetch fmt from sources

* Remove third_party folder in make clean

* Add empty target when fmt is found

* Add MI350 example

* Update README.md

---------

Co-authored-by: isaki001 <ioannissakiotis@gmail.com>
Этот коммит содержится в:
Mustafa Abduljabbar
2025-08-26 10:11:38 -04:00
коммит произвёл GitHub
родитель 5e7937effb
Коммит dfad51e3c9
5 изменённых файлов: 210 добавлений и 5 удалений
+34 -4
Просмотреть файл
@@ -5,6 +5,8 @@ HIP_PATH = ../../..
endif
HIPCC = $(HIP_PATH)/bin/hipcc
.DEFAULT_GOAL := all
EXE = topo_expl
CXXFLAGS = -g -ffunction-sections -fdata-sections -Wl,--gc-sections -fgpu-rdc -Iinclude -Ihipify_rccl/include -Ihipify_rccl/include/plugin -Ihipify_rccl/device/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DENABLE_LL128 -DNVTX_NO_IMPL -DRCCL_EXPOSE_STATIC -lpthread
@@ -12,7 +14,34 @@ files = $(EXE).cpp model.cpp utils.cpp hipify_rccl/graph/topo.cc hipify_rccl/gra
hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc hipify_rccl/graph/archinfo.cc \
hipify_rccl/collectives.cc hipify_rccl/register.cc hipify_rccl/enqueue.cc ../../src/rccl_wrap.cc
all: hipify $(EXE)
# --- Optional vendored {fmt} --------------------------------------------------
# Where the fallback checkout of {fmt} lives, and the header whose presence
# marks that checkout as usable.
FMT_DIR     := third_party/fmt
FMT_INCLUDE := $(FMT_DIR)/include
FMT_HEADER  := $(FMT_INCLUDE)/fmt/format.h
FMT_GIT     := https://github.com/fmtlib/fmt.git

# Probe for <fmt/format.h> using a portable pipe (no <<< here-string).
# Evaluated once at parse time: $(HIPCC) preprocesses (-E) a single #include
# line and HAVE_FMT becomes "yes" or "no" according to the exit status.
# NOTE(review): the probe does not pass $(CXXFLAGS), so a copy of fmt that is
# reachable only through those -I paths is reported as missing -- confirm.
HAVE_FMT := $(shell echo '#include <fmt/format.h>' | \
    $(HIPCC) -xc++ -std=c++17 -E - >/dev/null 2>&1 && echo yes || echo no)

# When the compiler cannot find fmt, compile against the vendored copy instead.
ifeq ($(HAVE_FMT),no)
CXXFLAGS += -I$(FMT_INCLUDE)
NEED_FMT := 1
endif

ifeq ($(NEED_FMT),1)
# Fetch a shallow copy of {fmt}. Any stale or partial checkout is removed first
# because git refuses to clone into a non-empty directory. The path comes from
# $(FMT_DIR) so that the removal and the clone always target the same place.
# NOTE(review): the clone follows the default branch tip; consider pinning a tag.
$(FMT_HEADER):
	rm -rf $(FMT_DIR)
	@echo ">>> fmt not found; cloning $(FMT_GIT) ..."
	@mkdir -p $(dir $(FMT_DIR))
	@git clone --depth=1 $(FMT_GIT) $(FMT_DIR)
else
# fmt is already on the include path: keep an empty rule so that targets which
# list $(FMT_HEADER) as a prerequisite still resolve.
$(FMT_HEADER):
endif
# ---------------------------------------------------------------------------
all: $(FMT_HEADER) hipify $(EXE)
# Build the tool from all sources in a single $(HIPCC) invocation.
# $(FMT_HEADER) is an order-only prerequisite: it must be in place (cloned when
# necessary) before compiling, so `make $(EXE)` and parallel builds do not race
# the fetch. Order-only prerequisites are excluded from $^ and never force a
# rebuild through their timestamp.
$(EXE): $(files) | $(FMT_HEADER)
	$(HIPCC) $(CXXFLAGS) $^ -o $@
@@ -30,13 +59,14 @@ hipify:
cp -a ../../src/misc/archinfo.cc hipify_rccl/graph/
hipify-perl -inplace -quiet-warnings hipify_rccl/include/*.h
hipify-perl -inplace -quiet-warnings hipify_rccl/include/plugin/*.h
hipify-perl -inplace -quiet-warnings hipify_rccl/include/latency_profiler/*.h
hipify-perl -inplace -quiet-warnings hipify_rccl/device/include/*.h
sed -i "s/template<typename T, typename RedOp>/template<typename T, typename RedOp, int COLL_UNROLL>/g" "hipify_rccl/device/include/common.h"
sed -i "s/\\(struct RunWorkBatch<ncclFunc[^>]*\\)>*/\\1, COLL_UNROLL>/" "hipify_rccl/device/include/common.h"
sed -i "s/template<typename T, typename RedOp>/template<typename T, typename RedOp, int USE_ACC, int COLL_UNROLL>/g" "hipify_rccl/device/include/common.h"
sed -i "s/\\(struct RunWorkBatch<ncclFunc[^>]*\\)>*/\\1, USE_ACC, COLL_UNROLL>/" "hipify_rccl/device/include/common.h"
hipify-perl -inplace -quiet-warnings hipify_rccl/graph/*
hipify-perl -inplace -quiet-warnings hipify_rccl/include/network/unpack/*
hipify-perl -inplace -quiet-warnings hipify_rccl/*.cc
clean:
rm -rf hipify_rccl
rm -rf hipify_rccl third_party
rm -f *.o $(EXE)
+4 -1
Просмотреть файл
@@ -33,7 +33,7 @@ Run `./topo_expl` without arguments to see the list of available models. Each mo
## Example Usage: Print RCCL's algorithm/protocol selections
The tool is typically run with the `NCCL_DEBUG=INFO` environment variable, but for the convenience of just printing the algo/proto table, we use version `NCCL_DEBUG=version` in this example to avoid printing topo details.
The tool is typically run with the `NCCL_DEBUG=INFO` environment variable set, which shows the topology information and prints the constructed rings/trees. However, to print only the algo/proto table, this example uses `NCCL_DEBUG=version`, which avoids printing the topology details.
```bash
# List available models
@@ -45,6 +45,9 @@ NCCL_DEBUG=version ./topo_expl -m 55
# Test a multi-node MI300 configuration with 8 nodes
NCCL_DEBUG=version ./topo_expl -m 55 -n 8
# Test a multi-node MI350 configuration with 2 nodes
NCCL_DEBUG=version ./topo_expl -m 59 -n 2
# Test MI250 configuration (model 42)
NCCL_DEBUG=version ./topo_expl -m 42
+3
Просмотреть файл
@@ -11,5 +11,8 @@ static struct rcclKernelItem rcclKernelTable[] = { };
template <int unroll>
__forceinline__ __device__ void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept { }
// No-op stand-ins for the NCCL_CALL_FUNCTIONS_<N> entry points: each accepts a
// function index and has an empty body, so calling it does nothing.
// NOTE(review): the _1/_2/_4 suffixes presumably correspond to unroll factors
// -- confirm against the callers.
__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_1(unsigned short funcIndex) noexcept { }
__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_2(unsigned short funcIndex) noexcept { }
__forceinline__ __device__ void NCCL_CALL_FUNCTIONS_4(unsigned short funcIndex) noexcept { }
#endif
+167
Просмотреть файл
@@ -0,0 +1,167 @@
<system version="2">
<cpu host_hash="0x1b2f53a636e182bb" numaid="0" affinity="00000000,00000000,00000000,ffffffff,ffffffff,ffffffff,00000000,00000000,00000000,ffffffff,ffffffff,ffffffff" arch="x86_64" vendor="AuthenticAMD" familyid="191" modelid="2">
<pci busid="0000:01:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1d9b" subsystem_device="0xcc03" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:03:00.0" class="0x020000" vendor="0x14e4" device="0x1760" subsystem_vendor="0x14e4" subsystem_device="0xf323" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="bnxt_re0" dev="0" latency="0" speed="400000" port="1" guid="0x28303efeffe604d6" maxconn="131073" gdr="1"/>
</nic>
</pci>
<pci busid="0000:06:00.0" class="0x060400" vendor="0x1022" device="0x1500" subsystem_vendor="0x1022" subsystem_device="0x1500" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:08:00.0" class="0x120000" vendor="0x1002" device="0x75a0" subsystem_vendor="0x1002" subsystem_device="0x75a0" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="0" sm="256" gcn="gfx950" arch="38911" rank="0" gdr="1">
<xgmi target="0000:18:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:68:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:78:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:88:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:98:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:e8:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:f9:00.0" count="1" tclass="0x120000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:11:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1d9b" subsystem_device="0xcc03" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:13:00.0" class="0x020000" vendor="0x14e4" device="0x1760" subsystem_vendor="0x14e4" subsystem_device="0xf323" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="bnxt_re1" dev="1" latency="0" speed="400000" port="1" guid="0x18483efeffe604d6" maxconn="131073" gdr="1"/>
</nic>
</pci>
<pci busid="0000:16:00.0" class="0x060400" vendor="0x1022" device="0x1500" subsystem_vendor="0x1022" subsystem_device="0x1500" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:18:00.0" class="0x120000" vendor="0x1002" device="0x75a0" subsystem_vendor="0x1002" subsystem_device="0x75a0" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="1" sm="256" gcn="gfx950" arch="38911" rank="1" gdr="1">
<xgmi target="0000:08:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:68:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:78:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:88:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:98:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:e8:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:f9:00.0" count="1" tclass="0x120000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:61:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1d9b" subsystem_device="0xcc02" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:63:00.0" class="0x020000" vendor="0x14e4" device="0x1760" subsystem_vendor="0x14e4" subsystem_device="0xf323" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="bnxt_re2" dev="2" latency="0" speed="400000" port="1" guid="0x78333efeffe604d6" maxconn="131073" gdr="1"/>
</nic>
</pci>
<pci busid="0000:66:00.0" class="0x060400" vendor="0x1022" device="0x1500" subsystem_vendor="0x1022" subsystem_device="0x1500" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:68:00.0" class="0x120000" vendor="0x1002" device="0x75a0" subsystem_vendor="0x1002" subsystem_device="0x75a0" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="2" sm="256" gcn="gfx950" arch="38911" rank="2" gdr="1">
<xgmi target="0000:08:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:18:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:78:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:88:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:98:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:e8:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:f9:00.0" count="1" tclass="0x120000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:71:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1d9b" subsystem_device="0xcc02" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:73:00.0" class="0x020000" vendor="0x14e4" device="0x1760" subsystem_vendor="0x14e4" subsystem_device="0xf323" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="bnxt_re3" dev="3" latency="0" speed="400000" port="1" guid="0x981e3efeffe604d6" maxconn="131073" gdr="1"/>
</nic>
</pci>
<pci busid="0000:76:00.0" class="0x060400" vendor="0x1022" device="0x1500" subsystem_vendor="0x1022" subsystem_device="0x1500" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:78:00.0" class="0x120000" vendor="0x1002" device="0x75a0" subsystem_vendor="0x1002" subsystem_device="0x75a0" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="3" sm="256" gcn="gfx950" arch="38911" rank="3" gdr="1">
<xgmi target="0000:08:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:18:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:68:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:88:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:98:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:e8:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:f9:00.0" count="1" tclass="0x120000"/>
</gpu>
</pci>
</pci>
</pci>
</cpu>
<cpu host_hash="0x1b2f53a636e182bb" numaid="1" affinity="ffffffff,ffffffff,ffffffff,00000000,00000000,00000000,ffffffff,ffffffff,ffffffff,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="191" modelid="2">
<pci busid="0000:81:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1d9b" subsystem_device="0xcc01" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:83:00.0" class="0x020000" vendor="0x14e4" device="0x1760" subsystem_vendor="0x14e4" subsystem_device="0xf323" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="bnxt_re4" dev="4" latency="0" speed="400000" port="1" guid="0x80963ffeffe604d6" maxconn="131073" gdr="1"/>
</nic>
</pci>
<pci busid="0000:86:00.0" class="0x060400" vendor="0x1022" device="0x1500" subsystem_vendor="0x1022" subsystem_device="0x1500" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:88:00.0" class="0x120000" vendor="0x1002" device="0x75a0" subsystem_vendor="0x1002" subsystem_device="0x75a0" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="4" sm="256" gcn="gfx950" arch="38911" rank="4" gdr="1">
<xgmi target="0000:08:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:18:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:68:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:78:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:98:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:e8:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:f9:00.0" count="1" tclass="0x120000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:91:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1d9b" subsystem_device="0xcc01" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:93:00.0" class="0x020000" vendor="0x14e4" device="0x1760" subsystem_vendor="0x14e4" subsystem_device="0xf323" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="bnxt_re5" dev="5" latency="0" speed="400000" port="1" guid="0x681e3efeffe604d6" maxconn="131073" gdr="1"/>
</nic>
</pci>
<pci busid="0000:96:00.0" class="0x060400" vendor="0x1022" device="0x1500" subsystem_vendor="0x1022" subsystem_device="0x1500" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:98:00.0" class="0x120000" vendor="0x1002" device="0x75a0" subsystem_vendor="0x1002" subsystem_device="0x75a0" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="5" sm="256" gcn="gfx950" arch="38911" rank="5" gdr="1">
<xgmi target="0000:08:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:18:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:68:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:78:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:88:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:e8:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:f9:00.0" count="1" tclass="0x120000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:e1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1d9b" subsystem_device="0xcc00" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e3:00.0" class="0x020000" vendor="0x14e4" device="0x1760" subsystem_vendor="0x14e4" subsystem_device="0xf323" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="bnxt_re6" dev="6" latency="0" speed="400000" port="1" guid="0xd0373efeffe604d6" maxconn="131073" gdr="1"/>
</nic>
</pci>
<pci busid="0000:e6:00.0" class="0x060400" vendor="0x1022" device="0x1500" subsystem_vendor="0x1022" subsystem_device="0x1500" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e8:00.0" class="0x120000" vendor="0x1002" device="0x75a0" subsystem_vendor="0x1002" subsystem_device="0x75a0" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="6" sm="256" gcn="gfx950" arch="38911" rank="6" gdr="1">
<xgmi target="0000:08:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:18:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:68:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:78:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:88:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:98:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:f9:00.0" count="1" tclass="0x120000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:f1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1d9b" subsystem_device="0xcc00" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:f3:00.0" class="0x020000" vendor="0x14e4" device="0x1760" subsystem_vendor="0x14e4" subsystem_device="0xf323" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="bnxt_re7" dev="7" latency="0" speed="400000" port="1" guid="0xe84a3efeffe604d6" maxconn="131073" gdr="1"/>
</nic>
</pci>
<pci busid="0000:f7:00.0" class="0x060400" vendor="0x1022" device="0x1500" subsystem_vendor="0x1022" subsystem_device="0x1500" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:f9:00.0" class="0x120000" vendor="0x1002" device="0x75a0" subsystem_vendor="0x1002" subsystem_device="0x75a0" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="7" sm="256" gcn="gfx950" arch="38911" rank="7" gdr="1">
<xgmi target="0000:08:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:18:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:68:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:78:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:88:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:98:00.0" count="1" tclass="0x120000"/>
<xgmi target="0000:e8:00.0" count="1" tclass="0x120000"/>
</gpu>
</pci>
</pci>
</pci>
</cpu>
</system>
+2
Просмотреть файл
@@ -136,6 +136,8 @@ NodeModelDesc model_descs[] = {
{"topo_8p_942vm.xml", " 8gfx942 1H7XGMI 8NIC 2Intel B"},
{"topo_16p_gio-1s-1rp-cascade.xml", "16gfx942 2H7XGMI 1NIC 2AMD A"},
{"topo_16p_gio-3s-1rp-split-flat.xml", "16gfx942 2H7XGMI 1NIC 2AMD B"},
// GFX 950
{"topo_8p_950.xml", " 8gfx950 1H7XGMI 8NIC 2AMD A"},
};
NCCL_PARAM(MaxCTAs, "MAX_CTAS", MAXCHANNELS);