From 216c83c39dd30596bbcad413e469ed6e81bf3753 Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Wed, 8 Mar 2023 14:08:07 -0800 Subject: [PATCH] Fix XGMI detection (#699) * Fix XGMI detection * Increase stack size * Temporarily disable signal handler in CI [Process: 17281] Inside handler function signal: Segmentation fault (11) BFD: DWARF error: section .debug_info is larger than its filesize! (0x93ef57 vs 0x530ea0) BFD: DWARF error: section .debug_info is larger than its filesize! (0x93ef57 vs 0x530ea0) [ROCm/rccl commit: 22b81fbaaec48aa1a8b8eb82220315c96ef74f5a] --- projects/rccl/.jenkins/common.groovy | 2 +- projects/rccl/src/graph/search.cc | 2 +- projects/rccl/src/init.cc | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/projects/rccl/.jenkins/common.groovy b/projects/rccl/.jenkins/common.groovy index 827dde71ff..c24ec7151b 100644 --- a/projects/rccl/.jenkins/common.groovy +++ b/projects/rccl/.jenkins/common.groovy @@ -23,7 +23,7 @@ def runTestCommand (platform, project, gfilter) cd ${project.paths.project_build_prefix}/build/release/test ${sudo} ulimit -l unlimited ulimit -a - ${sudo} RCCL_ENABLE_SIGNALHANDLER=1 NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./rccl-UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes + ${sudo} RCCL_ENABLE_SIGNALHANDLER=0 NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./rccl-UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes """ platform.runCommand(this, command) diff --git a/projects/rccl/src/graph/search.cc b/projects/rccl/src/graph/search.cc index f680619ab5..5a431c17ed 100644 --- a/projects/rccl/src/graph/search.cc +++ b/projects/rccl/src/graph/search.cc @@ -1187,7 +1187,7 @@ ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, in struct ncclTopoNode* remNode = link->remNode; if (remNode->gpu.dev == cudaDev2) { *isXGMI = (link->type == LINK_NVL); - return ncclSuccess; + if (*isXGMI) return ncclSuccess; } 
} } diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index c0b856e12f..28c11023e1 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -1431,7 +1431,7 @@ fail: #ifdef USE_INDIRECT_FUNCTION_CALL NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 1); -RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 256); +RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 512); #else NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0); RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 0);