Fix XGMI detection (#699)

* Fix XGMI detection

* Increase stack size

* Temporarily disable signal handler in CI

[Process: 17281] Inside handler function signal: Segmentation fault (11)
BFD: DWARF error: section .debug_info is larger than its filesize! (0x93ef57 vs 0x530ea0)
BFD: DWARF error: section .debug_info is larger than its filesize! (0x93ef57 vs 0x530ea0)

[ROCm/rccl commit: 22b81fbaae]
This commit is contained in:
Wenkai Du
2023-03-08 14:08:07 -08:00
committed by GitHub
parent 62f5e6a82f
commit 216c83c39d
3 changed files with 3 additions and 3 deletions
+1 -1
View File
@@ -23,7 +23,7 @@ def runTestCommand (platform, project, gfilter)
cd ${project.paths.project_build_prefix}/build/release/test
${sudo} ulimit -l unlimited
ulimit -a
${sudo} RCCL_ENABLE_SIGNALHANDLER=1 NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./rccl-UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes
${sudo} RCCL_ENABLE_SIGNALHANDLER=0 NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./rccl-UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes
"""
platform.runCommand(this, command)
+1 -1
View File
@@ -1187,7 +1187,7 @@ ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, in
struct ncclTopoNode* remNode = link->remNode;
if (remNode->gpu.dev == cudaDev2) {
*isXGMI = (link->type == LINK_NVL);
return ncclSuccess;
if (*isXGMI) return ncclSuccess;
}
}
}
+1 -1
View File
@@ -1431,7 +1431,7 @@ fail:
#ifdef USE_INDIRECT_FUNCTION_CALL
NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 1);
RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 256);
RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 512);
#else
NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0);
RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 0);