Fix XGMI detection (#699)
* Fix XGMI detection
* Increase stack size
* Temporarily disable signal handler in CI
[Process: 17281] Inside handler function signal: Segmentation fault (11)
BFD: DWARF error: section .debug_info is larger than its filesize! (0x93ef57 vs 0x530ea0)
BFD: DWARF error: section .debug_info is larger than its filesize! (0x93ef57 vs 0x530ea0)
[ROCm/rccl commit: 22b81fbaae]
This commit is contained in:
@@ -23,7 +23,7 @@ def runTestCommand (platform, project, gfilter)
|
||||
cd ${project.paths.project_build_prefix}/build/release/test
|
||||
${sudo} ulimit -l unlimited
|
||||
ulimit -a
|
||||
${sudo} RCCL_ENABLE_SIGNALHANDLER=1 NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./rccl-UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes
|
||||
${sudo} RCCL_ENABLE_SIGNALHANDLER=0 NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./rccl-UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes
|
||||
"""
|
||||
|
||||
platform.runCommand(this, command)
|
||||
|
||||
@@ -1187,7 +1187,7 @@ ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, in
|
||||
struct ncclTopoNode* remNode = link->remNode;
|
||||
if (remNode->gpu.dev == cudaDev2) {
|
||||
*isXGMI = (link->type == LINK_NVL);
|
||||
return ncclSuccess;
|
||||
if (*isXGMI) return ncclSuccess;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1431,7 +1431,7 @@ fail:
|
||||
|
||||
#ifdef USE_INDIRECT_FUNCTION_CALL
|
||||
NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 1);
|
||||
RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 256);
|
||||
RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 512);
|
||||
#else
|
||||
NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0);
|
||||
RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 0);
|
||||
|
||||
Reference in New Issue
Block a user