Increase stack size for gfx906 (#1034)

Occationally "Memory access fault by GPU node-8 (Agent handle: 0x23a5640) on address 0x7f461ec00000. Reason: Page not present or supervisor privilege" can be seen from gfx906 CI

[ROCm/rccl commit: e5bf56c6d8]
Este commit está contenido en:
Wenkai Du
2024-01-07 20:25:02 -08:00
cometido por GitHub
padre 1b39fef32a
commit 626608c172
+11 -4
Ver fichero
@@ -1602,11 +1602,11 @@ fail:
#ifdef USE_INDIRECT_FUNCTION_CALL
NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 1);
RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 512);
#else
NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0);
RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 0);
#endif
RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 0);
NCCL_PARAM(CGAClusterSize, "CGA_CLUSTER_SIZE", NCCL_CONFIG_UNDEF_INT);
// Match config max/minCTAs
NCCL_PARAM(MaxCTAs, "MAX_CTAS", NCCL_CONFIG_UNDEF_INT);
@@ -1688,7 +1688,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
int cudaDev = job->cudaDev;
int* parentRanks = NULL;
int cudaArch;
int64_t stackSize = rcclParamStackSizeOverride() ? rcclParamStackSizeOverride() : maxLocalSizeBytes;
int64_t stackSize;
hipDeviceProp_t devProp;
CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail);
@@ -1701,7 +1701,14 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
// a CUDA memory reconfig on load (c.f. NVSHMEM issue)
#ifdef USE_INDIRECT_FUNCTION_CALL
CUDACHECK(hipGetDeviceProperties(&devProp, 0));
if (stackSize > 0 && ncclParamSetStackSize() == 1 && strcmp(devProp.gcnArchName,"gfx940") != 0 && strcmp(devProp.gcnArchName, "gfx941") != 0 && strcmp(devProp.gcnArchName, "gfx942") != 0) {
if (ncclParamSetStackSize() == 1 && !IsArchMatch(devProp.gcnArchName,"gfx94")) {
stackSize = rcclParamStackSizeOverride() ? rcclParamStackSizeOverride() : maxLocalSizeBytes;
if (stackSize == 0) {
if (IsArchMatch(devProp.gcnArchName,"gfx906"))
stackSize = 1024;
else
stackSize = 512;
}
INFO(NCCL_INIT, "Setting cudaLimitStackSize to %zi maxLocalSizeBytes %zi", stackSize, maxLocalSizeBytes);
CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, stackSize));
}