From 626608c17238ddfac24002a2e15d90f5fa7fc0f6 Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Sun, 7 Jan 2024 20:25:02 -0800 Subject: [PATCH] Increase stack size for gfx906 (#1034) Occasionally "Memory access fault by GPU node-8 (Agent handle: 0x23a5640) on address 0x7f461ec00000. Reason: Page not present or supervisor privilege" can be seen from gfx906 CI [ROCm/rccl commit: e5bf56c6d85b58203ee728fbc3958f3394b22902] --- projects/rccl/src/init.cc | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index cdf0389b2d..cddc7db838 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -1602,11 +1602,11 @@ fail: #ifdef USE_INDIRECT_FUNCTION_CALL NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 1); -RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 512); #else NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0); -RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 0); #endif +RCCL_PARAM(StackSizeOverride, "STACK_SIZE_OVERRIDE", 0); + NCCL_PARAM(CGAClusterSize, "CGA_CLUSTER_SIZE", NCCL_CONFIG_UNDEF_INT); // Match config max/minCTAs NCCL_PARAM(MaxCTAs, "MAX_CTAS", NCCL_CONFIG_UNDEF_INT); @@ -1688,7 +1688,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { int cudaDev = job->cudaDev; int* parentRanks = NULL; int cudaArch; - int64_t stackSize = rcclParamStackSizeOverride() ? rcclParamStackSizeOverride() : maxLocalSizeBytes; + int64_t stackSize; hipDeviceProp_t devProp; CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail); @@ -1701,7 +1701,14 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { // a CUDA memory reconfig on load (c.f. 
NVSHMEM issue) #ifdef USE_INDIRECT_FUNCTION_CALL CUDACHECK(hipGetDeviceProperties(&devProp, 0)); - if (stackSize > 0 && ncclParamSetStackSize() == 1 && strcmp(devProp.gcnArchName,"gfx940") != 0 && strcmp(devProp.gcnArchName, "gfx941") != 0 && strcmp(devProp.gcnArchName, "gfx942") != 0) { + if (ncclParamSetStackSize() == 1 && !IsArchMatch(devProp.gcnArchName,"gfx94")) { + stackSize = rcclParamStackSizeOverride() ? rcclParamStackSizeOverride() : maxLocalSizeBytes; + if (stackSize == 0) { + if (IsArchMatch(devProp.gcnArchName,"gfx906")) + stackSize = 1024; + else + stackSize = 512; + } INFO(NCCL_INIT, "Setting cudaLimitStackSize to %zi maxLocalSizeBytes %zi", stackSize, maxLocalSizeBytes); CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, stackSize)); }