From 23870ceccd23898ca9aa26f1be22a967c6c0ff0c Mon Sep 17 00:00:00 2001 From: Dingming Wu Date: Mon, 10 Nov 2025 09:54:35 -0800 Subject: [PATCH] Fail the job if flag HIP_HOST_UNCACHED_MEMORY is not set on MI350x (#2023) * Fail the job if the build flag HIP_HOST_UNCACHED_MEMORY is not enabled on MI350x. Place the check after initTransportsRank, because the GPU arch info in comm->topo->nodes is populated only after that call. * Update src/init.cc to use ERROR instead of WARN Co-authored-by: Nilesh M Negi [ROCm/rccl commit: 05f914c9979df426a277822e067d4a20d13bd55d] --- projects/rccl/src/init.cc | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 4a511633ec..eb936ac9bd 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -155,6 +155,22 @@ ncclResult_t checkHsaEnvSetting() { } return ncclSuccess; } + +// Fail the job if build flag HIP_HOST_UNCACHED_MEMORY is not set on mi350x +ncclResult_t checkHostUncacheMemSetting(struct ncclComm* comm) { + #if defined(HIP_HOST_UNCACHED_MEMORY) + return ncclSuccess; + #else + if( IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950") ){ + ERROR("Build flag HIP_HOST_UNCACHED_MEMORY must be set to avoid memory corruption on mi350x"); + return ncclSystemError; + } + else { + return ncclSuccess; + } + #endif +} + static void initOnceFunc() { NCCLCHECKGOTO(checkHsaEnvSetting(), initResult, exit); initEnv(); @@ -2051,7 +2067,10 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { comm->cuCount = cuCount; NCCLCHECKGOTO(initTransportsRank(comm, job->parent, timers), res, fail); - + + // Check if using host uncached mem correctly + NCCLCHECK(checkHostUncacheMemSetting(comm)); + // RCCL: determine and set unroll factor for comm NCCLCHECK(commSetUnrollFactor(comm));