diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 4a511633ec..eb936ac9bd 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -155,6 +155,22 @@ ncclResult_t checkHsaEnvSetting() { } return ncclSuccess; } + +// Fail the job if build flag HIP_HOST_UNCACHED_MEMORY is not set on mi350x +ncclResult_t checkHostUncacheMemSetting(struct ncclComm* comm) { + #if defined(HIP_HOST_UNCACHED_MEMORY) + return ncclSuccess; + #else + if( IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950") ){ + ERROR("Build flag HIP_HOST_UNCACHED_MEMORY must be set to avoid memory corruption on mi350x"); + return ncclSystemError; + } + else { + return ncclSuccess; + } + #endif +} + static void initOnceFunc() { NCCLCHECKGOTO(checkHsaEnvSetting(), initResult, exit); initEnv(); @@ -2051,7 +2067,10 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { comm->cuCount = cuCount; NCCLCHECKGOTO(initTransportsRank(comm, job->parent, timers), res, fail); - + + // Check if using host uncached mem correctly + NCCLCHECK(checkHostUncacheMemSetting(comm)); + // RCCL: determine and set unroll factor for comm NCCLCHECK(commSetUnrollFactor(comm));