Fail the job if flag HIP_HOST_UNCACHED_MEMORY is not set on MI350x (#2023)

* Fail the job if the compiler flag HIP_HOST_UNCACHED_MEMORY is not enabled on MI350x
Place the check after initTransportsRank, because the GPU arch info in comm->topo->nodes is populated only after that call.

* Update src/init.cc to use ERROR instead of WARN
Co-authored-by: Nilesh M Negi <Nilesh.Negi@amd.com>

[ROCm/rccl commit: 05f914c997]
Este commit está contenido en:
Dingming Wu
2025-11-10 09:54:35 -08:00
cometido por GitHub
padre c601f9b3f8
commit 23870ceccd
+20 -1
Ver fichero
@@ -155,6 +155,22 @@ ncclResult_t checkHsaEnvSetting() {
}
return ncclSuccess;
}
// Refuse to run on mi350x (gfx950) when the library was compiled without
// HIP_HOST_UNCACHED_MEMORY.
//
// comm: communicator whose topology is inspected; only the first GPU node's
//       arch string (comm->topo->nodes[GPU].nodes[0].gpu.gcn) is examined.
//
// Returns ncclSuccess when the build flag is defined, or when the first GPU
// does not match "gfx950". Otherwise logs an error and returns ncclSystemError.
ncclResult_t checkHostUncacheMemSetting(struct ncclComm* comm) {
#if !defined(HIP_HOST_UNCACHED_MEMORY)
  // The arch check is compiled in only when the flag is missing; with the
  // flag present the function reduces to the unconditional success below.
  if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) {
    ERROR("Build flag HIP_HOST_UNCACHED_MEMORY must be set to avoid memory corruption on mi350x");
    return ncclSystemError;
  }
#endif
  return ncclSuccess;
}
static void initOnceFunc() {
NCCLCHECKGOTO(checkHsaEnvSetting(), initResult, exit);
initEnv();
@@ -2051,7 +2067,10 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
comm->cuCount = cuCount;
NCCLCHECKGOTO(initTransportsRank(comm, job->parent, timers), res, fail);
// Check if using host uncached mem correctly
NCCLCHECK(checkHostUncacheMemSetting(comm));
// RCCL: determine and set unroll factor for comm
NCCLCHECK(commSetUnrollFactor(comm));