Fail the job if flag HIP_HOST_UNCACHED_MEMORY is not set on MI350x (#2023)
* Fail the job if the compiler flag HIP_HOST_UNCACHED_MEMORY is not enabled on MI350x
Place the check after initTransportsRank, because the GPU arch info in comm->topo->nodes is populated after that call.
* Update src/init.cc to use ERROR instead of WARN
Co-authored-by: Nilesh M Negi <Nilesh.Negi@amd.com>
[ROCm/rccl commit: 05f914c997]
Este commit está contenido en:
@@ -155,6 +155,22 @@ ncclResult_t checkHsaEnvSetting() {
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Fail the job if build flag HIP_HOST_UNCACHED_MEMORY is not set on mi350x
|
||||
ncclResult_t checkHostUncacheMemSetting(struct ncclComm* comm) {
|
||||
#if defined(HIP_HOST_UNCACHED_MEMORY)
|
||||
return ncclSuccess;
|
||||
#else
|
||||
if( IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950") ){
|
||||
ERROR("Build flag HIP_HOST_UNCACHED_MEMORY must be set to avoid memory corruption on mi350x");
|
||||
return ncclSystemError;
|
||||
}
|
||||
else {
|
||||
return ncclSuccess;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void initOnceFunc() {
|
||||
NCCLCHECKGOTO(checkHsaEnvSetting(), initResult, exit);
|
||||
initEnv();
|
||||
@@ -2051,7 +2067,10 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
comm->cuCount = cuCount;
|
||||
|
||||
NCCLCHECKGOTO(initTransportsRank(comm, job->parent, timers), res, fail);
|
||||
|
||||
|
||||
// Check if using host uncached mem correctly
|
||||
NCCLCHECK(checkHostUncacheMemSetting(comm));
|
||||
|
||||
// RCCL: determine and set unroll factor for comm
|
||||
NCCLCHECK(commSetUnrollFactor(comm));
|
||||
|
||||
|
||||
Referencia en una nueva incidencia
Block a user