diff --git a/projects/rccl/src/include/comm.h b/projects/rccl/src/include/comm.h index 9556994064..694fee1083 100644 --- a/projects/rccl/src/include/comm.h +++ b/projects/rccl/src/include/comm.h @@ -747,7 +747,7 @@ struct ncclComm { int unroll; // custom collective [RCCL] bool enableCustColl; - // gfx name from hipDeviceProp_t [RCCL] + // gfx name from hipDeviceProp_t [RCCL]. Heap memory owned by comm: allocated in ncclCommInitRankFunc, released with free() char* archName; // multiProcessorCount from hipDeviceProp_t [RCCL] int cuCount; diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index def51b40ea..130dd60af6 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -612,6 +612,7 @@ skip_profiling: free(comm->topParentRanks); free(comm->topParentLocalRanks); free(comm->gproxyConn); + free(comm->archName); NCCLCHECK(ncclRegCleanup(comm)); @@ -2149,12 +2150,12 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { CUDACHECKGOTO(hipGetDeviceProperties(&devProp, cudaDev), res, fail); cuCount = devProp.multiProcessorCount; - archName = (char*)malloc(strlen(devProp.gcnArchName) + 1); - if (archName == nullptr) { - WARN("Failed to allocate memory for architecture name"); + archName = strdup(devProp.gcnArchName); + if (!archName) { + res = ncclSystemError; + WARN("strdup failed for architecture name"); goto fail; } - strcpy(archName, devProp.gcnArchName); timers[TIMER_INIT_KERNELS] = clockNano(); NCCLCHECK(ncclInitKernelsForDevice(cudaArch, maxSharedMem, &maxLocalSizeBytes));