Update MSCCL++ register/deregister (#1523)
* Erase the handle key from the MSCCL++ communicator during deregistration
* Remove the checks that the buffer size is a multiple of 32 from the registration/deregistration routines, since these checks are applied during enqueue
* Add a check that the buffer size is greater than zero in MSCCL++ registration
[ROCm/rccl commit: 19105206f6]
Este commit está contenido en:
@@ -116,7 +116,7 @@ index 022d398..468fcf2 100644
|
||||
+ if (outIt != comm->channelOutInfos.end()) {
|
||||
+ comm->channelOutInfos.erase(outIt);
|
||||
+ }
|
||||
+
|
||||
+ comm->handleKeys.erase(handle);
|
||||
+ free(handle);
|
||||
+ }
|
||||
+ return ncclSuccess;
|
||||
|
||||
@@ -161,7 +161,7 @@ ncclResult_t ncclCommRegister_impl(const ncclComm_t comm, void* buff, size_t siz
|
||||
NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm"));
|
||||
if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(buff, comm, "buff", "ncclCommRegister"));
|
||||
#ifdef ENABLE_MSCCLPP
|
||||
if (comm->mscclCompatible && size > 0 && (size & 31) == 0 && size <= comm->mscclpp_threshold){
|
||||
if (comm->mscclCompatible && size > 0){
|
||||
bool isManagedBuffer = false;
|
||||
CUDACHECK(hipPointerGetAttribute(&isManagedBuffer, HIP_POINTER_ATTRIBUTE_IS_MANAGED, const_cast<void*>(buff)));
|
||||
if(!isManagedBuffer){
|
||||
@@ -184,7 +184,7 @@ ncclResult_t ncclCommDeregister_impl(const ncclComm_t comm, void* handle) {
|
||||
|
||||
#ifdef ENABLE_MSCCLPP
|
||||
const size_t size = mscclpp_BufferSize(comm->mscclpp_comm, handle);
|
||||
if (comm->mscclCompatible && size > 0 && (size & 31) == 0 && size <= comm->mscclpp_threshold) {
|
||||
if (comm->mscclCompatible && size > 0) {
|
||||
NCCLCHECK(mscclpp_ncclCommDeregister(comm->mscclpp_comm, handle));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
Referencia en una nueva incidencia
Block a user