Update MSCCL++ register/deregister (#1523)

* erase handle key from mscclpp communicator during deregistration

* remove the check that the buffer size is a multiple of 32 from the registration/deregistration routines, since that check is applied during enqueue

* add a check that the buffer size is greater than zero in mscclpp registration

[ROCm/rccl commit: 19105206f6]
This commit is contained in:
isaki001
2025-02-04 09:09:56 -06:00
committed by GitHub
parent e171f59719
commit d2b5ba80a7
2 changed files with 3 additions and 3 deletions
+1 -1
View File
@@ -116,7 +116,7 @@ index 022d398..468fcf2 100644
+ if (outIt != comm->channelOutInfos.end()) {
+ comm->channelOutInfos.erase(outIt);
+ }
+
+ comm->handleKeys.erase(handle);
+ free(handle);
+ }
+ return ncclSuccess;
+2 -2
View File
@@ -161,7 +161,7 @@ ncclResult_t ncclCommRegister_impl(const ncclComm_t comm, void* buff, size_t siz
NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm"));
if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(buff, comm, "buff", "ncclCommRegister"));
#ifdef ENABLE_MSCCLPP
if (comm->mscclCompatible && size > 0 && (size & 31) == 0 && size <= comm->mscclpp_threshold){
if (comm->mscclCompatible && size > 0){
bool isManagedBuffer = false;
CUDACHECK(hipPointerGetAttribute(&isManagedBuffer, HIP_POINTER_ATTRIBUTE_IS_MANAGED, const_cast<void*>(buff)));
if(!isManagedBuffer){
@@ -184,7 +184,7 @@ ncclResult_t ncclCommDeregister_impl(const ncclComm_t comm, void* handle) {
#ifdef ENABLE_MSCCLPP
const size_t size = mscclpp_BufferSize(comm->mscclpp_comm, handle);
if (comm->mscclCompatible && size > 0 && (size & 31) == 0 && size <= comm->mscclpp_threshold) {
if (comm->mscclCompatible && size > 0) {
NCCLCHECK(mscclpp_ncclCommDeregister(comm->mscclpp_comm, handle));
return ncclSuccess;
}