Update MSCCL++ register/deregister (#1523)

* erase handle key from mscclpp communicator during deregistration

* remove the buffer-size checks (size being a multiple of 32 and not exceeding the MSCCL++ threshold) from the registration/deregistration routines, since these checks are already applied during enqueue

* add a check that the buffer size is greater than zero in MSCCL++ registration
Tento commit je obsažen v:
isaki001
2025-02-04 09:09:56 -06:00
odevzdal GitHub
rodič 5804603632
revize 19105206f6
2 změnil soubory, kde provedl 3 přidání a 3 odebrání
+1 -1
Zobrazit soubor
@@ -116,7 +116,7 @@ index 022d398..468fcf2 100644
+ if (outIt != comm->channelOutInfos.end()) {
+ comm->channelOutInfos.erase(outIt);
+ }
+
+ comm->handleKeys.erase(handle);
+ free(handle);
+ }
+ return ncclSuccess;
+2 -2
Zobrazit soubor
@@ -161,7 +161,7 @@ ncclResult_t ncclCommRegister_impl(const ncclComm_t comm, void* buff, size_t siz
NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm"));
if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(buff, comm, "buff", "ncclCommRegister"));
#ifdef ENABLE_MSCCLPP
- if (comm->mscclCompatible && size > 0 && (size & 31) == 0 && size <= comm->mscclpp_threshold){
+ if (comm->mscclCompatible && size > 0){
bool isManagedBuffer = false;
CUDACHECK(hipPointerGetAttribute(&isManagedBuffer, HIP_POINTER_ATTRIBUTE_IS_MANAGED, const_cast<void*>(buff)));
if(!isManagedBuffer){
@@ -184,7 +184,7 @@ ncclResult_t ncclCommDeregister_impl(const ncclComm_t comm, void* handle) {
#ifdef ENABLE_MSCCLPP
const size_t size = mscclpp_BufferSize(comm->mscclpp_comm, handle);
- if (comm->mscclCompatible && size > 0 && (size & 31) == 0 && size <= comm->mscclpp_threshold) {
+ if (comm->mscclCompatible && size > 0) {
NCCLCHECK(mscclpp_ncclCommDeregister(comm->mscclpp_comm, handle));
return ncclSuccess;
}