Improve RDMA flushing by write dummy payload with RO=0 (#1570)

* Improve RDMA flushing by write dummy payload with RO=0

* Rename env var for disabling this change to RCCL_GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING

[ROCm/rccl commit: 60c1264d27]
This commit is contained in:
Wenkai Du
2025-02-27 16:20:32 -08:00
committed by GitHub
parent a3f44b4a10
commit 3be905ca83
+40 -4
View File
@@ -947,6 +947,8 @@ static_assert((offsetof(struct ncclIbSendComm, wrs) % 32) == 0, "wrs must be 32-
struct ncclIbGpuFlush {
struct ibv_mr* hostMr;
struct ibv_mr* gpuMr;
int* gpuFlushGpuMem;
struct ibv_sge sge;
struct ncclIbQp qp;
};
@@ -1324,6 +1326,7 @@ ib_send_ready:
}
NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0);
RCCL_PARAM(IbGdrFlushGpuMemNoRelaxedOrdering, "GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING", 1);
ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) {
struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm;
@@ -1452,11 +1455,22 @@ ib_recv:
// Allocate Flush dummy buffer for GPU Direct RDMA
if (rComm->flushEnabled) {
if (rcclParamIbGdrFlushGpuMemNoRelaxedOrdering()) {
#if defined(HIP_UNCACHED_MEMORY)
NCCLCHECK(ncclCudaCalloc(&rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int), nullptr, hipDeviceMallocUncached));
#else
NCCLCHECK(ncclCudaCalloc(&rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int), nullptr, hipDeviceMallocFinegrained));
#endif
NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->gpuFlush.gpuMr, rCommDev->base.pd, rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
} else {
rCommDev->gpuFlush.gpuFlushGpuMem = nullptr;
rCommDev->gpuFlush.gpuMr = nullptr;
}
NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->gpuFlush.hostMr, rCommDev->base.pd, &rComm->gpuFlushHostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE));
rCommDev->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlushHostMem;
rCommDev->gpuFlush.sge.length = 1;
rCommDev->gpuFlush.sge.lkey = rCommDev->gpuFlush.hostMr->lkey;
NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rCommDev->gpuFlush.qp));
NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE, &rCommDev->gpuFlush.qp));
struct ncclIbDevInfo devInfo;
devInfo.lid = ibDev->portAttr.lid;
devInfo.link_layer = ibDev->portAttr.link_layer;
@@ -2008,9 +2022,25 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void**
struct ibv_send_wr wr;
memset(&wr, 0, sizeof(wr));
wr.wr_id = req - comm->base.reqs;
wr.wr.rdma.remote_addr = (uint64_t)data[last];
wr.wr.rdma.rkey = mhandle->mrs[i]->rkey;
if (rcclParamIbGdrFlushGpuMemNoRelaxedOrdering()) {
wr.wr.rdma.remote_addr = (uint64_t)(comm->devs[i].gpuFlush.gpuFlushGpuMem);
wr.wr.rdma.rkey = comm->devs[i].gpuFlush.gpuMr->rkey;
wr.sg_list = &comm->devs[i].gpuFlush.sge;
wr.num_sge = 1;
wr.opcode = IBV_WR_RDMA_WRITE;
wr.send_flags = 0;
struct ibv_send_wr* bad_wr;
NCCLCHECK(wrap_ibv_post_send(comm->devs[i].gpuFlush.qp.qp, &wr, &bad_wr));
}
memset(&wr, 0, sizeof(wr));
wr.wr_id = req - comm->base.reqs;
if (rcclParamIbGdrFlushGpuMemNoRelaxedOrdering()) {
wr.wr.rdma.remote_addr = (uint64_t)(comm->devs[i].gpuFlush.gpuFlushGpuMem);
wr.wr.rdma.rkey = comm->devs[i].gpuFlush.gpuMr->rkey;
} else {
wr.wr.rdma.remote_addr = (uint64_t)data[last];
wr.wr.rdma.rkey = mhandle->mrs[i]->rkey;
}
wr.sg_list = &comm->devs[i].gpuFlush.sge;
wr.num_sge = 1;
wr.opcode = IBV_WR_RDMA_READ;
@@ -2148,6 +2178,12 @@ ncclResult_t ncclIbCloseRecv(void* recvComm) {
for (int i = 0; i < comm->base.ndevs; i++) {
struct ncclIbRecvCommDev* commDev = comm->devs + i;
if (comm->flushEnabled) {
if (commDev->gpuFlush.gpuFlushGpuMem != nullptr) {
NCCLCHECK(ncclCudaFree(commDev->gpuFlush.gpuFlushGpuMem));
commDev->gpuFlush.gpuFlushGpuMem = nullptr;
if (commDev->gpuFlush.gpuMr != nullptr) NCCLCHECK(wrap_ibv_dereg_mr(commDev->gpuFlush.gpuMr));
commDev->gpuFlush.gpuMr = nullptr;
}
if (commDev->gpuFlush.qp.qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(commDev->gpuFlush.qp.qp));
if (commDev->gpuFlush.hostMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(commDev->gpuFlush.hostMr));
}