From 3be905ca832deb8e18ad930e5c1a20fae2400da0 Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Thu, 27 Feb 2025 16:20:32 -0800 Subject: [PATCH] Improve RDMA flushing by write dummy payload with RO=0 (#1570) * Improve RDMA flushing by write dummy payload with RO=0 * Rename env var for disabling this change to RCCL_GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING [ROCm/rccl commit: 60c1264d27debb6e52286032e002e5e7d9624919] --- projects/rccl/src/transport/net_ib.cc | 44 ++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/projects/rccl/src/transport/net_ib.cc b/projects/rccl/src/transport/net_ib.cc index 231ef40f5d..62f3066abb 100644 --- a/projects/rccl/src/transport/net_ib.cc +++ b/projects/rccl/src/transport/net_ib.cc @@ -947,6 +947,8 @@ static_assert((offsetof(struct ncclIbSendComm, wrs) % 32) == 0, "wrs must be 32- struct ncclIbGpuFlush { struct ibv_mr* hostMr; + struct ibv_mr* gpuMr; + int* gpuFlushGpuMem; struct ibv_sge sge; struct ncclIbQp qp; }; @@ -1324,6 +1326,7 @@ ib_send_ready: } NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0); +RCCL_PARAM(IbGdrFlushGpuMemNoRelaxedOrdering, "GDR_FLUSH_GPU_MEM_NO_RELAXED_ORDERING", 1); ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm; @@ -1452,11 +1455,22 @@ ib_recv: // Allocate Flush dummy buffer for GPU Direct RDMA if (rComm->flushEnabled) { + if (rcclParamIbGdrFlushGpuMemNoRelaxedOrdering()) { +#if defined(HIP_UNCACHED_MEMORY) + NCCLCHECK(ncclCudaCalloc(&rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int), nullptr, hipDeviceMallocUncached)); +#else + NCCLCHECK(ncclCudaCalloc(&rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int), nullptr, hipDeviceMallocFinegrained)); +#endif + NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->gpuFlush.gpuMr, rCommDev->base.pd, rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); + } else { + rCommDev->gpuFlush.gpuFlushGpuMem = nullptr; + rCommDev->gpuFlush.gpuMr = nullptr; + } NCCLCHECK(wrap_ibv_reg_mr(&rCommDev->gpuFlush.hostMr, rCommDev->base.pd, &rComm->gpuFlushHostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE)); rCommDev->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlushHostMem; rCommDev->gpuFlush.sge.length = 1; rCommDev->gpuFlush.sge.lkey = rCommDev->gpuFlush.hostMr->lkey; - NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rCommDev->gpuFlush.qp)); + NCCLCHECK(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE, &rCommDev->gpuFlush.qp)); struct ncclIbDevInfo devInfo; devInfo.lid = ibDev->portAttr.lid; devInfo.link_layer = ibDev->portAttr.link_layer; @@ -2008,9 +2022,25 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** struct ibv_send_wr wr; memset(&wr, 0, sizeof(wr)); wr.wr_id = req - comm->base.reqs; - - wr.wr.rdma.remote_addr = (uint64_t)data[last]; - wr.wr.rdma.rkey = mhandle->mrs[i]->rkey; + if (rcclParamIbGdrFlushGpuMemNoRelaxedOrdering()) { + wr.wr.rdma.remote_addr = (uint64_t)(comm->devs[i].gpuFlush.gpuFlushGpuMem); + wr.wr.rdma.rkey = comm->devs[i].gpuFlush.gpuMr->rkey; + wr.sg_list = &comm->devs[i].gpuFlush.sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_RDMA_WRITE; + wr.send_flags = 0; + struct ibv_send_wr* bad_wr; + NCCLCHECK(wrap_ibv_post_send(comm->devs[i].gpuFlush.qp.qp, &wr, &bad_wr)); + } + memset(&wr, 0, sizeof(wr)); + wr.wr_id = req - comm->base.reqs; + if (rcclParamIbGdrFlushGpuMemNoRelaxedOrdering()) { + wr.wr.rdma.remote_addr = (uint64_t)(comm->devs[i].gpuFlush.gpuFlushGpuMem); + wr.wr.rdma.rkey = comm->devs[i].gpuFlush.gpuMr->rkey; + } else { + wr.wr.rdma.remote_addr = (uint64_t)data[last]; + wr.wr.rdma.rkey = mhandle->mrs[i]->rkey; + } wr.sg_list = &comm->devs[i].gpuFlush.sge; wr.num_sge = 1; wr.opcode = IBV_WR_RDMA_READ; @@ -2148,6 +2178,12 @@ ncclResult_t ncclIbCloseRecv(void* recvComm) { for (int i = 0; i < comm->base.ndevs; i++) { struct ncclIbRecvCommDev* commDev = comm->devs + i; if (comm->flushEnabled) { + if (commDev->gpuFlush.gpuFlushGpuMem != nullptr) { + NCCLCHECK(ncclCudaFree(commDev->gpuFlush.gpuFlushGpuMem)); + commDev->gpuFlush.gpuFlushGpuMem = nullptr; + if (commDev->gpuFlush.gpuMr != nullptr) NCCLCHECK(wrap_ibv_dereg_mr(commDev->gpuFlush.gpuMr)); + commDev->gpuFlush.gpuMr = nullptr; + } if (commDev->gpuFlush.qp.qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(commDev->gpuFlush.qp.qp)); if (commDev->gpuFlush.hostMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(commDev->gpuFlush.hostMr)); }