diff --git a/projects/rccl/src/collectives/device/all_gather.h b/projects/rccl/src/collectives/device/all_gather.h index 677435071e..98f7a7a4ef 100644 --- a/projects/rccl/src/collectives/device/all_gather.h +++ b/projects/rccl/src/collectives/device/all_gather.h @@ -31,7 +31,7 @@ __device__ void ncclAllGatherKernel(struct CollectiveArgs* args) { WaitFlag waitDoneFromNext(ring->send.conn.head, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS); WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLGATHER_SUBSTEPS); PostFlag postDoneToPrev(ring->recv.conn.head, ALLGATHER_SUBSTEPS, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS); + PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS, ring->hdp_reg); typedef Primitives Prims; diff --git a/projects/rccl/src/collectives/device/all_reduce.h b/projects/rccl/src/collectives/device/all_reduce.h index a323d4a70b..92b1aac182 100644 --- a/projects/rccl/src/collectives/device/all_reduce.h +++ b/projects/rccl/src/collectives/device/all_reduce.h @@ -31,7 +31,7 @@ __device__ void ncclAllReduceKernel(struct CollectiveArgs* args) { WaitFlag waitDoneFromNext(ring->send.conn.head, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS); WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLREDUCE_SUBSTEPS); PostFlag postDoneToPrev(ring->recv.conn.head, ALLREDUCE_SUBSTEPS, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS); + PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS, ring->hdp_reg); typedef Primitives Prims; diff --git a/projects/rccl/src/collectives/device/broadcast.h b/projects/rccl/src/collectives/device/broadcast.h index 0baef3f476..00a04bcc77 100644 --- a/projects/rccl/src/collectives/device/broadcast.h +++ b/projects/rccl/src/collectives/device/broadcast.h @@ -30,7 +30,7 @@ __device__ void ncclBroadcastKernel(struct CollectiveArgs* args) { WaitFlag waitDoneFromNext(ring->send.conn.head, (BROADCAST_BUFCHUNKS-1)*BROADCAST_SUBSTEPS); WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0); PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS); + PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS, ring->hdp_reg); typedef Primitives Prims; diff --git a/projects/rccl/src/collectives/device/primitives.h b/projects/rccl/src/collectives/device/primitives.h index 1ef5ac802e..0ad5ddeef7 100644 --- a/projects/rccl/src/collectives/device/primitives.h +++ b/projects/rccl/src/collectives/device/primitives.h @@ -44,11 +44,14 @@ class PostFlag { const int shift; volatile int * const fifo; const int fifo_size; + uint32_t * hdp_reg; public: __device__ - PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size) : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size) { } + PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size, uint32_t* hdp_reg = NULL) + : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size), hdp_reg(hdp_reg) { } + // remote writes can be reordered if we don't do s_waitcnt 0 + store to HDP between the data and flag __device__ - void post(uint64_t val) { STORE(flag, (val - shift)); } + void post(uint64_t val) { if (hdp_reg != NULL) STORE(hdp_reg, 0x1); STORE(flag, (val - shift)); } __device__ void postSize(uint64_t step, int size) { if (fifo != NULL) STORE(fifo + step%fifo_size, size); }; }; diff --git a/projects/rccl/src/collectives/device/reduce.h b/projects/rccl/src/collectives/device/reduce.h index 91eb83f60d..ce4e49fb68 100644 --- a/projects/rccl/src/collectives/device/reduce.h +++ b/projects/rccl/src/collectives/device/reduce.h @@ -27,7 +27,7 @@ __device__ void ncclReduceKernel(struct CollectiveArgs* args) { WaitFlag waitDoneFromNext(ring->send.conn.head, (REDUCE_BUFCHUNKS-1)*REDUCE_SUBSTEPS); WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0); PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS); + PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS, ring->hdp_reg); typedef Primitives Prims; diff --git a/projects/rccl/src/collectives/device/reduce_scatter.h b/projects/rccl/src/collectives/device/reduce_scatter.h index aad151211d..16dea5e1e7 100644 --- a/projects/rccl/src/collectives/device/reduce_scatter.h +++ b/projects/rccl/src/collectives/device/reduce_scatter.h @@ -28,7 +28,7 @@ __device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) { WaitFlag waitDoneFromNext(ring->send.conn.head, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS); WaitFlag waitReadyFromPrev(ring->recv.conn.tail, REDUCESCATTER_SUBSTEPS); PostFlag postDoneToPrev(ring->recv.conn.head, REDUCESCATTER_SUBSTEPS, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS); + PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS, ring->hdp_reg); typedef Primitives Prims; diff --git a/projects/rccl/src/include/core.h b/projects/rccl/src/include/core.h index 1e5950553b..b044560323 100644 --- a/projects/rccl/src/include/core.h +++ b/projects/rccl/src/include/core.h @@ -153,6 +153,11 @@ struct ncclRing { int* userRanks; int* devUserRanks; + // Next GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register + // allows software to explicitly initiate a flush read to HDP memory. See more + // descriptions in primitives.h. + uint32_t* hdp_reg; + // Operation list for aggregation struct ncclColl* collectives; struct ncclColl* devCollectives; diff --git a/projects/rccl/src/transport/p2p.cu b/projects/rccl/src/transport/p2p.cu index 065e2a7bc7..094511630e 100644 --- a/projects/rccl/src/transport/p2p.cu +++ b/projects/rccl/src/transport/p2p.cu @@ -118,11 +118,6 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin link_type_name[link_type], hops); link_status_print_once_mask |= (1 << (myInfo->cudaDev*8 + peerInfo->cudaDev)); } - if (link_type != HSA_AMD_LINK_INFO_TYPE_XGMI) { - // disable PCIe P2P until HDP flush is implemented. - p2p = 0; - return ncclSuccess; - } int nvlinkp2p = 0; if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI && hops == 1) nvlinkp2p = CONNECT_NVLINK; @@ -487,10 +482,69 @@ end: } while (0) /* Send: Create and return connect structures for this peer to connect to me */ +static ncclResult_t getGpuHdpReg(int cudaDev, uint32_t** hdp) { + auto convert_bdf = [](const char *busId) { + char bdf[9]; + strncpy(bdf, busId, 4); + strncpy(bdf+4, busId+5, 2); + strncpy(bdf+6, busId+8, 2); + bdf[8] = '\0'; + uint16_t id = (uint16_t)strtol(bdf, NULL, 16); + return id; + }; + + union find_agent_args { + hsa_agent_t agent; + uint16_t id; + } args; + + const auto& find_agent = [](hsa_agent_t agent, void* arg) { + uint16_t id = ((union find_agent_args *)arg)->id; + hsa_device_type_t type; + hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, (void*)&type); + if(type == HSA_DEVICE_TYPE_GPU) { + uint16_t bdf_id = 1; + hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &bdf_id); + if(bdf_id == id) { + ((union find_agent_args *)arg)->agent=agent; + return HSA_STATUS_INFO_BREAK; + } + } + return HSA_STATUS_SUCCESS; + }; + + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + *hdp = NULL; + CUDACHECK(hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev)); + args.id = convert_bdf(busId); + hsa_status_t err = hsa_iterate_agents(find_agent, (void*)&args); + if (err != HSA_STATUS_INFO_BREAK) { + WARN("failed to get locate HSA agent for GPU %d", cudaDev); + return ncclSystemError; + } + hsa_amd_hdp_flush_t hdpinfo; + err = hsa_agent_get_info(args.agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_HDP_FLUSH, &hdpinfo); + if ((err != HSA_STATUS_SUCCESS) && (err != HSA_STATUS_INFO_BREAK)) { + WARN("failed to get HSA_AMD_AGENT_INFO_HDP_FLUSH for GPU %d", cudaDev); + return ncclSystemError; + } + *hdp = hdpinfo.HDP_MEM_FLUSH_CNTL; + return ncclSuccess; +} + ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) { struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo; struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo; struct p2pConnectInfo info; + uint32_t linktype, hops; + if (hipExtGetLinkTypeAndHopCount(myInfo->cudaDev, peerInfo->cudaDev, &linktype, &hops) != hipSuccess) { + INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", ring->id, myInfo->rank, peerInfo->rank); + return ncclInternalError; + } + if (linktype != HSA_AMD_LINK_INFO_TYPE_XGMI) { + NCCLCHECK(getGpuHdpReg(peerInfo->cudaDev, &ring->hdp_reg)); + TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", ring->id, myInfo->rank, peerInfo->rank, ring->hdp_reg); + } if (myInfo->pidHash == peerInfo->pidHash) { info.direct = 1; info.directPtr = ring->devMemSend;