diff --git a/projects/rccl/src/collectives/device/all_gather.h b/projects/rccl/src/collectives/device/all_gather.h
index 677435071e..98f7a7a4ef 100644
--- a/projects/rccl/src/collectives/device/all_gather.h
+++ b/projects/rccl/src/collectives/device/all_gather.h
@@ -31,7 +31,7 @@ __device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
   WaitFlag waitDoneFromNext(ring->send.conn.head, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
   WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLGATHER_SUBSTEPS);
   PostFlag postDoneToPrev(ring->recv.conn.head, ALLGATHER_SUBSTEPS, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
+  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS, ring->hdp_reg);
 
   typedef Primitives<UNROLL, ALLGATHER_SUBSTEPS, T> Prims;
 
diff --git a/projects/rccl/src/collectives/device/all_reduce.h b/projects/rccl/src/collectives/device/all_reduce.h
index a323d4a70b..92b1aac182 100644
--- a/projects/rccl/src/collectives/device/all_reduce.h
+++ b/projects/rccl/src/collectives/device/all_reduce.h
@@ -31,7 +31,7 @@ __device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
   WaitFlag waitDoneFromNext(ring->send.conn.head, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
   WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLREDUCE_SUBSTEPS);
   PostFlag postDoneToPrev(ring->recv.conn.head, ALLREDUCE_SUBSTEPS, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
+  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS, ring->hdp_reg);
 
   typedef Primitives<UNROLL, ALLREDUCE_SUBSTEPS, T, FUNC> Prims;
 
diff --git a/projects/rccl/src/collectives/device/broadcast.h b/projects/rccl/src/collectives/device/broadcast.h
index 0baef3f476..00a04bcc77 100644
--- a/projects/rccl/src/collectives/device/broadcast.h
+++ b/projects/rccl/src/collectives/device/broadcast.h
@@ -30,7 +30,7 @@ __device__ void ncclBroadcastKernel(struct CollectiveArgs* args) {
   WaitFlag waitDoneFromNext(ring->send.conn.head, (BROADCAST_BUFCHUNKS-1)*BROADCAST_SUBSTEPS);
   WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
   PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS);
+  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS, ring->hdp_reg);
 
   typedef Primitives<UNROLL, BROADCAST_SUBSTEPS, T> Prims;
 
diff --git a/projects/rccl/src/collectives/device/primitives.h b/projects/rccl/src/collectives/device/primitives.h
index 1ef5ac802e..0ad5ddeef7 100644
--- a/projects/rccl/src/collectives/device/primitives.h
+++ b/projects/rccl/src/collectives/device/primitives.h
@@ -44,11 +44,14 @@ class PostFlag {
   const int shift;
   volatile int * const fifo;
   const int fifo_size;
+  uint32_t * hdp_reg;
  public:
   __device__
-  PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size) : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size) { }
+  PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size, uint32_t* hdp_reg = NULL)
+    : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size), hdp_reg(hdp_reg) { }
+  // remote writes can be reordered if we don't do s_waitcnt 0 + store to HDP between the data and flag
   __device__
-  void post(uint64_t val) { STORE(flag, (val - shift)); }
+  void post(uint64_t val) { if (hdp_reg != NULL) STORE(hdp_reg, 0x1); STORE(flag, (val - shift)); }
   __device__
   void postSize(uint64_t step, int size) { if (fifo != NULL) STORE(fifo + step%fifo_size, size); };
 };
diff --git a/projects/rccl/src/collectives/device/reduce.h b/projects/rccl/src/collectives/device/reduce.h
index 91eb83f60d..ce4e49fb68 100644
--- a/projects/rccl/src/collectives/device/reduce.h
+++ b/projects/rccl/src/collectives/device/reduce.h
@@ -27,7 +27,7 @@ __device__ void ncclReduceKernel(struct CollectiveArgs* args) {
   WaitFlag waitDoneFromNext(ring->send.conn.head, (REDUCE_BUFCHUNKS-1)*REDUCE_SUBSTEPS);
   WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
   PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS);
+  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS, ring->hdp_reg);
 
   typedef Primitives<UNROLL, REDUCE_SUBSTEPS, T, FUNC> Prims;
 
diff --git a/projects/rccl/src/collectives/device/reduce_scatter.h b/projects/rccl/src/collectives/device/reduce_scatter.h
index aad151211d..16dea5e1e7 100644
--- a/projects/rccl/src/collectives/device/reduce_scatter.h
+++ b/projects/rccl/src/collectives/device/reduce_scatter.h
@@ -28,7 +28,7 @@ __device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) {
   WaitFlag waitDoneFromNext(ring->send.conn.head, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
   WaitFlag waitReadyFromPrev(ring->recv.conn.tail, REDUCESCATTER_SUBSTEPS);
   PostFlag postDoneToPrev(ring->recv.conn.head, REDUCESCATTER_SUBSTEPS, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
+  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS, ring->hdp_reg);
 
   typedef Primitives<UNROLL, REDUCESCATTER_SUBSTEPS, T, FUNC> Prims;
 
diff --git a/projects/rccl/src/include/core.h b/projects/rccl/src/include/core.h
index 1e5950553b..b044560323 100644
--- a/projects/rccl/src/include/core.h
+++ b/projects/rccl/src/include/core.h
@@ -153,6 +153,11 @@ struct ncclRing {
       int* userRanks;
       int* devUserRanks;
 
+      // Next GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
+      // allows software to explicitly initiate a flush read to HDP memory. See more
+      // descriptions in primitives.h.
+      uint32_t* hdp_reg;
+
       // Operation list for aggregation
       struct ncclColl* collectives;
       struct ncclColl* devCollectives;
diff --git a/projects/rccl/src/transport/p2p.cu b/projects/rccl/src/transport/p2p.cu
index 065e2a7bc7..094511630e 100644
--- a/projects/rccl/src/transport/p2p.cu
+++ b/projects/rccl/src/transport/p2p.cu
@@ -118,11 +118,6 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
       link_type_name[link_type], hops);
     link_status_print_once_mask |= (1 << (myInfo->cudaDev*8 + peerInfo->cudaDev));
   }
-  if (link_type != HSA_AMD_LINK_INFO_TYPE_XGMI) {
-    // disable PCIe P2P until HDP flush is implemented.
-    p2p = 0;
-    return ncclSuccess;
-  }
   int nvlinkp2p = 0;
   if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI && hops == 1)
     nvlinkp2p = CONNECT_NVLINK;
@@ -487,10 +482,69 @@ end:
   } while (0)
 
 /* Send: Create and return connect structures for this peer to connect to me */
+static ncclResult_t getGpuHdpReg(int cudaDev, uint32_t** hdp) {
+  auto convert_bdf = [](const char *busId) {
+    char bdf[9];
+    strncpy(bdf, busId, 4);
+    strncpy(bdf+4, busId+5, 2);
+    strncpy(bdf+6, busId+8, 2);
+    bdf[8] = '\0';
+    uint16_t id = (uint16_t)strtol(bdf, NULL, 16);
+    return id;
+  };
+
+  union find_agent_args {
+    hsa_agent_t agent;
+    uint16_t id;
+  } args;
+
+  const auto& find_agent = [](hsa_agent_t agent, void* arg) {
+    uint16_t id = ((union find_agent_args *)arg)->id;
+    hsa_device_type_t type;
+    hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, (void*)&type);
+    if(type == HSA_DEVICE_TYPE_GPU) {
+      uint16_t bdf_id = 1;
+      hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &bdf_id);
+      if(bdf_id == id) {
+        ((union find_agent_args *)arg)->agent=agent;
+        return HSA_STATUS_INFO_BREAK;
+      }
+    }
+    return HSA_STATUS_SUCCESS;
+  };
+
+  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+  *hdp = NULL;
+  CUDACHECK(hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
+  args.id = convert_bdf(busId);
+  hsa_status_t err = hsa_iterate_agents(find_agent, (void*)&args);
+  if (err != HSA_STATUS_INFO_BREAK) {
+    WARN("failed to get locate HSA agent for GPU %d", cudaDev);
+    return ncclSystemError;
+  }
+  hsa_amd_hdp_flush_t hdpinfo;
+  err = hsa_agent_get_info(args.agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_HDP_FLUSH, &hdpinfo);
+  if ((err != HSA_STATUS_SUCCESS) && (err != HSA_STATUS_INFO_BREAK)) {
+    WARN("failed to get HSA_AMD_AGENT_INFO_HDP_FLUSH for GPU %d", cudaDev);
+    return ncclSystemError;
+  }
+  *hdp = hdpinfo.HDP_MEM_FLUSH_CNTL;
+  return ncclSuccess;
+}
+
 ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
   struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
   struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
   struct p2pConnectInfo info;
+  uint32_t linktype, hops;
+  if (hipExtGetLinkTypeAndHopCount(myInfo->cudaDev, peerInfo->cudaDev, &linktype, &hops) != hipSuccess) {
+    INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", ring->id, myInfo->rank, peerInfo->rank);
+    return ncclInternalError;
+  }
+  if (linktype != HSA_AMD_LINK_INFO_TYPE_XGMI) {
+    NCCLCHECK(getGpuHdpReg(peerInfo->cudaDev, &ring->hdp_reg));
+    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", ring->id, myInfo->rank, peerInfo->rank, ring->hdp_reg);
+  }
   if (myInfo->pidHash == peerInfo->pidHash) {
     info.direct = 1;
     info.directPtr = ring->devMemSend;