Implement HDP flush when transfer data over PCIe P2P (#75)

* Implement HDP flush when transfer data over PCIe P2P * Add some descriptions to HDP flushing * Fix for review comments [ROCm/rccl commit: b7a6307371]
2019-06-03 16:29:55 -07:00
@@ -31,7 +31,7 @@ __device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
  WaitFlag waitDoneFromNext(ring->send.conn.head, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLGATHER_SUBSTEPS);
  PostFlag postDoneToPrev(ring->recv.conn.head, ALLGATHER_SUBSTEPS, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
+  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS, ring->hdp_reg);

  typedef Primitives<UNROLL, ALLGATHER_SUBSTEPS, T> Prims;

@@ -31,7 +31,7 @@ __device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
  WaitFlag waitDoneFromNext(ring->send.conn.head, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLREDUCE_SUBSTEPS);
  PostFlag postDoneToPrev(ring->recv.conn.head, ALLREDUCE_SUBSTEPS, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
+  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS, ring->hdp_reg);

  typedef Primitives<UNROLL, ALLREDUCE_SUBSTEPS, T, FUNC> Prims;

@@ -30,7 +30,7 @@ __device__ void ncclBroadcastKernel(struct CollectiveArgs* args) {
  WaitFlag waitDoneFromNext(ring->send.conn.head, (BROADCAST_BUFCHUNKS-1)*BROADCAST_SUBSTEPS);
  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
  PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS);
+  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS, ring->hdp_reg);

  typedef Primitives<UNROLL, BROADCAST_SUBSTEPS, T> Prims;

@@ -44,11 +44,14 @@ class PostFlag {
  const int shift;
  volatile int * const fifo;
  const int fifo_size;
+  uint32_t * hdp_reg;
 public:
  __device__
-  PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size) : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size) { }
+  PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size, uint32_t* hdp_reg = NULL)
+    : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size), hdp_reg(hdp_reg) { }
+  // remote writes can be reordered if we don't do s_waitcnt 0 + store to HDP between the data and flag
  __device__
-  void post(uint64_t val) { STORE(flag, (val - shift)); }
+  void post(uint64_t val) { if (hdp_reg != NULL) STORE(hdp_reg, 0x1); STORE(flag, (val - shift)); }
  __device__
  void postSize(uint64_t step, int size) { if (fifo != NULL) STORE(fifo + step%fifo_size, size); };
 };
@@ -27,7 +27,7 @@ __device__ void ncclReduceKernel(struct CollectiveArgs* args) {
  WaitFlag waitDoneFromNext(ring->send.conn.head, (REDUCE_BUFCHUNKS-1)*REDUCE_SUBSTEPS);
  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
  PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS);
+  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS, ring->hdp_reg);

  typedef Primitives<UNROLL, REDUCE_SUBSTEPS, T, FUNC> Prims;

@@ -28,7 +28,7 @@ __device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) {
  WaitFlag waitDoneFromNext(ring->send.conn.head, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, REDUCESCATTER_SUBSTEPS);
  PostFlag postDoneToPrev(ring->recv.conn.head, REDUCESCATTER_SUBSTEPS, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
+  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS, ring->hdp_reg);

  typedef Primitives<UNROLL, REDUCESCATTER_SUBSTEPS, T, FUNC> Prims;

@@ -153,6 +153,11 @@ struct ncclRing {
      int* userRanks;
      int* devUserRanks;

+      // Next GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
+      // allows software to explicitly initiate a flush read to HDP memory. See more
+      // descriptions in primitives.h.
+      uint32_t* hdp_reg;
+
      // Operation list for aggregation
      struct ncclColl* collectives;
      struct ncclColl* devCollectives;
@@ -118,11 +118,6 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
      link_type_name[link_type], hops);
    link_status_print_once_mask |= (1 << (myInfo->cudaDev*8 + peerInfo->cudaDev));
  }
-  if (link_type != HSA_AMD_LINK_INFO_TYPE_XGMI) {
-    // disable PCIe P2P until HDP flush is implemented.
-    p2p = 0;
-    return ncclSuccess;
-  }
  int nvlinkp2p = 0;
  if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI && hops == 1)
    nvlinkp2p = CONNECT_NVLINK;
@@ -487,10 +482,69 @@ end:
  } while (0)

 /* Send: Create and return connect structures for this peer to connect to me */
+static ncclResult_t getGpuHdpReg(int cudaDev, uint32_t** hdp) {
+  auto convert_bdf = [](const char *busId) {
+    char bdf[9];
+    strncpy(bdf, busId, 4);
+    strncpy(bdf+4, busId+5, 2);
+    strncpy(bdf+6, busId+8, 2);
+    bdf[8] = '\0';
+    uint16_t id = (uint16_t)strtol(bdf, NULL, 16);
+    return id;
+  };
+
+  union find_agent_args {
+    hsa_agent_t agent;
+    uint16_t id;
+  } args;
+
+  const auto& find_agent = [](hsa_agent_t agent, void* arg) {
+    uint16_t id = ((union find_agent_args *)arg)->id;
+    hsa_device_type_t type;
+    hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, (void*)&type);
+    if(type == HSA_DEVICE_TYPE_GPU) {
+      uint16_t bdf_id = 1;
+      hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_BDFID, &bdf_id);
+      if(bdf_id == id) {
+        ((union find_agent_args *)arg)->agent=agent;
+        return HSA_STATUS_INFO_BREAK;
+      }
+    }
+    return HSA_STATUS_SUCCESS;
+  };
+
+  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+  *hdp = NULL;
+  CUDACHECK(hipDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
+  args.id = convert_bdf(busId);
+  hsa_status_t err = hsa_iterate_agents(find_agent, (void*)&args);
+  if (err != HSA_STATUS_INFO_BREAK) {
+    WARN("failed to get locate HSA agent for GPU %d", cudaDev);
+    return ncclSystemError;
+  }
+  hsa_amd_hdp_flush_t hdpinfo;
+  err = hsa_agent_get_info(args.agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_HDP_FLUSH, &hdpinfo);
+  if ((err != HSA_STATUS_SUCCESS) && (err != HSA_STATUS_INFO_BREAK)) {
+    WARN("failed to get HSA_AMD_AGENT_INFO_HDP_FLUSH for GPU %d", cudaDev);
+    return ncclSystemError;
+  }
+  *hdp = hdpinfo.HDP_MEM_FLUSH_CNTL;
+  return ncclSuccess;
+}
+
 ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
  struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
  struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
  struct p2pConnectInfo info;
+  uint32_t linktype, hops;
+  if (hipExtGetLinkTypeAndHopCount(myInfo->cudaDev, peerInfo->cudaDev, &linktype, &hops) != hipSuccess) {
+    INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d failed to get link type and hop count", ring->id, myInfo->rank, peerInfo->rank);
+    return ncclInternalError;
+  }
+  if (linktype != HSA_AMD_LINK_INFO_TYPE_XGMI) {
+    NCCLCHECK(getGpuHdpReg(peerInfo->cudaDev, &ring->hdp_reg));
+    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d HDP %p", ring->id, myInfo->rank, peerInfo->rank, ring->hdp_reg);
+  }
  if (myInfo->pidHash == peerInfo->pidHash) {
    info.direct = 1;
    info.directPtr = ring->devMemSend;