Rework barrier and event code (#761)

* Rework barrier and event code * Switch to inline asm
2023-05-31 13:36:51 -07:00
@@ -29,6 +29,8 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
  ncclCommPushCudaFree(comm, channel->devRingUserRanks);

  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->deviceStream));
+  CUDACHECK(hipEventRecord(comm->deviceStream.scratchEvent, comm->deviceStream.cudaStream));
+  CUDACHECK(hipStreamWaitEvent(comm->deviceStream.cudaStream, comm->deviceStream.scratchEvent, 0));

  for (int r=0; r < nPeers; ++r) {
    for (int b=0; b < NCCL_MAX_CONNS; b++) {
@@ -16,12 +16,11 @@
 #define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000

 #define barrier_by_group() do { \
-  const int w = threadIdx.x/WARP_SIZE; \
-  const int wid = threadIdx.x%WARP_SIZE; \
-  __threadfence(); \
  if (nthreads == NCCL_MAX_NTHREADS) { \
-    __syncthreads(); \
+    __asm__ __volatile__("s_waitcnt vmcnt(0) lgkmcnt(0)\ns_barrier\ns_waitcnt lgkmcnt(0)"); \
  } else { \
+    const int w = threadIdx.x/WARP_SIZE; \
+    const int wid = threadIdx.x%WARP_SIZE; \
    if (wid == 0) { \
      barrier_next[w] += nthreads/WARP_SIZE; \
      atomicAdd((unsigned long long *)barriers, 1); \
@@ -237,8 +237,6 @@ ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStro
      }
    }
  #endif
-  CUDACHECK(cudaEventRecord(ss->scratchEvent, ss->cudaStream));
-  CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->scratchEvent, 0));
  return ncclSuccess;
 }