Rework barrier and event code (#761)
* Rework barrier and event code * Switch to inline asm
Этот коммит содержится в:
@@ -29,6 +29,8 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
|
||||
ncclCommPushCudaFree(comm, channel->devRingUserRanks);
|
||||
|
||||
NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->deviceStream));
|
||||
CUDACHECK(hipEventRecord(comm->deviceStream.scratchEvent, comm->deviceStream.cudaStream));
|
||||
CUDACHECK(hipStreamWaitEvent(comm->deviceStream.cudaStream, comm->deviceStream.scratchEvent, 0));
|
||||
|
||||
for (int r=0; r < nPeers; ++r) {
|
||||
for (int b=0; b < NCCL_MAX_CONNS; b++) {
|
||||
|
||||
@@ -16,12 +16,11 @@
|
||||
#define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000
|
||||
|
||||
#define barrier_by_group() do { \
|
||||
const int w = threadIdx.x/WARP_SIZE; \
|
||||
const int wid = threadIdx.x%WARP_SIZE; \
|
||||
__threadfence(); \
|
||||
if (nthreads == NCCL_MAX_NTHREADS) { \
|
||||
__syncthreads(); \
|
||||
__asm__ __volatile__("s_waitcnt vmcnt(0) lgkmcnt(0)\ns_barrier\ns_waitcnt lgkmcnt(0)"); \
|
||||
} else { \
|
||||
const int w = threadIdx.x/WARP_SIZE; \
|
||||
const int wid = threadIdx.x%WARP_SIZE; \
|
||||
if (wid == 0) { \
|
||||
barrier_next[w] += nthreads/WARP_SIZE; \
|
||||
atomicAdd((unsigned long long *)barriers, 1); \
|
||||
|
||||
@@ -237,8 +237,6 @@ ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStro
|
||||
}
|
||||
}
|
||||
#endif
|
||||
CUDACHECK(cudaEventRecord(ss->scratchEvent, ss->cudaStream));
|
||||
CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->scratchEvent, 0));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user