Use different atomics to check flags in kernel (#568)

Αυτή η υποβολή περιλαμβάνεται σε:
Wenkai Du
2022-06-23 09:16:41 -07:00
υποβλήθηκε από GitHub
γονέας 06f05300fe
υποβολή c3bb9e70d0
2 αρχεία άλλαξαν με 4 προσθήκες και 2 διαγραφές
@@ -99,9 +99,11 @@ private:
if (sendConnHeadPtr) {
int spins = 0;
while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) {
sendConnHeadCache = LOAD(sendConnHeadPtr);
__builtin_amdgcn_s_sleep(8);
sendConnHeadCache = atomicAdd_system((unsigned long long *)sendConnHeadPtr, 0);
if (checkAbort(spins, 1)) break;
}
__asm__ __volatile__("s_wakeup");
if (sendConnFifoPtr) {
int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? stepLines*sizeof(union ncclLLFifoLine) : nbytes;
STORE(sendConnFifoPtr+sendConnHead%NCCL_STEPS, size);
@@ -107,7 +107,7 @@ private:
int spins = 0;
while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
__builtin_amdgcn_s_sleep(8);
connStepCache = LOAD(connStepPtr);
connStepCache = atomicAdd_system((unsigned long long *)connStepPtr, 0);
if (checkAbort(spins)) break;
//if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem->comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
if (spins == 0) traceData(__LINE__, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));