diff --git a/src/init.cc b/src/init.cc index e1403f883d..1ca100bec2 100644 --- a/src/init.cc +++ b/src/init.cc @@ -189,6 +189,7 @@ void *ncclCommThreadMain(void *arg) { memset(head, 0, sizeof(int)*MAXCHANNELS); vega_gpu_rtc_freq = GetDeviceWallClockRateInKhz(comm->cudaDev) * 1.0E3; do { + int numActiveChans = MAXCHANNELS; for (int channel = 0; channel < MAXCHANNELS; channel++) { int tail = comm->collTraceTail[channel].tail%COLLTRACE_NUM_ITEMS; int count; @@ -196,8 +197,8 @@ void *ncclCommThreadMain(void *arg) { count = tail - head[channel]; else count = COLLTRACE_NUM_ITEMS + head[channel] - tail; - if (!count) { - usleep(1000); //sleep 1ms + if (count == 0) { + numActiveChans--; continue; } for (int i = 0; i < count; i++) { @@ -258,7 +259,10 @@ void *ncclCommThreadMain(void *arg) { head[channel] %= COLLTRACE_NUM_ITEMS; } } - } while(!comm->collTraceExit); + if (comm->collTraceExit && numActiveChans == 0) + break; + usleep(1000); //sleep 1ms + } while(true); pthread_exit(NULL); } #endif