colltrace: fix dropped trace messages (#1059)
* colltrace: fix dropped trace messages
* Remove extra space
Этот коммит содержится в:
+7
-3
@@ -189,6 +189,7 @@ void *ncclCommThreadMain(void *arg) {
|
||||
memset(head, 0, sizeof(int)*MAXCHANNELS);
|
||||
vega_gpu_rtc_freq = GetDeviceWallClockRateInKhz(comm->cudaDev) * 1.0E3;
|
||||
do {
|
||||
+int numActiveChans = MAXCHANNELS;
|
||||
for (int channel = 0; channel < MAXCHANNELS; channel++) {
|
||||
int tail = comm->collTraceTail[channel].tail%COLLTRACE_NUM_ITEMS;
|
||||
int count;
|
||||
@@ -196,8 +197,8 @@ void *ncclCommThreadMain(void *arg) {
|
||||
count = tail - head[channel];
|
||||
else
|
||||
count = COLLTRACE_NUM_ITEMS + head[channel] - tail;
|
||||
-if (!count) {
|
||||
-usleep(1000); //sleep 1ms
|
||||
+if (count == 0) {
|
||||
+numActiveChans--;
|
||||
continue;
|
||||
}
|
||||
for (int i = 0; i < count; i++) {
|
||||
@@ -258,7 +259,10 @@ void *ncclCommThreadMain(void *arg) {
|
||||
head[channel] %= COLLTRACE_NUM_ITEMS;
|
||||
}
|
||||
}
|
||||
-} while(!comm->collTraceExit);
|
||||
+if (comm->collTraceExit && numActiveChans == 0)
|
||||
+break;
|
||||
+usleep(1000); //sleep 1ms
|
||||
+} while(true);
|
||||
pthread_exit(NULL);
|
||||
}
|
||||
#endif
|
||||
|
||||
Ссылка в новой задаче
Block a user