Move collective trace to HBM and fix log issue (#1542)
[ROCm/rccl commit: f5b15f27a9]
Этот коммит содержится в:
@@ -254,7 +254,7 @@ void *ncclCommThreadMain(void *arg) {
|
||||
} else {
|
||||
if (type & ncclCollTraceP2pElemType)
|
||||
sprintf(line, "## [%012.6f] [%02d:%02d:%02d] %06x-%06x", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->p2pOpCount[0], td->p2pOpCount[1]);
|
||||
else if (type & ncclCollTraceCollElemType)
|
||||
else
|
||||
sprintf(line, "## [%012.6f] [%02d:%02d:%02d] %06lx", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->opCount);
|
||||
offset = strlen(line);
|
||||
if (type == ncclCollTraceCollElemType) {
|
||||
@@ -405,8 +405,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
else
|
||||
ncclCommThreadMain((void *)comm);
|
||||
}
|
||||
NCCLCHECK(ncclCudaHostFree((void *)comm->collTrace));
|
||||
NCCLCHECK(ncclCudaHostFree((void *)comm->collTraceTail));
|
||||
NCCLCHECK(ncclCudaFree((void *)comm->collTrace));
|
||||
NCCLCHECK(ncclCudaFree((void *)comm->collTraceTail));
|
||||
#endif
|
||||
|
||||
free(comm->peerInfo);
|
||||
@@ -591,8 +591,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
|
||||
comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? true : false;
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
NCCLCHECK(ncclCudaHostCalloc(&comm->collTraceTail, MAXCHANNELS));
|
||||
NCCLCHECK(ncclCudaHostCalloc(&comm->collTrace, COLLTRACE_NUM_ITEMS*MAXCHANNELS));
|
||||
NCCLCHECK(ncclCudaCalloc(&comm->collTraceTail, MAXCHANNELS));
|
||||
NCCLCHECK(ncclCudaCalloc(&comm->collTrace, COLLTRACE_NUM_ITEMS*MAXCHANNELS));
|
||||
comm->collTraceExit = 0;
|
||||
comm->collTraceEnabled = false; // we can enable colltrace without starting a thread
|
||||
if ((ncclDebugLevel >= NCCL_LOG_INFO) && rcclParamKernelCollTraceEnable()) {
|
||||
|
||||
Ссылка в новой задаче
Block a user