Move collective trace to HBM and fix log issue (#1542)

[ROCm/rccl commit: f5b15f27a9]
Этот коммит содержится в:
Wenkai Du
2025-02-11 11:40:14 -08:00
коммит произвёл GitHub
родитель 689725fb9e
Коммит 7fdbcdfdec
+5 -5
Просмотреть файл
@@ -254,7 +254,7 @@ void *ncclCommThreadMain(void *arg) {
} else {
if (type & ncclCollTraceP2pElemType)
sprintf(line, "## [%012.6f] [%02d:%02d:%02d] %06x-%06x", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->p2pOpCount[0], td->p2pOpCount[1]);
-else if (type & ncclCollTraceCollElemType)
+else
sprintf(line, "## [%012.6f] [%02d:%02d:%02d] %06lx", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->opCount);
offset = strlen(line);
if (type == ncclCollTraceCollElemType) {
@@ -405,8 +405,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
else
ncclCommThreadMain((void *)comm);
}
-NCCLCHECK(ncclCudaHostFree((void *)comm->collTrace));
-NCCLCHECK(ncclCudaHostFree((void *)comm->collTraceTail));
+NCCLCHECK(ncclCudaFree((void *)comm->collTrace));
+NCCLCHECK(ncclCudaFree((void *)comm->collTraceTail));
#endif
free(comm->peerInfo);
@@ -591,8 +591,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? true : false;
#ifdef ENABLE_COLLTRACE
-NCCLCHECK(ncclCudaHostCalloc(&comm->collTraceTail, MAXCHANNELS));
-NCCLCHECK(ncclCudaHostCalloc(&comm->collTrace, COLLTRACE_NUM_ITEMS*MAXCHANNELS));
+NCCLCHECK(ncclCudaCalloc(&comm->collTraceTail, MAXCHANNELS));
+NCCLCHECK(ncclCudaCalloc(&comm->collTrace, COLLTRACE_NUM_ITEMS*MAXCHANNELS));
comm->collTraceExit = 0;
comm->collTraceEnabled = false; // we can enable colltrace without starting a thread
if ((ncclDebugLevel >= NCCL_LOG_INFO) && rcclParamKernelCollTraceEnable()) {