From 7fdbcdfdeca633c75795f0d0fe02010bb894032d Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Tue, 11 Feb 2025 11:40:14 -0800
Subject: [PATCH] Move collective trace to HBM and fix log issue (#1542)

[ROCm/rccl commit: f5b15f27a94a8c41cf138f3febea27e54e6fd016]
---
 projects/rccl/src/init.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc
index b0b78c35d1..e7de0f877b 100644
--- a/projects/rccl/src/init.cc
+++ b/projects/rccl/src/init.cc
@@ -254,7 +254,7 @@ void *ncclCommThreadMain(void *arg) {
         } else {
           if (type & ncclCollTraceP2pElemType)
             sprintf(line, "## [%012.6f] [%02d:%02d:%02d] %06x-%06x", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->p2pOpCount[0], td->p2pOpCount[1]);
-          else if (type & ncclCollTraceCollElemType)
+          else
             sprintf(line, "## [%012.6f] [%02d:%02d:%02d] %06lx", (double)(td->timeStamp)/vega_gpu_rtc_freq, comm->rank, td->bid, td->channelId, td->opCount);
           offset = strlen(line);
           if (type == ncclCollTraceCollElemType) {
@@ -405,8 +405,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
     else
       ncclCommThreadMain((void *)comm);
   }
-  NCCLCHECK(ncclCudaHostFree((void *)comm->collTrace));
-  NCCLCHECK(ncclCudaHostFree((void *)comm->collTraceTail));
+  NCCLCHECK(ncclCudaFree((void *)comm->collTrace));
+  NCCLCHECK(ncclCudaFree((void *)comm->collTraceTail));
 #endif
 
   free(comm->peerInfo);
@@ -591,8 +591,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
   comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? true : false;
 
 #ifdef ENABLE_COLLTRACE
-  NCCLCHECK(ncclCudaHostCalloc(&comm->collTraceTail, MAXCHANNELS));
-  NCCLCHECK(ncclCudaHostCalloc(&comm->collTrace, COLLTRACE_NUM_ITEMS*MAXCHANNELS));
+  NCCLCHECK(ncclCudaCalloc(&comm->collTraceTail, MAXCHANNELS));
+  NCCLCHECK(ncclCudaCalloc(&comm->collTrace, COLLTRACE_NUM_ITEMS*MAXCHANNELS));
   comm->collTraceExit = 0;
   comm->collTraceEnabled = false; // we can enable colltrace without starting a thread
   if ((ncclDebugLevel >= NCCL_LOG_INFO) && rcclParamKernelCollTraceEnable()) {