diff --git a/projects/rccl/LICENSE.txt b/projects/rccl/LICENSE.txt
index 2a454ff322..1eddd43e2a 100644
--- a/projects/rccl/LICENSE.txt
+++ b/projects/rccl/LICENSE.txt
@@ -1,5 +1,5 @@
 
- Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 
  Redistribution and use in source and binary forms, with or without
@@ -30,3 +30,11 @@
  The U.S. Department of Energy funded the development of this software
  under subcontract 7078610 with Lawrence Berkeley National Laboratory.
 
+
+This code also includes files from the NVIDIA Tools Extension SDK project.
+
+See:
+
+   https://github.com/NVIDIA/NVTX
+
+for more information and license details.
diff --git a/projects/rccl/README.md b/projects/rccl/README.md
index 2e8dcaee3f..433387fe85 100644
--- a/projects/rccl/README.md
+++ b/projects/rccl/README.md
@@ -86,6 +86,6 @@ Please refer to the [Library documentation](http://rccl.readthedocs.io/) for cur
 
 ## Copyright
 
-All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+All source code and accompanying documentation is copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 
-All modifications are copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
\ No newline at end of file
+All modifications are copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
diff --git a/projects/rccl/makefiles/version.mk b/projects/rccl/makefiles/version.mk
index f2539c595b..f64e8ad49a 100644
--- a/projects/rccl/makefiles/version.mk
+++ b/projects/rccl/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
 NCCL_MINOR   := 8
-NCCL_PATCH   := 3
+NCCL_PATCH   := 4
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
diff --git a/projects/rccl/src/bootstrap.cc b/projects/rccl/src/bootstrap.cc
index 9bf8d6840d..b4db1368fd 100644
--- a/projects/rccl/src/bootstrap.cc
+++ b/projects/rccl/src/bootstrap.cc
@@ -78,7 +78,7 @@ static ncclResult_t bootstrapNetRecv(int fd, void* data, int size) {
   int recvSize;
   NCCLCHECK(socketRecv(fd, &recvSize, sizeof(int)));
   if (recvSize > size) {
-    WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size);
+    WARN("Message truncated : received %d bytes instead of %d", recvSize, size);
     return ncclInternalError;
   }
   NCCLCHECK(socketRecv(fd, data, std::min(recvSize, size)));
@@ -274,7 +274,7 @@ static ncclResult_t remoteAlloc(void** ptr, int fd) {
 void* ncclRemoteMemAllocationService(void* args) {
   struct remAllocState* state = (struct remAllocState *) args;
   if (hipSetDevice(state->cudaDev) != hipSuccess) {
-    WARN("[Rem Allocator] Failed to set CUDA device %d\n", state->cudaDev);
+    WARN("[Rem Allocator] Failed to set CUDA device %d", state->cudaDev);
   }
 
   // Prepare poll descriptor
@@ -523,7 +523,7 @@ ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
 ncclResult_t bootstrapClose(void* commState) {
   struct extState* state = (struct extState*)commState;
   if (state->unexpectedConnections != NULL) {
-    WARN("Unexpected connections are not empty.\n");
+    WARN("Unexpected connections are not empty");
     return ncclInternalError;
   }
   close(state->extListenFd);
diff --git a/projects/rccl/src/enqueue.cc b/projects/rccl/src/enqueue.cc
index f5e769a5a2..43848017f6 100644
--- a/projects/rccl/src/enqueue.cc
+++ b/projects/rccl/src/enqueue.cc
@@ -376,7 +376,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
       info->nstepsPerLoop = 1;
       info->nchunksPerLoop = info->comm->nRanks; break;
     default:
-      WARN("Unknown pattern %d\n", info->pattern);
+      WARN("Unknown pattern %d", info->pattern);
       return ncclInternalError;
   }
   return ncclSuccess;
diff --git a/projects/rccl/src/graph/connect.cc b/projects/rccl/src/graph/connect.cc
index be422bd32d..737f3218e8 100644
--- a/projects/rccl/src/graph/connect.cc
+++ b/projects/rccl/src/graph/connect.cc
@@ -117,7 +117,7 @@ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
   int x = 0;
   while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++;
   if (x == NCCL_MAX_TREE_ARITY) {
-    WARN("Internal error : tree already has %d children (%d %d %d)\n", x, tree->down[0], tree->down[1], tree->down[2]);
+    WARN("Internal error : tree already has %d children (%d %d %d)", x, tree->down[0], tree->down[1], tree->down[2]);
     return ncclInternalError;
   }
   tree->down[x] = indexes[d];
@@ -215,7 +215,7 @@ int ncclMinNchannels() {
   if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings();
   if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels();
   if (minNchannels > MAXCHANNELS) {
-    WARN("User asked for a minimum of %d channels, limiting to %d\n", minNchannels, MAXCHANNELS);
+    WARN("User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS);
     minNchannels = MAXCHANNELS;
   }
   if (minNchannels < 0) minNchannels = 0;
@@ -227,7 +227,7 @@ int ncclMaxNchannels() {
   if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels();
   if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS;
   if (maxNchannels < 1) {
-    WARN("User asked for a maximum of %d channels, setting it to 1\n", maxNchannels);
+    WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels);
     maxNchannels = 1;
   }
   return maxNchannels;
diff --git a/projects/rccl/src/graph/paths.cc b/projects/rccl/src/graph/paths.cc
index c9924229be..13a6434dd4 100644
--- a/projects/rccl/src/graph/paths.cc
+++ b/projects/rccl/src/graph/paths.cc
@@ -26,7 +26,7 @@ static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode*
       return ncclSuccess;
     }
   }
-  WARN("Could not find node of type %d id %lx\n", t, id);
+  WARN("Could not find node of type %d id %lx", t, id);
   return ncclInternalError;
 }
 
@@ -282,6 +282,9 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
     if (model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB;
     else p2pLevel = PATH_SYS;
   }
+  if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
+    p2pLevel = PATH_PXB;
+  }
 
 compare:
   // Compute the PCI distance and compare with the p2pLevel.
diff --git a/projects/rccl/src/graph/search.cc b/projects/rccl/src/graph/search.cc
index 1d732a92ff..097dcaaeb4 100644
--- a/projects/rccl/src/graph/search.cc
+++ b/projects/rccl/src/graph/search.cc
@@ -58,7 +58,7 @@ static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode*
       return ncclSuccess;
     }
   }
-  WARN("Could not find rev link for %d/%d -> %d/%d\n", node1->type, node1->id, node2->type, node2->id);
+  WARN("Could not find rev link for %d/%ld -> %d/%ld", node1->type, node1->id, node2->type, node2->id);
   return ncclInternalError;
 }
 
@@ -191,7 +191,7 @@ static ncclResult_t getGpuIndex(struct ncclTopoSystem* system, int rank, int* in
       return ncclSuccess;
     }
   }
-  WARN("Could not find gpu rank %d\n", rank);
+  WARN("Could not find gpu rank %d", rank);
   return ncclInternalError;
 }
 
@@ -202,7 +202,7 @@ static ncclResult_t getNetIndex(struct ncclTopoSystem* system, int64_t id, int*
       return ncclSuccess;
     }
   }
-  WARN("Could not find net id %lx\n", id);
+  WARN("Could not find net id %lx", id);
   return ncclInternalError;
 }
 
@@ -1253,7 +1253,7 @@ done:
   }
 
   if (graph->nChannels == 0 && graph->collNet == 0) {
-    WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern);
+    WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern);
     for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
     graph->inter[0] = graph->inter[1] = 0;
     graph->speedIntra = graph->speedInter = 0.1;
diff --git a/projects/rccl/src/graph/topo.cc b/projects/rccl/src/graph/topo.cc
index b50b89c04b..61a38c1c84 100644
--- a/projects/rccl/src/graph/topo.cc
+++ b/projects/rccl/src/graph/topo.cc
@@ -82,6 +82,9 @@ static ncclResult_t ncclTopoGetInterCpuWidth(struct ncclTopoNode* cpu, float* wi
   if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
     *width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_WIDTH : QPI_WIDTH;
   }
+  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
+    *width = cpu->cpu.model ==  NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_WIDTH : ZPI_WIDTH;
+  }
   return ncclSuccess;
 }
 
@@ -104,7 +107,7 @@ ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode*
 
 ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
   if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
-    WARN("Error : tried to create too many nodes of type %d\n", type);
+    WARN("Error : tried to create too many nodes of type %d", type);
     return ncclInternalError;
   }
   struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
@@ -379,7 +382,7 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
 }
 
 struct kvDict kvDictCpuArch[] = { { "x86_64", NCCL_TOPO_CPU_ARCH_X86 }, { "arm64", NCCL_TOPO_CPU_ARCH_ARM }, { "ppc64", NCCL_TOPO_CPU_ARCH_POWER }, { NULL, 0 } };
-struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { NULL, 0 } };
+struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { "CentaurHauls", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { "  Shanghai  ", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { NULL, 0 } };
 
 ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) {
   int numaId;
@@ -402,6 +405,11 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s
       NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
       NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
       cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW;
+    } else if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
+      int familyId, modelId;
+      NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
+      NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
+      if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_TYPE_YONGFENG;
     }
     if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) {
       int familyId, modelId;
@@ -486,7 +494,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
     NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
     NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
     if (gpu == NULL) {
-      WARN("Add NVLink error : could not find GPU %lx\n", pBusId);
+      WARN("Add NVLink error : could not find GPU %lx", pBusId);
       return ncclInternalError;
     }
     int count;
diff --git a/projects/rccl/src/graph/topo.h b/projects/rccl/src/graph/topo.h
index 9379844bf2..379b5a3351 100644
--- a/projects/rccl/src/graph/topo.h
+++ b/projects/rccl/src/graph/topo.h
@@ -20,6 +20,8 @@
 #define PCI_WIDTH 12.0           // PCI Gen3 x16
 #define QPI_WIDTH 6.0
 #define SKL_QPI_WIDTH 9.0
+#define ZPI_WIDTH 6.0
+#define YONGFENG_ZPI_WIDTH 9.0
 #define P9_WIDTH 32.0
 #define ARM_WIDTH 6.0
 #define NET_WIDTH 12.0           // 100Gbit
diff --git a/projects/rccl/src/graph/xml.cc b/projects/rccl/src/graph/xml.cc
index a4da61d60e..1e61332fda 100644
--- a/projects/rccl/src/graph/xml.cc
+++ b/projects/rccl/src/graph/xml.cc
@@ -71,7 +71,7 @@ ncclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) {
     if (c == '=') {
       ptr[o] = '\0';
       if (value == NULL) {
-        WARN("XML Parse : Unexpected value with name %s\n", ptr);
+        WARN("XML Parse : Unexpected value with name %s", ptr);
         return ncclInternalError;
       }
       return xmlGetValue(file, value, last);
@@ -137,7 +137,7 @@ ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
     // Re-read the name, we got '/' in the first call
     NCCLCHECK(xmlGetToken(file, node->name, NULL, &c));
     if (c != '>') {
-      WARN("XML Parse error : unexpected trailing %c in closing tag %s\n", c, node->name);
+      WARN("XML Parse error : unexpected trailing %c in closing tag %s", c, node->name);
       return ncclInternalError;
     }
     return ncclSuccess;
@@ -150,7 +150,7 @@ ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
   while (c == ' ') {
     NCCLCHECK(xmlGetToken(file, node->attrs[a].key, node->attrs[a].value, &c));
     if (a == MAX_ATTR_COUNT) {
-      INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)\n", MAX_ATTR_COUNT);
+      INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)", MAX_ATTR_COUNT);
       // Actually we need to still consume the extra attributes so we have an extra one.
     } else a++;
   }
@@ -178,7 +178,7 @@ ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* hea
   if (head && head->type == NODE_TYPE_SINGLE) return ncclSuccess;
   while (1) {
     if (xml->maxIndex == MAX_NODES) {
-      WARN("Error : XML parser is limited to 1024 nodes\n");
+      WARN("Error : XML parser is limited to 1024 nodes");
       return ncclInternalError;
     }
     struct ncclXmlNode* node = xml->nodes+xml->maxIndex;
@@ -373,7 +373,7 @@ ncclResult_t ncclTopoSetAttrFromSys(struct ncclXmlNode* pciNode, const char* pat
   char strValue[MAX_STR_LEN];
   NCCLCHECK(ncclTopoGetStrFromSys(path, fileName, strValue));
   if (strValue[0] != '\0') { NCCLCHECK(xmlSetAttr(pciNode, attrName, strValue)); }
-  TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s\n", path, fileName, attrName, strValue);
+  TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s", path, fileName, attrName, strValue);
   return ncclSuccess;
 }
 
@@ -661,7 +661,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
     int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : 12;
 
     if (maxNvLinks > 0 && nvmlDev == NULL) {
-      WARN("No NVML device handle. Skipping nvlink detection.\n");
+      WARN("No NVML device handle. Skipping nvlink detection.");
       maxNvLinks = 0;
     }
 
diff --git a/projects/rccl/src/group.cc b/projects/rccl/src/group.cc
index df9cd85ddd..3ab95c0533 100644
--- a/projects/rccl/src/group.cc
+++ b/projects/rccl/src/group.cc
@@ -202,7 +202,7 @@ ncclResult_t ncclGroupEnd() {
     if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
       int err = pthread_join(ncclGroupThreads[i], NULL);
       if (err != 0) {
-        WARN("Error waiting for pthread_join : %s\n", strerror(errno));
+        WARN("Error waiting for pthread_join : %s", strerror(err)); // pthread_join returns the error number; it does not set errno
         return ncclSystemError;
       }
       NCCLCHECKGOTO(args->ret, ret, end);
@@ -234,7 +234,7 @@ ncclResult_t ncclGroupEnd() {
         // schedule delta 0, +1, -1, +2, -2, ...
         // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
         for (int d=0; d<=nRanks/4; d++) {
-          int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, nRanks-(nRanks/2-d) };
+          int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, (nRanks-(nRanks/2-d))%nRanks };
           int index = 0;
           int delta = deltas[index];
 sched_delta:
@@ -259,6 +259,10 @@ sched_delta:
               ssize_t sendbytes = totSendBytes-sendOffset;
               if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
               if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
+              // 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested
+              // (total size == 0), otherwise set size to -1 so that the kernel skips the operation.
+              if (sendbytes == 0 && totSendBytes != 0) sendbytes = -1;
+              if (recvbytes == 0 && totRecvBytes != 0) recvbytes = -1;
               if (sendbytes >= 0 || recvbytes >= 0) {
                 NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
                       recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL,
diff --git a/projects/rccl/src/include/debug.h b/projects/rccl/src/include/debug.h
index d88458c24a..e7a152cc97 100644
--- a/projects/rccl/src/include/debug.h
+++ b/projects/rccl/src/include/debug.h
@@ -25,7 +25,7 @@ extern pthread_mutex_t ncclDebugOutputLock;
 extern FILE *ncclDebugFile;
 extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
 
-void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
+void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6)));
 
 // Let code temporarily downgrade WARN into INFO
 extern thread_local int ncclDebugNoWarn;
diff --git a/projects/rccl/src/include/graph.h b/projects/rccl/src/include/graph.h
index 6ef9f4c196..ac7fcbf992 100644
--- a/projects/rccl/src/include/graph.h
+++ b/projects/rccl/src/include/graph.h
@@ -40,10 +40,12 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank);
 #define NCCL_TOPO_CPU_ARCH_ARM 3
 #define NCCL_TOPO_CPU_VENDOR_INTEL 1
 #define NCCL_TOPO_CPU_VENDOR_AMD 2
+#define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3
 #define NCCL_TOPO_CPU_TYPE_BDW 1
 #define NCCL_TOPO_CPU_TYPE_SKL 2
 #define NCCL_TOPO_CPU_TYPE_ZEN 3
 #define NCCL_TOPO_CPU_TYPE_ROME 4
+#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
 ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
 ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
 
diff --git a/projects/rccl/src/include/shm.h b/projects/rccl/src/include/shm.h
index 85acdfe3dc..6a3e556c1e 100644
--- a/projects/rccl/src/include/shm.h
+++ b/projects/rccl/src/include/shm.h
@@ -46,7 +46,7 @@ static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPt
   *shmPtr = ptr;
   return ncclSuccess;
 sysError:
-  WARN("Error while %s shared memory segment %s (size %d)\n", create ? "creating" : "attaching to", shmname, shmsize);
+  WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmname, shmsize);
 hipError:
   if (fd != -1) close(fd);
   if (create) shm_unlink(shmname);
diff --git a/projects/rccl/src/include/socket.h b/projects/rccl/src/include/socket.h
index e386598cdf..aaf137713e 100644
--- a/projects/rccl/src/include/socket.h
+++ b/projects/rccl/src/include/socket.h
@@ -372,7 +372,7 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
   /* IPv4/IPv6 support */
   int family = remoteAddr->sa.sa_family;
   if (family != AF_INET && family != AF_INET6) {
-    WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)\n", family, AF_INET, AF_INET6);
+    WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)", family, AF_INET, AF_INET6);
     return ncclInternalError;
   }
   int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc
index b534264dee..76237e7b22 100644
--- a/projects/rccl/src/init.cc
+++ b/projects/rccl/src/init.cc
@@ -286,7 +286,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
   #define VEGA_GPU_RTC_FREQUENCY 2.5E7
   if (comm->rank == 0) {
     INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "Rank", "total", "  wait", "w_recv", "send", "rcRdS", "dRcRdCS", "dRcCS", "dRc", "cS", "rc", "rcCS");
-    INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "(s)", "(s)", "(s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)");
+    INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "(s)", "(s)", "(s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)");
   }
   INFO(NCCL_INIT, "# %4d %6.4f %6.4f %6.4f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f",
     comm->rank, (double)prof->total_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels,
@@ -385,7 +385,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
   comm->nRanks = comm->hostDevComm.nRanks = ndev;
   hipGetDevice(&comm->cudaDev);
   NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
-  TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x", comm, rank, ndev, comm->cudaDev, comm->busId);
+  TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx", comm, rank, ndev, comm->cudaDev, comm->busId);
 
   comm->doneEvent = doneEvent;
   comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
@@ -766,7 +766,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   for (int i = 0; i < nranks; i++) {
     memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
     if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
-      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %x", rank, i, myInfo->busId);
+      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, myInfo->busId);
       return ncclInvalidUsage;
     }
   }
@@ -1163,7 +1163,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
   NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
   NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
 
-  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
+  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
 
   return ncclSuccess;
 cleanup:
@@ -1234,6 +1234,9 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
 
 static ncclResult_t commDestroy(ncclComm_t comm) {
   int savedDevice;
+#ifdef ENABLE_TRACE
+  int rank = comm->rank;
+#endif
   CUDACHECK(hipGetDevice(&savedDevice));
   int commDevice = comm->cudaDev;
 
@@ -1250,7 +1253,7 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
   if (savedDevice != commDevice)
     CUDACHECK(hipSetDevice(savedDevice));
 
-  TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, comm->rank);
+  TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
 
   return ncclSuccess;
 }
@@ -1261,7 +1264,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
   if (comm == NULL)
     return ncclSuccess;
 
-  TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %x", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId);
+  TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId);
 
   // Try and prevent a double free of the comm struct (user error)
   if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) {
diff --git a/projects/rccl/src/misc/utils.cc b/projects/rccl/src/misc/utils.cc
index a02b558b49..29b618196c 100644
--- a/projects/rccl/src/misc/utils.cc
+++ b/projects/rccl/src/misc/utils.cc
@@ -69,10 +69,10 @@ ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
 }
 
 uint64_t getHash(const char* string, int n) {
-  // Based on DJB2, result = result * 33 + char
+  // Based on DJB2a, result = result * 33 ^ char
   uint64_t result = 5381;
   for (int c = 0; c < n; c++) {
-    result = ((result << 5) + result) + string[c];
+    result = ((result << 5) + result) ^ string[c];
   }
   return result;
 }
diff --git a/projects/rccl/src/proxy.cc b/projects/rccl/src/proxy.cc
index 2ba7aabfb4..29cf1fda39 100644
--- a/projects/rccl/src/proxy.cc
+++ b/projects/rccl/src/proxy.cc
@@ -72,7 +72,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
   struct ncclProxyArgs* op = state->ops;
   while (op) {
     if (op->idle & OP_SEEN) {
-      WARN("Active list loop at element %ld\n", OP_INDEX(op));
+      WARN("Active list loop at element %ld", OP_INDEX(op));
     }
     op->idle |= OP_SEEN;
     printf("[%ld]", OP_INDEX(op));
@@ -98,7 +98,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
   struct ncclProxyArgs* free = state->pool;
   while (free) {
     if (free->idle & OP_SEEN) {
-      WARN("Free list loop at element %ld\n", OP_INDEX(free));
+      WARN("Free list loop at element %ld", OP_INDEX(free));
     }
     free->idle |= OP_SEEN;
     free = free->next;
@@ -109,7 +109,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
   while (p) {
     for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++) {
       if ((p->elems[e].idle & OP_SEEN) == 0) {
-        WARN("Element %d of pool %d has been lost\n", e, i);
+        WARN("Element %d of pool %d has been lost", e, i);
         struct ncclProxyArgs* free = state->pool;
         printf("Free list ");
         while (free) {
@@ -164,7 +164,7 @@ static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args) {
   struct ncclPeer* peerComm = args->channel->peers+peer;
   struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send;
   if (connector->transportComm == NULL) {
-    WARN("[%d] Error no transport for %s peer %d on channel %d\n", connector->comm->rank,
+    WARN("[%d] Error no transport for %s peer %d on channel %d", connector->comm->rank,
         type == proxyRecv ? "recv" : "send", peer, args->channel->id);
     return ncclInternalError;
   }
@@ -480,7 +480,7 @@ ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int typ
   while (nslots*state->slotSize < size) nslots *= 2;
   int s = (ptr-buff)/state->slotSize;
   if (s < 0 || s+nslots > state->nslots) {
-    WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)\n", ptr, size, buff, state->slotSize, state->nslots);
+    WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)", ptr, size, buff, state->slotSize, state->nslots);
     return ncclInternalError;
   }
   for (int i=0; i<nslots; i++) used[s+i] = 0;
diff --git a/projects/rccl/src/transport/coll_net.cc b/projects/rccl/src/transport/coll_net.cc
index 9f9a2ec666..5544be8889 100644
--- a/projects/rccl/src/transport/coll_net.cc
+++ b/projects/rccl/src/transport/coll_net.cc
@@ -295,7 +295,7 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
           int count = size/ncclTypeSize(args->dtype);
           NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*) buff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot));
           if (args->requests[buffSlot] != NULL) {
-            TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
+            TRACE(NCCL_NET, "sendProxy [%lu/%d] Iallreduce posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
             STORE(sizesFifo+buffSlot, -1);
             // Make sure size is reset to zero before we update the head.
             __sync_synchronize();
@@ -312,7 +312,7 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
       int buffSlot = args->done%NCCL_STEPS;
       NCCLCHECK(collNetTest((void*)(args->requests[buffSlot]), &done, &size));
       if (done) {
-        TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot], size);
+        TRACE(NCCL_NET, "sendProxy [%lu/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot], size);
         STORE(&reqFifo[buffSlot].size, size);
         // Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
         // (reordered store after store is possible on POWER, though not on x86)
@@ -357,7 +357,7 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
       char* recvBuff = p == NCCL_PROTO_LL ? (char*)resources->llData : localBuff;
       int recvStepSize = p == NCCL_PROTO_LL ? stepSize/2 : stepSize;
       STORE(&reqFifo[buffSlot].recvBuff, recvBuff+buffSlot*recvStepSize);
-      TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->posted, buffSlot, reqFifo[buffSlot].recvBuff);
+      TRACE(NCCL_NET, "recvProxy [%lu/%d] posted buffer %p", args->posted, buffSlot, reqFifo[buffSlot].recvBuff);
       args->posted += args->sliceSteps;
       args->idle = 0;
       return ncclSuccess;
@@ -365,7 +365,7 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
     if (args->posted > args->received) {
       int buffSlot = args->received%NCCL_STEPS;
       if (LOAD(&reqFifo[buffSlot].recvBuff) == NULL) { // Buffer is cleared : coll is complete
-        TRACE(NCCL_NET, "recvProxy [%d/%d] done, size %d", args->received, buffSlot, LOAD(&reqFifo[buffSlot].size));
+        TRACE(NCCL_NET, "recvProxy [%lu/%d] done, size %d", args->received, buffSlot, LOAD(&reqFifo[buffSlot].size));
         if (args->protocol == NCCL_PROTO_LL) { // ll
           // re-attach flag
           uint32_t flag = NCCL_LL_FLAG(args->received + 1);
diff --git a/projects/rccl/src/transport/net.cc b/projects/rccl/src/transport/net.cc
index be658ec4d6..4fe4423d43 100644
--- a/projects/rccl/src/transport/net.cc
+++ b/projects/rccl/src/transport/net.cc
@@ -362,7 +362,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
             args->channel->sizes += LOAD(sizesFifo+buffSlot);
             args->channel->send_byte += LOAD(sizesFifo+buffSlot);
 #endif
-            TRACE(NCCL_NET, "sendProxy [%d/%d] Isend (LL) posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
+            TRACE(NCCL_NET, "sendProxy [%lu/%d] Isend (LL) posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
             STORE(sizesFifo+buffSlot, -1);
             // Make sure size is reset to zero before we update the head.
             __sync_synchronize();
@@ -379,7 +379,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
       int buffSlot = args->done%NCCL_STEPS;
       NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
       if (done) {
-        TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot]);
+        TRACE(NCCL_NET, "sendProxy [%lu/%d] request %p done", args->done, buffSlot, args->requests[buffSlot]);
 #ifdef ENABLE_PROFILING
         if (args->protocol == NCCL_PROTO_SIMPLE) {
           args->channel->active_req --;
@@ -446,7 +446,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
       }
       NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, args->requests+buffSlot));
       if (args->requests[buffSlot] != NULL) {
-        TRACE(NCCL_NET, "recvProxy [%d/%d] posted recv request %p", args->posted, buffSlot, args->requests[buffSlot]);
+        TRACE(NCCL_NET, "recvProxy [%lu/%d] posted recv request %p", args->posted, buffSlot, args->requests[buffSlot]);
 #ifdef ENABLE_PROFILING
         if (args->protocol == NCCL_PROTO_SIMPLE) {
           if (args->channel->active_req == 0) {
diff --git a/projects/rccl/src/transport/net_ib.cc b/projects/rccl/src/transport/net_ib.cc
index 50ffd567cb..228a3fd4f8 100644
--- a/projects/rccl/src/transport/net_ib.cc
+++ b/projects/rccl/src/transport/net_ib.cc
@@ -85,7 +85,7 @@ static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort)
   snprintf(devicePath, PATH_MAX, "/sys/class/infiniband/%s/device", devName);
   char* p = realpath(devicePath, NULL);
   if (p == NULL) {
-    WARN("Could not find real path of %s", *devicePath);
+    WARN("Could not find real path of %s (%s)", devName, devicePath);
   } else {
     // Merge multi-port NICs into the same PCI device
     p[strlen(p)-1] = '0';
diff --git a/projects/rccl/src/transport/net_socket.cc b/projects/rccl/src/transport/net_socket.cc
index bbb3c8a58b..13e155c0fb 100644
--- a/projects/rccl/src/transport/net_socket.cc
+++ b/projects/rccl/src/transport/net_socket.cc
@@ -238,7 +238,7 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
     if (fd == -1) {
       // Could not find device vendor. This is handled silently so
       // we don't want to print an INFO error.
-      TRACE(NCCL_NET, "Open of %s failed : %s\n", vendorPath, strerror(errno));
+      TRACE(NCCL_NET, "Open of %s failed : %s", vendorPath, strerror(errno));
       goto end;
     }
     char vendor[7];