diff --git a/projects/rccl/ext-net/dummy/Makefile b/projects/rccl/ext-net/dummy/Makefile
new file mode 100644
index 0000000000..d1eb4c5a62
--- /dev/null
+++ b/projects/rccl/ext-net/dummy/Makefile
@@ -0,0 +1,17 @@
+#
+# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+NCCL_HOME:=../../build/
+CUDA_HOME:=/usr/local/cuda
+INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include
+PLUGIN_SO:=libnccl-net.so
+
+default: $(PLUGIN_SO)
+
+$(PLUGIN_SO): plugin.c
+	$(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^
+
+clean:
+	rm -f $(PLUGIN_SO)
diff --git a/projects/rccl/ext-net/dummy/plugin.c b/projects/rccl/ext-net/dummy/plugin.c
new file mode 100644
index 0000000000..b11cf76a61
--- /dev/null
+++ b/projects/rccl/ext-net/dummy/plugin.c
@@ -0,0 +1,44 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <stdio.h>
+#include <nccl.h>
+#include <nccl_net.h>
+
+#define __hidden __attribute__ ((visibility("hidden")))
+
+__hidden ncclResult_t pluginInit() { return ncclSuccess; }
+__hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
+__hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; }
+__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; }
+__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, int type, void** request) { return ncclInternalError; }
+__hidden ncclResult_t pluginIrecv(void* recvComm, void* data, int size, int type, void** request) { return ncclInternalError; }
+__hidden ncclResult_t pluginFlush(void* recvComm, void* data, int size) { return ncclInternalError; }
+__hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; }
+__hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError; }
+__hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; }
+
+ncclNet_t NCCL_PLUGIN_SYMBOL = {
+  "Dummy",
+  pluginInit,
+  pluginDevices,
+  pluginPciPath,
+  pluginPtrSupport,
+  pluginListen,
+  pluginConnect,
+  pluginAccept,
+  pluginIsend,
+  pluginIrecv,
+  pluginFlush,
+  pluginTest,
+  pluginCloseSend,
+  pluginCloseRecv,
+  pluginCloseListen
+};
diff --git a/projects/rccl/src/Makefile b/projects/rccl/src/Makefile
index 5d304441aa..481000ad16 100644
--- a/projects/rccl/src/Makefile
+++ b/projects/rccl/src/Makefile
@@ -8,7 +8,7 @@ include ../makefiles/common.mk
 include ../makefiles/version.mk
 
 ##### src files
-INCEXPORTS  := nccl.h
+INCEXPORTS  := nccl.h nccl_net.h
 LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \
 		misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \
 		transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
@@ -80,6 +80,11 @@ $(INCDIR)/%.h : %.h
 	mkdir -p $(INCDIR)
 	cp -f $< $@
 
+$(INCDIR)/nccl_%.h : include/nccl_%.h
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(INCDIR)
+	cp -f $< $@
+
 $(OBJDIR)/%.o : %.cu
 	@printf "Compiling  %-35s > %s\n" $< $@
 	mkdir -p `dirname $@`
diff --git a/projects/rccl/src/bootstrap.cu b/projects/rccl/src/bootstrap.cu
index 8593726b67..17aa1f45ef 100644
--- a/projects/rccl/src/bootstrap.cu
+++ b/projects/rccl/src/bootstrap.cu
@@ -217,7 +217,7 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
   int rank = state->rank;
   int nranks = state->nranks;
 
-  TRACE(INIT, "rank %d nranks %d size %d", rank, nranks, size);
+  TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size);
 
   /* Simple ring based AllGather
    * At each step i receive data from (rank-i-1) from left
@@ -233,7 +233,7 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
     NCCLCHECK(bootstrapRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
   }
 
-  TRACE(INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
+  TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
   return ncclSuccess;
 }
 
diff --git a/projects/rccl/src/collectives/all_gather.cu b/projects/rccl/src/collectives/all_gather.cu
index e19feff06b..8dec28e63b 100644
--- a/projects/rccl/src/collectives/all_gather.cu
+++ b/projects/rccl/src/collectives/all_gather.cu
@@ -12,7 +12,7 @@
 ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count,
     ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
+  INFO(NCCL_COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
   if (comm->nRanks == 1) {
     if (sendbuff != recvbuff)
       CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
diff --git a/projects/rccl/src/collectives/all_reduce.cu b/projects/rccl/src/collectives/all_reduce.cu
index 77ae4c8483..cc14083ab7 100644
--- a/projects/rccl/src/collectives/all_reduce.cu
+++ b/projects/rccl/src/collectives/all_reduce.cu
@@ -12,7 +12,7 @@
 ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count,
     ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
+  INFO(NCCL_COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
   if (comm->nRanks == 1) {
     if (sendbuff != recvbuff)
       CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
diff --git a/projects/rccl/src/collectives/broadcast.cu b/projects/rccl/src/collectives/broadcast.cu
index 0e5ec7b1cf..91ce905440 100644
--- a/projects/rccl/src/collectives/broadcast.cu
+++ b/projects/rccl/src/collectives/broadcast.cu
@@ -12,7 +12,7 @@
 ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count,
     ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
+  INFO(NCCL_COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
   if (comm->nRanks == 1) {
     if (sendbuff != recvbuff)
       CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
diff --git a/projects/rccl/src/collectives/reduce.cu b/projects/rccl/src/collectives/reduce.cu
index 76d4a194f7..d8fde80baa 100644
--- a/projects/rccl/src/collectives/reduce.cu
+++ b/projects/rccl/src/collectives/reduce.cu
@@ -12,7 +12,7 @@
 ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count,
     ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
+  INFO(NCCL_COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
   if (comm->nRanks == 1) {
     if (sendbuff != recvbuff)
       CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
diff --git a/projects/rccl/src/collectives/reduce_scatter.cu b/projects/rccl/src/collectives/reduce_scatter.cu
index af9d78b289..1447d4a91b 100644
--- a/projects/rccl/src/collectives/reduce_scatter.cu
+++ b/projects/rccl/src/collectives/reduce_scatter.cu
@@ -12,7 +12,7 @@
 ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count,
     ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
   size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
+  INFO(NCCL_COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
   if (comm->nRanks == 1) {
     if (sendbuff != recvbuff)
       CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
diff --git a/projects/rccl/src/include/core.h b/projects/rccl/src/include/core.h
index 2dd63d60e3..eff2172968 100644
--- a/projects/rccl/src/include/core.h
+++ b/projects/rccl/src/include/core.h
@@ -271,7 +271,7 @@ struct ncclComm {
   while (ret == -1) { \
     SYSCHECKVAL(call, name, ret); \
     if (ret == -1) { \
-      INFO(ALL,"Got %s, retrying", strerror(errno));   \
+      INFO(NCCL_ALL,"Got %s, retrying", strerror(errno));   \
     }\
   } \
 } while (0);
@@ -313,7 +313,7 @@ struct ncclComm {
   ncclResult_t res = call; \
   if (res != ncclSuccess) { \
     /* Print the back trace*/ \
-    INFO(ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
     return res; \
   } \
 } while (0);
@@ -322,7 +322,7 @@ struct ncclComm {
   res = call; \
   if (res != ncclSuccess) { \
     /* Print the back trace*/ \
-    INFO(ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
     goto label; \
   } \
 } while (0);
diff --git a/projects/rccl/src/include/debug.h b/projects/rccl/src/include/debug.h
index 83ae98b978..55dee1838c 100644
--- a/projects/rccl/src/include/debug.h
+++ b/projects/rccl/src/include/debug.h
@@ -16,65 +16,24 @@
 #include <limits.h>
 #include <string.h>
 #include "nccl.h"
+#include "nccl_net.h"
+
 #define gettid() (pid_t) syscall(SYS_gettid)
 
-typedef enum {NONE=0, VERSION=1, WARN=2, INFO=3, ABORT=4, TRACE=5} DebugLevel;
-typedef enum {INIT=1, COLL=2, P2P=4, SHM=8, NET=16, ALL=~0} SubSys;
-extern DebugLevel ncclDebugLevel;
+extern int ncclDebugLevel;
 extern uint64_t ncclDebugMask;
 extern pthread_mutex_t ncclDebugOutputLock;
 extern FILE *ncclDebugFile;
 extern ncclResult_t getHostName(char* hostname, int maxlen);
 
-#define WARN(...) do {                                           \
-  if (ncclDebugLevel >= WARN) {                                  \
-    char hostname[1024];                                         \
-    getHostName(hostname, 1024);                                 \
-    int cudaDev;                                                 \
-    cudaGetDevice(&cudaDev);                                     \
-    pthread_mutex_lock(&ncclDebugOutputLock);                    \
-    fprintf(ncclDebugFile,"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, __FILE__, __LINE__); \
-    fprintf(ncclDebugFile,__VA_ARGS__);                          \
-    fprintf(ncclDebugFile,"\n");                                 \
-    fflush(ncclDebugFile);                                       \
-    pthread_mutex_unlock(&ncclDebugOutputLock);                  \
-    if (ncclDebugLevel == ABORT) { fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n", hostname, getpid(), gettid(), cudaDev, __FILE__, __LINE__); abort(); } \
-  }                                                              \
-} while(0)
+extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
 
-#define INFO(FLAGS, ...) do {                                    \
-  if (ncclDebugLevel >= INFO && ((FLAGS) & ncclDebugMask)) {     \
-    char hostname[1024];                                         \
-    getHostName(hostname, 1024);                                 \
-    int cudaDev;                                                 \
-    cudaGetDevice(&cudaDev);                                     \
-    pthread_mutex_lock(&ncclDebugOutputLock);                    \
-    fprintf(ncclDebugFile,"%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev); \
-    fprintf(ncclDebugFile,__VA_ARGS__);fprintf(ncclDebugFile,"\n"); \
-    fflush(ncclDebugFile);                                       \
-    pthread_mutex_unlock(&ncclDebugOutputLock);                  \
-  }                                                              \
-} while(0)
+#define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
+#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
 
 #ifdef ENABLE_TRACE
-#define TRACE(FLAGS, ...) do {                                   \
-  if (ncclDebugLevel == TRACE && ((FLAGS) & ncclDebugMask)) {    \
-    char hostname[1024];                                         \
-    getHostName(hostname, 1024);                                 \
-    int cudaDev;                                                 \
-    cudaGetDevice(&cudaDev);                                     \
-    pthread_mutex_lock(&ncclDebugOutputLock);                    \
-    auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch; \
-    double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000; \
-    fprintf(ncclDebugFile,"%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, __func__, __LINE__); \
-    fprintf(ncclDebugFile,__VA_ARGS__);fprintf(ncclDebugFile,"\n"); \
-    fflush(ncclDebugFile);                                       \
-    pthread_mutex_unlock(&ncclDebugOutputLock);                  \
-  }                                                              \
-} while(0)
-
+#define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__)
 extern std::chrono::high_resolution_clock::time_point ncclEpoch;
-
 #else
 #define TRACE(...)
 #endif
@@ -84,17 +43,17 @@ extern std::chrono::high_resolution_clock::time_point ncclEpoch;
 static inline void initDebug() {
   const char* nccl_debug = getenv("NCCL_DEBUG");
   if (nccl_debug == NULL) {
-    ncclDebugLevel = NONE;
+    ncclDebugLevel = NCCL_LOG_NONE;
   } else if (strcasecmp(nccl_debug, "VERSION") == 0) {
-    ncclDebugLevel = VERSION;
+    ncclDebugLevel = NCCL_LOG_VERSION;
   } else if (strcasecmp(nccl_debug, "WARN") == 0) {
-    ncclDebugLevel = WARN;
+    ncclDebugLevel = NCCL_LOG_WARN;
   } else if (strcasecmp(nccl_debug, "INFO") == 0) {
-    ncclDebugLevel = INFO;
+    ncclDebugLevel = NCCL_LOG_INFO;
   } else if (strcasecmp(nccl_debug, "ABORT") == 0) {
-    ncclDebugLevel = ABORT;
+    ncclDebugLevel = NCCL_LOG_ABORT;
   } else if (strcasecmp(nccl_debug, "TRACE") == 0) {
-    ncclDebugLevel = TRACE;
+    ncclDebugLevel = NCCL_LOG_TRACE;
   }
 
   /* Parse the NCCL_DEBUG_SUBSYS env var
@@ -109,17 +68,17 @@ static inline void initDebug() {
       uint64_t mask = 0;
       if (subsys[0] == '^') { invert = 1; subsys++; }
       if (strcasecmp(subsys, "INIT") == 0) {
-        mask = INIT;
+        mask = NCCL_INIT;
       } else if (strcasecmp(subsys, "COLL") == 0) {
-        mask = COLL;
+        mask = NCCL_COLL;
       } else if (strcasecmp(subsys, "P2P") == 0) {
-        mask = P2P;
+        mask = NCCL_P2P;
       } else if (strcasecmp(subsys, "SHM") == 0) {
-        mask = SHM;
+        mask = NCCL_SHM;
       } else if (strcasecmp(subsys, "NET") == 0) {
-        mask = NET;
+        mask = NCCL_NET;
       } else if (strcasecmp(subsys, "ALL") == 0) {
-        mask = ALL;
+        mask = NCCL_ALL;
       }
       if (mask) {
         if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
@@ -133,7 +92,7 @@ static inline void initDebug() {
    * NCCL_DEBUG level is > VERSION
    */
   const char* nccl_debug_file = getenv("NCCL_DEBUG_FILE");
-  if (ncclDebugLevel > VERSION && nccl_debug_file != NULL) {
+  if (ncclDebugLevel > NCCL_LOG_VERSION && nccl_debug_file != NULL) {
     int c = 0;
     char debug_fn[PATH_MAX+1] = "";
     char *dfn = debug_fn;
@@ -164,7 +123,7 @@ static inline void initDebug() {
     if (debug_fn[0] != '\0') {
       FILE *file = fopen(debug_fn, "w");
       if (file != NULL) {
-        INFO(ALL,"DEBUG file is '%s'", debug_fn);
+        INFO(NCCL_ALL,"DEBUG file is '%s'", debug_fn);
         ncclDebugFile = file;
       }
     }
diff --git a/projects/rccl/src/include/nccl_net.h b/projects/rccl/src/include/nccl_net.h
index 58c1335d94..7dbbc37893 100644
--- a/projects/rccl/src/include/nccl_net.h
+++ b/projects/rccl/src/include/nccl_net.h
@@ -9,25 +9,25 @@
 
 #include "nccl.h"
 
-#define NCCL_NET_MAJOR 1
-#define NCCL_NET_MINOR 0
-
 #define NCCL_NET_HANDLE_MAXSIZE 64
 
 #define NCCL_PTR_HOST 0x1
 #define NCCL_PTR_CUDA 0x2
 
-#define NCCL_MAX_SCORE 0x7
+typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_ALL=~0} ncclDebugLogSubSys;
+
+typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
 
 typedef struct {
   // Name of the network (mainly for logs)
   const char* name;
-  // Return the number of network devices along with their scores relative to the
-  // current CUDA device. The per device score should be a value from 1-7 with a
-  // higher score representing a better choice for performance.
-  // This call should allocate the 'scores' array using malloc(3), and it
-  // will then be freed automatically by NCCL.
-  ncclResult_t (*devices)(int* ndev, int** scores);
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Return the device path in /sys. NCCL will call free on this path.
+  ncclResult_t (*pciPath)(int dev, char** path);
   // Return whether this device supports host pointers and/or CUDA pointers
   // as data from the current GPU. Supported types should be composed with
   // NCCL_PTR_HOST and NCCL_PTR_CUDA.
@@ -53,12 +53,10 @@ typedef struct {
   ncclResult_t (*closeSend)(void* sendComm);
   ncclResult_t (*closeRecv)(void* recvComm);
   ncclResult_t (*closeListen)(void* listenComm);
-} ncclNet_t;
+} ncclNet_v1_t;
 
-extern
-#ifdef __cplusplus
-"C"
-#endif
-ncclNet_t* ncclNet;
+typedef ncclNet_v1_t ncclNet_t;
+
+#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v1
 
 #endif // end include guard
diff --git a/projects/rccl/src/include/net.h b/projects/rccl/src/include/net.h
index d841419a55..ebc967782c 100644
--- a/projects/rccl/src/include/net.h
+++ b/projects/rccl/src/include/net.h
@@ -10,6 +10,7 @@
 #include "nccl.h"
 #include "nccl_net.h"
 
+extern ncclNet_t* ncclNet;
 typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
 
 /* Socket Interface Selection type */
@@ -19,7 +20,8 @@ typedef enum { findSubnetIf   = -1,
 
 // Translation to external API
 static const char* ncclNetName() { return ncclNet->name; }
-static ncclResult_t ncclNetDevices(int* ndev, int** scores) { NCCLCHECK(ncclNet->devices(ndev, scores)); return ncclSuccess; }
+static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
+static ncclResult_t ncclNetPciPath(int dev, char** path) { NCCLCHECK(ncclNet->pciPath(dev, path)); return ncclSuccess; }
 static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(ncclNet->ptrSupport(dev, supportedTypes)); return ncclSuccess; }
 static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
 static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
@@ -32,7 +34,6 @@ static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeS
 static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
 static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
 
-extern bool ncclIbSupport();
 extern ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str);
 extern ncclNet_t ncclNetIb;
 extern ncclNet_t ncclNetSocket;
diff --git a/projects/rccl/src/include/param.h b/projects/rccl/src/include/param.h
index 44c1b9a55f..dd5f697e34 100644
--- a/projects/rccl/src/include/param.h
+++ b/projects/rccl/src/include/param.h
@@ -67,10 +67,10 @@ int64_t ncclParam##name() { \
       errno = 0; \
       int64_t v = strtoll(str, NULL, 0); \
       if (errno) { \
-        INFO(ALL,"Invalid value %s for %s, using default %lu.", str, "NCCL_" env, value); \
+        INFO(NCCL_ALL,"Invalid value %s for %s, using default %lu.", str, "NCCL_" env, value); \
       } else { \
         value = v; \
-        INFO(ALL,"%s set by environment to %lu.", "NCCL_" env, value);  \
+        INFO(NCCL_ALL,"%s set by environment to %lu.", "NCCL_" env, value);  \
       } \
     } \
   } \
diff --git a/projects/rccl/src/include/socket.h b/projects/rccl/src/include/socket.h
index 533cacc9d5..9d2b2c8fc4 100644
--- a/projects/rccl/src/include/socket.h
+++ b/projects/rccl/src/include/socket.h
@@ -76,7 +76,7 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
     if (family != AF_INET && family != AF_INET6)
       continue;
 
-    TRACE(INIT|NET,"Found interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
+    TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
 
     /* Allow the caller to force the socket family type */
     if (sock_family != -1 && family != sock_family)
@@ -106,7 +106,7 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
       // Store the IP address
       int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
       memcpy(addrs+found, interface->ifa_addr, salen);
-      INFO(INIT|NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
+      INFO(NCCL_INIT|NCCL_NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
       found++;
     }
   }
@@ -183,7 +183,7 @@ static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAdd
     // Store the interface name
     strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
 
-    INFO(INIT|NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
+    INFO(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
     found++;
     if (found == maxIfs) break;
   }
@@ -333,7 +333,7 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
 
 #ifdef ENABLE_TRACE
   char line[1024];
-  TRACE(INIT|NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
+  TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
 #endif
 
   /* Put the socket in listen mode */
@@ -363,7 +363,7 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
 
 #ifdef ENABLE_TRACE
   char line[1024];
-  TRACE(INIT|NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line));
+  TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line));
 #endif
 
   SYSCHECKNTIMES(connect(*fd, &remoteAddr->sa, salen), "connect", RETRY_TIMES, SLEEP_INT, ECONNREFUSED);
@@ -381,7 +381,7 @@ static ncclResult_t socketReceive(int fd, void* ptr, int size) {
       return ncclSystemError;
     }
     if (recvsize == -1) {
-      INFO(NET,"Recv : got retcode %d, retrying", errno);
+      INFO(NCCL_NET,"Recv : got retcode %d, retrying", errno);
       continue;
     }
     data += recvsize;
@@ -397,7 +397,7 @@ static ncclResult_t socketSend(int fd, void* ptr, int size) {
     int sendsize;
     SYSCHECKVAL(write(fd, data, size-offset), "write", sendsize);
     if (sendsize == -1) {
-      INFO(NET,"Send : got retcode %d, retrying", errno);
+      INFO(NCCL_NET,"Send : got retcode %d, retrying", errno);
       continue;
     }
     data += sendsize;
diff --git a/projects/rccl/src/include/topo.h b/projects/rccl/src/include/topo.h
index 24d7e9d28e..971ae68c2f 100644
--- a/projects/rccl/src/include/topo.h
+++ b/projects/rccl/src/include/topo.h
@@ -8,53 +8,26 @@
 #define NCCL_TOPO_H_
 
 #include "nccl.h"
+#include <limits.h>
+#include <stdlib.h>
 #include <ctype.h>
 
-#define MAXPATHSIZE 1024
-
 static ncclResult_t getCudaPath(int cudaDev, char** path) {
   char busId[16];
   CUDACHECK(cudaDeviceGetPCIBusId(busId, 16, cudaDev));
   for (int i=0; i<16; i++) busId[i] = tolower(busId[i]);
-  char busPath[] = "/sys/class/pci_bus/0000:00/device";
+  char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
   memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, sizeof("0000:00")-1);
-  char* cudaRpath = realpath(busPath, NULL);
-  char pathname[MAXPATHSIZE];
-  strncpy(pathname, cudaRpath, MAXPATHSIZE);
-  strncpy(pathname+strlen(pathname), "/", MAXPATHSIZE-strlen(pathname));
-  strncpy(pathname+strlen(pathname), busId, MAXPATHSIZE-strlen(pathname));
-  free(cudaRpath);
-  *path = realpath(pathname, NULL);
+  memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, sizeof("0000:00:00.0")-1);
+  *path = realpath(busPath, NULL);
   if (*path == NULL) {
-    WARN("Could not find real path of %s", pathname);
+    WARN("Could not find real path of %s", busPath);
     return ncclSystemError;
   }
   return ncclSuccess;
 }
 
-static ncclResult_t getMlxPath(char* ibName, char** path) {
-  char devicepath[MAXPATHSIZE];
-  snprintf(devicepath, MAXPATHSIZE, "/sys/class/infiniband/%s/device", ibName);
-  *path = realpath(devicepath, NULL);
-  if (*path == NULL) {
-    WARN("Could not find real path of %s", devicepath);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t getSockPath(char* ifName, char** path) {
-  char devicepath[MAXPATHSIZE];
-  snprintf(devicepath, MAXPATHSIZE, "/sys/class/net/%s/device", ifName);
-  *path = realpath(devicepath, NULL);
-  if (*path == NULL) {
-    INFO(NET|INIT, "Could not find real path of %s", devicepath);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-enum ncclIbPathDist {
+enum ncclPathDist {
   PATH_PIX = 0,
   PATH_PXB = 1,
   PATH_PHB = 2,
@@ -74,7 +47,7 @@ static int pciDistance(char* path1, char* path2) {
       if (same == 1) score++;
     }
   }
-  if (score == 3) return PATH_SOC;
+  if (score <= 3) return PATH_SOC;
   if (score == 4) return PATH_PHB;
   if (score == depth-1) return PATH_PIX;
   return PATH_PXB;
diff --git a/projects/rccl/src/init.cu b/projects/rccl/src/init.cu
index ed33d1aa43..c9700a8b4f 100644
--- a/projects/rccl/src/init.cu
+++ b/projects/rccl/src/init.cu
@@ -28,9 +28,13 @@
 #include <string.h>
 #include <errno.h>
 #include <assert.h>
+#include <dlfcn.h>
 
-DebugLevel ncclDebugLevel;
-uint64_t ncclDebugMask = INIT; // Default debug sub-system mask is INIT
+#define STR2(v) #v
+#define STR(v) STR2(v)
+
+int ncclDebugLevel;
+uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
 pthread_mutex_t ncclDebugOutputLock;
 FILE *ncclDebugFile = stdout;
 
@@ -48,7 +52,6 @@ NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
 
 NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
 
-extern "C" __attribute__ ((visibility("default")))
 ncclNet_t* ncclNet = NULL;
 
 // We define this as weak to let tests redefine their own
@@ -69,13 +72,53 @@ int ncclCudaFullCompCap() {
   return ccMajor*10+ccMinor;
 }
 
-void initNet() {
-  if (ncclNet != NULL) {
-    INFO(INIT,"Using external Network %s", ncclNetName());
-  } else {
-    ncclNet = ncclIbSupport() ? &ncclNetIb : &ncclNetSocket;
-    INFO(INIT,"Using internal Network %s", ncclNetName());
+ncclResult_t initNet(ncclNet_t* net) {
+  int ndev;
+  NCCLCHECK(net->init(ncclDebugLog));
+  NCCLCHECK(net->devices(&ndev));
+  if (ndev <= 0) {
+    INFO(NCCL_INIT, "Net/%s: call to devices() returned 0 devices.", net->name);
+    return ncclSystemError;
   }
+  return ncclSuccess;
+}
+
+ncclResult_t initNetPlugin(ncclNet_t** net) {
+  void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL);
+  if (netPluginLib == NULL) {
+    INFO(NCCL_INIT, "Unable to load libnccl-net.so : %s", dlerror());
+    return ncclSuccess;
+  }
+  ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL));
+  if (extNet == NULL) {
+    INFO(NCCL_INIT, "NetPlugin: could not find " STR(NCCL_PLUGIN_SYMBOL) " symbol");
+    goto cleanup;
+  }
+  if (initNet(extNet) == ncclSuccess) {
+    *net = extNet;
+    return ncclSuccess;
+  }
+cleanup:
+  if (netPluginLib != NULL) dlclose(netPluginLib);
+  return ncclSuccess;
+}
+
+ncclResult_t initNet() {
+  // Always initialize sockets as we use it for bootstrap
+  NCCLCHECK(initNet(&ncclNetSocket));
+
+  NCCLCHECK(initNetPlugin(&ncclNet));
+  if (ncclNet != NULL) {
+    INFO(NCCL_INIT, "Using external Network %s", ncclNetName());
+    return ncclSuccess;
+  }
+  if (initNet(&ncclNetIb) == ncclSuccess) {
+    ncclNet = &ncclNetIb;
+  } else {
+    ncclNet = &ncclNetSocket;
+  }
+  INFO(NCCL_INIT,"Using internal Network %s", ncclNetName());
+  return ncclSuccess;
 }
 
 NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
@@ -171,7 +214,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
   struct ncclComm* comm;
   NCCLCHECK(ncclCalloc(&comm, 1));
 
-  INFO(INIT,"comm %p rank %d nranks %d", comm, rank, ndev);
+  INFO(NCCL_INIT,"comm %p rank %d nranks %d", comm, rank, ndev);
   comm->rank = rank;
   comm->nRanks = ndev;
   cudaGetDevice(&comm->cudaDev);
@@ -204,16 +247,14 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
 }
 
 // Pre-process the string so that running "strings" on the lib can quickly reveal the version.
-#define STR2(v) #v
-#define STR(v) STR2(v)
 #define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
 static void showVersion() {
   static int shown = 0;
-  if (shown == 0 && ncclDebugLevel >= VERSION) {
+  if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) {
     printf("%s\n", VERSION_STRING);
     fflush(stdout);
     if (ncclDebugFile != stdout)
-      INFO(ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files
+      INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files
     shown = 1;
   }
 }
@@ -294,12 +335,12 @@ void dumpMatrix(int* connectMatrix, int nranks) {
   line[STRLENGTH] = '\0';
   memset(line, ' ', STRLENGTH);
   for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", j);
-  INFO(INIT,"%s", line);
+  INFO(NCCL_INIT,"%s", line);
   for (int i=0; i<nranks; i++) {
     memset(line, ' ', STRLENGTH);
     sprintf(line, "%3d ", i);
     for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", connectMatrix[i*nranks+j]);
-    INFO(INIT,"%s", line);
+    INFO(NCCL_INIT,"%s", line);
   }
 }
 
@@ -308,12 +349,12 @@ void dumpMatrixTvalue(ncclTvalue_t* connectMatrix, int nranks) {
   line[STRLENGTH] = '\0';
   memset(line, ' ', STRLENGTH);
   for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4d", j);
-  INFO(INIT,"%s", line);
+  INFO(NCCL_INIT,"%s", line);
   for (int i=0; i<nranks; i++) {
     memset(line, ' ', STRLENGTH);
     sprintf(line, "%3d ", i);
     for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4o", (int)connectMatrix[i*nranks+j]);
-    INFO(INIT,"%s", line);
+    INFO(NCCL_INIT,"%s", line);
   }
 }
 
@@ -325,7 +366,7 @@ void dumpLine(int* values, int nranks, const char* prefix) {
   memset(line, ' ', STRLENGTH);
   strncpy(line, prefix, PREFIXLEN);
   for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]);
-  INFO(INIT,"%s", line);
+  INFO(NCCL_INIT,"%s", line);
 }
 
 static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
@@ -477,7 +518,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
   for (int i=0; i<nranks; i++)
     comm->nThreads = std::max(allData[i], comm->nThreads);
-  if (rank == 0) INFO(INIT,"Using %d threads", comm->nThreads);
+  if (rank == 0) INFO(NCCL_INIT,"Using %d threads", comm->nThreads);
 
   // Determine the minimum CUDA Compute capability of all GPUs
   int myCompCap = ncclCudaCompCap();
@@ -486,7 +527,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
   for (int i=0; i<nranks; i++)
     minCompCap = std::min(allData[i], minCompCap);
-  if (rank == 0) INFO(INIT,"Min Comp Cap %d", minCompCap);
+  if (rank == 0) INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
 
   // Find min nrings across ranks
   allData[rank] = nrings;
@@ -547,7 +588,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
       multiNode = 1;
     }
   }
-  TRACE(INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
+  TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
       rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
   if (intraRank == -1 || intraRank0 == -1 || rankInfos[intraRank0].comm == NULL) {
     WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
@@ -596,7 +637,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
   sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
   NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
 
-  INFO(INIT,"comm %p rank %d nranks %d - COMPLETE", *newcomm, myrank, nranks);
+  INFO(NCCL_INIT,"comm %p rank %d nranks %d - COMPLETE", *newcomm, myrank, nranks);
 
   return ncclSuccess;
 cleanup:
@@ -615,7 +656,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
   NCCLCHECK(ncclInit());
   if (myrank == 0) showVersion();
 
-  INFO(INIT,"rank %d nranks %d", myrank, nranks);
+  INFO(NCCL_INIT,"rank %d nranks %d", myrank, nranks);
 
   // Make sure the CUDA runtime is initialized.
   CUDACHECK(cudaFree(NULL));
@@ -679,8 +720,8 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
   free(prev);
   free(next);
 
-  INFO(INIT,"Using %d threads", nthreads);
-  INFO(INIT,"Min Comp Cap %d", minCompCap);
+  INFO(NCCL_INIT,"Using %d threads", nthreads);
+  INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
 
   int* rings;
   NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
@@ -733,7 +774,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
   NCCLCHECK(wrapNvmlInit());
   showVersion();
 
-  INFO(INIT,"nranks %d", ndev);
+  INFO(NCCL_INIT,"nranks %d", ndev);
 
   NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
   if (ndev < 1) {
@@ -793,7 +834,7 @@ cleanup:
 
 final:
   if(wrapNvmlShutdown() != ncclSuccess)
-    INFO(INIT,"NCCL did not shutdown nvml properly");
+    INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
   cudaSetDevice(savedDevice);
   sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
   return res;
diff --git a/projects/rccl/src/misc/enqueue.cu b/projects/rccl/src/misc/enqueue.cu
index dc5d9cc4aa..6c6e2cee8e 100644
--- a/projects/rccl/src/misc/enqueue.cu
+++ b/projects/rccl/src/misc/enqueue.cu
@@ -175,7 +175,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
   // We can't print the CG mode before the first barrier happened.
   if (comm->rank == 0 && *comm->intraCGMode & 0x10) {
     *comm->intraCGMode ^= 0x10;
-    INFO(INIT,"Launch mode %s%s%s",
+    INFO(NCCL_INIT,"Launch mode %s%s%s",
         comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel",
         *comm->intraCGMode ? "/CGMD" : "",
         (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
diff --git a/projects/rccl/src/misc/group.cu b/projects/rccl/src/misc/group.cu
index c7b31cfeb9..1716a75643 100644
--- a/projects/rccl/src/misc/group.cu
+++ b/projects/rccl/src/misc/group.cu
@@ -58,7 +58,7 @@ ncclResult_t ncclSetDevice(int cudaDev) {
 
 #define CHECK(a) do { \
   if ((args->ret = (a)) != ncclSuccess) { \
-    INFO(INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
+    INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
     return args; \
   } \
 } while(0)
diff --git a/projects/rccl/src/misc/nvmlwrap.cu b/projects/rccl/src/misc/nvmlwrap.cu
index 52d4d41473..d9407f4686 100644
--- a/projects/rccl/src/misc/nvmlwrap.cu
+++ b/projects/rccl/src/misc/nvmlwrap.cu
@@ -61,7 +61,7 @@ ncclResult_t wrapNvmlSymbols(void) {
     cast = (void**)&funcptr;                             \
     tmp = dlsym(handle, symbol);                         \
     if (tmp == NULL) {                                   \
-      INFO(INIT,"dlsym failed on %s, ignoring", symbol); \
+      INFO(NCCL_INIT,"dlsym failed on %s, ignoring", symbol); \
     }                                                    \
     *cast = tmp;                                         \
   } while (0)
@@ -208,7 +208,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link
   }
   nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive);
   if (ret != NVML_SUCCESS) {
-    INFO(INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
+    INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
         nvmlInternalErrorString(ret));
     return ncclSystemError;
   }
@@ -223,7 +223,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned
   nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci);
   if (ret != NVML_SUCCESS) {
     if (ret != NVML_ERROR_NOT_SUPPORTED)
-      INFO(INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ",
+      INFO(NCCL_INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ",
           nvmlInternalErrorString(ret));
     return ncclSystemError;
   }
@@ -239,7 +239,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int
   nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult);
   if (ret != NVML_SUCCESS) {
     if (ret != NVML_ERROR_NOT_SUPPORTED)
-      INFO(INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ",
+      INFO(NCCL_INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ",
           nvmlInternalErrorString(ret));
     return ncclSystemError;
   }
diff --git a/projects/rccl/src/misc/rings.cu b/projects/rccl/src/misc/rings.cu
index 9ecda49cd9..a5d4616019 100644
--- a/projects/rccl/src/misc/rings.cu
+++ b/projects/rccl/src/misc/rings.cu
@@ -5,9 +5,10 @@
  ************************************************************************/
 
 #include "core.h"
-#include "net.h"
 #include "param.h"
 
+#define NCCL_MAX_SCORE 7
+
 /* Parse user defined rings. Format is like :
  * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
  * Rings with a non-matching number of ranks are ignored so we can provide
@@ -188,11 +189,11 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
   if (str && strlen(str)>0) {
     int ret = parseRings(str, nrings, nranks, prev, next);
     if (ret == ncclSuccess && *nrings > 0) {
-      if (rank == 0) INFO(INIT,"%d ring(s) set by environment", *nrings);
+      if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings);
       NCCLCHECK(getEnvThreads(nthreads));
       return ncclSuccess;
     }
-    if (rank == 0) INFO(INIT,"No valid ring found in environment, ignoring");
+    if (rank == 0) INFO(NCCL_INIT,"No valid ring found in environment, ignoring");
     *nrings = 0;
   }
 
@@ -333,13 +334,13 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
     minNrings = MAXRINGS;
   }
   if (maxNrings > 0 && maxNrings <= *nrings) {
-    if (rank == 0) INFO(INIT,"Limiting to %d rings per user request.", maxNrings);
+    if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
     *nrings = maxNrings;
   } else {
     int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
     if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
     if (minNrings > 0 && minNrings > *nrings) {
-      if (rank == 0 && minNrings > defaultMinNrings) INFO(INIT,"Duplicating rings to %d per user request.", minNrings);
+      if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings);
       for (int r=*nrings; r<MAXRINGS && r <minNrings; r++) {
         for (int i=0; i<nranks; i++) {
           prev[r*nranks+i] = prev[(r-*nrings)*nranks+i];
diff --git a/projects/rccl/src/misc/utils.cu b/projects/rccl/src/misc/utils.cu
index d517267f96..d8e3aec5f5 100644
--- a/projects/rccl/src/misc/utils.cu
+++ b/projects/rccl/src/misc/utils.cu
@@ -6,8 +6,10 @@
 
 #include "utils.h"
 #include "debug.h"
+#include "nccl_net.h"
 #include <unistd.h>
 #include <string.h>
+#include <stdarg.h>
 
 ncclResult_t getHostName(char* hostname, int maxlen) {
   if (gethostname(hostname, maxlen) != 0) {
@@ -20,6 +22,53 @@ ncclResult_t getHostName(char* hostname, int maxlen) {
   return ncclSuccess;
 }
 
+/* Common logging function used by the INFO, WARN and TRACE macros
+ * Also exported to the dynamically loadable Net transport modules so
+ * they can share the debugging mechanisms and output files
+ */
+void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
+  if (ncclDebugLevel <= NCCL_LOG_NONE) return;
+
+  char hostname[1024];
+  getHostName(hostname, 1024);
+  int cudaDev;
+  cudaGetDevice(&cudaDev);
+
+  char buffer[1024];
+  size_t len = 0;
+  pthread_mutex_lock(&ncclDebugOutputLock);
+  if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN)
+    len = snprintf(buffer, sizeof(buffer),
+                   "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line);
+  else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask))
+    len = snprintf(buffer, sizeof(buffer),
+                   "%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev);
+#ifdef ENABLE_TRACE
+  else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
+    auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
+    double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
+    len = snprintf(buffer, sizeof(buffer),
+                   "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line);
+  }
+#endif
+  if (len) {
+    va_list vargs;
+    va_start(vargs, fmt);
+    (void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
+    va_end(vargs);
+    fprintf(ncclDebugFile,"%s\n", buffer);
+    fflush(ncclDebugFile);
+  }
+  pthread_mutex_unlock(&ncclDebugOutputLock);
+
+  // If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort()
+  if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) {
+    fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n",
+            hostname, getpid(), gettid(), cudaDev, filefunc, line);
+    abort();
+  }
+}
+
 uint64_t getHash(const char* string) {
   // Based on DJB2, result = result * 33 + char
   uint64_t result = 5381;
@@ -51,7 +100,7 @@ uint64_t getHostHash(void) {
   offset += len;
   // Trailing '\0'
   uname[offset]='\0';
-  TRACE(INIT,"unique hostname '%s'", uname);
+  TRACE(NCCL_INIT,"unique hostname '%s'", uname);
 
   return getHash(uname);
 }
@@ -71,7 +120,7 @@ uint64_t getPidHash(void) {
   if (len < 0) len = 0;
 
   pname[plen+len]='\0';
-  TRACE(INIT,"unique PID '%s'", pname);
+  TRACE(NCCL_INIT,"unique PID '%s'", pname);
 
   return getHash(pname);
 }
diff --git a/projects/rccl/src/ring.cu b/projects/rccl/src/ring.cu
index e9e9573703..fede79387f 100644
--- a/projects/rccl/src/ring.cu
+++ b/projects/rccl/src/ring.cu
@@ -26,7 +26,7 @@ ncclResult_t initRing(struct ncclComm* comm, int ringid) {
   NCCLCHECK(ncclCudaCalloc((char**)&recvMem, recvSize));
   ring->devMemRecv = recvMem;
 
-  TRACE(INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize);
+  TRACE(NCCL_INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize);
 
   // Pre-configure send/recv pointers. Those are the default, they may change later.
   ring->recv.conn.buff = recvMem->buff;
diff --git a/projects/rccl/src/transport.cu b/projects/rccl/src/transport.cu
index f5f9d75e29..7c13d5c351 100644
--- a/projects/rccl/src/transport.cu
+++ b/projects/rccl/src/transport.cu
@@ -113,8 +113,8 @@ ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRoun
 
   int nrounds = (int)(DIVUP(nbytes, ((size_t)nrings * nblocksPerRound * (buffSize/subchunks)))); // Fixed 32-bit overflow
   int nsteps = nstepsPerRound * nrounds * substeps;
-  TRACE(NET,"opCount %lx substeps %d subchunks %d nrounds %d nsteps %d comm %p", comm->opCount, subchunks, subchunks, nrounds, nsteps, comm);
-  TRACE(NET,"opCount %lx nbytes %zi nrings %d buffSize %d pattern %d comm %p", comm->opCount, nbytes, nrings, buffSize, pattern, comm);
+  TRACE(NCCL_NET,"opCount %lx substeps %d subchunks %d nrounds %d nsteps %d comm %p", comm->opCount, subchunks, subchunks, nrounds, nsteps, comm);
+  TRACE(NCCL_NET,"opCount %lx nbytes %zi nrings %d buffSize %d pattern %d comm %p", comm->opCount, nbytes, nrings, buffSize, pattern, comm);
   for (int r=0; r<nrings; r++) {
     struct ncclRing* ring = comm->rings+((comm->myParams->gridDim.x+r)%comm->nRings);
     struct ncclProxyArgs args = { ring, substeps*subchunks, nsteps, comm->opCount, llMode, 0 };
@@ -159,7 +159,7 @@ ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclCo
   struct ncclConnector* connector = (type == RECV) ? &ring->recv : &ring->send;
   threadFunc_t proxyfunc = (threadFunc_t) ((type == RECV) ? connector->transport->recv.proxy : connector->transport->send.proxy);
   if (proxyfunc) {
-    TRACE(NET,"type %d ring %p proxyfunc %p comm %p", type, ring, proxyfunc, comm);
+    TRACE(NCCL_NET,"type %d ring %p proxyfunc %p comm %p", type, ring, proxyfunc, comm);
     struct transportProxyInfo* info;
     NCCLCHECK(ncclCalloc(&info, 1));
     connector->proxyInfo = info;
diff --git a/projects/rccl/src/transport/net.cu b/projects/rccl/src/transport/net.cu
index ed62a66c91..8a7e3b8757 100644
--- a/projects/rccl/src/transport/net.cu
+++ b/projects/rccl/src/transport/net.cu
@@ -19,11 +19,21 @@
 #define NET_BITS_PER_IF 3
 #define NET_BITS_PER_IF_MASK ((1<<NET_BITS_PER_IF)-1)
 static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IFS*NET_BITS_PER_IF must fit in a ncclTvalue_t");
+static ncclTvalue_t getTvalue(short* distances, int ndev) {
+  ncclTvalue_t tvalue = 0;
+  for (int d=0; d<ndev; d++) {
+    int score = 1 + PATH_SOC - distances[d];
+    // Keep 3 bits of score info per dev
+    tvalue |= ((score & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d));
+  }
+  return tvalue;
+}
 
 struct netInfo {
   int rank;
   int ndev;
-  short scores[NET_MAX_IFS];
+  ncclTvalue_t tValue;
+  short distances[NET_MAX_IFS];
 };
 
 struct netConnectInfo {
@@ -38,7 +48,7 @@ struct netSendResources {
   struct ncclRecvMem* devHostRecvMem;
   struct ncclSendMem* hostDevMem;
   int netDev;
-  bool cudaSupport;
+  int useGdr;
   struct ncclRecvMem* devNetMem;
   uint64_t llStep;
   uint64_t llLastCleaning;
@@ -53,7 +63,7 @@ struct netRecvResources {
   struct ncclRecvMem* devHostRecvMem;
   struct ncclRecvMem* hostDevMem;
   int netDev;
-  bool cudaSupport;
+  int useGdr;
   uint64_t llStep;
   uint64_t llLastCleaning;
 };
@@ -64,26 +74,37 @@ ncclResult_t netFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
   struct netInfo* info = (struct netInfo*)opaqueInfo;
   static_assert(sizeof(struct netInfo) <= sizeof(ncclTinfo_t), "NET Info too large");
   info->rank = rank;
-  int *scores;
-  NCCLCHECK(ncclNetDevices(&info->ndev, &scores));
+  NCCLCHECK(ncclNetDevices(&info->ndev));
   if (info->ndev == 0) {
     WARN("Error : Network returned 0 device");
     return ncclSystemError;
   }
   if (info->ndev > NET_MAX_IFS) info->ndev = NET_MAX_IFS;
-  for (int d=0; d<info->ndev; d++) info->scores[d] = scores[d];
-  free(scores);
+
+  // Find distance with current GPU
+  int cudaDev;
+  cudaGetDevice(&cudaDev);
+  char* cudaPath;
+  NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
+
+  char line[1024];
+  sprintf(line, "CUDA Dev %d, %s NIC distance : ", cudaDev, ncclNetName());
+  for (int d=0; d<info->ndev; d++) {
+    char* nicPath;
+    ncclResult_t err = ncclNetPciPath(d, &nicPath);
+    info->distances[d] = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
+    sprintf(line+strlen(line), " %s", pathDists[info->distances[d]]);
+    if (err == ncclSuccess) free(nicPath);
+  }
+  INFO(NCCL_INIT|NCCL_NET, "%s", line);
+  free(cudaPath);
   return ncclSuccess;
 }
 
 /* Determine if we can communicate with the peer */
 ncclResult_t netCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
-  ret[0] = 0;
   struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
-  for (int d=0; d<myInfo->ndev; d++) {
-    // Keep 3 bits of score info per dev
-    ret[0] |= ((myInfo->scores[d] & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d));
-  }
+  ret[0] = getTvalue(myInfo->distances, myInfo->ndev);
   return ncclSuccess;
 }
 
@@ -175,13 +196,13 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
   return ncclSuccess;
 }
 
-int getDev(int ringId, int nDev, short* scores) {
-  int maxScore = 0;
-  for (int d=0; d<nDev; d++) if (scores[d] > maxScore) maxScore = scores[d];
+int getDev(int ringId, int nDev, short* distances) {
+  int minDistance = PATH_SOC;
+  for (int d=0; d<nDev; d++) if (distances[d] < minDistance) minDistance = distances[d];
   int skip = ringId+1;
   while (skip) {
     for (int d=0; d<nDev; d++) {
-      if (scores[d] == maxScore) {
+      if (distances[d] == minDistance) {
         skip--;
         if (skip == 0) return d;
       }
@@ -191,6 +212,40 @@ int getDev(int ringId, int nDev, short* scores) {
 }
 
 NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
+NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
+
+static ncclResult_t netGetGdrSupport(int dev, int distance, int read, int* useGdr) {
+  *useGdr = 0;
+
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+
+  if (read) { // For reads (sends) only enable under certain conditions
+    int gdrReadParam = ncclParamNetGdrRead();
+    if (gdrReadParam == 0) return ncclSuccess;
+    else if (gdrReadParam < 0) { // default : enable only on DGX2
+      char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+      CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
+      int nvlinks = getNumNvlinks(busId);
+      if (nvlinks < CONNECT_NVSWITCH || ncclCudaCompCap() < 7) return ncclSuccess;
+    }
+  }
+
+  // Check if we are close enough that it makes sense to enable GDR
+  int netGdrLevel = ncclParamNetGdrLevel();
+  if (distance >= netGdrLevel) {
+    INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, dev, distance, netGdrLevel);
+    return ncclSuccess;
+  }
+
+  // Finally, check if the NIC supports it
+  int flags;
+  NCCLCHECK(ncclNetPtrSupport(dev, &flags));
+  if (flags & NCCL_PTR_CUDA == 0) return ncclSuccess;
+  *useGdr = 1;
+  INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d / HCA %d (distance %d >= %d), read %d", ncclNetName(), cudaDev, dev, distance, netGdrLevel, read);
+  return ncclSuccess;
+}
 
 /* Determine if we will use this transport for this peer and return connect
  * information for this peer */
@@ -200,34 +255,11 @@ ncclResult_t netSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
   ring->send.transportResources = resources;
 
   struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
-  resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->scores);
-  resources->cudaSupport = false;
-
-  // Get user's GDR READ setting
-  int gdrReadParam = ncclParamNetGdrRead();
-
-  // Determine whether the GPU has NVLink
-  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
-  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-  CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
-  int nvlinks = getNumNvlinks(busId);
-
-  // Enable GDR read when:
-  // 1) user sets it, or
-  // 2) we are on a NVSwitch platform (i.e. no P2P traffic over PCI-E switch) AND the GPU is Volta
-  bool enableGdrRead = (gdrReadParam > 0) || (nvlinks >= CONNECT_NVSWITCH && ncclCudaCompCap() > 6 && gdrReadParam != 0);
-  if (enableGdrRead) {
-    int flags;
-    NCCLCHECK(ncclNetPtrSupport(resources->netDev, &flags));
-    if (flags & NCCL_PTR_CUDA)
-      resources->cudaSupport = true;
-  }
-  if (resources->cudaSupport)
-    INFO(INIT|NET, "Net: enabling net device %d to read from rank %d", resources->netDev, myInfo->rank);
+  resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances);
+  NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 1, &resources->useGdr));
 
   int size = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
-  if (resources->cudaSupport) {
+  if (resources->useGdr) {
     NCCLCHECK(ncclCudaCalloc((char**)(&resources->devNetMem), size));
   }
 
@@ -243,10 +275,8 @@ ncclResult_t netRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
   ring->recv.transportResources = resources;
 
   struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
-  resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->scores);
-  int flags;
-  NCCLCHECK(ncclNetPtrSupport(resources->netDev, &flags));
-  resources->cudaSupport = (flags & NCCL_PTR_CUDA) ? true : false;
+  resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances);
+  NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 0, &resources->useGdr));
 
   int sendSize = sizeof(struct ncclSendMem);
   NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
@@ -255,8 +285,8 @@ ncclResult_t netRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
   NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
 
   struct netInfo* peerInfo = (struct netInfo*)peerOpaqueInfo;
-  INFO(INIT|NET,"Ring %02d : %d -> %d via NET/%s/%d%s%s", ring->id, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
-      resources->cudaSupport ? "/GDRDMA" : "",
+  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d via NET/%s/%d%s%s", ring->id, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
+      resources->useGdr ? "/GDRDMA" : "",
       (resources->hostDevMem != NULL) ? "/GDCopy" : "");
   struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
   NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
@@ -267,7 +297,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
   // Setup device pointers
   struct netSendResources* resources = (struct netSendResources*)send->transportResources;
 
-  if (resources->cudaSupport) {
+  if (resources->useGdr) {
     send->conn.buff = resources->devNetMem->buff;
     // We don't use devMem for llMode because the CPU has to read the data
     send->conn.llBuff = resources->devHostRecvMem->llBuff;
@@ -299,7 +329,7 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
   recv->conn.head = &resources->devHostSendMem->head;
   recv->conn.llHead = &resources->devHostSendMem->llHead;
 
-  if (resources->cudaSupport == false) {
+  if (resources->useGdr == 0) {
     recv->conn.buff = resources->devHostRecvMem->buff;
     recv->conn.llBuff = resources->devHostRecvMem->llBuff;
   }
@@ -320,7 +350,7 @@ ncclResult_t netSendFree(void* transportResources) {
   struct netSendResources* resources = (struct netSendResources*)transportResources;
   NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
   NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
-  if (resources->cudaSupport)
+  if (resources->useGdr)
     CUDACHECK(cudaFree(resources->devNetMem));
   NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
   free(resources);
@@ -344,9 +374,9 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
   volatile uint64_t* prevTail = &resources->hostRecvMem->tail;
   struct ncclSendMem* prevMem = resources->hostDevMem ? resources->hostDevMem : resources->hostSendMem;
   uint64_t* prevHead = llMode ? &prevMem->llHead : &prevMem->head;
-  struct ncclRecvMem* localMem = resources->cudaSupport ? resources->devNetMem : resources->hostRecvMem;
+  struct ncclRecvMem* localMem = resources->useGdr ? resources->devNetMem : resources->hostRecvMem;
   char* localBuff = llMode ? resources->hostRecvMem->llBuff : localMem->buff;
-  int ptrType = resources->cudaSupport ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
+  int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
   volatile int* sizesFifo = llMode ? resources->hostRecvMem->llSizesFifo : resources->hostRecvMem->sizesFifo;
   int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
   int sliceSize = buffSize / args->substeps;
@@ -362,8 +392,8 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
 
   if (!args->needProxy) goto nextColl;
 
-  TRACE(NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
-  TRACE(NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
+  TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
+  TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
 
   // Update in case we skipped some collectives
   if (llMode == 0) resources->hostRecvMem->opCount = args->opCount;
@@ -440,10 +470,10 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
   int llMode = args->llMode;
 
   volatile uint64_t* nextHead = llMode ? &resources->hostSendMem->llHead : &resources->hostSendMem->head;
-  struct ncclRecvMem* localMem = resources->cudaSupport ? ring->devMemRecv : resources->hostRecvMem;
+  struct ncclRecvMem* localMem = resources->useGdr ? ring->devMemRecv : resources->hostRecvMem;
   char* localBuff = llMode ? localMem->llBuff : localMem->buff;
-  char* nextBuff = (resources->cudaSupport == false && resources->hostDevMem) ? resources->hostDevMem->buff : NULL;
-  int ptrType = resources->cudaSupport ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
+  char* nextBuff = (resources->useGdr == 0 && resources->hostDevMem) ? resources->hostDevMem->buff : NULL;
+  int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
   uint64_t* nextTail = resources->hostDevMem ? &resources->hostDevMem->tail : &resources->hostRecvMem->tail;
 
   int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
@@ -458,8 +488,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
 
   if (!args->needProxy) goto nextColl;
 
-  TRACE(NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
-  TRACE(NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
+  TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
+  TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
 
   if (llMode == 0) {
     // Waiting for next opCount is only needed before writing nextTail.
diff --git a/projects/rccl/src/transport/net_ib.cu b/projects/rccl/src/transport/net_ib.cu
index 7b1fc99f46..fb8bd7b7da 100644
--- a/projects/rccl/src/transport/net_ib.cu
+++ b/projects/rccl/src/transport/net_ib.cu
@@ -82,8 +82,12 @@ static void* ncclIbAsyncThreadMain(void* args) {
   return NULL;
 }
 
-static void initDevices() {
-  if(wrap_ibv_symbols() != ncclSuccess) { return; }
+NCCL_PARAM(IbDisable, "IB_DISABLE", 0);
+
+ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
+  if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
+  if (ncclParamIbDisable()) return ncclInternalError;
+
   if (ncclNIbDevs == -1) {
     pthread_mutex_lock(&ncclIbLock);
     wrap_ibv_fork_init();
@@ -91,9 +95,9 @@ static void initDevices() {
       ncclNIbDevs = 0;
       if (findInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) {
         WARN("NET/IB : No IP interface found.");
-        return;
+        return ncclInternalError;
       }
-      INFO(INIT|NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName);
+      INFO(NCCL_INIT|NCCL_NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName);
 
       // Detect IB cards
       int nIbDevs;
@@ -105,7 +109,7 @@ static void initDevices() {
       bool searchNot = userIbEnv && userIbEnv[0] == '^';
       int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS);
 
-      if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return;
+      if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return ncclInternalError;
 
       for (int d=0; d<nIbDevs; d++) {
         struct ibv_context * context;
@@ -134,7 +138,7 @@ static void initDevices() {
             if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) {
               continue;
             }
-            INFO(INIT|NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
+            INFO(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
                 portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
             ncclIbDevs[ncclNIbDevs].device = d;
             ncclIbDevs[ncclNIbDevs].port = port;
@@ -145,38 +149,29 @@ static void initDevices() {
             pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
           }
 
-          if (found == 0) { if (ncclSuccess != wrap_ibv_close_device(context)) { return; } }
+          if (found == 0) { if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } }
         }
       }
-      if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return; };
+      if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return ncclInternalError; };
     }
-
     pthread_mutex_unlock(&ncclIbLock);
   }
+  return ncclSuccess;
 }
 
-ncclResult_t ncclIbDevices(int* ndev, int** scores) {
-  initDevices();
+ncclResult_t ncclIbDevices(int* ndev) {
   *ndev = ncclNIbDevs;
-  int cudaDev;
-  cudaGetDevice(&cudaDev);
-  char* cudaPath;
-  ncclResult_t err1 = getCudaPath(cudaDev, &cudaPath);
-  int* sc;
-  NCCLCHECK(ncclCalloc(&sc, ncclNIbDevs));
-  char line[1024];
-  sprintf(line, "CUDA Dev %d, IB Ports : ", cudaDev);
-  for (int d=0; d<ncclNIbDevs; d++) {
-    char* mlxPath;
-    ncclResult_t err2 = getMlxPath(ncclIbDevs[d].devName, &mlxPath);
-    int distance = (err1 != ncclSuccess || err2 != ncclSuccess || mlxPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(mlxPath, cudaPath);
-    sprintf(line+strlen(line), "%s/%d(%s) ", ncclIbDevs[d].devName, ncclIbDevs[d].port, pathDists[distance]);
-    sc[d] = 1+PATH_SOC-distance;
-    if (err2 == ncclSuccess) free(mlxPath);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclIbPciPath(int dev, char** path) {
+  char devicepath[PATH_MAX];
+  snprintf(devicepath, PATH_MAX, "/sys/class/infiniband/%s/device", ncclIbDevs[dev].devName);
+  *path = realpath(devicepath, NULL);
+  if (*path == NULL) {
+    WARN("Could not find real path of %s", devicepath);
+    return ncclSystemError;
   }
-  INFO(INIT|NET,"%s", line);
-  if (err1 == ncclSuccess) free(cudaPath);
-  *scores = sc;
   return ncclSuccess;
 }
 
@@ -207,45 +202,21 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
   return ret;
 }
 
-NCCL_PARAM(IbGdrLevel, "IB_GDR_LEVEL", -2);
-NCCL_PARAM(IbCudaSupport, "IB_CUDA_SUPPORT", -2);
-
 ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
-  initDevices();
   *supportedTypes = NCCL_PTR_HOST;
 
   int cudaDev;
-  if (cudaGetDevice(&cudaDev) != cudaSuccess) return ncclSuccess;
+  CUDACHECK(cudaGetDevice(&cudaDev));
 
-  int ibGdrLevel = PATH_PHB;
-  if (ncclParamIbCudaSupport() != -2) ibGdrLevel = ncclParamIbCudaSupport() ? PATH_SOC + 1 : 0;
-  if (ncclParamIbGdrLevel() != -2) ibGdrLevel = ncclParamIbGdrLevel();
-  if (ibGdrLevel > 0) {
-    int gdrSupport = ncclIbGdrSupport(dev);
-    if (gdrSupport > 0) {
-      INFO(INIT|NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (%s)", cudaDev, ncclIbDevs[dev].devName, gdrSupport == 1 ? "no module" : "not supported by GPU");
-      ibGdrLevel = 0;
-    }
-  }
-
-  if (ibGdrLevel <= 0) return ncclSuccess;
-
-  char* cudaPath;
-  if (getCudaPath(cudaDev, &cudaPath) != ncclSuccess) return ncclSuccess;
-  char* mlxPath;
-  if (getMlxPath(ncclIbDevs[dev].devName, &mlxPath) != ncclSuccess) { free(cudaPath); return ncclSuccess; }
-  int distance = (mlxPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(mlxPath, cudaPath);
-  free(mlxPath); free(cudaPath);
-  if (distance < ibGdrLevel) {
-    *supportedTypes |= NCCL_PTR_CUDA;
-  } else {
-    INFO(INIT|NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (distance %d >= %d)", cudaDev, ncclIbDevs[dev].devName, distance, ibGdrLevel);
+  if (ncclIbGdrSupport(dev) != ncclSuccess) {
+    INFO(NCCL_INIT|NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (no module or not supported by GPU)", cudaDev, ncclIbDevs[dev].devName);
+    return ncclSuccess;
   }
+  *supportedTypes |= NCCL_PTR_CUDA;
   return ncclSuccess;
 }
 
 static ncclResult_t GetSocketAddr(union socketAddress* addr) {
-  if (ncclNIbDevs == -1) initDevices();
   memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
   return ncclSuccess;
 }
@@ -442,7 +413,6 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
   *sendComm = comm;
 
   // IB Setup
-  initDevices(); /*NOTE: We need to do this for ncclNet unit test that bypasses nccl initialization*/
   ibv_context* ctx = ncclIbDevs[dev].context;
   NCCLCHECK(ncclIbInitVerbs(ctx, &comm->verbs));
   uint8_t ib_port = ncclIbDevs[dev].port;
@@ -464,13 +434,13 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
   // RoCE support
   qpInfo.lid = portAttr.lid;
   if (qpInfo.lid) { // IB
-    INFO(INIT|NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid);
+    INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid);
   } else { // RoCE
     union ibv_gid gid;
     NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
     qpInfo.spn = gid.global.subnet_prefix;
     qpInfo.iid = gid.global.interface_id;
-    INFO(INIT|NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
+    INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
   }
 
   NCCLCHECK(socketSend(comm->fd, &qpInfo, sizeof(qpInfo)));
@@ -649,7 +619,7 @@ ncclResult_t ncclIbGetMr(struct ncclIbVerbs* verbs, void* data, int size, struct
   NCCLCHECK(wrap_ibv_reg_mr(&verbs->mrPool[elem].mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
   *mrRet = verbs->mrPool+elem;
   verbs->mrPool[elem].refcnt++;
-  TRACE(INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey);
+  TRACE(NCCL_INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey);
   return ncclSuccess;
 }
 
@@ -903,7 +873,9 @@ ncclResult_t ncclIbCloseListen(void* listenComm) {
 
 ncclNet_t ncclNetIb = {
   "IB",
+  ncclIbInit,
   ncclIbDevices,
+  ncclIbPciPath,
   ncclIbPtrSupport,
   ncclIbListen,
   ncclIbConnect,
@@ -917,10 +889,3 @@ ncclNet_t ncclNetIb = {
   ncclIbCloseListen
 };
 
-NCCL_PARAM(IbDisable, "IB_DISABLE", 0);
-
-bool ncclIbSupport() {
-  if (ncclParamIbDisable()) return 0;
-  initDevices();
-  return ncclNIbDevs > 0;
-}
diff --git a/projects/rccl/src/transport/net_socket.cu b/projects/rccl/src/transport/net_socket.cu
index cff1973a4d..a8ae866e5a 100644
--- a/projects/rccl/src/transport/net_socket.cu
+++ b/projects/rccl/src/transport/net_socket.cu
@@ -8,67 +8,58 @@
 #include "core.h"
 #include "socket.h"
 #include "net.h"
-#include "topo.h"
 
 #include <assert.h>
 #include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <poll.h>
+#include <limits.h>
 
 /* Init functions */
+static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
+static union socketAddress ncclNetIfAddrs[MAX_IFS];
+static int ncclNetIfs = -1;
+pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
+
+ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
+  if (ncclNetIfs == -1) {
+    pthread_mutex_lock(&ncclSocketLock);
+    if (ncclNetIfs == -1) {
+      ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
+      INFO(NCCL_INIT|NCCL_NET,"NET/Socket : %d interfaces found", ncclNetIfs);
+      if (ncclNetIfs <= 0) {
+        WARN("NET/Socket : no interface found");
+        return ncclInternalError;
+      }
+    }
+    pthread_mutex_unlock(&ncclSocketLock);
+  }
+  return ncclSuccess;
+}
 
 ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) {
   *supportedTypes = NCCL_PTR_HOST;
   return ncclSuccess;
 }
 
-static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
-static union socketAddress ncclNetIfAddrs[MAX_IFS];
-static int ncclNetIfs = -1;
-pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
-
-static void initDevices() {
-  if (ncclNetIfs == -1) {
-    pthread_mutex_lock(&ncclSocketLock);
-    if (ncclNetIfs == -1) {
-      ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
-      INFO(INIT|NET,"NET/Socket : %d interfaces found", ncclNetIfs);
-      if (ncclNetIfs <= 0) {
-        WARN("NET/Socket : no interface found");
-      }
-    }
-    pthread_mutex_unlock(&ncclSocketLock);
-  }
+ncclResult_t ncclSocketDevices(int* ndev) {
+  *ndev = ncclNetIfs;
+  return ncclSuccess;
 }
 
-ncclResult_t ncclSocketDevices(int* ndev, int** scores) {
-  initDevices();
-  *ndev = ncclNetIfs;
-  int cudaDev;
-  cudaGetDevice(&cudaDev);
-  char* cudaPath;
-  ncclResult_t err1 = getCudaPath(cudaDev, &cudaPath);
-  int* sc;
-  NCCLCHECK(ncclCalloc(&sc, ncclNetIfs));
-  char line[1024];
-  sprintf(line, "CUDA Dev %d, IP Interfaces : ", cudaDev);
-  for (int i=0; i<ncclNetIfs; i++) {
-    char* sockPath;
-    ncclResult_t err2 = getSockPath(ncclNetIfNames+i*MAX_IF_NAME_SIZE, &sockPath);
-    int distance = (err1 != ncclSuccess || err2 != ncclSuccess || sockPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(sockPath, cudaPath);
-    sprintf(line+strlen(line), "%s(%s) ", ncclNetIfNames+i*MAX_IF_NAME_SIZE, pathDists[distance]);
-    sc[i] = 1+PATH_SOC-distance;
-    if (err2 == ncclSuccess) free(sockPath);
+ncclResult_t ncclSocketPciPath(int dev, char** path) {
+  char devicepath[PATH_MAX];
+  snprintf(devicepath, PATH_MAX, "/sys/class/net/%s/device", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
+  *path = realpath(devicepath, NULL);
+  if (*path == NULL) {
+    INFO(NCCL_NET|NCCL_INIT, "Could not find real path of %s", devicepath);
+    return ncclSystemError;
   }
-  INFO(INIT|NET,"%s", line);
-  if (err1 == ncclSuccess) free(cudaPath);
-  *scores = sc;
   return ncclSuccess;
 }
 
 static ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
-  if (ncclNetIfs == -1) initDevices();
   if (dev >= ncclNetIfs) return ncclInternalError;
   memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr));
   return ncclSuccess;
@@ -223,7 +214,9 @@ ncclResult_t ncclSocketClose(void* opaqueComm) {
 
 ncclNet_t ncclNetSocket = {
   "Socket",
+  ncclSocketInit,
   ncclSocketDevices,
+  ncclSocketPciPath,
   ncclSocketPtrSupport,
   ncclSocketListen,
   ncclSocketConnect,
diff --git a/projects/rccl/src/transport/p2p.cu b/projects/rccl/src/transport/p2p.cu
index 35aebb4dcd..6c4626a77c 100644
--- a/projects/rccl/src/transport/p2p.cu
+++ b/projects/rccl/src/transport/p2p.cu
@@ -85,7 +85,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
   // See if CUDA can do P2P
   int p2p;
   if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerInfo->cudaDev) != cudaSuccess) {
-    INFO(INIT|P2P,"peer query failed between dev %d and dev %d",
+    INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d and dev %d",
         myInfo->cudaDev, peerInfo->cudaDev);
     return ncclSuccess;
   }
@@ -454,7 +454,7 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
     info.direct = 1;
     info.directPtr = ring->devMemSend;
     if (myInfo->cudaDev == peerInfo->cudaDev) {
-      INFO(INIT|P2P,"Ring %02d : %d -> %d via P2P/common device", ring->id, myInfo->rank, peerInfo->rank);
+      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", ring->id, myInfo->rank, peerInfo->rank);
     } else {
       // Enable P2P access
       cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
@@ -465,7 +465,7 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
             peerInfo->cudaDev, err, cudaGetErrorString(err));
         return ncclInternalError;
       }
-      INFO(INIT|P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
+      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
           ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
     }
   } else {
@@ -477,7 +477,7 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
           myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
       return ncclInternalError;
     }
-    INFO(INIT|P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
+    INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
         ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
     //TRACE_DUMP_IPC(&info.devIpc);
   }
@@ -495,7 +495,7 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
     info.direct = 1;
     info.directPtr = ring->devMemRecv;
     if (myInfo->cudaDev == peerInfo->cudaDev) {
-      TRACE(INIT|P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
+      TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
     } else {
       // Enable P2P access
       cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
@@ -506,7 +506,7 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
             peerInfo->cudaDev, err, cudaGetErrorString(err));
         return ncclInternalError;
       }
-      TRACE(INIT|P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+      TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
     }
   } else {
     info.direct = 0;
@@ -517,7 +517,7 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
           myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
       return ncclInternalError;
     }
-    TRACE(INIT|P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
     //TRACE_DUMP_IPC(&info.devIpc);
   }
   static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
diff --git a/projects/rccl/src/transport/shm.cu b/projects/rccl/src/transport/shm.cu
index 38b6c6a0c5..317f652dac 100644
--- a/projects/rccl/src/transport/shm.cu
+++ b/projects/rccl/src/transport/shm.cu
@@ -168,10 +168,10 @@ ncclResult_t shmSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
   char shmName[MAX_SHM_NAME_LEN];
   sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
   info.shmSize = resources->shmSize = sizeof(struct ncclSendMem);
-  TRACE(SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
+  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
   NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
 
-  INFO(INIT|SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+  INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
   info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
   static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
   memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo));
@@ -189,7 +189,7 @@ ncclResult_t shmRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
   char shmName[MAX_SHM_NAME_LEN];
   sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
   info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
-  TRACE(SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
+  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
   NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
 
   info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
@@ -207,7 +207,7 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
   char shmName[MAX_SHM_NAME_LEN];
   sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", info->pidHash, info->id, info->rank);
   resources->remShmSize = info->shmSize;
-  TRACE(SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
+  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
   NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
   // Remove the file to ensure proper clean-up
   NCCLCHECK(shmUnlink(shmName));
@@ -231,7 +231,7 @@ ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
   char shmName[MAX_SHM_NAME_LEN];
   sprintf(shmName, "nccl-shm-send-%lx-%d-%d", info->pidHash, info->id, info->rank);
   resources->remShmSize = info->shmSize;
-  TRACE(SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
+  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
   NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
   NCCLCHECK(shmUnlink(shmName));
   recv->conn.head = &resources->devRemHostMem->head;