From e2f73fea64bc37041eb4c7c8949dcbc3d9e33a73 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Tue, 4 Dec 2018 14:47:41 -0800 Subject: [PATCH] Remove error logging from a normal path When initNet fails, we should not print the backtrace as it is supposed to be normal operation (falling back to sockets) [ROCm/rccl commit: 57368189e11e8a1e774a01ca6ecee4b726a5bc43] --- projects/rccl/src/init.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/projects/rccl/src/init.cu b/projects/rccl/src/init.cu index 56633236f6..db522b8d30 100644 --- a/projects/rccl/src/init.cu +++ b/projects/rccl/src/init.cu @@ -72,10 +72,11 @@ int ncclCudaFullCompCap() { return ccMajor*10+ccMinor; } +// Returns ncclInternalError if init() or devices() fails and ncclSystemError if no devices are reported, causing that network to be ignored. ncclResult_t initNet(ncclNet_t* net) { int ndev; - NCCLCHECK(net->init(ncclDebugLog)); - NCCLCHECK(net->devices(&ndev)); + if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError; + if (net->devices(&ndev) != ncclSuccess) return ncclInternalError; if (ndev <= 0) { INFO(NCCL_INIT|NCCL_NET, "Net/%s: call to devices() returned 0 devices.", net->name); return ncclSystemError;