From 87f12cbb86214be96fe1cb8a12a92581af6e38f2 Mon Sep 17 00:00:00 2001
From: TomSang
Date: Sun, 11 Apr 2021 16:29:24 -0400
Subject: [PATCH] Add detection of cooperative multi device launch attribute (#345)

---
 src/init.cc | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/init.cc b/src/init.cc
index 69008fc52a..d45cd7f5e3 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -573,12 +573,18 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct
   }
   if (comm->launchMode == ncclComm::GROUP) {
     CUDACHECK(hipStreamCreateWithFlags(&comm->groupStream, hipStreamNonBlocking));
-#if CUDART_VERSION >= 9000
     if (*comm->intraCC && (ncclCudaCompCap() == *comm->intraCC)) {
       // Check whether the GPU supports Cooperative Group Multi Device Launch
-      (void) hipDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
+      hipError_t ret = hipDeviceGetAttribute(&cgMdLaunch, hipDeviceAttributeCooperativeMultiDeviceLaunch, comm->cudaDev);
+      if (ret != hipSuccess) {
+        INFO(NCCL_INIT, "hipDeviceGetAttribute(hipDeviceAttributeCooperativeMultiDeviceLaunch, %d) failed with %s",
+            comm->cudaDev, hipGetErrorString(ret));
+        return ncclInternalError;
+      }
+      if (!cgMdLaunch) {
+        INFO(NCCL_INIT, "Multi-GPU cooperative launch support not available for device %d", comm->cudaDev);
+      }
     }
-#endif
   }
 
   // Disable cgMdLaunch if any rank does not support it