d7293281f3
[ROCm/rccl commit: 858b4e76eb]
94 wiersze
3.9 KiB
C++
94 wiersze
3.9 KiB
C++
/*************************************************************************
 * Copyright (c) 2015-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
|
|
|
|
#include "mnnvl.h"
|
|
#include "transport.h"
|
|
#include <cuda.h>
|
|
#include "rocmwrap.h"
|
|
|
|
// Determine if MNNVL support is available
|
|
/*
 * Probe whether MNNVL can be used by this communicator and, if so, enable it.
 *
 * The whole body is compiled out when __HIP_PLATFORM_AMD__ or __HIPCC__ is
 * defined; in that case the function just returns ncclSuccess.
 *
 * Otherwise the checks run in this order, each one returning ncclSuccess
 * (leaving comm->MNNVL untouched) when it is not satisfied:
 *   1. ncclCuMemEnable() must report that cuMem is enabled.
 *   2. The current device must report CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED.
 *   3. Every peer's fabricInfo.state must be NVML_GPU_FABRIC_STATE_COMPLETED.
 *   4. No peer may have an all-zero cluster UUID (first 16 bytes).
 *   5. More than one rank must share our cluster UUID and cliqueId.
 *
 * Once a clique of size > 1 is found, a FABRIC-handle allocation is
 * exported and re-imported as a smoke test. A failure at that stage is
 * reported with WARN and ncclSystemError, because the system advertises
 * MNNVL but it is not usable.
 *
 * On full success this sets comm->MNNVL = 1, fills comm->clique
 * (id, ranks, size), comm->cliqueRank, and switches the global
 * ncclCuMemHandleType to CU_MEM_HANDLE_TYPE_FABRIC.
 *
 * comm: communicator whose peerInfo[], rank and nRanks are read and whose
 *       clique/cliqueRank/MNNVL fields are written.
 * Returns ncclSuccess, ncclSystemError, or whatever error the
 * CUDACHECK/NCCLCHECK-wrapped calls propagate.
 */
ncclResult_t ncclMnnvlCheck(struct ncclComm* comm) {
#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
  // MNNVL requires cuMem to be enabled
  if (!ncclCuMemEnable()) return ncclSuccess;

  // MNNVL also requires FABRIC handle support
  int cudaDev;
  int flag = 0;
  CUdevice currentDev;
  CUDACHECK(cudaGetDevice(&cudaDev));
  // NOTE(review): cuDeviceGet is a driver-API call (returns CUresult) but is
  // wrapped in CUDACHECK here - confirm the macro accepts that type.
  CUDACHECK(cuDeviceGet(&currentDev, cudaDev));
  // Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported;
  // flag stays 0 in that case and we bail out just below.
  (void) cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev);
  if (!flag) return ncclSuccess;
  // Check that all ranks have initialized the fabric fully
  for (int i = 0; i < comm->nRanks; i++) {
    if (comm->peerInfo[i].fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED) return ncclSuccess;
  }

  // Determine our MNNVL domain/clique
  NCCLCHECK(ncclCalloc(&comm->clique.ranks, comm->nRanks));
  comm->clique.id = comm->peerInfo[comm->rank].fabricInfo.cliqueId;
  for (int i = 0; i < comm->nRanks; i++) {
    nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[comm->rank].fabricInfo;
    nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo;
    // Check if the cluster UUID and cliqueId match
    // A zero UUID means we don't have MNNVL fabric info - disable MNNVL
    // The UUID is inspected as two consecutive unsigned long words.
    unsigned long uuid0 = 0;
    unsigned long uuid1 = 0;
    memcpy(&uuid0, fabricInfo2->clusterUuid, sizeof(uuid0));
    memcpy(&uuid1, fabricInfo2->clusterUuid + sizeof(uuid0), sizeof(uuid1));
    if ((uuid0 | uuid1) == 0) return ncclSuccess;
    if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) &&
        (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) {
      // Our own position inside the clique is the number of members seen so far.
      if (i == comm->rank) {
        comm->cliqueRank = comm->clique.size;
      }
      comm->clique.ranks[comm->clique.size++] = i;
    }
  }

  // No MNNVL clique found
  if (comm->clique.size <= 1) return ncclSuccess;

  // Check that FABRIC handles can be exported & imported by IMEX
  {
    void *ptr = NULL;
    CUmemGenericAllocationHandle handle;
    ncclCuDesc cuDesc;
    CUresult err;

    // Allocate FABRIC handle compatible memory
    ncclResult_t ret = ncclCuMemAlloc(&ptr, &handle, CU_MEM_HANDLE_TYPE_FABRIC, CUDA_IPC_MIN);
    if (ret != ncclSuccess) {
      // Return an error if this is a MNNVL capable system but FABRIC handles are not supported
      WARN("MNNVL (cliqueSize %d) is available but not working on this system. Check the IMEX channel configuration (/dev/nvidia-caps-imex-channels). Set NCCL_MNNVL_ENABLE=0 to ignore this issue.",
          comm->clique.size);
      return ncclSystemError;
    }
    // Round-trip the handle: export it, then (only if that worked) import it back.
    err = cuMemExportToShareableHandle(&cuDesc, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0);
    if (err != CUDA_SUCCESS ||
        (err = cuMemImportFromShareableHandle(&handle, &cuDesc, CU_MEM_HANDLE_TYPE_FABRIC)) != CUDA_SUCCESS) {
      // NOTE(review): errStr is retrieved but not included in the warning below.
      const char *errStr;
      (void) cuGetErrorString(err, &errStr);
      NCCLCHECK(ncclCuMemFree(ptr));
      // Return an error if this is a MNNVL capable system but it's not working
      WARN("MNNVL (cliqueSize %d) is available but not working on this system. Check the IMEX configuration (nvidia-imex-ctl -N). Set NCCL_MNNVL_ENABLE=0 to ignore this issue.",
          comm->clique.size);
      return ncclSystemError;
    }
    NCCLCHECK(ncclCuMemFree(ptr));

    // Force the CUMEM handle type to be FABRIC for MNNVL
    ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC;
    comm->MNNVL = 1;
    INFO(NCCL_INIT, "MNNVL %d cliqueId %x cliqueSize %d cliqueRank %d",
        comm->MNNVL, comm->clique.id, comm->clique.size, comm->cliqueRank);
  }
#endif
  return ncclSuccess;
}
|