RCCL 2.4 update
This commit is contained in:
+9
-6
@@ -12,14 +12,14 @@
|
||||
#include <sys/mman.h>
|
||||
|
||||
// Allocates `size` bytes of host memory with the "Mapped" allocation flag,
// zero-fills the buffer with memset, and stores the same address in both
// *ptr and *devPtr.
// Returns ncclSuccess on the normal path; CUDACHECK returns early from this
// function if the allocation call reports an error.
// NOTE(review): this listing contains both the cudaHostAlloc call and the
// hipHostMalloc call -- presumably the removed and added lines of the diff;
// confirm which one is live before relying on this text.
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
|
||||
CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
|
||||
CUDACHECK(hipHostMalloc(ptr, size, hipHostMallocMapped));
|
||||
memset(*ptr, 0, size);
|
||||
*devPtr = *ptr;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Releases a host buffer through the runtime's host-free call.
// Returns ncclSuccess on the normal path; CUDACHECK returns early from this
// function if the free call reports an error.
// NOTE(review): both cudaFreeHost and hipHostFree appear in this listing --
// presumably the removed and added lines of the diff; confirm which is live.
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
|
||||
CUDACHECK(cudaFreeHost(ptr));
|
||||
CUDACHECK(hipHostFree(ptr));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -36,15 +36,18 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
|
||||
}
|
||||
|
||||
// Allocates room for `nelem` elements of T (nelem*sizeof(T) bytes) through the
// GPU runtime allocator and zero-fills the allocation with the runtime memset.
// The signature carrying `isFineGrain` (default false) selects the allocator:
// hipExtMallocWithFlags with hipDeviceMallocFinegrained when true, hipMalloc
// otherwise.
// The `else` has no braces, so it governs only the hipMalloc statement; the
// hipMemset that follows runs for both allocation paths.
// Returns ncclSuccess on the normal path; CUDACHECK returns early on any
// runtime error.
// NOTE(review): two signatures and both cuda*/hip* call lines are visible in
// this listing -- presumably the removed and added lines of the diff; confirm.
template <typename T>
|
||||
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
|
||||
CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
|
||||
CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
|
||||
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
|
||||
if (isFineGrain)
|
||||
CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
|
||||
else
|
||||
CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
|
||||
CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Copies `nelem` elements of T (nelem*sizeof(T) bytes) from `src` to `dst`
// using the runtime's blocking memcpy with the "Default" copy kind.
// Returns ncclSuccess on the normal path; CUDACHECK returns early if the copy
// reports an error.
// NOTE(review): both the cudaMemcpy and hipMemcpy lines are visible in this
// listing -- presumably the removed and added lines of the diff; confirm.
template <typename T>
|
||||
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
|
||||
CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
|
||||
CUDACHECK(hipMemcpy(dst, src, nelem*sizeof(T), hipMemcpyDefault));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -11,17 +11,17 @@
|
||||
|
||||
// Check CUDA calls.
// CUDACHECK(cmd) evaluates `cmd` once, stores its status in a local `e`, and
// on any value other than the success code logs the runtime's error string via
// WARN and makes the *enclosing function* return ncclUnhandledCudaError.
// It can therefore only be used inside functions returning ncclResult_t.
// The do { ... } while(false) wrapper makes the macro behave as one statement.
// NOTE(review): both the cudaError_t and hipError_t variants of each line are
// visible in this listing -- presumably the removed and added diff lines.
|
||||
#define CUDACHECK(cmd) do { \
|
||||
cudaError_t e = cmd; \
|
||||
if( e != cudaSuccess ) { \
|
||||
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
|
||||
hipError_t e = cmd; \
|
||||
if( e != hipSuccess ) { \
|
||||
WARN("Cuda failure '%s'", hipGetErrorString(e)); \
|
||||
return ncclUnhandledCudaError; \
|
||||
} \
|
||||
} while(false)
|
||||
|
||||
#define CUDACHECKGOTO(cmd, res, label) do { \
|
||||
cudaError_t e = cmd; \
|
||||
if( e != cudaSuccess ) { \
|
||||
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
|
||||
hipError_t e = cmd; \
|
||||
if( e != hipSuccess ) { \
|
||||
WARN("Cuda failure '%s'", hipGetErrorString(e)); \
|
||||
res = ncclUnhandledCudaError; \
|
||||
goto label; \
|
||||
} \
|
||||
|
||||
+8
-18
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -7,21 +8,10 @@
|
||||
#ifndef NCCL_COMM_H_
|
||||
#define NCCL_COMM_H_
|
||||
|
||||
// Local definition of struct cudaLaunchParams, compiled only when
// CUDART_VERSION < 9000.
// NOTE(review): presumably a fallback for runtimes that do not ship this
// type; the hunk header suggests these lines are removed by the commit --
// confirm against the actual file.
#if CUDART_VERSION < 9000
|
||||
struct cudaLaunchParams {
|
||||
void *func;
|
||||
dim3 gridDim;
|
||||
dim3 blockDim;
|
||||
void **args;
|
||||
size_t sharedMem;
|
||||
cudaStream_t stream;
|
||||
};
|
||||
#endif
|
||||
|
||||
#define MAXCHANNELS 16
|
||||
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
|
||||
|
||||
#define CACHE_LINE_SIZE 128
|
||||
#define CACHE_LINE_SIZE 64
|
||||
#define MEM_ALIGN 4096
|
||||
#define CUDA_IPC_MIN 2097152UL
|
||||
|
||||
@@ -66,9 +56,9 @@ struct ncclComm {
|
||||
int nvmlDev; // my NVML device number
|
||||
|
||||
enum { GROUP, PARALLEL } launchMode;
|
||||
cudaStream_t userStream;
|
||||
hipStream_t userStream;
|
||||
bool userStreamSet;
|
||||
cudaEvent_t doneEvent;
|
||||
hipEvent_t doneEvent;
|
||||
bool checkPointers;
|
||||
|
||||
// Counter to make sure collectives match (needed for bcast/reduce
|
||||
@@ -88,7 +78,7 @@ struct ncclComm {
|
||||
|
||||
// An internal CUDA stream for NCCL kernel CGMD launches
|
||||
int groupCudaStream;
|
||||
cudaStream_t groupStream;
|
||||
hipStream_t groupStream;
|
||||
|
||||
// Whether there has been a fatal error in this communicator.
|
||||
ncclResult_t fatalError;
|
||||
@@ -111,13 +101,13 @@ struct ncclComm {
|
||||
int intraPhase;
|
||||
|
||||
// Storage for deferred intra-process launch
|
||||
struct cudaLaunchParams * intraParams;
|
||||
struct cudaLaunchParams *myParams;
|
||||
hipLaunchParams * intraParams;
|
||||
hipLaunchParams *myParams;
|
||||
int* intraCudaDevs;
|
||||
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
|
||||
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
|
||||
struct ncclColl args;
|
||||
void* argsptr;
|
||||
struct ncclColl* argsptr;
|
||||
|
||||
// Global proxy thread
|
||||
pthread_t proxyThread;
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -10,6 +11,15 @@
|
||||
#include "nccl.h"
|
||||
#include <stdint.h>
|
||||
|
||||
// Convert volatile access to atomic.
// When __HIP_PLATFORM_HCC__ or __HCC__ is defined, LOAD(VAR) and
// STORE(DST, SRC) expand to the __atomic_load_n / __atomic_store_n builtins
// with __ATOMIC_SEQ_CST ordering. Otherwise they expand to a plain pointer
// dereference and a plain assignment through the pointer.
// Both macros take a pointer: LOAD(p) reads *p, STORE(p, v) writes v to *p.
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
|
||||
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
|
||||
#else
|
||||
#define LOAD(VAR) *(VAR)
|
||||
#define STORE(DST, SRC) *(DST) = (SRC)
|
||||
#endif
|
||||
|
||||
#define NCCL_MAX_OPS 2048
|
||||
#define NCCL_STEPS 8
|
||||
|
||||
@@ -73,6 +83,12 @@ struct ncclConnInfo {
|
||||
// Low latency mechanism
|
||||
union ncclLLFifoLine *llBuff; // Local for recv, remote for send
|
||||
uint64_t llLastCleaning;
|
||||
|
||||
// GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
|
||||
// allows software to explicitly initiate a flush read to HDP memory. See more
|
||||
// descriptions in primitives.h.
|
||||
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
|
||||
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
|
||||
};
|
||||
|
||||
struct ncclConnector {
|
||||
@@ -111,6 +127,8 @@ struct ncclPeer {
|
||||
|
||||
struct ncclDevComm;
|
||||
|
||||
#pragma pack(push) /* push current alignment to stack */
|
||||
#pragma pack(4) /* set alignment to 4 bytes boundary */
|
||||
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
|
||||
/* to make sure reads to host from the CUDA kernel are aligned. */
|
||||
/* Make sure to adjust padding at the end of ncclColl. */
|
||||
@@ -165,14 +183,56 @@ struct ncclChannel {
|
||||
int collCount;
|
||||
int collFifoHead; // Only used by GPU
|
||||
int collFifoTail; // Only used by CPU
|
||||
|
||||
uint32_t* abortCount;
|
||||
};
|
||||
int data[0x80];
|
||||
};
|
||||
};
|
||||
static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
|
||||
#pragma pack(pop) /* restore original alignment from stack */
|
||||
|
||||
#define MAXCHANNELS 16
|
||||
|
||||
// Profiling counter record, compiled only when ENABLE_PROFILING is defined.
// An anonymous union overlays the named uint64_t counters with int data[0x80],
// so the record occupies at least 0x80*sizeof(int) bytes.
// The two wait_* arrays hold one slot per channel (MAXCHANNELS entries); the
// remaining fields are one counter per primitive.
// NOTE(review): the *_cycle / *_byte units are inferred from the field names
// only -- confirm against the code that updates them.
#ifdef ENABLE_PROFILING
|
||||
struct ncclProf {
|
||||
union {
|
||||
struct {
|
||||
uint64_t total_cycle;
|
||||
uint64_t wait_send_cycle[MAXCHANNELS];
|
||||
uint64_t wait_recv_cycle[MAXCHANNELS];
|
||||
// primitive cycles
|
||||
uint64_t send_cycle;
|
||||
uint64_t directSend_cycle;
|
||||
uint64_t recv_cycle;
|
||||
uint64_t directRecv_cycle;
|
||||
uint64_t copySend_cycle;
|
||||
uint64_t directCopySend_cycle;
|
||||
uint64_t recvCopySend_cycle;
|
||||
uint64_t directRecvCopySend_cycle;
|
||||
uint64_t recvReduceCopy_cycle;
|
||||
uint64_t recvReduceSend_cycle;
|
||||
uint64_t recvReduceCopySend_cycle;
|
||||
uint64_t directRecvReduceCopySend_cycle;
|
||||
// primitive bytes
|
||||
uint64_t send_byte;
|
||||
uint64_t directSend_byte;
|
||||
uint64_t recv_byte;
|
||||
uint64_t directRecv_byte;
|
||||
uint64_t copySend_byte;
|
||||
uint64_t directCopySend_byte;
|
||||
uint64_t recvCopySend_byte;
|
||||
uint64_t directRecvCopySend_byte;
|
||||
uint64_t recvReduceCopy_byte;
|
||||
uint64_t recvReduceSend_byte;
|
||||
uint64_t recvReduceCopySend_byte;
|
||||
uint64_t directRecvReduceCopySend_byte;
|
||||
};
|
||||
int data[0x80];
|
||||
};
|
||||
};
|
||||
#endif
|
||||
|
||||
typedef enum {
|
||||
ncclDevSuccess,
|
||||
ncclDevAssertedMismatch,
|
||||
@@ -189,6 +249,11 @@ struct ncclDevComm {
|
||||
|
||||
// Channels, device side
|
||||
struct ncclChannel* channels;
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
// Profiling counters
|
||||
struct ncclProf* devProf;
|
||||
#endif
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -12,9 +13,9 @@
|
||||
|
||||
// Channels / LL tuning
|
||||
#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
|
||||
#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL
|
||||
#define NCCL_THREAD_THRESHOLD 256 // Per thread size before we switch to non-LL
|
||||
#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
|
||||
#define NCCL_LL_MIN_NTHREADS 64
|
||||
#define NCCL_LL_MIN_NTHREADS 256
|
||||
|
||||
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
|
||||
ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
|
||||
|
||||
+2
-1
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -18,7 +19,7 @@ typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueI
|
||||
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
|
||||
|
||||
typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
|
||||
ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
|
||||
ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
|
||||
|
||||
ncclResult_t ncclAsyncColl(ncclComm_t comm);
|
||||
#endif
|
||||
|
||||
+1
-1
@@ -31,7 +31,7 @@ struct ncclInfo {
|
||||
ncclRedOp_t op;
|
||||
int root;
|
||||
ncclComm_t comm;
|
||||
cudaStream_t stream;
|
||||
hipStream_t stream;
|
||||
// Algorithm details
|
||||
int chunkSteps;
|
||||
int sliceSteps;
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -58,9 +59,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
|
||||
/* Get the maximum number of NVLinks based on the GPU generation */
|
||||
static ncclResult_t getMaxNvlinks(int* maxLinks) {
|
||||
int cudaDev;
|
||||
CUDACHECK(cudaGetDevice(&cudaDev));
|
||||
CUDACHECK(hipGetDevice(&cudaDev));
|
||||
int ccMajor;
|
||||
CUDACHECK(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev));
|
||||
CUDACHECK(hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev));
|
||||
// 6 for Volta, 4 for Pascal
|
||||
*maxLinks = (ccMajor > 6) ? 6 : 4;
|
||||
// INFO("Device %d detected %d NVLinks", cudaDev, *maxLinks);
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef NCCL_NVLINK_H_
|
||||
#define NCCL_NVLINK_H_
|
||||
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include "nvmlwrap.h"
|
||||
#include "topo.h"
|
||||
|
||||
#define CONNECT_NVLINK 0x10
|
||||
#define CONNECT_NVSWITCH 0x100
|
||||
|
||||
enum ncclNvLinkDeviceType {
|
||||
ncclNvLinkDeviceGpu,
|
||||
ncclNvLinkDeviceSwitch,
|
||||
ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
|
||||
};
|
||||
|
||||
// Stub: reports the NVLink connection value between two devices.
// `links` is initialised to 0 and never modified, and neither busId1 nor
// busId2 is read, so this always returns CONNECT_NVLINK*0, i.e. 0.
static int getNvlinkGpu(const char* busId1, const char* busId2) {
|
||||
int links = 0;
|
||||
return CONNECT_NVLINK*links;
|
||||
}
|
||||
|
||||
#endif
|
||||
+5
-1
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,8 +9,11 @@
|
||||
#define NCCL_RINGS_H_
|
||||
|
||||
// Returns the default thread count.
// When __HIP_PLATFORM_HCC__ or __HCC__ is defined the result is always 256.
// Otherwise it is 128 when ncclCudaCompCap() returns 3 and 256 for any other
// value.
// NOTE(review): the stand-alone Kepler comment directly below the signature
// duplicates the one on the #else line -- presumably the removed and added
// lines of the diff; confirm.
static int getDefaultThreads() {
|
||||
// On Kepler, rings are doubled later.
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
return 256;
|
||||
#else // On Kepler, rings are doubled later.
|
||||
return ncclCudaCompCap() == 3 ? 128 : 256;
|
||||
#endif
|
||||
}
|
||||
|
||||
ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut);
|
||||
|
||||
+5
-4
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -39,14 +40,14 @@ static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPt
|
||||
ncclResult_t res = ncclSuccess;
|
||||
|
||||
NCCLCHECKGOTO(shmSetup(shmname, shmsize, &fd, &ptr, create), res, sysError);
|
||||
CUDACHECKGOTO(cudaHostRegister(ptr, shmsize, cudaHostRegisterMapped), res, cudaError);
|
||||
CUDACHECKGOTO(cudaHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError);
|
||||
CUDACHECKGOTO(hipHostRegister(ptr, shmsize, hipHostRegisterMapped), res, hipError_t);
|
||||
CUDACHECKGOTO(hipHostGetDevicePointer(devShmPtr, ptr, 0), res, hipError_t);
|
||||
|
||||
*shmPtr = ptr;
|
||||
return ncclSuccess;
|
||||
sysError:
|
||||
WARN("Error while %s shared memory segment %s (size %d)\n", create ? "creating" : "attaching to", shmname, shmsize);
|
||||
cudaError:
|
||||
hipError_t:
|
||||
if (fd != -1) close(fd);
|
||||
if (create) shm_unlink(shmname);
|
||||
if (ptr != MAP_FAILED) munmap(ptr, shmsize);
|
||||
@@ -60,7 +61,7 @@ static ncclResult_t shmUnlink(const char* shmname) {
|
||||
}
|
||||
|
||||
static ncclResult_t shmClose(void* shmPtr, void* devShmPtr, const int shmsize) {
|
||||
CUDACHECK(cudaHostUnregister(shmPtr));
|
||||
CUDACHECK(hipHostUnregister(shmPtr));
|
||||
if (munmap(shmPtr, shmsize) != 0) {
|
||||
WARN("munmap of shared memory failed");
|
||||
return ncclSystemError;
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -11,6 +12,7 @@
|
||||
#include <stdint.h>
|
||||
|
||||
ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
|
||||
uint64_t getnHash(const char* string, int n);
|
||||
uint64_t getHostHash();
|
||||
uint64_t getPidHash();
|
||||
|
||||
|
||||
Fai riferimento in un nuovo problema
Block a user