This commit is contained in:
Wenkai Du
2019-07-05 15:43:00 -07:00
parent 4d579e51cc
commit f11c8f60cd
95 ha cambiato i file con 7829 aggiunte e 614 eliminazioni
+9 -6
Vedi File
@@ -12,14 +12,14 @@
#include <sys/mman.h>
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
CUDACHECK(hipHostMalloc(ptr, size, hipHostMallocMapped));
memset(*ptr, 0, size);
*devPtr = *ptr;
return ncclSuccess;
}
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
CUDACHECK(cudaFreeHost(ptr));
CUDACHECK(hipHostFree(ptr));
return ncclSuccess;
}
@@ -36,15 +36,18 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
}
template <typename T>
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {
if (isFineGrain)
CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
else
CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
CUDACHECK(hipMemcpy(dst, src, nelem*sizeof(T), hipMemcpyDefault));
return ncclSuccess;
}
+6 -6
Vedi File
@@ -11,17 +11,17 @@
// Check CUDA calls
#define CUDACHECK(cmd) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
hipError_t e = cmd; \
if( e != hipSuccess ) { \
WARN("Cuda failure '%s'", hipGetErrorString(e)); \
return ncclUnhandledCudaError; \
} \
} while(false)
#define CUDACHECKGOTO(cmd, res, label) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
hipError_t e = cmd; \
if( e != hipSuccess ) { \
WARN("Cuda failure '%s'", hipGetErrorString(e)); \
res = ncclUnhandledCudaError; \
goto label; \
} \
+8 -18
Vedi File
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,21 +8,10 @@
#ifndef NCCL_COMM_H_
#define NCCL_COMM_H_
#if CUDART_VERSION < 9000
struct cudaLaunchParams {
void *func;
dim3 gridDim;
dim3 blockDim;
void **args;
size_t sharedMem;
cudaStream_t stream;
};
#endif
#define MAXCHANNELS 16
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
#define CACHE_LINE_SIZE 128
#define CACHE_LINE_SIZE 64
#define MEM_ALIGN 4096
#define CUDA_IPC_MIN 2097152UL
@@ -66,9 +56,9 @@ struct ncclComm {
int nvmlDev; // my NVML device number
enum { GROUP, PARALLEL } launchMode;
cudaStream_t userStream;
hipStream_t userStream;
bool userStreamSet;
cudaEvent_t doneEvent;
hipEvent_t doneEvent;
bool checkPointers;
// Counter to make sure collectives match (needed for bcast/reduce
@@ -88,7 +78,7 @@ struct ncclComm {
// An internal CUDA stream for NCCL kernel CGMD launches
int groupCudaStream;
cudaStream_t groupStream;
hipStream_t groupStream;
// Whether there has been a fatal error in this communicator.
ncclResult_t fatalError;
@@ -111,13 +101,13 @@ struct ncclComm {
int intraPhase;
// Storage for deferred intra-process launch
struct cudaLaunchParams * intraParams;
struct cudaLaunchParams *myParams;
hipLaunchParams * intraParams;
hipLaunchParams *myParams;
int* intraCudaDevs;
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
struct ncclColl args;
void* argsptr;
struct ncclColl* argsptr;
// Global proxy thread
pthread_t proxyThread;
+65
Vedi File
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,6 +11,15 @@
#include "nccl.h"
#include <stdint.h>
// Convert volatile access to atomic
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
#else
#define LOAD(VAR) *(VAR)
#define STORE(DST, SRC) *(DST) = (SRC)
#endif
#define NCCL_MAX_OPS 2048
#define NCCL_STEPS 8
@@ -73,6 +83,12 @@ struct ncclConnInfo {
// Low latency mechanism
union ncclLLFifoLine *llBuff; // Local for recv, remote for send
uint64_t llLastCleaning;
// GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
// allows software to explicitly initiate a flush read to HDP memory. See more
// descriptions in primitives.h.
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only)
};
struct ncclConnector {
@@ -111,6 +127,8 @@ struct ncclPeer {
struct ncclDevComm;
#pragma pack(push) /* push current alignment to stack */
#pragma pack(4) /* set alignment to 4 bytes boundary */
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclColl. */
@@ -165,14 +183,56 @@ struct ncclChannel {
int collCount;
int collFifoHead; // Only used by GPU
int collFifoTail; // Only used by CPU
uint32_t* abortCount;
};
int data[0x80];
};
};
static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
#pragma pack(pop) /* restore original alignment from stack */
#define MAXCHANNELS 16
#ifdef ENABLE_PROFILING
struct ncclProf {
union {
struct {
uint64_t total_cycle;
uint64_t wait_send_cycle[MAXCHANNELS];
uint64_t wait_recv_cycle[MAXCHANNELS];
// primtive cycles
uint64_t send_cycle;
uint64_t directSend_cycle;
uint64_t recv_cycle;
uint64_t directRecv_cycle;
uint64_t copySend_cycle;
uint64_t directCopySend_cycle;
uint64_t recvCopySend_cycle;
uint64_t directRecvCopySend_cycle;
uint64_t recvReduceCopy_cycle;
uint64_t recvReduceSend_cycle;
uint64_t recvReduceCopySend_cycle;
uint64_t directRecvReduceCopySend_cycle;
// primitive bytes
uint64_t send_byte;
uint64_t directSend_byte;
uint64_t recv_byte;
uint64_t directRecv_byte;
uint64_t copySend_byte;
uint64_t directCopySend_byte;
uint64_t recvCopySend_byte;
uint64_t directRecvCopySend_byte;
uint64_t recvReduceCopy_byte;
uint64_t recvReduceSend_byte;
uint64_t recvReduceCopySend_byte;
uint64_t directRecvReduceCopySend_byte;
};
int data[0x80];
};
};
#endif
typedef enum {
ncclDevSuccess,
ncclDevAssertedMismatch,
@@ -189,6 +249,11 @@ struct ncclDevComm {
// Channels, device side
struct ncclChannel* channels;
#ifdef ENABLE_PROFILING
// Profiling counters
struct ncclProf* devProf;
#endif
};
#endif
+3 -2
Vedi File
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -12,9 +13,9 @@
// Channels / LL tuning
#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL
#define NCCL_THREAD_THRESHOLD 256 // Per thread size before we switch to non-LL
#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
#define NCCL_LL_MIN_NTHREADS 64
#define NCCL_LL_MIN_NTHREADS 256
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
+2 -1
Vedi File
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -18,7 +19,7 @@ typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueI
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclAsyncColl(ncclComm_t comm);
#endif
+1 -1
Vedi File
@@ -31,7 +31,7 @@ struct ncclInfo {
ncclRedOp_t op;
int root;
ncclComm_t comm;
cudaStream_t stream;
hipStream_t stream;
// Algorithm details
int chunkSteps;
int sliceSteps;
+3 -2
Vedi File
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -58,9 +59,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
/* Get the maximum number of NVLinks based on the GPU generation */
static ncclResult_t getMaxNvlinks(int* maxLinks) {
int cudaDev;
CUDACHECK(cudaGetDevice(&cudaDev));
CUDACHECK(hipGetDevice(&cudaDev));
int ccMajor;
CUDACHECK(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev));
CUDACHECK(hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev));
// 6 for Volta, 4 for Pascal
*maxLinks = (ccMajor > 6) ? 6 : 4;
// INFO("Device %d detected %d NVLinks", cudaDev, *maxLinks);
+30
Vedi File
@@ -0,0 +1,30 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_NVLINK_H_
#define NCCL_NVLINK_H_
#include <sys/stat.h>
#include <fcntl.h>
#include "nvmlwrap.h"
#include "topo.h"
#define CONNECT_NVLINK 0x10
#define CONNECT_NVSWITCH 0x100
enum ncclNvLinkDeviceType {
ncclNvLinkDeviceGpu,
ncclNvLinkDeviceSwitch,
ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
};
static int getNvlinkGpu(const char* busId1, const char* busId2) {
int links = 0;
return CONNECT_NVLINK*links;
}
#endif
+5 -1
Vedi File
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,8 +9,11 @@
#define NCCL_RINGS_H_
static int getDefaultThreads() {
// On Kepler, rings are doubled later.
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
return 256;
#else // On Kepler, rings are doubled later.
return ncclCudaCompCap() == 3 ? 128 : 256;
#endif
}
ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut);
+5 -4
Vedi File
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -39,14 +40,14 @@ static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPt
ncclResult_t res = ncclSuccess;
NCCLCHECKGOTO(shmSetup(shmname, shmsize, &fd, &ptr, create), res, sysError);
CUDACHECKGOTO(cudaHostRegister(ptr, shmsize, cudaHostRegisterMapped), res, cudaError);
CUDACHECKGOTO(cudaHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError);
CUDACHECKGOTO(hipHostRegister(ptr, shmsize, hipHostRegisterMapped), res, hipError_t);
CUDACHECKGOTO(hipHostGetDevicePointer(devShmPtr, ptr, 0), res, hipError_t);
*shmPtr = ptr;
return ncclSuccess;
sysError:
WARN("Error while %s shared memory segment %s (size %d)\n", create ? "creating" : "attaching to", shmname, shmsize);
cudaError:
hipError_t:
if (fd != -1) close(fd);
if (create) shm_unlink(shmname);
if (ptr != MAP_FAILED) munmap(ptr, shmsize);
@@ -60,7 +61,7 @@ static ncclResult_t shmUnlink(const char* shmname) {
}
static ncclResult_t shmClose(void* shmPtr, void* devShmPtr, const int shmsize) {
CUDACHECK(cudaHostUnregister(shmPtr));
CUDACHECK(hipHostUnregister(shmPtr));
if (munmap(shmPtr, shmsize) != 0) {
WARN("munmap of shared memory failed");
return ncclSystemError;
+2
Vedi File
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,6 +12,7 @@
#include <stdint.h>
ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
uint64_t getnHash(const char* string, int n);
uint64_t getHostHash();
uint64_t getPidHash();