RCCL 2.4 update

2019-07-05 15:43:00 -07:00
parent 4d579e51cc
commit f11c8f60cd
@@ -12,14 +12,14 @@
 #include <sys/mman.h>

 static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
-  CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
+  CUDACHECK(hipHostMalloc(ptr, size, hipHostMallocMapped));
  memset(*ptr, 0, size);
  *devPtr = *ptr;
  return ncclSuccess;
 }

 static inline ncclResult_t ncclCudaHostFree(void* ptr) {
-  CUDACHECK(cudaFreeHost(ptr));
+  CUDACHECK(hipHostFree(ptr));
  return ncclSuccess;
 }

@@ -36,15 +36,18 @@ static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
 }

 template <typename T>
-static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
-  CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
-  CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
+static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) {  // isFineGrain defaults to false, so two-argument callers take the hipMalloc path
+  if (isFineGrain)  // opt-in path: allocate with the hipDeviceMallocFinegrained flag
+    CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained));
+  else  // default path: plain hipMalloc of nelem*sizeof(T) bytes
+    CUDACHECK(hipMalloc(ptr, nelem*sizeof(T)));
+  CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T)));  // zero the whole allocation on either path
  return ncclSuccess;
 }

 template <typename T>
 static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
-  CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
+  CUDACHECK(hipMemcpy(dst, src, nelem*sizeof(T), hipMemcpyDefault));
  return ncclSuccess;
 }

@@ -11,17 +11,17 @@

 // Check CUDA calls
 #define CUDACHECK(cmd) do {                                 \
-    cudaError_t e = cmd;                                    \
-    if( e != cudaSuccess ) {                                \
-        WARN("Cuda failure '%s'", cudaGetErrorString(e));   \
+    hipError_t e = cmd;                                    \
+    if( e != hipSuccess ) {                                \
+        WARN("Cuda failure '%s'", hipGetErrorString(e));   \
        return ncclUnhandledCudaError;                      \
    }                                                       \
 } while(false)

 #define CUDACHECKGOTO(cmd, res, label) do {                 \
-    cudaError_t e = cmd;                                    \
-    if( e != cudaSuccess ) {                                \
-        WARN("Cuda failure '%s'", cudaGetErrorString(e));   \
+    hipError_t e = cmd;                                    \
+    if( e != hipSuccess ) {                                \
+        WARN("Cuda failure '%s'", hipGetErrorString(e));   \
        res = ncclUnhandledCudaError;                       \
        goto label;                                         \
    }                                                       \
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -7,21 +8,10 @@
 #ifndef NCCL_COMM_H_
 #define NCCL_COMM_H_

-#if CUDART_VERSION < 9000
-struct cudaLaunchParams {
-  void *func;
-  dim3 gridDim;
-  dim3 blockDim;
-  void **args;
-  size_t sharedMem;
-  cudaStream_t stream;
-};
-#endif
-
 #define MAXCHANNELS 16
 #define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */

-#define CACHE_LINE_SIZE 128
+#define CACHE_LINE_SIZE 64
 #define MEM_ALIGN 4096
 #define CUDA_IPC_MIN 2097152UL

@@ -66,9 +56,9 @@ struct ncclComm {
  int nvmlDev; // my NVML device number

  enum { GROUP, PARALLEL } launchMode;
-  cudaStream_t userStream;
+  hipStream_t userStream;
  bool userStreamSet;
-  cudaEvent_t doneEvent;
+  hipEvent_t doneEvent;
  bool checkPointers;

  // Counter to make sure collectives match (needed for bcast/reduce
@@ -88,7 +78,7 @@ struct ncclComm {

  // An internal CUDA stream for NCCL kernel CGMD launches
  int groupCudaStream;
-  cudaStream_t groupStream;
+  hipStream_t groupStream;

  // Whether there has been a fatal error in this communicator.
  ncclResult_t fatalError;
@@ -111,13 +101,13 @@ struct ncclComm {
  int intraPhase;

  // Storage for deferred intra-process launch
-  struct cudaLaunchParams * intraParams;
-  struct cudaLaunchParams *myParams;
+  hipLaunchParams * intraParams;
+  hipLaunchParams *myParams;
  int* intraCudaDevs;
  int* intraCGMode; // Whether we can use CUDA9 CGMD or not
  int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
  struct ncclColl args;
-  void* argsptr;
+  struct ncclColl* argsptr;

  // Global proxy thread
  pthread_t proxyThread;
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -10,6 +11,15 @@
 #include "nccl.h"
 #include <stdint.h>

+// Convert volatile access to atomic (sequentially consistent) on HIP; plain dereference otherwise
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
+#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
+#else  // fallback expansions are fully parenthesized so they compose like the call forms above
+#define LOAD(VAR) (*(VAR))
+#define STORE(DST, SRC) (*(DST) = (SRC))
+#endif
+
 #define NCCL_MAX_OPS 2048
 #define NCCL_STEPS 8

@@ -73,6 +83,12 @@ struct ncclConnInfo {
  // Low latency mechanism
  union ncclLLFifoLine *llBuff; // Local for recv, remote for send
  uint64_t llLastCleaning;
+
+  // GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
+  // allows software to explicitly initiate a flush read to HDP memory. See more
+  // descriptions in primitives.h.
+  uint32_t* next_hdp_reg;  // Next GPU in ring (for p2p transport use only)
+  uint32_t* curr_hdp_reg;  // Curr GPU in ring (for rdma transport use only)
 };

 struct ncclConnector {
@@ -111,6 +127,8 @@ struct ncclPeer {

 struct ncclDevComm;

+#pragma pack(push)  /* push current alignment to stack */
+#pragma pack(4)     /* set alignment to 4 bytes boundary */
 /* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
 /* to make sure reads to host from the CUDA kernel are aligned. */
 /* Make sure to adjust padding at the end of ncclColl. */
@@ -165,14 +183,56 @@ struct ncclChannel {
      int collCount;
      int collFifoHead; // Only used by GPU
      int collFifoTail; // Only used by CPU
+
+      uint32_t* abortCount;
    };
    int data[0x80];
  };
 };
 static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
+#pragma pack(pop)   /* restore original alignment from stack */

 #define MAXCHANNELS 16

+#ifdef ENABLE_PROFILING  // the profiling counter block exists only when ENABLE_PROFILING is defined
+struct ncclProf {  // 64-bit counters overlaid with a raw int array (see the union below)
+  union {
+    struct {
+      uint64_t total_cycle;
+      uint64_t wait_send_cycle[MAXCHANNELS];  // MAXCHANNELS entries
+      uint64_t wait_recv_cycle[MAXCHANNELS];  // MAXCHANNELS entries
+      // primitive cycles
+      uint64_t send_cycle;
+      uint64_t directSend_cycle;
+      uint64_t recv_cycle;
+      uint64_t directRecv_cycle;
+      uint64_t copySend_cycle;
+      uint64_t directCopySend_cycle;
+      uint64_t recvCopySend_cycle;
+      uint64_t directRecvCopySend_cycle;
+      uint64_t recvReduceCopy_cycle;
+      uint64_t recvReduceSend_cycle;
+      uint64_t recvReduceCopySend_cycle;
+      uint64_t directRecvReduceCopySend_cycle;
+      // primitive bytes (one *_byte counter for each *_cycle counter above)
+      uint64_t send_byte;
+      uint64_t directSend_byte;
+      uint64_t recv_byte;
+      uint64_t directRecv_byte;
+      uint64_t copySend_byte;
+      uint64_t directCopySend_byte;
+      uint64_t recvCopySend_byte;
+      uint64_t directRecvCopySend_byte;
+      uint64_t recvReduceCopy_byte;
+      uint64_t recvReduceSend_byte;
+      uint64_t recvReduceCopySend_byte;
+      uint64_t directRecvReduceCopySend_byte;
+    };
+    int data[0x80];  // raw int view of the same storage; makes the union at least 0x80 ints large
+  };
+};
+#endif
+
 typedef enum {
  ncclDevSuccess,
  ncclDevAssertedMismatch,
@@ -189,6 +249,11 @@ struct ncclDevComm {

  // Channels, device side
  struct ncclChannel* channels;
+
+#ifdef ENABLE_PROFILING
+  // Profiling counters
+  struct ncclProf* devProf;
+#endif
 };

 #endif
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -12,9 +13,9 @@

 // Channels / LL tuning
 #define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
-#define NCCL_THREAD_THRESHOLD 64  // Per thread size before we switch to non-LL
+#define NCCL_THREAD_THRESHOLD 256  // Per thread size before we switch to non-LL
 #define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
-#define NCCL_LL_MIN_NTHREADS 64
+#define NCCL_LL_MIN_NTHREADS 256

 ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
 ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -18,7 +19,7 @@ typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueI
 ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);

 typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+    ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);

 ncclResult_t ncclAsyncColl(ncclComm_t comm);
 #endif
@@ -31,7 +31,7 @@ struct ncclInfo {
  ncclRedOp_t op;
  int root;
  ncclComm_t comm;
-  cudaStream_t stream;
+  hipStream_t stream;
  // Algorithm details
  int chunkSteps;
  int sliceSteps;
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -58,9 +59,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
 /* Get the maximum number of NVLinks based on the GPU generation */
 static ncclResult_t getMaxNvlinks(int* maxLinks) {
  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
+  CUDACHECK(hipGetDevice(&cudaDev));
  int ccMajor;
-  CUDACHECK(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev));
+  CUDACHECK(hipDeviceGetAttribute(&ccMajor, hipDeviceAttributeComputeCapabilityMajor, cudaDev));
  // 6 for Volta, 4 for Pascal
  *maxLinks = (ccMajor > 6) ? 6 : 4;
  // INFO("Device %d detected %d NVLinks", cudaDev, *maxLinks);
@@ -0,0 +1,30 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NVLINK_H_
+#define NCCL_NVLINK_H_
+
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "nvmlwrap.h"
+#include "topo.h"
+
+#define CONNECT_NVLINK 0x10
+#define CONNECT_NVSWITCH 0x100
+
+enum ncclNvLinkDeviceType {
+  ncclNvLinkDeviceGpu,
+  ncclNvLinkDeviceSwitch,
+  ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
+};
+
+static int getNvlinkGpu(const char* busId1, const char* busId2) {
+  const int nvlinkCount = 0;  // neither bus id is inspected: the link count is fixed at zero
+  return nvlinkCount * CONNECT_NVLINK;
+}
+
+#endif
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -8,8 +9,11 @@
 #define NCCL_RINGS_H_

 static int getDefaultThreads() {
-  // On Kepler, rings are doubled later.
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+  return 256;  // HIP builds: fixed default; ncclCudaCompCap() is not consulted
+#else  // On Kepler, rings are doubled later.
  return ncclCudaCompCap() == 3 ? 128 : 256;
+#endif  // __HIP_PLATFORM_HCC__ || __HCC__
 }

 ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut);
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -39,14 +40,14 @@ static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPt
  ncclResult_t res = ncclSuccess;

  NCCLCHECKGOTO(shmSetup(shmname, shmsize, &fd, &ptr, create), res, sysError);
-  CUDACHECKGOTO(cudaHostRegister(ptr, shmsize, cudaHostRegisterMapped), res, cudaError);
-  CUDACHECKGOTO(cudaHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError);
+  CUDACHECKGOTO(hipHostRegister(ptr, shmsize, hipHostRegisterMapped), res, hipError_t);
+  CUDACHECKGOTO(hipHostGetDevicePointer(devShmPtr, ptr, 0), res, hipError_t);

  *shmPtr = ptr;
  return ncclSuccess;
 sysError:
  WARN("Error while %s shared memory segment %s (size %d)\n", create ? "creating" : "attaching to", shmname, shmsize);
-cudaError:
+hipError_t:
  if (fd != -1) close(fd);
  if (create) shm_unlink(shmname);
  if (ptr != MAP_FAILED) munmap(ptr, shmsize);
@@ -60,7 +61,7 @@ static ncclResult_t shmUnlink(const char* shmname) {
 }

 static ncclResult_t shmClose(void* shmPtr, void* devShmPtr, const int shmsize) {
-  CUDACHECK(cudaHostUnregister(shmPtr));
+  CUDACHECK(hipHostUnregister(shmPtr));
  if (munmap(shmPtr, shmsize) != 0) {
    WARN("munmap of shared memory failed");
    return ncclSystemError;
@@ -1,5 +1,6 @@
 /*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -11,6 +12,7 @@
 #include <stdint.h>

 ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
+uint64_t getnHash(const char* string, int n);
 uint64_t getHostHash();
 uint64_t getPidHash();