d97a32fac8
Add support for IB SHARP to NVLS (NVLink SHARP algorithm). Add NVLS+Tree algorithm. Add support for memory management using cuMem* functions. Use all NICs for Send/Receive operations on systems with more than one NIC per GPU (#804). Add ncclCommSplit primitive, with resource sharing option in config. Fix alltoallv hang (#788). Increase number of channels on H100 when we're not limited by NVLink. Improve error reporting in case of IB failure, printing local and remote ID (#779). Add build option to allow compilation against RDMA includes instead of dynamically loading IB verbs symbols (#802). Fix context creation for progress thread (#803). NET/IB: add option to use multiple QPs in round-robin mode. Fix tree performance issue when NVB is disabled on HCM topologies.
48 γραμμές
1.2 KiB
C++
48 γραμμές
1.2 KiB
C++
/*************************************************************************
|
|
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* See LICENSE.txt for license information
|
|
************************************************************************/
|
|
|
|
#ifndef NCCL_ALIGN_H_
|
|
#define NCCL_ALIGN_H_
|
|
|
|
// Ceiling division: expands to (x + y - 1) / y, which for integer operands
// with x >= 0 and y > 0 is x / y rounded up.
// Each argument is expanded more than once, so avoid operands with side
// effects.
#define DIVUP(x, y) \
    (((x)+(y)-1)/(y))
|
|
|
|
// Rounds x up to a multiple of y: takes the ceiling quotient DIVUP(x, y) and
// scales it back up by y.
// Each argument is expanded more than once, so avoid operands with side
// effects.
#define ROUNDUP(x, y) \
    (DIVUP((x), (y))*(y))
|
|
|
|
// If x is greater than y, yields x rounded up to a multiple of y (ROUNDUP).
// Otherwise yields y divided by the integer quotient y / x.
// x must be nonzero on the second path, because y is divided by x there.
// NOTE(review): the name suggests the operands are meant to be powers of
// two -- confirm against callers.
#define ALIGN_POWER(x, y) \
    ((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x))))
|
|
|
|
// Rounds `size` up, in place, to the next multiple of `align`.
// `size` must be a modifiable lvalue; it is parenthesized so that lvalue
// expressions (for example a conditional selecting between two variables)
// are assigned as a whole instead of being split by operator precedence.
// The expansion is a complete statement and already ends with ';' -- this is
// kept so existing call sites, with or without their own ';', still compile.
// Both arguments are expanded more than once, so avoid side effects.
#define ALIGN_SIZE(size, align) \
    (size) = (((size) + (align) - 1) / (align)) * (align);
|
|
|
|
// When not compiling device code (__CUDA_ARCH__ undefined or zero), make sure
// the __host__ and __device__ qualifiers exist: any that is not already
// defined becomes an empty macro, so declarations annotated with them still
// compile when nothing else provides the qualifiers.
#if !__CUDA_ARCH__
  #ifndef __host__
    #define __host__
  #endif
  #ifndef __device__
    #define __device__
  #endif
#endif
|
|
|
|
// Ceiling division usable from host and device code and in constant
// expressions. Evaluates (x + y - 1) / y; for integer operands with x >= 0
// and y > 0 that is x / y rounded up.
//   X, Y : operand types.
//   Z    : return type, by default the type of X() + Y().
// y must be nonzero. The intermediate x + y - 1 is computed as written, so
// it must be representable in the operands' common type.
template<typename X, typename Y, typename Z = decltype(X()+Y())>
__host__ __device__ constexpr Z divUp(X x, Y y) {
  return ((x + y) - 1) / y;
}
|
|
|
|
// Rounds x up to a multiple of y, usable from host and device code and in
// constant expressions. Computes t = x + y - 1 and returns t - (t % y), so
// no multiplication is needed.
//   X, Y : operand types (the % operator must be valid for them).
//   Z    : return type, by default the type of X() + Y().
// y must be nonzero. The intermediate x + y - 1 is computed as written, so
// it must be representable in the operands' common type.
template<typename X, typename Y, typename Z = decltype(X()+Y())>
__host__ __device__ constexpr Z roundUp(X x, Y y) {
  return ((x + y) - 1) - (((x + y) - 1) % y);
}
|
|
|
|
// Rounds x up to a multiple of a using a bit mask instead of a division.
// Assumes the second argument is a power of 2: only then does -a, converted
// to Z, form a mask that clears exactly the low bits below a.
//   X : type of the value being aligned.
//   Z : return type, by default the type of X() + int().
// The mask is converted to Z before the AND, so for a Z wider than int the
// high bits of x + a - 1 are preserved.
template<typename X, typename Z = decltype(X()+int())>
__host__ __device__ constexpr Z alignUp(X x, int a) {
  return ((x + a) - 1) & static_cast<Z>(-a);
}
|
|
|
|
#endif
|