Fix build with hip-clang.
- Add necessary function attribute for HIP programming model.
- Explicitly include hsa headers.
[ROCm/rccl commit: 9369f8d75d]
此提交包含在:
@@ -18,7 +18,7 @@
|
||||
// Each thread sets a predicate to true if abort == 1
|
||||
// all CTA's threads enter the barrier and do a popc on their predicates being True
|
||||
// If any of the thread's predicate was True, all the threads call exit()
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#define exitIfAbortBarrier(abort, abortCount) \
|
||||
if (abort) __atomic_fetch_add(abortCount, 1, __ATOMIC_SEQ_CST); \
|
||||
__syncthreads(); \
|
||||
@@ -102,7 +102,7 @@ static const __device__ constexpr ncclFunc_t ncclFuncs[]{
|
||||
|
||||
template<unsigned short f, unsigned short l>
|
||||
struct Caller {
|
||||
static
|
||||
static __device__ __host__
|
||||
void call(ncclColl* const c) noexcept
|
||||
{
|
||||
constexpr unsigned short m = f + (l - f) / 2;
|
||||
@@ -113,7 +113,7 @@ struct Caller {
|
||||
|
||||
template<unsigned short f>
|
||||
struct Caller<f, f + 1>{
|
||||
static
|
||||
static __device__ __host__
|
||||
void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); }
|
||||
};
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }
|
||||
|
||||
typedef uint64_t PackType;
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
|
||||
template<class FUNC, typename T>
|
||||
struct MULTI {
|
||||
@@ -205,7 +205,7 @@ struct MULTI<FUNC, int64_t> {
|
||||
}
|
||||
};
|
||||
|
||||
#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
|
||||
template<typename T> inline __device__
|
||||
T vFetch(const volatile T* ptr) {
|
||||
@@ -217,7 +217,7 @@ void vStore(volatile T* ptr, const T val) {
|
||||
*ptr = val;
|
||||
}
|
||||
|
||||
#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__))
|
||||
#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__))
|
||||
template<> inline __device__
|
||||
half vFetch<half>(const volatile half* ptr) {
|
||||
half r;
|
||||
@@ -254,7 +254,7 @@ struct MULTI128 {
|
||||
};
|
||||
|
||||
inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
v.x = p->x;
|
||||
v.y = p->y;
|
||||
#else
|
||||
@@ -262,7 +262,7 @@ inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
|
||||
#endif
|
||||
}
|
||||
inline __device__ void Store128(Pack128* p, Pack128& v) {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
p->x = v.x;
|
||||
p->y = v.y;
|
||||
#else
|
||||
|
||||
@@ -60,7 +60,7 @@ class ncclPrimitives {
|
||||
__device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }
|
||||
|
||||
__device__ void barrier() {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
__syncthreads();
|
||||
#else
|
||||
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
|
||||
@@ -358,7 +358,7 @@ class ncclLLPrimitives {
|
||||
__device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
|
||||
__device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
// Exit If Abort Barrier : make sure all threads exit consistently
|
||||
// Each thread sets a predicate to true if val == 1
|
||||
@@ -379,7 +379,7 @@ class ncclLLPrimitives {
|
||||
#endif
|
||||
|
||||
__device__ void barrier() {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
__syncthreads();
|
||||
#else
|
||||
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
|
||||
@@ -447,7 +447,7 @@ class ncclLLPrimitives {
|
||||
uint32_t data1, flag1, data2, flag2;
|
||||
spins = 0;
|
||||
mismatch = 0;
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
using Vec = uint32_t __attribute__((ext_vector_type(4)));
|
||||
Vec i4;
|
||||
do {
|
||||
@@ -468,7 +468,7 @@ class ncclLLPrimitives {
|
||||
}
|
||||
|
||||
__device__ __attribute__((noinline)) void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
using Vec = uint32_t __attribute__((ext_vector_type(4)));
|
||||
Vec i4;
|
||||
i4[0] = val & 0xffffffff;
|
||||
@@ -529,7 +529,7 @@ class ncclLLPrimitives {
|
||||
}
|
||||
}
|
||||
}
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
exitIfAbortBarrier(abort, abortCount);
|
||||
#else
|
||||
exitIfAbortLocalBarrier();
|
||||
|
||||
@@ -19,7 +19,7 @@ struct FuncNull {
|
||||
}
|
||||
};
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
|
||||
//we really don't need any specializations and we don't need
|
||||
//to break things into uint32_t
|
||||
@@ -180,7 +180,7 @@ static __device__ uint32_t addChar4(const uint32_t x, const uint32_t y) {
|
||||
template<>
|
||||
struct FuncSum<int8_t> {
|
||||
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
|
||||
int32_t rv, z=0;
|
||||
@@ -198,7 +198,7 @@ struct FuncSum<int8_t> {
|
||||
template<>
|
||||
struct FuncSum<uint8_t> {
|
||||
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
|
||||
int32_t rv, z=0;
|
||||
@@ -250,7 +250,7 @@ template<>
|
||||
struct FuncMax<int8_t> {
|
||||
union converter { uint32_t storage; char4 a; };
|
||||
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
|
||||
int32_t rv, z=0;
|
||||
@@ -276,7 +276,7 @@ template<>
|
||||
struct FuncMax<uint8_t> {
|
||||
union converter { uint32_t storage; uchar4 a; };
|
||||
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
|
||||
int32_t rv, z=0;
|
||||
@@ -303,7 +303,7 @@ template<>
|
||||
struct FuncMin<int8_t> {
|
||||
union converter { uint32_t storage; char4 a; };
|
||||
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
|
||||
int32_t rv, z=0;
|
||||
@@ -329,7 +329,7 @@ template<>
|
||||
struct FuncMin<uint8_t> {
|
||||
union converter { uint32_t storage; uchar4 a; };
|
||||
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
|
||||
int32_t rv, z=0;
|
||||
@@ -436,6 +436,6 @@ struct FuncMin<half> {
|
||||
}
|
||||
};
|
||||
|
||||
#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
|
||||
#endif // REDUCE_KERNEL_H_
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
#include <stdint.h>
|
||||
|
||||
// Convert volatile access to atomic
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
|
||||
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
|
||||
#else
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
#define NCCL_RINGS_H_
|
||||
|
||||
static int getDefaultThreads() {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
return 256;
|
||||
#else // On Kepler, rings are doubled later.
|
||||
return ncclCudaCompCap() == 3 ? 128 : 256;
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
#include "checks.h"
|
||||
#include "enqueue.h"
|
||||
#include "topo.h"
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#include "nvlink_stub.h"
|
||||
#else
|
||||
#include "nvlink.h"
|
||||
@@ -52,7 +52,7 @@ FILE *ncclDebugFile = stdout;
|
||||
std::chrono::high_resolution_clock::time_point ncclEpoch;
|
||||
#endif
|
||||
|
||||
#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
|
||||
#else
|
||||
#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
|
||||
@@ -150,7 +150,7 @@ NCCL_PARAM(TreeThreshold, "TREE_THRESHOLD", 0);
|
||||
int ncclThreadThreshold(int minCompCap, int multiNode) {
|
||||
int threshold = ncclParamThreadThreshold();
|
||||
if (threshold == -2) { // user has not set this env variable
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
threshold = NCCL_THREAD_THRESHOLD_VEGA;
|
||||
#else
|
||||
threshold = (minCompCap <= 6) ? NCCL_THREAD_THRESHOLD_PREVOLTA : NCCL_THREAD_THRESHOLD;
|
||||
@@ -314,7 +314,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
||||
comm->llThreshold = ncclParamLlThreshold();
|
||||
comm->treeThreshold = ncclParamTreeThreshold();
|
||||
comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
|
||||
#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
comm->groupCudaStream = ncclParamGroupCudaStream();
|
||||
#else
|
||||
// Don't allow the user to overload the default setting in older CUDA builds
|
||||
@@ -355,7 +355,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
|
||||
}
|
||||
|
||||
// Pre-process the string so that running "strings" on the lib can quickly reveal the version.
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+hip"
|
||||
#else
|
||||
#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
|
||||
@@ -383,7 +383,7 @@ static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank, uint64_t commH
|
||||
// NVML device number. Then we get the busID from NVML to be sure it is
|
||||
// consistent with NVML remote PCI bus Ids.
|
||||
CUDACHECK(hipDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#else
|
||||
nvmlDevice_t nvmlDevice;
|
||||
NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice));
|
||||
|
||||
@@ -171,7 +171,7 @@ static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankTo
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#define DEFAULT_MIN_NRINGS 2
|
||||
#elif defined(__PPC__)
|
||||
// Make the default NCCL_MIN_NRINGS=4 for IBM/Power nodes
|
||||
@@ -381,7 +381,7 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
|
||||
if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
|
||||
*nrings = maxNrings;
|
||||
} else {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
int defaultMinNrings = 1;
|
||||
#else
|
||||
int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
|
||||
|
||||
@@ -197,7 +197,7 @@ ncclResult_t ncclIbPciPath(int dev, char** path) {
|
||||
ncclResult_t ncclIbGdrSupport(int ibDev) {
|
||||
static int moduleLoaded = -1;
|
||||
if (moduleLoaded == -1) {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
moduleLoaded = (access("/sys/kernel/mm/memory_peers/amdkfd/version", F_OK) == -1) ? 0 : 1;
|
||||
#else
|
||||
moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1;
|
||||
|
||||
@@ -13,8 +13,10 @@
|
||||
#include <unistd.h>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <ctype.h>
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
#include "nvlink_stub.h"
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
#else
|
||||
#include "nvlink.h"
|
||||
#endif
|
||||
@@ -110,7 +112,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
|
||||
}
|
||||
if (p2p == 0) return ncclSuccess;
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
uint32_t link_type, hops;
|
||||
if (hipExtGetLinkTypeAndHopCount(myInfo->cudaDev, peerInfo->cudaDev, &link_type, &hops) != hipSuccess) {
|
||||
p2p = 0;
|
||||
@@ -297,7 +299,7 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin
|
||||
}
|
||||
|
||||
// Duplicate the rings for direct NVLink
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
compNrings = copyRings(nranks, rings, compNrings, compNrings*3);
|
||||
#else
|
||||
compNrings = copyRings(nranks, rings, compNrings, compNrings*2);
|
||||
|
||||
新增問題並參考
封鎖使用者