Fix build with hip-clang.

- Add necessary function attribute for HIP programming model.
- Explicitly include hsa headers.


[ROCm/rccl commit: 9369f8d75d]
此提交包含在:
Michael LIAO
2019-08-15 12:15:30 -04:00
父節點 c920272a9e
當前提交 f4a240065f
共有 10 個檔案被更改,包括 38 行新增36 行删除
+3 -3
查看文件
@@ -18,7 +18,7 @@
// Each thread sets a predicate to true if abort == 1
// all CTA's threads enter the barrier and do a popc on their predicates being True
// If any of the thread's predicate was True, all the threads call exit()
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#define exitIfAbortBarrier(abort, abortCount) \
if (abort) __atomic_fetch_add(abortCount, 1, __ATOMIC_SEQ_CST); \
__syncthreads(); \
@@ -102,7 +102,7 @@ static const __device__ constexpr ncclFunc_t ncclFuncs[]{
template<unsigned short f, unsigned short l>
struct Caller {
static
static __device__ __host__
void call(ncclColl* const c) noexcept
{
constexpr unsigned short m = f + (l - f) / 2;
@@ -113,7 +113,7 @@ struct Caller {
template<unsigned short f>
struct Caller<f, f + 1>{
static
static __device__ __host__
void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); }
};
+5 -5
查看文件
@@ -19,7 +19,7 @@ static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }
typedef uint64_t PackType;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
template<class FUNC, typename T>
struct MULTI {
@@ -205,7 +205,7 @@ struct MULTI<FUNC, int64_t> {
}
};
#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
template<typename T> inline __device__
T vFetch(const volatile T* ptr) {
@@ -217,7 +217,7 @@ void vStore(volatile T* ptr, const T val) {
*ptr = val;
}
#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__))
#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__))
template<> inline __device__
half vFetch<half>(const volatile half* ptr) {
half r;
@@ -254,7 +254,7 @@ struct MULTI128 {
};
inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
v.x = p->x;
v.y = p->y;
#else
@@ -262,7 +262,7 @@ inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
#endif
}
inline __device__ void Store128(Pack128* p, Pack128& v) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
p->x = v.x;
p->y = v.y;
#else
+6 -6
查看文件
@@ -60,7 +60,7 @@ class ncclPrimitives {
__device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }
__device__ void barrier() {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
__syncthreads();
#else
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
@@ -358,7 +358,7 @@ class ncclLLPrimitives {
__device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
__device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
// Exit If Abort Barrier : make sure all threads exit consistently
// Each thread sets a predicate to true if val == 1
@@ -379,7 +379,7 @@ class ncclLLPrimitives {
#endif
__device__ void barrier() {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
__syncthreads();
#else
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
@@ -447,7 +447,7 @@ class ncclLLPrimitives {
uint32_t data1, flag1, data2, flag2;
spins = 0;
mismatch = 0;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
using Vec = uint32_t __attribute__((ext_vector_type(4)));
Vec i4;
do {
@@ -468,7 +468,7 @@ class ncclLLPrimitives {
}
__device__ __attribute__((noinline)) void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
using Vec = uint32_t __attribute__((ext_vector_type(4)));
Vec i4;
i4[0] = val & 0xffffffff;
@@ -529,7 +529,7 @@ class ncclLLPrimitives {
}
}
}
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
exitIfAbortBarrier(abort, abortCount);
#else
exitIfAbortLocalBarrier();
+8 -8
查看文件
@@ -19,7 +19,7 @@ struct FuncNull {
}
};
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
//we really don't need any specializations and we don't need
//to break things into uint32_t
@@ -180,7 +180,7 @@ static __device__ uint32_t addChar4(const uint32_t x, const uint32_t y) {
template<>
struct FuncSum<int8_t> {
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
@@ -198,7 +198,7 @@ struct FuncSum<int8_t> {
template<>
struct FuncSum<uint8_t> {
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
@@ -250,7 +250,7 @@ template<>
struct FuncMax<int8_t> {
union converter { uint32_t storage; char4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
@@ -276,7 +276,7 @@ template<>
struct FuncMax<uint8_t> {
union converter { uint32_t storage; uchar4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
@@ -303,7 +303,7 @@ template<>
struct FuncMin<int8_t> {
union converter { uint32_t storage; char4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
@@ -329,7 +329,7 @@ template<>
struct FuncMin<uint8_t> {
union converter { uint32_t storage; uchar4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
@@ -436,6 +436,6 @@ struct FuncMin<half> {
}
};
#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#endif // REDUCE_KERNEL_H_
+1 -1
查看文件
@@ -12,7 +12,7 @@
#include <stdint.h>
// Convert volatile access to atomic
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
#define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
#else
+1 -1
查看文件
@@ -9,7 +9,7 @@
#define NCCL_RINGS_H_
static int getDefaultThreads() {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
return 256;
#else // On Kepler, rings are doubled later.
return ncclCudaCompCap() == 3 ? 128 : 256;
+6 -6
查看文件
@@ -20,7 +20,7 @@
#include "checks.h"
#include "enqueue.h"
#include "topo.h"
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#include "nvlink_stub.h"
#else
#include "nvlink.h"
@@ -52,7 +52,7 @@ FILE *ncclDebugFile = stdout;
std::chrono::high_resolution_clock::time_point ncclEpoch;
#endif
#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
#else
#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
@@ -150,7 +150,7 @@ NCCL_PARAM(TreeThreshold, "TREE_THRESHOLD", 0);
int ncclThreadThreshold(int minCompCap, int multiNode) {
int threshold = ncclParamThreadThreshold();
if (threshold == -2) { // user has not set this env variable
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
threshold = NCCL_THREAD_THRESHOLD_VEGA;
#else
threshold = (minCompCap <= 6) ? NCCL_THREAD_THRESHOLD_PREVOLTA : NCCL_THREAD_THRESHOLD;
@@ -314,7 +314,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
comm->llThreshold = ncclParamLlThreshold();
comm->treeThreshold = ncclParamTreeThreshold();
comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
comm->groupCudaStream = ncclParamGroupCudaStream();
#else
// Don't allow the user to overload the default setting in older CUDA builds
@@ -355,7 +355,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
}
// Pre-process the string so that running "strings" on the lib can quickly reveal the version.
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+hip"
#else
#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
@@ -383,7 +383,7 @@ static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank, uint64_t commH
// NVML device number. Then we get the busID from NVML to be sure it is
// consistent with NVML remote PCI bus Ids.
CUDACHECK(hipDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
nvmlDevice_t nvmlDevice;
NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice));
+2 -2
查看文件
@@ -171,7 +171,7 @@ static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankTo
}
}
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#define DEFAULT_MIN_NRINGS 2
#elif defined(__PPC__)
// Make the default NCCL_MIN_NRINGS=4 for IBM/Power nodes
@@ -381,7 +381,7 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
*nrings = maxNrings;
} else {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
int defaultMinNrings = 1;
#else
int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
+1 -1
查看文件
@@ -197,7 +197,7 @@ ncclResult_t ncclIbPciPath(int dev, char** path) {
ncclResult_t ncclIbGdrSupport(int ibDev) {
static int moduleLoaded = -1;
if (moduleLoaded == -1) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
moduleLoaded = (access("/sys/kernel/mm/memory_peers/amdkfd/version", F_OK) == -1) ? 0 : 1;
#else
moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1;
+5 -3
查看文件
@@ -13,8 +13,10 @@
#include <unistd.h>
#include <hip/hip_runtime.h>
#include <ctype.h>
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#include "nvlink_stub.h"
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#else
#include "nvlink.h"
#endif
@@ -110,7 +112,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
}
if (p2p == 0) return ncclSuccess;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
uint32_t link_type, hops;
if (hipExtGetLinkTypeAndHopCount(myInfo->cudaDev, peerInfo->cudaDev, &link_type, &hops) != hipSuccess) {
p2p = 0;
@@ -297,7 +299,7 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin
}
// Duplicate the rings for direct NVLink
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
compNrings = copyRings(nranks, rings, compNrings, compNrings*3);
#else
compNrings = copyRings(nranks, rings, compNrings, compNrings*2);