From f4a240065fbdb67cf45f3cf91a395b023f200acd Mon Sep 17 00:00:00 2001 From: Michael LIAO Date: Thu, 15 Aug 2019 12:15:30 -0400 Subject: [PATCH] Fix build with hip-clang. - Add necessary function attribute for HIP programming model. - Explicitly include hsa headers. [ROCm/rccl commit: 9369f8d75dc5cdc816e9576c01b224d8126edd39] --- projects/rccl/src/collectives/device/common.h | 6 +++--- .../rccl/src/collectives/device/common_kernel.h | 10 +++++----- .../rccl/src/collectives/device/primitives.h | 12 ++++++------ .../rccl/src/collectives/device/reduce_kernel.h | 16 ++++++++-------- projects/rccl/src/include/devcomm.h | 2 +- projects/rccl/src/include/rings.h | 2 +- projects/rccl/src/init.cc | 12 ++++++------ projects/rccl/src/misc/rings.cc | 4 ++-- projects/rccl/src/transport/net_ib.cc | 2 +- projects/rccl/src/transport/p2p.cc | 8 +++++--- 10 files changed, 38 insertions(+), 36 deletions(-) diff --git a/projects/rccl/src/collectives/device/common.h b/projects/rccl/src/collectives/device/common.h index 5fc36ed52a..fd26814b0f 100644 --- a/projects/rccl/src/collectives/device/common.h +++ b/projects/rccl/src/collectives/device/common.h @@ -18,7 +18,7 @@ // Each thread sets a predicate to true if abort == 1 // all CTA's threads enter the barrier and do a popc on their predicates being True // If any of the thread's predicate was True, all the threads call exit() -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #define exitIfAbortBarrier(abort, abortCount) \ if (abort) __atomic_fetch_add(abortCount, 1, __ATOMIC_SEQ_CST); \ __syncthreads(); \ @@ -102,7 +102,7 @@ static const __device__ constexpr ncclFunc_t ncclFuncs[]{ template struct Caller { - static + static __device__ __host__ void call(ncclColl* const c) noexcept { constexpr unsigned short m = f + (l - f) / 2; @@ -113,7 +113,7 @@ struct Caller { template struct Caller{ - static + static __device__ __host__ void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); } }; diff --git a/projects/rccl/src/collectives/device/common_kernel.h b/projects/rccl/src/collectives/device/common_kernel.h index fbb8df4e95..7cf85671a3 100644 --- a/projects/rccl/src/collectives/device/common_kernel.h +++ b/projects/rccl/src/collectives/device/common_kernel.h @@ -19,7 +19,7 @@ static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; } typedef uint64_t PackType; -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) template struct MULTI { @@ -205,7 +205,7 @@ struct MULTI { } }; -#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) template inline __device__ T vFetch(const volatile T* ptr) { @@ -217,7 +217,7 @@ void vStore(volatile T* ptr, const T val) { *ptr = val; } -#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)) +#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)) template<> inline __device__ half vFetch(const volatile half* ptr) { half r; @@ -254,7 +254,7 @@ struct MULTI128 { }; inline __device__ void Fetch128(Pack128& v, const Pack128* p) { -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) v.x = p->x; v.y = p->y; #else @@ -262,7 +262,7 @@ inline __device__ void Fetch128(Pack128& v, const Pack128* p) { #endif } inline __device__ void Store128(Pack128* p, Pack128& v) { -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) p->x = v.x; p->y = v.y; #else diff --git a/projects/rccl/src/collectives/device/primitives.h b/projects/rccl/src/collectives/device/primitives.h index 13429f2850..81a4d4cb7f 100644 --- a/projects/rccl/src/collectives/device/primitives.h +++ b/projects/rccl/src/collectives/device/primitives.h @@ -60,7 +60,7 @@ class ncclPrimitives { __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); } __device__ void barrier() { -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) __syncthreads(); #else asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); @@ -358,7 +358,7 @@ class ncclLLPrimitives { __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); } __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); } -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #else // Exit If Abort Barrier : make sure all threads exit consistently // Each thread sets a predicate to true if val == 1 @@ -379,7 +379,7 @@ class ncclLLPrimitives { #endif __device__ void barrier() { -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) __syncthreads(); #else asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); @@ -447,7 +447,7 @@ class ncclLLPrimitives { uint32_t data1, flag1, data2, flag2; spins = 0; mismatch = 0; -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) using Vec = uint32_t __attribute__((ext_vector_type(4))); Vec i4; do { @@ -468,7 +468,7 @@ class ncclLLPrimitives { } __device__ __attribute__((noinline)) void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) { -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) using Vec = uint32_t __attribute__((ext_vector_type(4))); Vec i4; i4[0] = val & 0xffffffff; @@ -529,7 +529,7 @@ class ncclLLPrimitives { } } } -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) exitIfAbortBarrier(abort, abortCount); #else exitIfAbortLocalBarrier(); diff --git a/projects/rccl/src/collectives/device/reduce_kernel.h b/projects/rccl/src/collectives/device/reduce_kernel.h index 8895c65d2f..4c5caa9f28 100644 --- a/projects/rccl/src/collectives/device/reduce_kernel.h +++ b/projects/rccl/src/collectives/device/reduce_kernel.h @@ -19,7 +19,7 @@ struct FuncNull { } }; -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) //we really don't need any specializations and we don't need //to break things into uint32_t @@ -180,7 +180,7 @@ static __device__ uint32_t addChar4(const uint32_t x, const uint32_t y) { template<> struct FuncSum { __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #else #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; @@ -198,7 +198,7 @@ struct FuncSum { template<> struct FuncSum { __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #else #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; @@ -250,7 +250,7 @@ template<> struct FuncMax { union converter { uint32_t storage; char4 a; }; __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #else #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; @@ -276,7 +276,7 @@ template<> struct FuncMax { union converter { uint32_t storage; uchar4 a; }; __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #else #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; @@ -303,7 +303,7 @@ template<> struct FuncMin { union converter { uint32_t storage; char4 a; }; __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #else #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; @@ -329,7 +329,7 @@ template<> struct FuncMin { union converter { uint32_t storage; uchar4 a; }; __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #else #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; @@ -436,6 +436,6 @@ struct FuncMin { } }; -#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #endif // REDUCE_KERNEL_H_ diff --git a/projects/rccl/src/include/devcomm.h b/projects/rccl/src/include/devcomm.h index bd6559d09c..30eccab7b8 100644 --- a/projects/rccl/src/include/devcomm.h +++ b/projects/rccl/src/include/devcomm.h @@ -12,7 +12,7 @@ #include // Convert volatile access to atomic -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST) #define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST) #else diff --git a/projects/rccl/src/include/rings.h b/projects/rccl/src/include/rings.h index 211e24311f..f634cbe071 100644 --- a/projects/rccl/src/include/rings.h +++ b/projects/rccl/src/include/rings.h @@ -9,7 +9,7 @@ #define NCCL_RINGS_H_ static int getDefaultThreads() { -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) return 256; #else // On Kepler, rings are doubled later. return ncclCudaCompCap() == 3 ? 128 : 256; diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 23c1f53c2e..320b5d4f35 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -20,7 +20,7 @@ #include "checks.h" #include "enqueue.h" #include "topo.h" -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #include "nvlink_stub.h" #else #include "nvlink.h" @@ -52,7 +52,7 @@ FILE *ncclDebugFile = stdout; std::chrono::high_resolution_clock::time_point ncclEpoch; #endif -#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream #else #define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream @@ -150,7 +150,7 @@ NCCL_PARAM(TreeThreshold, "TREE_THRESHOLD", 0); int ncclThreadThreshold(int minCompCap, int multiNode) { int threshold = ncclParamThreadThreshold(); if (threshold == -2) { // user has not set this env variable -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) threshold = NCCL_THREAD_THRESHOLD_VEGA; #else threshold = (minCompCap <= 6) ? NCCL_THREAD_THRESHOLD_PREVOLTA : NCCL_THREAD_THRESHOLD; @@ -314,7 +314,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { comm->llThreshold = ncclParamLlThreshold(); comm->treeThreshold = ncclParamTreeThreshold(); comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false; -#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) comm->groupCudaStream = ncclParamGroupCudaStream(); #else // Don't allow the user to overload the default setting in older CUDA builds @@ -355,7 +355,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { } // Pre-process the string so that running "strings" on the lib can quickly reveal the version. -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+hip" #else #define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR) @@ -383,7 +383,7 @@ static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank, uint64_t commH // NVML device number. Then we get the busID from NVML to be sure it is // consistent with NVML remote PCI bus Ids. CUDACHECK(hipDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev)); -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #else nvmlDevice_t nvmlDevice; NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice)); diff --git a/projects/rccl/src/misc/rings.cc b/projects/rccl/src/misc/rings.cc index ea01e44a18..1fc58f08d0 100644 --- a/projects/rccl/src/misc/rings.cc +++ b/projects/rccl/src/misc/rings.cc @@ -171,7 +171,7 @@ static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankTo } } -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #define DEFAULT_MIN_NRINGS 2 #elif defined(__PPC__) // Make the default NCCL_MIN_NRINGS=4 for IBM/Power nodes @@ -381,7 +381,7 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings); *nrings = maxNrings; } else { -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) int defaultMinNrings = 1; #else int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1; diff --git a/projects/rccl/src/transport/net_ib.cc b/projects/rccl/src/transport/net_ib.cc index a962248fba..bfb2d8d437 100644 --- a/projects/rccl/src/transport/net_ib.cc +++ b/projects/rccl/src/transport/net_ib.cc @@ -197,7 +197,7 @@ ncclResult_t ncclIbPciPath(int dev, char** path) { ncclResult_t ncclIbGdrSupport(int ibDev) { static int moduleLoaded = -1; if (moduleLoaded == -1) { -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) moduleLoaded = (access("/sys/kernel/mm/memory_peers/amdkfd/version", F_OK) == -1) ? 0 : 1; #else moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1; diff --git a/projects/rccl/src/transport/p2p.cc b/projects/rccl/src/transport/p2p.cc index 0bc5c5d970..61874c9d42 100644 --- a/projects/rccl/src/transport/p2p.cc +++ b/projects/rccl/src/transport/p2p.cc @@ -13,8 +13,10 @@ #include #include #include -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #include "nvlink_stub.h" +#include +#include #else #include "nvlink.h" #endif @@ -110,7 +112,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc } if (p2p == 0) return ncclSuccess; -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) uint32_t link_type, hops; if (hipExtGetLinkTypeAndHopCount(myInfo->cudaDev, peerInfo->cudaDev, &link_type, &hops) != hipSuccess) { p2p = 0; @@ -297,7 +299,7 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin } // Duplicate the rings for direct NVLink -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) compNrings = copyRings(nranks, rings, compNrings, compNrings*3); #else compNrings = copyRings(nranks, rings, compNrings, compNrings*2);