Fix build with hip-clang.

- Add necessary function attribute for HIP programming model. - Explicitly include hsa headers. [ROCm/rccl commit: 9369f8d75d]
2019-08-15 12:15:30 -04:00
@@ -18,7 +18,7 @@
 // Each thread sets a predicate to true if abort == 1
 // all CTA's threads enter the barrier and do a popc on their predicates being True
 // If any of the thread's predicate was True, all the threads call exit()
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #define exitIfAbortBarrier(abort, abortCount) \
  if (abort) __atomic_fetch_add(abortCount, 1, __ATOMIC_SEQ_CST); \
  __syncthreads(); \
@@ -102,7 +102,7 @@ static const __device__ constexpr ncclFunc_t ncclFuncs[]{

 template<unsigned short f, unsigned short l>
 struct Caller {
-  static
+  static __device__ __host__
  void call(ncclColl* const c) noexcept
  {
    constexpr unsigned short m = f + (l - f) / 2;
@@ -113,7 +113,7 @@ struct Caller {

 template<unsigned short f>
 struct Caller<f, f + 1>{
-  static
+  static __device__ __host__
  void call(struct ncclColl* const c) noexcept { ncclFuncs[f](&c->args); }
 };

@@ -19,7 +19,7 @@ static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }

 typedef uint64_t PackType;

-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)

 template<class FUNC, typename T>
 struct MULTI {
@@ -205,7 +205,7 @@ struct MULTI<FUNC, int64_t> {
  }
 };

-#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#endif //defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)

 template<typename T> inline __device__
 T vFetch(const volatile T* ptr) {
@@ -217,7 +217,7 @@ void vStore(volatile T* ptr, const T val) {
  *ptr = val;
 }

-#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__))
+#if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__))
 template<> inline __device__
 half vFetch<half>(const volatile half* ptr) {
  half r;
@@ -254,7 +254,7 @@ struct MULTI128 {
 };

 inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
  v.x = p->x;
  v.y = p->y;
 #else
@@ -262,7 +262,7 @@ inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
 #endif
 }
 inline __device__ void Store128(Pack128* p, Pack128& v) {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
  p->x = v.x;
  p->y = v.y;
 #else
@@ -60,7 +60,7 @@ class ncclPrimitives {
  __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }

  __device__ void barrier() {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
    __syncthreads();
 #else
    asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
@@ -358,7 +358,7 @@ class ncclLLPrimitives {
  __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
  __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }

-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #else
  // Exit If Abort Barrier : make sure all threads exit consistently
  // Each thread sets a predicate to true if val == 1
@@ -379,7 +379,7 @@ class ncclLLPrimitives {
 #endif

  __device__ void barrier() {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
    __syncthreads();
 #else
    asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
@@ -447,7 +447,7 @@ class ncclLLPrimitives {
    uint32_t data1, flag1, data2, flag2;
    spins = 0;
    mismatch = 0;
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
    using Vec = uint32_t __attribute__((ext_vector_type(4)));
    Vec i4;
    do {
@@ -468,7 +468,7 @@ class ncclLLPrimitives {
  }

  __device__ __attribute__((noinline)) void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
  using Vec = uint32_t __attribute__((ext_vector_type(4)));
  Vec i4;
  i4[0] = val & 0xffffffff;
@@ -529,7 +529,7 @@ class ncclLLPrimitives {
        }
      }
    }
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
    exitIfAbortBarrier(abort, abortCount);
 #else
    exitIfAbortLocalBarrier();
@@ -19,7 +19,7 @@ struct FuncNull {
  }
 };

-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)

 //we really don't need any specializations and we don't need
 //to break things into uint32_t
@@ -180,7 +180,7 @@ static __device__ uint32_t addChar4(const uint32_t x, const uint32_t y) {
 template<>
 struct FuncSum<int8_t> {
  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
    int32_t rv, z=0;
@@ -198,7 +198,7 @@ struct FuncSum<int8_t> {
 template<>
 struct FuncSum<uint8_t> {
  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
    int32_t rv, z=0;
@@ -250,7 +250,7 @@ template<>
 struct FuncMax<int8_t> {
  union converter { uint32_t storage; char4 a; };
  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
    int32_t rv, z=0;
@@ -276,7 +276,7 @@ template<>
 struct FuncMax<uint8_t> {
  union converter { uint32_t storage; uchar4 a; };
  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
    int32_t rv, z=0;
@@ -303,7 +303,7 @@ template<>
 struct FuncMin<int8_t> {
  union converter { uint32_t storage; char4 a; };
  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
    int32_t rv, z=0;
@@ -329,7 +329,7 @@ template<>
 struct FuncMin<uint8_t> {
  union converter { uint32_t storage; uchar4 a; };
  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #else
 #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
    int32_t rv, z=0;
@@ -436,6 +436,6 @@ struct FuncMin<half> {
  }
 };

-#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#endif // defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)

 #endif // REDUCE_KERNEL_H_
@@ -12,7 +12,7 @@
 #include <stdint.h>

 // Convert volatile access to atomic
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST)
 #define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST)
 #else
@@ -9,7 +9,7 @@
 #define NCCL_RINGS_H_

 static int getDefaultThreads() {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
  return 256;
 #else  // On Kepler, rings are doubled later.
  return ncclCudaCompCap() == 3 ? 128 : 256;
@@ -20,7 +20,7 @@
 #include "checks.h"
 #include "enqueue.h"
 #include "topo.h"
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #include "nvlink_stub.h"
 #else
 #include "nvlink.h"
@@ -52,7 +52,7 @@ FILE *ncclDebugFile = stdout;
 std::chrono::high_resolution_clock::time_point ncclEpoch;
 #endif

-#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
 #else
 #define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
@@ -150,7 +150,7 @@ NCCL_PARAM(TreeThreshold, "TREE_THRESHOLD", 0);
 int ncclThreadThreshold(int minCompCap, int multiNode) {
  int threshold = ncclParamThreadThreshold();
  if (threshold == -2) { // user has not set this env variable
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
    threshold = NCCL_THREAD_THRESHOLD_VEGA;
 #else
    threshold = (minCompCap <= 6) ? NCCL_THREAD_THRESHOLD_PREVOLTA : NCCL_THREAD_THRESHOLD;
@@ -314,7 +314,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
  comm->llThreshold = ncclParamLlThreshold();
  comm->treeThreshold = ncclParamTreeThreshold();
  comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
-#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
  comm->groupCudaStream = ncclParamGroupCudaStream();
 #else
  // Don't allow the user to overload the default setting in older CUDA builds
@@ -355,7 +355,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
 }

 // Pre-process the string so that running "strings" on the lib can quickly reveal the version.
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+hip"
 #else
 #define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
@@ -383,7 +383,7 @@ static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank, uint64_t commH
  // NVML device number. Then we get the busID from NVML to be sure it is
  // consistent with NVML remote PCI bus Ids.
  CUDACHECK(hipDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #else
  nvmlDevice_t nvmlDevice;
  NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice));
@@ -171,7 +171,7 @@ static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankTo
  }
 }

-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #define DEFAULT_MIN_NRINGS 2
 #elif defined(__PPC__)
 // Make the default NCCL_MIN_NRINGS=4 for IBM/Power nodes
@@ -381,7 +381,7 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
    if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
    *nrings = maxNrings;
  } else {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
    int defaultMinNrings = 1;
 #else
    int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
@@ -197,7 +197,7 @@ ncclResult_t ncclIbPciPath(int dev, char** path) {
 ncclResult_t ncclIbGdrSupport(int ibDev) {
  static int moduleLoaded = -1;
  if (moduleLoaded == -1) {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
    moduleLoaded = (access("/sys/kernel/mm/memory_peers/amdkfd/version", F_OK) == -1) ? 0 : 1;
 #else
    moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1;
@@ -13,8 +13,10 @@
 #include <unistd.h>
 #include <hip/hip_runtime.h>
 #include <ctype.h>
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #include "nvlink_stub.h"
+#include <hsa/hsa.h>
+#include <hsa/hsa_ext_amd.h>
 #else
 #include "nvlink.h"
 #endif
@@ -110,7 +112,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
  }
  if (p2p == 0) return ncclSuccess;

-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
  uint32_t link_type, hops;
  if (hipExtGetLinkTypeAndHopCount(myInfo->cudaDev, peerInfo->cudaDev, &link_type, &hops) != hipSuccess) {
    p2p = 0;
@@ -297,7 +299,7 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin
  }

  // Duplicate the rings for direct NVLink
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__)
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
  compNrings = copyRings(nranks, rings, compNrings, compNrings*3);
 #else
  compNrings = copyRings(nranks, rings, compNrings, compNrings*2);