Move hipclang funcs into corresponding headers

2018-06-15 23:06:40 +00:00
@@ -24,8 +24,12 @@ THE SOFTWARE.
 #define HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H

 #include "host_defines.h"
+#include "math_fwd.h"

+#include <hip/hip_runtime_api.h>
 #include <hip/hip_vector_types.h>
+#include <hip/hcc_detail/device_library_decls.h>
+#include <hip/hcc_detail/llvm_intrinsics.h>

 extern "C" __device__ unsigned int __hip_hc_ir_umul24_int(unsigned int, unsigned int);
 extern "C" __device__ signed int __hip_hc_ir_mul24_int(signed int, signed int);
@@ -209,5 +213,257 @@ __device__ char4 __hip_hc_add8pk(char4, char4);
 __device__ char4 __hip_hc_sub8pk(char4, char4);
 __device__ char4 __hip_hc_mul8pk(char4, char4);

+#if defined(__HCC__)
+#define __HCC_OR_HIP_CLANG__ 1
+#elif defined(__clang__) && defined(__HIP__)
+#define __HCC_OR_HIP_CLANG__ 1
+#else
+#define __HCC_OR_HIP_CLANG__ 0
+#endif
+
+#ifdef __HCC_OR_HIP_CLANG__
+
+#ifdef __HIP_DEVICE_COMPILE__
+
+// Clock functions
+__device__
+inline
+long long int __clock64() { return (long long int) __builtin_amdgcn_s_memrealtime(); }
+
+__device__
+inline
+long long int  __clock() { return (long long int) __builtin_amdgcn_s_memrealtime(); }
+
+// hip.amdgcn.bc - named sync
+__device__
+inline
+void __named_sync(int a, int b) { __builtin_amdgcn_s_barrier(); }
+
+#endif // __HIP_DEVICE_COMPILE__
+
+// warp vote function __all __any __ballot
+__device__
+int __all(int input);
+__device__
+int __any(int input);
+__device__
+unsigned long long int __ballot(int input);
+
+__device__
+inline
+uint64_t __ballot64(int a) {
+    int64_t s;
+    // define i64 @__ballot64(i32 %a) #0 {
+    //   %b = tail call i64 asm "v_cmp_ne_i32_e64 $0, 0, $1", "=s,v"(i32 %a) #1
+    //   ret i64 %b
+    // }
+    __asm("v_cmp_ne_i32_e64 $0, 0, $1" : "=s"(s) : "v"(a));
+    return s;
+}
+
+// hip.amdgcn.bc - lanemask
+__device__
+inline
+int64_t  __lanemask_gt()
+{
+    int32_t activelane = __ockl_activelane_u32();
+    int64_t ballot = __ballot64(1);
+    if (activelane != 63) {
+        int64_t tmp = (~0UL) << (activelane + 1);
+        return tmp & ballot;
+    }
+    return 0;
+}
+
+__device__
+inline
+int64_t __lanemask_lt()
+{
+    int32_t activelane = __ockl_activelane_u32();
+    int64_t ballot = __ballot64(1);
+    if (activelane == 0)
+        return 0;
+    return ballot;
+}
+
+__device__
+inline
+void* __get_dynamicgroupbaseptr()
+{
+    // Get group segment base pointer.
+    return (char*)__local_to_generic(__to_local(__llvm_amdgcn_groupstaticsize()));
+}
+
+__device__
+inline
+void *__amdgcn_get_dynamicgroupbaseptr() {
+    return __get_dynamicgroupbaseptr();
+}
+
+#endif // __HCC_OR_HIP_CLANG__
+
+#ifdef __HCC__
+
+/**
+ * extern __shared__
+ */
+
+// Macro to replace extern __shared__ declarations
+// to local variable definitions
+#define HIP_DYNAMIC_SHARED(type, var) type* var = (type*)__get_dynamicgroupbaseptr();
+
+#define HIP_DYNAMIC_SHARED_ATTRIBUTE
+
+
+#elif defined(__clang__) && defined(__HIP__)
+
+#pragma push_macro("__DEVICE__")
+#define __DEVICE__ extern "C" __device__ __attribute__((always_inline)) \
+  __attribute__((weak))
+
+__DEVICE__
+inline
+void __assert_fail(const char * __assertion,
+                                     const char *__file,
+                                     unsigned int __line,
+                                     const char *__function)
+{
+    // Ignore all the args for now.
+    __builtin_trap();
+}
+
+__DEVICE__
+inline
+void __assertfail(const char * __assertion,
+                  const char *__file,
+                  unsigned int __line,
+                  const char *__function,
+                  size_t charsize)
+{
+    // ignore all the args for now.
+    __builtin_trap();
+}
+
+// hip.amdgcn.bc - sync threads
+#define __CLK_LOCAL_MEM_FENCE    0x01
+typedef unsigned __cl_mem_fence_flags;
+
+typedef enum __memory_scope {
+  __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
+  __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
+  __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
+  __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
+  __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
+} __memory_scope;
+
+// enum values aligned with what clang uses in EmitAtomicExpr()
+typedef enum __memory_order
+{
+  __memory_order_relaxed = __ATOMIC_RELAXED,
+  __memory_order_acquire = __ATOMIC_ACQUIRE,
+  __memory_order_release = __ATOMIC_RELEASE,
+  __memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  __memory_order_seq_cst = __ATOMIC_SEQ_CST
+} __memory_order;
+
+__device__
+inline
+static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
+{
+    if (flags) {
+        switch(scope) {
+          case __memory_scope_work_item:        break;
+          case __memory_scope_sub_group:        __llvm_fence_rel_sg(); break;
+          case __memory_scope_work_group:       __llvm_fence_rel_wg(); break;
+          case __memory_scope_device:           __llvm_fence_rel_dev(); break;
+          case __memory_scope_all_svm_devices:  __llvm_fence_rel_sys(); break;
+        }
+        //atomic_work_item_fence(flags, memory_order_release, scope);
+        __builtin_amdgcn_s_barrier();
+        //atomic_work_item_fence(flags, memory_order_acquire, scope);
+        switch(scope) {
+          case __memory_scope_work_item:        break;
+          case __memory_scope_sub_group:        __llvm_fence_acq_sg(); break;
+          case __memory_scope_work_group:       __llvm_fence_acq_wg(); break;
+          case __memory_scope_device:           __llvm_fence_acq_dev(); break;
+          case __memory_scope_all_svm_devices:  __llvm_fence_acq_sys(); break;
+        }
+    } else {
+        __builtin_amdgcn_s_barrier();
+    }
+}
+
+__device__
+inline
+static void __barrier(int n)
+{
+  __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
+}
+
+__device__
+inline
+__attribute__((noduplicate))
+void __syncthreads()
+{
+  __barrier(__CLK_LOCAL_MEM_FENCE);
+}
+
+// hip.amdgcn.bc - device routine
+/*
+   HW_ID Register bit structure
+   WAVE_ID     3:0     Wave buffer slot number. 0-9.
+   SIMD_ID     5:4     SIMD which the wave is assigned to within the CU.
+   PIPE_ID     7:6     Pipeline from which the wave was dispatched.
+   CU_ID       11:8    Compute Unit the wave is assigned to.
+   SH_ID       12      Shader Array (within an SE) the wave is assigned to.
+   SE_ID       14:13   Shader Engine the wave is assigned to.
+   TG_ID       19:16   Thread-group ID
+   VM_ID       23:20   Virtual Memory ID
+   QUEUE_ID    26:24   Queue from which this wave was dispatched.
+   STATE_ID    29:27   State ID (graphics only, not compute).
+   ME_ID       31:30   Micro-engine ID.
+ */
+
+#define HW_ID               4
+
+#define HW_ID_CU_ID_SIZE    4
+#define HW_ID_CU_ID_OFFSET  8
+
+#define HW_ID_SE_ID_SIZE    2
+#define HW_ID_SE_ID_OFFSET  13
+
+/*
+   Encoding of parameter bitmask
+   HW_ID        5:0     HW_ID
+   OFFSET       10:6    Range: 0..31
+   SIZE         15:11   Range: 1..32
+ */
+
+#define GETREG_IMMED(SZ,OFF,REG) (SZ << 11) | (OFF << 6) | REG
+
+__device__
+inline
+unsigned __smid(void)
+{
+    unsigned cu_id = __builtin_amdgcn_s_getreg(
+            GETREG_IMMED(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID));
+    unsigned se_id = __builtin_amdgcn_s_getreg(
+            GETREG_IMMED(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID));
+
+    /* Each shader engine has 16 CU */
+    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
+}
+
+#pragma push_macro("__DEVICE__")
+
+// Macro to replace extern __shared__ declarations
+// to local variable definitions
+#define HIP_DYNAMIC_SHARED(type, var) \
+    type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();
+
+#define HIP_DYNAMIC_SHARED_ATTRIBUTE
+
+
+#endif //defined(__clang__) && defined(__HIP__)

 #endif
@@ -62,8 +62,6 @@ THE SOFTWARE.
 #define CUDA_SUCCESS hipSuccess

 #include <hip/hip_runtime_api.h>
-#include <hip/hcc_detail/device_library_decls.h>
-#include <hip/hcc_detail/llvm_intrinsics.h>
 #endif  // __HCC_OR_HIP_CLANG__

 #if __HCC__
@@ -190,60 +188,9 @@ extern int HIP_TRACE_API;
 //#define warpSize hc::__wavesize()
 static constexpr int warpSize = 64;

-#define clock_t long long int
-__device__
-inline
-long long int __clock64() { return (long long int)__llvm_amdgcn_s_memrealtime(); }
-
-__device__
-inline
-clock_t  __clock() { return (clock_t)__llvm_amdgcn_s_memrealtime(); }
-
 // abort
 __device__ void abort();

-// warp vote function __all __any __ballot
-__device__ int __all(int input);
-__device__ int __any(int input);
-__device__ unsigned long long int __ballot(int input);
-
-__device__
-inline
-int64_t __ballot64(int a) {
-    int64_t s;
-    // define i64 @__ballot64(i32 %a) #0 {
-    //   %b = tail call i64 asm "v_cmp_ne_i32_e64 $0, 0, $1", "=s,v"(i32 %a) #1
-    //   ret i64 %b
-    // }
-    __asm("v_cmp_ne_i32_e64 $0, 0, $1" : "=s"(s) : "v"(a));
-    return s;
-}
-
-// hip.amdgcn.bc - lanemask
-__device__
-inline
-int64_t __lanemask_gt()
-{
-    int32_t activelane = __ockl_activelane_u32();
-    int64_t ballot = __ballot64(1);
-    if (activelane != 63) {
-        int64_t tmp = (~0UL) << (activelane + 1);
-        return tmp & ballot;
-    }
-    return 0;
-}
-
-__device__
-inline
-int64_t __lanemask_lt()
-{
-    int32_t activelane = __ockl_activelane_u32();
-    int64_t ballot = __ballot64(1);
-    if (activelane == 0)
-        return 0;
-    return ballot;
-}
-
 #if __HIP_ARCH_GFX701__ == 0

 // warp shuffle functions
@@ -283,20 +230,6 @@ __host__ __device__ int min(int arg1, int arg2);
 __host__ __device__ int max(int arg1, int arg2);


-__device__
-inline
-void* __get_dynamicgroupbaseptr()
-{
-    // Get group segment base pointer.
-    return (char*)__local_to_generic(__to_local(__llvm_amdgcn_groupstaticsize()));
-}
-
-__device__
-inline
-void *__amdgcn_get_dynamicgroupbaseptr() {
-    return __get_dynamicgroupbaseptr();
-}
-
 /**
 * CUDA 8 device function features

@@ -371,9 +304,6 @@ __device__ void __threadfence_system(void);
 * @}
 */

-// hip.amdgcn.bc - named sync
-__device__ inline void __named_sync(int a, int b) { __llvm_amdgcn_s_barrier(); }
-
 #endif  // __HCC_OR_HIP_CLANG__

 #if defined __HCC__
@@ -496,17 +426,6 @@ extern void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, gri

 #endif  //__HCC_CPP__

-/**
- * extern __shared__
- */
-
-// Macro to replace extern __shared__ declarations
-// to local variable definitions
-#define HIP_DYNAMIC_SHARED(type, var) type* var = (type*)__get_dynamicgroupbaseptr();
-
-#define HIP_DYNAMIC_SHARED_ATTRIBUTE
-
-
 /**
 * @defgroup HIP-ENV HIP Environment Variables
 * @{
@@ -625,154 +544,6 @@ extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
 #define hipGridDim_y gridDim.y
 #define hipGridDim_z gridDim.z

-#pragma push_macro("__DEVICE__")
-#define __DEVICE__ extern "C" __device__ __attribute__((always_inline)) \
-  __attribute__((weak))
-
-__DEVICE__ void __device_trap() __asm("llvm.trap");
-
-__DEVICE__
-inline
-void __assert_fail(const char * __assertion,
-                                     const char *__file,
-                                     unsigned int __line,
-                                     const char *__function)
-{
-    // Ignore all the args for now.
-    __device_trap();
-}
-
-__DEVICE__
-inline
-void __assertfail(const char * __assertion,
-                  const char *__file,
-                  unsigned int __line,
-                  const char *__function,
-                  size_t charsize)
-{
-    // ignore all the args for now.
-    __device_trap();
-}
-
-// hip.amdgcn.bc - sync threads
-#define __CLK_LOCAL_MEM_FENCE    0x01
-typedef unsigned __cl_mem_fence_flags;
-
-typedef enum __memory_scope {
-  __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
-  __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
-  __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
-  __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
-  __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
-} __memory_scope;
-
-// enum values aligned with what clang uses in EmitAtomicExpr()
-typedef enum __memory_order
-{
-  __memory_order_relaxed = __ATOMIC_RELAXED,
-  __memory_order_acquire = __ATOMIC_ACQUIRE,
-  __memory_order_release = __ATOMIC_RELEASE,
-  __memory_order_acq_rel = __ATOMIC_ACQ_REL,
-  __memory_order_seq_cst = __ATOMIC_SEQ_CST
-} __memory_order;
-
-__device__
-inline
-static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
-{
-    if (flags) {
-        switch(scope) {
-          case __memory_scope_work_item:        break;
-          case __memory_scope_sub_group:        __llvm_fence_rel_sg(); break;
-          case __memory_scope_work_group:       __llvm_fence_rel_wg(); break;
-          case __memory_scope_device:           __llvm_fence_rel_dev(); break;
-          case __memory_scope_all_svm_devices:  __llvm_fence_rel_sys(); break;
-        }
-        //atomic_work_item_fence(flags, memory_order_release, scope);
-        __builtin_amdgcn_s_barrier();
-        //atomic_work_item_fence(flags, memory_order_acquire, scope);
-        switch(scope) {
-          case __memory_scope_work_item:        break;
-          case __memory_scope_sub_group:        __llvm_fence_acq_sg(); break;
-          case __memory_scope_work_group:       __llvm_fence_acq_wg(); break;
-          case __memory_scope_device:           __llvm_fence_acq_dev(); break;
-          case __memory_scope_all_svm_devices:  __llvm_fence_acq_sys(); break;
-        }
-    } else {
-        __builtin_amdgcn_s_barrier();
-    }
-}
-
-__device__
-inline
-static void __barrier(int n)
-{
-  __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
-}
-
-__device__
-inline
-__attribute__((noduplicate))
-void __syncthreads()
-{
-  __barrier(__CLK_LOCAL_MEM_FENCE);
-}
-
-// hip.amdgcn.bc - device routine
-/*
-   HW_ID Register bit structure
-   WAVE_ID     3:0     Wave buffer slot number. 0-9.
-   SIMD_ID     5:4     SIMD which the wave is assigned to within the CU.
-   PIPE_ID     7:6     Pipeline from which the wave was dispatched.
-   CU_ID       11:8    Compute Unit the wave is assigned to.
-   SH_ID       12      Shader Array (within an SE) the wave is assigned to.
-   SE_ID       14:13   Shader Engine the wave is assigned to.
-   TG_ID       19:16   Thread-group ID
-   VM_ID       23:20   Virtual Memory ID
-   QUEUE_ID    26:24   Queue from which this wave was dispatched.
-   STATE_ID    29:27   State ID (graphics only, not compute).
-   ME_ID       31:30   Micro-engine ID.
- */
-
-#define HW_ID               4
-
-#define HW_ID_CU_ID_SIZE    4
-#define HW_ID_CU_ID_OFFSET  8
-
-#define HW_ID_SE_ID_SIZE    2
-#define HW_ID_SE_ID_OFFSET  13
-
-/*
-   Encoding of parameter bitmask
-   HW_ID        5:0     HW_ID
-   OFFSET       10:6    Range: 0..31
-   SIZE         15:11   Range: 1..32
- */
-
-#define GETREG_IMMED(SZ,OFF,REG) (SZ << 11) | (OFF << 6) | REG
-
-__device__
-inline
-unsigned __smid(void)
-{
-    unsigned cu_id = __llvm_amdgcn_s_getreg(
-            GETREG_IMMED(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID));
-    unsigned se_id = __llvm_amdgcn_s_getreg(
-            GETREG_IMMED(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID));
-
-    /* Each shader engine has 16 CU */
-    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
-}
-
-// Macro to replace extern __shared__ declarations
-// to local variable definitions
-#define HIP_DYNAMIC_SHARED(type, var) \
-    type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();
-
-#define HIP_DYNAMIC_SHARED_ATTRIBUTE
-
-#pragma push_macro("__DEVICE__")
-
 #include <hip/hcc_detail/math_functions.h>

 #endif
@@ -31,16 +31,7 @@ THE SOFTWARE.

 #include "hip/hcc_detail/host_defines.h"

-__device__
-unsigned long __llvm_amdgcn_s_memrealtime(void) __asm("llvm.amdgcn.s.memrealtime");
-
-__device__
-unsigned __llvm_amdgcn_s_getreg(unsigned) __asm("llvm.amdgcn.s.getreg");
-
 __device__
 unsigned __llvm_amdgcn_groupstaticsize() __asm("llvm.amdgcn.groupstaticsize");

-__device__
-void __llvm_amdgcn_s_barrier() __asm("llvm.amdgcn.s.barrier");
-
 #endif