Add hipclang amdgcn functions

These are moving from hipclang in device library to hip headers. These are required for the functionality of HIPclang project. [ROCm/hip commit: c453b42bff]
2018-06-12 22:05:59 +00:00
@@ -189,8 +189,16 @@ extern int HIP_TRACE_API;
 static constexpr int warpSize = 64;

 #define clock_t long long int
-__device__ long long int clock64();
-__device__ clock_t clock();
+__device__
+unsigned long __llvm_amdgcn_s_memrealtime(void) __asm("llvm.amdgcn.s.memrealtime");
+
+__device__
+inline
+long long int __clock64() { return (long long int)__llvm_amdgcn_s_memrealtime(); }
+
+__device__
+inline
+clock_t  __clock() { return (clock_t)__llvm_amdgcn_s_memrealtime(); }

 // abort
 __device__ void abort();
@@ -200,6 +208,45 @@ __device__ int __all(int input);
 __device__ int __any(int input);
 __device__ unsigned long long int __ballot(int input);

+__device__
+inline
+int64_t __ballot64(int a) {
+    int64_t s;
+    // define i64 @__ballot64(i32 %a) #0 {
+    //   %b = tail call i64 asm "v_cmp_ne_i32_e64 $0, 0, $1", "=s,v"(i32 %a) #1
+    //   ret i64 %b
+    // }
+    __asm("v_cmp_ne_i32_e64 $0, 0, $1" : "=s"(s) : "v"(a));
+    return s;
+}
+
+// hip.amdgcn.bc - lanemask
+extern "C" __device__ int32_t __ockl_activelane_u32(void);
+
+__device__
+inline
+int64_t __lanemask_gt()
+{
+    int32_t activelane = __ockl_activelane_u32();
+    int64_t ballot = __ballot64(1);
+    if (activelane != 63) {
+        int64_t tmp = (~0UL) << (activelane + 1);
+        return tmp & ballot;
+    }
+    return 0;
+}
+
+__device__
+inline
+int64_t __lanemask_lt()
+{
+    int32_t activelane = __ockl_activelane_u32();
+    int64_t ballot = __ballot64(1);
+    if (activelane == 0)
+        return 0;
+    return ballot;
+}
+
 #if __HIP_ARCH_GFX701__ == 0

 // warp shuffle functions
@@ -238,8 +285,11 @@ __device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask
 __host__ __device__ int min(int arg1, int arg2);
 __host__ __device__ int max(int arg1, int arg2);

-__device__ void* __get_dynamicgroupbaseptr();
+extern "C" __device__ void* get_dynamic_group_segment_base_pointer();

+__device__
+inline
+void* __get_dynamicgroupbaseptr() { return get_dynamic_group_segment_base_pointer(); }

 /**
 * CUDA 8 device function features
@@ -315,6 +365,11 @@ __device__ void __threadfence_system(void);
 * @}
 */

+// hip.amdgcn.bc - named sync
+__device__ void __llvm_amdgcn_s_barrier() __asm("llvm.amdgcn.s.barrier");
+
+__device__ inline void __named_sync(int a, int b) { __llvm_amdgcn_s_barrier(); }
+
 #endif  // __HCC_OR_HIP_CLANG__

 #if defined __HCC__
@@ -572,7 +627,9 @@ extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;

 __DEVICE__ void __device_trap() __asm("llvm.trap");

-__DEVICE__ void inline __assert_fail(const char * __assertion,
+__DEVICE__
+inline
+void __assert_fail(const char * __assertion,
                                     const char *__file,
                                     unsigned int __line,
                                     const char *__function)
@@ -581,8 +638,136 @@ __DEVICE__ void inline __assert_fail(const char * __assertion,
    __device_trap();
 }

-extern "C" __device__ __attribute__((noduplicate)) void __syncthreads();
-extern "C" __device__ void *__amdgcn_get_dynamicgroupbaseptr();
+__DEVICE__
+inline
+void __assertfail(const char * __assertion,
+                  const char *__file,
+                  unsigned int __line,
+                  const char *__function,
+                  size_t charsize)
+{
+    // ignore all the args for now.
+    __device_trap();
+}
+
+// hip.amdgcn.bc - sync threads
+// extern "C" __device__ __attribute__((noduplicate)) void __syncthreads();
+#define CLK_LOCAL_MEM_FENCE    0x01
+#define local __attribute__((address_space(3)))
+
+typedef unsigned cl_mem_fence_flags;
+
+typedef enum memory_scope {
+  memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
+  memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
+  memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
+  memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
+  memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
+} memory_scope;
+
+// enum values aligned with what clang uses in EmitAtomicExpr()
+typedef enum memory_order
+{
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+} memory_order;
+
+extern "C" __device__ __attribute__((overloadable))
+void atomic_work_item_fence(cl_mem_fence_flags, memory_order, memory_scope);
+
+__device__
+inline
+static void hc_work_group_barrier(cl_mem_fence_flags flags, memory_scope scope)
+{
+    if (flags) {
+        atomic_work_item_fence(flags, memory_order_release, scope);
+        __builtin_amdgcn_s_barrier();
+        atomic_work_item_fence(flags, memory_order_acquire, scope);
+    } else {
+        __builtin_amdgcn_s_barrier();
+    }
+}
+
+__device__
+inline
+static void hc_barrier(int n)
+{
+  hc_work_group_barrier((cl_mem_fence_flags)n, memory_scope_work_group);
+}
+
+__device__
+inline
+__attribute__((noduplicate))
+void __syncthreads()
+{
+  hc_barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+
+__device__ unsigned __llvm_amdgcn_s_getreg(unsigned) __asm("llvm.amdgcn.s.getreg");
+
+__device__ unsigned __llvm_amdgcn_groupstaticsize() __asm("llvm.amdgcn.groupstaticsize");
+
+__device__ inline static local char* __to_local(unsigned x) { return (local char*)x; }
+
+__device__ inline void *__amdgcn_get_dynamicgroupbaseptr() {
+#if 0
+    // Get group segment base pointer.
+    char* base = __llvm_amdgcn_s_getreg(14342) << 8);
+    base += __llvm_amdgcn_groupstaticsize();
+    return base;
+#endif
+    return __get_dynamicgroupbaseptr();
+}
+
+// hip.amdgcn.bc - device routine
+/*
+   HW_ID Register bit structure
+   WAVE_ID     3:0     Wave buffer slot number. 0-9.
+   SIMD_ID     5:4     SIMD which the wave is assigned to within the CU.
+   PIPE_ID     7:6     Pipeline from which the wave was dispatched.
+   CU_ID       11:8    Compute Unit the wave is assigned to.
+   SH_ID       12      Shader Array (within an SE) the wave is assigned to.
+   SE_ID       14:13   Shader Engine the wave is assigned to.
+   TG_ID       19:16   Thread-group ID
+   VM_ID       23:20   Virtual Memory ID
+   QUEUE_ID    26:24   Queue from which this wave was dispatched.
+   STATE_ID    29:27   State ID (graphics only, not compute).
+   ME_ID       31:30   Micro-engine ID.
+ */
+
+#define HW_ID               4
+
+#define HW_ID_CU_ID_SIZE    4
+#define HW_ID_CU_ID_OFFSET  8
+
+#define HW_ID_SE_ID_SIZE    2
+#define HW_ID_SE_ID_OFFSET  13
+
+/*
+   Encoding of parameter bitmask
+   HW_ID        5:0     HW_ID
+   OFFSET       10:6    Range: 0..31
+   SIZE         15:11   Range: 1..32
+ */
+
+#define GETREG_IMMED(SZ,OFF,REG) (SZ << 11) | (OFF << 6) | REG
+
+__device__
+inline
+unsigned __smid(void)
+{
+    unsigned cu_id = __llvm_amdgcn_s_getreg(
+            GETREG_IMMED(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID));
+    unsigned se_id = __llvm_amdgcn_s_getreg(
+            GETREG_IMMED(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID));
+
+    /* Each shader engine has 16 CU */
+    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
+}

 // Macro to replace extern __shared__ declarations
 // to local variable definitions
@@ -144,9 +144,6 @@ __device__ void* __hip_hc_memset(void* dst, uint8_t val, size_t size) {
    return dst;
 }

-__device__ long long int clock64() { return (long long int)hc::__cycle_u64(); };
-__device__ clock_t clock() { return (clock_t)hc::__cycle_u64(); };
-
 // abort
 __device__ void abort() { return hc::abort(); }

@@ -203,11 +200,7 @@ __host__ __device__ int max(int arg1, int arg2) {
    return (int)(hc::precise_math::fmax((float)arg1, (float)arg2));
 }

-__device__ void* __get_dynamicgroupbaseptr() {
-    return hc::get_dynamic_group_segment_base_pointer();
-}
-
 __host__ void* __get_dynamicgroupbaseptr() { return nullptr; }


-__device__ void __threadfence_system(void) { std::atomic_thread_fence(std::memory_order_seq_cst); }
+__device__ void __threadfence_system(void) { std::atomic_thread_fence(std::memory_order_seq_cst); }