From b34868005a54feee2d2952585851f8be4c07d2cf Mon Sep 17 00:00:00 2001 From: Aaron Enye Shi Date: Tue, 12 Jun 2018 22:05:59 +0000 Subject: [PATCH] Add hipclang amdgcn functions These are moving from hipclang in device library to hip headers. These are required for the functionality of HIPclang project. [ROCm/hip commit: c453b42bff45800ab04e10231dcf63a6acddc876] --- .../hip/include/hip/hcc_detail/hip_runtime.h | 197 +++++++++++++++++- projects/hip/src/device_util.cpp | 9 +- 2 files changed, 192 insertions(+), 14 deletions(-) diff --git a/projects/hip/include/hip/hcc_detail/hip_runtime.h b/projects/hip/include/hip/hcc_detail/hip_runtime.h index c62c85df64..34650d728a 100644 --- a/projects/hip/include/hip/hcc_detail/hip_runtime.h +++ b/projects/hip/include/hip/hcc_detail/hip_runtime.h @@ -189,8 +189,16 @@ extern int HIP_TRACE_API; static constexpr int warpSize = 64; #define clock_t long long int -__device__ long long int clock64(); -__device__ clock_t clock(); +__device__ +unsigned long __llvm_amdgcn_s_memrealtime(void) __asm("llvm.amdgcn.s.memrealtime"); + +__device__ +inline +long long int __clock64() { return (long long int)__llvm_amdgcn_s_memrealtime(); } + +__device__ +inline +clock_t __clock() { return (clock_t)__llvm_amdgcn_s_memrealtime(); } // abort __device__ void abort(); @@ -200,6 +208,45 @@ __device__ int __all(int input); __device__ int __any(int input); __device__ unsigned long long int __ballot(int input); +__device__ +inline +int64_t __ballot64(int a) { + int64_t s; + // define i64 @__ballot64(i32 %a) #0 { + // %b = tail call i64 asm "v_cmp_ne_i32_e64 $0, 0, $1", "=s,v"(i32 %a) #1 + // ret i64 %b + // } + __asm("v_cmp_ne_i32_e64 $0, 0, $1" : "=s"(s) : "v"(a)); + return s; +} + +// hip.amdgcn.bc - lanemask +extern "C" __device__ int32_t __ockl_activelane_u32(void); + +__device__ +inline +int64_t __lanemask_gt() +{ + int32_t activelane = __ockl_activelane_u32(); + int64_t ballot = __ballot64(1); + if (activelane != 63) { + int64_t tmp = (~0UL) << (activelane + 1); + return tmp & ballot; + } + return 0; +} + +__device__ +inline +int64_t __lanemask_lt() +{ + int32_t activelane = __ockl_activelane_u32(); + int64_t ballot = __ballot64(1); + if (activelane == 0) + return 0; + return ballot; +} + #if __HIP_ARCH_GFX701__ == 0 // warp shuffle functions @@ -238,8 +285,11 @@ __device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask __host__ __device__ int min(int arg1, int arg2); __host__ __device__ int max(int arg1, int arg2); -__device__ void* __get_dynamicgroupbaseptr(); +extern "C" __device__ void* get_dynamic_group_segment_base_pointer(); +__device__ +inline +void* __get_dynamicgroupbaseptr() { return get_dynamic_group_segment_base_pointer(); } /** * CUDA 8 device function features @@ -315,6 +365,11 @@ __device__ void __threadfence_system(void); * @} */ +// hip.amdgcn.bc - named sync +__device__ void __llvm_amdgcn_s_barrier() __asm("llvm.amdgcn.s.barrier"); + +__device__ inline void __named_sync(int a, int b) { __llvm_amdgcn_s_barrier(); } + #endif // __HCC_OR_HIP_CLANG__ #if defined __HCC__ @@ -572,7 +627,9 @@ extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim; __DEVICE__ void __device_trap() __asm("llvm.trap"); -__DEVICE__ void inline __assert_fail(const char * __assertion, +__DEVICE__ +inline +void __assert_fail(const char * __assertion, const char *__file, unsigned int __line, const char *__function) @@ -581,8 +638,136 @@ __DEVICE__ void inline __assert_fail(const char * __assertion, __device_trap(); } -extern "C" __device__ __attribute__((noduplicate)) void __syncthreads(); -extern "C" __device__ void *__amdgcn_get_dynamicgroupbaseptr(); +__DEVICE__ +inline +void __assertfail(const char * __assertion, + const char *__file, + unsigned int __line, + const char *__function, + size_t charsize) +{ + // ignore all the args for now. + __device_trap(); +} + +// hip.amdgcn.bc - sync threads +// extern "C" __device__ __attribute__((noduplicate)) void __syncthreads(); +#define CLK_LOCAL_MEM_FENCE 0x01 +#define local __attribute__((address_space(3))) + +typedef unsigned cl_mem_fence_flags; + +typedef enum memory_scope { + memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, + memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, + memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE, + memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES, + memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP +} memory_scope; + +// enum values aligned with what clang uses in EmitAtomicExpr() +typedef enum memory_order +{ + memory_order_relaxed = __ATOMIC_RELAXED, + memory_order_acquire = __ATOMIC_ACQUIRE, + memory_order_release = __ATOMIC_RELEASE, + memory_order_acq_rel = __ATOMIC_ACQ_REL, + memory_order_seq_cst = __ATOMIC_SEQ_CST +} memory_order; + +extern "C" __device__ __attribute__((overloadable)) +void atomic_work_item_fence(cl_mem_fence_flags, memory_order, memory_scope); + +__device__ +inline +static void hc_work_group_barrier(cl_mem_fence_flags flags, memory_scope scope) +{ + if (flags) { + atomic_work_item_fence(flags, memory_order_release, scope); + __builtin_amdgcn_s_barrier(); + atomic_work_item_fence(flags, memory_order_acquire, scope); + } else { + __builtin_amdgcn_s_barrier(); + } +} + +__device__ +inline +static void hc_barrier(int n) +{ + hc_work_group_barrier((cl_mem_fence_flags)n, memory_scope_work_group); +} + +__device__ +inline +__attribute__((noduplicate)) +void __syncthreads() +{ + hc_barrier(CLK_LOCAL_MEM_FENCE); +} + + +__device__ unsigned __llvm_amdgcn_s_getreg(unsigned) __asm("llvm.amdgcn.s.getreg"); + +__device__ unsigned __llvm_amdgcn_groupstaticsize() __asm("llvm.amdgcn.groupstaticsize"); + +__device__ inline static local char* __to_local(unsigned x) { return (local char*)x; } + +__device__ inline void *__amdgcn_get_dynamicgroupbaseptr() { +#if 0 + // Get group segment base pointer. + char* base = __llvm_amdgcn_s_getreg(14342) << 8); + base += __llvm_amdgcn_groupstaticsize(); + return base; +#endif + return __get_dynamicgroupbaseptr(); +} + +// hip.amdgcn.bc - device routine +/* + HW_ID Register bit structure + WAVE_ID 3:0 Wave buffer slot number. 0-9. + SIMD_ID 5:4 SIMD which the wave is assigned to within the CU. + PIPE_ID 7:6 Pipeline from which the wave was dispatched. + CU_ID 11:8 Compute Unit the wave is assigned to. + SH_ID 12 Shader Array (within an SE) the wave is assigned to. + SE_ID 14:13 Shader Engine the wave is assigned to. + TG_ID 19:16 Thread-group ID + VM_ID 23:20 Virtual Memory ID + QUEUE_ID 26:24 Queue from which this wave was dispatched. + STATE_ID 29:27 State ID (graphics only, not compute). + ME_ID 31:30 Micro-engine ID. + */ + +#define HW_ID 4 + +#define HW_ID_CU_ID_SIZE 4 +#define HW_ID_CU_ID_OFFSET 8 + +#define HW_ID_SE_ID_SIZE 2 +#define HW_ID_SE_ID_OFFSET 13 + +/* + Encoding of parameter bitmask + HW_ID 5:0 HW_ID + OFFSET 10:6 Range: 0..31 + SIZE 15:11 Range: 1..32 + */ + +#define GETREG_IMMED(SZ,OFF,REG) (SZ << 11) | (OFF << 6) | REG + +__device__ +inline +unsigned __smid(void) +{ + unsigned cu_id = __llvm_amdgcn_s_getreg( + GETREG_IMMED(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID)); + unsigned se_id = __llvm_amdgcn_s_getreg( + GETREG_IMMED(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID)); + + /* Each shader engine has 16 CU */ + return (se_id << HW_ID_CU_ID_SIZE) + cu_id; +} // Macro to replace extern __shared__ declarations // to local variable definitions diff --git a/projects/hip/src/device_util.cpp b/projects/hip/src/device_util.cpp index 613e35f0cc..a3386ba14d 100644 --- a/projects/hip/src/device_util.cpp +++ b/projects/hip/src/device_util.cpp @@ -144,9 +144,6 @@ __device__ void* __hip_hc_memset(void* dst, uint8_t val, size_t size) { return dst; } -__device__ long long int clock64() { return (long long int)hc::__cycle_u64(); }; -__device__ clock_t clock() { return (clock_t)hc::__cycle_u64(); }; - // abort __device__ void abort() { return hc::abort(); } @@ -203,11 +200,7 @@ __host__ __device__ int max(int arg1, int arg2) { return (int)(hc::precise_math::fmax((float)arg1, (float)arg2)); } -__device__ void* __get_dynamicgroupbaseptr() { - return hc::get_dynamic_group_segment_base_pointer(); -} - __host__ void* __get_dynamicgroupbaseptr() { return nullptr; } -__device__ void __threadfence_system(void) { std::atomic_thread_fence(std::memory_order_seq_cst); } \ No newline at end of file +__device__ void __threadfence_system(void) { std::atomic_thread_fence(std::memory_order_seq_cst); }