diff --git a/hipamd/include/hip/hcc_detail/device_functions.h b/hipamd/include/hip/hcc_detail/device_functions.h index 32509ffffd..716c51a887 100644 --- a/hipamd/include/hip/hcc_detail/device_functions.h +++ b/hipamd/include/hip/hcc_detail/device_functions.h @@ -752,6 +752,101 @@ void *__amdgcn_get_dynamicgroupbaseptr() { #endif // __HIP_DEVICE_COMPILE__ + +// hip.amdgcn.bc - sync threads +#define __CLK_LOCAL_MEM_FENCE 0x01 +typedef unsigned __cl_mem_fence_flags; + +typedef enum __memory_scope { + __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, + __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, + __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE, + __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES, + __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP +} __memory_scope; + +// enum values aligned with what clang uses in EmitAtomicExpr() +typedef enum __memory_order +{ + __memory_order_relaxed = __ATOMIC_RELAXED, + __memory_order_acquire = __ATOMIC_ACQUIRE, + __memory_order_release = __ATOMIC_RELEASE, + __memory_order_acq_rel = __ATOMIC_ACQ_REL, + __memory_order_seq_cst = __ATOMIC_SEQ_CST +} __memory_order; + +__device__ +inline +static void +__atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __memory_scope scope) +{ + // We're tying global-happens-before and local-happens-before together as does HSA + if (order != __memory_order_relaxed) { + switch (scope) { + case __memory_scope_work_item: + break; + case __memory_scope_sub_group: + switch (order) { + case __memory_order_relaxed: break; + case __memory_order_acquire: __llvm_fence_acq_sg(); break; + case __memory_order_release: __llvm_fence_rel_sg(); break; + case __memory_order_acq_rel: __llvm_fence_ar_sg(); break; + case __memory_order_seq_cst: __llvm_fence_sc_sg(); break; + } + break; + case __memory_scope_work_group: + switch (order) { + case __memory_order_relaxed: break; + case __memory_order_acquire: __llvm_fence_acq_wg(); break; + case __memory_order_release: __llvm_fence_rel_wg(); break; + case __memory_order_acq_rel: __llvm_fence_ar_wg(); break; + case __memory_order_seq_cst: __llvm_fence_sc_wg(); break; + } + break; + case __memory_scope_device: + switch (order) { + case __memory_order_relaxed: break; + case __memory_order_acquire: __llvm_fence_acq_dev(); break; + case __memory_order_release: __llvm_fence_rel_dev(); break; + case __memory_order_acq_rel: __llvm_fence_ar_dev(); break; + case __memory_order_seq_cst: __llvm_fence_sc_dev(); break; + } + break; + case __memory_scope_all_svm_devices: + switch (order) { + case __memory_order_relaxed: break; + case __memory_order_acquire: __llvm_fence_acq_sys(); break; + case __memory_order_release: __llvm_fence_rel_sys(); break; + case __memory_order_acq_rel: __llvm_fence_ar_sys(); break; + case __memory_order_seq_cst: __llvm_fence_sc_sys(); break; + } + break; + } + } +} + +// Memory Fence Functions +__device__ +inline +static void __threadfence() +{ + __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device); +} + +__device__ +inline +static void __threadfence_block() +{ + __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group); +} + +__device__ +inline +static void __threadfence_system() +{ + __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices); +} + #endif // __HCC_OR_HIP_CLANG__ #ifdef __HCC__ @@ -796,50 +891,14 @@ void __assertfail(const char * __assertion, __builtin_trap(); } -// hip.amdgcn.bc - sync threads -#define __CLK_LOCAL_MEM_FENCE 0x01 -typedef unsigned __cl_mem_fence_flags; - -typedef enum __memory_scope { - __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, - __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, - __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE, - __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES, - __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP -} __memory_scope; - -// enum values aligned with what clang uses in EmitAtomicExpr() -typedef enum __memory_order -{ - __memory_order_relaxed = __ATOMIC_RELAXED, - __memory_order_acquire = __ATOMIC_ACQUIRE, - __memory_order_release = __ATOMIC_RELEASE, - __memory_order_acq_rel = __ATOMIC_ACQ_REL, - __memory_order_seq_cst = __ATOMIC_SEQ_CST -} __memory_order; - __device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope) { if (flags) { - switch(scope) { - case __memory_scope_work_item: break; - case __memory_scope_sub_group: __llvm_fence_rel_sg(); break; - case __memory_scope_work_group: __llvm_fence_rel_wg(); break; - case __memory_scope_device: __llvm_fence_rel_dev(); break; - case __memory_scope_all_svm_devices: __llvm_fence_rel_sys(); break; - } - //atomic_work_item_fence(flags, memory_order_release, scope); + __atomic_work_item_fence(flags, __memory_order_release, scope); __builtin_amdgcn_s_barrier(); - //atomic_work_item_fence(flags, memory_order_acquire, scope); - switch(scope) { - case __memory_scope_work_item: break; - case __memory_scope_sub_group: __llvm_fence_acq_sg(); break; - case __memory_scope_work_group: __llvm_fence_acq_wg(); break; - case __memory_scope_device: __llvm_fence_acq_dev(); break; - case __memory_scope_all_svm_devices: __llvm_fence_acq_sys(); break; - } + __atomic_work_item_fence(flags, __memory_order_acquire, scope); } else { __builtin_amdgcn_s_barrier(); } @@ -918,4 +977,5 @@ unsigned __smid(void) #endif //defined(__clang__) && defined(__HIP__) + #endif diff --git a/hipamd/include/hip/hcc_detail/device_library_decls.h b/hipamd/include/hip/hcc_detail/device_library_decls.h index a636c2c950..82c39b24f0 100644 --- a/hipamd/include/hip/hcc_detail/device_library_decls.h +++ b/hipamd/include/hip/hcc_detail/device_library_decls.h @@ -63,4 +63,15 @@ extern "C" __device__ void __llvm_fence_rel_wg(void); extern "C" __device__ void __llvm_fence_rel_dev(void); extern "C" __device__ void __llvm_fence_rel_sys(void); +extern "C" __device__ void __llvm_fence_ar_sg(void); +extern "C" __device__ void __llvm_fence_ar_wg(void); +extern "C" __device__ void __llvm_fence_ar_dev(void); +extern "C" __device__ void __llvm_fence_ar_sys(void); + + +extern "C" __device__ void __llvm_fence_sc_sg(void); +extern "C" __device__ void __llvm_fence_sc_wg(void); +extern "C" __device__ void __llvm_fence_sc_dev(void); +extern "C" __device__ void __llvm_fence_sc_sys(void); + #endif diff --git a/hipamd/include/hip/hcc_detail/hip_runtime.h b/hipamd/include/hip/hcc_detail/hip_runtime.h index 68f2244014..3e0f2e27c5 100644 --- a/hipamd/include/hip/hcc_detail/hip_runtime.h +++ b/hipamd/include/hip/hcc_detail/hip_runtime.h @@ -204,81 +204,6 @@ __device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask __host__ __device__ int min(int arg1, int arg2); __host__ __device__ int max(int arg1, int arg2); - -/** - * CUDA 8 device function features - - */ - - -/** - * Kernel launching - */ - -/** - *------------------------------------------------------------------------------------------------- - *------------------------------------------------------------------------------------------------- - * @defgroup Fence Fence Functions - * @{ - * - * - * @warning The HIP memory fence functions are currently not supported yet. - * If any of those threadfence stubs are reached by the application, you should set "export - *HSA_DISABLE_CACHE=1" to disable L1 and L2 caches. - * - * - * On AMD platforms, the threadfence* routines are currently empty stubs. - */ - -extern __attribute__((const)) __device__ void __hip_hc_threadfence() __asm("__llvm_fence_sc_dev"); -extern __attribute__((const)) __device__ void __hip_hc_threadfence_block() __asm( - "__llvm_fence_sc_wg"); - - -/** - * @brief threadfence_block makes writes visible to threads running in same block. - * - * @Returns void - * - * @param void - * - * @warning __threadfence_block is a stub and map to no-op. - */ -// __device__ void __threadfence_block(void); -__device__ static inline void __threadfence_block(void) { return __hip_hc_threadfence_block(); } - -/** - * @brief threadfence makes wirtes visible to other threads running on same GPU. - * - * @Returns void - * - * @param void - * - * @warning __threadfence is a stub and map to no-op, application should set "export - * HSA_DISABLE_CACHE=1" to disable both L1 and L2 caches. - */ -// __device__ void __threadfence(void) __attribute__((deprecated("Provided for compile-time -// compatibility, not yet functional"))); -__device__ static inline void __threadfence(void) { return __hip_hc_threadfence(); } - -/** - * @brief threadfence_system makes writes to pinned system memory visible on host CPU. - * - * @Returns void - * - * @param void - * - * @warning __threadfence_system is a stub and map to no-op. - */ -//__device__ void __threadfence_system(void) __attribute__((deprecated("Provided with workaround -//configuration, see hip_kernel_language.md for details"))); -__device__ void __threadfence_system(void); - -// doxygen end Fence Fence -/** - * @} - */ - #endif // __HCC_OR_HIP_CLANG__ #if defined __HCC__ diff --git a/hipamd/src/device_util.cpp b/hipamd/src/device_util.cpp index 65ee5f4368..5107acd8c6 100644 --- a/hipamd/src/device_util.cpp +++ b/hipamd/src/device_util.cpp @@ -155,6 +155,3 @@ __host__ __device__ int max(int arg1, int arg2) { } __host__ void* __get_dynamicgroupbaseptr() { return nullptr; } - - -__device__ void __threadfence_system(void) { std::atomic_thread_fence(std::memory_order_seq_cst); } diff --git a/hipamd/src/device_util.h b/hipamd/src/device_util.h index 6603689d82..8fa96da9d9 100644 --- a/hipamd/src/device_util.h +++ b/hipamd/src/device_util.h @@ -125,7 +125,6 @@ __device__ double __hip_fast_dsqrt_rd(double x); __device__ double __hip_fast_dsqrt_rn(double x); __device__ double __hip_fast_dsqrt_ru(double x); __device__ double __hip_fast_dsqrt_rz(double x); -__device__ void __threadfence_system(void); float __hip_host_j0f(float x); double __hip_host_j0(double x);