Implement Memory Fence Functions in header

Enabled __llvm_fence_* functions for seq_cst.
Dieser Commit ist enthalten in:
Aaron Enye Shi
2018-07-04 23:13:35 +00:00
Ursprung ce9ca744dc
Commit 96ab7c7b25
5 geänderte Dateien mit 109 neuen und 117 gelöschten Zeilen
@@ -752,6 +752,101 @@ void *__amdgcn_get_dynamicgroupbaseptr() {
#endif // __HIP_DEVICE_COMPILE__
// hip.amdgcn.bc - sync threads
#define __CLK_LOCAL_MEM_FENCE 0x01
typedef unsigned __cl_mem_fence_flags;
typedef enum __memory_scope {
__memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
__memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
__memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
__memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
__memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
} __memory_scope;
// enum values aligned with what clang uses in EmitAtomicExpr()
typedef enum __memory_order
{
__memory_order_relaxed = __ATOMIC_RELAXED,
__memory_order_acquire = __ATOMIC_ACQUIRE,
__memory_order_release = __ATOMIC_RELEASE,
__memory_order_acq_rel = __ATOMIC_ACQ_REL,
__memory_order_seq_cst = __ATOMIC_SEQ_CST
} __memory_order;
__device__
inline
static void
__atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __memory_scope scope)
{
// We're tying global-happens-before and local-happens-before together as does HSA
if (order != __memory_order_relaxed) {
switch (scope) {
case __memory_scope_work_item:
break;
case __memory_scope_sub_group:
switch (order) {
case __memory_order_relaxed: break;
case __memory_order_acquire: __llvm_fence_acq_sg(); break;
case __memory_order_release: __llvm_fence_rel_sg(); break;
case __memory_order_acq_rel: __llvm_fence_ar_sg(); break;
case __memory_order_seq_cst: __llvm_fence_sc_sg(); break;
}
break;
case __memory_scope_work_group:
switch (order) {
case __memory_order_relaxed: break;
case __memory_order_acquire: __llvm_fence_acq_wg(); break;
case __memory_order_release: __llvm_fence_rel_wg(); break;
case __memory_order_acq_rel: __llvm_fence_ar_wg(); break;
case __memory_order_seq_cst: __llvm_fence_sc_wg(); break;
}
break;
case __memory_scope_device:
switch (order) {
case __memory_order_relaxed: break;
case __memory_order_acquire: __llvm_fence_acq_dev(); break;
case __memory_order_release: __llvm_fence_rel_dev(); break;
case __memory_order_acq_rel: __llvm_fence_ar_dev(); break;
case __memory_order_seq_cst: __llvm_fence_sc_dev(); break;
}
break;
case __memory_scope_all_svm_devices:
switch (order) {
case __memory_order_relaxed: break;
case __memory_order_acquire: __llvm_fence_acq_sys(); break;
case __memory_order_release: __llvm_fence_rel_sys(); break;
case __memory_order_acq_rel: __llvm_fence_ar_sys(); break;
case __memory_order_seq_cst: __llvm_fence_sc_sys(); break;
}
break;
}
}
}
// Memory Fence Functions
__device__
inline
static void __threadfence()
{
__atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
}
__device__
inline
static void __threadfence_block()
{
__atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
}
__device__
inline
static void __threadfence_system()
{
__atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
}
#endif // __HCC_OR_HIP_CLANG__
#ifdef __HCC__
@@ -796,50 +891,14 @@ void __assertfail(const char * __assertion,
__builtin_trap();
}
// hip.amdgcn.bc - sync threads
#define __CLK_LOCAL_MEM_FENCE 0x01
typedef unsigned __cl_mem_fence_flags;
typedef enum __memory_scope {
__memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
__memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
__memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
__memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
__memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
} __memory_scope;
// enum values aligned with what clang uses in EmitAtomicExpr()
typedef enum __memory_order
{
__memory_order_relaxed = __ATOMIC_RELAXED,
__memory_order_acquire = __ATOMIC_ACQUIRE,
__memory_order_release = __ATOMIC_RELEASE,
__memory_order_acq_rel = __ATOMIC_ACQ_REL,
__memory_order_seq_cst = __ATOMIC_SEQ_CST
} __memory_order;
__device__
inline
static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
{
if (flags) {
switch(scope) {
case __memory_scope_work_item: break;
case __memory_scope_sub_group: __llvm_fence_rel_sg(); break;
case __memory_scope_work_group: __llvm_fence_rel_wg(); break;
case __memory_scope_device: __llvm_fence_rel_dev(); break;
case __memory_scope_all_svm_devices: __llvm_fence_rel_sys(); break;
}
//atomic_work_item_fence(flags, memory_order_release, scope);
__atomic_work_item_fence(flags, __memory_order_release, scope);
__builtin_amdgcn_s_barrier();
//atomic_work_item_fence(flags, memory_order_acquire, scope);
switch(scope) {
case __memory_scope_work_item: break;
case __memory_scope_sub_group: __llvm_fence_acq_sg(); break;
case __memory_scope_work_group: __llvm_fence_acq_wg(); break;
case __memory_scope_device: __llvm_fence_acq_dev(); break;
case __memory_scope_all_svm_devices: __llvm_fence_acq_sys(); break;
}
__atomic_work_item_fence(flags, __memory_order_acquire, scope);
} else {
__builtin_amdgcn_s_barrier();
}
@@ -918,4 +977,5 @@ unsigned __smid(void)
#endif //defined(__clang__) && defined(__HIP__)
#endif
@@ -63,4 +63,15 @@ extern "C" __device__ void __llvm_fence_rel_wg(void);
extern "C" __device__ void __llvm_fence_rel_dev(void);
extern "C" __device__ void __llvm_fence_rel_sys(void);
extern "C" __device__ void __llvm_fence_ar_sg(void);
extern "C" __device__ void __llvm_fence_ar_wg(void);
extern "C" __device__ void __llvm_fence_ar_dev(void);
extern "C" __device__ void __llvm_fence_ar_sys(void);
extern "C" __device__ void __llvm_fence_sc_sg(void);
extern "C" __device__ void __llvm_fence_sc_wg(void);
extern "C" __device__ void __llvm_fence_sc_dev(void);
extern "C" __device__ void __llvm_fence_sc_sys(void);
#endif
@@ -204,81 +204,6 @@ __device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask
__host__ __device__ int min(int arg1, int arg2);
__host__ __device__ int max(int arg1, int arg2);
/**
* CUDA 8 device function features
*/
/**
* Kernel launching
*/
/**
*-------------------------------------------------------------------------------------------------
*-------------------------------------------------------------------------------------------------
* @defgroup Fence Fence Functions
* @{
*
*
* @warning The HIP memory fence functions are currently not supported yet.
* If any of those threadfence stubs are reached by the application, you should set "export
*HSA_DISABLE_CACHE=1" to disable L1 and L2 caches.
*
*
* On AMD platforms, the threadfence* routines are currently empty stubs.
*/
extern __attribute__((const)) __device__ void __hip_hc_threadfence() __asm("__llvm_fence_sc_dev");
extern __attribute__((const)) __device__ void __hip_hc_threadfence_block() __asm(
"__llvm_fence_sc_wg");
/**
* @brief threadfence_block makes writes visible to threads running in same block.
*
* @Returns void
*
* @param void
*
* @warning __threadfence_block is a stub and map to no-op.
*/
// __device__ void __threadfence_block(void);
__device__ static inline void __threadfence_block(void) { return __hip_hc_threadfence_block(); }
/**
* @brief threadfence makes wirtes visible to other threads running on same GPU.
*
* @Returns void
*
* @param void
*
* @warning __threadfence is a stub and map to no-op, application should set "export
* HSA_DISABLE_CACHE=1" to disable both L1 and L2 caches.
*/
// __device__ void __threadfence(void) __attribute__((deprecated("Provided for compile-time
// compatibility, not yet functional")));
__device__ static inline void __threadfence(void) { return __hip_hc_threadfence(); }
/**
* @brief threadfence_system makes writes to pinned system memory visible on host CPU.
*
* @Returns void
*
* @param void
*
* @warning __threadfence_system is a stub and map to no-op.
*/
//__device__ void __threadfence_system(void) __attribute__((deprecated("Provided with workaround
//configuration, see hip_kernel_language.md for details")));
__device__ void __threadfence_system(void);
// doxygen end Fence Fence
/**
* @}
*/
#endif // __HCC_OR_HIP_CLANG__
#if defined __HCC__
-3
Datei anzeigen
@@ -155,6 +155,3 @@ __host__ __device__ int max(int arg1, int arg2) {
}
__host__ void* __get_dynamicgroupbaseptr() { return nullptr; }
__device__ void __threadfence_system(void) { std::atomic_thread_fence(std::memory_order_seq_cst); }
-1
Datei anzeigen
@@ -125,7 +125,6 @@ __device__ double __hip_fast_dsqrt_rd(double x);
__device__ double __hip_fast_dsqrt_rn(double x);
__device__ double __hip_fast_dsqrt_ru(double x);
__device__ double __hip_fast_dsqrt_rz(double x);
__device__ void __threadfence_system(void);
float __hip_host_j0f(float x);
double __hip_host_j0(double x);