Implement Memory Fence Functions in header
Enabled __llvm_fence_* functions for seq_cst.
Dieser Commit ist enthalten in:
@@ -752,6 +752,101 @@ void *__amdgcn_get_dynamicgroupbaseptr() {
|
||||
|
||||
#endif // __HIP_DEVICE_COMPILE__
|
||||
|
||||
|
||||
// hip.amdgcn.bc - sync threads
|
||||
#define __CLK_LOCAL_MEM_FENCE 0x01
|
||||
typedef unsigned __cl_mem_fence_flags;
|
||||
|
||||
typedef enum __memory_scope {
|
||||
__memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
|
||||
__memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
|
||||
__memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
|
||||
__memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
|
||||
__memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
|
||||
} __memory_scope;
|
||||
|
||||
// enum values aligned with what clang uses in EmitAtomicExpr()
|
||||
typedef enum __memory_order
|
||||
{
|
||||
__memory_order_relaxed = __ATOMIC_RELAXED,
|
||||
__memory_order_acquire = __ATOMIC_ACQUIRE,
|
||||
__memory_order_release = __ATOMIC_RELEASE,
|
||||
__memory_order_acq_rel = __ATOMIC_ACQ_REL,
|
||||
__memory_order_seq_cst = __ATOMIC_SEQ_CST
|
||||
} __memory_order;
|
||||
|
||||
__device__
|
||||
inline
|
||||
static void
|
||||
__atomic_work_item_fence(__cl_mem_fence_flags flags, __memory_order order, __memory_scope scope)
|
||||
{
|
||||
// We're tying global-happens-before and local-happens-before together as does HSA
|
||||
if (order != __memory_order_relaxed) {
|
||||
switch (scope) {
|
||||
case __memory_scope_work_item:
|
||||
break;
|
||||
case __memory_scope_sub_group:
|
||||
switch (order) {
|
||||
case __memory_order_relaxed: break;
|
||||
case __memory_order_acquire: __llvm_fence_acq_sg(); break;
|
||||
case __memory_order_release: __llvm_fence_rel_sg(); break;
|
||||
case __memory_order_acq_rel: __llvm_fence_ar_sg(); break;
|
||||
case __memory_order_seq_cst: __llvm_fence_sc_sg(); break;
|
||||
}
|
||||
break;
|
||||
case __memory_scope_work_group:
|
||||
switch (order) {
|
||||
case __memory_order_relaxed: break;
|
||||
case __memory_order_acquire: __llvm_fence_acq_wg(); break;
|
||||
case __memory_order_release: __llvm_fence_rel_wg(); break;
|
||||
case __memory_order_acq_rel: __llvm_fence_ar_wg(); break;
|
||||
case __memory_order_seq_cst: __llvm_fence_sc_wg(); break;
|
||||
}
|
||||
break;
|
||||
case __memory_scope_device:
|
||||
switch (order) {
|
||||
case __memory_order_relaxed: break;
|
||||
case __memory_order_acquire: __llvm_fence_acq_dev(); break;
|
||||
case __memory_order_release: __llvm_fence_rel_dev(); break;
|
||||
case __memory_order_acq_rel: __llvm_fence_ar_dev(); break;
|
||||
case __memory_order_seq_cst: __llvm_fence_sc_dev(); break;
|
||||
}
|
||||
break;
|
||||
case __memory_scope_all_svm_devices:
|
||||
switch (order) {
|
||||
case __memory_order_relaxed: break;
|
||||
case __memory_order_acquire: __llvm_fence_acq_sys(); break;
|
||||
case __memory_order_release: __llvm_fence_rel_sys(); break;
|
||||
case __memory_order_acq_rel: __llvm_fence_ar_sys(); break;
|
||||
case __memory_order_seq_cst: __llvm_fence_sc_sys(); break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Memory Fence Functions
|
||||
__device__
|
||||
inline
|
||||
static void __threadfence()
|
||||
{
|
||||
__atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
static void __threadfence_block()
|
||||
{
|
||||
__atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
static void __threadfence_system()
|
||||
{
|
||||
__atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
|
||||
}
|
||||
|
||||
#endif // __HCC_OR_HIP_CLANG__
|
||||
|
||||
#ifdef __HCC__
|
||||
@@ -796,50 +891,14 @@ void __assertfail(const char * __assertion,
|
||||
__builtin_trap();
|
||||
}
|
||||
|
||||
// hip.amdgcn.bc - sync threads
|
||||
#define __CLK_LOCAL_MEM_FENCE 0x01
|
||||
typedef unsigned __cl_mem_fence_flags;
|
||||
|
||||
typedef enum __memory_scope {
|
||||
__memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
|
||||
__memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
|
||||
__memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
|
||||
__memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
|
||||
__memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
|
||||
} __memory_scope;
|
||||
|
||||
// enum values aligned with what clang uses in EmitAtomicExpr()
|
||||
typedef enum __memory_order
|
||||
{
|
||||
__memory_order_relaxed = __ATOMIC_RELAXED,
|
||||
__memory_order_acquire = __ATOMIC_ACQUIRE,
|
||||
__memory_order_release = __ATOMIC_RELEASE,
|
||||
__memory_order_acq_rel = __ATOMIC_ACQ_REL,
|
||||
__memory_order_seq_cst = __ATOMIC_SEQ_CST
|
||||
} __memory_order;
|
||||
|
||||
__device__
|
||||
inline
|
||||
static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
|
||||
{
|
||||
if (flags) {
|
||||
switch(scope) {
|
||||
case __memory_scope_work_item: break;
|
||||
case __memory_scope_sub_group: __llvm_fence_rel_sg(); break;
|
||||
case __memory_scope_work_group: __llvm_fence_rel_wg(); break;
|
||||
case __memory_scope_device: __llvm_fence_rel_dev(); break;
|
||||
case __memory_scope_all_svm_devices: __llvm_fence_rel_sys(); break;
|
||||
}
|
||||
//atomic_work_item_fence(flags, memory_order_release, scope);
|
||||
__atomic_work_item_fence(flags, __memory_order_release, scope);
|
||||
__builtin_amdgcn_s_barrier();
|
||||
//atomic_work_item_fence(flags, memory_order_acquire, scope);
|
||||
switch(scope) {
|
||||
case __memory_scope_work_item: break;
|
||||
case __memory_scope_sub_group: __llvm_fence_acq_sg(); break;
|
||||
case __memory_scope_work_group: __llvm_fence_acq_wg(); break;
|
||||
case __memory_scope_device: __llvm_fence_acq_dev(); break;
|
||||
case __memory_scope_all_svm_devices: __llvm_fence_acq_sys(); break;
|
||||
}
|
||||
__atomic_work_item_fence(flags, __memory_order_acquire, scope);
|
||||
} else {
|
||||
__builtin_amdgcn_s_barrier();
|
||||
}
|
||||
@@ -918,4 +977,5 @@ unsigned __smid(void)
|
||||
|
||||
#endif //defined(__clang__) && defined(__HIP__)
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@@ -63,4 +63,15 @@ extern "C" __device__ void __llvm_fence_rel_wg(void);
|
||||
extern "C" __device__ void __llvm_fence_rel_dev(void);
|
||||
extern "C" __device__ void __llvm_fence_rel_sys(void);
|
||||
|
||||
extern "C" __device__ void __llvm_fence_ar_sg(void);
|
||||
extern "C" __device__ void __llvm_fence_ar_wg(void);
|
||||
extern "C" __device__ void __llvm_fence_ar_dev(void);
|
||||
extern "C" __device__ void __llvm_fence_ar_sys(void);
|
||||
|
||||
|
||||
extern "C" __device__ void __llvm_fence_sc_sg(void);
|
||||
extern "C" __device__ void __llvm_fence_sc_wg(void);
|
||||
extern "C" __device__ void __llvm_fence_sc_dev(void);
|
||||
extern "C" __device__ void __llvm_fence_sc_sys(void);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -204,81 +204,6 @@ __device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask
|
||||
__host__ __device__ int min(int arg1, int arg2);
|
||||
__host__ __device__ int max(int arg1, int arg2);
|
||||
|
||||
|
||||
/**
|
||||
* CUDA 8 device function features
|
||||
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* Kernel launching
|
||||
*/
|
||||
|
||||
/**
|
||||
*-------------------------------------------------------------------------------------------------
|
||||
*-------------------------------------------------------------------------------------------------
|
||||
* @defgroup Fence Fence Functions
|
||||
* @{
|
||||
*
|
||||
*
|
||||
* @warning The HIP memory fence functions are currently not supported yet.
|
||||
* If any of those threadfence stubs are reached by the application, you should set "export
|
||||
*HSA_DISABLE_CACHE=1" to disable L1 and L2 caches.
|
||||
*
|
||||
*
|
||||
* On AMD platforms, the threadfence* routines are currently empty stubs.
|
||||
*/
|
||||
|
||||
extern __attribute__((const)) __device__ void __hip_hc_threadfence() __asm("__llvm_fence_sc_dev");
|
||||
extern __attribute__((const)) __device__ void __hip_hc_threadfence_block() __asm(
|
||||
"__llvm_fence_sc_wg");
|
||||
|
||||
|
||||
/**
|
||||
* @brief threadfence_block makes writes visible to threads running in same block.
|
||||
*
|
||||
* @Returns void
|
||||
*
|
||||
* @param void
|
||||
*
|
||||
* @warning __threadfence_block is a stub and map to no-op.
|
||||
*/
|
||||
// __device__ void __threadfence_block(void);
|
||||
__device__ static inline void __threadfence_block(void) { return __hip_hc_threadfence_block(); }
|
||||
|
||||
/**
|
||||
* @brief threadfence makes wirtes visible to other threads running on same GPU.
|
||||
*
|
||||
* @Returns void
|
||||
*
|
||||
* @param void
|
||||
*
|
||||
* @warning __threadfence is a stub and map to no-op, application should set "export
|
||||
* HSA_DISABLE_CACHE=1" to disable both L1 and L2 caches.
|
||||
*/
|
||||
// __device__ void __threadfence(void) __attribute__((deprecated("Provided for compile-time
|
||||
// compatibility, not yet functional")));
|
||||
__device__ static inline void __threadfence(void) { return __hip_hc_threadfence(); }
|
||||
|
||||
/**
|
||||
* @brief threadfence_system makes writes to pinned system memory visible on host CPU.
|
||||
*
|
||||
* @Returns void
|
||||
*
|
||||
* @param void
|
||||
*
|
||||
* @warning __threadfence_system is a stub and map to no-op.
|
||||
*/
|
||||
//__device__ void __threadfence_system(void) __attribute__((deprecated("Provided with workaround
|
||||
//configuration, see hip_kernel_language.md for details")));
|
||||
__device__ void __threadfence_system(void);
|
||||
|
||||
// doxygen end Fence Fence
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
#endif // __HCC_OR_HIP_CLANG__
|
||||
|
||||
#if defined __HCC__
|
||||
|
||||
@@ -155,6 +155,3 @@ __host__ __device__ int max(int arg1, int arg2) {
|
||||
}
|
||||
|
||||
__host__ void* __get_dynamicgroupbaseptr() { return nullptr; }
|
||||
|
||||
|
||||
__device__ void __threadfence_system(void) { std::atomic_thread_fence(std::memory_order_seq_cst); }
|
||||
|
||||
@@ -125,7 +125,6 @@ __device__ double __hip_fast_dsqrt_rd(double x);
|
||||
__device__ double __hip_fast_dsqrt_rn(double x);
|
||||
__device__ double __hip_fast_dsqrt_ru(double x);
|
||||
__device__ double __hip_fast_dsqrt_rz(double x);
|
||||
__device__ void __threadfence_system(void);
|
||||
|
||||
float __hip_host_j0f(float x);
|
||||
double __hip_host_j0(double x);
|
||||
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren