These definitions are being moved from the hipclang portion of the device library into the HIP headers. They are required for the HIPclang project to function.


[ROCm/hip commit: c453b42bff]
Этот коммит содержится в:
Aaron Enye Shi
2018-06-12 22:05:59 +00:00
родитель a4b9e5305f
Коммит b34868005a
2 изменённых файлов: 192 добавлений и 14 удалений
+191 -6
Просмотреть файл
@@ -189,8 +189,16 @@ extern int HIP_TRACE_API;
static constexpr int warpSize = 64;
#define clock_t long long int
__device__ long long int clock64();
__device__ clock_t clock();
// Device-side declaration bound by its asm label to the symbol name
// "llvm.amdgcn.s.memrealtime". The __clock64() and __clock() wrappers that
// follow return its value after a cast.
__device__
unsigned long __llvm_amdgcn_s_memrealtime(void) __asm("llvm.amdgcn.s.memrealtime");
// Returns the current value of __llvm_amdgcn_s_memrealtime(), converted to
// long long int.
__device__
inline
long long int __clock64() {
    const unsigned long ticks = __llvm_amdgcn_s_memrealtime();
    return (long long int)ticks;
}
// Returns the current value of __llvm_amdgcn_s_memrealtime(), converted to
// clock_t.
__device__
inline
clock_t __clock() {
    const unsigned long ticks = __llvm_amdgcn_s_memrealtime();
    return (clock_t)ticks;
}
// abort
__device__ void abort();
@@ -200,6 +208,45 @@ __device__ int __all(int input);
__device__ int __any(int input);
__device__ unsigned long long int __ballot(int input);
// Compares `a` against zero with v_cmp_ne_i32_e64 and returns the 64-bit
// result that the instruction writes to a scalar register pair.
// NOTE(review): presumably bit i of the result corresponds to lane i --
// confirm against the ISA manual.
//
// Equivalent LLVM IR (note that IR asm strings spell operands $0/$1):
//   define i64 @__ballot64(i32 %a) #0 {
//     %b = tail call i64 asm "v_cmp_ne_i32_e64 $0, 0, $1", "=s,v"(i32 %a) #1
//     ret i64 %b
//   }
__device__
inline
int64_t __ballot64(int a) {
    int64_t s;
    // GCC-style inline asm refers to operands as %0/%1. A literal '$' in the
    // source string is escaped when clang lowers it to IR, so "$0"/"$1" would
    // reach the assembler verbatim instead of being replaced by registers.
    __asm("v_cmp_ne_i32_e64 %0, 0, %1" : "=s"(s) : "v"(a));
    return s;
}
// hip.amdgcn.bc - lanemask
extern "C" __device__ int32_t __ockl_activelane_u32(void);
// Returns the bits of __ballot64(1) that lie strictly above this lane's
// position, where the position is the value of __ockl_activelane_u32().
// Position 63 has nothing above it, so the result is 0 in that case.
__device__
inline
int64_t __lanemask_gt()
{
    int32_t activelane = __ockl_activelane_u32();
    int64_t ballot = __ballot64(1);
    if (activelane != 63) {
        // Use a 64-bit-wide constant: ~0UL is only 32 bits on ABIs where long
        // is 32-bit, which would make shift counts of 32..63 undefined.
        int64_t tmp = (~0ULL) << (activelane + 1);
        return tmp & ballot;
    }
    return 0;
}
// Returns the bits of __ballot64(1) that lie strictly below this lane's
// position, where the position is the value of __ockl_activelane_u32()
// (the same convention __lanemask_gt() uses).
__device__
inline
int64_t __lanemask_lt()
{
    int32_t activelane = __ockl_activelane_u32();
    int64_t ballot = __ballot64(1);
    // Build a mask with bits [0, activelane) set. Position 0 yields an empty
    // mask. Returning the unmasked ballot for non-zero positions would also
    // report this lane's own bit and every bit above it.
    int64_t below = (int64_t)((1ULL << activelane) - 1ULL);
    return below & ballot;
}
#if __HIP_ARCH_GFX701__ == 0
// warp shuffle functions
@@ -238,8 +285,11 @@ __device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask
__host__ __device__ int min(int arg1, int arg2);
__host__ __device__ int max(int arg1, int arg2);
__device__ void* __get_dynamicgroupbaseptr();
extern "C" __device__ void* get_dynamic_group_segment_base_pointer();
// Device-side accessor that forwards to
// get_dynamic_group_segment_base_pointer() and returns its result unchanged.
__device__
inline
void* __get_dynamicgroupbaseptr() {
    void* base = get_dynamic_group_segment_base_pointer();
    return base;
}
/**
* CUDA 8 device function features
@@ -315,6 +365,11 @@ __device__ void __threadfence_system(void);
* @}
*/
// hip.amdgcn.bc - named sync
__device__ void __llvm_amdgcn_s_barrier() __asm("llvm.amdgcn.s.barrier");
// Issues __llvm_amdgcn_s_barrier(). Both arguments are accepted but ignored.
__device__ inline void __named_sync(int a, int b) {
    (void)a;
    (void)b;
    __llvm_amdgcn_s_barrier();
}
#endif // __HCC_OR_HIP_CLANG__
#if defined __HCC__
@@ -572,7 +627,9 @@ extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
__DEVICE__ void __device_trap() __asm("llvm.trap");
__DEVICE__ void inline __assert_fail(const char * __assertion,
__DEVICE__
inline
void __assert_fail(const char * __assertion,
const char *__file,
unsigned int __line,
const char *__function)
@@ -581,8 +638,136 @@ __DEVICE__ void inline __assert_fail(const char * __assertion,
__device_trap();
}
extern "C" __device__ __attribute__((noduplicate)) void __syncthreads();
extern "C" __device__ void *__amdgcn_get_dynamicgroupbaseptr();
// Assertion-failure hook taking an extra size argument. None of the
// arguments are inspected yet; the function unconditionally calls
// __device_trap().
__DEVICE__
inline
void __assertfail(const char * __assertion,
                  const char *__file,
                  unsigned int __line,
                  const char *__function,
                  size_t charsize)
{
    // Arguments are deliberately unused for now.
    (void)__assertion;
    (void)__file;
    (void)__line;
    (void)__function;
    (void)charsize;
    __device_trap();
}
// hip.amdgcn.bc - sync threads
// extern "C" __device__ __attribute__((noduplicate)) void __syncthreads();
// Fence-flag value that __syncthreads() passes to hc_barrier().
#define CLK_LOCAL_MEM_FENCE 0x01
// Qualifier that places the pointee in address space 3.
// NOTE(review): `local` is a very common identifier; as a macro it rewrites
// every later use of that word in any file including this header. Consider a
// reserved-style name.
#define local __attribute__((address_space(3)))
// Integer type used for the fence-flag argument of the fence/barrier helpers.
typedef unsigned cl_mem_fence_flags;
// Memory scopes accepted by atomic_work_item_fence() and
// hc_work_group_barrier(). Each enumerator takes its value from the
// corresponding __OPENCL_MEMORY_SCOPE_* macro.
typedef enum memory_scope {
memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
} memory_scope;
// Memory orderings accepted by atomic_work_item_fence(). Each enumerator
// takes its value from the corresponding __ATOMIC_* macro.
// enum values aligned with what clang uses in EmitAtomicExpr()
typedef enum memory_order
{
memory_order_relaxed = __ATOMIC_RELAXED,
memory_order_acquire = __ATOMIC_ACQUIRE,
memory_order_release = __ATOMIC_RELEASE,
memory_order_acq_rel = __ATOMIC_ACQ_REL,
memory_order_seq_cst = __ATOMIC_SEQ_CST
} memory_order;
// Externally defined fence taking fence flags, a memory order and a memory
// scope, in that order. Declared with C linkage and the `overloadable`
// attribute; hc_work_group_barrier() calls it before and after the barrier.
extern "C" __device__ __attribute__((overloadable))
void atomic_work_item_fence(cl_mem_fence_flags, memory_order, memory_scope);
// Work-group barrier. With non-zero `flags`, the barrier instruction is
// bracketed by a release fence before it and an acquire fence after it, both
// issued with `flags` and `scope`. With zero `flags`, only the barrier
// instruction is issued.
__device__
inline
static void hc_work_group_barrier(cl_mem_fence_flags flags, memory_scope scope)
{
    if (!flags) {
        // No fencing requested: plain barrier.
        __builtin_amdgcn_s_barrier();
        return;
    }

    atomic_work_item_fence(flags, memory_order_release, scope);
    __builtin_amdgcn_s_barrier();
    atomic_work_item_fence(flags, memory_order_acquire, scope);
}
// Runs hc_work_group_barrier() at work-group scope, using `n` converted to
// cl_mem_fence_flags as the fence flags.
__device__
inline
static void hc_barrier(int n)
{
    const cl_mem_fence_flags flags = (cl_mem_fence_flags)n;
    hc_work_group_barrier(flags, memory_scope_work_group);
}
// Barrier with CLK_LOCAL_MEM_FENCE fence flags at work-group scope.
// Marked noduplicate so the call is not cloned by the optimizer.
__device__
inline
__attribute__((noduplicate))
void __syncthreads()
{
    // Same call hc_barrier(CLK_LOCAL_MEM_FENCE) makes, written out directly.
    hc_work_group_barrier((cl_mem_fence_flags)CLK_LOCAL_MEM_FENCE,
                          memory_scope_work_group);
}
// Device-side declarations bound by asm label to the symbol names
// "llvm.amdgcn.s.getreg" and "llvm.amdgcn.groupstaticsize" respectively.
// __smid() calls the first with an immediate built by GETREG_IMMED.
__device__ unsigned __llvm_amdgcn_s_getreg(unsigned) __asm("llvm.amdgcn.s.getreg");
__device__ unsigned __llvm_amdgcn_groupstaticsize() __asm("llvm.amdgcn.groupstaticsize");
// Reinterprets the unsigned value `x` as a char pointer qualified with the
// `local` address-space macro.
__device__ inline static local char* __to_local(unsigned x) {
    local char* ptr = (local char*)x;
    return ptr;
}
// Returns the dynamic group segment base pointer by delegating to
// __get_dynamicgroupbaseptr().
__device__ inline void *__amdgcn_get_dynamicgroupbaseptr() {
    // A disabled (#if 0) alternative used to sit here. It derived the pointer
    // from __llvm_amdgcn_s_getreg(14342) << 8 plus
    // __llvm_amdgcn_groupstaticsize(), but it had an unbalanced parenthesis
    // and assigned an unsigned value to a char* without a cast, so it could
    // not have been re-enabled as written. It has been dropped.
    return __get_dynamicgroupbaseptr();
}
// hip.amdgcn.bc - device routine
/*
HW_ID Register bit structure
WAVE_ID 3:0 Wave buffer slot number. 0-9.
SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
PIPE_ID 7:6 Pipeline from which the wave was dispatched.
CU_ID 11:8 Compute Unit the wave is assigned to.
SH_ID 12 Shader Array (within an SE) the wave is assigned to.
SE_ID 14:13 Shader Engine the wave is assigned to.
TG_ID 19:16 Thread-group ID
VM_ID 23:20 Virtual Memory ID
QUEUE_ID 26:24 Queue from which this wave was dispatched.
STATE_ID 29:27 State ID (graphics only, not compute).
ME_ID 31:30 Micro-engine ID.
*/
// Register id that __smid() passes as the REG argument of GETREG_IMMED.
#define HW_ID 4
// CU_ID field of HW_ID: 4 bits wide, starting at bit 8 (bits 11:8).
#define HW_ID_CU_ID_SIZE 4
#define HW_ID_CU_ID_OFFSET 8
// SE_ID field of HW_ID: 2 bits wide, starting at bit 13 (bits 14:13).
#define HW_ID_SE_ID_SIZE 2
#define HW_ID_SE_ID_OFFSET 13
/*
   Encoding of parameter bitmask
   HW_ID  5:0    HW_ID
   OFFSET 10:6   Range: 0..31
   SIZE   15:11  Range: 1..32

   A 5-bit field can only express 1..32 when it stores (size - 1), so the SZ
   argument is the value to place in the field, not the raw field width.
 */
// Every argument and the whole expansion are parenthesized so the macro stays
// correct when an argument is an expression or when the result is used inside
// a larger expression.
#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
// Returns an identifier built from the SE_ID and CU_ID fields of the HW_ID
// register: (se_id << HW_ID_CU_ID_SIZE) + cu_id.
__device__
inline
unsigned __smid(void)
{
    // The SIZE field of the s_getreg immediate holds (width - 1). Passing the
    // raw width would read one extra bit per field: bits 12:8 instead of 11:8
    // for CU_ID (pulling in SH_ID) and bits 15:13 instead of 14:13 for SE_ID.
    // The extra parentheses keep each argument intact regardless of how
    // GETREG_IMMED expands it.
    unsigned cu_id = __llvm_amdgcn_s_getreg(
        GETREG_IMMED((HW_ID_CU_ID_SIZE - 1), HW_ID_CU_ID_OFFSET, HW_ID));
    unsigned se_id = __llvm_amdgcn_s_getreg(
        GETREG_IMMED((HW_ID_SE_ID_SIZE - 1), HW_ID_SE_ID_OFFSET, HW_ID));
    /* Each shader engine has 16 CU */
    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
}
// Macro to replace extern __shared__ declarations
// to local variable definitions
+1 -8
Просмотреть файл
@@ -144,9 +144,6 @@ __device__ void* __hip_hc_memset(void* dst, uint8_t val, size_t size) {
return dst;
}
// Returns hc::__cycle_u64() converted to long long int.
__device__ long long int clock64() {
    return (long long int)hc::__cycle_u64();
}
// Returns hc::__cycle_u64() converted to clock_t.
__device__ clock_t clock() {
    return (clock_t)hc::__cycle_u64();
}
// abort
// Device-side abort(): forwards to hc::abort().
__device__ void abort() {
    hc::abort();
}
@@ -203,11 +200,7 @@ __host__ __device__ int max(int arg1, int arg2) {
return (int)(hc::precise_math::fmax((float)arg1, (float)arg2));
}
// Device overload: returns hc::get_dynamic_group_segment_base_pointer().
__device__ void* __get_dynamicgroupbaseptr() {
    void* base = hc::get_dynamic_group_segment_base_pointer();
    return base;
}
// Host overload: always returns a null pointer.
__host__ void* __get_dynamicgroupbaseptr() {
    return nullptr;
}
__device__ void __threadfence_system(void) { std::atomic_thread_fence(std::memory_order_seq_cst); }
__device__ void __threadfence_system(void) { std::atomic_thread_fence(std::memory_order_seq_cst); }