Remove some asm declarations for intrinsics

This technique should never be used, and only accessed through
__builtins.

There's currently no builtin for groupstaticsize. I left ds_swizzle
since for some reason it switches to the builtin based on __HCC__ or
not.

Change-Id: If1e1394221dba83ea4add6db5e94d6b715552044
Этот коммит содержится в:
Matt Arsenault
2020-05-08 12:29:10 -04:00
коммит произвёл Matthew Arsenault
родитель 8194edb2a2
Коммит 03ebfd2d49
3 изменённых файлов: 20 добавлений и 46 удалений
+16 -13
Просмотреть файл
@@ -85,11 +85,11 @@ __device__ static inline unsigned int __ffsll(long long int input) {
}
__device__ static inline unsigned int __brev(unsigned int input) {
return __llvm_bitrev_b32(input);
return __builtin_bitreverse32(input);
}
__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
return __llvm_bitrev_b64(input);
return __builtin_bitreverse64(input);
}
__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
@@ -233,7 +233,10 @@ __device__ static inline unsigned int __usad(unsigned int x, unsigned int y, uns
return __ockl_sadd_u32(x, y, z);
}
__device__ static inline unsigned int __lane_id() { return __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); }
__device__ static inline unsigned int __lane_id() {
return __builtin_amdgcn_mbcnt_hi(
-1, __builtin_amdgcn_mbcnt_lo(-1, 0));
}
/*
HIP specific device functions
@@ -241,25 +244,25 @@ HIP specific device functions
__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
union { int i; unsigned u; float f; } tmp; tmp.u = src;
tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
return tmp.u;
}
__device__ static inline float __hip_ds_bpermutef(int index, float src) {
union { int i; unsigned u; float f; } tmp; tmp.f = src;
tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
return tmp.f;
}
__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
union { int i; unsigned u; float f; } tmp; tmp.u = src;
tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
return tmp.u;
}
__device__ static inline float __hip_ds_permutef(int index, float src) {
union { int i; unsigned u; float f; } tmp; tmp.u = src;
tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
return tmp.u;
}
@@ -293,8 +296,8 @@ __device__ static inline float __hip_ds_swizzlef_N(float src) {
template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
__device__ static inline int __hip_move_dpp_N(int src) {
return __llvm_amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask,
bound_ctrl);
return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
bound_ctrl);
}
static constexpr int warpSize = 64;
@@ -304,7 +307,7 @@ inline
int __shfl(int var, int src_lane, int width = warpSize) {
int self = __lane_id();
int index = src_lane + (self & ~(width-1));
return __llvm_amdgcn_ds_bpermute(index<<2, var);
return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
@@ -376,7 +379,7 @@ int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
int self = __lane_id();
int index = self - lane_delta;
index = (index < (self & ~(width-1)))?self:index;
return __llvm_amdgcn_ds_bpermute(index<<2, var);
return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
@@ -446,7 +449,7 @@ int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
int self = __lane_id();
int index = self + lane_delta;
index = (int)((self&(width-1))+lane_delta) >= width?self:index;
return __llvm_amdgcn_ds_bpermute(index<<2, var);
return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
@@ -516,7 +519,7 @@ int __shfl_xor(int var, int lane_mask, int width = warpSize) {
int self = __lane_id();
int index = self^lane_mask;
index = index >= ((self+width)&~(width-1))?self:index;
return __llvm_amdgcn_ds_bpermute(index<<2, var);
return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
+1 -30
Просмотреть файл
@@ -31,40 +31,11 @@ THE SOFTWARE.
#include "hip/hcc_detail/host_defines.h"
__device__
__attribute__((convergent))
ulong __llvm_amdgcn_icmp_i32(uint x, uint y, uint z) __asm("llvm.amdgcn.icmp.i32");
// FIXME: These should all be removed and proper builtins used.
__device__
unsigned __llvm_amdgcn_groupstaticsize() __asm("llvm.amdgcn.groupstaticsize");
__device__
unsigned int __llvm_bitrev_b32(unsigned int src0) __asm("llvm.bitreverse.i32");
__device__
uint64_t __llvm_bitrev_b64(uint64_t src0) __asm("llvm.bitreverse.i64");
extern
__device__
__attribute__((const))
unsigned int __mbcnt_lo(unsigned int x, unsigned int y) __asm("llvm.amdgcn.mbcnt.lo");
extern
__device__
__attribute__((const))
unsigned int __mbcnt_hi(unsigned int x, unsigned int y) __asm("llvm.amdgcn.mbcnt.hi");
__device__
int __llvm_amdgcn_ds_bpermute(int index, int src) __asm("llvm.amdgcn.ds.bpermute");
__device__
int __llvm_amdgcn_ds_permute(int index, int src) __asm("llvm.amdgcn.ds.permute");
__device__
int __llvm_amdgcn_ds_swizzle(int index, int pattern) __asm("llvm.amdgcn.ds.swizzle");
__device__
int __llvm_amdgcn_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask,
bool bound_ctrl) __asm("llvm.amdgcn.mov.dpp.i32");
#endif
+3 -3
Просмотреть файл
@@ -38,11 +38,11 @@ THE SOFTWARE.
__global__ void HIP_kernel(unsigned int* mbcnt_lo, unsigned int* mbcnt_hi, unsigned int* lane_id) {
int x = blockDim.x * blockIdx.x + threadIdx.x;
mbcnt_lo[x] = __mbcnt_lo(0xFFFFFFFF, 0);
mbcnt_hi[x] = __mbcnt_hi(0xFFFFFFFF, 0);
mbcnt_lo[x] = __builtin_amdgcn_mbcnt_lo(0xFFFFFFFF, 0);
mbcnt_hi[x] = __builtin_amdgcn_mbcnt_hi(0xFFFFFFFF, 0);
lane_id[x] = __lane_id();
}
using namespace std;
int main() {