Remove some asm declarations for intrinsics
This technique should never be used, and only accessed through __builtins. There's currently no builtin for groupstaticsize. I left ds_swizzle since for some reason it switches to the builtin based on __HCC__ or not. Change-Id: If1e1394221dba83ea4add6db5e94d6b715552044
Этот коммит содержится в:
коммит произвёл
Matthew Arsenault
родитель
8194edb2a2
Коммит
03ebfd2d49
@@ -85,11 +85,11 @@ __device__ static inline unsigned int __ffsll(long long int input) {
|
||||
}
|
||||
|
||||
__device__ static inline unsigned int __brev(unsigned int input) {
|
||||
return __llvm_bitrev_b32(input);
|
||||
return __builtin_bitreverse32(input);
|
||||
}
|
||||
|
||||
__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
|
||||
return __llvm_bitrev_b64(input);
|
||||
return __builtin_bitreverse64(input);
|
||||
}
|
||||
|
||||
__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
|
||||
@@ -233,7 +233,10 @@ __device__ static inline unsigned int __usad(unsigned int x, unsigned int y, uns
|
||||
return __ockl_sadd_u32(x, y, z);
|
||||
}
|
||||
|
||||
__device__ static inline unsigned int __lane_id() { return __mbcnt_hi(-1, __mbcnt_lo(-1, 0)); }
|
||||
__device__ static inline unsigned int __lane_id() {
|
||||
return __builtin_amdgcn_mbcnt_hi(
|
||||
-1, __builtin_amdgcn_mbcnt_lo(-1, 0));
|
||||
}
|
||||
|
||||
/*
|
||||
HIP specific device functions
|
||||
@@ -241,25 +244,25 @@ HIP specific device functions
|
||||
|
||||
__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
|
||||
union { int i; unsigned u; float f; } tmp; tmp.u = src;
|
||||
tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
|
||||
tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
|
||||
return tmp.u;
|
||||
}
|
||||
|
||||
__device__ static inline float __hip_ds_bpermutef(int index, float src) {
|
||||
union { int i; unsigned u; float f; } tmp; tmp.f = src;
|
||||
tmp.i = __llvm_amdgcn_ds_bpermute(index, tmp.i);
|
||||
tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
|
||||
return tmp.f;
|
||||
}
|
||||
|
||||
__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
|
||||
union { int i; unsigned u; float f; } tmp; tmp.u = src;
|
||||
tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
|
||||
tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
|
||||
return tmp.u;
|
||||
}
|
||||
|
||||
__device__ static inline float __hip_ds_permutef(int index, float src) {
|
||||
union { int i; unsigned u; float f; } tmp; tmp.u = src;
|
||||
tmp.i = __llvm_amdgcn_ds_permute(index, tmp.i);
|
||||
tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
|
||||
return tmp.u;
|
||||
}
|
||||
|
||||
@@ -293,8 +296,8 @@ __device__ static inline float __hip_ds_swizzlef_N(float src) {
|
||||
|
||||
template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
|
||||
__device__ static inline int __hip_move_dpp_N(int src) {
|
||||
return __llvm_amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask,
|
||||
bound_ctrl);
|
||||
return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
|
||||
bound_ctrl);
|
||||
}
|
||||
|
||||
static constexpr int warpSize = 64;
|
||||
@@ -304,7 +307,7 @@ inline
|
||||
int __shfl(int var, int src_lane, int width = warpSize) {
|
||||
int self = __lane_id();
|
||||
int index = src_lane + (self & ~(width-1));
|
||||
return __llvm_amdgcn_ds_bpermute(index<<2, var);
|
||||
return __builtin_amdgcn_ds_bpermute(index<<2, var);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
@@ -376,7 +379,7 @@ int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
|
||||
int self = __lane_id();
|
||||
int index = self - lane_delta;
|
||||
index = (index < (self & ~(width-1)))?self:index;
|
||||
return __llvm_amdgcn_ds_bpermute(index<<2, var);
|
||||
return __builtin_amdgcn_ds_bpermute(index<<2, var);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
@@ -446,7 +449,7 @@ int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
|
||||
int self = __lane_id();
|
||||
int index = self + lane_delta;
|
||||
index = (int)((self&(width-1))+lane_delta) >= width?self:index;
|
||||
return __llvm_amdgcn_ds_bpermute(index<<2, var);
|
||||
return __builtin_amdgcn_ds_bpermute(index<<2, var);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
@@ -516,7 +519,7 @@ int __shfl_xor(int var, int lane_mask, int width = warpSize) {
|
||||
int self = __lane_id();
|
||||
int index = self^lane_mask;
|
||||
index = index >= ((self+width)&~(width-1))?self:index;
|
||||
return __llvm_amdgcn_ds_bpermute(index<<2, var);
|
||||
return __builtin_amdgcn_ds_bpermute(index<<2, var);
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
|
||||
@@ -31,40 +31,11 @@ THE SOFTWARE.
|
||||
|
||||
#include "hip/hcc_detail/host_defines.h"
|
||||
|
||||
__device__
|
||||
__attribute__((convergent))
|
||||
ulong __llvm_amdgcn_icmp_i32(uint x, uint y, uint z) __asm("llvm.amdgcn.icmp.i32");
|
||||
|
||||
// FIXME: These should all be removed and proper builtins used.
|
||||
__device__
|
||||
unsigned __llvm_amdgcn_groupstaticsize() __asm("llvm.amdgcn.groupstaticsize");
|
||||
|
||||
__device__
|
||||
unsigned int __llvm_bitrev_b32(unsigned int src0) __asm("llvm.bitreverse.i32");
|
||||
|
||||
__device__
|
||||
uint64_t __llvm_bitrev_b64(uint64_t src0) __asm("llvm.bitreverse.i64");
|
||||
|
||||
extern
|
||||
__device__
|
||||
__attribute__((const))
|
||||
unsigned int __mbcnt_lo(unsigned int x, unsigned int y) __asm("llvm.amdgcn.mbcnt.lo");
|
||||
|
||||
extern
|
||||
__device__
|
||||
__attribute__((const))
|
||||
unsigned int __mbcnt_hi(unsigned int x, unsigned int y) __asm("llvm.amdgcn.mbcnt.hi");
|
||||
|
||||
__device__
|
||||
int __llvm_amdgcn_ds_bpermute(int index, int src) __asm("llvm.amdgcn.ds.bpermute");
|
||||
|
||||
__device__
|
||||
int __llvm_amdgcn_ds_permute(int index, int src) __asm("llvm.amdgcn.ds.permute");
|
||||
|
||||
__device__
|
||||
int __llvm_amdgcn_ds_swizzle(int index, int pattern) __asm("llvm.amdgcn.ds.swizzle");
|
||||
|
||||
__device__
|
||||
int __llvm_amdgcn_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask,
|
||||
bool bound_ctrl) __asm("llvm.amdgcn.mov.dpp.i32");
|
||||
|
||||
#endif
|
||||
|
||||
@@ -38,11 +38,11 @@ THE SOFTWARE.
|
||||
|
||||
__global__ void HIP_kernel(unsigned int* mbcnt_lo, unsigned int* mbcnt_hi, unsigned int* lane_id) {
|
||||
int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
mbcnt_lo[x] = __mbcnt_lo(0xFFFFFFFF, 0);
|
||||
mbcnt_hi[x] = __mbcnt_hi(0xFFFFFFFF, 0);
|
||||
mbcnt_lo[x] = __builtin_amdgcn_mbcnt_lo(0xFFFFFFFF, 0);
|
||||
mbcnt_hi[x] = __builtin_amdgcn_mbcnt_hi(0xFFFFFFFF, 0);
|
||||
lane_id[x] = __lane_id();
|
||||
}
|
||||
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main() {
|
||||
|
||||
Ссылка в новой задаче
Block a user