Remove activelanemask asm using ockl and llvm instrinsics

Replace implementation of __any and __all functions using OCKL functions and replaced __ballot implementation to use llvm intrinsic llvm.amdgcn.icmp.i32 instead of calls to __activelanemask_v4_b64_b1 which is not convergent.
This commit is contained in:
Aaron Enye Shi
2018-07-10 17:56:57 +00:00
والد 9aaa792286
کامیت f8746ecc64
3فایلهای تغییر یافته به همراه12 افزوده شده و 21 حذف شده
@@ -662,51 +662,38 @@ void __named_sync(int a, int b) { __builtin_amdgcn_s_barrier(); }
#endif // __HIP_DEVICE_COMPILE__
// warp vote function __all __any __ballot
extern "C" __device__ inline uint64_t __activelanemask_v4_b64_b1(unsigned int input) {
uint64_t output;
// define i64 @__activelanemask_v4_b64_b1(i32 %input) #5 {
// %a = tail call i64 asm "v_cmp_ne_i32_e64 $0, 0, $1", "=s,v"(i32 %input) #9
// ret i64 %a
// }
__asm("v_cmp_ne_i32_e64 %0, 0, %1" : "=s"(output) : "v"(input));
return output;
}
__device__
inline
unsigned int __activelanecount_u32_b1(unsigned int input) {
return __popcll(__activelanemask_v4_b64_b1(input));
}
__device__
inline
int __all(int predicate) {
return __popcll(__activelanemask_v4_b64_b1(predicate)) == __activelanecount_u32_b1(1);
return __ockl_wfall_i32(predicate);
}
__device__
inline
int __any(int predicate) {
#ifdef NVCC_COMPAT
if (__popcll(__activelanemask_v4_b64_b1(predicate)) != 0)
if (__ockl_wfany_i32(predicate) != 0)
return 1;
else
return 0;
#else
return __popcll(__activelanemask_v4_b64_b1(predicate));
return __ockl_wfany_i32(predicate);
#endif
}
// XXX from llvm/include/llvm/IR/InstrTypes.h
#define ICMP_NE 33
__device__
inline
unsigned long long int __ballot(int predicate) {
return __activelanemask_v4_b64_b1(predicate);
return __llvm_amdgcn_icmp_i32(predicate, 0, ICMP_NE);
}
__device__
inline
unsigned long long int __ballot64(int predicate) {
return __activelanemask_v4_b64_b1(predicate);
return __llvm_amdgcn_icmp_i32(predicate, 0, ICMP_NE);
}
// hip.amdgcn.bc - lanemask
@@ -30,6 +30,8 @@ THE SOFTWARE.
#include "hip/hcc_detail/host_defines.h"
extern "C" __device__ bool __ockl_wfany_i32(int);
extern "C" __device__ bool __ockl_wfall_i32(int);
extern "C" __device__ int32_t __ockl_activelane_u32(void);
extern "C" __device__ uint __ockl_mul24_u32(uint, uint);
@@ -31,6 +31,8 @@ THE SOFTWARE.
#include "hip/hcc_detail/host_defines.h"
__device__ ulong __llvm_amdgcn_icmp_i32(uint x, uint y, uint z) __asm("llvm.amdgcn.icmp.i32");
__device__
unsigned __llvm_amdgcn_groupstaticsize() __asm("llvm.amdgcn.groupstaticsize");