From f8746ecc64afa1d94ecd68db5fe8b01367a74207 Mon Sep 17 00:00:00 2001 From: Aaron Enye Shi Date: Tue, 10 Jul 2018 17:56:57 +0000 Subject: [PATCH] Remove activelanemask asm using ockl and llvm instrinsics Replace implementation of __any and __all functions using OCKL functions and replaced __ballot implementation to use llvm intrinsic llvm.amdgcn.icmp.i32 instead of calls to __activelanemask_v4_b64_b1 which is not convergent. --- .../include/hip/hcc_detail/device_functions.h | 29 +++++-------------- .../hip/hcc_detail/device_library_decls.h | 2 ++ .../include/hip/hcc_detail/llvm_intrinsics.h | 2 ++ 3 files changed, 12 insertions(+), 21 deletions(-) diff --git a/hipamd/include/hip/hcc_detail/device_functions.h b/hipamd/include/hip/hcc_detail/device_functions.h index 1938170ce4..5466982878 100644 --- a/hipamd/include/hip/hcc_detail/device_functions.h +++ b/hipamd/include/hip/hcc_detail/device_functions.h @@ -662,51 +662,38 @@ void __named_sync(int a, int b) { __builtin_amdgcn_s_barrier(); } #endif // __HIP_DEVICE_COMPILE__ // warp vote function __all __any __ballot -extern "C" __device__ inline uint64_t __activelanemask_v4_b64_b1(unsigned int input) { - uint64_t output; - // define i64 @__activelanemask_v4_b64_b1(i32 %input) #5 { - // %a = tail call i64 asm "v_cmp_ne_i32_e64 $0, 0, $1", "=s,v"(i32 %input) #9 - // ret i64 %a - // } - __asm("v_cmp_ne_i32_e64 %0, 0, %1" : "=s"(output) : "v"(input)); - return output; -} - -__device__ -inline -unsigned int __activelanecount_u32_b1(unsigned int input) { - return __popcll(__activelanemask_v4_b64_b1(input)); -} - __device__ inline int __all(int predicate) { - return __popcll(__activelanemask_v4_b64_b1(predicate)) == __activelanecount_u32_b1(1); + return __ockl_wfall_i32(predicate); } __device__ inline int __any(int predicate) { #ifdef NVCC_COMPAT - if (__popcll(__activelanemask_v4_b64_b1(predicate)) != 0) + if (__ockl_wfany_i32(predicate) != 0) return 1; else return 0; #else - return __popcll(__activelanemask_v4_b64_b1(predicate)); + return __ockl_wfany_i32(predicate); #endif } +// XXX from llvm/include/llvm/IR/InstrTypes.h +#define ICMP_NE 33 + __device__ inline unsigned long long int __ballot(int predicate) { - return __activelanemask_v4_b64_b1(predicate); + return __llvm_amdgcn_icmp_i32(predicate, 0, ICMP_NE); } __device__ inline unsigned long long int __ballot64(int predicate) { - return __activelanemask_v4_b64_b1(predicate); + return __llvm_amdgcn_icmp_i32(predicate, 0, ICMP_NE); } // hip.amdgcn.bc - lanemask diff --git a/hipamd/include/hip/hcc_detail/device_library_decls.h b/hipamd/include/hip/hcc_detail/device_library_decls.h index 53ad7595fe..64e4ff8898 100644 --- a/hipamd/include/hip/hcc_detail/device_library_decls.h +++ b/hipamd/include/hip/hcc_detail/device_library_decls.h @@ -30,6 +30,8 @@ THE SOFTWARE. #include "hip/hcc_detail/host_defines.h" +extern "C" __device__ bool __ockl_wfany_i32(int); +extern "C" __device__ bool __ockl_wfall_i32(int); extern "C" __device__ int32_t __ockl_activelane_u32(void); extern "C" __device__ uint __ockl_mul24_u32(uint, uint); diff --git a/hipamd/include/hip/hcc_detail/llvm_intrinsics.h b/hipamd/include/hip/hcc_detail/llvm_intrinsics.h index 02df3c2fbe..6f2fc45626 100644 --- a/hipamd/include/hip/hcc_detail/llvm_intrinsics.h +++ b/hipamd/include/hip/hcc_detail/llvm_intrinsics.h @@ -31,6 +31,8 @@ THE SOFTWARE. #include "hip/hcc_detail/host_defines.h" +__device__ ulong __llvm_amdgcn_icmp_i32(uint x, uint y, uint z) __asm("llvm.amdgcn.icmp.i32"); + __device__ unsigned __llvm_amdgcn_groupstaticsize() __asm("llvm.amdgcn.groupstaticsize");