SWDEV-396533 - correct __shfl function to match CUDA

Signed-off-by: sdashmiz <shadi.dashmiz@amd.com>
Change-Id: I311419fd25c055339f25fe0c7a132ec9ee225600
This commit is contained in:
sdashmiz
2023-04-24 15:08:45 -04:00
committed by Shadi Dashmiz
parent 3f4b70cafd
commit 23e99dbb07
@@ -89,7 +89,7 @@ __device__
inline
int __shfl(int var, int src_lane, int width = warpSize) {
    // Warp shuffle: returns the value of `var` held by lane `src_lane` of the
    // caller's `width`-wide sub-group.
    //
    //   var      - value this lane contributes to the exchange.
    //   src_lane - lane to read from, relative to the start of the caller's
    //              sub-group; it is wrapped into [0, width) so an out-of-range
    //              request can never read from a different sub-group.
    //   width    - sub-group size; the mask arithmetic below is only valid
    //              when it is a power of two.
    int self = __lane_id();
    // (self & ~(width - 1)) is the first lane of the caller's sub-group;
    // (src_lane & (width - 1)) is the wrapped offset within that sub-group.
    int index = (src_lane & (width - 1)) + (self & ~(width - 1));
    // ds_bpermute takes a byte address (4 bytes per lane), hence the << 2.
    return __builtin_amdgcn_ds_bpermute(index << 2, var);
}
__device__