SWDEV-396533 - correct __shfl function to match CUDA
Signed-off-by: sdashmiz <shadi.dashmiz@amd.com> Change-Id: I311419fd25c055339f25fe0c7a132ec9ee225600
This commit is contained in:
zatwierdzone przez
Shadi Dashmiz
rodzic
3f4b70cafd
commit
23e99dbb07
@@ -89,7 +89,7 @@ __device__
|
||||
inline
|
||||
int __shfl(int var, int src_lane, int width = warpSize) {
|
||||
int self = __lane_id();
|
||||
int index = src_lane + (self & ~(width-1));
|
||||
int index = (src_lane & (width - 1)) + (self & ~(width-1));
|
||||
return __builtin_amdgcn_ds_bpermute(index<<2, var);
|
||||
}
|
||||
__device__
|
||||
|
||||
Reference in New Issue
Block a user