SWDEV-271978 - Fix function __byte_perm to match CUDA behavior

Selector indices are as follows (the upper 16 bits of the selector are not used): selector[0] = s<2:0>, selector[1] = s<6:4>, selector[2] = s<10:8>, selector[3] = s<14:12>.

Change-Id: Ibf76c6ec2374f1f5b9bba8bd9dbd73660f830eea


[ROCm/hip commit: 1cc65c48a2]
This commit is contained in:
Anusha GodavarthySurya
2021-04-27 07:19:11 -07:00
committed by Anusha Godavarthy Surya
parent 9d1462020d
commit f1f38679e5
@@ -147,15 +147,15 @@ __device__
static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
    // Byte permute: builds a 32-bit value out of four bytes picked from the
    // 8-byte pool {x, y}. Pool byte 0 is the least-significant byte of x,
    // pool byte 3 its most-significant byte, and pool bytes 4..7 are the
    // bytes of y in the same order.
    //
    // Each output byte n (0 = least significant) is chosen by a 3-bit field
    // of the selector s:
    //   selector[0] = s<2:0>,  selector[1] = s<6:4>,
    //   selector[2] = s<10:8>, selector[3] = s<14:12>
    // Bit 3 of every nibble and the upper 16 bits of s are ignored.
    //
    // x: supplies pool bytes 0..3.
    // y: supplies pool bytes 4..7.
    // s: selector word, laid out as described above.
    // Returns the permuted 32-bit value.

    // Pack both inputs into one 64-bit word so a byte can be extracted with a
    // plain shift; this avoids union-based type punning and any dependence on
    // the in-memory byte order of the operands.
    const unsigned long long pool = ((unsigned long long)y << 32) | (unsigned long long)x;

    unsigned int result = 0u;
    for (unsigned int n = 0u; n < 4u; ++n) {
        // Only the low three bits of each selector nibble are significant, so
        // the index is always in 0..7 and can never leave the pool.
        const unsigned int sel = (s >> (4u * n)) & 0x7u;
        const unsigned int picked = (unsigned int)(pool >> (8u * sel)) & 0xFFu;
        // Unsigned shift: no signed overflow when a byte >= 0x80 lands in the
        // top position.
        result |= picked << (8u * n);
    }
    return result;
}
__device__ static inline unsigned int __hadd(int x, int y) {