SWDEV-271978 - Fix function __byte_perm to match CUDA behavior
Selector indices are as follows (the upper 16 bits of the selector are not used): selector[0] = s<2:0>, selector[1] = s<6:4>, selector[2] = s<10:8>, selector[3] = s<14:12>.
Change-Id: Ibf76c6ec2374f1f5b9bba8bd9dbd73660f830eea
[ROCm/hip commit: 1cc65c48a2]
This commit is contained in:
committed by
Anusha Godavarthy Surya
parent
9d1462020d
commit
f1f38679e5
@@ -147,15 +147,15 @@ __device__
|
||||
static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
|
||||
struct uchar2Holder cHoldVal;
|
||||
struct ucharHolder cHoldKey;
|
||||
struct ucharHolder cHoldOut;
|
||||
cHoldKey.ui = s;
|
||||
cHoldVal.ui[0] = x;
|
||||
cHoldVal.ui[1] = y;
|
||||
cHoldOut.c[0] = cHoldVal.c[cHoldKey.c[0]];
|
||||
cHoldOut.c[1] = cHoldVal.c[cHoldKey.c[1]];
|
||||
cHoldOut.c[2] = cHoldVal.c[cHoldKey.c[2]];
|
||||
cHoldOut.c[3] = cHoldVal.c[cHoldKey.c[3]];
|
||||
return cHoldOut.ui;
|
||||
unsigned int result;
|
||||
result = cHoldVal.c[cHoldKey.c[0] & 0x07];
|
||||
result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
|
||||
result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
|
||||
result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
|
||||
return result;
|
||||
}
|
||||
|
||||
__device__ static inline unsigned int __hadd(int x, int y) {
|
||||
|
||||
Reference in New Issue
Block a user