From f1f38679e585eb41ec267909a2d802fcb9692944 Mon Sep 17 00:00:00 2001
From: Anusha GodavarthySurya
Date: Tue, 27 Apr 2021 07:19:11 -0700
Subject: [PATCH] SWDEV-271978 - Function __byte_perm to match CUDA behavior

Selector indices are as follows (the upper 16-bits of the selector are not used):
selector[0] = s<2:0>
selector[1] = s<6:4>
selector[2] = s<10:8>
selector[3] = s<14:12>

Each selected byte is widened to unsigned int before it is shifted into
place, so the packing never shifts a promoted signed int into its sign bit.

Change-Id: Ibf76c6ec2374f1f5b9bba8bd9dbd73660f830eea

[ROCm/hip commit: 1cc65c48a264c578cb93c9ea6cbc8249bf55b0f7]
---
Note: the added lines were revised after the referenced commit (explicit
unsigned widening, '|' packing, comments); the post-image blob id on the
"index" line below still refers to the original revision.

 .../hip/include/hip/amd_detail/device_functions.h | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/projects/hip/include/hip/amd_detail/device_functions.h b/projects/hip/include/hip/amd_detail/device_functions.h
index 0c4c79c538..703e72b5f1 100644
--- a/projects/hip/include/hip/amd_detail/device_functions.h
+++ b/projects/hip/include/hip/amd_detail/device_functions.h
@@ -147,15 +147,19 @@ __device__
 static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
     struct uchar2Holder cHoldVal;
     struct ucharHolder cHoldKey;
-    struct ucharHolder cHoldOut;
     cHoldKey.ui = s;
     cHoldVal.ui[0] = x;
     cHoldVal.ui[1] = y;
-    cHoldOut.c[0] = cHoldVal.c[cHoldKey.c[0]];
-    cHoldOut.c[1] = cHoldVal.c[cHoldKey.c[1]];
-    cHoldOut.c[2] = cHoldVal.c[cHoldKey.c[2]];
-    cHoldOut.c[3] = cHoldVal.c[cHoldKey.c[3]];
-    return cHoldOut.ui;
+    // Result byte n is cHoldVal.c[selector[n]], where the 3-bit selectors are
+    // s<2:0>, s<6:4>, s<10:8> and s<14:12>. Bit 3 of each nibble is masked off
+    // and the upper 16 bits of s (cHoldKey.c[2], cHoldKey.c[3]) are never read.
+    // Bytes are widened to unsigned int before shifting so that "<< 24" is done
+    // in unsigned arithmetic (assumes c[] holds unsigned char - TODO confirm).
+    unsigned int result = cHoldVal.c[cHoldKey.c[0] & 0x07];
+    result |= ((unsigned int)cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
+    result |= ((unsigned int)cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
+    result |= ((unsigned int)cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
+    return result;
 }
 
 __device__ static inline unsigned int __hadd(int x, int y) {