From fcba52f6f532ef2653e136a280685d2ace27dca1 Mon Sep 17 00:00:00 2001 From: Sarbojit Sarkar Date: Wed, 23 Sep 2020 07:23:09 -0400 Subject: [PATCH] SWDEV-253247: add ulong and ulonglong version of__shfl* Change-Id: I40ab6cfa12175f334e8392b71f567054d8256e2a [ROCm/hip commit: bf20337fc158dd710cb6c932222502c01bf3e3b4] --- .../include/hip/hcc_detail/device_functions.h | 138 +++++++++++++++++- .../hip/tests/src/kernel/hipShflTests.cpp | 17 ++- .../tests/src/kernel/hipShflUpDownTest.cpp | 64 +++++++- 3 files changed, 213 insertions(+), 6 deletions(-) diff --git a/projects/hip/include/hip/hcc_detail/device_functions.h b/projects/hip/include/hip/hcc_detail/device_functions.h index fb06e88abd..a5fe9425f4 100644 --- a/projects/hip/include/hip/hcc_detail/device_functions.h +++ b/projects/hip/include/hip/hcc_detail/device_functions.h @@ -365,6 +365,25 @@ long __shfl(long var, int src_lane, int width = warpSize) } __device__ inline +unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) { + #ifndef _MSC_VER + static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), ""); + static_assert(sizeof(unsigned long) == sizeof(uint64_t), ""); + + unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl(tmp[0], src_lane, width); + tmp[1] = __shfl(tmp[1], src_lane, width); + + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; + #else + static_assert(sizeof(unsigned long) == sizeof(unsigned int), ""); + return static_cast(__shfl(static_cast(var), src_lane, width)); + #endif +} +__device__ +inline long long __shfl(long long var, int src_lane, int width = warpSize) { static_assert(sizeof(long long) == 2 * sizeof(int), ""); @@ -378,8 +397,22 @@ long long __shfl(long long var, int src_lane, int width = warpSize) long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } +__device__ +inline +unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) { + static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), ""); + static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); - __device__ + unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl(tmp[0], src_lane, width); + tmp[1] = __shfl(tmp[1], src_lane, width); + + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; +} + +__device__ inline int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) { int self = __lane_id(); @@ -435,6 +468,28 @@ long __shfl_up(long var, unsigned int lane_delta, int width = warpSize) return static_cast(__shfl_up(static_cast(var), lane_delta, width)); #endif } + +__device__ +inline +unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize) +{ + #ifndef _MSC_VER + static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), ""); + static_assert(sizeof(unsigned long) == sizeof(uint64_t), ""); + + unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_up(tmp[0], lane_delta, width); + tmp[1] = __shfl_up(tmp[1], lane_delta, width); + + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; + #else + static_assert(sizeof(unsigned long) == sizeof(unsigned int), ""); + return static_cast(__shfl_up(static_cast(var), lane_delta, width)); + #endif +} + __device__ inline long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize) @@ -449,6 +504,20 @@ long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize return tmp1; } +__device__ +inline +unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize) +{ + static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), ""); + static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); + unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_up(tmp[0], lane_delta, width); + tmp[1] = __shfl_up(tmp[1], lane_delta, width); + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; +} + __device__ inline int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) { @@ -507,6 +576,26 @@ long __shfl_down(long var, unsigned int lane_delta, int width = warpSize) } __device__ inline +unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize) +{ + #ifndef _MSC_VER + static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), ""); + static_assert(sizeof(unsigned long) == sizeof(uint64_t), ""); + + unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_down(tmp[0], lane_delta, width); + tmp[1] = __shfl_down(tmp[1], lane_delta, width); + + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; + #else + static_assert(sizeof(unsigned long) == sizeof(unsigned int), ""); + return static_cast(__shfl_down(static_cast(var), lane_delta, width)); + #endif +} +__device__ +inline long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize) { static_assert(sizeof(long long) == 2 * sizeof(int), ""); @@ -518,6 +607,19 @@ long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSi long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } +__device__ +inline +unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize) +{ + static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), ""); + static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); + unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_down(tmp[0], lane_delta, width); + tmp[1] = __shfl_down(tmp[1], lane_delta, width); + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; +} __device__ inline @@ -577,6 +679,26 @@ long __shfl_xor(long var, int lane_mask, int width = warpSize) } __device__ inline +unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize) +{ + #ifndef _MSC_VER + static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), ""); + static_assert(sizeof(unsigned long) == sizeof(uint64_t), ""); + + unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_xor(tmp[0], lane_mask, width); + tmp[1] = __shfl_xor(tmp[1], lane_mask, width); + + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; + #else + static_assert(sizeof(unsigned long) == sizeof(unsigned int), ""); + return static_cast(__shfl_xor(static_cast(var), lane_mask, width)); + #endif +} +__device__ +inline long long __shfl_xor(long long var, int lane_mask, int width = warpSize) { static_assert(sizeof(long long) == 2 * sizeof(int), ""); @@ -588,7 +710,19 @@ long long __shfl_xor(long long var, int lane_mask, int width = warpSize) long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); return tmp1; } - +__device__ +inline +unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize) +{ + static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), ""); + static_assert(sizeof(unsigned long long) == sizeof(uint64_t), ""); + unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp)); + tmp[0] = __shfl_xor(tmp[0], lane_mask, width); + tmp[1] = __shfl_xor(tmp[1], lane_mask, width); + uint64_t tmp0 = (static_cast(tmp[1]) << 32ull) | static_cast(tmp[0]); + unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0)); + return tmp1; +} #define MASK1 0x00ff00ff #define MASK2 0xff00ff00 diff --git a/projects/hip/tests/src/kernel/hipShflTests.cpp b/projects/hip/tests/src/kernel/hipShflTests.cpp index 9b1cc73248..06b6a90b83 100644 --- a/projects/hip/tests/src/kernel/hipShflTests.cpp +++ b/projects/hip/tests/src/kernel/hipShflTests.cpp @@ -57,6 +57,15 @@ void matrixTransposeCPUReference(T* output, T* input, const unsigned int width) } } +void getFactor(int& fact) { fact = 101; } +void getFactor(unsigned int& fact) { fact = static_cast(INT32_MAX)+1; } +void getFactor(float& fact) { fact = 2.5; } +void getFactor(double& fact) { fact = 2.5; } +void getFactor(long& fact) { fact = 202; } +void getFactor(unsigned long& fact) { fact = static_cast(__LONG_MAX__)+1; } +void getFactor(long long& fact) { fact = 303; } +void getFactor(unsigned long long& fact) { fact = static_cast(__LONG_LONG_MAX__)+1; } + template void runTest() { T* Matrix; @@ -77,8 +86,10 @@ void runTest() { cpuTransposeMatrix = (T*)malloc(NUM * sizeof(T)); // initialize the input data + T factor; + getFactor(factor); for (i = 0; i < NUM; i++) { - Matrix[i] = (T)i * 10l; + Matrix[i] = (T)i + factor; } // allocate the memory on the device side @@ -124,7 +135,11 @@ void runTest() { int main() { runTest(); runTest(); + runTest(); runTest(); runTest(); + runTest(); + runTest(); + runTest(); passed(); } diff --git a/projects/hip/tests/src/kernel/hipShflUpDownTest.cpp b/projects/hip/tests/src/kernel/hipShflUpDownTest.cpp index 553087ce45..cd3900aee5 100644 --- a/projects/hip/tests/src/kernel/hipShflUpDownTest.cpp +++ b/projects/hip/tests/src/kernel/hipShflUpDownTest.cpp @@ -47,13 +47,31 @@ __global__ void shflUpSum(T* a, int size) { a[threadIdx.x] = val; } +template +__global__ void shflXorSum(T* a, int size) { + T val = a[threadIdx.x]; + for (int i = size/2; i > 0; i /= 2) + val += __shfl_xor(val, i, size); + a[threadIdx.x] = val; +} + +void getFactor(int& fact) { fact = 101; } +void getFactor(unsigned int& fact) { fact = static_cast(INT32_MAX)+1; } +void getFactor(float& fact) { fact = 2.5; } +void getFactor(double& fact) { fact = 2.5; } +void getFactor(long& fact) { fact = 202; } +void getFactor(unsigned long& fact) { fact = static_cast(__LONG_MAX__)+1; } +void getFactor(long long& fact) { fact = 303; } +void getFactor(unsigned long long& fact) { fact = static_cast(__LONG_LONG_MAX__)+1; } + template void runTestShflUp() { const int size = 32; T a[size]; T cpuSum = 0; + T factor; getFactor(factor); for (int i = 0; i < size; i++) { - a[i] = i; + a[i] = i + factor; cpuSum += a[i]; } T* d_a; @@ -73,8 +91,9 @@ void runTestShflDown() { const int size = 32; T a[size]; T cpuSum = 0; + T factor; getFactor(factor); for (int i = 0; i < size; i++) { - a[i] = i; + a[i] = i + factor; cpuSum += a[i]; } T* d_a; @@ -84,19 +103,58 @@ void runTestShflDown() { hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault); if (a[0] != cpuSum) { hipFree(d_a); - failed("Shfl Up Sum did not match."); + failed("Shfl Down Sum did not match."); + } + hipFree(d_a); +} + +template +void runTestShflXor() { + const int size = 32; + T a[size]; + T cpuSum = 0; + T factor; getFactor(factor); + for (int i = 0; i < size; i++) { + a[i] = i + factor; + cpuSum += a[i]; + } + T* d_a; + hipMalloc(&d_a, sizeof(T) * size); + hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault); + hipLaunchKernelGGL(shflXorSum, 1, size, 0, 0, d_a, size); + hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault); + if (a[0] != cpuSum) { + hipFree(d_a); + failed("Shfl Xor Sum did not match."); } hipFree(d_a); } int main() { runTestShflUp(); runTestShflUp(); + runTestShflUp(); runTestShflUp(); runTestShflUp(); + runTestShflUp(); + runTestShflUp(); + runTestShflUp(); runTestShflDown(); runTestShflDown(); + runTestShflDown(); runTestShflDown(); runTestShflDown(); + runTestShflDown(); + runTestShflDown(); + runTestShflDown(); + + runTestShflXor(); + runTestShflXor(); + runTestShflXor(); + runTestShflXor(); + runTestShflXor(); + runTestShflXor(); + runTestShflXor(); + runTestShflXor(); passed(); }