SWDEV-253247: add ulong and ulonglong version of__shfl*

Change-Id: I40ab6cfa12175f334e8392b71f567054d8256e2a


[ROCm/hip commit: bf20337fc1]
This commit is contained in:
Sarbojit Sarkar
2020-09-23 07:23:09 -04:00
zatwierdzone przez Sarbojit Sarkar
rodzic bd685759d2
commit fcba52f6f5
3 zmienionych plików z 213 dodań i 6 usunięć
@@ -365,6 +365,25 @@ long __shfl(long var, int src_lane, int width = warpSize)
}
__device__
inline
unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
#ifndef _MSC_VER
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl(tmp[0], src_lane, width);
tmp[1] = __shfl(tmp[1], src_lane, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
#endif
}
__device__
inline
long long __shfl(long long var, int src_lane, int width = warpSize)
{
static_assert(sizeof(long long) == 2 * sizeof(int), "");
@@ -378,8 +397,22 @@ long long __shfl(long long var, int src_lane, int width = warpSize)
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
__device__
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl(tmp[0], src_lane, width);
tmp[1] = __shfl(tmp[1], src_lane, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
int self = __lane_id();
@@ -435,6 +468,28 @@ long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
#endif
}
__device__
inline
unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
{
#ifndef _MSC_VER
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
#endif
}
__device__
inline
long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
@@ -449,6 +504,20 @@ long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize
return tmp1;
}
__device__
inline
unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
{
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
@@ -507,6 +576,26 @@ long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
}
__device__
inline
unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
{
#ifndef _MSC_VER
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
#endif
}
__device__
inline
long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
{
static_assert(sizeof(long long) == 2 * sizeof(int), "");
@@ -518,6 +607,19 @@ long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSi
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
{
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
@@ -577,6 +679,26 @@ long __shfl_xor(long var, int lane_mask, int width = warpSize)
}
__device__
inline
unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
{
#ifndef _MSC_VER
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
#endif
}
__device__
inline
long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
{
static_assert(sizeof(long long) == 2 * sizeof(int), "");
@@ -588,7 +710,19 @@ long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
{
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00
@@ -57,6 +57,15 @@ void matrixTransposeCPUReference(T* output, T* input, const unsigned int width)
}
}
void getFactor(int& fact) { fact = 101; }
void getFactor(unsigned int& fact) { fact = static_cast<unsigned int>(INT32_MAX)+1; }
void getFactor(float& fact) { fact = 2.5; }
void getFactor(double& fact) { fact = 2.5; }
void getFactor(long& fact) { fact = 202; }
void getFactor(unsigned long& fact) { fact = static_cast<unsigned long>(__LONG_MAX__)+1; }
void getFactor(long long& fact) { fact = 303; }
void getFactor(unsigned long long& fact) { fact = static_cast<unsigned long long>(__LONG_LONG_MAX__)+1; }
template<typename T>
void runTest() {
T* Matrix;
@@ -77,8 +86,10 @@ void runTest() {
cpuTransposeMatrix = (T*)malloc(NUM * sizeof(T));
// initialize the input data
T factor;
getFactor(factor);
for (i = 0; i < NUM; i++) {
Matrix[i] = (T)i * 10l;
Matrix[i] = (T)i + factor;
}
// allocate the memory on the device side
@@ -124,7 +135,11 @@ void runTest() {
int main() {
runTest<int>();
runTest<float>();
runTest<double>();
runTest<long>();
runTest<long long>();
runTest<unsigned int>();
runTest<unsigned long>();
runTest<unsigned long long>();
passed();
}
@@ -47,13 +47,31 @@ __global__ void shflUpSum(T* a, int size) {
a[threadIdx.x] = val;
}
template <typename T>
__global__ void shflXorSum(T* a, int size) {
T val = a[threadIdx.x];
for (int i = size/2; i > 0; i /= 2)
val += __shfl_xor(val, i, size);
a[threadIdx.x] = val;
}
void getFactor(int& fact) { fact = 101; }
void getFactor(unsigned int& fact) { fact = static_cast<unsigned int>(INT32_MAX)+1; }
void getFactor(float& fact) { fact = 2.5; }
void getFactor(double& fact) { fact = 2.5; }
void getFactor(long& fact) { fact = 202; }
void getFactor(unsigned long& fact) { fact = static_cast<unsigned long>(__LONG_MAX__)+1; }
void getFactor(long long& fact) { fact = 303; }
void getFactor(unsigned long long& fact) { fact = static_cast<unsigned long long>(__LONG_LONG_MAX__)+1; }
template <typename T>
void runTestShflUp() {
const int size = 32;
T a[size];
T cpuSum = 0;
T factor; getFactor(factor);
for (int i = 0; i < size; i++) {
a[i] = i;
a[i] = i + factor;
cpuSum += a[i];
}
T* d_a;
@@ -73,8 +91,9 @@ void runTestShflDown() {
const int size = 32;
T a[size];
T cpuSum = 0;
T factor; getFactor(factor);
for (int i = 0; i < size; i++) {
a[i] = i;
a[i] = i + factor;
cpuSum += a[i];
}
T* d_a;
@@ -84,19 +103,58 @@ void runTestShflDown() {
hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault);
if (a[0] != cpuSum) {
hipFree(d_a);
failed("Shfl Up Sum did not match.");
failed("Shfl Down Sum did not match.");
}
hipFree(d_a);
}
template <typename T>
void runTestShflXor() {
const int size = 32;
T a[size];
T cpuSum = 0;
T factor; getFactor(factor);
for (int i = 0; i < size; i++) {
a[i] = i + factor;
cpuSum += a[i];
}
T* d_a;
hipMalloc(&d_a, sizeof(T) * size);
hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault);
hipLaunchKernelGGL(shflXorSum<T>, 1, size, 0, 0, d_a, size);
hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault);
if (a[0] != cpuSum) {
hipFree(d_a);
failed("Shfl Xor Sum did not match.");
}
hipFree(d_a);
}
int main() {
runTestShflUp<int>();
runTestShflUp<float>();
runTestShflUp<double>();
runTestShflUp<long>();
runTestShflUp<long long>();
runTestShflUp<unsigned int>();
runTestShflUp<unsigned long>();
runTestShflUp<unsigned long long>();
runTestShflDown<int>();
runTestShflDown<float>();
runTestShflDown<double>();
runTestShflDown<long>();
runTestShflDown<long long>();
runTestShflDown<unsigned int>();
runTestShflDown<unsigned long>();
runTestShflDown<unsigned long long>();
runTestShflXor<int>();
runTestShflXor<float>();
runTestShflXor<double>();
runTestShflXor<long>();
runTestShflXor<long long>();
runTestShflXor<unsigned int>();
runTestShflXor<unsigned long>();
runTestShflXor<unsigned long long>();
passed();
}