SWDEV-253247: add ulong and ulonglong version of__shfl*
Change-Id: I40ab6cfa12175f334e8392b71f567054d8256e2a
[ROCm/hip commit: bf20337fc1]
This commit is contained in:
zatwierdzone przez
Sarbojit Sarkar
rodzic
bd685759d2
commit
fcba52f6f5
@@ -365,6 +365,25 @@ long __shfl(long var, int src_lane, int width = warpSize)
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
|
||||
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl(tmp[0], src_lane, width);
|
||||
tmp[1] = __shfl(tmp[1], src_lane, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
|
||||
return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
|
||||
#endif
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
long long __shfl(long long var, int src_lane, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(long long) == 2 * sizeof(int), "");
|
||||
@@ -378,8 +397,22 @@ long long __shfl(long long var, int src_lane, int width = warpSize)
|
||||
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
|
||||
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
|
||||
|
||||
__device__
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl(tmp[0], src_lane, width);
|
||||
tmp[1] = __shfl(tmp[1], src_lane, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
|
||||
int self = __lane_id();
|
||||
@@ -435,6 +468,28 @@ long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
|
||||
return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
|
||||
{
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
|
||||
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
|
||||
return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
|
||||
@@ -449,6 +504,20 @@ long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize
|
||||
return tmp1;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
|
||||
@@ -507,6 +576,26 @@ long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
|
||||
{
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
|
||||
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
|
||||
return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
|
||||
#endif
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(long long) == 2 * sizeof(int), "");
|
||||
@@ -518,6 +607,19 @@ long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSi
|
||||
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
|
||||
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
@@ -577,6 +679,26 @@ long __shfl_xor(long var, int lane_mask, int width = warpSize)
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
|
||||
{
|
||||
#ifndef _MSC_VER
|
||||
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
|
||||
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
|
||||
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
|
||||
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
#else
|
||||
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
|
||||
return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
|
||||
#endif
|
||||
}
|
||||
__device__
|
||||
inline
|
||||
long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(long long) == 2 * sizeof(int), "");
|
||||
@@ -588,7 +710,19 @@ long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
|
||||
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
|
||||
{
|
||||
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
|
||||
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
|
||||
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
|
||||
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
|
||||
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
|
||||
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
|
||||
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
||||
return tmp1;
|
||||
}
|
||||
#define MASK1 0x00ff00ff
|
||||
#define MASK2 0xff00ff00
|
||||
|
||||
|
||||
@@ -57,6 +57,15 @@ void matrixTransposeCPUReference(T* output, T* input, const unsigned int width)
|
||||
}
|
||||
}
|
||||
|
||||
void getFactor(int& fact) { fact = 101; }
|
||||
void getFactor(unsigned int& fact) { fact = static_cast<unsigned int>(INT32_MAX)+1; }
|
||||
void getFactor(float& fact) { fact = 2.5; }
|
||||
void getFactor(double& fact) { fact = 2.5; }
|
||||
void getFactor(long& fact) { fact = 202; }
|
||||
void getFactor(unsigned long& fact) { fact = static_cast<unsigned long>(__LONG_MAX__)+1; }
|
||||
void getFactor(long long& fact) { fact = 303; }
|
||||
void getFactor(unsigned long long& fact) { fact = static_cast<unsigned long long>(__LONG_LONG_MAX__)+1; }
|
||||
|
||||
template<typename T>
|
||||
void runTest() {
|
||||
T* Matrix;
|
||||
@@ -77,8 +86,10 @@ void runTest() {
|
||||
cpuTransposeMatrix = (T*)malloc(NUM * sizeof(T));
|
||||
|
||||
// initialize the input data
|
||||
T factor;
|
||||
getFactor(factor);
|
||||
for (i = 0; i < NUM; i++) {
|
||||
Matrix[i] = (T)i * 10l;
|
||||
Matrix[i] = (T)i + factor;
|
||||
}
|
||||
|
||||
// allocate the memory on the device side
|
||||
@@ -124,7 +135,11 @@ void runTest() {
|
||||
int main() {
|
||||
runTest<int>();
|
||||
runTest<float>();
|
||||
runTest<double>();
|
||||
runTest<long>();
|
||||
runTest<long long>();
|
||||
runTest<unsigned int>();
|
||||
runTest<unsigned long>();
|
||||
runTest<unsigned long long>();
|
||||
passed();
|
||||
}
|
||||
|
||||
@@ -47,13 +47,31 @@ __global__ void shflUpSum(T* a, int size) {
|
||||
a[threadIdx.x] = val;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void shflXorSum(T* a, int size) {
|
||||
T val = a[threadIdx.x];
|
||||
for (int i = size/2; i > 0; i /= 2)
|
||||
val += __shfl_xor(val, i, size);
|
||||
a[threadIdx.x] = val;
|
||||
}
|
||||
|
||||
void getFactor(int& fact) { fact = 101; }
|
||||
void getFactor(unsigned int& fact) { fact = static_cast<unsigned int>(INT32_MAX)+1; }
|
||||
void getFactor(float& fact) { fact = 2.5; }
|
||||
void getFactor(double& fact) { fact = 2.5; }
|
||||
void getFactor(long& fact) { fact = 202; }
|
||||
void getFactor(unsigned long& fact) { fact = static_cast<unsigned long>(__LONG_MAX__)+1; }
|
||||
void getFactor(long long& fact) { fact = 303; }
|
||||
void getFactor(unsigned long long& fact) { fact = static_cast<unsigned long long>(__LONG_LONG_MAX__)+1; }
|
||||
|
||||
template <typename T>
|
||||
void runTestShflUp() {
|
||||
const int size = 32;
|
||||
T a[size];
|
||||
T cpuSum = 0;
|
||||
T factor; getFactor(factor);
|
||||
for (int i = 0; i < size; i++) {
|
||||
a[i] = i;
|
||||
a[i] = i + factor;
|
||||
cpuSum += a[i];
|
||||
}
|
||||
T* d_a;
|
||||
@@ -73,8 +91,9 @@ void runTestShflDown() {
|
||||
const int size = 32;
|
||||
T a[size];
|
||||
T cpuSum = 0;
|
||||
T factor; getFactor(factor);
|
||||
for (int i = 0; i < size; i++) {
|
||||
a[i] = i;
|
||||
a[i] = i + factor;
|
||||
cpuSum += a[i];
|
||||
}
|
||||
T* d_a;
|
||||
@@ -84,19 +103,58 @@ void runTestShflDown() {
|
||||
hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault);
|
||||
if (a[0] != cpuSum) {
|
||||
hipFree(d_a);
|
||||
failed("Shfl Up Sum did not match.");
|
||||
failed("Shfl Down Sum did not match.");
|
||||
}
|
||||
hipFree(d_a);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void runTestShflXor() {
|
||||
const int size = 32;
|
||||
T a[size];
|
||||
T cpuSum = 0;
|
||||
T factor; getFactor(factor);
|
||||
for (int i = 0; i < size; i++) {
|
||||
a[i] = i + factor;
|
||||
cpuSum += a[i];
|
||||
}
|
||||
T* d_a;
|
||||
hipMalloc(&d_a, sizeof(T) * size);
|
||||
hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault);
|
||||
hipLaunchKernelGGL(shflXorSum<T>, 1, size, 0, 0, d_a, size);
|
||||
hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault);
|
||||
if (a[0] != cpuSum) {
|
||||
hipFree(d_a);
|
||||
failed("Shfl Xor Sum did not match.");
|
||||
}
|
||||
hipFree(d_a);
|
||||
}
|
||||
int main() {
|
||||
runTestShflUp<int>();
|
||||
runTestShflUp<float>();
|
||||
runTestShflUp<double>();
|
||||
runTestShflUp<long>();
|
||||
runTestShflUp<long long>();
|
||||
runTestShflUp<unsigned int>();
|
||||
runTestShflUp<unsigned long>();
|
||||
runTestShflUp<unsigned long long>();
|
||||
|
||||
runTestShflDown<int>();
|
||||
runTestShflDown<float>();
|
||||
runTestShflDown<double>();
|
||||
runTestShflDown<long>();
|
||||
runTestShflDown<long long>();
|
||||
runTestShflDown<unsigned int>();
|
||||
runTestShflDown<unsigned long>();
|
||||
runTestShflDown<unsigned long long>();
|
||||
|
||||
runTestShflXor<int>();
|
||||
runTestShflXor<float>();
|
||||
runTestShflXor<double>();
|
||||
runTestShflXor<long>();
|
||||
runTestShflXor<long long>();
|
||||
runTestShflXor<unsigned int>();
|
||||
runTestShflXor<unsigned long>();
|
||||
runTestShflXor<unsigned long long>();
|
||||
passed();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user