388 lines
13 KiB
C++
388 lines
13 KiB
C++
/*
|
|
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#include <hip/device_functions.h>
|
|
#include <hc.hpp>
|
|
#include <grid_launch.h>
|
|
#include <hc_math.hpp>
|
|
#include "device_util.h"
|
|
|
|
__device__ float __double2float_rd(double x) { return (double)x; }
|
|
__device__ float __double2float_rn(double x) { return (double)x; }
|
|
__device__ float __double2float_ru(double x) { return (double)x; }
|
|
__device__ float __double2float_rz(double x) { return (double)x; }
|
|
|
|
|
|
__device__ int __double2hiint(double x) {
|
|
static_assert(sizeof(double) == 2 * sizeof(int), "");
|
|
|
|
int tmp[2];
|
|
__builtin_memcpy(tmp, &x, sizeof(tmp));
|
|
|
|
return tmp[1];
|
|
}
|
|
__device__ int __double2loint(double x) {
|
|
static_assert(sizeof(double) == 2 * sizeof(int), "");
|
|
|
|
int tmp[2];
|
|
__builtin_memcpy(tmp, &x, sizeof(tmp));
|
|
|
|
return tmp[0];
|
|
}
|
|
|
|
|
|
__device__ int __double2int_rd(double x) { return (int)x; }
|
|
__device__ int __double2int_rn(double x) { return (int)x; }
|
|
__device__ int __double2int_ru(double x) { return (int)x; }
|
|
__device__ int __double2int_rz(double x) { return (int)x; }
|
|
|
|
__device__ long long int __double2ll_rd(double x) { return (long long int)x; }
|
|
__device__ long long int __double2ll_rn(double x) { return (long long int)x; }
|
|
__device__ long long int __double2ll_ru(double x) { return (long long int)x; }
|
|
__device__ long long int __double2ll_rz(double x) { return (long long int)x; }
|
|
|
|
|
|
__device__ unsigned int __double2uint_rd(double x) { return (unsigned int)x; }
|
|
__device__ unsigned int __double2uint_rn(double x) { return (unsigned int)x; }
|
|
__device__ unsigned int __double2uint_ru(double x) { return (unsigned int)x; }
|
|
__device__ unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
|
|
|
|
__device__ unsigned long long int __double2ull_rd(double x) { return (unsigned long long int)x; }
|
|
__device__ unsigned long long int __double2ull_rn(double x) { return (unsigned long long int)x; }
|
|
__device__ unsigned long long int __double2ull_ru(double x) { return (unsigned long long int)x; }
|
|
__device__ unsigned long long int __double2ull_rz(double x) { return (unsigned long long int)x; }
|
|
|
|
__device__ long long int __double_as_longlong(double x) {
|
|
static_assert(sizeof(long long) == sizeof(double), "");
|
|
|
|
long long tmp;
|
|
__builtin_memcpy(&tmp, &x, sizeof(tmp));
|
|
|
|
return tmp;
|
|
}
|
|
|
|
__device__ int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
|
|
__device__ int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
|
|
__device__ int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
|
|
__device__ int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
|
|
|
|
__device__ long long int __float2ll_rd(float x) { return (long long int)x; }
|
|
__device__ long long int __float2ll_rn(float x) { return (long long int)x; }
|
|
__device__ long long int __float2ll_ru(float x) { return (long long int)x; }
|
|
__device__ long long int __float2ll_rz(float x) { return (long long int)x; }
|
|
|
|
__device__ unsigned int __float2uint_rd(float x) { return (unsigned int)x; }
|
|
__device__ unsigned int __float2uint_rn(float x) { return (unsigned int)x; }
|
|
__device__ unsigned int __float2uint_ru(float x) { return (unsigned int)x; }
|
|
__device__ unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
|
|
|
|
__device__ unsigned long long int __float2ull_rd(float x) { return (unsigned long long int)x; }
|
|
__device__ unsigned long long int __float2ull_rn(float x) { return (unsigned long long int)x; }
|
|
__device__ unsigned long long int __float2ull_ru(float x) { return (unsigned long long int)x; }
|
|
__device__ unsigned long long int __float2ull_rz(float x) { return (unsigned long long int)x; }
|
|
|
|
__device__ int __float_as_int(float x) {
|
|
static_assert(sizeof(int) == sizeof(float), "");
|
|
|
|
int tmp;
|
|
__builtin_memcpy(&tmp, &x, sizeof(tmp));
|
|
|
|
return tmp;
|
|
}
|
|
__device__ unsigned int __float_as_uint(float x) {
|
|
static_assert(sizeof(unsigned int) == sizeof(float), "");
|
|
|
|
unsigned int tmp;
|
|
__builtin_memcpy(&tmp, &x, sizeof(tmp));
|
|
|
|
return tmp;
|
|
}
|
|
__device__ double __hiloint2double(int32_t hi, int32_t lo) {
|
|
static_assert(sizeof(double) == sizeof(uint64_t), "");
|
|
|
|
uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
|
|
double tmp1;
|
|
__builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
|
|
|
|
return tmp1;
|
|
}
|
|
__device__ double __int2double_rn(int x) { return (double)x; }
|
|
|
|
__device__ float __int2float_rd(int x) { return (float)x; }
|
|
__device__ float __int2float_rn(int x) { return (float)x; }
|
|
__device__ float __int2float_ru(int x) { return (float)x; }
|
|
__device__ float __int2float_rz(int x) { return (float)x; }
|
|
|
|
__device__ float __int_as_float(int x) {
|
|
static_assert(sizeof(float) == sizeof(int), "");
|
|
|
|
float tmp;
|
|
__builtin_memcpy(&tmp, &x, sizeof(tmp));
|
|
|
|
return tmp;
|
|
}
|
|
|
|
__device__ double __ll2double_rd(long long int x) { return (double)x; }
|
|
__device__ double __ll2double_rn(long long int x) { return (double)x; }
|
|
__device__ double __ll2double_ru(long long int x) { return (double)x; }
|
|
__device__ double __ll2double_rz(long long int x) { return (double)x; }
|
|
|
|
__device__ float __ll2float_rd(long long int x) { return (float)x; }
|
|
__device__ float __ll2float_rn(long long int x) { return (float)x; }
|
|
__device__ float __ll2float_ru(long long int x) { return (float)x; }
|
|
__device__ float __ll2float_rz(long long int x) { return (float)x; }
|
|
|
|
__device__ double __longlong_as_double(long long int x) {
|
|
static_assert(sizeof(double) == sizeof(long long), "");
|
|
|
|
double tmp;
|
|
__builtin_memcpy(&tmp, &x, sizeof(tmp));
|
|
|
|
return x;
|
|
}
|
|
|
|
__device__ double __uint2double_rn(int x) { return (double)x; }
|
|
|
|
__device__ float __uint2float_rd(unsigned int x) { return (float)x; }
|
|
__device__ float __uint2float_rn(unsigned int x) { return (float)x; }
|
|
__device__ float __uint2float_ru(unsigned int x) { return (float)x; }
|
|
__device__ float __uint2float_rz(unsigned int x) { return (float)x; }
|
|
|
|
__device__ float __uint_as_float(unsigned int x) {
|
|
static_assert(sizeof(float) == sizeof(unsigned int), "");
|
|
|
|
float tmp;
|
|
__builtin_memcpy(&tmp, &x, sizeof(tmp));
|
|
|
|
return tmp;
|
|
}
|
|
|
|
__device__ double __ull2double_rd(unsigned long long int x) { return (double)x; }
|
|
__device__ double __ull2double_rn(unsigned long long int x) { return (double)x; }
|
|
__device__ double __ull2double_ru(unsigned long long int x) { return (double)x; }
|
|
__device__ double __ull2double_rz(unsigned long long int x) { return (double)x; }
|
|
|
|
__device__ float __ull2float_rd(unsigned long long int x) { return (float)x; }
|
|
__device__ float __ull2float_rn(unsigned long long int x) { return (float)x; }
|
|
__device__ float __ull2float_ru(unsigned long long int x) { return (float)x; }
|
|
__device__ float __ull2float_rz(unsigned long long int x) { return (float)x; }
|
|
|
|
/*
|
|
Integer Intrinsics
|
|
*/
|
|
|
|
// integer intrinsic function __poc __clz __ffs __brev
|
|
__device__ unsigned int __popc(unsigned int input) { return hc::__popcount_u32_b32(input); }
|
|
|
|
__device__ unsigned int __popcll(unsigned long long int input) {
|
|
return hc::__popcount_u32_b64(input);
|
|
}
|
|
|
|
__device__ unsigned int __clz(unsigned int input) {
|
|
#ifdef NVCC_COMPAT
|
|
return input == 0 ? 32 : hc::__firstbit_u32_u32(input);
|
|
#else
|
|
return hc::__firstbit_u32_u32(input);
|
|
#endif
|
|
}
|
|
|
|
__device__ unsigned int __clzll(unsigned long long int input) {
|
|
#ifdef NVCC_COMPAT
|
|
return input == 0 ? 64 : hc::__firstbit_u32_u64(input);
|
|
#else
|
|
return hc::__firstbit_u32_u64(input);
|
|
#endif
|
|
}
|
|
|
|
__device__ unsigned int __clz(int input) {
|
|
#ifdef NVCC_COMPAT
|
|
return input == 0 ? 32 : hc::__firstbit_u32_s32(input);
|
|
#else
|
|
return hc::__firstbit_u32_s32(input);
|
|
#endif
|
|
}
|
|
|
|
__device__ unsigned int __clzll(long long int input) {
|
|
#ifdef NVCC_COMPAT
|
|
return input == 0 ? 64 : hc::__firstbit_u32_s64(input);
|
|
#else
|
|
return hc::__firstbit_u32_s64(input);
|
|
#endif
|
|
}
|
|
|
|
__device__ unsigned int __ffs(unsigned int input) {
|
|
#ifdef NVCC_COMPAT
|
|
return hc::__lastbit_u32_u32(input) + 1;
|
|
#else
|
|
return hc::__lastbit_u32_u32(input);
|
|
#endif
|
|
}
|
|
|
|
__device__ unsigned int __ffsll(unsigned long long int input) {
|
|
#ifdef NVCC_COMPAT
|
|
return hc::__lastbit_u32_u64(input) + 1;
|
|
#else
|
|
return hc::__lastbit_u32_u64(input);
|
|
#endif
|
|
}
|
|
|
|
__device__ unsigned int __ffs(int input) {
|
|
#ifdef NVCC_COMPAT
|
|
return hc::__lastbit_u32_s32(input) + 1;
|
|
#else
|
|
return hc::__lastbit_u32_s32(input);
|
|
#endif
|
|
}
|
|
|
|
__device__ unsigned int __ffsll(long long int input) {
|
|
#ifdef NVCC_COMPAT
|
|
return hc::__lastbit_u32_s64(input) + 1;
|
|
#else
|
|
return hc::__lastbit_u32_s64(input);
|
|
#endif
|
|
}
|
|
|
|
__device__ unsigned int __brev(unsigned int input) { return hc::__bitrev_b32(input); }
|
|
|
|
__device__ unsigned long long int __brevll(unsigned long long int input) {
|
|
return hc::__bitrev_b64(input);
|
|
}
|
|
|
|
struct ucharHolder {
|
|
union {
|
|
unsigned char c[4];
|
|
unsigned int ui;
|
|
};
|
|
} __attribute__((aligned(4)));
|
|
|
|
struct uchar2Holder {
|
|
union {
|
|
unsigned int ui[2];
|
|
unsigned char c[8];
|
|
};
|
|
} __attribute__((aligned(8)));
|
|
|
|
__device__ unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
|
|
struct uchar2Holder cHoldVal;
|
|
struct ucharHolder cHoldKey;
|
|
struct ucharHolder cHoldOut;
|
|
cHoldKey.ui = s;
|
|
cHoldVal.ui[0] = x;
|
|
cHoldVal.ui[1] = y;
|
|
cHoldOut.c[0] = cHoldVal.c[cHoldKey.c[0]];
|
|
cHoldOut.c[1] = cHoldVal.c[cHoldKey.c[1]];
|
|
cHoldOut.c[2] = cHoldVal.c[cHoldKey.c[2]];
|
|
cHoldOut.c[3] = cHoldVal.c[cHoldKey.c[3]];
|
|
return cHoldOut.ui;
|
|
}
|
|
|
|
__device__ long long __mul64hi(long long int x, long long int y) {
|
|
ulong x0 = (ulong)x & 0xffffffffUL;
|
|
long x1 = x >> 32;
|
|
ulong y0 = (ulong)y & 0xffffffffUL;
|
|
long y1 = y >> 32;
|
|
ulong z0 = x0*y0;
|
|
long t = x1*y0 + (z0 >> 32);
|
|
long z1 = t & 0xffffffffL;
|
|
long z2 = t >> 32;
|
|
z1 = x0*y1 + z1;
|
|
return x1*y1 + z2 + (z1 >> 32);
|
|
}
|
|
|
|
__device__ unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
|
|
ulong x0 = x & 0xffffffffUL;
|
|
ulong x1 = x >> 32;
|
|
ulong y0 = y & 0xffffffffUL;
|
|
ulong y1 = y >> 32;
|
|
ulong z0 = x0*y0;
|
|
ulong t = x1*y0 + (z0 >> 32);
|
|
ulong z1 = t & 0xffffffffUL;
|
|
ulong z2 = t >> 32;
|
|
z1 = x0*y1 + z1;
|
|
return x1*y1 + z2 + (z1 >> 32);
|
|
}
|
|
|
|
/*
|
|
HIP specific device functions
|
|
*/
|
|
|
|
__device__ unsigned __hip_ds_bpermute(int index, unsigned src) {
|
|
return hc::__amdgcn_ds_bpermute(index, src);
|
|
}
|
|
|
|
__device__ float __hip_ds_bpermutef(int index, float src) {
|
|
return hc::__amdgcn_ds_bpermute(index, src);
|
|
}
|
|
|
|
__device__ unsigned __hip_ds_permute(int index, unsigned src) {
|
|
return hc::__amdgcn_ds_permute(index, src);
|
|
}
|
|
|
|
__device__ float __hip_ds_permutef(int index, float src) {
|
|
return hc::__amdgcn_ds_permute(index, src);
|
|
}
|
|
|
|
__device__ unsigned __hip_ds_swizzle(unsigned int src, int pattern) {
|
|
return hc::__amdgcn_ds_swizzle(src, pattern);
|
|
}
|
|
|
|
__device__ float __hip_ds_swizzlef(float src, int pattern) {
|
|
return hc::__amdgcn_ds_swizzle(src, pattern);
|
|
}
|
|
|
|
__device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl) {
|
|
return hc::__amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
|
|
}
|
|
|
|
#define MASK1 0x00ff00ff
|
|
#define MASK2 0xff00ff00
|
|
|
|
__device__ char4 __hip_hc_add8pk(char4 in1, char4 in2) {
|
|
char4 out;
|
|
unsigned one1 = in1.w & MASK1;
|
|
unsigned one2 = in2.w & MASK1;
|
|
out.w = (one1 + one2) & MASK1;
|
|
one1 = in1.w & MASK2;
|
|
one2 = in2.w & MASK2;
|
|
out.w = out.w | ((one1 + one2) & MASK2);
|
|
return out;
|
|
}
|
|
|
|
__device__ char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
|
|
char4 out;
|
|
unsigned one1 = in1.w & MASK1;
|
|
unsigned one2 = in2.w & MASK1;
|
|
out.w = (one1 - one2) & MASK1;
|
|
one1 = in1.w & MASK2;
|
|
one2 = in2.w & MASK2;
|
|
out.w = out.w | ((one1 - one2) & MASK2);
|
|
return out;
|
|
}
|
|
|
|
__device__ char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
|
|
char4 out;
|
|
unsigned one1 = in1.w & MASK1;
|
|
unsigned one2 = in2.w & MASK1;
|
|
out.w = (one1 * one2) & MASK1;
|
|
one1 = in1.w & MASK2;
|
|
one2 = in2.w & MASK2;
|
|
out.w = out.w | ((one1 * one2) & MASK2);
|
|
return out;
|
|
}
|