From 36e805cf76d6a1f52449c124df313a143acb9c68 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 23 May 2018 17:57:09 +0100 Subject: [PATCH] Re-factor half support to match CUDA whilst exploiting native support. --- hipamd/CMakeLists.txt | 1 - hipamd/include/hip/hcc_detail/hip_fp16.h | 2041 +++++++++++++---- hipamd/include/hip/hcc_detail/hip_fp16_gcc.h | 252 ++ .../hip/hcc_detail/hip_fp16_math_fwd.h | 76 + hipamd/tests/src/deviceLib/hipTestHalf.cpp | 48 +- .../tests/src/deviceLib/hipTestNativeHalf.cpp | 142 ++ 6 files changed, 2128 insertions(+), 432 deletions(-) create mode 100644 hipamd/include/hip/hcc_detail/hip_fp16_gcc.h create mode 100644 hipamd/include/hip/hcc_detail/hip_fp16_math_fwd.h create mode 100644 hipamd/tests/src/deviceLib/hipTestNativeHalf.cpp diff --git a/hipamd/CMakeLists.txt b/hipamd/CMakeLists.txt index dcc9e3c6cd..2c90c3f0de 100644 --- a/hipamd/CMakeLists.txt +++ b/hipamd/CMakeLists.txt @@ -195,7 +195,6 @@ if(HIP_PLATFORM STREQUAL "hcc") set(SOURCE_FILES_DEVICE src/device_util.cpp src/hip_ldg.cpp - src/hip_fp16.cpp src/device_functions.cpp src/math_functions.cpp) diff --git a/hipamd/include/hip/hcc_detail/hip_fp16.h b/hipamd/include/hip/hcc_detail/hip_fp16.h index fe8414ca87..46644504bd 100644 --- a/hipamd/include/hip/hcc_detail/hip_fp16.h +++ b/hipamd/include/hip/hcc_detail/hip_fp16.h @@ -20,420 +20,1631 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_FP16_H -#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_FP16_H - -#include "hip/hcc_detail/hip_vector_types.h" -#if (__clang_major__ > 3) -typedef __fp16 __half; -typedef __fp16 __half1 __attribute__((ext_vector_type(1))); -typedef __fp16 __half2 __attribute__((ext_vector_type(2))); -typedef __fp16 half; - -/* -Half Arithmetic Functions -*/ -__device__ __half __hadd(const __half a, const __half b); -__device__ __half __hadd_sat(__half a, __half b); -__device__ __half __hfma(__half a, __half b, __half c); -__device__ __half __hfma_sat(__half a, __half b, __half c); -__device__ __half __hmul(__half a, __half b); -__device__ __half __hmul_sat(__half a, __half b); -__device__ __half __hneg(__half a); -__device__ __half __hsub(__half a, __half b); -__device__ __half __hsub_sat(__half a, __half b); -__device__ __half hdiv(__half a, __half b); - -/* -Half2 Arithmetic Functions -*/ - -__device__ static __half2 __hadd2(__half2 a, __half2 b); -__device__ static __half2 __hadd2_sat(__half2 a, __half2 b); -__device__ static __half2 __hfma2(__half2 a, __half2 b, __half2 c); -__device__ static __half2 __hfma2_sat(__half2 a, __half2 b, __half2 c); -__device__ static __half2 __hmul2(__half2 a, __half2 b); -__device__ static __half2 __hmul2_sat(__half2 a, __half2 b); -__device__ static __half2 __hsub2(__half2 a, __half2 b); -__device__ static __half2 __hneg2(__half2 a); -__device__ static __half2 __hsub2_sat(__half2 a, __half2 b); -__device__ static __half2 h2div(__half2 a, __half2 b); - -/* -Half Comparision Functions -*/ - -__device__ bool __heq(__half a, __half b); -__device__ bool __hge(__half a, __half b); -__device__ bool __hgt(__half a, __half b); -__device__ bool __hisinf(__half a); -__device__ bool __hisnan(__half a); -__device__ bool __hle(__half a, __half b); -__device__ bool __hlt(__half a, __half b); -__device__ bool __hne(__half a, __half b); - -/* -Half2 Comparision Functions -*/ - -__device__ bool __hbeq2(__half2 a, __half2 b); -__device__ bool __hbge2(__half2 a, __half2 b); -__device__ bool __hbgt2(__half2 a, __half2 b); -__device__ bool __hble2(__half2 a, __half2 b); -__device__ bool __hblt2(__half2 a, __half2 b); -__device__ bool __hbne2(__half2 a, __half2 b); -__device__ __half2 __heq2(__half2 a, __half2 b); -__device__ __half2 __hge2(__half2 a, __half2 b); -__device__ __half2 __hgt2(__half2 a, __half2 b); -__device__ __half2 __hisnan2(__half2 a); -__device__ __half2 __hle2(__half2 a, __half2 b); -__device__ __half2 __hlt2(__half2 a, __half2 b); -__device__ __half2 __hne2(__half2 a, __half2 b); - -/* -Half Math Functions -*/ - -__device__ static __half hceil(const __half h); -__device__ static __half hcos(const __half h); -__device__ static __half hexp(const __half h); -__device__ static __half hexp10(const __half h); -__device__ static __half hexp2(const __half h); -__device__ static __half hfloor(const __half h); -__device__ static __half hlog(const __half h); -__device__ static __half hlog10(const __half h); -__device__ static __half hlog2(const __half h); -//__device__ static __half hrcp(const __half h); -__device__ static __half hrint(const __half h); -__device__ static __half hsin(const __half h); -__device__ static __half hsqrt(const __half a); -__device__ static __half htrunc(const __half a); - -/* -Half2 Math Functions -*/ - -__device__ static __half2 h2ceil(const __half2 h); -__device__ static __half2 h2exp(const __half2 h); -__device__ static __half2 h2exp10(const __half2 h); -__device__ static __half2 h2exp2(const __half2 h); -__device__ static __half2 h2floor(const __half2 h); -__device__ static __half2 h2log(const __half2 h); -__device__ static __half2 h2log10(const __half2 h); -__device__ static __half2 h2log2(const __half2 h); -__device__ static __half2 h2rcp(const __half2 h); -__device__ static __half2 h2rsqrt(const __half2 h); -__device__ static __half2 h2sin(const __half2 h); -__device__ static __half2 h2sqrt(const __half2 h); - -/* -Half Conversion And Data Movement -*/ - -__device__ __half2 __float22half2_rn(const float2 a); -__device__ __half __float2half(const float a); -__device__ __half2 __float2half2_rn(const float a); -__device__ __half __float2half_rd(const float a); -__device__ __half __float2half_rn(const float a); -__device__ __half __float2half_ru(const float a); -__device__ __half __float2half_rz(const float a); -__device__ __half2 __floats2half2_rn(const float a, const float b); -__device__ float2 __half22float2(const __half2 a); -__device__ float __half2float(const __half a); -__device__ __half2 half2half2(const __half a); -__device__ int __half2int_rd(__half h); -__device__ int __half2int_rn(__half h); -__device__ int __half2int_ru(__half h); -__device__ int __half2int_rz(__half h); -__device__ long long int __half2ll_rd(__half h); -__device__ long long int __half2ll_rn(__half h); -__device__ long long int __half2ll_ru(__half h); -__device__ long long int __half2ll_rz(__half h); -__device__ short __half2short_rd(__half h); -__device__ short __half2short_rn(__half h); -__device__ short __half2short_ru(__half h); -__device__ short __half2short_rz(__half h); -__device__ unsigned int __half2uint_rd(__half h); -__device__ unsigned int __half2uint_rn(__half h); -__device__ unsigned int __half2uint_ru(__half h); -__device__ unsigned int __half2uint_rz(__half h); -__device__ unsigned long long int __half2ull_rd(__half h); -__device__ unsigned long long int __half2ull_rn(__half h); -__device__ unsigned long long int __half2ull_ru(__half h); -__device__ unsigned long long int __half2ull_rz(__half h); -__device__ unsigned short int __half2ushort_rd(__half h); -__device__ unsigned short int __half2ushort_rn(__half h); -__device__ unsigned short int __half2ushort_ru(__half h); -__device__ unsigned short int __half2ushort_rz(__half h); -__device__ short int __half_as_short(const __half h); -__device__ unsigned short int __half_as_ushort(const __half h); -__device__ __half2 __halves2half2(const __half a, const __half b); -__device__ float __high2float(const __half2 a); -__device__ __half __high2half(const __half2 a); -__device__ __half2 __high2half2(const __half2 a); -__device__ __half2 __highs2half2(const __half2 a, const __half2 b); -__device__ __half __int2half_rd(int i); -__device__ __half __int2half_rn(int i); -__device__ __half __int2half_ru(int i); -__device__ __half __int2half_rz(int i); -__device__ __half __ll2half_rd(long long int i); -__device__ __half __ll2half_rn(long long int i); -__device__ __half __ll2half_ru(long long int i); -__device__ __half __ll2half_rz(long long int i); -__device__ float __low2float(const __half2 a); - -__device__ __half __low2half(const __half2 a); -__device__ __half2 __low2half2(const __half2 a, const __half2 b); -__device__ __half2 __low2half2(const __half2 a); -__device__ __half2 __lowhigh2highlow(const __half2 a); -__device__ __half2 __lows2half2(const __half2 a, const __half2 b); -__device__ __half __short2half_rd(short int i); -__device__ __half __short2half_rn(short int i); -__device__ __half __short2half_ru(short int i); -__device__ __half __short2half_rz(short int i); -__device__ __half __uint2half_rd(unsigned int i); -__device__ __half __uint2half_rn(unsigned int i); -__device__ __half __uint2half_ru(unsigned int i); -__device__ __half __uint2half_rz(unsigned int i); -__device__ __half __ull2half_rd(unsigned long long int i); -__device__ __half __ull2half_rn(unsigned long long int i); -__device__ __half __ull2half_ru(unsigned long long int i); -__device__ __half __ull2half_rz(unsigned long long int i); -__device__ __half __ushort2half_rd(unsigned short int i); -__device__ __half __ushort2half_rn(unsigned short int i); -__device__ __half __ushort2half_ru(unsigned short int i); -__device__ __half __ushort2half_rz(unsigned short int i); -__device__ __half __ushort_as_half(const unsigned short int i); - -extern "C" __half2 __hip_hc_ir_hadd2_int(__half2, __half2); -extern "C" __half2 __hip_hc_ir_hfma2_int(__half2, __half2, __half2); -extern "C" __half2 __hip_hc_ir_hmul2_int(__half2, __half2); -extern "C" __half2 __hip_hc_ir_hsub2_int(__half2, __half2); - -extern "C" __half __hip_hc_ir_hceil_half(__half) __asm("llvm.ceil.f16"); -extern "C" __half __hip_hc_ir_hcos_half(__half) __asm("llvm.cos.f16"); -extern "C" __half __hip_hc_ir_hexp2_half(__half) __asm("llvm.exp2.f16"); -extern "C" __half __hip_hc_ir_hfloor_half(__half) __asm("llvm.floor.f16"); -extern "C" __half __hip_hc_ir_hlog2_half(__half) __asm("llvm.log2.f16"); -extern "C" __half __hip_hc_ir_hrcp_half(__half) __asm("llvm.amdgcn.rcp.f16"); -extern "C" __half __hip_hc_ir_hrint_half(__half) __asm("llvm.rint.f16"); -extern "C" __half __hip_hc_ir_hrsqrt_half(__half) __asm("llvm.sqrt.f16"); -extern "C" __half __hip_hc_ir_hsin_half(__half) __asm("llvm.sin.f16"); -extern "C" __half __hip_hc_ir_hsqrt_half(__half) __asm("llvm.sqrt.f16"); -extern "C" __half __hip_hc_ir_htrunc_half(__half) __asm("llvm.trunc.f16"); - -extern "C" __half2 __hip_hc_ir_h2ceil_int(__half2); -extern "C" __half2 __hip_hc_ir_h2cos_int(__half2); -extern "C" __half2 __hip_hc_ir_h2exp2_int(__half2); -extern "C" __half2 __hip_hc_ir_h2floor_int(__half2); -extern "C" __half2 __hip_hc_ir_h2log2_int(__half2); -extern "C" __half2 __hip_hc_ir_h2rcp_int(__half2); -extern "C" __half2 __hip_hc_ir_h2rsqrt_int(__half2); -extern "C" __half2 __hip_hc_ir_h2sin_int(__half2); -extern "C" __half2 __hip_hc_ir_h2sqrt_int(__half2); -extern "C" __half2 __hip_hc_ir_h2trunc_int(__half2); - -/* - Half2 Arithmetic Functions -*/ - -__device__ static inline __half2 __hadd2(__half2 a, __half2 b) { - __half2 c; - c.xy = __hip_hc_ir_hadd2_int(a.xy, b.xy); - return c; -} - -__device__ static inline __half2 __hadd2_sat(__half2 a, __half2 b) { - __half2 c; - c.xy = __hip_hc_ir_hadd2_int(a.xy, b.xy); - return c; -} - -__device__ static inline __half2 __hfma2(__half2 a, __half2 b, __half2 c) { - __half2 d; - d.xy = __hip_hc_ir_hfma2_int(a.xy, b.xy, c.xy); - return d; -} - -__device__ static inline __half2 __hfma2_sat(__half2 a, __half2 b, __half2 c) { - __half2 d; - d.xy = __hip_hc_ir_hfma2_int(a.xy, b.xy, c.xy); - return d; -} - -__device__ static inline __half2 __hmul2(__half2 a, __half2 b) { - __half2 c; - c.xy = __hip_hc_ir_hmul2_int(a.xy, b.xy); - return c; -} - -__device__ static inline __half2 __hmul2_sat(__half2 a, __half2 b) { - __half2 c; - c.xy = __hip_hc_ir_hmul2_int(a.xy, b.xy); - return c; -} - -__device__ static inline __half2 __hsub2(__half2 a, __half2 b) { - __half2 c; - c.xy = __hip_hc_ir_hsub2_int(a.xy, b.xy); - return c; -} - -__device__ static inline __half2 __hneg2(__half2 a) { - __half2 c; - c.x = -a.x; - c.y = -a.y; - return c; -} - -__device__ static inline __half2 __hsub2_sat(__half2 a, __half2 b) { - __half2 c; - c.xy = __hip_hc_ir_hsub2_int(a.xy, b.xy); - return c; -} - -__device__ static inline __half2 h2div(__half2 a, __half2 b) { - __half2 c; - c.x = a.x / b.x; - c.y = a.y / b.y; - return c; -} - - -__device__ static inline __half hceil(const __half h) { return __hip_hc_ir_hceil_half(h); } - -__device__ static inline __half hcos(const __half h) { return __hip_hc_ir_hcos_half(h); } - -__device__ static inline __half hexp(const __half h) { - return __hip_hc_ir_hexp2_half(__hmul(h, 1.442694)); -} - -__device__ static inline __half hexp10(const __half h) { - return __hip_hc_ir_hexp2_half(__hmul(h, 3.3219281)); -} - -__device__ static inline __half hexp2(const __half h) { return __hip_hc_ir_hexp2_half(h); } - -__device__ static inline __half hfloor(const __half h) { return __hip_hc_ir_hfloor_half(h); } - -__device__ static inline __half hlog(const __half h) { - return __hmul(__hip_hc_ir_hlog2_half(h), 0.693147); -} - -__device__ static inline __half hlog10(const __half h) { - return __hmul(__hip_hc_ir_hlog2_half(h), 0.301029); -} - -__device__ static inline __half hlog2(const __half h) { return __hip_hc_ir_hlog2_half(h); } -/* -__device__ static inline __half hrcp(const __half h) { - return __hip_hc_ir_hrcp_half(h); -} -*/ -__device__ static inline __half hrint(const __half h) { return __hip_hc_ir_hrint_half(h); } - -__device__ static inline __half hrsqrt(const __half h) { return __hip_hc_ir_hrsqrt_half(h); } - -__device__ static inline __half hsin(const __half h) { return __hip_hc_ir_hsin_half(h); } - -__device__ static inline __half hsqrt(const __half a) { return __hip_hc_ir_hsqrt_half(a); } - -__device__ static inline __half htrunc(const __half a) { return __hip_hc_ir_htrunc_half(a); } - -/* -Half2 Math Operations -*/ - -__device__ static inline __half2 h2ceil(const __half2 h) { - __half2 a; - a.xy = __hip_hc_ir_h2ceil_int(h.xy); - return a; -} - -__device__ static inline __half2 h2cos(const __half2 h) { - __half2 a; - a.xy = __hip_hc_ir_h2cos_int(h.xy); - return a; -} - -__device__ static inline __half2 h2exp(const __half2 h) { - __half2 factor; - factor.x = 1.442694; - factor.y = 1.442694; - factor.xy = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.xy, factor.xy)); - return factor; -} - -__device__ static inline __half2 h2exp10(const __half2 h) { - __half2 factor; - factor.x = 3.3219281; - factor.y = 3.3219281; - factor.xy = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.xy, factor.xy)); - return factor; -} - -__device__ static inline __half2 h2exp2(const __half2 h) { - __half2 a; - a.xy = __hip_hc_ir_h2exp2_int(h.xy); - return a; -} - -__device__ static inline __half2 h2floor(const __half2 h) { - __half2 a; - a.xy = __hip_hc_ir_h2floor_int(h.xy); - return a; -} - -__device__ static inline __half2 h2log(const __half2 h) { - __half2 factor; - factor.x = 0.693147; - factor.y = 0.693147; - factor.xy = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.xy), factor.xy); - return factor; -} - -__device__ static inline __half2 h2log10(const __half2 h) { - __half2 factor; - factor.x = 0.301029; - factor.y = 0.301029; - factor.xy = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.xy), factor.xy); - return factor; -} -__device__ static inline __half2 h2log2(const __half2 h) { - __half2 a; - a.xy = __hip_hc_ir_h2log2_int(h.xy); - return a; -} - -__device__ static inline __half2 h2rcp(const __half2 h) { - __half2 a; - a.xy = __hip_hc_ir_h2rcp_int(h.xy); - return a; -} - -__device__ static inline __half2 h2rsqrt(const __half2 h) { - __half2 a; - a.xy = __hip_hc_ir_h2rsqrt_int(h.xy); - return a; -} - -__device__ static inline __half2 h2sin(const __half2 h) { - __half2 a; - a.xy = __hip_hc_ir_h2sin_int(h.xy); - return a; -} - -__device__ static inline __half2 h2sqrt(const __half2 h) { - __half2 a; - a.xy = __hip_hc_ir_h2sqrt_int(h.xy); - return a; -} - -__device__ static inline __half2 h2trunc(const __half2 h) { - __half2 a; - a.xy = __hip_hc_ir_h2trunc_int(h.xy); - return a; -} -#endif // clang_major > 3 +#pragma once +#include +#if defined(__cplusplus) + #include + #include + #include #endif + +#if defined(__clang__) && (__clang_major__ > 3) + typedef _Float16 _Float16_2 __attribute__((ext_vector_type(2))); + + struct __half_raw { + union { + static_assert(sizeof(_Float16) == sizeof(unsigned short), ""); + + _Float16 data; + unsigned short x; + }; + }; + + struct __half2_raw { + union { + static_assert(sizeof(_Float16_2) == sizeof(unsigned short[2]), ""); + + _Float16_2 data; + struct { + unsigned short x; + unsigned short y; + }; + }; + }; + + #if defined(__cplusplus) + #include "hip_fp16_math_fwd.h" + #include "hip_vector_types.h" + #include "host_defines.h" + + template + using Enable_if_t = typename std::enable_if::type; + + // BEGIN STRUCT __HALF + struct __half { + protected: + union { + static_assert(sizeof(_Float16) == sizeof(unsigned short), ""); + + _Float16 data; + unsigned short __x; + }; + public: + // CREATORS + __host__ __device__ + __half() = default; + __host__ __device__ + __half(const __half_raw& x) : data{x.data} {} + __host__ __device__ + __half(decltype(data) x) : data{x} {} + #if !defined(__HIP_NO_HALF_CONVERSIONS__) + template< + typename T, + Enable_if_t{}>* = nullptr> + __host__ __device__ + __half(T x) : data{static_cast<_Float16>(x)} {} + #endif + __host__ __device__ + __half(const __half&) = default; + __host__ __device__ + __half(__half&&) = default; + __host__ __device__ + ~__half() = default; + + // CREATORS - DEVICE ONLY + #if !defined(__HIP_NO_HALF_CONVERSIONS__) + template< + typename T, Enable_if_t{}>* = nullptr> + __device__ + __half(T x) : data{static_cast<_Float16>(x)} {} + #endif + + // MANIPULATORS + __host__ __device__ + __half& operator=(const __half&) = default; + __host__ __device__ + __half& operator=(__half&&) = default; + __host__ __device__ + __half& operator=(const __half_raw& x) + { + data = x.data; + return *this; + } + #if !defined(__HIP_NO_HALF_CONVERSIONS__) + template< + typename T, + Enable_if_t{}>* = nullptr> + __host__ __device__ + __half& operator=(T x) + { + data = static_cast<_Float16>(x); + return *this; + } + #endif + + // MANIPULATORS - DEVICE ONLY + #if !defined(__HIP_NO_HALF_CONVERSIONS__) + template< + typename T, Enable_if_t{}>* = nullptr> + __device__ + __half& operator=(T x) + { + data = static_cast<_Float16>(x); + return *this; + } + #endif + + #if !defined(__HIP_NO_HALF_OPERATORS__) + __device__ + __half& operator+=(const __half& x) + { + data += x.data; + return *this; + } + __device__ + __half& operator-=(const __half& x) + { + data -= x.data; + return *this; + } + __device__ + __half& operator*=(const __half& x) + { + data *= x.data; + return *this; + } + __device__ + __half& operator/=(const __half& x) + { + data /= x.data; + return *this; + } + __device__ + __half& operator++() { ++data; return *this; } + __device__ + __half operator++(int) + { + __half tmp{*this}; + ++*this; + return tmp; + } + __device__ + __half& operator--() { --data; return *this; } + __device__ + __half operator--(int) + { + __half tmp{*this}; + --*this; + return tmp; + } + #endif + + // ACCESSORS + __host__ __device__ + operator decltype(data)() const { return data; } + __host__ __device__ + operator float() const { return static_cast(data); } + __host__ __device__ + operator __half_raw() const { return __half_raw{data}; } + + // ACCESSORS - DEVICE ONLY + #if !defined(__HIP_NO_HALF_CONVERSIONS__) + __device__ + operator bool() const { return data; } + #endif + + #if !defined(__HIP_NO_HALF_OPERATORS__) + __device__ + __half operator+() const { return *this; } + __device__ + __half operator-() const + { + __half tmp{*this}; + tmp.data = -tmp.data; + return tmp; + } + #endif + + // FRIENDS + #if !defined(__HIP_NO_HALF_OPERATORS__) + friend + inline + __device__ + __half operator+(const __half& x, const __half& y) + { + return __half{x} += y; + } + friend + inline + __device__ + __half operator-(const __half& x, const __half& y) + { + return __half{x} -= y; + } + friend + inline + __device__ + __half operator*(const __half& x, const __half& y) + { + return __half{x} *= y; + } + friend + inline + __device__ + __half operator/(const __half& x, const __half& y) + { + return __half{x} /= y; + } + friend + inline + __device__ + bool operator==(const __half& x, const __half& y) + { + return x.data == y.data; + } + friend + inline + __device__ + bool operator!=(const __half& x, const __half& y) + { + return !(x == y); + } + friend + inline + __device__ + bool operator<(const __half& x, const __half& y) + { + return x.data < y.data; + } + friend + inline + __device__ + bool operator>(const __half& x, const __half& y) + { + return y.data < x.data; + } + friend + inline + __device__ + bool operator<=(const __half& x, const __half& y) + { + return !(y < x); + } + friend + inline + __device__ + bool operator>=(const __half& x, const __half& y) + { + return !(x < y); + } + #endif // !defined(__HIP_NO_HALF_OPERATORS__) + }; + // END STRUCT __HALF + + // BEGIN STRUCT __HALF2 + struct __half2 { + protected: + union { + static_assert( + sizeof(_Float16_2) == sizeof(unsigned short[2]), ""); + + _Float16_2 data; + struct { + unsigned short x; + unsigned short y; + }; + }; + public: + // CREATORS + __host__ __device__ + __half2() = default; + __host__ __device__ + __half2(const __half2_raw& x) : data{x.data} {} + __host__ __device__ + __half2(decltype(data) x) : data{x} {} + __host__ __device__ + __half2(const __half& x, const __half& y) + : + data{ + static_cast<__half_raw>(x).data, + static_cast<__half_raw>(y).data} + {} + __host__ __device__ + __half2(const __half2&) = default; + __host__ __device__ + __half2(__half2&&) = default; + __host__ __device__ + ~__half2() = default; + + // MANIPULATORS + __host__ __device__ + __half2& operator=(const __half2&) = default; + __host__ __device__ + __half2& operator=(__half2&&) = default; + __host__ __device__ + __half2& operator=(const __half2_raw& x) + { + data = x.data; + return *this; + } + + // MANIPULATORS - DEVICE ONLY + #if !defined(__HIP_NO_HALF_OPERATORS__) + __device__ + __half2& operator+=(const __half2& x) + { + data += x.data; + return *this; + } + __device__ + __half2& operator-=(const __half2& x) + { + data -= x.data; + return *this; + } + __device__ + __half2& operator*=(const __half2& x) + { + data *= x.data; + return *this; + } + __device__ + __half2& operator/=(const __half2& x) + { + data /= x.data; + return *this; + } + __device__ + __half2& operator++() { return *this += __half2{1, 1}; } + __device__ + __half2 operator++(int) + { + __half2 tmp{*this}; + ++*this; + return tmp; + } + __device__ + __half2& operator--() { return *this -= __half2{1, 1}; } + __device__ + __half2 operator--(int) + { + __half2 tmp{*this}; + --*this; + return tmp; + } + #endif + + // ACCESSORS + __host__ __device__ + operator decltype(data)() const { return data; } + __host__ __device__ + operator __half2_raw() const { return __half2_raw{data}; } + + // ACCESSORS - DEVICE ONLY + #if !defined(__HIP_NO_HALF_OPERATORS__) + __device__ + __half2 operator+() const { return *this; } + __device__ + __half2 operator-() const + { + __half2 tmp{*this}; + tmp.data = -tmp.data; + return tmp; + } + #endif + + // FRIENDS + #if !defined(__HIP_NO_HALF_OPERATORS__) + friend + inline + __device__ + __half2 operator+(const __half2& x, const __half2& y) + { + return __half2{x} += y; + } + friend + inline + __device__ + __half2 operator-(const __half2& x, const __half2& y) + { + return __half2{x} -= y; + } + friend + inline + __device__ + __half2 operator*(const __half2& x, const __half2& y) + { + return __half2{x} *= y; + } + friend + inline + __device__ + __half2 operator/(const __half2& x, const __half2& y) + { + return __half2{x} /= y; + } + friend + inline + __device__ + bool operator==(const __half2& x, const __half2& y) + { + auto r = x.data == y.data; + return r.x != 0 && r.y != 0; + } + friend + inline + __device__ + bool operator!=(const __half2& x, const __half2& y) + { + return !(x == y); + } + friend + inline + __device__ + bool operator<(const __half2& x, const __half2& y) + { + auto r = x.data < y.data; + return r.x != 0 && r.y != 0; + } + friend + inline + __device__ + bool operator>(const __half2& x, const __half2& y) + { + return y < x; + } + friend + inline + __device__ + bool operator<=(const __half2& x, const __half2& y) + { + return !(y < x); + } + friend + inline + __device__ + bool operator>=(const __half2& x, const __half2& y) + { + return !(x < y); + } + #endif // !defined(__HIP_NO_HALF_OPERATORS__) + }; + // END STRUCT __HALF2 + + namespace + { + inline + __host__ __device__ + __half2 make_half2(__half x, __half y) + { + return __half2{x, y}; + } + + inline + __device__ + __half __low2half(__half2 x) + { + return __half{static_cast<__half2_raw>(x).data.x}; + } + + inline + __device__ + __half __high2half(__half2 x) + { + return __half{static_cast<__half2_raw>(x).data.y}; + } + + inline + __device__ + __half2 __half2half2(__half x) + { + return __half2{x, x}; + } + + inline + __device__ + __half2 __halves2half2(__half x, __half y) + { + return __half2{x, y}; + } + + inline + __device__ + __half2 __low2half2(__half2 x) + { + return __half2{ + static_cast<__half2_raw>(x).data.x, + static_cast<__half2_raw>(x).data.x}; + } + + inline + __device__ + __half2 __high2half2(__half2 x) + { + return __half2{ + static_cast<__half2_raw>(x).data.y, + static_cast<__half2_raw>(x).data.y}; + } + + inline + __device__ + __half2 __lows2half2(__half2 x, __half2 y) + { + return __half2{ + static_cast<__half2_raw>(x).data.x, + static_cast<__half2_raw>(y).data.x}; + } + + inline + __device__ + __half2 __highs2half2(__half2 x, __half2 y) + { + return __half2{ + static_cast<__half2_raw>(x).data.y, + static_cast<__half2_raw>(y).data.y}; + } + + inline + __device__ + __half2 __lowhigh2highlow(__half2 x) + { + return __half2{ + static_cast<__half2_raw>(x).data.y, + static_cast<__half2_raw>(x).data.x}; + } + + // Bitcasts + inline + __device__ + short __half_as_short(__half x) + { + return static_cast<__half_raw>(x).x; + } + + inline + __device__ + unsigned short __half_as_ushort(__half x) + { + return static_cast<__half_raw>(x).x; + } + + inline + __device__ + __half __short_as_half(short x) + { + __half_raw r; r.x = x; + return r; + } + + inline + __device__ + __half __ushort_as_half(unsigned short x) + { + __half_raw r; r.x = x; + return r; + } + + // TODO: rounding behaviour is not correct. + // float -> half | half2 + inline + __device__ + __half __float2half(float x) { return __half{x}; } + inline + __device__ + __half __float2half_rn(float x) { return __half{x}; } + inline + __device__ + __half __float2half_rz(float x) { return __half{x}; } + inline + __device__ + __half __float2half_rd(float x) { return __half{x}; } + inline + __device__ + __half __float2half_ru(float x) { return __half{x}; } + inline + __device__ + __half2 __float2half2_rn(float x) { return __half2{x, x}; } + inline + __device__ + __half2 __floats2half2_rn(float x, float y) + { + return __half2{x, y}; + } + inline + __device__ + __half2 __float22half2_rn(float2 x) + { + return __floats2half2_rn(x.x, x.y); + } + + // half | half2 -> float + inline + __device__ + float __half2float(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + float __low2float(__half2 x) + { + return static_cast<__half2_raw>(x).data.x; + } + inline + __device__ + float __high2float(__half2 x) + { + return static_cast<__half2_raw>(x).data.y; + } + inline + __device__ + float2 __half22float2(__half2 x) + { + return make_float2( + static_cast<__half2_raw>(x).data.x, + static_cast<__half2_raw>(x).data.y); + } + + // half -> int + inline + __device__ + int __half2int_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + int __half2int_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + int __half2int_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + int __half2int_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } + + // int -> half + inline + __device__ + __half __int2half_rn(int x) { return __half{x}; } + inline + __device__ + __half __int2half_rz(int x) { return __half{x}; } + inline + __device__ + __half __int2half_rd(int x) { return __half{x}; } + inline + __device__ + __half __int2half_ru(int x) { return __half{x}; } + + // half -> short + inline + __device__ + short __half2short_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + short __half2short_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + short __half2short_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + short __half2short_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } + + // short -> half + inline + __device__ + __half __short2half_rn(short x) { return __half{x}; } + inline + __device__ + __half __short2half_rz(short x) { return __half{x}; } + inline + __device__ + __half __short2half_rd(short x) { return __half{x}; } + inline + __device__ + __half __short2half_ru(short x) { return __half{x}; } + + // half -> long long + inline + __device__ + long long __half2ll_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + long long __half2ll_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + long long __half2ll_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + long long __half2ll_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } + + // long long -> half + inline + __device__ + __half __ll2half_rn(long long x) { return __half{x}; } + inline + __device__ + __half __ll2half_rz(long long x) { return __half{x}; } + inline + __device__ + __half __ll2half_rd(long long x) { return __half{x}; } + inline + __device__ + __half __ll2half_ru(long long x) { return __half{x}; } + + // half -> unsigned int + inline + __device__ + unsigned int __half2uint_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned int __half2uint_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned int __half2uint_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned int __half2uint_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } + + // unsigned int -> half + inline + __device__ + __half __uint2half_rn(unsigned int x) { return __half{x}; } + inline + __device__ + __half __uint2half_rz(unsigned int x) { return __half{x}; } + inline + __device__ + __half __uint2half_rd(unsigned int x) { return __half{x}; } + inline + __device__ + __half __uint2half_ru(unsigned int x) { return __half{x}; } + + // half -> unsigned short + inline + __device__ + unsigned short __half2ushort_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned short __half2ushort_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned short __half2ushort_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned short __half2ushort_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } + + // unsigned short -> half + inline + __device__ + __half __ushort2half_rn(unsigned short x) { return __half{x}; } + inline + __device__ + __half __ushort2half_rz(unsigned short x) { return __half{x}; } + inline + __device__ + __half __ushort2half_rd(unsigned short x) { return __half{x}; } + inline + __device__ + __half __ushort2half_ru(unsigned short x) { return __half{x}; } + + // half -> unsigned long long + inline + __device__ + unsigned long long __half2ull_rn(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned long long __half2ull_rz(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned long long __half2ull_rd(__half x) + { + return static_cast<__half_raw>(x).data; + } + inline + __device__ + unsigned long long __half2ull_ru(__half x) + { + return static_cast<__half_raw>(x).data; + } + + // unsigned long long -> half + inline + __device__ + __half __ull2half_rn(unsigned long long x) { return __half{x}; } + inline + __device__ + __half __ull2half_rz(unsigned long long x) { return __half{x}; } + inline + __device__ + __half __ull2half_rd(unsigned long long x) { return __half{x}; } + inline + __device__ + __half __ull2half_ru(unsigned long long x) { return __half{x}; } + + // Load primitives + inline + __device__ + __half __ldg(const __half* ptr) { return *ptr; } + inline + __device__ + __half __ldcg(const __half* ptr) { return *ptr; } + inline + __device__ + __half __ldca(const __half* ptr) { return *ptr; } + inline + __device__ + __half __ldcs(const __half* ptr) { return *ptr; } + + inline + __device__ + __half2 __ldg(const __half2* ptr) { return *ptr; } + inline + __device__ + __half2 __ldcg(const __half2* ptr) { return *ptr; } + inline + __device__ + __half2 __ldca(const __half2* ptr) { return *ptr; } + inline + __device__ + __half2 __ldcs(const __half2* ptr) { return *ptr; } + + // Relations + inline + __device__ + bool __heq(__half x, __half y) + { + return static_cast<__half_raw>(x).data == + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hne(__half x, __half y) + { + return static_cast<__half_raw>(x).data != + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hle(__half x, __half y) + { + return static_cast<__half_raw>(x).data <= + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hge(__half x, __half y) + { + return static_cast<__half_raw>(x).data >= + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hlt(__half x, __half y) + { + return static_cast<__half_raw>(x).data < + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hgt(__half x, __half y) + { + return static_cast<__half_raw>(x).data > + static_cast<__half_raw>(y).data; + } + inline + __device__ + bool __hequ(__half x, __half y) { return __heq(x, y); } + inline + __device__ + bool __hneu(__half x, __half y) { return __hne(x, y); } + inline + __device__ + bool __hleu(__half x, __half y) { return __hle(x, y); } + inline + __device__ + bool __hgeu(__half x, __half y) { return __hge(x, y); } + inline + __device__ + bool __hltu(__half x, __half y) { return __hlt(x, y); } + inline + __device__ + bool __hgtu(__half x, __half y) { return __hgt(x, y); } + + inline + __device__ + __half2 __heq2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data == + static_cast<__half2_raw>(y).data; + return __half2{r.x, r.y}; + } + inline + __device__ + __half2 __hne2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data != + static_cast<__half2_raw>(y).data; + return __half2{r.x, r.y}; + } + inline + __device__ + __half2 __hle2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data <= + static_cast<__half2_raw>(y).data; + return __half2{r.x, r.y}; + } + inline + __device__ + __half2 __hge2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data >= + static_cast<__half2_raw>(y).data; + return __half2{r.x, r.y}; + } + inline + __device__ + __half2 __hlt2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data < + static_cast<__half2_raw>(y).data; + return __half2{r.x, r.y}; + } + inline + __device__ + __half2 __hgt2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(x).data > + static_cast<__half2_raw>(y).data; + return __half2{r.x, r.y}; + } + inline + __device__ + __half2 __hequ2(__half2 x, __half2 y) { return __heq2(x, y); } + inline + __device__ + __half2 __hneu2(__half2 x, __half2 y) { return __hne2(x, y); } + inline + __device__ + __half2 __hleu2(__half2 x, __half2 y) { return __hle2(x, y); } + inline + __device__ + __half2 __hgeu2(__half2 x, __half2 y) { return __hge2(x, y); } + inline + __device__ + __half2 __hltu2(__half2 x, __half2 y) { return __hlt2(x, y); } + inline + __device__ + __half2 __hgtu2(__half2 x, __half2 y) { return __hgt2(x, y); } + + inline + __device__ + bool __hbeq2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__heq2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __device__ + bool __hbne2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hne2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __device__ + bool __hble2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hle2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __device__ + bool __hbge2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hge2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __device__ + bool __hblt2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hlt2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __device__ + bool __hbgt2(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hgt2(x, y)); + return r.data.x != 0 && r.data.y != 0; + } + inline + __device__ + bool __hbequ2(__half2 x, __half2 y) { return __hbeq2(x, y); } + inline + __device__ + bool __hbneu2(__half2 x, __half2 y) { return __hbne2(x, y); } + inline + __device__ + bool __hbleu2(__half2 x, __half2 y) { return __hble2(x, y); } + inline + __device__ + bool __hbgeu2(__half2 x, __half2 y) { return __hbge2(x, y); } + inline + __device__ + bool __hbltu2(__half2 x, __half2 y) { return __hblt2(x, y); } + inline + __device__ + bool __hbgtu2(__half2 x, __half2 y) { return __hbgt2(x, y); } + + // Arithmetic + inline + __device__ + __half __clamp_01(__half x) + { + __half_raw r{x}; + return __half{(r.data < 0) ? 0 : ((r.data > 1) ? 1 : r.data)}; + } + + inline + __device__ + __half __hadd(__half x, __half y) + { + return static_cast<__half_raw>(x).data + + static_cast<__half_raw>(y).data; + } + inline + __device__ + __half __hsub(__half x, __half y) + { + return static_cast<__half_raw>(x).data - + static_cast<__half_raw>(y).data; + } + inline + __device__ + __half __hmul(__half x, __half y) + { + return static_cast<__half_raw>(x).data * + static_cast<__half_raw>(y).data; + } + inline + __device__ + __half __hadd_sat(__half x, __half y) + { + return __clamp_01(__hadd(x, y)); + } + inline + __device__ + __half __hsub_sat(__half x, __half y) + { + return __clamp_01(__hsub(x, y)); + } + inline + __device__ + __half __hmul_sat(__half x, __half y) + { + return __clamp_01(__hmul(x, y)); + } + inline + __device__ + __half __hfma(__half x, __half y, __half z) + { + return __ocml_fma_f16(x, y, z); + } + inline + __device__ + __half __hfma_sat(__half x, __half y, __half z) + { + return __clamp_01(__hfma(x, y, z)); + } + inline + __device__ + __half __hdiv(__half x, __half y) + { + return static_cast<__half_raw>(x).data / + static_cast<__half_raw>(y).data; + } + + inline + __device__ + __half2 __hadd2(__half2 x, __half2 y) + { + return static_cast<__half2_raw>(x).data + + static_cast<__half2_raw>(y).data; + } + inline + __device__ + __half2 __hsub2(__half2 x, __half2 y) + { + return static_cast<__half2_raw>(x).data - + static_cast<__half2_raw>(y).data; + } + inline + __device__ + __half2 __hmul2(__half2 x, __half2 y) + { + return static_cast<__half2_raw>(x).data * + static_cast<__half2_raw>(y).data; + } + inline + __device__ + __half2 __hadd2_sat(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hadd2(x, y)); + return __half2{__clamp_01(r.data.x), __clamp_01(r.data.y)}; + } + inline + __device__ + __half2 __hsub2_sat(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hsub2(x, y)); + return __half2{__clamp_01(r.data.x), __clamp_01(r.data.y)}; + } + inline + __device__ + __half2 __hmul2_sat(__half2 x, __half2 y) + { + auto r = static_cast<__half2_raw>(__hmul2(x, y)); + return __half2{__clamp_01(r.data.x), __clamp_01(r.data.y)}; + } + inline + __device__ + __half2 __hfma2(__half2 x, __half2 y, __half2 z) + { + return __ocml_fma_2f16(x, y, z); + } + inline + __device__ + __half2 __hfma2_sat(__half2 x, __half2 y, __half2 z) + { + auto r = static_cast<__half2_raw>(__hfma2(x, y, z)); + return __half2{__clamp_01(r.data.x), __clamp_01(r.data.y)}; + } + inline + __device__ + __half2 __h2div(__half2 x, __half2 y) + { + return static_cast<__half2_raw>(x).data / + static_cast<__half2_raw>(y).data; + } + + // Math functions + inline + __device__ + __half htrunc(__half x) { return __ocml_trunc_f16(x); } + inline + __device__ + __half hceil(__half x) { return __ocml_ceil_f16(x); } + inline + __device__ + __half hfloor(__half x) { return __ocml_floor_f16(x); } + inline + __device__ + __half hrint(__half x) { return __ocml_rint_f16(x); } + inline + __device__ + __half hsin(__half x) { return __ocml_sin_f16(x); } + inline + __device__ + __half hcos(__half x) { return __ocml_cos_f16(x); } + inline + __device__ + __half hexp(__half x) { return __ocml_exp_f16(x); } + inline + __device__ + __half hexp2(__half x) { return __ocml_exp2_f16(x); } + inline + __device__ + __half hexp10(__half x) { return __ocml_exp10_f16(x); } + inline + __device__ + __half hlog2(__half x) { return __ocml_log2_f16(x); } + inline + __device__ + __half hlog(__half x) { return __ocml_log_f16(x); } + inline + __device__ + __half hlog10(__half x) { return __ocml_log10_f16(x); } + inline + __device__ + __half hrcp(__half x) { return __llvm_amdgcn_rcp_f16(x); } + inline + __device__ + __half hrsqrt(__half x) { return __ocml_rsqrt_f16(x); } + inline + __device__ + __half hsqrt(__half x) { return __ocml_sqrt_f16(x); } + inline + __device__ + bool __hisinf(__half x) { return __ocml_isinf_f16(x); } + inline + __device__ + bool __hisnan(__half x) { return __ocml_isnan_f16(x); } + inline + __device__ + __half __hneg(__half x) { return -static_cast<__half_raw>(x).data; } + + inline + __device__ + __half2 h2trunc(__half2 x) { return __ocml_trunc_2f16(x); } + inline + __device__ + __half2 h2ceil(__half2 x) { return __ocml_ceil_2f16(x); } + inline + __device__ + __half2 h2floor(__half2 x) { return __ocml_floor_2f16(x); } + inline + __device__ + __half2 h2rint(__half2 x) { return __ocml_rint_2f16(x); } + inline + __device__ + __half2 h2sin(__half2 x) { return __ocml_sin_2f16(x); } + inline + __device__ + __half2 h2cos(__half2 x) { return __ocml_cos_2f16(x); } + inline + __device__ + __half2 h2exp(__half2 x) { return __ocml_exp_2f16(x); } + inline + __device__ + __half2 h2exp2(__half2 x) { return __ocml_exp2_2f16(x); } + inline + __device__ + __half2 h2exp10(__half2 x) { return __ocml_exp10_2f16(x); } + inline + __device__ + __half2 h2log2(__half2 x) { return __ocml_log2_2f16(x); } + inline + __device__ + __half2 h2log(__half2 x) { return __ocml_log_2f16(x); } + inline + __device__ + __half2 h2log10(__half2 x) { return __ocml_log10_2f16(x); } + inline + __device__ + __half2 h2rcp(__half2 x) { return __llvm_amdgcn_rcp_2f16(x); } + inline + __device__ + __half2 h2rsqrt(__half2 x) { return __ocml_rsqrt_2f16(x); } + inline + __device__ + __half2 h2sqrt(__half2 x) { return __ocml_sqrt_2f16(x); } + inline + __device__ + __half2 __hisinf2(__half2 x) + { + auto r = __ocml_isinf_2f16(x); + return __half2{r.x, r.y}; + } + inline + __device__ + __half2 __hisnan2(__half2 x) + { + auto r = __ocml_isnan_2f16(x); + return __half2{r.x, r.y}; + } + inline + __device__ + __half2 __hneg2(__half2 x) + { + return -static_cast<__half2_raw>(x).data; + } + } // Anonymous namespace. + + #if !defined(HIP_NO_HALF) + using half = __half; + using half2 = __half2; + #endif + #endif // defined(__cplusplus) +#elif defined(__GNUC__) + #include "hip_fp16_gcc.h" +#endif // !defined(__clang__) && defined(__GNUC__) + +// /* +// Half Arithmetic Functions +// */ +// __device__ __half __hadd(const __half a, const __half b); +// __device__ __half __hadd_sat(__half a, __half b); +// __device__ __half __hfma(__half a, __half b, __half c); +// __device__ __half __hfma_sat(__half a, __half b, __half c); +// __device__ __half __hmul(__half a, __half b); +// __device__ __half __hmul_sat(__half a, __half b); +// __device__ __half __hneg(__half a); +// __device__ __half __hsub(__half a, __half b); +// __device__ __half __hsub_sat(__half a, __half b); +// __device__ __half hdiv(__half a, __half b); + +// /* +// Half2 Arithmetic Functions +// */ + +// __device__ static __half2 __hadd2(__half2 a, __half2 b); +// __device__ static __half2 __hadd2_sat(__half2 a, __half2 b); +// __device__ static __half2 __hfma2(__half2 a, __half2 b, __half2 c); +// __device__ static __half2 __hfma2_sat(__half2 a, __half2 b, __half2 c); +// __device__ static __half2 __hmul2(__half2 a, __half2 b); +// __device__ static __half2 __hmul2_sat(__half2 a, __half2 b); +// __device__ static __half2 __hsub2(__half2 a, __half2 b); +// __device__ static __half2 __hneg2(__half2 a); +// __device__ static __half2 __hsub2_sat(__half2 a, __half2 b); +// __device__ static __half2 h2div(__half2 a, __half2 b); + +// /* +// Half Comparision Functions +// */ + +// __device__ bool __heq(__half a, __half b); +// __device__ bool __hge(__half a, __half b); +// __device__ bool __hgt(__half a, __half b); +// __device__ bool __hisinf(__half a); +// __device__ bool __hisnan(__half a); +// __device__ bool __hle(__half a, __half b); +// __device__ bool __hlt(__half a, __half b); +// __device__ bool __hne(__half a, __half b); + +// /* +// Half Math Functions +// */ + +// __device__ static __half hceil(const __half h); +// __device__ static __half hcos(const __half h); +// __device__ static __half hexp(const __half h); +// __device__ static __half hexp10(const __half h); +// __device__ static __half hexp2(const __half h); +// __device__ static __half hfloor(const __half h); +// __device__ static __half hlog(const __half h); +// __device__ static __half hlog10(const __half h); +// __device__ static __half hlog2(const __half h); +// //__device__ static __half hrcp(const __half h); +// __device__ static __half hrint(const __half h); +// __device__ static __half hsin(const __half h); +// __device__ static __half hsqrt(const __half a); +// __device__ static __half htrunc(const __half a); + +// /* +// Half2 Math Functions +// */ + +// __device__ static __half2 h2ceil(const __half2 h); +// __device__ static __half2 h2exp(const __half2 h); +// __device__ static __half2 h2exp10(const __half2 h); +// __device__ static __half2 h2exp2(const __half2 h); +// __device__ static __half2 h2floor(const __half2 h); +// __device__ static __half2 h2log(const __half2 h); +// __device__ static __half2 h2log10(const __half2 h); +// __device__ static __half2 h2log2(const __half2 h); +// __device__ static __half2 h2rcp(const __half2 h); +// __device__ static __half2 h2rsqrt(const __half2 h); +// __device__ static __half2 h2sin(const __half2 h); +// __device__ static __half2 h2sqrt(const __half2 h); + +// /* +// Half2 Arithmetic Functions +// */ + +// __device__ static inline __half2 __hadd2(__half2 a, __half2 b) { +// __half2 c; +// c.xy = __hip_hc_ir_hadd2_int(a.xy, b.xy); +// return c; +// } + +// __device__ static inline __half2 __hadd2_sat(__half2 a, __half2 b) { +// __half2 c; +// c.xy = __hip_hc_ir_hadd2_int(a.xy, b.xy); +// return c; +// } + +// __device__ static inline __half2 __hfma2(__half2 a, __half2 b, __half2 c) { +// __half2 d; +// d.xy = __hip_hc_ir_hfma2_int(a.xy, b.xy, c.xy); +// return d; +// } + +// __device__ static inline __half2 __hfma2_sat(__half2 a, __half2 b, __half2 c) { +// __half2 d; +// d.xy = __hip_hc_ir_hfma2_int(a.xy, b.xy, c.xy); +// return d; +// } + +// __device__ static inline __half2 __hmul2(__half2 a, __half2 b) { +// __half2 c; +// c.xy = __hip_hc_ir_hmul2_int(a.xy, b.xy); +// return c; +// } + +// __device__ static inline __half2 __hmul2_sat(__half2 a, __half2 b) { +// __half2 c; +// c.xy = __hip_hc_ir_hmul2_int(a.xy, b.xy); +// return c; +// } + +// __device__ static inline __half2 __hsub2(__half2 a, __half2 b) { +// __half2 c; +// c.xy = __hip_hc_ir_hsub2_int(a.xy, b.xy); +// return c; +// } + +// __device__ static inline __half2 __hneg2(__half2 a) { +// __half2 c; +// c.x = -a.x; +// c.y = -a.y; +// return c; +// } + +// __device__ static inline __half2 __hsub2_sat(__half2 a, __half2 b) { +// __half2 c; +// c.xy = __hip_hc_ir_hsub2_int(a.xy, b.xy); +// return c; +// } + +// __device__ static inline __half2 h2div(__half2 a, __half2 b) { +// __half2 c; +// c.x = a.x / b.x; +// c.y = a.y / b.y; +// return c; +// } + + +// __device__ static inline __half hceil(const __half h) { return __hip_hc_ir_hceil_half(h); } + +// __device__ static inline __half hcos(const __half h) { return __hip_hc_ir_hcos_half(h); } + +// __device__ static inline __half hexp(const __half h) { +// return __hip_hc_ir_hexp2_half(__hmul(h, 1.442694)); +// } + +// __device__ static inline __half hexp10(const __half h) { +// return __hip_hc_ir_hexp2_half(__hmul(h, 3.3219281)); +// } + +// __device__ static inline __half hexp2(const __half h) { return __hip_hc_ir_hexp2_half(h); } + +// __device__ static inline __half hfloor(const __half h) { return __hip_hc_ir_hfloor_half(h); } + +// __device__ static inline __half hlog(const __half h) { +// return __hmul(__hip_hc_ir_hlog2_half(h), 0.693147); +// } + +// __device__ static inline __half hlog10(const __half h) { +// return __hmul(__hip_hc_ir_hlog2_half(h), 0.301029); +// } + +// __device__ static inline __half hlog2(const __half h) { return __hip_hc_ir_hlog2_half(h); } +// /* +// __device__ static inline __half hrcp(const __half h) { +// return __hip_hc_ir_hrcp_half(h); +// } +// */ +// __device__ static inline __half hrint(const __half h) { return __hip_hc_ir_hrint_half(h); } + +// __device__ static inline __half hrsqrt(const __half h) { return __hip_hc_ir_hrsqrt_half(h); } + +// __device__ static inline __half hsin(const __half h) { return __hip_hc_ir_hsin_half(h); } + +// __device__ static inline __half hsqrt(const __half a) { return __hip_hc_ir_hsqrt_half(a); } + +// __device__ static inline __half htrunc(const __half a) { return __hip_hc_ir_htrunc_half(a); } + +// /* +// Half2 Math Operations +// */ + +// __device__ static inline __half2 h2ceil(const __half2 h) { +// __half2 a; +// a.xy = __hip_hc_ir_h2ceil_int(h.xy); +// return a; +// } + +// __device__ static inline __half2 h2cos(const __half2 h) { +// __half2 a; +// a.xy = __hip_hc_ir_h2cos_int(h.xy); +// return a; +// } + +// __device__ static inline __half2 h2exp(const __half2 h) { +// __half2 factor; +// factor.x = 1.442694; +// factor.y = 1.442694; +// factor.xy = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.xy, factor.xy)); +// return factor; +// } + +// __device__ static inline __half2 h2exp10(const __half2 h) { +// __half2 factor; +// factor.x = 3.3219281; +// factor.y = 3.3219281; +// factor.xy = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.xy, factor.xy)); +// return factor; +// } + +// __device__ static inline __half2 h2exp2(const __half2 h) { +// __half2 a; +// a.xy = __hip_hc_ir_h2exp2_int(h.xy); +// return a; +// } + +// __device__ static inline __half2 h2floor(const __half2 h) { +// __half2 a; +// a.xy = __hip_hc_ir_h2floor_int(h.xy); +// return a; +// } + +// __device__ static inline __half2 h2log(const __half2 h) { +// __half2 factor; +// factor.x = 0.693147; +// factor.y = 0.693147; +// factor.xy = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.xy), factor.xy); +// return factor; +// } + +// __device__ static inline __half2 h2log10(const __half2 h) { +// __half2 factor; +// factor.x = 0.301029; +// factor.y = 0.301029; +// factor.xy = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.xy), factor.xy); +// return factor; +// } +// __device__ static inline __half2 h2log2(const __half2 h) { +// __half2 a; +// a.xy = __hip_hc_ir_h2log2_int(h.xy); +// return a; +// } + +// __device__ static inline __half2 h2rcp(const __half2 h) { +// __half2 a; +// a.xy = __hip_hc_ir_h2rcp_int(h.xy); +// return a; +// } + +// __device__ static inline __half2 h2rsqrt(const __half2 h) { +// __half2 a; +// a.xy = __hip_hc_ir_h2rsqrt_int(h.xy); +// return a; +// } + +// __device__ static inline __half2 h2sin(const __half2 h) { +// __half2 a; +// a.xy = __hip_hc_ir_h2sin_int(h.xy); +// return a; +// } + +// __device__ static inline __half2 h2sqrt(const __half2 h) { +// __half2 a; +// a.xy = __hip_hc_ir_h2sqrt_int(h.xy); +// return a; +// } + +// __device__ static inline __half2 h2trunc(const __half2 h) { +// __half2 a; +// a.xy = __hip_hc_ir_h2trunc_int(h.xy); +// return a; +// } +// #endif // clang_major > 3 + +// #endif diff --git a/hipamd/include/hip/hcc_detail/hip_fp16_gcc.h b/hipamd/include/hip/hcc_detail/hip_fp16_gcc.h new file mode 100644 index 0000000000..e53afadcf2 --- /dev/null +++ b/hipamd/include/hip/hcc_detail/hip_fp16_gcc.h @@ -0,0 +1,252 @@ +#pragma once + +#if defined(__cplusplus) + #include +#endif + +struct __half_raw { + unsigned short x; +}; + +struct __half2_raw { + unsigned short x; + unsigned short y; +}; + +#if defined(__cplusplus) + struct __half; + + __half __float2half(float); + float __half2float(__half); + + // BEGIN STRUCT __HALF + struct __half { + protected: + unsigned short __x; + public: + // CREATORS + __half() = default; + __half(const __half_raw& x) : __x{x.x} {} + #if !defined(__HIP_NO_HALF_CONVERSIONS__) + __half(float x) : __x{__float2half(x).__x} {} + __half(double x) : __x{__float2half(x).__x} {} + #endif + __half(const __half&) = default; + __half(__half&&) = default; + ~__half() = default; + + // MANIPULATORS + __half& operator=(const __half&) = default; + __half& operator=(__half&&) = default; + __half& operator=(const __half_raw& x) { __x = x.x; return *this; } + #if !defined(__HIP_NO_HALF_CONVERSIONS__) + __half& operator=(float x) + { + __x = __float2half(x).__x; + return *this; + } + __half& operator=(double x) + { + return *this = static_cast(x); + } + #endif + + // ACCESSORS + operator float() const { return __half2float(*this); } + operator __half_raw() const { return __half_raw{__x}; } + }; + // END STRUCT __HALF + + // BEGIN STRUCT __HALF2 + struct __half2 { + protected: + __half x; + __half y; + public: + // CREATORS + __half2() = default; + __half2(const __half2_raw& ix) + : + x{reinterpret_cast(ix.x)}, + y{reinterpret_cast(ix.y)} + {} + __half2(const __half& ix, const __half& iy) : x{ix}, y{iy} {} + __half2(const __half2&) = default; + __half2(__half2&&) = default; + ~__half2() = default; + + // MANIPULATORS + __half2& operator=(const __half2&) = default; + __half2& operator=(__half2&&) = default; + __half2& operator=(const __half2_raw& ix) + { + x = reinterpret_cast(ix.x); + y = reinterpret_cast(ix.y); + return *this; + } + + // ACCESSORS + operator __half2_raw() const + { + return __half2_raw{ + reinterpret_cast(x), + reinterpret_cast(y)}; + } + }; + // END STRUCT __HALF2 + + namespace + { + inline + unsigned short __internal_float2half( + float flt, unsigned int& sgn, unsigned int& rem) + { + unsigned int x{}; + std::memcpy(&x, &flt, sizeof(flt)); + + unsigned int u = (x & 0x7fffffffU); + sgn = ((x >> 16) & 0x8000U); + + // NaN/+Inf/-Inf + if (u >= 0x7f800000U) { + rem = 0; + return static_cast( + (u == 0x7f800000U) ? (sgn | 0x7c00U) : 0x7fffU); + } + // Overflows + if (u > 0x477fefffU) { + rem = 0x80000000U; + return static_cast(sgn | 0x7bffU); + } + // Normal numbers + if (u >= 0x38800000U) { + rem = u << 19; + u -= 0x38000000U; + return static_cast(sgn | (u >> 13)); + } + // +0/-0 + if (u < 0x33000001U) { + rem = u; + return static_cast(sgn); + } + // Denormal numbers + unsigned int exponent = u >> 23; + unsigned int mantissa = (u & 0x7fffffU); + unsigned int shift = 0x7eU - exponent; + mantissa |= 0x800000U; + rem = mantissa << (32 - shift); + return static_cast(sgn | (mantissa >> shift)); + } + + inline + __half __float2half(float x) + { + __half_raw r; + unsigned int sgn{}; + unsigned int rem{}; + r.x = __internal_float2half(x, sgn, rem); + if (rem > 0x80000000U || (rem == 0x80000000U && (r.x & 0x1))) ++r.x; + + return r; + } + + inline + __half __float2half_rn(float x) { return __float2half(x); } + + inline + __half __float2half_rz(float x) + { + __half_raw r; + unsigned int sgn{}; + unsigned int rem{}; + r.x = __internal_float2half(x, sgn, rem); + + return r; + } + + inline + __half __float2half_rd(float x) + { + __half_raw r; + unsigned int sgn{}; + unsigned int rem{}; + r.x = __internal_float2half(x, sgn, rem); + if (rem && sgn) ++r.x; + + return r; + } + + inline + __half __float2half_ru(float x) + { + __half_raw r; + unsigned int sgn{}; + unsigned int rem{}; + r.x = __internal_float2half(x, sgn, rem); + if (rem && !sgn) ++r.x; + + return r; + } + + inline + __half2 __float2half2_rn(float x) + { + return __half2{__float2half_rn(x), __float2half_rn(x)}; + } + + inline + __half2 __floats2half2_rn(float x, float y) + { + return __half2{__float2half_rn(x), __float2half_rn(y)}; + } + + inline + float __internal_half2float(unsigned short x) + { + unsigned int sign = ((x >> 15) & 1); + unsigned int exponent = ((x >> 10) & 0x1f); + unsigned int mantissa = ((x & 0x3ff) << 13); + + if (exponent == 0x1fU) { /* NaN or Inf */ + mantissa = (mantissa ? (sign = 0, 0x7fffffU) : 0); + exponent = 0xffU; + } else if (!exponent) { /* Denorm or Zero */ + if (mantissa) { + unsigned int msb; + exponent = 0x71U; + do { + msb = (mantissa & 0x400000U); + mantissa <<= 1; /* normalize */ + --exponent; + } while (!msb); + mantissa &= 0x7fffffU; /* 1.mantissa is implicit */ + } + } else { + exponent += 0x70U; + } + unsigned int u = ((sign << 31) | (exponent << 23) | mantissa); + float f; + memcpy(&f, &u, sizeof(u)); + + return f; + } + + inline + float __half2float(__half x) + { + return __internal_half2float(static_cast<__half_raw>(x).x); + } + + inline + float __low2float(__half2 x) + { + return __internal_half2float(static_cast<__half2_raw>(x).x); + } + + inline + float __high2float(__half2 x) + { + return __internal_half2float(static_cast<__half2_raw>(x).y); + } + } +#endif // defined(__cplusplus) \ No newline at end of file diff --git a/hipamd/include/hip/hcc_detail/hip_fp16_math_fwd.h b/hipamd/include/hip/hcc_detail/hip_fp16_math_fwd.h new file mode 100644 index 0000000000..81f9fe4761 --- /dev/null +++ b/hipamd/include/hip/hcc_detail/hip_fp16_math_fwd.h @@ -0,0 +1,76 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +// /* +// Half Math Functions +// */ + +extern "C" +{ + __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16); + _Float16 __ocml_cos_f16(_Float16); + __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16); + __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16); + __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16); + __attribute__((const)) _Float16 __ocml_floor_f16(_Float16); + __attribute__((const)) + _Float16 __ocml_fma_f16(_Float16, _Float16, _Float16); + __attribute__((const)) int __ocml_isinf_f16(_Float16); + __attribute__((const)) int __ocml_isnan_f16(_Float16); + __attribute__((pure)) _Float16 __ocml_log_f16(_Float16); + __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16); + __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16); + __attribute__((const)) _Float16 __llvm_amdgcn_rcp_f16(_Float16); + __attribute__((const)) _Float16 __ocml_rint_f16(_Float16); + __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16); + _Float16 __ocml_sin_f16(_Float16); + __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16); + __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16); + + typedef _Float16 __2f16 __attribute__((ext_vector_type(2))); + typedef short __2i16 __attribute__((ext_vector_type(2))); + + __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16); + __2f16 __ocml_cos_2f16(__2f16); + __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16); + __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16); + __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16); + __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16); + __attribute__((const)) __2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16); + __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16); + __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16); + __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16); + __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16); + __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16); + inline + __2f16 __llvm_amdgcn_rcp_2f16(__2f16 x) // Not currently exposed by ROCDL. + { + return __2f16{__llvm_amdgcn_rcp_f16(x.x), __llvm_amdgcn_rcp_f16(x.y)}; + } + __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16); + __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16); + __2f16 __ocml_sin_2f16(__2f16); + __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16); + __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16); +} \ No newline at end of file diff --git a/hipamd/tests/src/deviceLib/hipTestHalf.cpp b/hipamd/tests/src/deviceLib/hipTestHalf.cpp index 5a2aac2b29..3b5a75bcaf 100644 --- a/hipamd/tests/src/deviceLib/hipTestHalf.cpp +++ b/hipamd/tests/src/deviceLib/hipTestHalf.cpp @@ -34,7 +34,7 @@ THE SOFTWARE. #if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ -__global__ void __halfMath(hipLaunchParm lp, __half* A, __half* B, __half* C) { +__global__ void __halfMath(__half* A, __half* B, __half* C) { int tx = threadIdx.x; __half a = A[tx]; __half b = B[tx]; @@ -47,10 +47,10 @@ __global__ void __halfMath(hipLaunchParm lp, __half* A, __half* B, __half* C) { c = __hsub_sat(b, c); c = __hmul(a, c); c = __hmul_sat(b, c); - c = hdiv(a, c); + c = __hdiv(a, c); } -__global__ void __half2Math(hipLaunchParm lp, __half2* A, __half2* B, __half2* C) { +__global__ void __half2Math(__half2* A, __half2* B, __half2* C) { int tx = threadIdx.x; __half2 a = A[tx]; __half2 b = B[tx]; @@ -65,12 +65,12 @@ __global__ void __half2Math(hipLaunchParm lp, __half2* A, __half2* B, __half2* C c = __hmul2_sat(b, c); } -__global__ void kernel_hisnan(hipLaunchParm lp, __half* input, int* output) { +__global__ void kernel_hisnan(__half* input, int* output) { int tx = threadIdx.x; output[tx] = __hisnan(input[tx]); } -__global__ void kernel_hisinf(hipLaunchParm lp, __half* input, int* output) { +__global__ void kernel_hisinf(__half* input, int* output) { int tx = threadIdx.x; output[tx] = __hisinf(input[tx]); } @@ -93,7 +93,8 @@ void check_hisnan(int NUM_INPUTS, __half* inputCPU, __half* inputGPU) { hipMalloc((void**)&outputGPU, memsize); // launch the kernel - hipLaunchKernel(kernel_hisnan, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU); + hipLaunchKernelGGL( + kernel_hisnan, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU); // copy output from device int* outputCPU = (int*) malloc(memsize); @@ -103,12 +104,18 @@ void check_hisnan(int NUM_INPUTS, __half* inputCPU, __half* inputGPU) { for (int i=0; i(inputCPU[i]), + i); } } else { // inputs are NOT nan, output should be false if (outputCPU[i] != 0) { - failed("__hisnan() returned true for %f (input idx = %d)\n", inputCPU[i], i); + failed( + "__hisnan() returned true for %f (input idx = %d)\n", + static_cast(inputCPU[i]), + i); } } } @@ -129,7 +136,8 @@ void check_hisinf(int NUM_INPUTS, __half* inputCPU, __half* inputGPU) { hipMalloc((void**)&outputGPU, memsize); // launch the kernel - hipLaunchKernel(kernel_hisinf, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU); + hipLaunchKernelGGL( + kernel_hisinf, dim3(1), dim3(NUM_INPUTS), 0, 0, inputGPU, outputGPU); // copy output from device int* outputCPU = (int*) malloc(memsize); @@ -139,12 +147,18 @@ void check_hisinf(int NUM_INPUTS, __half* inputCPU, __half* inputGPU) { for (int i=0; i(inputCPU[i]), + i); } } else { // inputs are NOT inf, output should be false if (outputCPU[i] != 0) { - failed("__hisinf() returned true for %f (input idx = %d)\n", inputCPU[i], i); + failed( + "__hisinf() returned true for %f (input idx = %d)\n", + static_cast(inputCPU[i]), + i); } } } @@ -160,11 +174,11 @@ void check_hisinf(int NUM_INPUTS, __half* inputCPU, __half* inputGPU) { void checkFunctional() { - // allocate memory + // allocate memory const int NUM_INPUTS = 16; auto memsize = NUM_INPUTS * sizeof(__half); __half* inputCPU = (__half*) malloc(memsize); - + // populate inputs inputCPU[0] = host_ushort_as_half(0x7c00); // inf inputCPU[1] = host_ushort_as_half(0xfc00); // -inf @@ -207,7 +221,8 @@ int main() { hipMalloc(&A, HALF_SIZE); hipMalloc(&B, HALF_SIZE); hipMalloc(&C, HALF_SIZE); - hipLaunchKernel(__halfMath, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, A, B, C); + hipLaunchKernelGGL( + __halfMath, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, A, B, C); hipFree(A); hipFree(B); hipFree(C); @@ -215,13 +230,14 @@ int main() { hipMalloc(&A2, HALF2_SIZE); hipMalloc(&B2, HALF2_SIZE); hipMalloc(&C2, HALF2_SIZE); - hipLaunchKernel(__half2Math, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, A2, B2, C2); + hipLaunchKernelGGL( + __half2Math, dim3(1, 1, 1), dim3(LEN, 1, 1), 0, 0, A2, B2, C2); hipFree(A2); hipFree(B2); hipFree(C2); // run some functional checks checkFunctional(); - + passed(); } diff --git a/hipamd/tests/src/deviceLib/hipTestNativeHalf.cpp b/hipamd/tests/src/deviceLib/hipTestNativeHalf.cpp new file mode 100644 index 0000000000..f0224895fc --- /dev/null +++ b/hipamd/tests/src/deviceLib/hipTestNativeHalf.cpp @@ -0,0 +1,142 @@ +/* +Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * RUN: %t + * HIT_END + */ + +#include +#include "hip/hip_runtime.h" + +#include "test_common.h" + +#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ + +__global__ +__attribute__((optnone)) +void __halfTest(bool* result) { + // Construction + __half a{1}; result[0] = __heq(a, 1); + a = __half{1.0f}; result[0] = __heq(a, 1) && result[0]; + a = __half{1.0}; result[0] = __heq(a, 1) && result[0]; + a = __half{static_cast(1)}; + result[0] = __heq(a, 1) && result[0]; + a = __half{static_cast(1)}; result[0] = __heq(a, 1) && result[0]; + a = __half{1u}; result[0] = __heq(a, 1) && result[0]; + a = __half{1ul}; result[0] = __heq(a, 1) && result[0]; + a = __half{1l}; result[0] = __heq(a, 1) && result[0]; + a = __half{1ll}; result[0] = __heq(a, 1) && result[0]; + a = __half{1ull}; result[0] = __heq(a, 1) && result[0]; + + // Assignment + a = 0.0f; result[0] = __heq(a, 0) && result[0]; + a = 1.0; result[0] = __heq(a, 1) && result[0]; + a = __half_raw{2}; result[0] = __heq(a, 2) && result[0]; + + // Nullary + result[0] = __heq(a, +a) && result[0]; + result[0] = __heq(__hneg(a), -a) && result[0]; + + // Unary arithmetic + result[0] = __heq(a += 0, a) && result[0]; + result[0] = __heq(a -= 0, a) && result[0]; + result[0] = __heq(a *= 1, a) && result[0]; + result[0] = __heq(a /= 1, a) && result[0]; + + // Binary arithmetic + result[0] = __heq((a + a), __hadd(a, a)) && result[0]; + result[0] = __heq((a - a), __hsub(a, a)) && result[0]; + result[0] = __heq((a * a), __hmul(a, a)) && result[0]; + result[0] = __heq((a / a), __hdiv(a, a)) && result[0]; + + // Relations + result[0] = (a == a) && result[0]; + result[0] = !(a != a) && result[0]; + result[0] = (a <= a) && result[0]; + result[0] = (a >= a) && result[0]; + result[0] = !(a < a) && result[0]; + result[0] = !(a > a) && result[0]; +} + +__device__ +bool to_bool(const __half2& x) +{ + auto r = static_cast(x); + + return r.data.x != 0 && r.data.y != 0; +} +__global__ +__attribute__((optnone)) +void __half2Test(bool* result) { + // Construction + __half2 a{1}; + result[0] = to_bool(__heq2(a, 1)); + a = __half2{__half{1}, __half{1}}; + result[0] = to_bool(__heq2(a, {1, 1})) && result[0]; + + // Assignment + a = __half2_raw{2}; result[0] = to_bool(__heq2(a, {2, 2})) && result[0]; + + // Nullary + result[0] = to_bool(__heq2(a, +a)) && result[0]; + result[0] = to_bool(__heq2(__hneg2(a), -a)) && result[0]; + + // Unary arithmetic + result[0] = to_bool(__heq2(a += 0, a)) && result[0]; + result[0] = to_bool(__heq2(a -= 0, a)) && result[0]; + result[0] = to_bool(__heq2(a *= 1, a)) && result[0]; + result[0] = to_bool(__heq2(a /= 1, a)) && result[0]; + + // Binary arithmetic + result[0] = to_bool(__heq2((a + a), __hadd2(a, a))) && result[0]; + result[0] = to_bool(__heq2((a - a), __hsub2(a, a))) && result[0]; + result[0] = to_bool(__heq2((a * a), __hmul2(a, a))) && result[0]; + result[0] = to_bool(__heq2((a / a), __h2div(a, a))) && result[0]; + + // Relations + result[0] = (a == a) && result[0]; + result[0] = !(a != a) && result[0]; + result[0] = (a <= a) && result[0]; + result[0] = (a >= a) && result[0]; + result[0] = !(a < a) && result[0]; + result[0] = !(a > a) && result[0]; +} + +#endif + +int main() { + bool* result{nullptr}; + hipHostMalloc(&result, 1); + + result[0] = false; + hipLaunchKernelGGL(__halfTest, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, result); + hipDeviceSynchronize(); + + if (!result[0]) { failed("Failed __half tests."); } + + result[0] = false; + hipLaunchKernelGGL(__half2Test, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, result); + hipDeviceSynchronize(); + + if (!result[0]) { failed("Failed __half2 tests."); } + + passed(); +}