From 12febe67822e9a992a73fe6afce91504be5fdce0 Mon Sep 17 00:00:00 2001 From: "Chaudhary, Jatin Jaikishan" Date: Thu, 1 May 2025 15:17:38 +0100 Subject: [PATCH] SWDEV-514560 - add fp6 header implementation (#54) Co-authored-by: rahul manocha --- hipamd/include/hip/amd_detail/amd_hip_fp6.h | 733 ++++++++++++++++++++ 1 file changed, 733 insertions(+) create mode 100644 hipamd/include/hip/amd_detail/amd_hip_fp6.h diff --git a/hipamd/include/hip/amd_detail/amd_hip_fp6.h b/hipamd/include/hip/amd_detail/amd_hip_fp6.h new file mode 100644 index 0000000000..a323fa329b --- /dev/null +++ b/hipamd/include/hip/amd_detail/amd_hip_fp6.h @@ -0,0 +1,733 @@ +/* +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#pragma once + +#include "amd_hip_mx_common.h" + +#include "amd_hip_fp16.h" +#include "amd_hip_bf16.h" +#include "amd_hip_fp8.h" + +#include "amd_hip_ocp_types.h" +#include "amd_hip_ocp_host.hpp" +#include "hip/amd_detail/amd_hip_mx_common.h" + +#if defined(__HIPCC_RTC__) +#define __FP6_HOST_DEVICE__ __device__ +#define __FP6_HOST_DEVICE_STATIC__ __FP6_HOST_DEVICE__ static +#else +#define __FP6_HOST_DEVICE__ __host__ __device__ +#define __FP6_HOST_DEVICE_STATIC__ __FP6_HOST_DEVICE__ static inline +#endif // __HIPCC_RTC__ + +typedef __hip_fp8_storage_t __hip_fp6_storage_t; +typedef __hip_fp8x2_storage_t __hip_fp6x2_storage_t; +typedef __hip_fp8x4_storage_t __hip_fp6x4_storage_t; + +static_assert(sizeof(__hip_fp6_storage_t[4]) == sizeof(uint32_t)); +static_assert(sizeof(__hip_fp6x2_storage_t[2]) == sizeof(uint32_t)); +static_assert(sizeof(__hip_fp6x4_storage_t[2]) == sizeof(uint64_t)); + +enum __hip_fp6_interpretation_t { + __HIP_E3M2 = 0, /**< FP6 E3M2 Type*/ + __HIP_E2M3 = 1, /**< FP6 E2M3 Type */ +}; + +namespace internal { +__FP6_HOST_DEVICE_STATIC__ __amd_fp16_storage_t half_to_f16(const __half val) { + __half_raw tmp = val; + return tmp.data; +} +__FP6_HOST_DEVICE_STATIC__ __amd_fp16x2_storage_t half2_to_f16x2(const __half2 val) { + __half2_raw tmp = val; + return tmp.data; +} +__FP6_HOST_DEVICE_STATIC__ __amd_bf16_storage_t hipbf16_to_bf16(const __hip_bfloat16 val) { + static_assert(sizeof(__hip_bfloat16) == sizeof(__amd_bf16_storage_t)); + union { + __hip_bfloat16 hip_bf16; + __amd_bf16_storage_t bf16; + } u{val}; + return u.bf16; +} +__FP6_HOST_DEVICE_STATIC__ __amd_bf16x2_storage_t hipbf162_to_bf16x2(const __hip_bfloat162 val) { + static_assert(sizeof(__hip_bfloat162) == sizeof(__amd_bf16x2_storage_t)); + union { + __hip_bfloat162 hip_bf16; + __amd_bf16x2_storage_t bf16; + } u{val}; + return u.bf16; +} +} // namespace internal + + +// Note: Ignore rounding input on AMD GPUs for now. At the moment AMD GPUs do not support rounding +// modes, all the inputs are rounded to nearest or use an input to do stochastic rounding. +// We hide the rounding variable to not trigger the unused variable compiler warning. +__FP6_HOST_DEVICE_STATIC__ __hip_fp6_storage_t __hip_cvt_bfloat16raw_to_fp6( + const __hip_bfloat16_raw x, const __hip_fp6_interpretation_t fp6_interpretation, + const enum hipRoundMode /* rounding */) { + union { + uint32_t ui32; + __hip_fp6_storage_t fp6[4]; + } u{0}; +#if __gfx950__ + __amd_bf16x32_storage_t in; + __amd_fp6x32_storage_t out; + in[0] = internal::hipbf16_to_bf16(x); + if (fp6_interpretation == __HIP_E2M3) + out = __builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16(in, 1.0f /* scale */); + else if (fp6_interpretation == __HIP_E3M2) + out = __builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16(in, 1.0f /* scale */); + u.ui32 = out[0]; + return u.fp6[0]; +#else + if (fp6_interpretation == __HIP_E2M3) + u.ui32 = fcbx::from_float<__amd_bf16_storage_t, fcbx::Encoding::E2M3, true>( + internal::hipbf16_to_bf16(x), 0); + else if (fp6_interpretation == __HIP_E3M2) + u.ui32 = fcbx::from_float<__amd_bf16_storage_t, fcbx::Encoding::E3M2, true>( + internal::hipbf16_to_bf16(x), 0); + return u.fp6[0]; +#endif +} +__FP6_HOST_DEVICE_STATIC__ __hip_fp6x2_storage_t __hip_cvt_bfloat16raw2_to_fp6x2( + const __hip_bfloat162_raw x, const __hip_fp6_interpretation_t fp6_interpretation, + const enum hipRoundMode /* rounding */) { + union { + uint32_t ui32; + __hip_fp6x2_storage_t fp6x2[2]; + } u{0}; +#if __gfx950__ + __amd_bf16x32_storage_t in; + in[0] = internal::hipbf16_to_bf16(x.x); + in[1] = internal::hipbf16_to_bf16(x.y); + __amd_fp6x32_storage_t out; + if (fp6_interpretation == __HIP_E2M3) + out = __builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16(in, 1.0f /* scale */); + else if (fp6_interpretation == __HIP_E3M2) + out = __builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16(in, 1.0f /* scale */); + u.ui32 = out[0]; + return u.fp6x2[0]; +#else + if (fp6_interpretation == __HIP_E2M3) { + auto bf16x2 = internal::hipbf162_to_bf16x2(x); + u.ui32 |= fcbx::from_float<__amd_bf16_storage_t, fcbx::Encoding::E2M3, true>(bf16x2[1], 0); + u.ui32 <<= 8; + u.ui32 |= fcbx::from_float<__amd_bf16_storage_t, fcbx::Encoding::E2M3, true>(bf16x2[0], 0); + } else if (fp6_interpretation == __HIP_E3M2) { + auto bf16x2 = internal::hipbf162_to_bf16x2(x); + u.ui32 |= fcbx::from_float<__amd_bf16_storage_t, fcbx::Encoding::E3M2, true>(bf16x2[1], 0); + u.ui32 <<= 8; + u.ui32 |= fcbx::from_float<__amd_bf16_storage_t, fcbx::Encoding::E3M2, true>(bf16x2[0], 0); + } + return u.fp6x2[0]; +#endif +} +__FP6_HOST_DEVICE_STATIC__ __hip_fp6_storage_t +__hip_cvt_double_to_fp6(const double x, const __hip_fp6_interpretation_t fp6_interpretation_t, + const enum hipRoundMode /* rounding */) { + union { + uint32_t ui32; + __hip_fp6_storage_t fp6[4]; + } u{0}; +#if __gfx950__ + __amd_floatx16_storage_t in1; + __amd_floatx16_storage_t in2; + __amd_fp6x32_storage_t out; + in1[0] = float(x); + in2[0] = 0.0f; + if (fp6_interpretation_t == __HIP_E2M3) + out = __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, 1.0f /* scale */); + else if (fp6_interpretation_t == __HIP_E3M2) + out = __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, 1.0f /* scale */); + u.ui32 = out[0]; + return u.fp6[0]; +#else + if (fp6_interpretation_t == __HIP_E2M3) { + u.ui32 = fcbx::from_float(float(x), 0); + } else if (fp6_interpretation_t == __HIP_E3M2) { + u.ui32 = fcbx::from_float(float(x), 0); + } + return u.fp6[0]; +#endif +} +__FP6_HOST_DEVICE_STATIC__ __hip_fp6x2_storage_t +__hip_cvt_double2_to_fp6x2(const double2 x, const __hip_fp6_interpretation_t fp6_interpretation_t, + const enum hipRoundMode /* rounding */) { + union { + uint32_t ui32; + __hip_fp6x2_storage_t fp6x2[2]; + } u{0}; +#if __gfx950__ + __amd_floatx16_storage_t in1; + __amd_floatx16_storage_t in2; + __amd_fp6x32_storage_t out; + in1[0] = float(x.x); + in2[0] = float(x.y); + if (fp6_interpretation_t == __HIP_E2M3) + out = __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, 1.0f /* scale */); + else if (fp6_interpretation_t == __HIP_E3M2) + out = __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, 1.0f /* scale */); + u.ui32 = out[0] & 0x3Fu; + u.ui32 |= ((out[0] & 0xFC0u) << 2); + return u.fp6x2[0]; +#else + if (fp6_interpretation_t == __HIP_E2M3) { + u.ui32 |= fcbx::from_float(float(x.y), 0); + u.ui32 <<= 8; + u.ui32 |= fcbx::from_float(float(x.x), 0); + } else if (fp6_interpretation_t == __HIP_E3M2) { + u.ui32 |= fcbx::from_float(float(x.y), 0); + u.ui32 <<= 8; + u.ui32 |= fcbx::from_float(float(x.x), 0); + } + return u.fp6x2[0]; +#endif +} +__FP6_HOST_DEVICE_STATIC__ __hip_fp6_storage_t +__hip_cvt_float_to_fp6(const float x, const __hip_fp6_interpretation_t fp6_interpretation_t, + const enum hipRoundMode /* rounding */) { + union { + uint32_t ui32; + __hip_fp6_storage_t fp6[4]; + } u{0}; +#if __gfx950__ + __amd_floatx16_storage_t in1; + __amd_floatx16_storage_t in2; + __amd_fp6x32_storage_t out; + in1[0] = x; + in2[0] = 0.0f; + if (fp6_interpretation_t == __HIP_E2M3) + out = __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, 1.0f /* scale */); + else if (fp6_interpretation_t == __HIP_E3M2) + out = __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, 1.0f /* scale */); + u.ui32 = out[0]; + return u.fp6[0]; +#else + if (fp6_interpretation_t == __HIP_E2M3) + u.ui32 = fcbx::from_float(x, 0); + else if (fp6_interpretation_t == __HIP_E3M2) + u.ui32 = fcbx::from_float(x, 0); + return u.fp6[0]; +#endif +} +__FP6_HOST_DEVICE_STATIC__ __hip_fp6x2_storage_t +__hip_cvt_float2_to_fp6x2(const float2 x, const __hip_fp6_interpretation_t fp6_interpretation_t, + const enum hipRoundMode /* rounding */) { + union { + uint32_t ui32; + __hip_fp6x2_storage_t fp6x2[2]; + } u{0}; +#if __gfx950__ + __amd_floatx16_storage_t in1; + __amd_floatx16_storage_t in2; + __amd_fp6x32_storage_t out; + in1[0] = x.x; + in2[0] = x.y; + if (fp6_interpretation_t == __HIP_E2M3) + out = __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, 1.0f /* scale */); + else if (fp6_interpretation_t == __HIP_E3M2) + out = __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, 1.0f /* scale */); + u.ui32 = out[0] & 0x3Fu; + u.ui32 |= ((out[0] & 0xFC0u) << 2); + return u.fp6x2[0]; +#else + if (fp6_interpretation_t == __HIP_E2M3) { + u.ui32 |= fcbx::from_float(x.y, 0); + u.ui32 <<= 8; + u.ui32 |= fcbx::from_float(x.x, 0); + } else if (fp6_interpretation_t == __HIP_E3M2) { + u.ui32 |= fcbx::from_float(x.y, 0); + u.ui32 <<= 8; + u.ui32 |= fcbx::from_float(x.x, 0); + } + return u.fp6x2[0]; +#endif +} +__FP6_HOST_DEVICE_STATIC__ __half_raw __hip_cvt_fp6_to_halfraw( + const __hip_fp6_storage_t x, const __hip_fp6_interpretation_t fp6_interpretation_t) { + __half_raw ret; +#if __gfx950__ + __amd_fp16x32_storage_t out; + __amd_fp6x32_storage_t in; + in[0] = (uint32_t)x; + if (fp6_interpretation_t == __HIP_E2M3) + out = __builtin_amdgcn_cvt_scalef32_pk32_f16_fp6(in, 1.0f); + else if (fp6_interpretation_t == __HIP_E3M2) + out = __builtin_amdgcn_cvt_scalef32_pk32_f16_bf6(in, 1.0f); + ret.data = out[0]; +#else + using namespace fcbx; + if (fp6_interpretation_t == __HIP_E2M3) { + ret.data = __amd_fp16_storage_t{to_float<__amd_fp16_storage_t, Encoding::E2M3, true>(x, 0)}; + } else if (fp6_interpretation_t == __HIP_E3M2) { + ret.data = __amd_fp16_storage_t{to_float<__amd_fp16_storage_t, Encoding::E3M2, true>(x, 0)}; + } +#endif + return ret; +} +__FP6_HOST_DEVICE_STATIC__ __half2_raw __hip_cvt_fp6x2_to_halfraw2( + const __hip_fp6x2_storage_t x, const __hip_fp6_interpretation_t fp6_interpretation_t) { + __half2_raw ret; +#if __gfx950__ + __amd_fp16x32_storage_t out; + __amd_fp6x32_storage_t in; + in[0] = x & 0x3Fu; // first 6 bits + in[0] |= (x & 0x3F00u) >> 2; // next 6 bits + if (fp6_interpretation_t == __HIP_E2M3) + out = __builtin_amdgcn_cvt_scalef32_pk32_f16_fp6(in, 1.0f); + else if (fp6_interpretation_t == __HIP_E3M2) + out = __builtin_amdgcn_cvt_scalef32_pk32_f16_bf6(in, 1.0f); + ret.data = {out[0], out[1]}; +#else + using namespace fcbx; + if (fp6_interpretation_t == __HIP_E2M3) { + ret.data = + __amd_fp16x2_storage_t{to_float<__amd_fp16_storage_t, Encoding::E2M3, true>(x & 0xFFu, 0), + to_float<__amd_fp16_storage_t, Encoding::E2M3, true>(x >> 8, 0)}; + } else if (fp6_interpretation_t == __HIP_E3M2) { + ret.data = + __amd_fp16x2_storage_t{to_float<__amd_fp16_storage_t, Encoding::E3M2, true>(x & 0xFFu, 0), + to_float<__amd_fp16_storage_t, Encoding::E3M2, true>(x >> 8, 0)}; + } +#endif + return ret; +} +__FP6_HOST_DEVICE_STATIC__ __hip_fp6_storage_t +__hip_cvt_halfraw_to_fp6(const __half_raw x, const __hip_fp6_interpretation_t fp6_interpretation_t, + const enum hipRoundMode /* rounding */) { + union { + uint32_t ui32; + __hip_fp6_storage_t fp6[4]; + } u{0}; +#if __gfx950__ + __amd_fp16x32_storage_t in; + __amd_fp6x32_storage_t out; + in[0] = x.data; + if (fp6_interpretation_t == __HIP_E2M3) + out = __builtin_amdgcn_cvt_scalef32_pk32_fp6_f16(in, 1.0f); + else if (fp6_interpretation_t == __HIP_E3M2) + out = __builtin_amdgcn_cvt_scalef32_pk32_bf6_f16(in, 1.0f); + u.ui32 = out[0]; + return u.fp6[0]; +#else + if (fp6_interpretation_t == __HIP_E2M3) { + u.ui32 = fcbx::from_float<__amd_fp16_storage_t, fcbx::Encoding::E2M3, true>( + internal::half_to_f16(x), 0); + } else if (fp6_interpretation_t == __HIP_E3M2) { + u.ui32 = fcbx::from_float<__amd_fp16_storage_t, fcbx::Encoding::E3M2, true>( + internal::half_to_f16(x), 0); + } + return u.fp6[0]; +#endif +} +__FP6_HOST_DEVICE_STATIC__ __hip_fp6x2_storage_t __hip_cvt_halfraw2_to_fp6x2( + const __half2_raw x, const __hip_fp6_interpretation_t fp6_interpretation_t, + const enum hipRoundMode /* rounding */) { + union { + uint32_t ui32; + __hip_fp6x2_storage_t fp6x2[2]; + } u{0}; +#if __gfx950__ + __amd_fp16x32_storage_t in; + __amd_fp6x32_storage_t out; + in[0] = x.data[0]; + in[1] = x.data[1]; + if (fp6_interpretation_t == __HIP_E2M3) + out = __builtin_amdgcn_cvt_scalef32_pk32_fp6_f16(in, 1.0f); + else if (fp6_interpretation_t == __HIP_E3M2) + out = __builtin_amdgcn_cvt_scalef32_pk32_bf6_f16(in, 1.0f); + u.ui32 = out[0] & 0x3Fu; + u.ui32 |= ((out[0] & 0xFC0u) << 2); + return u.fp6x2[0]; +#else + auto fp16x2 = internal::half2_to_f16x2(x); + if (fp6_interpretation_t == __HIP_E2M3) { + u.ui32 |= fcbx::from_float<__amd_fp16_storage_t, fcbx::Encoding::E2M3, true>(fp16x2[1], 0); + u.ui32 <<= 8; + u.ui32 |= fcbx::from_float<__amd_fp16_storage_t, fcbx::Encoding::E2M3, true>(fp16x2[0], 0); + } else if (fp6_interpretation_t == __HIP_E3M2) { + u.ui32 |= fcbx::from_float<__amd_fp16_storage_t, fcbx::Encoding::E3M2, true>(fp16x2[1], 0); + u.ui32 <<= 8; + u.ui32 |= fcbx::from_float<__amd_fp16_storage_t, fcbx::Encoding::E3M2, true>(fp16x2[0], 0); + } + return u.fp6x2[0]; +#endif +} + +//======================== structs ==================== +struct __hip_fp6_e2m3 { + __hip_fp6_storage_t __x; + + __FP6_HOST_DEVICE__ __hip_fp6_e2m3() = default; +#if !defined(__HIP_NO_FP6_CONVERSIONS__) + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const __half f) + : __x(__hip_cvt_halfraw_to_fp6(f, __HIP_E2M3, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const __hip_bfloat16 f) + : __x(__hip_cvt_bfloat16raw_to_fp6(f, __HIP_E2M3, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const double f) + : __x(__hip_cvt_double_to_fp6(f, __HIP_E2M3, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const float f) + : __x(__hip_cvt_float_to_fp6(f, __HIP_E2M3, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const int val) + : __x(__hip_cvt_float_to_fp6(float(val), __HIP_E2M3, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const long int val) + : __x(__hip_cvt_float_to_fp6(float(val), __HIP_E2M3, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const long long int val) + : __x(__hip_cvt_float_to_fp6(float(val), __HIP_E2M3, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const short int val) + : __x(__hip_cvt_float_to_fp6(float(val), __HIP_E2M3, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const unsigned int val) + : __x(__hip_cvt_float_to_fp6(float(val), __HIP_E2M3, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const unsigned long int val) + : __x(__hip_cvt_float_to_fp6(float(val), __HIP_E2M3, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const unsigned long long int val) + : __x(__hip_cvt_float_to_fp6(float(val), __HIP_E2M3, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const unsigned short int val) + : __x(__hip_cvt_float_to_fp6(float(val), __HIP_E2M3, hipRoundNearest)) {} +#endif // !defined(__HIP_NO_FP6_CONVERSIONS__) + +#if !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__) + __FP6_HOST_DEVICE__ operator __half_raw() const { + return __hip_cvt_fp6_to_halfraw(__x, __HIP_E2M3); + } + __FP6_HOST_DEVICE__ operator __hip_bfloat16_raw() const { + static_assert(sizeof(__hip_bfloat16_raw) == sizeof(__amd_bf16_storage_t)); + union { + __hip_bfloat16_raw bf16_raw; + __amd_bf16_storage_t bf16; + } u; +#if HIP_ENABLE_GFX950_OCP_BUILTINS + __amd_fp6x32_storage_t in; + __amd_bf16x32_storage_t out; + in[0] = (uint32_t)__x; + out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6(in, 1.0f /* scale */); + u.bf16 = out[0]; +#else + using namespace fcbx; + u.bf16 = to_float<__amd_bf16_storage_t, Encoding::E2M3, true>(__x, 0); +#endif + return u.bf16_raw; + } + __FP6_HOST_DEVICE__ operator float() const { +#if HIP_ENABLE_GFX950_OCP_BUILTINS + __amd_fp6x32_storage_t in; + __amd_floatx32_storage_t out; + in[0] = (uint32_t)__x; + out = __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(in, 1.0f /* scale */); + auto ret = out[0]; +#else + using namespace fcbx; + float ret{to_float(__x, 0)}; +#endif + return ret; + } + __FP6_HOST_DEVICE__ operator double() const { return double(float(*this)); } +#endif // !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__) +}; + +struct __hip_fp6_e3m2 { + __hip_fp6_storage_t __x; + + __FP6_HOST_DEVICE__ __hip_fp6_e3m2() = default; +#if !defined(__HIP_NO_FP6_CONVERSIONS__) + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const __half f) + : __x(__hip_cvt_halfraw_to_fp6(f, __HIP_E3M2, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const __hip_bfloat16 f) + : __x(__hip_cvt_bfloat16raw_to_fp6(f, __HIP_E3M2, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const double f) + : __x(__hip_cvt_double_to_fp6(f, __HIP_E3M2, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const float f) + : __x(__hip_cvt_float_to_fp6(f, __HIP_E3M2, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const int val) + : __x(__hip_cvt_float_to_fp6(float(val), __HIP_E3M2, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const long int val) + : __x(__hip_cvt_float_to_fp6(float(val), __HIP_E3M2, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const long long int val) + : __x(__hip_cvt_float_to_fp6(float(val), __HIP_E3M2, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const short int val) + : __x(__hip_cvt_float_to_fp6(float(val), __HIP_E3M2, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const unsigned int val) + : __x(__hip_cvt_float_to_fp6(float(val), __HIP_E3M2, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const unsigned long int val) + : __x(__hip_cvt_float_to_fp6(float(val), __HIP_E3M2, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const unsigned long long int val) + : __x(__hip_cvt_float_to_fp6(float(val), __HIP_E3M2, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const unsigned short int val) + : __x(__hip_cvt_float_to_fp6(float(val), __HIP_E3M2, hipRoundNearest)) {} +#endif // !defined(__HIP_NO_FP6_CONVERSIONS__) +#if !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__) + __FP6_HOST_DEVICE__ operator __half_raw() const { + return __hip_cvt_fp6_to_halfraw(__x, __HIP_E3M2); + } + __FP6_HOST_DEVICE__ operator __hip_bfloat16_raw() const { + static_assert(sizeof(__hip_bfloat16_raw) == sizeof(__amd_bf16_storage_t)); + union { + __hip_bfloat16_raw bf16_raw; + __amd_bf16_storage_t bf16; + } u; +#if HIP_ENABLE_GFX950_OCP_BUILTINS + __amd_fp6x32_storage_t in; + __amd_bf16x32_storage_t out; + in[0] = (uint32_t)__x; + out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6(in, 1.0f /* scale */); + u.bf16 = out[0]; +#else + using namespace fcbx; + u.bf16 = to_float<__amd_bf16_storage_t, Encoding::E3M2, true>(__x, 0); +#endif + return u.bf16_raw; + } + __FP6_HOST_DEVICE__ operator float() const { +#if HIP_ENABLE_GFX950_OCP_BUILTINS + __amd_fp6x32_storage_t in; + __amd_floatx32_storage_t out; + in[0] = (uint32_t)__x; + out = __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(in, 1.0f /* scale */); + auto ret = out[0]; +#else + using namespace fcbx; + float ret{to_float(__x, 0)}; +#endif + return ret; + } + __FP6_HOST_DEVICE__ operator double() const { return double(float(*this)); } +#endif // !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__) +}; + +struct __hip_fp6x2_e2m3 { + __hip_fp6x2_storage_t __x; + __FP6_HOST_DEVICE__ inline __hip_fp6x2_e2m3() = default; +#if !defined(__HIP_NO_FP6_CONVERSIONS__) + __FP6_HOST_DEVICE__ inline explicit __hip_fp6x2_e2m3(const __half2 f) + : __x(__hip_cvt_halfraw2_to_fp6x2(f, __HIP_E2M3, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6x2_e2m3(const __hip_bfloat162 f) + : __x(__hip_cvt_bfloat16raw2_to_fp6x2(f, __HIP_E2M3, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6x2_e2m3(const double2 f) + : __x(__hip_cvt_double2_to_fp6x2(f, __HIP_E2M3, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6x2_e2m3(const float2 f) + : __x(__hip_cvt_float2_to_fp6x2(f, __HIP_E2M3, hipRoundNearest)) {} +#endif // !defined(__HIP_NO_FP6_CONVERSIONS__) +#if !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__) + __FP6_HOST_DEVICE__ operator __half2_raw() const { + return __hip_cvt_fp6x2_to_halfraw2(__x, __HIP_E2M3); + } + __FP6_HOST_DEVICE__ operator __hip_bfloat162_raw() const { + static_assert(sizeof(__hip_bfloat162_raw) == sizeof(__amd_bf16x2_storage_t)); + union { + __hip_bfloat162_raw bf162_raw; + __amd_bf16x2_storage_t bf16x2; + } u; +#if HIP_ENABLE_GFX950_OCP_BUILTINS + __amd_fp6x32_storage_t in; + __amd_bf16x32_storage_t out; + in[0] = __x & 0x3Fu; // first 6 bits + in[0] |= (__x & FC00u) >> 2; // next 6 bits + out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6(in, 1.0f /* scale */); + u.bf16x2 = {out[0], out[1]}; +#else + using namespace fcbx; + u.bf16x2 = + __amd_bf16x2_storage_t{to_float<__amd_bf16_storage_t, Encoding::E2M3, true>(__x & 0xFFu, 0), + to_float<__amd_bf16_storage_t, Encoding::E2M3, true>(__x >> 8, 0)}; +#endif + return u.bf162_raw; + } + __FP6_HOST_DEVICE__ operator float2() const { +#if HIP_ENABLE_GFX950_OCP_BUILTINS + __amd_fp6x32_storage_t in; + __amd_floatx32_storage_t out; + in[0] = __x & 0x3Fu; // first 6 bits + in[0] |= (__x & FC00u) >> 2; // next 6 bits + out = __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(in, 1.0f /* scale */); + auto fp32x2 = {out[0], out[1]}; +#else + using namespace fcbx; + auto fp32x2 = __amd_floatx2_storage_t{to_float(__x & 0xFFu, 0), + to_float(__x >> 8, 0)}; +#endif + return float2(fp32x2[0], fp32x2[1]); + } + __FP6_HOST_DEVICE__ operator double2() const { + auto fp32 = float2(*this); + return double2(fp32.x, fp32.y); + } +#endif // !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__) +}; + +struct __hip_fp6x2_e3m2 { + __hip_fp6x2_storage_t __x; + __FP6_HOST_DEVICE__ inline __hip_fp6x2_e3m2() = default; +#if !defined(__HIP_NO_FP6_CONVERSIONS__) + __FP6_HOST_DEVICE__ inline explicit __hip_fp6x2_e3m2(const __half2 f) + : __x(__hip_cvt_halfraw2_to_fp6x2(f, __HIP_E3M2, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6x2_e3m2(const __hip_bfloat162 f) + : __x(__hip_cvt_bfloat16raw2_to_fp6x2(f, __HIP_E3M2, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6x2_e3m2(const double2 f) + : __x(__hip_cvt_double2_to_fp6x2(f, __HIP_E3M2, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6x2_e3m2(const float2 f) + : __x(__hip_cvt_float2_to_fp6x2(f, __HIP_E3M2, hipRoundNearest)) {} +#endif //! defined(__HIP_NO_FP6_CONVERSIONS__) +#if !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__) + __FP6_HOST_DEVICE__ operator __half2_raw() const { + return __hip_cvt_fp6x2_to_halfraw2(__x, __HIP_E3M2); + } + __FP6_HOST_DEVICE__ operator __hip_bfloat162_raw() const { + static_assert(sizeof(__hip_bfloat162_raw) == sizeof(__amd_bf16x2_storage_t)); + union { + __hip_bfloat162_raw bf162_raw; + __amd_bf16x2_storage_t bf16x2; + } u; +#if HIP_ENABLE_GFX950_OCP_BUILTINS + __amd_fp6x32_storage_t in; + __amd_bf16x32_storage_t out; + in[0] = __x & 0x3Fu; // first 6 bits + in[0] |= (__x & FC00u) >> 2; // next 6 bits + out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6(in, 1.0f /* scale */); + u.bf16x2 = {out[0], out[1]}; +#else + using namespace fcbx; + u.bf16x2 = + __amd_bf16x2_storage_t{to_float<__amd_bf16_storage_t, Encoding::E3M2, true>(__x & 0xFFu, 0), + to_float<__amd_bf16_storage_t, Encoding::E3M2, true>(__x >> 8, 0)}; +#endif + return u.bf162_raw; + } + __FP6_HOST_DEVICE__ operator float2() const { +#if HIP_ENABLE_GFX950_OCP_BUILTINS + __amd_fp6x32_storage_t in; + __amd_floatx32_storage_t out; + in[0] = __x & 0x3Fu; // first 6 bits + in[0] |= (__x & FC00u) >> 2; // next 6 bits + out = __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(in, 1.0f /* scale */); + auto fp32x2 = {out[0], out[1]}; +#else + using namespace fcbx; + auto fp32x2 = __amd_floatx2_storage_t{to_float(__x & 0xFFu, 0), + to_float(__x >> 8, 0)}; +#endif + return float2(fp32x2[0], fp32x2[1]); + } + __FP6_HOST_DEVICE__ operator double2() const { + auto fp32 = float2(*this); + return double2(fp32.x, fp32.y); + } +#endif // !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__) +}; + +struct __hip_fp6x4_e2m3 { + __hip_fp6x4_storage_t __x; + __FP6_HOST_DEVICE__ inline __hip_fp6x4_e2m3() = default; +#if !defined(__HIP_NO_FP6_CONVERSIONS__) + __FP6_HOST_DEVICE__ inline explicit __hip_fp6x4_e2m3(const __half2 low, const __half2 high) + : __x(__hip_cvt_halfraw2_to_fp6x2(high, __HIP_E2M3, hipRoundNearest) << 16 | + __hip_cvt_halfraw2_to_fp6x2(low, __HIP_E2M3, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6x4_e2m3(const __hip_bfloat162 low, + const __hip_bfloat162 high) + : __x(__hip_cvt_bfloat16raw2_to_fp6x2(high, __HIP_E2M3, hipRoundNearest) << 16 | + __hip_cvt_bfloat16raw2_to_fp6x2(low, __HIP_E2M3, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6x4_e2m3(const double4 f) + : __x(__hip_cvt_double2_to_fp6x2(double2(f.z, f.w), __HIP_E2M3, hipRoundNearest) << 16 | + __hip_cvt_double2_to_fp6x2(double2(f.x, f.y), __HIP_E2M3, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6x4_e2m3(const float4 f) + : __x(__hip_cvt_float2_to_fp6x2(float2(f.z, f.w), __HIP_E2M3, hipRoundNearest) << 16 | + __hip_cvt_float2_to_fp6x2(float2(f.x, f.y), __HIP_E2M3, hipRoundNearest)) {} +#endif // !defined(__HIP_NO_FP6_CONVERSIONS__) +#if !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__) + __FP6_HOST_DEVICE__ operator float4() const { +#if HIP_ENABLE_GFX950_OCP_BUILTINS + __amd_fp6x32_storage_t in; + __amd_floatx32_storage_t out; + in[0] = __x & 0x3Fu; // first 6 bits + in[0] |= ((__x >> 8) & 0x3Fu) << 6; // second 6 bits + in[0] |= ((__x >> 16) & 0x3Fu) << 12; + in[0] |= ((__x >> 24) & 0x3Fu) << 18; + out = __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(in, 1.0f /* scale */); + auto fp32x2_1 = {out[0], out[1]}; + auto fp32x2_2 = {out[2], out[3]}; +#else + using namespace fcbx; + auto fp32x2_1 = + __amd_floatx2_storage_t{to_float(__x & 0xFFu, 0), + to_float((__x >> 8) & 0xFFu, 0)}; + auto fp32x2_2 = + __amd_floatx2_storage_t{to_float((__x >> 16) & 0xFFu, 0), + to_float(__x >> 24, 0)}; +#endif + return float4{fp32x2_1[0], fp32x2_1[1], fp32x2_2[0], fp32x2_2[1]}; + } + __FP6_HOST_DEVICE__ operator double4() const { + auto fp32 = float4(*this); + return double4{fp32.x, fp32.y, fp32.z, fp32.w}; + } +#endif // !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__) +}; + +struct __hip_fp6x4_e3m2 { + __hip_fp6x4_storage_t __x; + __FP6_HOST_DEVICE__ inline __hip_fp6x4_e3m2() = default; +#if !defined(__HIP_NO_FP6_CONVERSIONS__) + __FP6_HOST_DEVICE__ inline explicit __hip_fp6x4_e3m2(const __half2 high, const __half2 low) + : __x(__hip_cvt_halfraw2_to_fp6x2(high, __HIP_E3M2, hipRoundNearest) << 16 | + __hip_cvt_halfraw2_to_fp6x2(low, __HIP_E3M2, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6x4_e3m2(const __hip_bfloat162 high, + const __hip_bfloat162 low) + : __x(__hip_cvt_bfloat16raw2_to_fp6x2(high, __HIP_E3M2, hipRoundNearest) << 16 | + __hip_cvt_bfloat16raw2_to_fp6x2(low, __HIP_E3M2, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6x4_e3m2(const double4 f) + : __x(__hip_cvt_double2_to_fp6x2(double2(f.z, f.w), __HIP_E3M2, hipRoundNearest) << 16 | + __hip_cvt_double2_to_fp6x2(double2(f.x, f.y), __HIP_E3M2, hipRoundNearest)) {} + __FP6_HOST_DEVICE__ inline explicit __hip_fp6x4_e3m2(const float4 f) + : __x(__hip_cvt_float2_to_fp6x2(float2(f.z, f.w), __HIP_E3M2, hipRoundNearest) << 16 | + __hip_cvt_float2_to_fp6x2(float2(f.x, f.y), __HIP_E3M2, hipRoundNearest)) {} +#endif //! defined(__HIP_NO_FP6_CONVERSIONS__) +#if !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__) + __FP6_HOST_DEVICE__ operator float4() const { +#if HIP_ENABLE_GFX950_OCP_BUILTINS + __amd_fp6x32_storage_t in; + __amd_floatx32_storage_t out; + in[0] = __x & 0x3Fu; // first 6 bits + in[0] |= ((__x >> 8) & 0x3Fu) << 6; // second 6 bits + in[0] |= ((__x >> 16) & 0x3Fu) << 12; + in[0] |= ((__x >> 24) & 0x3Fu) << 18; + out = __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(in, 1.0f /* scale */); + auto fp32x2_1 = {out[0], out[1]}; + auto fp32x2_2 = {out[2], out[3]}; +#else + using namespace fcbx; + auto fp32x2_1 = + __amd_floatx2_storage_t{to_float(__x & 0xFFu, 0), + to_float((__x >> 8) & 0xFFu, 0)}; + auto fp32x2_2 = + __amd_floatx2_storage_t{to_float((__x >> 16) & 0xFFu, 0), + to_float(__x >> 24, 0)}; +#endif + return float4{fp32x2_1[0], fp32x2_1[1], fp32x2_2[0], fp32x2_2[1]}; + } + __FP6_HOST_DEVICE__ operator double4() const { + auto fp32 = float4(*this); + return double4{fp32.x, fp32.y, fp32.z, fp32.w}; + } +#endif // !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__) +};