2
0

SWDEV-514560 - add fp6 header implementation (#54)

Co-authored-by: rahul manocha <rmanocha_amdeng>
Este cometimento está contido em:
Chaudhary, Jatin Jaikishan
2025-05-01 15:17:38 +01:00
cometido por GitHub
ascendente 9d6a0d1a4d
cometimento 12febe6782
+733
Ver ficheiro
@@ -0,0 +1,733 @@
/*
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#pragma once
#include "amd_hip_mx_common.h"
#include "amd_hip_fp16.h"
#include "amd_hip_bf16.h"
#include "amd_hip_fp8.h"
#include "amd_hip_ocp_types.h"
#include "amd_hip_ocp_host.hpp"
#include "hip/amd_detail/amd_hip_mx_common.h"
#if defined(__HIPCC_RTC__)
#define __FP6_HOST_DEVICE__ __device__
#define __FP6_HOST_DEVICE_STATIC__ __FP6_HOST_DEVICE__ static
#else
#define __FP6_HOST_DEVICE__ __host__ __device__
#define __FP6_HOST_DEVICE_STATIC__ __FP6_HOST_DEVICE__ static inline
#endif // __HIPCC_RTC__
typedef __hip_fp8_storage_t __hip_fp6_storage_t;
typedef __hip_fp8x2_storage_t __hip_fp6x2_storage_t;
typedef __hip_fp8x4_storage_t __hip_fp6x4_storage_t;
static_assert(sizeof(__hip_fp6_storage_t[4]) == sizeof(uint32_t));
static_assert(sizeof(__hip_fp6x2_storage_t[2]) == sizeof(uint32_t));
static_assert(sizeof(__hip_fp6x4_storage_t[2]) == sizeof(uint64_t));
enum __hip_fp6_interpretation_t {
__HIP_E3M2 = 0, /**< FP6 E3M2 Type*/
__HIP_E2M3 = 1, /**< FP6 E2M3 Type */
};
namespace internal {
__FP6_HOST_DEVICE_STATIC__ __amd_fp16_storage_t half_to_f16(const __half val) {
__half_raw tmp = val;
return tmp.data;
}
__FP6_HOST_DEVICE_STATIC__ __amd_fp16x2_storage_t half2_to_f16x2(const __half2 val) {
__half2_raw tmp = val;
return tmp.data;
}
__FP6_HOST_DEVICE_STATIC__ __amd_bf16_storage_t hipbf16_to_bf16(const __hip_bfloat16 val) {
static_assert(sizeof(__hip_bfloat16) == sizeof(__amd_bf16_storage_t));
union {
__hip_bfloat16 hip_bf16;
__amd_bf16_storage_t bf16;
} u{val};
return u.bf16;
}
__FP6_HOST_DEVICE_STATIC__ __amd_bf16x2_storage_t hipbf162_to_bf16x2(const __hip_bfloat162 val) {
static_assert(sizeof(__hip_bfloat162) == sizeof(__amd_bf16x2_storage_t));
union {
__hip_bfloat162 hip_bf16;
__amd_bf16x2_storage_t bf16;
} u{val};
return u.bf16;
}
} // namespace internal
// Note: Ignore rounding input on AMD GPUs for now. At the moment AMD GPUs do not support rounding
// modes, all the inputs are rounded to nearest or use an input to do stochastic rounding.
// We hide the rounding variable to not trigger the unused variable compiler warning.
__FP6_HOST_DEVICE_STATIC__ __hip_fp6_storage_t __hip_cvt_bfloat16raw_to_fp6(
const __hip_bfloat16_raw x, const __hip_fp6_interpretation_t fp6_interpretation,
const enum hipRoundMode /* rounding */) {
union {
uint32_t ui32;
__hip_fp6_storage_t fp6[4];
} u{0};
#if __gfx950__
__amd_bf16x32_storage_t in;
__amd_fp6x32_storage_t out;
in[0] = internal::hipbf16_to_bf16(x);
if (fp6_interpretation == __HIP_E2M3)
out = __builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16(in, 1.0f /* scale */);
else if (fp6_interpretation == __HIP_E3M2)
out = __builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16(in, 1.0f /* scale */);
u.ui32 = out[0];
return u.fp6[0];
#else
if (fp6_interpretation == __HIP_E2M3)
u.ui32 = fcbx::from_float<__amd_bf16_storage_t, fcbx::Encoding::E2M3, true>(
internal::hipbf16_to_bf16(x), 0);
else if (fp6_interpretation == __HIP_E3M2)
u.ui32 = fcbx::from_float<__amd_bf16_storage_t, fcbx::Encoding::E3M2, true>(
internal::hipbf16_to_bf16(x), 0);
return u.fp6[0];
#endif
}
__FP6_HOST_DEVICE_STATIC__ __hip_fp6x2_storage_t __hip_cvt_bfloat16raw2_to_fp6x2(
const __hip_bfloat162_raw x, const __hip_fp6_interpretation_t fp6_interpretation,
const enum hipRoundMode /* rounding */) {
union {
uint32_t ui32;
__hip_fp6x2_storage_t fp6x2[2];
} u{0};
#if __gfx950__
__amd_bf16x32_storage_t in;
in[0] = internal::hipbf16_to_bf16(x.x);
in[1] = internal::hipbf16_to_bf16(x.y);
__amd_fp6x32_storage_t out;
if (fp6_interpretation == __HIP_E2M3)
out = __builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16(in, 1.0f /* scale */);
else if (fp6_interpretation == __HIP_E3M2)
out = __builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16(in, 1.0f /* scale */);
u.ui32 = out[0];
return u.fp6x2[0];
#else
if (fp6_interpretation == __HIP_E2M3) {
auto bf16x2 = internal::hipbf162_to_bf16x2(x);
u.ui32 |= fcbx::from_float<__amd_bf16_storage_t, fcbx::Encoding::E2M3, true>(bf16x2[1], 0);
u.ui32 <<= 8;
u.ui32 |= fcbx::from_float<__amd_bf16_storage_t, fcbx::Encoding::E2M3, true>(bf16x2[0], 0);
} else if (fp6_interpretation == __HIP_E3M2) {
auto bf16x2 = internal::hipbf162_to_bf16x2(x);
u.ui32 |= fcbx::from_float<__amd_bf16_storage_t, fcbx::Encoding::E3M2, true>(bf16x2[1], 0);
u.ui32 <<= 8;
u.ui32 |= fcbx::from_float<__amd_bf16_storage_t, fcbx::Encoding::E3M2, true>(bf16x2[0], 0);
}
return u.fp6x2[0];
#endif
}
__FP6_HOST_DEVICE_STATIC__ __hip_fp6_storage_t
__hip_cvt_double_to_fp6(const double x, const __hip_fp6_interpretation_t fp6_interpretation_t,
const enum hipRoundMode /* rounding */) {
union {
uint32_t ui32;
__hip_fp6_storage_t fp6[4];
} u{0};
#if __gfx950__
__amd_floatx16_storage_t in1;
__amd_floatx16_storage_t in2;
__amd_fp6x32_storage_t out;
in1[0] = float(x);
in2[0] = 0.0f;
if (fp6_interpretation_t == __HIP_E2M3)
out = __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, 1.0f /* scale */);
else if (fp6_interpretation_t == __HIP_E3M2)
out = __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, 1.0f /* scale */);
u.ui32 = out[0];
return u.fp6[0];
#else
if (fp6_interpretation_t == __HIP_E2M3) {
u.ui32 = fcbx::from_float<float, fcbx::Encoding::E2M3, true>(float(x), 0);
} else if (fp6_interpretation_t == __HIP_E3M2) {
u.ui32 = fcbx::from_float<float, fcbx::Encoding::E3M2, true>(float(x), 0);
}
return u.fp6[0];
#endif
}
__FP6_HOST_DEVICE_STATIC__ __hip_fp6x2_storage_t
__hip_cvt_double2_to_fp6x2(const double2 x, const __hip_fp6_interpretation_t fp6_interpretation_t,
const enum hipRoundMode /* rounding */) {
union {
uint32_t ui32;
__hip_fp6x2_storage_t fp6x2[2];
} u{0};
#if __gfx950__
__amd_floatx16_storage_t in1;
__amd_floatx16_storage_t in2;
__amd_fp6x32_storage_t out;
in1[0] = float(x.x);
in2[0] = float(x.y);
if (fp6_interpretation_t == __HIP_E2M3)
out = __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, 1.0f /* scale */);
else if (fp6_interpretation_t == __HIP_E3M2)
out = __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, 1.0f /* scale */);
u.ui32 = out[0] & 0x3Fu;
u.ui32 |= ((out[0] & 0xFC0u) << 2);
return u.fp6x2[0];
#else
if (fp6_interpretation_t == __HIP_E2M3) {
u.ui32 |= fcbx::from_float<float, fcbx::Encoding::E2M3, true>(float(x.y), 0);
u.ui32 <<= 8;
u.ui32 |= fcbx::from_float<float, fcbx::Encoding::E2M3, true>(float(x.x), 0);
} else if (fp6_interpretation_t == __HIP_E3M2) {
u.ui32 |= fcbx::from_float<float, fcbx::Encoding::E3M2, true>(float(x.y), 0);
u.ui32 <<= 8;
u.ui32 |= fcbx::from_float<float, fcbx::Encoding::E3M2, true>(float(x.x), 0);
}
return u.fp6x2[0];
#endif
}
__FP6_HOST_DEVICE_STATIC__ __hip_fp6_storage_t
__hip_cvt_float_to_fp6(const float x, const __hip_fp6_interpretation_t fp6_interpretation_t,
const enum hipRoundMode /* rounding */) {
union {
uint32_t ui32;
__hip_fp6_storage_t fp6[4];
} u{0};
#if __gfx950__
__amd_floatx16_storage_t in1;
__amd_floatx16_storage_t in2;
__amd_fp6x32_storage_t out;
in1[0] = x;
in2[0] = 0.0f;
if (fp6_interpretation_t == __HIP_E2M3)
out = __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, 1.0f /* scale */);
else if (fp6_interpretation_t == __HIP_E3M2)
out = __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, 1.0f /* scale */);
u.ui32 = out[0];
return u.fp6[0];
#else
if (fp6_interpretation_t == __HIP_E2M3)
u.ui32 = fcbx::from_float<float, fcbx::Encoding::E2M3, true>(x, 0);
else if (fp6_interpretation_t == __HIP_E3M2)
u.ui32 = fcbx::from_float<float, fcbx::Encoding::E3M2, true>(x, 0);
return u.fp6[0];
#endif
}
__FP6_HOST_DEVICE_STATIC__ __hip_fp6x2_storage_t
__hip_cvt_float2_to_fp6x2(const float2 x, const __hip_fp6_interpretation_t fp6_interpretation_t,
const enum hipRoundMode /* rounding */) {
union {
uint32_t ui32;
__hip_fp6x2_storage_t fp6x2[2];
} u{0};
#if __gfx950__
__amd_floatx16_storage_t in1;
__amd_floatx16_storage_t in2;
__amd_fp6x32_storage_t out;
in1[0] = x.x;
in2[0] = x.y;
if (fp6_interpretation_t == __HIP_E2M3)
out = __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, 1.0f /* scale */);
else if (fp6_interpretation_t == __HIP_E3M2)
out = __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, 1.0f /* scale */);
u.ui32 = out[0] & 0x3Fu;
u.ui32 |= ((out[0] & 0xFC0u) << 2);
return u.fp6x2[0];
#else
if (fp6_interpretation_t == __HIP_E2M3) {
u.ui32 |= fcbx::from_float<float, fcbx::Encoding::E2M3, true>(x.y, 0);
u.ui32 <<= 8;
u.ui32 |= fcbx::from_float<float, fcbx::Encoding::E2M3, true>(x.x, 0);
} else if (fp6_interpretation_t == __HIP_E3M2) {
u.ui32 |= fcbx::from_float<float, fcbx::Encoding::E3M2, true>(x.y, 0);
u.ui32 <<= 8;
u.ui32 |= fcbx::from_float<float, fcbx::Encoding::E3M2, true>(x.x, 0);
}
return u.fp6x2[0];
#endif
}
__FP6_HOST_DEVICE_STATIC__ __half_raw __hip_cvt_fp6_to_halfraw(
const __hip_fp6_storage_t x, const __hip_fp6_interpretation_t fp6_interpretation_t) {
__half_raw ret;
#if __gfx950__
__amd_fp16x32_storage_t out;
__amd_fp6x32_storage_t in;
in[0] = (uint32_t)x;
if (fp6_interpretation_t == __HIP_E2M3)
out = __builtin_amdgcn_cvt_scalef32_pk32_f16_fp6(in, 1.0f);
else if (fp6_interpretation_t == __HIP_E3M2)
out = __builtin_amdgcn_cvt_scalef32_pk32_f16_bf6(in, 1.0f);
ret.data = out[0];
#else
using namespace fcbx;
if (fp6_interpretation_t == __HIP_E2M3) {
ret.data = __amd_fp16_storage_t{to_float<__amd_fp16_storage_t, Encoding::E2M3, true>(x, 0)};
} else if (fp6_interpretation_t == __HIP_E3M2) {
ret.data = __amd_fp16_storage_t{to_float<__amd_fp16_storage_t, Encoding::E3M2, true>(x, 0)};
}
#endif
return ret;
}
__FP6_HOST_DEVICE_STATIC__ __half2_raw __hip_cvt_fp6x2_to_halfraw2(
const __hip_fp6x2_storage_t x, const __hip_fp6_interpretation_t fp6_interpretation_t) {
__half2_raw ret;
#if __gfx950__
__amd_fp16x32_storage_t out;
__amd_fp6x32_storage_t in;
in[0] = x & 0x3Fu; // first 6 bits
in[0] |= (x & 0x3F00u) >> 2; // next 6 bits
if (fp6_interpretation_t == __HIP_E2M3)
out = __builtin_amdgcn_cvt_scalef32_pk32_f16_fp6(in, 1.0f);
else if (fp6_interpretation_t == __HIP_E3M2)
out = __builtin_amdgcn_cvt_scalef32_pk32_f16_bf6(in, 1.0f);
ret.data = {out[0], out[1]};
#else
using namespace fcbx;
if (fp6_interpretation_t == __HIP_E2M3) {
ret.data =
__amd_fp16x2_storage_t{to_float<__amd_fp16_storage_t, Encoding::E2M3, true>(x & 0xFFu, 0),
to_float<__amd_fp16_storage_t, Encoding::E2M3, true>(x >> 8, 0)};
} else if (fp6_interpretation_t == __HIP_E3M2) {
ret.data =
__amd_fp16x2_storage_t{to_float<__amd_fp16_storage_t, Encoding::E3M2, true>(x & 0xFFu, 0),
to_float<__amd_fp16_storage_t, Encoding::E3M2, true>(x >> 8, 0)};
}
#endif
return ret;
}
__FP6_HOST_DEVICE_STATIC__ __hip_fp6_storage_t
__hip_cvt_halfraw_to_fp6(const __half_raw x, const __hip_fp6_interpretation_t fp6_interpretation_t,
const enum hipRoundMode /* rounding */) {
union {
uint32_t ui32;
__hip_fp6_storage_t fp6[4];
} u{0};
#if __gfx950__
__amd_fp16x32_storage_t in;
__amd_fp6x32_storage_t out;
in[0] = x.data;
if (fp6_interpretation_t == __HIP_E2M3)
out = __builtin_amdgcn_cvt_scalef32_pk32_fp6_f16(in, 1.0f);
else if (fp6_interpretation_t == __HIP_E3M2)
out = __builtin_amdgcn_cvt_scalef32_pk32_bf6_f16(in, 1.0f);
u.ui32 = out[0];
return u.fp6[0];
#else
if (fp6_interpretation_t == __HIP_E2M3) {
u.ui32 = fcbx::from_float<__amd_fp16_storage_t, fcbx::Encoding::E2M3, true>(
internal::half_to_f16(x), 0);
} else if (fp6_interpretation_t == __HIP_E3M2) {
u.ui32 = fcbx::from_float<__amd_fp16_storage_t, fcbx::Encoding::E3M2, true>(
internal::half_to_f16(x), 0);
}
return u.fp6[0];
#endif
}
__FP6_HOST_DEVICE_STATIC__ __hip_fp6x2_storage_t __hip_cvt_halfraw2_to_fp6x2(
const __half2_raw x, const __hip_fp6_interpretation_t fp6_interpretation_t,
const enum hipRoundMode /* rounding */) {
union {
uint32_t ui32;
__hip_fp6x2_storage_t fp6x2[2];
} u{0};
#if __gfx950__
__amd_fp16x32_storage_t in;
__amd_fp6x32_storage_t out;
in[0] = x.data[0];
in[1] = x.data[1];
if (fp6_interpretation_t == __HIP_E2M3)
out = __builtin_amdgcn_cvt_scalef32_pk32_fp6_f16(in, 1.0f);
else if (fp6_interpretation_t == __HIP_E3M2)
out = __builtin_amdgcn_cvt_scalef32_pk32_bf6_f16(in, 1.0f);
u.ui32 = out[0] & 0x3Fu;
u.ui32 |= ((out[0] & 0xFC0u) << 2);
return u.fp6x2[0];
#else
auto fp16x2 = internal::half2_to_f16x2(x);
if (fp6_interpretation_t == __HIP_E2M3) {
u.ui32 |= fcbx::from_float<__amd_fp16_storage_t, fcbx::Encoding::E2M3, true>(fp16x2[1], 0);
u.ui32 <<= 8;
u.ui32 |= fcbx::from_float<__amd_fp16_storage_t, fcbx::Encoding::E2M3, true>(fp16x2[0], 0);
} else if (fp6_interpretation_t == __HIP_E3M2) {
u.ui32 |= fcbx::from_float<__amd_fp16_storage_t, fcbx::Encoding::E3M2, true>(fp16x2[1], 0);
u.ui32 <<= 8;
u.ui32 |= fcbx::from_float<__amd_fp16_storage_t, fcbx::Encoding::E3M2, true>(fp16x2[0], 0);
}
return u.fp6x2[0];
#endif
}
//======================== structs ====================
struct __hip_fp6_e2m3 {
__hip_fp6_storage_t __x;
__FP6_HOST_DEVICE__ __hip_fp6_e2m3() = default;
#if !defined(__HIP_NO_FP6_CONVERSIONS__)
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const __half f)
: __x(__hip_cvt_halfraw_to_fp6(f, __HIP_E2M3, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const __hip_bfloat16 f)
: __x(__hip_cvt_bfloat16raw_to_fp6(f, __HIP_E2M3, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const double f)
: __x(__hip_cvt_double_to_fp6(f, __HIP_E2M3, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const float f)
: __x(__hip_cvt_float_to_fp6(f, __HIP_E2M3, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const int val)
: __x(__hip_cvt_float_to_fp6(float(val), __HIP_E2M3, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const long int val)
: __x(__hip_cvt_float_to_fp6(float(val), __HIP_E2M3, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const long long int val)
: __x(__hip_cvt_float_to_fp6(float(val), __HIP_E2M3, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const short int val)
: __x(__hip_cvt_float_to_fp6(float(val), __HIP_E2M3, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const unsigned int val)
: __x(__hip_cvt_float_to_fp6(float(val), __HIP_E2M3, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const unsigned long int val)
: __x(__hip_cvt_float_to_fp6(float(val), __HIP_E2M3, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const unsigned long long int val)
: __x(__hip_cvt_float_to_fp6(float(val), __HIP_E2M3, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e2m3(const unsigned short int val)
: __x(__hip_cvt_float_to_fp6(float(val), __HIP_E2M3, hipRoundNearest)) {}
#endif // !defined(__HIP_NO_FP6_CONVERSIONS__)
#if !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__)
__FP6_HOST_DEVICE__ operator __half_raw() const {
return __hip_cvt_fp6_to_halfraw(__x, __HIP_E2M3);
}
__FP6_HOST_DEVICE__ operator __hip_bfloat16_raw() const {
static_assert(sizeof(__hip_bfloat16_raw) == sizeof(__amd_bf16_storage_t));
union {
__hip_bfloat16_raw bf16_raw;
__amd_bf16_storage_t bf16;
} u;
#if HIP_ENABLE_GFX950_OCP_BUILTINS
__amd_fp6x32_storage_t in;
__amd_bf16x32_storage_t out;
in[0] = (uint32_t)__x;
out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6(in, 1.0f /* scale */);
u.bf16 = out[0];
#else
using namespace fcbx;
u.bf16 = to_float<__amd_bf16_storage_t, Encoding::E2M3, true>(__x, 0);
#endif
return u.bf16_raw;
}
__FP6_HOST_DEVICE__ operator float() const {
#if HIP_ENABLE_GFX950_OCP_BUILTINS
__amd_fp6x32_storage_t in;
__amd_floatx32_storage_t out;
in[0] = (uint32_t)__x;
out = __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(in, 1.0f /* scale */);
auto ret = out[0];
#else
using namespace fcbx;
float ret{to_float<float, Encoding::E2M3, true>(__x, 0)};
#endif
return ret;
}
__FP6_HOST_DEVICE__ operator double() const { return double(float(*this)); }
#endif // !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__)
};
struct __hip_fp6_e3m2 {
__hip_fp6_storage_t __x;
__FP6_HOST_DEVICE__ __hip_fp6_e3m2() = default;
#if !defined(__HIP_NO_FP6_CONVERSIONS__)
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const __half f)
: __x(__hip_cvt_halfraw_to_fp6(f, __HIP_E3M2, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const __hip_bfloat16 f)
: __x(__hip_cvt_bfloat16raw_to_fp6(f, __HIP_E3M2, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const double f)
: __x(__hip_cvt_double_to_fp6(f, __HIP_E3M2, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const float f)
: __x(__hip_cvt_float_to_fp6(f, __HIP_E3M2, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const int val)
: __x(__hip_cvt_float_to_fp6(float(val), __HIP_E3M2, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const long int val)
: __x(__hip_cvt_float_to_fp6(float(val), __HIP_E3M2, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const long long int val)
: __x(__hip_cvt_float_to_fp6(float(val), __HIP_E3M2, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const short int val)
: __x(__hip_cvt_float_to_fp6(float(val), __HIP_E3M2, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const unsigned int val)
: __x(__hip_cvt_float_to_fp6(float(val), __HIP_E3M2, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const unsigned long int val)
: __x(__hip_cvt_float_to_fp6(float(val), __HIP_E3M2, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const unsigned long long int val)
: __x(__hip_cvt_float_to_fp6(float(val), __HIP_E3M2, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6_e3m2(const unsigned short int val)
: __x(__hip_cvt_float_to_fp6(float(val), __HIP_E3M2, hipRoundNearest)) {}
#endif // !defined(__HIP_NO_FP6_CONVERSIONS__)
#if !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__)
__FP6_HOST_DEVICE__ operator __half_raw() const {
return __hip_cvt_fp6_to_halfraw(__x, __HIP_E3M2);
}
__FP6_HOST_DEVICE__ operator __hip_bfloat16_raw() const {
static_assert(sizeof(__hip_bfloat16_raw) == sizeof(__amd_bf16_storage_t));
union {
__hip_bfloat16_raw bf16_raw;
__amd_bf16_storage_t bf16;
} u;
#if HIP_ENABLE_GFX950_OCP_BUILTINS
__amd_fp6x32_storage_t in;
__amd_bf16x32_storage_t out;
in[0] = (uint32_t)__x;
out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6(in, 1.0f /* scale */);
u.bf16 = out[0];
#else
using namespace fcbx;
u.bf16 = to_float<__amd_bf16_storage_t, Encoding::E3M2, true>(__x, 0);
#endif
return u.bf16_raw;
}
__FP6_HOST_DEVICE__ operator float() const {
#if HIP_ENABLE_GFX950_OCP_BUILTINS
__amd_fp6x32_storage_t in;
__amd_floatx32_storage_t out;
in[0] = (uint32_t)__x;
out = __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(in, 1.0f /* scale */);
auto ret = out[0];
#else
using namespace fcbx;
float ret{to_float<float, Encoding::E3M2, true>(__x, 0)};
#endif
return ret;
}
__FP6_HOST_DEVICE__ operator double() const { return double(float(*this)); }
#endif // !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__)
};
struct __hip_fp6x2_e2m3 {
__hip_fp6x2_storage_t __x;
__FP6_HOST_DEVICE__ inline __hip_fp6x2_e2m3() = default;
#if !defined(__HIP_NO_FP6_CONVERSIONS__)
__FP6_HOST_DEVICE__ inline explicit __hip_fp6x2_e2m3(const __half2 f)
: __x(__hip_cvt_halfraw2_to_fp6x2(f, __HIP_E2M3, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6x2_e2m3(const __hip_bfloat162 f)
: __x(__hip_cvt_bfloat16raw2_to_fp6x2(f, __HIP_E2M3, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6x2_e2m3(const double2 f)
: __x(__hip_cvt_double2_to_fp6x2(f, __HIP_E2M3, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6x2_e2m3(const float2 f)
: __x(__hip_cvt_float2_to_fp6x2(f, __HIP_E2M3, hipRoundNearest)) {}
#endif // !defined(__HIP_NO_FP6_CONVERSIONS__)
#if !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__)
__FP6_HOST_DEVICE__ operator __half2_raw() const {
return __hip_cvt_fp6x2_to_halfraw2(__x, __HIP_E2M3);
}
__FP6_HOST_DEVICE__ operator __hip_bfloat162_raw() const {
static_assert(sizeof(__hip_bfloat162_raw) == sizeof(__amd_bf16x2_storage_t));
union {
__hip_bfloat162_raw bf162_raw;
__amd_bf16x2_storage_t bf16x2;
} u;
#if HIP_ENABLE_GFX950_OCP_BUILTINS
__amd_fp6x32_storage_t in;
__amd_bf16x32_storage_t out;
in[0] = __x & 0x3Fu; // first 6 bits
in[0] |= (__x & FC00u) >> 2; // next 6 bits
out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6(in, 1.0f /* scale */);
u.bf16x2 = {out[0], out[1]};
#else
using namespace fcbx;
u.bf16x2 =
__amd_bf16x2_storage_t{to_float<__amd_bf16_storage_t, Encoding::E2M3, true>(__x & 0xFFu, 0),
to_float<__amd_bf16_storage_t, Encoding::E2M3, true>(__x >> 8, 0)};
#endif
return u.bf162_raw;
}
__FP6_HOST_DEVICE__ operator float2() const {
#if HIP_ENABLE_GFX950_OCP_BUILTINS
__amd_fp6x32_storage_t in;
__amd_floatx32_storage_t out;
in[0] = __x & 0x3Fu; // first 6 bits
in[0] |= (__x & FC00u) >> 2; // next 6 bits
out = __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(in, 1.0f /* scale */);
auto fp32x2 = {out[0], out[1]};
#else
using namespace fcbx;
auto fp32x2 = __amd_floatx2_storage_t{to_float<float, Encoding::E2M3, true>(__x & 0xFFu, 0),
to_float<float, Encoding::E2M3, true>(__x >> 8, 0)};
#endif
return float2(fp32x2[0], fp32x2[1]);
}
__FP6_HOST_DEVICE__ operator double2() const {
auto fp32 = float2(*this);
return double2(fp32.x, fp32.y);
}
#endif // !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__)
};
struct __hip_fp6x2_e3m2 {
__hip_fp6x2_storage_t __x;
__FP6_HOST_DEVICE__ inline __hip_fp6x2_e3m2() = default;
#if !defined(__HIP_NO_FP6_CONVERSIONS__)
__FP6_HOST_DEVICE__ inline explicit __hip_fp6x2_e3m2(const __half2 f)
: __x(__hip_cvt_halfraw2_to_fp6x2(f, __HIP_E3M2, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6x2_e3m2(const __hip_bfloat162 f)
: __x(__hip_cvt_bfloat16raw2_to_fp6x2(f, __HIP_E3M2, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6x2_e3m2(const double2 f)
: __x(__hip_cvt_double2_to_fp6x2(f, __HIP_E3M2, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6x2_e3m2(const float2 f)
: __x(__hip_cvt_float2_to_fp6x2(f, __HIP_E3M2, hipRoundNearest)) {}
#endif //! defined(__HIP_NO_FP6_CONVERSIONS__)
#if !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__)
__FP6_HOST_DEVICE__ operator __half2_raw() const {
return __hip_cvt_fp6x2_to_halfraw2(__x, __HIP_E3M2);
}
__FP6_HOST_DEVICE__ operator __hip_bfloat162_raw() const {
static_assert(sizeof(__hip_bfloat162_raw) == sizeof(__amd_bf16x2_storage_t));
union {
__hip_bfloat162_raw bf162_raw;
__amd_bf16x2_storage_t bf16x2;
} u;
#if HIP_ENABLE_GFX950_OCP_BUILTINS
__amd_fp6x32_storage_t in;
__amd_bf16x32_storage_t out;
in[0] = __x & 0x3Fu; // first 6 bits
in[0] |= (__x & FC00u) >> 2; // next 6 bits
out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6(in, 1.0f /* scale */);
u.bf16x2 = {out[0], out[1]};
#else
using namespace fcbx;
u.bf16x2 =
__amd_bf16x2_storage_t{to_float<__amd_bf16_storage_t, Encoding::E3M2, true>(__x & 0xFFu, 0),
to_float<__amd_bf16_storage_t, Encoding::E3M2, true>(__x >> 8, 0)};
#endif
return u.bf162_raw;
}
__FP6_HOST_DEVICE__ operator float2() const {
#if HIP_ENABLE_GFX950_OCP_BUILTINS
__amd_fp6x32_storage_t in;
__amd_floatx32_storage_t out;
in[0] = __x & 0x3Fu; // first 6 bits
in[0] |= (__x & FC00u) >> 2; // next 6 bits
out = __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(in, 1.0f /* scale */);
auto fp32x2 = {out[0], out[1]};
#else
using namespace fcbx;
auto fp32x2 = __amd_floatx2_storage_t{to_float<float, Encoding::E3M2, true>(__x & 0xFFu, 0),
to_float<float, Encoding::E3M2, true>(__x >> 8, 0)};
#endif
return float2(fp32x2[0], fp32x2[1]);
}
__FP6_HOST_DEVICE__ operator double2() const {
auto fp32 = float2(*this);
return double2(fp32.x, fp32.y);
}
#endif // !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__)
};
struct __hip_fp6x4_e2m3 {
__hip_fp6x4_storage_t __x;
__FP6_HOST_DEVICE__ inline __hip_fp6x4_e2m3() = default;
#if !defined(__HIP_NO_FP6_CONVERSIONS__)
__FP6_HOST_DEVICE__ inline explicit __hip_fp6x4_e2m3(const __half2 low, const __half2 high)
: __x(__hip_cvt_halfraw2_to_fp6x2(high, __HIP_E2M3, hipRoundNearest) << 16 |
__hip_cvt_halfraw2_to_fp6x2(low, __HIP_E2M3, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6x4_e2m3(const __hip_bfloat162 low,
const __hip_bfloat162 high)
: __x(__hip_cvt_bfloat16raw2_to_fp6x2(high, __HIP_E2M3, hipRoundNearest) << 16 |
__hip_cvt_bfloat16raw2_to_fp6x2(low, __HIP_E2M3, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6x4_e2m3(const double4 f)
: __x(__hip_cvt_double2_to_fp6x2(double2(f.z, f.w), __HIP_E2M3, hipRoundNearest) << 16 |
__hip_cvt_double2_to_fp6x2(double2(f.x, f.y), __HIP_E2M3, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6x4_e2m3(const float4 f)
: __x(__hip_cvt_float2_to_fp6x2(float2(f.z, f.w), __HIP_E2M3, hipRoundNearest) << 16 |
__hip_cvt_float2_to_fp6x2(float2(f.x, f.y), __HIP_E2M3, hipRoundNearest)) {}
#endif // !defined(__HIP_NO_FP6_CONVERSIONS__)
#if !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__)
__FP6_HOST_DEVICE__ operator float4() const {
#if HIP_ENABLE_GFX950_OCP_BUILTINS
__amd_fp6x32_storage_t in;
__amd_floatx32_storage_t out;
in[0] = __x & 0x3Fu; // first 6 bits
in[0] |= ((__x >> 8) & 0x3Fu) << 6; // second 6 bits
in[0] |= ((__x >> 16) & 0x3Fu) << 12;
in[0] |= ((__x >> 24) & 0x3Fu) << 18;
out = __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(in, 1.0f /* scale */);
auto fp32x2_1 = {out[0], out[1]};
auto fp32x2_2 = {out[2], out[3]};
#else
using namespace fcbx;
auto fp32x2_1 =
__amd_floatx2_storage_t{to_float<float, Encoding::E2M3, true>(__x & 0xFFu, 0),
to_float<float, Encoding::E2M3, true>((__x >> 8) & 0xFFu, 0)};
auto fp32x2_2 =
__amd_floatx2_storage_t{to_float<float, Encoding::E2M3, true>((__x >> 16) & 0xFFu, 0),
to_float<float, Encoding::E2M3, true>(__x >> 24, 0)};
#endif
return float4{fp32x2_1[0], fp32x2_1[1], fp32x2_2[0], fp32x2_2[1]};
}
__FP6_HOST_DEVICE__ operator double4() const {
auto fp32 = float4(*this);
return double4{fp32.x, fp32.y, fp32.z, fp32.w};
}
#endif // !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__)
};
struct __hip_fp6x4_e3m2 {
__hip_fp6x4_storage_t __x;
__FP6_HOST_DEVICE__ inline __hip_fp6x4_e3m2() = default;
#if !defined(__HIP_NO_FP6_CONVERSIONS__)
__FP6_HOST_DEVICE__ inline explicit __hip_fp6x4_e3m2(const __half2 high, const __half2 low)
: __x(__hip_cvt_halfraw2_to_fp6x2(high, __HIP_E3M2, hipRoundNearest) << 16 |
__hip_cvt_halfraw2_to_fp6x2(low, __HIP_E3M2, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6x4_e3m2(const __hip_bfloat162 high,
const __hip_bfloat162 low)
: __x(__hip_cvt_bfloat16raw2_to_fp6x2(high, __HIP_E3M2, hipRoundNearest) << 16 |
__hip_cvt_bfloat16raw2_to_fp6x2(low, __HIP_E3M2, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6x4_e3m2(const double4 f)
: __x(__hip_cvt_double2_to_fp6x2(double2(f.z, f.w), __HIP_E3M2, hipRoundNearest) << 16 |
__hip_cvt_double2_to_fp6x2(double2(f.x, f.y), __HIP_E3M2, hipRoundNearest)) {}
__FP6_HOST_DEVICE__ inline explicit __hip_fp6x4_e3m2(const float4 f)
: __x(__hip_cvt_float2_to_fp6x2(float2(f.z, f.w), __HIP_E3M2, hipRoundNearest) << 16 |
__hip_cvt_float2_to_fp6x2(float2(f.x, f.y), __HIP_E3M2, hipRoundNearest)) {}
#endif //! defined(__HIP_NO_FP6_CONVERSIONS__)
#if !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__)
__FP6_HOST_DEVICE__ operator float4() const {
#if HIP_ENABLE_GFX950_OCP_BUILTINS
__amd_fp6x32_storage_t in;
__amd_floatx32_storage_t out;
in[0] = __x & 0x3Fu; // first 6 bits
in[0] |= ((__x >> 8) & 0x3Fu) << 6; // second 6 bits
in[0] |= ((__x >> 16) & 0x3Fu) << 12;
in[0] |= ((__x >> 24) & 0x3Fu) << 18;
out = __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(in, 1.0f /* scale */);
auto fp32x2_1 = {out[0], out[1]};
auto fp32x2_2 = {out[2], out[3]};
#else
using namespace fcbx;
auto fp32x2_1 =
__amd_floatx2_storage_t{to_float<float, Encoding::E3M2, true>(__x & 0xFFu, 0),
to_float<float, Encoding::E3M2, true>((__x >> 8) & 0xFFu, 0)};
auto fp32x2_2 =
__amd_floatx2_storage_t{to_float<float, Encoding::E3M2, true>((__x >> 16) & 0xFFu, 0),
to_float<float, Encoding::E3M2, true>(__x >> 24, 0)};
#endif
return float4{fp32x2_1[0], fp32x2_1[1], fp32x2_2[0], fp32x2_2[1]};
}
__FP6_HOST_DEVICE__ operator double4() const {
auto fp32 = float4(*this);
return double4{fp32.x, fp32.y, fp32.z, fp32.w};
}
#endif // !defined(__HIP_NO_FP6_CONVERSION_OPERATORS__)
};