SWDEV-546110 - Fix encoding for certain types (#446)
Bu işleme şunda yer alıyor:
işlemeyi yapan:
GitHub
ebeveyn
0b61026495
işleme
06a3a5ca10
@@ -23,12 +23,8 @@ SOFTWARE.
|
||||
#pragma once
|
||||
|
||||
#include "amd_hip_mx_common.h"
|
||||
|
||||
#include "amd_hip_fp16.h"
|
||||
#include "amd_hip_bf16.h"
|
||||
#include "amd_hip_fp8.h"
|
||||
|
||||
#include "amd_hip_ocp_types.h"
|
||||
#include "amd_hip_ocp_host.hpp"
|
||||
|
||||
#if defined(__HIPCC_RTC__)
|
||||
@@ -51,36 +47,6 @@ enum __hip_fp4_interpretation_t {
|
||||
__HIP_E2M1 = 0,
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
__FP4_HOST_DEVICE_STATIC__ __amd_fp16_storage_t half_to_f16(const __half val) {
|
||||
__half_raw tmp = val;
|
||||
return tmp.data;
|
||||
}
|
||||
|
||||
__FP4_HOST_DEVICE_STATIC__ __amd_fp16x2_storage_t half2_to_f16x2(const __half2 val) {
|
||||
__half2_raw tmp = val;
|
||||
return tmp.data;
|
||||
}
|
||||
|
||||
__FP4_HOST_DEVICE_STATIC__ __amd_bf16_storage_t hipbf16_to_bf16(const __hip_bfloat16 val) {
|
||||
static_assert(sizeof(__hip_bfloat16) == sizeof(__amd_bf16_storage_t));
|
||||
union {
|
||||
__hip_bfloat16 hip_bf16;
|
||||
__amd_bf16_storage_t bf16;
|
||||
} u{val};
|
||||
return u.bf16;
|
||||
}
|
||||
|
||||
__FP4_HOST_DEVICE_STATIC__ __amd_bf16x2_storage_t hipbf162_to_bf16x2(const __hip_bfloat162 val) {
|
||||
static_assert(sizeof(__hip_bfloat162) == sizeof(__amd_bf16x2_storage_t));
|
||||
union {
|
||||
__hip_bfloat162 hip_bf16;
|
||||
__amd_bf16x2_storage_t bf16;
|
||||
} u{val};
|
||||
return u.bf16;
|
||||
}
|
||||
} // namespace internal
|
||||
|
||||
// Note: Ignore rounding input on AMD GPUs for now. At the moment AMD GPUs do not support rounding
|
||||
// modes, all the inputs are rounded to nearest or use an input to do stochastic rounding.
|
||||
// We hide the rounding variable to not trigger the unused variable compiler warning.
|
||||
|
||||
@@ -23,14 +23,9 @@ SOFTWARE.
|
||||
#pragma once
|
||||
|
||||
#include "amd_hip_mx_common.h"
|
||||
|
||||
#include "amd_hip_fp16.h"
|
||||
#include "amd_hip_bf16.h"
|
||||
#include "amd_hip_fp8.h"
|
||||
|
||||
#include "amd_hip_ocp_types.h"
|
||||
#include "amd_hip_ocp_host.hpp"
|
||||
#include "hip/amd_detail/amd_hip_mx_common.h"
|
||||
|
||||
#if defined(__HIPCC_RTC__)
|
||||
#define __FP6_HOST_DEVICE__ __device__
|
||||
@@ -53,33 +48,6 @@ enum __hip_fp6_interpretation_t {
|
||||
__HIP_E2M3 = 1, /**< FP6 E2M3 Type */
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
__FP6_HOST_DEVICE_STATIC__ __amd_fp16_storage_t half_to_f16(const __half val) {
|
||||
__half_raw tmp = val;
|
||||
return tmp.data;
|
||||
}
|
||||
__FP6_HOST_DEVICE_STATIC__ __amd_fp16x2_storage_t half2_to_f16x2(const __half2 val) {
|
||||
__half2_raw tmp = val;
|
||||
return tmp.data;
|
||||
}
|
||||
__FP6_HOST_DEVICE_STATIC__ __amd_bf16_storage_t hipbf16_to_bf16(const __hip_bfloat16 val) {
|
||||
static_assert(sizeof(__hip_bfloat16) == sizeof(__amd_bf16_storage_t));
|
||||
union {
|
||||
__hip_bfloat16 hip_bf16;
|
||||
__amd_bf16_storage_t bf16;
|
||||
} u{val};
|
||||
return u.bf16;
|
||||
}
|
||||
__FP6_HOST_DEVICE_STATIC__ __amd_bf16x2_storage_t hipbf162_to_bf16x2(const __hip_bfloat162 val) {
|
||||
static_assert(sizeof(__hip_bfloat162) == sizeof(__amd_bf16x2_storage_t));
|
||||
union {
|
||||
__hip_bfloat162 hip_bf16;
|
||||
__amd_bf16x2_storage_t bf16;
|
||||
} u{val};
|
||||
return u.bf16;
|
||||
}
|
||||
} // namespace internal
|
||||
|
||||
|
||||
// Note: Ignore rounding input on AMD GPUs for now. At the moment AMD GPUs do not support rounding
|
||||
// modes, all the inputs are rounded to nearest or use an input to do stochastic rounding.
|
||||
@@ -543,7 +511,7 @@ struct __hip_fp6x2_e2m3 {
|
||||
__amd_fp6x32_storage_t in;
|
||||
__amd_bf16x32_storage_t out;
|
||||
in[0] = __x & 0x3Fu; // first 6 bits
|
||||
in[0] |= (__x & FC00u) >> 2; // next 6 bits
|
||||
in[0] |= (__x & 0xFC00u) >> 2; // next 6 bits
|
||||
out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6(in, 1.0f /* scale */);
|
||||
u.bf16x2 = {out[0], out[1]};
|
||||
#else
|
||||
@@ -559,9 +527,9 @@ struct __hip_fp6x2_e2m3 {
|
||||
__amd_fp6x32_storage_t in;
|
||||
__amd_floatx32_storage_t out;
|
||||
in[0] = __x & 0x3Fu; // first 6 bits
|
||||
in[0] |= (__x & FC00u) >> 2; // next 6 bits
|
||||
in[0] |= (__x & 0xFC00u) >> 2; // next 6 bits
|
||||
out = __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(in, 1.0f /* scale */);
|
||||
auto fp32x2 = {out[0], out[1]};
|
||||
__amd_floatx2_storage_t fp32x2 = {out[0], out[1]};
|
||||
#else
|
||||
using namespace fcbx;
|
||||
auto fp32x2 = __amd_floatx2_storage_t{to_float<float, Encoding::E2M3, true>(__x & 0xFFu, 0),
|
||||
@@ -603,7 +571,7 @@ struct __hip_fp6x2_e3m2 {
|
||||
__amd_fp6x32_storage_t in;
|
||||
__amd_bf16x32_storage_t out;
|
||||
in[0] = __x & 0x3Fu; // first 6 bits
|
||||
in[0] |= (__x & FC00u) >> 2; // next 6 bits
|
||||
in[0] |= (__x & 0xFC00u) >> 2; // next 6 bits
|
||||
out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6(in, 1.0f /* scale */);
|
||||
u.bf16x2 = {out[0], out[1]};
|
||||
#else
|
||||
@@ -619,9 +587,9 @@ struct __hip_fp6x2_e3m2 {
|
||||
__amd_fp6x32_storage_t in;
|
||||
__amd_floatx32_storage_t out;
|
||||
in[0] = __x & 0x3Fu; // first 6 bits
|
||||
in[0] |= (__x & FC00u) >> 2; // next 6 bits
|
||||
in[0] |= (__x & 0xFC00u) >> 2; // next 6 bits
|
||||
out = __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(in, 1.0f /* scale */);
|
||||
auto fp32x2 = {out[0], out[1]};
|
||||
__amd_floatx2_storage_t fp32x2 = {out[0], out[1]};
|
||||
#else
|
||||
using namespace fcbx;
|
||||
auto fp32x2 = __amd_floatx2_storage_t{to_float<float, Encoding::E3M2, true>(__x & 0xFFu, 0),
|
||||
@@ -664,8 +632,8 @@ struct __hip_fp6x4_e2m3 {
|
||||
in[0] |= ((__x >> 16) & 0x3Fu) << 12;
|
||||
in[0] |= ((__x >> 24) & 0x3Fu) << 18;
|
||||
out = __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(in, 1.0f /* scale */);
|
||||
auto fp32x2_1 = {out[0], out[1]};
|
||||
auto fp32x2_2 = {out[2], out[3]};
|
||||
__amd_floatx2_storage_t fp32x2_1 = {out[0], out[1]};
|
||||
__amd_floatx2_storage_t fp32x2_2 = {out[2], out[3]};
|
||||
#else
|
||||
using namespace fcbx;
|
||||
auto fp32x2_1 =
|
||||
@@ -712,8 +680,8 @@ struct __hip_fp6x4_e3m2 {
|
||||
in[0] |= ((__x >> 16) & 0x3Fu) << 12;
|
||||
in[0] |= ((__x >> 24) & 0x3Fu) << 18;
|
||||
out = __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(in, 1.0f /* scale */);
|
||||
auto fp32x2_1 = {out[0], out[1]};
|
||||
auto fp32x2_2 = {out[2], out[3]};
|
||||
__amd_floatx2_storage_t fp32x2_1 = {out[0], out[1]};
|
||||
__amd_floatx2_storage_t fp32x2_2 = {out[2], out[3]};
|
||||
#else
|
||||
using namespace fcbx;
|
||||
auto fp32x2_1 =
|
||||
|
||||
@@ -21,6 +21,20 @@ SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#if defined(__gfx950__)
|
||||
#define HIP_ENABLE_GFX950_OCP_BUILTINS 1
|
||||
#else
|
||||
#define HIP_ENABLE_GFX950_OCP_BUILTINS 0
|
||||
#endif
|
||||
#if !defined(__gfx950__)
|
||||
#define HIP_ENABLE_HOST_OCP_CONVERSIONS 1
|
||||
#else
|
||||
#define HIP_ENABLE_HOST_OCP_CONVERSIONS 0
|
||||
#endif
|
||||
|
||||
#include "amd_hip_ocp_types.h"
|
||||
#include "amd_hip_fp16.h"
|
||||
#include "amd_hip_bf16.h"
|
||||
|
||||
enum hipRoundMode {
|
||||
hipRoundNearest = 0,
|
||||
@@ -28,3 +42,33 @@ enum hipRoundMode {
|
||||
hipRoundPosInf = 2,
|
||||
hipRoundMinInf = 3,
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
__host__ __device__ static inline __amd_fp16_storage_t half_to_f16(const __half val) {
|
||||
__half_raw tmp = val;
|
||||
return tmp.data;
|
||||
}
|
||||
|
||||
__host__ __device__ static inline __amd_fp16x2_storage_t half2_to_f16x2(const __half2 val) {
|
||||
__half2_raw tmp = val;
|
||||
return tmp.data;
|
||||
}
|
||||
|
||||
__host__ __device__ static inline __amd_bf16_storage_t hipbf16_to_bf16(const __hip_bfloat16 val) {
|
||||
static_assert(sizeof(__hip_bfloat16) == sizeof(__amd_bf16_storage_t));
|
||||
union {
|
||||
__hip_bfloat16 hip_bf16;
|
||||
__amd_bf16_storage_t bf16;
|
||||
} u{val};
|
||||
return u.bf16;
|
||||
}
|
||||
|
||||
__host__ __device__ static inline __amd_bf16x2_storage_t hipbf162_to_bf16x2(const __hip_bfloat162 val) {
|
||||
static_assert(sizeof(__hip_bfloat162) == sizeof(__amd_bf16x2_storage_t));
|
||||
union {
|
||||
__hip_bfloat162 hip_bf16;
|
||||
__amd_bf16x2_storage_t bf16;
|
||||
} u{val};
|
||||
return u.bf16;
|
||||
}
|
||||
} // namespace internal
|
||||
|
||||
@@ -339,8 +339,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ float __amd_cvt_fp8_to_float_scale(
|
||||
#else
|
||||
using namespace fcbx;
|
||||
return interpret == __AMD_OCP_E4M3
|
||||
? to_float<float, Encoding::E4M3, true>(static_cast<uint32_t>(val), scale)
|
||||
: to_float<float, Encoding::E5M2, true>(static_cast<uint32_t>(val), scale);
|
||||
? to_float<float, Encoding::E4M3Mx, true>(static_cast<uint32_t>(val), scale)
|
||||
: to_float<float, Encoding::E5M2Mx, true>(static_cast<uint32_t>(val), scale);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -378,8 +378,8 @@ __amd_cvt_float_to_fp8_sr_scale(const float val, const __amd_fp8_interpretation_
|
||||
} u{0};
|
||||
using namespace fcbx;
|
||||
u.ui32t = interpret == __AMD_OCP_E4M3
|
||||
? from_float_sr<float, Encoding::E4M3, true>(val, seed, scale)
|
||||
: from_float_sr<float, Encoding::E5M2, true>(val, seed, scale);
|
||||
? from_float_sr<float, Encoding::E4M3Mx, true>(val, seed, scale)
|
||||
: from_float_sr<float, Encoding::E5M2Mx, true>(val, seed, scale);
|
||||
return u.fp8[0];
|
||||
#endif
|
||||
}
|
||||
@@ -528,9 +528,9 @@ __amd_cvt_floatx2_to_fp4x2_scale(const __amd_floatx2_storage_t val,
|
||||
return u.fp4x2[0];
|
||||
#else
|
||||
using namespace fcbx;
|
||||
auto l = from_float<float, Encoding::E2M1, true>(val[1], scale);
|
||||
auto r = from_float<float, Encoding::E2M1, true>(val[0], scale);
|
||||
__amd_fp4x2_storage_t ret(l << 4 | r);
|
||||
auto l = from_float<float, Encoding::E2M1, true>(val[0], scale);
|
||||
auto r = from_float<float, Encoding::E2M1, true>(val[1], scale);
|
||||
__amd_fp4x2_storage_t ret(r << 4 | l);
|
||||
return ret;
|
||||
#endif
|
||||
}
|
||||
@@ -554,11 +554,11 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_floatx2_storage_t __amd_cvt_fp8x2_to_floatx2
|
||||
using namespace fcbx;
|
||||
__amd_floatx2_storage_t ret;
|
||||
if (interpret == __AMD_OCP_E4M3) {
|
||||
ret[0] = to_float<float, Encoding::E4M3, true>(val >> 8, scale);
|
||||
ret[1] = to_float<float, Encoding::E4M3, true>((val << 8) >> 8, scale);
|
||||
ret[0] = to_float<float, Encoding::E4M3Mx, true>(val & 0xFFu, scale);
|
||||
ret[1] = to_float<float, Encoding::E4M3Mx, true>(val >> 8, scale);
|
||||
} else {
|
||||
ret[0] = to_float<float, Encoding::E5M2, true>(val >> 8, scale);
|
||||
ret[1] = to_float<float, Encoding::E5M2, true>((val << 8) >> 8, scale);
|
||||
ret[0] = to_float<float, Encoding::E5M2Mx, true>(val & 0xFFu, scale);
|
||||
ret[1] = to_float<float, Encoding::E5M2Mx, true>(val >> 8, scale);
|
||||
}
|
||||
return ret;
|
||||
#endif
|
||||
@@ -591,13 +591,13 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x2_storage_t __amd_cvt_floatx2_to_fp8x2_s
|
||||
using namespace fcbx;
|
||||
uint8_t l, r;
|
||||
if (interpret == __AMD_OCP_E4M3) {
|
||||
l = from_float<float, Encoding::E4M3, true>(val[0], scale);
|
||||
r = from_float<float, Encoding::E4M3, true>(val[1], scale);
|
||||
l = from_float<float, Encoding::E4M3Mx, true>(val[0], scale);
|
||||
r = from_float<float, Encoding::E4M3Mx, true>(val[1], scale);
|
||||
} else {
|
||||
l = from_float<float, Encoding::E5M2, true>(val[0], scale);
|
||||
r = from_float<float, Encoding::E5M2, true>(val[1], scale);
|
||||
l = from_float<float, Encoding::E5M2Mx, true>(val[0], scale);
|
||||
r = from_float<float, Encoding::E5M2Mx, true>(val[1], scale);
|
||||
}
|
||||
__amd_fp8x2_storage_t ret(l << 8 | r);
|
||||
__amd_fp8x2_storage_t ret(r << 8 | l);
|
||||
return ret;
|
||||
#endif
|
||||
}
|
||||
@@ -685,11 +685,11 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp16x2_storage_t __amd_cvt_fp8x2_to_fp16x2_s
|
||||
using namespace fcbx;
|
||||
__amd_fp16x2_storage_t ret;
|
||||
if (interpret == __AMD_OCP_E4M3) {
|
||||
ret[0] = to_float<__amd_fp16_storage_t, Encoding::E4M3, true>(val & 0xFF, scale);
|
||||
ret[1] = to_float<__amd_fp16_storage_t, Encoding::E4M3, true>(val >> 8, scale);
|
||||
ret[0] = to_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val & 0xFF, scale);
|
||||
ret[1] = to_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val >> 8, scale);
|
||||
} else {
|
||||
ret[0] = to_float<__amd_fp16_storage_t, Encoding::E5M2, true>(val & 0xFF, scale);
|
||||
ret[1] = to_float<__amd_fp16_storage_t, Encoding::E5M2, true>(val >> 8, scale);
|
||||
ret[0] = to_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val & 0xFF, scale);
|
||||
ret[1] = to_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val >> 8, scale);
|
||||
}
|
||||
return ret;
|
||||
#endif
|
||||
@@ -746,23 +746,23 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp16x8_storage_t __amd_cvt_fp8x8_to_fp16x8_s
|
||||
using namespace fcbx;
|
||||
__amd_fp16x8_storage_t ret;
|
||||
if (interpret == __AMD_OCP_E4M3) {
|
||||
ret[0] = to_float<__amd_fp16_storage_t, Encoding::E4M3, true>(val[0], scale);
|
||||
ret[1] = to_float<__amd_fp16_storage_t, Encoding::E4M3, true>(val[1], scale);
|
||||
ret[2] = to_float<__amd_fp16_storage_t, Encoding::E4M3, true>(val[2], scale);
|
||||
ret[3] = to_float<__amd_fp16_storage_t, Encoding::E4M3, true>(val[3], scale);
|
||||
ret[4] = to_float<__amd_fp16_storage_t, Encoding::E4M3, true>(val[4], scale);
|
||||
ret[5] = to_float<__amd_fp16_storage_t, Encoding::E4M3, true>(val[5], scale);
|
||||
ret[6] = to_float<__amd_fp16_storage_t, Encoding::E4M3, true>(val[6], scale);
|
||||
ret[7] = to_float<__amd_fp16_storage_t, Encoding::E4M3, true>(val[7], scale);
|
||||
ret[0] = to_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[0], scale);
|
||||
ret[1] = to_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[1], scale);
|
||||
ret[2] = to_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[2], scale);
|
||||
ret[3] = to_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[3], scale);
|
||||
ret[4] = to_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[4], scale);
|
||||
ret[5] = to_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[5], scale);
|
||||
ret[6] = to_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[6], scale);
|
||||
ret[7] = to_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[7], scale);
|
||||
} else {
|
||||
ret[0] = to_float<__amd_fp16_storage_t, Encoding::E5M2, true>(val[0], scale);
|
||||
ret[1] = to_float<__amd_fp16_storage_t, Encoding::E5M2, true>(val[1], scale);
|
||||
ret[2] = to_float<__amd_fp16_storage_t, Encoding::E5M2, true>(val[2], scale);
|
||||
ret[3] = to_float<__amd_fp16_storage_t, Encoding::E5M2, true>(val[3], scale);
|
||||
ret[4] = to_float<__amd_fp16_storage_t, Encoding::E5M2, true>(val[4], scale);
|
||||
ret[5] = to_float<__amd_fp16_storage_t, Encoding::E5M2, true>(val[5], scale);
|
||||
ret[6] = to_float<__amd_fp16_storage_t, Encoding::E5M2, true>(val[6], scale);
|
||||
ret[7] = to_float<__amd_fp16_storage_t, Encoding::E5M2, true>(val[7], scale);
|
||||
ret[0] = to_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[0], scale);
|
||||
ret[1] = to_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[1], scale);
|
||||
ret[2] = to_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[2], scale);
|
||||
ret[3] = to_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[3], scale);
|
||||
ret[4] = to_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[4], scale);
|
||||
ret[5] = to_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[5], scale);
|
||||
ret[6] = to_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[6], scale);
|
||||
ret[7] = to_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[7], scale);
|
||||
}
|
||||
return ret;
|
||||
#endif
|
||||
@@ -794,11 +794,11 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_bf16x2_storage_t __amd_cvt_fp8x2_to_bf16x2_s
|
||||
using namespace fcbx;
|
||||
__amd_bf16x2_storage_t ret;
|
||||
if (interpret == __AMD_OCP_E4M3) {
|
||||
ret[0] = to_float<__amd_bf16_storage_t, Encoding::E4M3, true>(in & 0xFF, scale);
|
||||
ret[1] = to_float<__amd_bf16_storage_t, Encoding::E4M3, true>(in >> 8, scale);
|
||||
ret[0] = to_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(in & 0xFF, scale);
|
||||
ret[1] = to_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(in >> 8, scale);
|
||||
} else {
|
||||
ret[0] = to_float<__amd_bf16_storage_t, Encoding::E5M2, true>(in & 0xFF, scale);
|
||||
ret[1] = to_float<__amd_bf16_storage_t, Encoding::E5M2, true>(in >> 8, scale);
|
||||
ret[0] = to_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(in & 0xFF, scale);
|
||||
ret[1] = to_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(in >> 8, scale);
|
||||
}
|
||||
return ret;
|
||||
#endif
|
||||
@@ -856,23 +856,23 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_bf16x8_storage_t __amd_cvt_fp8x8_to_bf16x8_s
|
||||
using namespace fcbx;
|
||||
__amd_bf16x8_storage_t ret;
|
||||
if (interpret == __AMD_OCP_E4M3) {
|
||||
ret[0] = to_float<__amd_bf16_storage_t, Encoding::E4M3, true>(val[0], scale);
|
||||
ret[1] = to_float<__amd_bf16_storage_t, Encoding::E4M3, true>(val[1], scale);
|
||||
ret[2] = to_float<__amd_bf16_storage_t, Encoding::E4M3, true>(val[2], scale);
|
||||
ret[3] = to_float<__amd_bf16_storage_t, Encoding::E4M3, true>(val[3], scale);
|
||||
ret[4] = to_float<__amd_bf16_storage_t, Encoding::E4M3, true>(val[4], scale);
|
||||
ret[5] = to_float<__amd_bf16_storage_t, Encoding::E4M3, true>(val[5], scale);
|
||||
ret[6] = to_float<__amd_bf16_storage_t, Encoding::E4M3, true>(val[6], scale);
|
||||
ret[7] = to_float<__amd_bf16_storage_t, Encoding::E4M3, true>(val[7], scale);
|
||||
ret[0] = to_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[0], scale);
|
||||
ret[1] = to_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[1], scale);
|
||||
ret[2] = to_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[2], scale);
|
||||
ret[3] = to_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[3], scale);
|
||||
ret[4] = to_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[4], scale);
|
||||
ret[5] = to_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[5], scale);
|
||||
ret[6] = to_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[6], scale);
|
||||
ret[7] = to_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[7], scale);
|
||||
} else {
|
||||
ret[0] = to_float<__amd_bf16_storage_t, Encoding::E5M2, true>(val[0], scale);
|
||||
ret[1] = to_float<__amd_bf16_storage_t, Encoding::E5M2, true>(val[1], scale);
|
||||
ret[2] = to_float<__amd_bf16_storage_t, Encoding::E5M2, true>(val[2], scale);
|
||||
ret[3] = to_float<__amd_bf16_storage_t, Encoding::E5M2, true>(val[3], scale);
|
||||
ret[4] = to_float<__amd_bf16_storage_t, Encoding::E5M2, true>(val[4], scale);
|
||||
ret[5] = to_float<__amd_bf16_storage_t, Encoding::E5M2, true>(val[5], scale);
|
||||
ret[6] = to_float<__amd_bf16_storage_t, Encoding::E5M2, true>(val[6], scale);
|
||||
ret[7] = to_float<__amd_bf16_storage_t, Encoding::E5M2, true>(val[7], scale);
|
||||
ret[0] = to_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[0], scale);
|
||||
ret[1] = to_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[1], scale);
|
||||
ret[2] = to_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[2], scale);
|
||||
ret[3] = to_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[3], scale);
|
||||
ret[4] = to_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[4], scale);
|
||||
ret[5] = to_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[5], scale);
|
||||
ret[6] = to_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[6], scale);
|
||||
ret[7] = to_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[7], scale);
|
||||
}
|
||||
return ret;
|
||||
#endif
|
||||
@@ -1214,13 +1214,13 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x2_storage_t __amd_cvt_fp16x2_to_fp8x2_sc
|
||||
__amd_fp8x2_storage_t fp8x2[2];
|
||||
} u{0};
|
||||
if (interpret == __AMD_OCP_E4M3) {
|
||||
u.ui32 = from_float<__amd_fp16_storage_t, Encoding::E4M3, true>(in[1], scale);
|
||||
u.ui32 = from_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(in[1], scale);
|
||||
u.ui32 <<= 8;
|
||||
u.ui32 |= from_float<__amd_fp16_storage_t, Encoding::E4M3, true>(in[0], scale);
|
||||
u.ui32 |= from_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(in[0], scale);
|
||||
} else {
|
||||
u.ui32 = from_float<__amd_fp16_storage_t, Encoding::E5M2, true>(in[1], scale);
|
||||
u.ui32 = from_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(in[1], scale);
|
||||
u.ui32 <<= 8;
|
||||
u.ui32 |= from_float<__amd_fp16_storage_t, Encoding::E5M2, true>(in[0], scale);
|
||||
u.ui32 |= from_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(in[0], scale);
|
||||
}
|
||||
return u.fp8x2[0];
|
||||
#endif
|
||||
@@ -1255,13 +1255,13 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x2_storage_t __amd_cvt_bf16x2_to_fp8x2_sc
|
||||
__amd_fp8x2_storage_t fp8x2[2];
|
||||
} u{0};
|
||||
if (interpret == __AMD_OCP_E4M3) {
|
||||
u.ui32 = from_float<__amd_bf16_storage_t, Encoding::E4M3, true>(in[1], scale);
|
||||
u.ui32 = from_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(in[1], scale);
|
||||
u.ui32 <<= 8;
|
||||
u.ui32 |= from_float<__amd_bf16_storage_t, Encoding::E4M3, true>(in[0], scale);
|
||||
u.ui32 |= from_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(in[0], scale);
|
||||
} else {
|
||||
u.ui32 = from_float<__amd_bf16_storage_t, Encoding::E5M2, true>(in[1], scale);
|
||||
u.ui32 = from_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(in[1], scale);
|
||||
u.ui32 <<= 8;
|
||||
u.ui32 |= from_float<__amd_bf16_storage_t, Encoding::E5M2, true>(in[0], scale);
|
||||
u.ui32 |= from_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(in[0], scale);
|
||||
}
|
||||
return u.fp8x2[0];
|
||||
#endif
|
||||
@@ -1323,23 +1323,23 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x8_storage_t __amd_cvt_bf16x8_to_fp8x8_sc
|
||||
using namespace fcbx;
|
||||
__amd_fp8x8_storage_t ret;
|
||||
if (interpret == __AMD_OCP_E4M3) {
|
||||
ret[0] = from_float<__amd_bf16_storage_t, Encoding::E4M3, true>(val[0], scale);
|
||||
ret[1] = from_float<__amd_bf16_storage_t, Encoding::E4M3, true>(val[1], scale);
|
||||
ret[2] = from_float<__amd_bf16_storage_t, Encoding::E4M3, true>(val[2], scale);
|
||||
ret[3] = from_float<__amd_bf16_storage_t, Encoding::E4M3, true>(val[3], scale);
|
||||
ret[4] = from_float<__amd_bf16_storage_t, Encoding::E4M3, true>(val[4], scale);
|
||||
ret[5] = from_float<__amd_bf16_storage_t, Encoding::E4M3, true>(val[5], scale);
|
||||
ret[6] = from_float<__amd_bf16_storage_t, Encoding::E4M3, true>(val[6], scale);
|
||||
ret[7] = from_float<__amd_bf16_storage_t, Encoding::E4M3, true>(val[7], scale);
|
||||
ret[0] = from_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[0], scale);
|
||||
ret[1] = from_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[1], scale);
|
||||
ret[2] = from_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[2], scale);
|
||||
ret[3] = from_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[3], scale);
|
||||
ret[4] = from_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[4], scale);
|
||||
ret[5] = from_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[5], scale);
|
||||
ret[6] = from_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[6], scale);
|
||||
ret[7] = from_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[7], scale);
|
||||
} else {
|
||||
ret[0] = from_float<__amd_bf16_storage_t, Encoding::E5M2, true>(val[0], scale);
|
||||
ret[1] = from_float<__amd_bf16_storage_t, Encoding::E5M2, true>(val[1], scale);
|
||||
ret[2] = from_float<__amd_bf16_storage_t, Encoding::E5M2, true>(val[2], scale);
|
||||
ret[3] = from_float<__amd_bf16_storage_t, Encoding::E5M2, true>(val[3], scale);
|
||||
ret[4] = from_float<__amd_bf16_storage_t, Encoding::E5M2, true>(val[4], scale);
|
||||
ret[5] = from_float<__amd_bf16_storage_t, Encoding::E5M2, true>(val[5], scale);
|
||||
ret[6] = from_float<__amd_bf16_storage_t, Encoding::E5M2, true>(val[6], scale);
|
||||
ret[7] = from_float<__amd_bf16_storage_t, Encoding::E5M2, true>(val[7], scale);
|
||||
ret[0] = from_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[0], scale);
|
||||
ret[1] = from_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[1], scale);
|
||||
ret[2] = from_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[2], scale);
|
||||
ret[3] = from_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[3], scale);
|
||||
ret[4] = from_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[4], scale);
|
||||
ret[5] = from_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[5], scale);
|
||||
ret[6] = from_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[6], scale);
|
||||
ret[7] = from_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[7], scale);
|
||||
}
|
||||
return ret;
|
||||
#endif
|
||||
@@ -1396,23 +1396,23 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_floatx8_storage_t __amd_cvt_fp8x8_to_floatx8
|
||||
using namespace fcbx;
|
||||
__amd_floatx8_storage_t ret;
|
||||
if (interpret == __AMD_OCP_E4M3) {
|
||||
ret[0] = to_float<float, Encoding::E4M3, true>(val[0], scale);
|
||||
ret[1] = to_float<float, Encoding::E4M3, true>(val[1], scale);
|
||||
ret[2] = to_float<float, Encoding::E4M3, true>(val[2], scale);
|
||||
ret[3] = to_float<float, Encoding::E4M3, true>(val[3], scale);
|
||||
ret[4] = to_float<float, Encoding::E4M3, true>(val[4], scale);
|
||||
ret[5] = to_float<float, Encoding::E4M3, true>(val[5], scale);
|
||||
ret[6] = to_float<float, Encoding::E4M3, true>(val[6], scale);
|
||||
ret[7] = to_float<float, Encoding::E4M3, true>(val[7], scale);
|
||||
ret[0] = to_float<float, Encoding::E4M3Mx, true>(val[0], scale);
|
||||
ret[1] = to_float<float, Encoding::E4M3Mx, true>(val[1], scale);
|
||||
ret[2] = to_float<float, Encoding::E4M3Mx, true>(val[2], scale);
|
||||
ret[3] = to_float<float, Encoding::E4M3Mx, true>(val[3], scale);
|
||||
ret[4] = to_float<float, Encoding::E4M3Mx, true>(val[4], scale);
|
||||
ret[5] = to_float<float, Encoding::E4M3Mx, true>(val[5], scale);
|
||||
ret[6] = to_float<float, Encoding::E4M3Mx, true>(val[6], scale);
|
||||
ret[7] = to_float<float, Encoding::E4M3Mx, true>(val[7], scale);
|
||||
} else {
|
||||
ret[0] = to_float<float, Encoding::E5M2, true>(val[0], scale);
|
||||
ret[1] = to_float<float, Encoding::E5M2, true>(val[1], scale);
|
||||
ret[2] = to_float<float, Encoding::E5M2, true>(val[2], scale);
|
||||
ret[3] = to_float<float, Encoding::E5M2, true>(val[3], scale);
|
||||
ret[4] = to_float<float, Encoding::E5M2, true>(val[4], scale);
|
||||
ret[5] = to_float<float, Encoding::E5M2, true>(val[5], scale);
|
||||
ret[6] = to_float<float, Encoding::E5M2, true>(val[6], scale);
|
||||
ret[7] = to_float<float, Encoding::E5M2, true>(val[7], scale);
|
||||
ret[0] = to_float<float, Encoding::E5M2Mx, true>(val[0], scale);
|
||||
ret[1] = to_float<float, Encoding::E5M2Mx, true>(val[1], scale);
|
||||
ret[2] = to_float<float, Encoding::E5M2Mx, true>(val[2], scale);
|
||||
ret[3] = to_float<float, Encoding::E5M2Mx, true>(val[3], scale);
|
||||
ret[4] = to_float<float, Encoding::E5M2Mx, true>(val[4], scale);
|
||||
ret[5] = to_float<float, Encoding::E5M2Mx, true>(val[5], scale);
|
||||
ret[6] = to_float<float, Encoding::E5M2Mx, true>(val[6], scale);
|
||||
ret[7] = to_float<float, Encoding::E5M2Mx, true>(val[7], scale);
|
||||
}
|
||||
return ret;
|
||||
#endif
|
||||
@@ -1439,9 +1439,9 @@ __amd_cvt_fp8_to_fp16_scale(const __amd_fp8_storage_t val,
|
||||
#else
|
||||
using namespace fcbx;
|
||||
if (interpret == __AMD_OCP_E4M3) {
|
||||
return to_float<__amd_fp16_storage_t, Encoding::E4M3, true>(val, scale);
|
||||
return to_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val, scale);
|
||||
} else {
|
||||
return to_float<__amd_fp16_storage_t, Encoding::E5M2, true>(val, scale);
|
||||
return to_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val, scale);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -1474,9 +1474,9 @@ __amd_cvt_fp8_to_bf16_scale(const __amd_fp8_storage_t val,
|
||||
#else
|
||||
using namespace fcbx;
|
||||
if (interpret == __AMD_OCP_E4M3) {
|
||||
return to_float<__amd_bf16_storage_t, Encoding::E4M3, true>(val, scale);
|
||||
return to_float<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val, scale);
|
||||
} else {
|
||||
return to_float<__amd_bf16_storage_t, Encoding::E5M2, true>(val, scale);
|
||||
return to_float<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val, scale);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -1500,10 +1500,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp6x32_storage_t __amd_cvt_floatx16_floatx16
|
||||
#else
|
||||
__amd_floatx32_storage_t tmp;
|
||||
for (size_t i = 0; i < 16; i++) {
|
||||
tmp[i] = in1[i];
|
||||
}
|
||||
for (size_t i = 0; i < 16; i++) {
|
||||
tmp[i + 16] = in2[i];
|
||||
tmp[i * 2] = in1[i];
|
||||
tmp[i * 2 + 1] = in2[i];
|
||||
}
|
||||
using namespace fcbx;
|
||||
return interpret == __AMD_OCP_E2M3
|
||||
@@ -1527,11 +1525,12 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp6x32_storage_t __amd_cvt_floatx32_to_fp6x3
|
||||
const __amd_floatx32_storage_t val, const __amd_fp6_interpretation_t interpret,
|
||||
const __amd_scale_t scale) {
|
||||
#if HIP_ENABLE_GFX950_OCP_BUILTINS
|
||||
__amd_floatx16_storage_t in1{val[0], val[1], val[2], val[3], val[4], val[5],
|
||||
val[6], val[7], val[8], val[9], val[10], val[11],
|
||||
val[12], val[13], val[14], val[15]},
|
||||
in2 = {val[16], val[17], val[18], val[19], val[20], val[21], val[22], val[23],
|
||||
val[24], val[25], val[26], val[27], val[28], val[29], val[30], val[31]};
|
||||
// The API exepcts interleaved inputs
|
||||
__amd_floatx16_storage_t in1{val[0], val[2], val[4], val[6], val[8], val[10],
|
||||
val[12], val[14], val[16], val[18], val[20], val[22],
|
||||
val[24], val[26], val[28], val[30]},
|
||||
in2 = {val[1], val[3], val[5], val[7], val[9], val[11], val[13], val[15],
|
||||
val[17], val[19], val[21], val[23], val[25], val[27], val[29], val[31]};
|
||||
return interpret == __AMD_OCP_E2M3
|
||||
? __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, __amd_scale_to_float(scale))
|
||||
: __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, __amd_scale_to_float(scale));
|
||||
@@ -2217,23 +2216,23 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x8_storage_t __amd_cvt_floatx8_to_fp8x8_s
|
||||
using namespace fcbx;
|
||||
__amd_fp8x8_storage_t ret;
|
||||
if (interpret == __AMD_OCP_E4M3) {
|
||||
ret[0] = from_float_sr<float, Encoding::E4M3, true>(val[0], seed, scale);
|
||||
ret[1] = from_float_sr<float, Encoding::E4M3, true>(val[1], seed, scale);
|
||||
ret[2] = from_float_sr<float, Encoding::E4M3, true>(val[2], seed, scale);
|
||||
ret[3] = from_float_sr<float, Encoding::E4M3, true>(val[3], seed, scale);
|
||||
ret[4] = from_float_sr<float, Encoding::E4M3, true>(val[4], seed, scale);
|
||||
ret[5] = from_float_sr<float, Encoding::E4M3, true>(val[5], seed, scale);
|
||||
ret[6] = from_float_sr<float, Encoding::E4M3, true>(val[6], seed, scale);
|
||||
ret[7] = from_float_sr<float, Encoding::E4M3, true>(val[7], seed, scale);
|
||||
ret[0] = from_float_sr<float, Encoding::E4M3Mx, true>(val[0], seed, scale);
|
||||
ret[1] = from_float_sr<float, Encoding::E4M3Mx, true>(val[1], seed, scale);
|
||||
ret[2] = from_float_sr<float, Encoding::E4M3Mx, true>(val[2], seed, scale);
|
||||
ret[3] = from_float_sr<float, Encoding::E4M3Mx, true>(val[3], seed, scale);
|
||||
ret[4] = from_float_sr<float, Encoding::E4M3Mx, true>(val[4], seed, scale);
|
||||
ret[5] = from_float_sr<float, Encoding::E4M3Mx, true>(val[5], seed, scale);
|
||||
ret[6] = from_float_sr<float, Encoding::E4M3Mx, true>(val[6], seed, scale);
|
||||
ret[7] = from_float_sr<float, Encoding::E4M3Mx, true>(val[7], seed, scale);
|
||||
} else {
|
||||
ret[0] = from_float_sr<float, Encoding::E5M2, true>(val[0], seed, scale);
|
||||
ret[1] = from_float_sr<float, Encoding::E5M2, true>(val[1], seed, scale);
|
||||
ret[2] = from_float_sr<float, Encoding::E5M2, true>(val[2], seed, scale);
|
||||
ret[3] = from_float_sr<float, Encoding::E5M2, true>(val[3], seed, scale);
|
||||
ret[4] = from_float_sr<float, Encoding::E5M2, true>(val[4], seed, scale);
|
||||
ret[5] = from_float_sr<float, Encoding::E5M2, true>(val[5], seed, scale);
|
||||
ret[6] = from_float_sr<float, Encoding::E5M2, true>(val[6], seed, scale);
|
||||
ret[7] = from_float_sr<float, Encoding::E5M2, true>(val[7], seed, scale);
|
||||
ret[0] = from_float_sr<float, Encoding::E5M2Mx, true>(val[0], seed, scale);
|
||||
ret[1] = from_float_sr<float, Encoding::E5M2Mx, true>(val[1], seed, scale);
|
||||
ret[2] = from_float_sr<float, Encoding::E5M2Mx, true>(val[2], seed, scale);
|
||||
ret[3] = from_float_sr<float, Encoding::E5M2Mx, true>(val[3], seed, scale);
|
||||
ret[4] = from_float_sr<float, Encoding::E5M2Mx, true>(val[4], seed, scale);
|
||||
ret[5] = from_float_sr<float, Encoding::E5M2Mx, true>(val[5], seed, scale);
|
||||
ret[6] = from_float_sr<float, Encoding::E5M2Mx, true>(val[6], seed, scale);
|
||||
ret[7] = from_float_sr<float, Encoding::E5M2Mx, true>(val[7], seed, scale);
|
||||
}
|
||||
return ret;
|
||||
#endif
|
||||
@@ -2268,9 +2267,9 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8_storage_t __amd_cvt_fp16_to_fp8_sr_scale
|
||||
#else
|
||||
using namespace fcbx;
|
||||
if (interpret == __AMD_OCP_E4M3) {
|
||||
return from_float_sr<__amd_fp16_storage_t, Encoding::E4M3, true>(val, seed, scale);
|
||||
return from_float_sr<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val, seed, scale);
|
||||
} else {
|
||||
return from_float_sr<__amd_fp16_storage_t, Encoding::E5M2, true>(val, seed, scale);
|
||||
return from_float_sr<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val, seed, scale);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -2350,23 +2349,23 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x8_storage_t __amd_cvt_fp16x8_to_fp8x8_sr
|
||||
using namespace fcbx;
|
||||
__amd_fp8x8_storage_t ret;
|
||||
if (interpret == __AMD_OCP_E4M3) {
|
||||
ret[0] = from_float_sr<__amd_fp16_storage_t, Encoding::E4M3, true>(val[0], seed, scale);
|
||||
ret[1] = from_float_sr<__amd_fp16_storage_t, Encoding::E4M3, true>(val[1], seed, scale);
|
||||
ret[2] = from_float_sr<__amd_fp16_storage_t, Encoding::E4M3, true>(val[2], seed, scale);
|
||||
ret[3] = from_float_sr<__amd_fp16_storage_t, Encoding::E4M3, true>(val[3], seed, scale);
|
||||
ret[4] = from_float_sr<__amd_fp16_storage_t, Encoding::E4M3, true>(val[4], seed, scale);
|
||||
ret[5] = from_float_sr<__amd_fp16_storage_t, Encoding::E4M3, true>(val[5], seed, scale);
|
||||
ret[6] = from_float_sr<__amd_fp16_storage_t, Encoding::E4M3, true>(val[6], seed, scale);
|
||||
ret[7] = from_float_sr<__amd_fp16_storage_t, Encoding::E4M3, true>(val[7], seed, scale);
|
||||
ret[0] = from_float_sr<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[0], seed, scale);
|
||||
ret[1] = from_float_sr<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[1], seed, scale);
|
||||
ret[2] = from_float_sr<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[2], seed, scale);
|
||||
ret[3] = from_float_sr<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[3], seed, scale);
|
||||
ret[4] = from_float_sr<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[4], seed, scale);
|
||||
ret[5] = from_float_sr<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[5], seed, scale);
|
||||
ret[6] = from_float_sr<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[6], seed, scale);
|
||||
ret[7] = from_float_sr<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[7], seed, scale);
|
||||
} else {
|
||||
ret[0] = from_float_sr<__amd_fp16_storage_t, Encoding::E5M2, true>(val[0], seed, scale);
|
||||
ret[1] = from_float_sr<__amd_fp16_storage_t, Encoding::E5M2, true>(val[1], seed, scale);
|
||||
ret[2] = from_float_sr<__amd_fp16_storage_t, Encoding::E5M2, true>(val[2], seed, scale);
|
||||
ret[3] = from_float_sr<__amd_fp16_storage_t, Encoding::E5M2, true>(val[3], seed, scale);
|
||||
ret[4] = from_float_sr<__amd_fp16_storage_t, Encoding::E5M2, true>(val[4], seed, scale);
|
||||
ret[5] = from_float_sr<__amd_fp16_storage_t, Encoding::E5M2, true>(val[5], seed, scale);
|
||||
ret[6] = from_float_sr<__amd_fp16_storage_t, Encoding::E5M2, true>(val[6], seed, scale);
|
||||
ret[7] = from_float_sr<__amd_fp16_storage_t, Encoding::E5M2, true>(val[7], seed, scale);
|
||||
ret[0] = from_float_sr<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[0], seed, scale);
|
||||
ret[1] = from_float_sr<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[1], seed, scale);
|
||||
ret[2] = from_float_sr<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[2], seed, scale);
|
||||
ret[3] = from_float_sr<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[3], seed, scale);
|
||||
ret[4] = from_float_sr<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[4], seed, scale);
|
||||
ret[5] = from_float_sr<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[5], seed, scale);
|
||||
ret[6] = from_float_sr<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[6], seed, scale);
|
||||
ret[7] = from_float_sr<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[7], seed, scale);
|
||||
}
|
||||
return ret;
|
||||
#endif
|
||||
@@ -2401,9 +2400,9 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8_storage_t __amd_cvt_bf16_to_fp8_sr_scale
|
||||
#else
|
||||
using namespace fcbx;
|
||||
if (interpret == __AMD_OCP_E4M3) {
|
||||
return from_float_sr<__amd_bf16_storage_t, Encoding::E4M3, true>(val, seed, scale);
|
||||
return from_float_sr<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val, seed, scale);
|
||||
} else {
|
||||
return from_float_sr<__amd_bf16_storage_t, Encoding::E5M2, true>(val, seed, scale);
|
||||
return from_float_sr<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val, seed, scale);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -2483,23 +2482,23 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x8_storage_t __amd_cvt_bf16x8_to_fp8x8_sr
|
||||
using namespace fcbx;
|
||||
__amd_fp8x8_storage_t ret;
|
||||
if (interpret == __AMD_OCP_E4M3) {
|
||||
ret[0] = from_float_sr<__amd_bf16_storage_t, Encoding::E4M3, true>(val[0], seed, scale);
|
||||
ret[1] = from_float_sr<__amd_bf16_storage_t, Encoding::E4M3, true>(val[1], seed, scale);
|
||||
ret[2] = from_float_sr<__amd_bf16_storage_t, Encoding::E4M3, true>(val[2], seed, scale);
|
||||
ret[3] = from_float_sr<__amd_bf16_storage_t, Encoding::E4M3, true>(val[3], seed, scale);
|
||||
ret[4] = from_float_sr<__amd_bf16_storage_t, Encoding::E4M3, true>(val[4], seed, scale);
|
||||
ret[5] = from_float_sr<__amd_bf16_storage_t, Encoding::E4M3, true>(val[5], seed, scale);
|
||||
ret[6] = from_float_sr<__amd_bf16_storage_t, Encoding::E4M3, true>(val[6], seed, scale);
|
||||
ret[7] = from_float_sr<__amd_bf16_storage_t, Encoding::E4M3, true>(val[7], seed, scale);
|
||||
ret[0] = from_float_sr<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[0], seed, scale);
|
||||
ret[1] = from_float_sr<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[1], seed, scale);
|
||||
ret[2] = from_float_sr<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[2], seed, scale);
|
||||
ret[3] = from_float_sr<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[3], seed, scale);
|
||||
ret[4] = from_float_sr<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[4], seed, scale);
|
||||
ret[5] = from_float_sr<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[5], seed, scale);
|
||||
ret[6] = from_float_sr<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[6], seed, scale);
|
||||
ret[7] = from_float_sr<__amd_bf16_storage_t, Encoding::E4M3Mx, true>(val[7], seed, scale);
|
||||
} else {
|
||||
ret[0] = from_float_sr<__amd_bf16_storage_t, Encoding::E5M2, true>(val[0], seed, scale);
|
||||
ret[1] = from_float_sr<__amd_bf16_storage_t, Encoding::E5M2, true>(val[1], seed, scale);
|
||||
ret[2] = from_float_sr<__amd_bf16_storage_t, Encoding::E5M2, true>(val[2], seed, scale);
|
||||
ret[3] = from_float_sr<__amd_bf16_storage_t, Encoding::E5M2, true>(val[3], seed, scale);
|
||||
ret[4] = from_float_sr<__amd_bf16_storage_t, Encoding::E5M2, true>(val[4], seed, scale);
|
||||
ret[5] = from_float_sr<__amd_bf16_storage_t, Encoding::E5M2, true>(val[5], seed, scale);
|
||||
ret[6] = from_float_sr<__amd_bf16_storage_t, Encoding::E5M2, true>(val[6], seed, scale);
|
||||
ret[7] = from_float_sr<__amd_bf16_storage_t, Encoding::E5M2, true>(val[7], seed, scale);
|
||||
ret[0] = from_float_sr<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[0], seed, scale);
|
||||
ret[1] = from_float_sr<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[1], seed, scale);
|
||||
ret[2] = from_float_sr<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[2], seed, scale);
|
||||
ret[3] = from_float_sr<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[3], seed, scale);
|
||||
ret[4] = from_float_sr<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[4], seed, scale);
|
||||
ret[5] = from_float_sr<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[5], seed, scale);
|
||||
ret[6] = from_float_sr<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[6], seed, scale);
|
||||
ret[7] = from_float_sr<__amd_bf16_storage_t, Encoding::E5M2Mx, true>(val[7], seed, scale);
|
||||
}
|
||||
return ret;
|
||||
#endif
|
||||
@@ -2668,23 +2667,23 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x8_storage_t __amd_cvt_fp16x8_to_fp8x8_sc
|
||||
__amd_fp8x8_storage_t ret;
|
||||
using namespace fcbx;
|
||||
if (interpret == __AMD_OCP_E4M3) {
|
||||
ret[0] = from_float<__amd_fp16_storage_t, Encoding::E4M3, true>(val[0], scale);
|
||||
ret[1] = from_float<__amd_fp16_storage_t, Encoding::E4M3, true>(val[1], scale);
|
||||
ret[2] = from_float<__amd_fp16_storage_t, Encoding::E4M3, true>(val[2], scale);
|
||||
ret[3] = from_float<__amd_fp16_storage_t, Encoding::E4M3, true>(val[3], scale);
|
||||
ret[4] = from_float<__amd_fp16_storage_t, Encoding::E4M3, true>(val[4], scale);
|
||||
ret[5] = from_float<__amd_fp16_storage_t, Encoding::E4M3, true>(val[5], scale);
|
||||
ret[6] = from_float<__amd_fp16_storage_t, Encoding::E4M3, true>(val[6], scale);
|
||||
ret[7] = from_float<__amd_fp16_storage_t, Encoding::E4M3, true>(val[7], scale);
|
||||
ret[0] = from_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[0], scale);
|
||||
ret[1] = from_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[1], scale);
|
||||
ret[2] = from_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[2], scale);
|
||||
ret[3] = from_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[3], scale);
|
||||
ret[4] = from_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[4], scale);
|
||||
ret[5] = from_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[5], scale);
|
||||
ret[6] = from_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[6], scale);
|
||||
ret[7] = from_float<__amd_fp16_storage_t, Encoding::E4M3Mx, true>(val[7], scale);
|
||||
} else {
|
||||
ret[0] = from_float<__amd_fp16_storage_t, Encoding::E5M2, true>(val[0], scale);
|
||||
ret[1] = from_float<__amd_fp16_storage_t, Encoding::E5M2, true>(val[1], scale);
|
||||
ret[2] = from_float<__amd_fp16_storage_t, Encoding::E5M2, true>(val[2], scale);
|
||||
ret[3] = from_float<__amd_fp16_storage_t, Encoding::E5M2, true>(val[3], scale);
|
||||
ret[4] = from_float<__amd_fp16_storage_t, Encoding::E5M2, true>(val[4], scale);
|
||||
ret[5] = from_float<__amd_fp16_storage_t, Encoding::E5M2, true>(val[5], scale);
|
||||
ret[6] = from_float<__amd_fp16_storage_t, Encoding::E5M2, true>(val[6], scale);
|
||||
ret[7] = from_float<__amd_fp16_storage_t, Encoding::E5M2, true>(val[7], scale);
|
||||
ret[0] = from_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[0], scale);
|
||||
ret[1] = from_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[1], scale);
|
||||
ret[2] = from_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[2], scale);
|
||||
ret[3] = from_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[3], scale);
|
||||
ret[4] = from_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[4], scale);
|
||||
ret[5] = from_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[5], scale);
|
||||
ret[6] = from_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[6], scale);
|
||||
ret[7] = from_float<__amd_fp16_storage_t, Encoding::E5M2Mx, true>(val[7], scale);
|
||||
}
|
||||
return ret;
|
||||
#endif
|
||||
@@ -2744,23 +2743,23 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x8_storage_t __amd_cvt_floatx8_to_fp8x8_s
|
||||
using namespace fcbx;
|
||||
__amd_fp8x8_storage_t ret;
|
||||
if (interpret == __AMD_OCP_E4M3) {
|
||||
ret[0] = from_float<float, Encoding::E4M3, true>(val[0], scale);
|
||||
ret[1] = from_float<float, Encoding::E4M3, true>(val[1], scale);
|
||||
ret[2] = from_float<float, Encoding::E4M3, true>(val[2], scale);
|
||||
ret[3] = from_float<float, Encoding::E4M3, true>(val[3], scale);
|
||||
ret[4] = from_float<float, Encoding::E4M3, true>(val[4], scale);
|
||||
ret[5] = from_float<float, Encoding::E4M3, true>(val[5], scale);
|
||||
ret[6] = from_float<float, Encoding::E4M3, true>(val[6], scale);
|
||||
ret[7] = from_float<float, Encoding::E4M3, true>(val[7], scale);
|
||||
ret[0] = from_float<float, Encoding::E4M3Mx, true>(val[0], scale);
|
||||
ret[1] = from_float<float, Encoding::E4M3Mx, true>(val[1], scale);
|
||||
ret[2] = from_float<float, Encoding::E4M3Mx, true>(val[2], scale);
|
||||
ret[3] = from_float<float, Encoding::E4M3Mx, true>(val[3], scale);
|
||||
ret[4] = from_float<float, Encoding::E4M3Mx, true>(val[4], scale);
|
||||
ret[5] = from_float<float, Encoding::E4M3Mx, true>(val[5], scale);
|
||||
ret[6] = from_float<float, Encoding::E4M3Mx, true>(val[6], scale);
|
||||
ret[7] = from_float<float, Encoding::E4M3Mx, true>(val[7], scale);
|
||||
} else {
|
||||
ret[0] = from_float<float, Encoding::E5M2, true>(val[0], scale);
|
||||
ret[1] = from_float<float, Encoding::E5M2, true>(val[1], scale);
|
||||
ret[2] = from_float<float, Encoding::E5M2, true>(val[2], scale);
|
||||
ret[3] = from_float<float, Encoding::E5M2, true>(val[3], scale);
|
||||
ret[4] = from_float<float, Encoding::E5M2, true>(val[4], scale);
|
||||
ret[5] = from_float<float, Encoding::E5M2, true>(val[5], scale);
|
||||
ret[6] = from_float<float, Encoding::E5M2, true>(val[6], scale);
|
||||
ret[7] = from_float<float, Encoding::E5M2, true>(val[7], scale);
|
||||
ret[0] = from_float<float, Encoding::E5M2Mx, true>(val[0], scale);
|
||||
ret[1] = from_float<float, Encoding::E5M2Mx, true>(val[1], scale);
|
||||
ret[2] = from_float<float, Encoding::E5M2Mx, true>(val[2], scale);
|
||||
ret[3] = from_float<float, Encoding::E5M2Mx, true>(val[3], scale);
|
||||
ret[4] = from_float<float, Encoding::E5M2Mx, true>(val[4], scale);
|
||||
ret[5] = from_float<float, Encoding::E5M2Mx, true>(val[5], scale);
|
||||
ret[6] = from_float<float, Encoding::E5M2Mx, true>(val[6], scale);
|
||||
ret[7] = from_float<float, Encoding::E5M2Mx, true>(val[7], scale);
|
||||
}
|
||||
return ret;
|
||||
#endif
|
||||
|
||||
Yeni konuda referans
Bir kullanıcı engelle