SWDEV-501963 - Add missing codes for gfx950
Cherry-pick https://gerrit-git.amd.com/c/compute/ec/clr/+/1162997
Change-Id: I6b3c6bf55c61cffd43cd6f17b75998f751b75723
[ROCm/clr commit: 32daa8f384]
Этот коммит содержится в:
@@ -835,7 +835,7 @@ int __syncthreads_or(int predicate)
|
||||
STATE_ID 29:27 State ID (graphics only, not compute).
|
||||
ME_ID 31:30 Micro-engine ID.
|
||||
|
||||
XCC_ID Register bit structure for gfx940
|
||||
XCC_ID Register bit structure for gfx940/941/942/950
|
||||
XCC_ID 3:0 XCC the wave is assigned to.
|
||||
*/
|
||||
|
||||
@@ -871,14 +871,14 @@ int __syncthreads_or(int predicate)
|
||||
#define HW_ID_SE_ID_OFFSET 13
|
||||
#endif
|
||||
|
||||
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
|
||||
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__))
|
||||
#define __gfx94plus_clr__
|
||||
#define XCC_ID 20
|
||||
#define XCC_ID_XCC_ID_SIZE 4
|
||||
#define XCC_ID_XCC_ID_OFFSET 0
|
||||
#endif
|
||||
|
||||
#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
|
||||
(defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
|
||||
#if !defined(__HIP_NO_IMAGE_SUPPORT) && defined(__gfx94plus_clr__)
|
||||
#define __HIP_NO_IMAGE_SUPPORT 1
|
||||
#endif
|
||||
|
||||
@@ -913,7 +913,7 @@ unsigned __smid(void)
|
||||
GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
|
||||
#endif
|
||||
#else
|
||||
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
|
||||
#if defined(__gfx94plus_clr__)
|
||||
unsigned xcc_id = __builtin_amdgcn_s_getreg(
|
||||
GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
|
||||
#endif
|
||||
@@ -929,7 +929,7 @@ unsigned __smid(void)
|
||||
#endif
|
||||
return temp;
|
||||
//TODO : CU Mode impl
|
||||
#elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
|
||||
#elif defined(__gfx94plus_clr__)
|
||||
unsigned temp = xcc_id;
|
||||
temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
|
||||
temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
|
||||
|
||||
@@ -30,9 +30,8 @@
|
||||
#ifndef _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP8_H_
|
||||
#define _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP8_H_
|
||||
|
||||
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx1200__) || \
|
||||
defined(__gfx1201__)) && \
|
||||
__HIP_DEVICE_COMPILE__
|
||||
#if (defined(__gfx94plus_clr__) || defined(__gfx1200__) || defined(__gfx1201__)) && \
|
||||
__HIP_DEVICE_COMPILE__
|
||||
#define HIP_FP8_CVT_FAST_PATH 1
|
||||
#else
|
||||
#define HIP_FP8_CVT_FAST_PATH 0
|
||||
@@ -3173,4 +3172,4 @@ struct __hip_fp8x4_e5m2 {
|
||||
}
|
||||
};
|
||||
#endif // ENABLE_OCP_HIPRTC
|
||||
#endif // _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP8_H_
|
||||
#endif // _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP8_H_
|
||||
|
||||
@@ -209,8 +209,8 @@ __device__ inline double unsafeAtomicAdd(double* addr, double value) {
|
||||
* @return Original value contained at \p addr.
|
||||
*/
|
||||
__device__ inline double unsafeAtomicMax(double* addr, double val) {
|
||||
#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
|
||||
__has_builtin(__builtin_amdgcn_flat_atomic_fmax_f64)
|
||||
#if (defined(__gfx90a__) || defined(__gfx94plus_clr__)) && \
|
||||
__has_builtin(__builtin_amdgcn_flat_atomic_fmax_f64)
|
||||
return __builtin_amdgcn_flat_atomic_fmax_f64(addr, val);
|
||||
#else
|
||||
#if __has_builtin(__hip_atomic_load) && \
|
||||
@@ -262,8 +262,8 @@ __device__ inline double unsafeAtomicMax(double* addr, double val) {
|
||||
* @return Original value contained at \p addr.
|
||||
*/
|
||||
__device__ inline double unsafeAtomicMin(double* addr, double val) {
|
||||
#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
|
||||
__has_builtin(__builtin_amdgcn_flat_atomic_fmin_f64)
|
||||
#if (defined(__gfx90a__) || defined(__gfx94plus_clr__)) && \
|
||||
__has_builtin(__builtin_amdgcn_flat_atomic_fmin_f64)
|
||||
return __builtin_amdgcn_flat_atomic_fmin_f64(addr, val);
|
||||
#else
|
||||
#if __has_builtin(__hip_atomic_load) && \
|
||||
@@ -304,13 +304,13 @@ __device__ inline double unsafeAtomicMin(double* addr, double val) {
|
||||
*/
|
||||
__device__ inline float safeAtomicAdd(float* addr, float value) {
|
||||
#if defined(__gfx908__) || defined(__gfx941__) \
|
||||
|| ((defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx942__)) \
|
||||
&& !__has_builtin(__hip_atomic_fetch_add))
|
||||
|| ((defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx942__) || \
|
||||
defined(__gfx950__)) && !__has_builtin(__hip_atomic_fetch_add))
|
||||
// On gfx908, we can generate unsafe FP32 atomic add that does not follow all
|
||||
// IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead.
|
||||
// On gfx941, we can generate unsafe FP32 atomic add that may not always happen atomically,
|
||||
// so we need to force a CAS loop emulation to ensure safety.
|
||||
// On gfx90a, gfx940 and gfx942 if we do not have the __hip_atomic_fetch_add builtin, we
|
||||
// On gfx90a, gfx940, gfx942 and gfx950 if we do not have the __hip_atomic_fetch_add builtin, we
|
||||
// need to force a CAS loop here.
|
||||
float old_val;
|
||||
#if __has_builtin(__hip_atomic_load)
|
||||
|
||||
@@ -61,6 +61,7 @@ static const AMDDeviceInfo DeviceInfo[] = {
|
||||
{"gfx940", "gfx940", 4, 16, 1, 256, 64 * Ki, 32, 9, 4},
|
||||
{"gfx941", "gfx941", 4, 16, 1, 256, 64 * Ki, 32, 9, 4},
|
||||
{"gfx942", "gfx942", 4, 16, 1, 256, 64 * Ki, 32, 9, 4},
|
||||
{"gfx950", "gfx950", 4, 16, 1, 256, 160 * Ki, 64, 9, 5},
|
||||
{"gfx1010", "gfx1010", 4, 32, 1, 256, 64 * Ki, 32, 10, 1},
|
||||
{"gfx1011", "gfx1011", 4, 32, 1, 256, 64 * Ki, 32, 10, 1},
|
||||
{"gfx1012", "gfx1012", 4, 32, 1, 256, 64 * Ki, 32, 10, 1},
|
||||
|
||||
@@ -98,7 +98,7 @@ const char* TrapHandlerCode = RUNTIME_KERNEL(
|
||||
// ttmp1 = 0[2:0], PCRewind[3:0], HostTrap[0], TrapId[7:0], PC[47:32]
|
||||
// gfx906/gfx908/gfx90a:
|
||||
// ttmp11 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0]
|
||||
// gfx940/gfx941/gfx942:
|
||||
// gfx940/gfx941/gfx942/gfx950:
|
||||
// ttmp13 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[22:0]
|
||||
// gfx10:
|
||||
// ttmp1 = 0[0], PCRewind[5:0], HostTrap[0], TrapId[7:0], PC[47:32]
|
||||
|
||||
@@ -246,7 +246,7 @@ void Settings::setKernelArgImpl(const amd::Isa& isa, bool isXgmi, bool hasValidH
|
||||
const uint32_t gfxipMinor = isa.versionMinor();
|
||||
const uint32_t gfxStepping = isa.versionStepping();
|
||||
|
||||
const bool isGfx94x = gfxipMajor == 9 && gfxipMinor == 4 &&
|
||||
const bool isGfx94x = gfxipMajor == 9 && (gfxipMinor == 4 || gfxipMinor == 5) &&
|
||||
(gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2);
|
||||
const bool isGfx90a = (gfxipMajor == 9 && gfxipMinor == 0 && gfxStepping == 10);
|
||||
const bool isPreGfx908 =
|
||||
|
||||
Ссылка в новой задаче
Block a user