From 40df9006476375ce4c99cd04d8d406cbee32af13 Mon Sep 17 00:00:00 2001 From: taosang2 Date: Mon, 2 Dec 2024 15:17:47 -0500 Subject: [PATCH] SWDEV-501963 - Add missing codes for gfx950 Cherry-pick https://gerrit-git.amd.com/c/compute/ec/clr/+/1162997 Change-Id: I6b3c6bf55c61cffd43cd6f17b75998f751b75723 [ROCm/clr commit: 32daa8f3848c4c6583bb1b0af0eb34a8731e7c44] --- .../include/hip/amd_detail/amd_device_functions.h | 12 ++++++------ .../hipamd/include/hip/amd_detail/amd_hip_fp8.h | 7 +++---- .../hip/amd_detail/amd_hip_unsafe_atomics.h | 14 +++++++------- .../ocltst/module/runtime/OCLDeviceQueries.cpp | 1 + projects/clr/rocclr/device/pal/palblitcl.cpp | 2 +- projects/clr/rocclr/device/rocm/rocsettings.cpp | 2 +- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_device_functions.h b/projects/clr/hipamd/include/hip/amd_detail/amd_device_functions.h index a8560b4658..4c55103e21 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_device_functions.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_device_functions.h @@ -835,7 +835,7 @@ int __syncthreads_or(int predicate) STATE_ID 29:27 State ID (graphics only, not compute). ME_ID 31:30 Micro-engine ID. - XCC_ID Register bit structure for gfx940 + XCC_ID Register bit structure for gfx940/941/942/950 XCC_ID 3:0 XCC the wave is assigned to. */ @@ -871,14 +871,14 @@ int __syncthreads_or(int predicate) #define HW_ID_SE_ID_OFFSET 13 #endif -#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) +#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)) + #define __gfx94plus_clr__ #define XCC_ID 20 #define XCC_ID_XCC_ID_SIZE 4 #define XCC_ID_XCC_ID_OFFSET 0 #endif -#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \ - (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))) +#if !defined(__HIP_NO_IMAGE_SUPPORT) && defined(__gfx94plus_clr__) #define __HIP_NO_IMAGE_SUPPORT 1 #endif @@ -913,7 +913,7 @@ unsigned __smid(void) GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID)); #endif #else - #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) + #if defined(__gfx94plus_clr__) unsigned xcc_id = __builtin_amdgcn_s_getreg( GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID)); #endif @@ -929,7 +929,7 @@ unsigned __smid(void) #endif return temp; //TODO : CU Mode impl - #elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) + #elif defined(__gfx94plus_clr__) unsigned temp = xcc_id; temp = (temp << HW_ID_SE_ID_SIZE) | se_id; temp = (temp << HW_ID_CU_ID_SIZE) | cu_id; diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp8.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp8.h index da5bedf8a9..5ed745b668 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp8.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_fp8.h @@ -30,9 +30,8 @@ #ifndef _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP8_H_ #define _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP8_H_ -#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx1200__) || \ - defined(__gfx1201__)) && \ - __HIP_DEVICE_COMPILE__ +#if (defined(__gfx94plus_clr__) || defined(__gfx1200__) || defined(__gfx1201__)) && \ + __HIP_DEVICE_COMPILE__ #define HIP_FP8_CVT_FAST_PATH 1 #else #define HIP_FP8_CVT_FAST_PATH 0 @@ -3173,4 +3172,4 @@ struct __hip_fp8x4_e5m2 { } }; #endif // ENABLE_OCP_HIPRTC -#endif // _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP8_H_ \ No newline at end of file +#endif // _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP8_H_ diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_unsafe_atomics.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_unsafe_atomics.h index 59841ab9b3..8c01cc5529 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_unsafe_atomics.h @@ -209,8 +209,8 @@ __device__ inline double unsafeAtomicAdd(double* addr, double value) { * @return Original value contained at \p addr. */ __device__ inline double unsafeAtomicMax(double* addr, double val) { -#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \ - __has_builtin(__builtin_amdgcn_flat_atomic_fmax_f64) +#if (defined(__gfx90a__) || defined(__gfx94plus_clr__)) && \ + __has_builtin(__builtin_amdgcn_flat_atomic_fmax_f64) return __builtin_amdgcn_flat_atomic_fmax_f64(addr, val); #else #if __has_builtin(__hip_atomic_load) && \ @@ -262,8 +262,8 @@ __device__ inline double unsafeAtomicMax(double* addr, double val) { * @return Original value contained at \p addr. */ __device__ inline double unsafeAtomicMin(double* addr, double val) { -#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \ - __has_builtin(__builtin_amdgcn_flat_atomic_fmin_f64) +#if (defined(__gfx90a__) || defined(__gfx94plus_clr__)) && \ + __has_builtin(__builtin_amdgcn_flat_atomic_fmin_f64) return __builtin_amdgcn_flat_atomic_fmin_f64(addr, val); #else #if __has_builtin(__hip_atomic_load) && \ @@ -304,13 +304,13 @@ __device__ inline double unsafeAtomicMin(double* addr, double val) { */ __device__ inline float safeAtomicAdd(float* addr, float value) { #if defined(__gfx908__) || defined(__gfx941__) \ - || ((defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx942__)) \ - && !__has_builtin(__hip_atomic_fetch_add)) + || ((defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx942__) || \ + defined(__gfx950__)) && !__has_builtin(__hip_atomic_fetch_add)) // On gfx908, we can generate unsafe FP32 atomic add that does not follow all // IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead. // On gfx941, we can generate unsafe FP32 atomic add that may not always happen atomically, // so we need to force a CAS loop emulation to ensure safety. - // On gfx90a, gfx940 and gfx942 if we do not have the __hip_atomic_fetch_add builtin, we + // On gfx90a, gfx940, gfx942 and gfx950 if we do not have the __hip_atomic_fetch_add builtin, we // need to force a CAS loop here. float old_val; #if __has_builtin(__hip_atomic_load) diff --git a/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceQueries.cpp b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceQueries.cpp index b0fd4ebe8d..127add49e9 100644 --- a/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceQueries.cpp +++ b/projects/clr/opencl/tests/ocltst/module/runtime/OCLDeviceQueries.cpp @@ -61,6 +61,7 @@ static const AMDDeviceInfo DeviceInfo[] = { {"gfx940", "gfx940", 4, 16, 1, 256, 64 * Ki, 32, 9, 4}, {"gfx941", "gfx941", 4, 16, 1, 256, 64 * Ki, 32, 9, 4}, {"gfx942", "gfx942", 4, 16, 1, 256, 64 * Ki, 32, 9, 4}, + {"gfx950", "gfx950", 4, 16, 1, 256, 160 * Ki, 64, 9, 5}, {"gfx1010", "gfx1010", 4, 32, 1, 256, 64 * Ki, 32, 10, 1}, {"gfx1011", "gfx1011", 4, 32, 1, 256, 64 * Ki, 32, 10, 1}, {"gfx1012", "gfx1012", 4, 32, 1, 256, 64 * Ki, 32, 10, 1}, diff --git a/projects/clr/rocclr/device/pal/palblitcl.cpp b/projects/clr/rocclr/device/pal/palblitcl.cpp index bc35c924c6..007ec8ba0b 100644 --- a/projects/clr/rocclr/device/pal/palblitcl.cpp +++ b/projects/clr/rocclr/device/pal/palblitcl.cpp @@ -98,7 +98,7 @@ const char* TrapHandlerCode = RUNTIME_KERNEL( // ttmp1 = 0[2:0], PCRewind[3:0], HostTrap[0], TrapId[7:0], PC[47:32] // gfx906/gfx908/gfx90a: // ttmp11 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0] -// gfx940/gfx941/gfx942: +// gfx940/gfx941/gfx942/gfx950: // ttmp13 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[22:0] // gfx10: // ttmp1 = 0[0], PCRewind[5:0], HostTrap[0], TrapId[7:0], PC[47:32] diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp index 34047fff62..cd2006fedb 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.cpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp @@ -246,7 +246,7 @@ void Settings::setKernelArgImpl(const amd::Isa& isa, bool isXgmi, bool hasValidH const uint32_t gfxipMinor = isa.versionMinor(); const uint32_t gfxStepping = isa.versionStepping(); - const bool isGfx94x = gfxipMajor == 9 && gfxipMinor == 4 && + const bool isGfx94x = gfxipMajor == 9 && (gfxipMinor == 4 || gfxipMinor == 5) && (gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2); const bool isGfx90a = (gfxipMajor == 9 && gfxipMinor == 0 && gfxStepping == 10); const bool isPreGfx908 =