From f14e8a2dba3c612b44b66ce3fd6c174943b8e614 Mon Sep 17 00:00:00 2001 From: kjayapra-amd Date: Thu, 13 Apr 2023 11:43:03 -0400 Subject: [PATCH] SWDEV-393910 - Port gfx94x changes to mainline. Change-Id: Ibf727223bbe5230b132b47c39e0fc1d87cbd3b9c --- .../hip/amd_detail/amd_hip_unsafe_atomics.h | 26 +++++++++--------- hipamd/src/amd_hsa_elf.hpp | 14 +++++----- hipamd/src/hip_code_object.cpp | 10 +++++++ hipamd/src/hiprtc/hiprtcComgrHelper.cpp | 10 +++++++ rocclr/device/device.cpp | 27 +++++++++++++++++++ 5 files changed, 67 insertions(+), 20 deletions(-) diff --git a/hipamd/include/hip/amd_detail/amd_hip_unsafe_atomics.h b/hipamd/include/hip/amd_detail/amd_hip_unsafe_atomics.h index 0100e99e71..d54000a451 100644 --- a/hipamd/include/hip/amd_detail/amd_hip_unsafe_atomics.h +++ b/hipamd/include/hip/amd_detail/amd_hip_unsafe_atomics.h @@ -53,10 +53,7 @@ THE SOFTWARE. * @return Original value contained in \p addr. */ __device__ inline float unsafeAtomicAdd(float* addr, float value) { -#if defined(__gfx940__) && \ - __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f32) - return __builtin_amdgcn_flat_atomic_fadd_f32(addr, value); -#elif defined(__gfx90a__) && \ +#if defined(__gfx90a__) && \ __has_builtin(__builtin_amdgcn_is_shared) && \ __has_builtin(__builtin_amdgcn_is_private) && \ __has_builtin(__builtin_amdgcn_ds_atomic_fadd_f32) && \ @@ -178,8 +175,7 @@ __device__ inline float unsafeAtomicMin(float* addr, float val) { * @return Original value contained in \p addr. */ __device__ inline double unsafeAtomicAdd(double* addr, double value) { -#if (defined(__gfx90a__) || defined(__gfx940__)) && \ - __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64) +#if defined(__gfx90a__) && __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64) return __builtin_amdgcn_flat_atomic_fadd_f64(addr, value); #elif defined (__hip_atomic_fetch_add) return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); @@ -215,7 +211,7 @@ __device__ inline double unsafeAtomicAdd(double* addr, double value) { * @return Original value contained at \p addr. */ __device__ inline double unsafeAtomicMax(double* addr, double val) { -#if (defined(__gfx90a__) || defined(__gfx940__)) && \ +#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \ __has_builtin(__builtin_amdgcn_flat_atomic_fmax_f64) return __builtin_amdgcn_flat_atomic_fmax_f64(addr, val); #else @@ -268,7 +264,7 @@ __device__ inline double unsafeAtomicMax(double* addr, double val) { * @return Original value contained at \p addr. */ __device__ inline double unsafeAtomicMin(double* addr, double val) { -#if (defined(__gfx90a__) || defined(__gfx940__)) && \ +#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \ __has_builtin(__builtin_amdgcn_flat_atomic_fmin_f64) return __builtin_amdgcn_flat_atomic_fmin_f64(addr, val); #else @@ -309,12 +305,15 @@ __device__ inline double unsafeAtomicMin(double* addr, double val) { * @return Original value contained in \p addr. */ __device__ inline float safeAtomicAdd(float* addr, float value) { -#if defined(__gfx908__) || \ - (defined(__gfx90a__) && !__has_builtin(__hip_atomic_fetch_add)) +#if defined(__gfx908__) || defined(__gfx941__) \ + || ((defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx942__)) \ + && !__has_builtin(__hip_atomic_fetch_add)) // On gfx908, we can generate unsafe FP32 atomic add that does not follow all // IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead. - // On gfx90a, if we do not have the __hip_atomic_fetch_add builtin, we need to - // force a CAS loop here. + // On gfx941, we can generate unsafe FP32 atomic add that may not always happen atomically, + // so we need to force a CAS loop emulation to ensure safety. + // On gfx90a, gfx940 and gfx942 if we do not have the __hip_atomic_fetch_add builtin, we + // need to force a CAS loop here. float old_val; #if __has_builtin(__hip_atomic_load) old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); @@ -434,8 +433,7 @@ __device__ inline float safeAtomicMin(float* addr, float val) { * @return Original value contained in \p addr. */ __device__ inline double safeAtomicAdd(double* addr, double value) { -#if (defined(__gfx90a__) || defined(__gfx940__)) && \ - __has_builtin(__hip_atomic_fetch_add) +#if defined(__gfx90a__) && __has_builtin(__hip_atomic_fetch_add) // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope // atomics will produce safe CAS loops, but are otherwise not different than // agent-scope atomics. This logic is only applicable for gfx90a, and should diff --git a/hipamd/src/amd_hsa_elf.hpp b/hipamd/src/amd_hsa_elf.hpp index 383d4562be..eebe600562 100644 --- a/hipamd/src/amd_hsa_elf.hpp +++ b/hipamd/src/amd_hsa_elf.hpp @@ -95,13 +95,15 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_GFX1034 = 0x03e, EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f, EF_AMDGPU_MACH_AMDGCN_GFX940 = 0x040, - EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041, - EF_AMDGPU_MACH_AMDGCN_GFX1013 = 0x042, + EF_AMDGPU_MACH_AMDGCN_GFX941 = 0x041, + EF_AMDGPU_MACH_AMDGCN_GFX942 = 0x042, EF_AMDGPU_MACH_AMDGCN_RESERVED_0X43 = 0x043, - EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x044, - EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045, - EF_AMDGPU_MACH_AMDGCN_GFX1101 = 0x046, - EF_AMDGPU_MACH_AMDGCN_GFX1102 = 0x047, + EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x044, + EF_AMDGPU_MACH_AMDGCN_GFX1013 = 0x045, + EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x046, + EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x047, + EF_AMDGPU_MACH_AMDGCN_GFX1101 = 0x048, + EF_AMDGPU_MACH_AMDGCN_GFX1102 = 0x049, // First/last AMDGCN-based processors. EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600, diff --git a/hipamd/src/hip_code_object.cpp b/hipamd/src/hip_code_object.cpp index 5e169ee19f..54cf533547 100644 --- a/hipamd/src/hip_code_object.cpp +++ b/hipamd/src/hip_code_object.cpp @@ -175,6 +175,16 @@ static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupp sramEccSupported = true; proc_name = "gfx940"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX941: + xnackSupported = true; + sramEccSupported = true; + proc_name = "gfx941"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX942: + xnackSupported = true; + sramEccSupported = true; + proc_name = "gfx942"; + break; case EF_AMDGPU_MACH_AMDGCN_GFX1010: xnackSupported = true; sramEccSupported = false; diff --git a/hipamd/src/hiprtc/hiprtcComgrHelper.cpp b/hipamd/src/hiprtc/hiprtcComgrHelper.cpp index 754ff22e79..1fab4b7b00 100644 --- a/hipamd/src/hiprtc/hiprtcComgrHelper.cpp +++ b/hipamd/src/hiprtc/hiprtcComgrHelper.cpp @@ -162,6 +162,16 @@ static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupp sramEccSupported = true; proc_name = "gfx940"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX941: + xnackSupported = true; + sramEccSupported = true; + proc_name = "gfx941"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX942: + xnackSupported = true; + sramEccSupported = true; + proc_name = "gfx942"; + break; case EF_AMDGPU_MACH_AMDGCN_GFX1010: xnackSupported = true; sramEccSupported = false; diff --git a/rocclr/device/device.cpp b/rocclr/device/device.cpp index 8cd1de3109..522c4f1ffb 100644 --- a/rocclr/device/device.cpp +++ b/rocclr/device/device.cpp @@ -166,6 +166,33 @@ std::pair Isa::supportedIsas() { {"gfx90a:sramecc-:xnack+", nullptr, true, false, 9, 0, 10, OFF, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx90a:sramecc+:xnack-", nullptr, true, false, 9, 0, 10, ON, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx90a:sramecc+:xnack+", nullptr, true, false, 9, 0, 10, ON, ON, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx940", nullptr, true, false, 9, 4, 0, ANY, ANY, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx940:sramecc-", nullptr, true, false, 9, 4, 0, OFF, ANY, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx940:sramecc+", nullptr, true, false, 9, 4, 0, ON, ANY, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx940:xnack-", nullptr, true, false, 9, 4, 0, ANY, OFF, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx940:xnack+", nullptr, true, false, 9, 4, 0, ANY, ON, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx940:sramecc-:xnack-", nullptr, true, false, 9, 4, 0, OFF, OFF, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx940:sramecc-:xnack+", nullptr, true, false, 9, 4, 0, OFF, ON, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx940:sramecc+:xnack-", nullptr, true, false, 9, 4, 0, ON, OFF, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx940:sramecc+:xnack+", nullptr, true, false, 9, 4, 0, ON, ON, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx941", nullptr, true, false, 9, 4, 1, ANY, ANY, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx941:sramecc-", nullptr, true, false, 9, 4, 1, OFF, ANY, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx941:sramecc+", nullptr, true, false, 9, 4, 1, ON, ANY, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx941:xnack-", nullptr, true, false, 9, 4, 1, ANY, OFF, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx941:xnack+", nullptr, true, false, 9, 4, 1, ANY, ON, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx941:sramecc-:xnack-", nullptr, true, false, 9, 4, 1, OFF, OFF, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx941:sramecc-:xnack+", nullptr, true, false, 9, 4, 1, OFF, ON, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx941:sramecc+:xnack-", nullptr, true, false, 9, 4, 1, ON, OFF, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx941:sramecc+:xnack+", nullptr, true, false, 9, 4, 1, ON, ON, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx942", nullptr, true, false, 9, 4, 2, ANY, ANY, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx942:sramecc-", nullptr, true, false, 9, 4, 2, OFF, ANY, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx942:sramecc+", nullptr, true, false, 9, 4, 2, ON, ANY, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx942:xnack-", nullptr, true, false, 9, 4, 2, ANY, OFF, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx942:xnack+", nullptr, true, false, 9, 4, 2, ANY, ON, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx942:sramecc-:xnack-", nullptr, true, false, 9, 4, 2, OFF, OFF, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx942:sramecc-:xnack+", nullptr, true, false, 9, 4, 2, OFF, ON, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx942:sramecc+:xnack-", nullptr, true, false, 9, 4, 2, ON, OFF, 4, 16, 1, 256, 64 * Ki, 32}, + {"gfx942:sramecc+:xnack+", nullptr, true, false, 9, 4, 2, ON, ON, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx90c", nullptr, true, true, 9, 0, 12, NONE, ANY, 4, 16, 1, 256, 64 * Ki, 32}, // Also Renoir {"gfx90c:xnack-", "gfx90c", true, true, 9, 0, 12, NONE, OFF, 4, 16, 1, 256, 64 * Ki, 32}, {"gfx90c:xnack+", "gfx90d", true, true, 9, 0, 12, NONE, ON, 4, 16, 1, 256, 64 * Ki, 32},