diff --git a/CMakeLists.txt b/CMakeLists.txt index 475a65dfa2..57bc073fec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,7 +48,9 @@ set(DEFAULT_GPUS gfx1030 gfx1100 gfx1101 - gfx1102) + gfx1102 + gfx1200 + gfx1201) # Load CMake modules #================================================================================================== diff --git a/src/device/common.h b/src/device/common.h index acb05c40ac..6a720f5005 100644 --- a/src/device/common.h +++ b/src/device/common.h @@ -25,8 +25,13 @@ #define __syncwarp() +#ifdef __GFX12__ +#define __synclds() \ + asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier_signal -1 \n s_barrier_wait -1"); +#else #define __synclds() \ asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier"); +#endif #ifdef __GFX9__ #define STORE(DST, SRC) \ @@ -36,7 +41,7 @@ { __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST); } #endif -#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) +#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1200__) || defined(__gfx1201__) #define __trace_hwreg() #else #define __trace_hwreg() \ diff --git a/src/device/msccl_kernel_impl.h b/src/device/msccl_kernel_impl.h index b3f4b29a6f..77619347a2 100644 --- a/src/device/msccl_kernel_impl.h +++ b/src/device/msccl_kernel_impl.h @@ -50,7 +50,11 @@ extern __shared__ struct mscclShmemData mscclShmem; inline __device__ static void barrier(int nthreads) { #if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) assert(nthreads == NCCL_MAX_NTHREADS); - __asm__ __volatile__("s_waitcnt vmcnt(0) lgkmcnt(0)\ns_barrier"); + #ifdef __GFX12__ + __asm__ __volatile__("s_waitcnt vmcnt(0) lgkmcnt(0)\ns_barrier_signal -1\ns_barrier_wait -1"); + #else + __asm__ __volatile__("s_waitcnt vmcnt(0) lgkmcnt(0)\ns_barrier"); + #endif #else asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15)); #endif