diff --git a/src/collectives/device/prims_simple.h b/src/collectives/device/prims_simple.h index 6ea594ce02..aafb0ec9b5 100644 --- a/src/collectives/device/prims_simple.h +++ b/src/collectives/device/prims_simple.h @@ -153,7 +153,11 @@ private: if (flags & (Recv*RolePostRecv | Send*RolePostSend)) { step += StepPerSlice; +#if defined(__gfx90a__) atomicExch_system((unsigned long long *)connStepPtr, step); +#else + __atomic_store_n(connStepPtr, step, __ATOMIC_SEQ_CST); +#endif } } @@ -319,10 +323,10 @@ private: } barrier(); // This barrier has a counterpart in following loop -#if defined(__gfx1030__) - if (Send && (flags & RolePostSend) && index == 0) __threadfence_system(); +#if defined(__gfx90a__) + if ((MaxSend == 0 || MaxRecv == 0) && Send && (flags & RolePostSend) && index == 0) __threadfence_system(); #else - if ((MaxSend == 0 || MaxRecv == 0) && Send && (flags & RolePostSend) && index == 0) __threadfence_system(); + if (Send && (flags & RolePostSend) && index == 0) __threadfence_system(); #endif __syncwarp(); postPeer(); @@ -343,10 +347,10 @@ private: waitPeer(0, 0, 0, 0); } barrier(); // Has couterpart in preceding worker-only loop. -#if defined(__gfx1030__) - if (Send && (flags & RolePostSend) && sliceSize > 0 && index == 0) __threadfence_system(); -#else +#if defined(__gfx90a__) if ((MaxSend == 0 || MaxRecv == 0) && Send && (flags & RolePostSend) && sliceSize > 0 && index == 0) __threadfence_system(); +#else + if (Send && (flags & RolePostSend) && sliceSize > 0 && index == 0) __threadfence_system(); #endif __syncwarp(); postPeer();