From 6ade5065b4c0e6338f69778dcd7d477ef5bfcbcd Mon Sep 17 00:00:00 2001
From: Nusrat Islam
Date: Mon, 18 Aug 2025 09:16:41 -0500
Subject: [PATCH] device: optimize threadfence for ll64 protocol (#1858)

* device: optimize threadfence for ll64 protocol

* device: use __atomic_signal_fence()

---------

Co-authored-by: Nusrat Islam
---
 src/device/prims_ll128.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/device/prims_ll128.h b/src/device/prims_ll128.h
index 698ab7b032..f36964c888 100644
--- a/src/device/prims_ll128.h
+++ b/src/device/prims_ll128.h
@@ -138,11 +138,13 @@ private:
     if (recvConnHeadPtr) STORE(recvConnHeadPtr, recvConnHead += 1);
   }
   inline __device__ void postSend() {
+    __atomic_signal_fence(__ATOMIC_SEQ_CST);
+    asm volatile("s_waitcnt lgkmcnt(0) vmcnt(0)");
+    __atomic_signal_fence(__ATOMIC_SEQ_CST);
+
     if (sendConnTailPtr) {
 #if __CUDA_ARCH__ >= 900
       __threadfence_system();
-#else
-      __threadfence();
 #endif
       STORE((unsigned long long *)sendConnTailPtr, sendConnTail += 1);
     }