diff --git a/tools/rccl-prim-test/copy_kernel.h b/tools/rccl-prim-test/copy_kernel.h index 7ea328c3eb..62e0bade2a 100644 --- a/tools/rccl-prim-test/copy_kernel.h +++ b/tools/rccl-prim-test/copy_kernel.h @@ -28,12 +28,12 @@ struct MULTI { template inline __device__ T vFetch(const volatile T* ptr) { - return *ptr; + return __builtin_nontemporal_load(ptr); } template inline __device__ void vStore(volatile T* ptr, const T val) { - *ptr = val; + __builtin_nontemporal_store(val, ptr); } template @@ -86,16 +86,16 @@ struct MULTI128 { inline __device__ void Fetch128(Pack128& v, const Pack128* p) { #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) - v.x = p->x; - v.y = p->y; + v.x = __builtin_nontemporal_load(&p->x); + v.y = __builtin_nontemporal_load(&p->y); #else asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory"); #endif } inline __device__ void Store128(Pack128* p, Pack128& v) { #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) - p->x = v.x; - p->y = v.y; + __builtin_nontemporal_store(v.x, &p->x); + __builtin_nontemporal_store(v.y, &p->y); #else asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" :: "l"(p), "l"(v.x), "l"(v.y) : "memory"); #endif