diff --git a/projects/rccl/src/device/prims_ll.h b/projects/rccl/src/device/prims_ll.h index 0730ee2694..1b21da8b2d 100644 --- a/projects/rccl/src/device/prims_ll.h +++ b/projects/rccl/src/device/prims_ll.h @@ -270,8 +270,8 @@ private: i4.flag2 = flag; *((u64_gptr) dst->v) = *((u64_gptr) i4.v); *((u64_gptr) dst->v+1) = *((u64_gptr) i4.v+1); -#if defined(__gfx950__) && ROCM_VERSION < 70200 - __builtin_amdgcn_fence(__ATOMIC_RELEASE, ""); // flush cache +#if defined(__gfx950__) && ROCM_VERSION < 70002 + __builtin_amdgcn_fence(__ATOMIC_RELEASE, ""); // flush cache on gfx950 if ROCr fix for hipHostMallocUncached is not available (ROCm version < 7.0.2) #endif #else asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag) : "memory"); @@ -346,8 +346,8 @@ private: __builtin_nontemporal_store(u4, (uint32_t*)dst); else __builtin_nontemporal_store(u8, (uint64_t*)dst); -#if defined(__gfx950__) && ROCM_VERSION < 70200 - __builtin_amdgcn_fence(__ATOMIC_RELEASE, ""); // flush cache +#if defined(__gfx950__) && ROCM_VERSION < 70002 + __builtin_amdgcn_fence(__ATOMIC_RELEASE, ""); // flush cache on gfx950 if ROCr fix for hipHostMallocUncached is not available (ROCm version < 7.0.2) #endif #else if(sizeof(U) == 1) diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 4bfcb85cdc..94bb566459 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -1410,7 +1410,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p comm -> gfx9CheapFenceOff = 0; } else if(IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")){ - comm -> gfx9CheapFenceOff = ROCM_VERSION < 70200 && nNodes > 1; // Enable for single node only prior to ROCm 7.0.2 + comm -> gfx9CheapFenceOff = ROCM_VERSION < 70002 && nNodes > 1; // Enable for single node only prior to ROCm 7.0.2 } } INFO(NCCL_INIT, "GFX9 cheap fence is %s", comm -> gfx9CheapFenceOff ? "OFF" : "ON");