From 4640ab19b386d75ad978e2dd7923d946432facf3 Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Tue, 1 Jul 2025 14:38:49 -0700 Subject: [PATCH] Add support for extended fine grained system memory pool (#1770) * Add support for extended fine-grained system memory pool * Use hipHostRegisterUncached * Add "sc0 sc1" flags for LL store on gfx950 * Update after HIP flag is changed to hipExtHostRegisterUncached --- CMakeLists.txt | 9 +++++++++ src/device/prims_ll.h | 12 +++++++++++- src/include/alloc.h | 4 ++++ src/misc/shmutils.cc | 4 ++++ 4 files changed, 28 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 05a57c0a49..e7ec90a2e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -209,6 +209,9 @@ set(CMAKE_REQUIRED_LIBRARIES hip::device) ### Check for hipDeviceMallocUncached support check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY) +### Check for hipHostMallocUncached support +check_symbol_exists("hipHostMallocUncached" "hip/hip_runtime_api.h" HIP_HOST_UNCACHED_MEMORY) + ### Check for hipDeviceMallocContiguous support check_symbol_exists("hipDeviceMallocContiguous" "hip/hip_runtime_api.h" HIP_CONTIGUOUS_MEMORY) @@ -946,6 +949,12 @@ else() message(STATUS "--hipcc-func-supp disabled") endif() endif() +if (HIP_HOST_UNCACHED_MEMORY) + target_compile_definitions(rccl PRIVATE HIP_HOST_UNCACHED_MEMORY) + message(STATUS "HIP_HOST_UNCACHED_MEMORY enabled") +else() + message(STATUS "HIP_HOST_UNCACHED_MEMORY disabled") +endif() if (BUILD_BFD) if (HAVE_BFD) target_compile_definitions(rccl PRIVATE HAVE_BFD) diff --git a/src/device/prims_ll.h b/src/device/prims_ll.h index 173a83c884..6f6018bb86 100644 --- a/src/device/prims_ll.h +++ b/src/device/prims_ll.h @@ -71,7 +71,7 @@ private: inline __device__ void barrier() { #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) if (nthreads != WARP_SIZE) - #if defined(__gfx942__) + #if defined(__gfx942__) || (defined(__gfx950__) && defined(HIP_HOST_UNCACHED_MEMORY)) barrier_by_group_block(); #else barrier_by_group(); @@ -260,6 +260,15 @@ private: __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) { #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) +#if defined(__gfx950__) + using Vec = uint32_t __attribute__((ext_vector_type(4))); + Vec i4; + i4[0] = val & 0xffffffff; + i4[1] = flag; + i4[2] = (val >> 32); + i4[3] = flag; + asm volatile ("flat_store_dwordx4 %0, %1 sc0 sc1 nt" :: "v"(dst), "v"(i4)); +#else union ncclLLFifoLine i4; i4.data1 = val & 0xffffffff; i4.flag1 = flag; @@ -267,6 +276,7 @@ private: i4.flag2 = flag; __builtin_nontemporal_store(i4.v[0], dst->v); __builtin_nontemporal_store(i4.v[1], dst->v+1); +#endif #else asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag) : "memory"); #endif diff --git a/src/include/alloc.h b/src/include/alloc.h index 3bac16603b..9a840138f2 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -122,7 +122,11 @@ ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*ncclSizeOfT(), hipDeviceMallocFinegrained), result, finish); #endif } else +#if defined(HIP_HOST_UNCACHED_MEMORY) + CUDACHECKGOTO(hipHostMalloc(ptr, nelem*ncclSizeOfT(), cudaHostAllocMapped | hipHostMallocUncached), result, finish); +#else CUDACHECKGOTO(hipHostMalloc(ptr, nelem*ncclSizeOfT(), cudaHostAllocMapped), result, finish); +#endif memset(*ptr, 0, nelem*ncclSizeOfT()); } finish: diff --git a/src/misc/shmutils.cc b/src/misc/shmutils.cc index eb9cd10156..a58ac08a81 100644 --- a/src/misc/shmutils.cc +++ b/src/misc/shmutils.cc @@ -114,7 +114,11 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmPathSize, size_t shmSize, void } if (devShmPtr) { +#if defined(HIP_HOST_UNCACHED_MEMORY) + CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterPortable | cudaHostRegisterMapped | hipExtHostRegisterUncached), ret, fail); +#else CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterPortable | cudaHostRegisterMapped), ret, fail); +#endif CUDACHECKGOTO(cudaHostGetDevicePointer(&dptr, (void*)hptr, 0), ret, fail); }