Add support for extended fine grained system memory pool (#1770)

* Add support for extended fine-grained system memory pool
* Use hipHostRegisterUncached
* Add "sc0 sc1" flags for LL store on gfx950
* Update after HIP flag is changed to hipExtHostRegisterUncached
This commit is contained in:
Wenkai Du
2025-07-01 14:38:49 -07:00
کامیت شده توسط GitHub
والد 3e51c41dcb
کامیت 4640ab19b3
4فایلهای تغییر یافته به همراه28 افزوده شده و 1 حذف شده
+9
مشاهده پرونده
@@ -209,6 +209,9 @@ set(CMAKE_REQUIRED_LIBRARIES hip::device)
### Check for hipDeviceMallocUncached support
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
### Check for hipHostMallocUncached support
check_symbol_exists("hipHostMallocUncached" "hip/hip_runtime_api.h" HIP_HOST_UNCACHED_MEMORY)
### Check for hipDeviceMallocContiguous support
check_symbol_exists("hipDeviceMallocContiguous" "hip/hip_runtime_api.h" HIP_CONTIGUOUS_MEMORY)
@@ -946,6 +949,12 @@ else()
message(STATUS "--hipcc-func-supp disabled")
endif()
endif()
if (HIP_HOST_UNCACHED_MEMORY)
target_compile_definitions(rccl PRIVATE HIP_HOST_UNCACHED_MEMORY)
message(STATUS "HIP_HOST_UNCACHED_MEMORY enabled")
else()
message(STATUS "HIP_HOST_UNCACHED_MEMORY disabled")
endif()
if (BUILD_BFD)
if (HAVE_BFD)
target_compile_definitions(rccl PRIVATE HAVE_BFD)
+11 -1
مشاهده پرونده
@@ -71,7 +71,7 @@ private:
inline __device__ void barrier() {
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
if (nthreads != WARP_SIZE)
#if defined(__gfx942__)
#if defined(__gfx942__) || (defined(__gfx950__) && defined(HIP_HOST_UNCACHED_MEMORY))
barrier_by_group_block();
#else
barrier_by_group();
@@ -260,6 +260,15 @@ private:
__device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
#if defined(__gfx950__)
using Vec = uint32_t __attribute__((ext_vector_type(4)));
Vec i4;
i4[0] = val & 0xffffffff;
i4[1] = flag;
i4[2] = (val >> 32);
i4[3] = flag;
asm volatile ("flat_store_dwordx4 %0, %1 sc0 sc1 nt" :: "v"(dst), "v"(i4));
#else
union ncclLLFifoLine i4;
i4.data1 = val & 0xffffffff;
i4.flag1 = flag;
@@ -267,6 +276,7 @@ private:
i4.flag2 = flag;
__builtin_nontemporal_store(i4.v[0], dst->v);
__builtin_nontemporal_store(i4.v[1], dst->v+1);
#endif
#else
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag) : "memory");
#endif
+4
مشاهده پرونده
@@ -122,7 +122,11 @@ ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*ncclSizeOfT<T>(), hipDeviceMallocFinegrained), result, finish);
#endif
} else
#if defined(HIP_HOST_UNCACHED_MEMORY)
CUDACHECKGOTO(hipHostMalloc(ptr, nelem*ncclSizeOfT<T>(), cudaHostAllocMapped | hipHostMallocUncached), result, finish);
#else
CUDACHECKGOTO(hipHostMalloc(ptr, nelem*ncclSizeOfT<T>(), cudaHostAllocMapped), result, finish);
#endif
memset(*ptr, 0, nelem*ncclSizeOfT<T>());
}
finish:
+4
مشاهده پرونده
@@ -114,7 +114,11 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmPathSize, size_t shmSize, void
}
if (devShmPtr) {
#if defined(HIP_HOST_UNCACHED_MEMORY)
CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterPortable | cudaHostRegisterMapped | hipExtHostRegisterUncached), ret, fail);
#else
CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterPortable | cudaHostRegisterMapped), ret, fail);
#endif
CUDACHECKGOTO(cudaHostGetDevicePointer(&dptr, (void*)hptr, 0), ret, fail);
}