Add support for extended fine grained system memory pool (#1770)
* Add support for extended fine-grained system memory pool * Use hipHostRegisterUncached * Add "sc0 sc1" flags for LL store on gfx950 * Update after HIP flag is changed to hipExtHostRegisterUncached
This commit is contained in:
@@ -209,6 +209,9 @@ set(CMAKE_REQUIRED_LIBRARIES hip::device)
|
||||
### Check for hipDeviceMallocUncached support
|
||||
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
|
||||
|
||||
### Check for hipHostMallocUncached support
|
||||
check_symbol_exists("hipHostMallocUncached" "hip/hip_runtime_api.h" HIP_HOST_UNCACHED_MEMORY)
|
||||
|
||||
### Check for hipDeviceMallocContiguous support
|
||||
check_symbol_exists("hipDeviceMallocContiguous" "hip/hip_runtime_api.h" HIP_CONTIGUOUS_MEMORY)
|
||||
|
||||
@@ -946,6 +949,12 @@ else()
|
||||
message(STATUS "--hipcc-func-supp disabled")
|
||||
endif()
|
||||
endif()
|
||||
if (HIP_HOST_UNCACHED_MEMORY)
|
||||
target_compile_definitions(rccl PRIVATE HIP_HOST_UNCACHED_MEMORY)
|
||||
message(STATUS "HIP_HOST_UNCACHED_MEMORY enabled")
|
||||
else()
|
||||
message(STATUS "HIP_HOST_UNCACHED_MEMORY disabled")
|
||||
endif()
|
||||
if (BUILD_BFD)
|
||||
if (HAVE_BFD)
|
||||
target_compile_definitions(rccl PRIVATE HAVE_BFD)
|
||||
|
||||
@@ -71,7 +71,7 @@ private:
|
||||
inline __device__ void barrier() {
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
if (nthreads != WARP_SIZE)
|
||||
#if defined(__gfx942__)
|
||||
#if defined(__gfx942__) || (defined(__gfx950__) && defined(HIP_HOST_UNCACHED_MEMORY))
|
||||
barrier_by_group_block();
|
||||
#else
|
||||
barrier_by_group();
|
||||
@@ -260,6 +260,15 @@ private:
|
||||
|
||||
__device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
#if defined(__gfx950__)
|
||||
using Vec = uint32_t __attribute__((ext_vector_type(4)));
|
||||
Vec i4;
|
||||
i4[0] = val & 0xffffffff;
|
||||
i4[1] = flag;
|
||||
i4[2] = (val >> 32);
|
||||
i4[3] = flag;
|
||||
asm volatile ("flat_store_dwordx4 %0, %1 sc0 sc1 nt" :: "v"(dst), "v"(i4));
|
||||
#else
|
||||
union ncclLLFifoLine i4;
|
||||
i4.data1 = val & 0xffffffff;
|
||||
i4.flag1 = flag;
|
||||
@@ -267,6 +276,7 @@ private:
|
||||
i4.flag2 = flag;
|
||||
__builtin_nontemporal_store(i4.v[0], dst->v);
|
||||
__builtin_nontemporal_store(i4.v[1], dst->v+1);
|
||||
#endif
|
||||
#else
|
||||
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag) : "memory");
|
||||
#endif
|
||||
|
||||
@@ -122,7 +122,11 @@ ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc
|
||||
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*ncclSizeOfT<T>(), hipDeviceMallocFinegrained), result, finish);
|
||||
#endif
|
||||
} else
|
||||
#if defined(HIP_HOST_UNCACHED_MEMORY)
|
||||
CUDACHECKGOTO(hipHostMalloc(ptr, nelem*ncclSizeOfT<T>(), cudaHostAllocMapped | hipHostMallocUncached), result, finish);
|
||||
#else
|
||||
CUDACHECKGOTO(hipHostMalloc(ptr, nelem*ncclSizeOfT<T>(), cudaHostAllocMapped), result, finish);
|
||||
#endif
|
||||
memset(*ptr, 0, nelem*ncclSizeOfT<T>());
|
||||
}
|
||||
finish:
|
||||
|
||||
@@ -114,7 +114,11 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmPathSize, size_t shmSize, void
|
||||
}
|
||||
|
||||
if (devShmPtr) {
|
||||
#if defined(HIP_HOST_UNCACHED_MEMORY)
|
||||
CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterPortable | cudaHostRegisterMapped | hipExtHostRegisterUncached), ret, fail);
|
||||
#else
|
||||
CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterPortable | cudaHostRegisterMapped), ret, fail);
|
||||
#endif
|
||||
CUDACHECKGOTO(cudaHostGetDevicePointer(&dptr, (void*)hptr, 0), ret, fail);
|
||||
}
|
||||
|
||||
|
||||
مرجع در شماره جدید
Block a user