Use finegrain allocator by default (#140)

* Use FineGrained allocator for heap by default, consolidate all types of
allocators under saner cmake controls

Co-authored-by: Yiltan <ytemucin@amd.com>

* Uncached may not be only for debug

Need to include the rocshmem config otherwise produce an inconsistent
build with different allocators used in different files

* Undo this pr adding presumably useless hip_host_allocator_noncoherent

* Rename HEAP_IS_COHERENT/USE_COHERENT_HEAP to USE_HDP_FLUSH as the former
was misleading

* Remove unused __roc_inv()

---------

Co-authored-by: Yiltan <ytemucin@amd.com>
이 커밋은 다음에 포함됨:
Aurelien Bouteiller
2025-06-13 15:26:26 -04:00
커밋한 사람 GitHub
부모 bcc14b1a34
커밋 41fd9e2d57
13개의 변경된 파일67개의 추가작업 그리고 66개의 파일을 삭제
+8 -5
파일 보기
@@ -45,16 +45,19 @@ option(USE_RO "Enable RO conduit." ON)
option(USE_IPC "Enable IPC support (using HIP)" OFF)
option(USE_THREADS "Enable workgroup threads to share network queues" OFF)
option(USE_WF_COAL "Enable wavefront message coalescing" OFF)
option(USE_COHERENT_HEAP "Enable support for coherent systems" OFF)
option(USE_MANAGED_HEAP "Enable managed memory" OFF)
option(USE_HOST_HEAP "Enable host memory using malloc/free" OFF)
option(USE_HIP_HOST_HEAP "Enable host memory using hip api" OFF)
option(USE_HEAP_DEVICE_FINEGRAIN "Heap uses GPU memory in finegrain mode" ON)
option(USE_HEAP_DEVICE_UNCACHED "Heap uses GPU memory in uncached mode" OFF)
option(USE_HEAP_DEVICE_COARSEGRAIN "Heap uses GPU memory in coarsegrain mode" OFF)
option(USE_HEAP_MANAGED "Heap uses managed memory" OFF)
option(USE_HEAP_HOST_HIP "Heap uses pinned host memory allocated with hip api" OFF)
option(USE_HEAP_HOST "Heap uses host memory allocated with malloc/free" OFF)
option(USE_ALLOC_DLMALLOC "Enable dlmalloc device memory allocator" ON)
option(USE_ALLOC_POW2BINS "Enable legacy Pow2Bins device memory allocator" OFF)
option(USE_FUNC_CALL "Force compiler to use function calls on library API" OFF)
option(USE_SHARED_CTX "Request support for shared ctx between WG" OFF)
option(USE_SINGLE_NODE "Enable single node support only." OFF)
option(USE_HOST_SIDE_HDP_FLUSH "Use a polling thread to flush the HDP cache on the host." OFF)
option(USE_HDP_FLUSH "Force flush the HDP cache." OFF)
option(USE_HDP_FLUSH_HOST_SIDE "Use a polling thread to flush the HDP cache on the host." OFF)
option(BUILD_FUNCTIONAL_TESTS "Build the functional tests" OFF)
option(BUILD_EXAMPLES "Build the examples" ON)
+8 -5
파일 보기
@@ -29,12 +29,15 @@
#cmakedefine USE_THREADS
#cmakedefine USE_SHARED_CTX
#cmakedefine USE_WF_COAL
#cmakedefine USE_COHERENT_HEAP
#cmakedefine USE_MANAGED_HEAP
#cmakedefine USE_HOST_HEAP
#cmakedefine USE_HIP_HOST_HEAP
#cmakedefine USE_HEAP_DEVICE_FINEGRAIN
#cmakedefine USE_HEAP_DEVICE_UNCACHED
#cmakedefine USE_HEAP_DEVICE_COARSEGRAIN
#cmakedefine USE_HEAP_MANAGED
#cmakedefine USE_HEAP_HOST_HIP
#cmakedefine USE_HEAP_HOST
#cmakedefine USE_ALLOC_DLMALLOC
#cmakedefine USE_ALLOC_POW2BINS
#cmakedefine USE_FUNC_CALL
#cmakedefine USE_SINGLE_NODE
#cmakedefine USE_HOST_SIDE_HDP_FLUSH
#cmakedefine USE_HDP_FLUSH
#cmakedefine USE_HDP_FLUSH_HOST_SIDE
+2 -2
파일 보기
@@ -45,11 +45,11 @@ cmake \
-DPROFILE=OFF \
-DUSE_RO=OFF \
-DUSE_IPC=ON \
-DUSE_COHERENT_HEAP=ON \
-DUSE_THREADS=OFF \
-DUSE_WF_COAL=OFF \
-DUSE_SINGLE_NODE=ON \
-DUSE_HOST_SIDE_HDP_FLUSH=OFF \
-DUSE_HDP_FLUSH=OFF \
-DUSE_HDP_FLUSH_HOST_SIDE=OFF \
-DBUILD_LOCAL_GPU_TARGET_ONLY=OFF \
-DBUILD_FUNCTIONAL_TESTS=ON \
-DBUILD_UNIT_TESTS=ON \
+2 -2
파일 보기
@@ -48,11 +48,11 @@ cmake \
-DPROFILE=OFF \
-DUSE_RO=OFF \
-DUSE_IPC=ON \
-DUSE_COHERENT_HEAP=ON \
-DUSE_THREADS=OFF \
-DUSE_WF_COAL=OFF \
-DUSE_SINGLE_NODE=ON \
-DUSE_HOST_SIDE_HDP_FLUSH=OFF \
-DUSE_HDP_FLUSH=OFF \
-DUSE_HDP_FLUSH_HOST_SIDE=OFF \
-DBUILD_LOCAL_GPU_TARGET_ONLY=OFF \
-DBUILD_TESTS_ONLY=ON \
-DBUILD_FUNCTIONAL_TESTS=ON \
+2 -3
파일 보기
@@ -44,11 +44,10 @@ cmake \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_IPC=ON \
-DUSE_COHERENT_HEAP=ON \
-DUSE_THREADS=OFF \
-DUSE_WF_COAL=OFF \
-DUSE_HOST_SIDE_HDP_FLUSH=OFF\
-DUSE_MANAGED_HEAP=OFF \
-DUSE_HDP_FLUSH=OFF \
-DUSE_HDP_FLUSH_HOST_SIDE=OFF \
-DUSE_RO=ON \
-DBUILD_FUNCTIONAL_TESTS=ON \
-DBUILD_UNIT_TESTS=ON \
+2 -3
파일 보기
@@ -44,11 +44,10 @@ cmake \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_IPC=OFF \
-DUSE_COHERENT_HEAP=ON \
-DUSE_THREADS=OFF \
-DUSE_WF_COAL=OFF \
-DUSE_HOST_SIDE_HDP_FLUSH=OFF\
-DUSE_MANAGED_HEAP=OFF \
-DUSE_HDP_FLUSH=OFF \
-DUSE_HDP_FLUSH_HOST_SIDE=OFF \
-DUSE_RO=ON \
-DBUILD_FUNCTIONAL_TESTS=ON \
-DBUILD_UNIT_TESTS=ON \
+2 -3
파일 보기
@@ -42,11 +42,10 @@ cmake \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_IPC=OFF \
-DUSE_COHERENT_HEAP=ON \
-DUSE_THREADS=OFF \
-DUSE_WF_COAL=OFF \
-DUSE_HOST_SIDE_HDP_FLUSH=OFF\
-DUSE_MANAGED_HEAP=OFF \
-DUSE_HDP_FLUSH=OFF \
-DUSE_HDP_FLUSH_HOST_SIDE=OFF \
-DUSE_RO=ON \
-DBUILD_FUNCTIONAL_TESTS=ON \
-DBUILD_UNIT_TESTS=ON \
+1 -16
파일 보기
@@ -165,23 +165,8 @@ NOWARN(-Wdeprecated-volatile,
)
// clang-format on
__device__ __forceinline__ void __roc_inv() {
#if defined USE_COHERENT_HEAP
#if defined(__gfx906__)
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
// asm volatile("buffer_wbinvl1;");
#endif
#if defined(__gfx942__)
// asm volatile("buffer_inv sc0 sc1;");
#endif
#endif
}
__device__ __forceinline__ void __roc_flush() {
#if defined USE_COHERENT_HEAP
#if not defined USE_HDP_FLUSH
#if defined(__gfx906__)
#endif
#if defined(__gfx908__)
+4 -4
파일 보기
@@ -182,16 +182,16 @@ class NoHdpPolicy {
/*
* Select which one of our HDP policies to use at compile time.
*/
#ifdef USE_COHERENT_HEAP
typedef NoHdpPolicy HdpPolicy;
#else
#if defined USE_HDP_FLUSH
// Only when we are using the IB conduit, we have to use a polling thread to
// flush the HDP cache on the GPU's behalf.
#ifdef USE_HOST_SIDE_HDP_FLUSH
#if defined USE_HDP_FLUSH_HOST_SIDE
typedef HdpHostSideFlushRocmPolicy HdpPolicy;
#else
typedef HdpDeviceSideFlushRocmPolicy HdpPolicy;
#endif
#else
typedef NoHdpPolicy HdpPolicy;
#endif
} // namespace rocshmem
+6 -6
파일 보기
@@ -118,16 +118,16 @@ __host__ HostInterface::HostInterface(HdpPolicy* hdp_policy,
new HostContextWindowInfo(host_comm_world_, heap);
}
#if !defined(USE_COHERENT_HEAP) && !defined(USE_SINGLE_NODE)
#if defined(USE_HDP_FLUSH) && !defined(USE_SINGLE_NODE)
// The single node implementation needs a different path since
// the HDP flush pointers are allocated on the symmetric heap
// and we need to wait for other initialization to happen before
// calling `get_hdp_flush_ptr`.
create_hdp_window();
#endif // defined(USE_COHERENT_HEAP) && !defined(USE_SINGLE_NODE)
#endif // defined(USE_HDP_FLUSH) && !defined(USE_SINGLE_NODE)
}
#ifndef USE_COHERENT_HEAP
#if defined USE_HDP_FLUSH
__host__ void HostInterface::create_hdp_window() {
MPI_Win_create(hdp_policy_->get_hdp_flush_ptr(),
sizeof(unsigned int), /* size of window */
@@ -142,14 +142,14 @@ __host__ void HostInterface::create_hdp_window() {
*/
MPI_Win_lock_all(MPI_MODE_NOCHECK, hdp_win);
}
#endif // USE_COHERENT_HEAP
#endif // USE_HDP_FLUSH
__host__ HostInterface::~HostInterface() {
#ifndef USE_COHERENT_HEAP
#if defined USE_HDP_FLUSH
MPI_Win_unlock_all(hdp_win);
MPI_Win_free(&hdp_win);
#endif // USE_COHERENT_HEAP
#endif // USE_HDP_FLUSH
/* Detroy the pool of contexts */
for (int ctx_i = 0; ctx_i < max_num_ctxs_; ctx_i++) {
+8 -8
파일 보기
@@ -248,16 +248,16 @@ class HostInterface {
template <typename T>
__host__ int test(T *ivars, int cmp, T val, WindowInfo* window_info);
#ifndef USE_COHERENT_HEAP
#if defined USE_HDP_FLUSH
__host__ void create_hdp_window();
#endif // USE_COHERENT_HEAP
#endif // USE_HDP_FLUSH
private:
/**************************************************************************
**************************** INTERNAL METHODS ****************************
*************************************************************************/
__host__ void flush_remote_hdps() {
#ifndef USE_COHERENT_HEAP
#if defined USE_HDP_FLUSH
unsigned flush_val{HdpPolicy::HDP_FLUSH_VAL};
for (size_t i{0}; i < num_pes_; i++) {
if (i == my_pe_) {
@@ -266,15 +266,15 @@ class HostInterface {
MPI_Put(&flush_val, 1, MPI_UNSIGNED, i, 0, 1, MPI_UNSIGNED, hdp_win);
}
MPI_Win_flush_all(hdp_win);
#endif // USE_COHERENT_HEAP
#endif // USE_HDP_FLUSH
}
__host__ void flush_remote_hdp(int pe) {
#ifndef USE_COHERENT_HEAP
#if defined USE_HDP_FLUSH
unsigned flush_val{HdpPolicy::HDP_FLUSH_VAL};
MPI_Put(&flush_val, 1, MPI_UNSIGNED, pe, 0, 1, MPI_UNSIGNED, hdp_win);
MPI_Win_flush(pe, hdp_win);
#endif // USE_COHERENT_HEAP
#endif // USE_HDP_FLUSH
}
__host__ void initiate_put(void* dest, const void* source, size_t nelems,
@@ -333,12 +333,12 @@ class HostInterface {
*/
int num_pes_{0};
#ifndef USE_COHERENT_HEAP
#if defined USE_HDP_FLUSH
/**
* @brief MPI window for hdp flushing
*/
MPI_Win hdp_win;
#endif // USE_COHERENT_HEAP
#endif // USE_HDP_FLUSH
/**
* @brief Max number of contexts for the application
+17 -8
파일 보기
@@ -40,16 +40,25 @@
namespace rocshmem {
#if defined USE_MANAGED_HEAP
using HEAP_T = HeapMemory<HIPAllocatorManaged>;
#elif defined USE_COHERENT_HEAP
// Compilation error 'HEAP_T redefined' indicates that user had more than one
// USE_HEAP_* ON when configuring. Use ccmake to select only one.
#if defined USE_HEAP_DEVICE_COARSEGRAIN
using HEAP_T = HeapMemory<HIPAllocator>;
#elif defined USE_HOST_HEAP
using HEAP_T = HeapMemory<HostAllocator>;
#elif defined USE_HIP_HOST_HEAP
#endif
#if defined USE_HEAP_DEVICE_FINEGRAIN
using HEAP_T = HeapMemory<HIPAllocatorFinegrained>;
#endif
#if defined USE_HEAP_DEVICE_UNCACHED
using HEAP_T = HeapMemory<HIPAllocatorUncached>;
#endif
#if defined USE_HEAP_MANAGED
using HEAP_T = HeapMemory<HIPAllocatorManaged>;
#endif
#if defined USE_HEAP_HOST_HIP
using HEAP_T = HeapMemory<HIPHostAllocator>;
#else
using HEAP_T = HeapMemory<HIPDefaultFinegrainedAllocator>;
#endif
#if defined USE_HEAP_HOST
using HEAP_T = HeapMemory<HostAllocator>;
#endif
} // namespace rocshmem
+5 -1
파일 보기
@@ -36,12 +36,15 @@
#include <cstdlib>
#include <limits>
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
#include "memory_allocator.hpp"
// `hipDeviceMallocUncached` was introduced at ROCm 5.5
#if (HIP_VERSION_MAJOR > 5) || \
(HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR >= 5)
#define HIP_SUPPORTS_MALLOC_UNCACHED
#elif defined USE_HEAP_DEVICE_UNCACHED
#error "USE_HEAP_DEVICE_UNCACHED unsupported in this HIP version"
#endif
namespace rocshmem {
@@ -57,13 +60,14 @@ class HIPAllocatorFinegrained : public MemoryAllocator {
hipDeviceMallocFinegrained) {}
};
#ifdef HIP_SUPPORTS_MALLOC_UNCACHED
#if defined HIP_SUPPORTS_MALLOC_UNCACHED
class HIPAllocatorUncached : public MemoryAllocator {
public:
HIPAllocatorUncached()
: MemoryAllocator(hipExtMallocWithFlags, hipFree,
hipDeviceMallocUncached) {}
};
// The default fine-grained coherence allocator is the uncached allocator
using HIPDefaultFinegrainedAllocator = HIPAllocatorUncached;
#else