Use finegrain allocator by default (#140)
* Use FineGrained allocator for heap by default, consolidate all types of
allocators under saner cmake controls
Co-authored-by: Yiltan <ytemucin@amd.com>
* Uncached may not be only for debug
Need to include the rocshmem config otherwise produce an inconsistent
build with different allocators used in different files
* Undo this pr adding presumably useless hip_host_allocator_noncoherent
* Rename HEAP_IS_COHERENT/USE_COHERENT_HEAP to USE_HDP_FLUSH as the former
was misleading
* Remove unused __roc_inv()
---------
Co-authored-by: Yiltan <ytemucin@amd.com>
[ROCm/rocshmem commit: 41fd9e2d57]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
87d2efa430
Коммит
f3345dbf05
@@ -45,16 +45,19 @@ option(USE_RO "Enable RO conduit." ON)
|
||||
option(USE_IPC "Enable IPC support (using HIP)" OFF)
|
||||
option(USE_THREADS "Enable workgroup threads to share network queues" OFF)
|
||||
option(USE_WF_COAL "Enable wavefront message coalescing" OFF)
|
||||
option(USE_COHERENT_HEAP "Enable support for coherent systems" OFF)
|
||||
option(USE_MANAGED_HEAP "Enable managed memory" OFF)
|
||||
option(USE_HOST_HEAP "Enable host memory using malloc/free" OFF)
|
||||
option(USE_HIP_HOST_HEAP "Enable host memory using hip api" OFF)
|
||||
option(USE_HEAP_DEVICE_FINEGRAIN "Heap uses GPU memory in finegrain mode" ON)
|
||||
option(USE_HEAP_DEVICE_UNCACHED "Heap uses GPU memory in uncached mode" OFF)
|
||||
option(USE_HEAP_DEVICE_COARSEGRAIN "Heap uses GPU memory in coarsegrain mode" OFF)
|
||||
option(USE_HEAP_MANAGED "Heap uses managed memory" OFF)
|
||||
option(USE_HEAP_HOST_HIP "Heap uses pinned host memory allocated with hip api" OFF)
|
||||
option(USE_HEAP_HOST "Heap uses host memory allocated with malloc/free" OFF)
|
||||
option(USE_ALLOC_DLMALLOC "Enable dlmalloc device memory allocator" ON)
|
||||
option(USE_ALLOC_POW2BINS "Enable legacy Pow2Bins device memory allocator" OFF)
|
||||
option(USE_FUNC_CALL "Force compiler to use function calls on library API" OFF)
|
||||
option(USE_SHARED_CTX "Request support for shared ctx between WG" OFF)
|
||||
option(USE_SINGLE_NODE "Enable single node support only." OFF)
|
||||
option(USE_HOST_SIDE_HDP_FLUSH "Use a polling thread to flush the HDP cache on the host." OFF)
|
||||
option(USE_HDP_FLUSH "Force flush the HDP cache." OFF)
|
||||
option(USE_HDP_FLUSH_HOST_SIDE "Use a polling thread to flush the HDP cache on the host." OFF)
|
||||
|
||||
option(BUILD_FUNCTIONAL_TESTS "Build the functional tests" OFF)
|
||||
option(BUILD_EXAMPLES "Build the examples" ON)
|
||||
|
||||
@@ -29,12 +29,15 @@
|
||||
#cmakedefine USE_THREADS
|
||||
#cmakedefine USE_SHARED_CTX
|
||||
#cmakedefine USE_WF_COAL
|
||||
#cmakedefine USE_COHERENT_HEAP
|
||||
#cmakedefine USE_MANAGED_HEAP
|
||||
#cmakedefine USE_HOST_HEAP
|
||||
#cmakedefine USE_HIP_HOST_HEAP
|
||||
#cmakedefine USE_HEAP_DEVICE_FINEGRAIN
|
||||
#cmakedefine USE_HEAP_DEVICE_UNCACHED
|
||||
#cmakedefine USE_HEAP_DEVICE_COARSEGRAIN
|
||||
#cmakedefine USE_HEAP_MANAGED
|
||||
#cmakedefine USE_HEAP_HOST_HIP
|
||||
#cmakedefine USE_HEAP_HOST
|
||||
#cmakedefine USE_ALLOC_DLMALLOC
|
||||
#cmakedefine USE_ALLOC_POW2BINS
|
||||
#cmakedefine USE_FUNC_CALL
|
||||
#cmakedefine USE_SINGLE_NODE
|
||||
#cmakedefine USE_HOST_SIDE_HDP_FLUSH
|
||||
#cmakedefine USE_HDP_FLUSH
|
||||
#cmakedefine USE_HDP_FLUSH_HOST_SIDE
|
||||
|
||||
@@ -45,11 +45,11 @@ cmake \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_RO=OFF \
|
||||
-DUSE_IPC=ON \
|
||||
-DUSE_COHERENT_HEAP=ON \
|
||||
-DUSE_THREADS=OFF \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
-DUSE_SINGLE_NODE=ON \
|
||||
-DUSE_HOST_SIDE_HDP_FLUSH=OFF \
|
||||
-DUSE_HDP_FLUSH=OFF \
|
||||
-DUSE_HDP_FLUSH_HOST_SIDE=OFF \
|
||||
-DBUILD_LOCAL_GPU_TARGET_ONLY=OFF \
|
||||
-DBUILD_FUNCTIONAL_TESTS=ON \
|
||||
-DBUILD_UNIT_TESTS=ON \
|
||||
|
||||
@@ -48,11 +48,11 @@ cmake \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_RO=OFF \
|
||||
-DUSE_IPC=ON \
|
||||
-DUSE_COHERENT_HEAP=ON \
|
||||
-DUSE_THREADS=OFF \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
-DUSE_SINGLE_NODE=ON \
|
||||
-DUSE_HOST_SIDE_HDP_FLUSH=OFF \
|
||||
-DUSE_HDP_FLUSH=OFF \
|
||||
-DUSE_HDP_FLUSH_HOST_SIDE=OFF \
|
||||
-DBUILD_LOCAL_GPU_TARGET_ONLY=OFF \
|
||||
-DBUILD_TESTS_ONLY=ON \
|
||||
-DBUILD_FUNCTIONAL_TESTS=ON \
|
||||
|
||||
@@ -44,11 +44,10 @@ cmake \
|
||||
-DDEBUG=OFF \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_IPC=ON \
|
||||
-DUSE_COHERENT_HEAP=ON \
|
||||
-DUSE_THREADS=OFF \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
-DUSE_HOST_SIDE_HDP_FLUSH=OFF\
|
||||
-DUSE_MANAGED_HEAP=OFF \
|
||||
-DUSE_HDP_FLUSH=OFF \
|
||||
-DUSE_HDP_FLUSH_HOST_SIDE=OFF \
|
||||
-DUSE_RO=ON \
|
||||
-DBUILD_FUNCTIONAL_TESTS=ON \
|
||||
-DBUILD_UNIT_TESTS=ON \
|
||||
|
||||
@@ -44,11 +44,10 @@ cmake \
|
||||
-DDEBUG=OFF \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_IPC=OFF \
|
||||
-DUSE_COHERENT_HEAP=ON \
|
||||
-DUSE_THREADS=OFF \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
-DUSE_HOST_SIDE_HDP_FLUSH=OFF\
|
||||
-DUSE_MANAGED_HEAP=OFF \
|
||||
-DUSE_HDP_FLUSH=OFF \
|
||||
-DUSE_HDP_FLUSH_HOST_SIDE=OFF \
|
||||
-DUSE_RO=ON \
|
||||
-DBUILD_FUNCTIONAL_TESTS=ON \
|
||||
-DBUILD_UNIT_TESTS=ON \
|
||||
|
||||
@@ -42,11 +42,10 @@ cmake \
|
||||
-DDEBUG=OFF \
|
||||
-DPROFILE=OFF \
|
||||
-DUSE_IPC=OFF \
|
||||
-DUSE_COHERENT_HEAP=ON \
|
||||
-DUSE_THREADS=OFF \
|
||||
-DUSE_WF_COAL=OFF \
|
||||
-DUSE_HOST_SIDE_HDP_FLUSH=OFF\
|
||||
-DUSE_MANAGED_HEAP=OFF \
|
||||
-DUSE_HDP_FLUSH=OFF \
|
||||
-DUSE_HDP_FLUSH_HOST_SIDE=OFF \
|
||||
-DUSE_RO=ON \
|
||||
-DBUILD_FUNCTIONAL_TESTS=ON \
|
||||
-DBUILD_UNIT_TESTS=ON \
|
||||
|
||||
@@ -165,23 +165,8 @@ NOWARN(-Wdeprecated-volatile,
|
||||
)
|
||||
// clang-format on
|
||||
|
||||
__device__ __forceinline__ void __roc_inv() {
|
||||
#if defined USE_COHERENT_HEAP
|
||||
#if defined(__gfx906__)
|
||||
#endif
|
||||
#if defined(__gfx908__)
|
||||
#endif
|
||||
#if defined(__gfx90a__)
|
||||
// asm volatile("buffer_wbinvl1;");
|
||||
#endif
|
||||
#if defined(__gfx942__)
|
||||
// asm volatile("buffer_inv sc0 sc1;");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void __roc_flush() {
|
||||
#if defined USE_COHERENT_HEAP
|
||||
#if not defined USE_HDP_FLUSH
|
||||
#if defined(__gfx906__)
|
||||
#endif
|
||||
#if defined(__gfx908__)
|
||||
|
||||
@@ -182,16 +182,16 @@ class NoHdpPolicy {
|
||||
/*
|
||||
* Select which one of our HDP policies to use at compile time.
|
||||
*/
|
||||
#ifdef USE_COHERENT_HEAP
|
||||
typedef NoHdpPolicy HdpPolicy;
|
||||
#else
|
||||
#if defined USE_HDP_FLUSH
|
||||
// Only when we are using the IB conduit, we have to use a polling thread to
|
||||
// flush the HDP cache on the GPU's behalf.
|
||||
#ifdef USE_HOST_SIDE_HDP_FLUSH
|
||||
#if defined USE_HDP_FLUSH_HOST_SIDE
|
||||
typedef HdpHostSideFlushRocmPolicy HdpPolicy;
|
||||
#else
|
||||
typedef HdpDeviceSideFlushRocmPolicy HdpPolicy;
|
||||
#endif
|
||||
#else
|
||||
typedef NoHdpPolicy HdpPolicy;
|
||||
#endif
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
@@ -118,16 +118,16 @@ __host__ HostInterface::HostInterface(HdpPolicy* hdp_policy,
|
||||
new HostContextWindowInfo(host_comm_world_, heap);
|
||||
}
|
||||
|
||||
#if !defined(USE_COHERENT_HEAP) && !defined(USE_SINGLE_NODE)
|
||||
#if defined(USE_HDP_FLUSH) && !defined(USE_SINGLE_NODE)
|
||||
// The single node implementation needs a different path since
|
||||
// the HDP flush pointers are allocated on the symmetric heap
|
||||
// and we need to wait for other initialization to happen before
|
||||
// calling `get_hdp_flush_ptr`.
|
||||
create_hdp_window();
|
||||
#endif // defined(USE_COHERENT_HEAP) && !defined(USE_SINGLE_NODE)
|
||||
#endif // defined(USE_HDP_FLUSH) && !defined(USE_SINGLE_NODE)
|
||||
}
|
||||
|
||||
#ifndef USE_COHERENT_HEAP
|
||||
#if defined USE_HDP_FLUSH
|
||||
__host__ void HostInterface::create_hdp_window() {
|
||||
MPI_Win_create(hdp_policy_->get_hdp_flush_ptr(),
|
||||
sizeof(unsigned int), /* size of window */
|
||||
@@ -142,14 +142,14 @@ __host__ void HostInterface::create_hdp_window() {
|
||||
*/
|
||||
MPI_Win_lock_all(MPI_MODE_NOCHECK, hdp_win);
|
||||
}
|
||||
#endif // USE_COHERENT_HEAP
|
||||
#endif // USE_HDP_FLUSH
|
||||
|
||||
__host__ HostInterface::~HostInterface() {
|
||||
#ifndef USE_COHERENT_HEAP
|
||||
#if defined USE_HDP_FLUSH
|
||||
MPI_Win_unlock_all(hdp_win);
|
||||
|
||||
MPI_Win_free(&hdp_win);
|
||||
#endif // USE_COHERENT_HEAP
|
||||
#endif // USE_HDP_FLUSH
|
||||
|
||||
/* Detroy the pool of contexts */
|
||||
for (int ctx_i = 0; ctx_i < max_num_ctxs_; ctx_i++) {
|
||||
|
||||
@@ -248,16 +248,16 @@ class HostInterface {
|
||||
template <typename T>
|
||||
__host__ int test(T *ivars, int cmp, T val, WindowInfo* window_info);
|
||||
|
||||
#ifndef USE_COHERENT_HEAP
|
||||
#if defined USE_HDP_FLUSH
|
||||
__host__ void create_hdp_window();
|
||||
#endif // USE_COHERENT_HEAP
|
||||
#endif // USE_HDP_FLUSH
|
||||
|
||||
private:
|
||||
/**************************************************************************
|
||||
**************************** INTERNAL METHODS ****************************
|
||||
*************************************************************************/
|
||||
__host__ void flush_remote_hdps() {
|
||||
#ifndef USE_COHERENT_HEAP
|
||||
#if defined USE_HDP_FLUSH
|
||||
unsigned flush_val{HdpPolicy::HDP_FLUSH_VAL};
|
||||
for (size_t i{0}; i < num_pes_; i++) {
|
||||
if (i == my_pe_) {
|
||||
@@ -266,15 +266,15 @@ class HostInterface {
|
||||
MPI_Put(&flush_val, 1, MPI_UNSIGNED, i, 0, 1, MPI_UNSIGNED, hdp_win);
|
||||
}
|
||||
MPI_Win_flush_all(hdp_win);
|
||||
#endif // USE_COHERENT_HEAP
|
||||
#endif // USE_HDP_FLUSH
|
||||
}
|
||||
|
||||
__host__ void flush_remote_hdp(int pe) {
|
||||
#ifndef USE_COHERENT_HEAP
|
||||
#if defined USE_HDP_FLUSH
|
||||
unsigned flush_val{HdpPolicy::HDP_FLUSH_VAL};
|
||||
MPI_Put(&flush_val, 1, MPI_UNSIGNED, pe, 0, 1, MPI_UNSIGNED, hdp_win);
|
||||
MPI_Win_flush(pe, hdp_win);
|
||||
#endif // USE_COHERENT_HEAP
|
||||
#endif // USE_HDP_FLUSH
|
||||
}
|
||||
|
||||
__host__ void initiate_put(void* dest, const void* source, size_t nelems,
|
||||
@@ -333,12 +333,12 @@ class HostInterface {
|
||||
*/
|
||||
int num_pes_{0};
|
||||
|
||||
#ifndef USE_COHERENT_HEAP
|
||||
#if defined USE_HDP_FLUSH
|
||||
/**
|
||||
* @brief MPI window for hdp flushing
|
||||
*/
|
||||
MPI_Win hdp_win;
|
||||
#endif // USE_COHERENT_HEAP
|
||||
#endif // USE_HDP_FLUSH
|
||||
|
||||
/**
|
||||
* @brief Max number of contexts for the application
|
||||
|
||||
@@ -40,16 +40,25 @@
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
#if defined USE_MANAGED_HEAP
|
||||
using HEAP_T = HeapMemory<HIPAllocatorManaged>;
|
||||
#elif defined USE_COHERENT_HEAP
|
||||
// Compilation error 'HEAP_T redefined' indicates that user had more than one
|
||||
// USE_HEAP_* ON when configuring. Use ccmake to select only one.
|
||||
#if defined USE_HEAP_DEVICE_COARSEGRAIN
|
||||
using HEAP_T = HeapMemory<HIPAllocator>;
|
||||
#elif defined USE_HOST_HEAP
|
||||
using HEAP_T = HeapMemory<HostAllocator>;
|
||||
#elif defined USE_HIP_HOST_HEAP
|
||||
#endif
|
||||
#if defined USE_HEAP_DEVICE_FINEGRAIN
|
||||
using HEAP_T = HeapMemory<HIPAllocatorFinegrained>;
|
||||
#endif
|
||||
#if defined USE_HEAP_DEVICE_UNCACHED
|
||||
using HEAP_T = HeapMemory<HIPAllocatorUncached>;
|
||||
#endif
|
||||
#if defined USE_HEAP_MANAGED
|
||||
using HEAP_T = HeapMemory<HIPAllocatorManaged>;
|
||||
#endif
|
||||
#if defined USE_HEAP_HOST_HIP
|
||||
using HEAP_T = HeapMemory<HIPHostAllocator>;
|
||||
#else
|
||||
using HEAP_T = HeapMemory<HIPDefaultFinegrainedAllocator>;
|
||||
#endif
|
||||
#if defined USE_HEAP_HOST
|
||||
using HEAP_T = HeapMemory<HostAllocator>;
|
||||
#endif
|
||||
|
||||
} // namespace rocshmem
|
||||
|
||||
@@ -36,12 +36,15 @@
|
||||
#include <cstdlib>
|
||||
#include <limits>
|
||||
|
||||
#include "rocshmem_config.h" // NOLINT(build/include_subdir)
|
||||
#include "memory_allocator.hpp"
|
||||
|
||||
// `hipDeviceMallocUncached` was introduced at ROCm 5.5
|
||||
#if (HIP_VERSION_MAJOR > 5) || \
|
||||
(HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR >= 5)
|
||||
#define HIP_SUPPORTS_MALLOC_UNCACHED
|
||||
#elif defined USE_HEAP_DEVICE_UNCACHED
|
||||
#error "USE_HEAP_DEVICE_UNCACHED unsupported in this HIP version"
|
||||
#endif
|
||||
namespace rocshmem {
|
||||
|
||||
@@ -57,13 +60,14 @@ class HIPAllocatorFinegrained : public MemoryAllocator {
|
||||
hipDeviceMallocFinegrained) {}
|
||||
};
|
||||
|
||||
#ifdef HIP_SUPPORTS_MALLOC_UNCACHED
|
||||
#if defined HIP_SUPPORTS_MALLOC_UNCACHED
|
||||
class HIPAllocatorUncached : public MemoryAllocator {
|
||||
public:
|
||||
HIPAllocatorUncached()
|
||||
: MemoryAllocator(hipExtMallocWithFlags, hipFree,
|
||||
hipDeviceMallocUncached) {}
|
||||
};
|
||||
|
||||
// The default fine-grained coherence allocator is the uncached allocator
|
||||
using HIPDefaultFinegrainedAllocator = HIPAllocatorUncached;
|
||||
#else
|
||||
|
||||
Ссылка в новой задаче
Block a user