diff --git a/projects/rocshmem/CMakeLists.txt b/projects/rocshmem/CMakeLists.txt index 8d39020f0d..300ed0690d 100644 --- a/projects/rocshmem/CMakeLists.txt +++ b/projects/rocshmem/CMakeLists.txt @@ -45,16 +45,19 @@ option(USE_RO "Enable RO conduit." ON) option(USE_IPC "Enable IPC support (using HIP)" OFF) option(USE_THREADS "Enable workgroup threads to share network queues" OFF) option(USE_WF_COAL "Enable wavefront message coalescing" OFF) -option(USE_COHERENT_HEAP "Enable support for coherent systems" OFF) -option(USE_MANAGED_HEAP "Enable managed memory" OFF) -option(USE_HOST_HEAP "Enable host memory using malloc/free" OFF) -option(USE_HIP_HOST_HEAP "Enable host memory using hip api" OFF) +option(USE_HEAP_DEVICE_FINEGRAIN "Heap uses GPU memory in finegrain mode" ON) +option(USE_HEAP_DEVICE_UNCACHED "Heap uses GPU memory in uncached mode" OFF) +option(USE_HEAP_DEVICE_COARSEGRAIN "Heap uses GPU memory in coarsegrain mode" OFF) +option(USE_HEAP_MANAGED "Heap uses managed memory" OFF) +option(USE_HEAP_HOST_HIP "Heap uses pinned host memory allocated with hip api" OFF) +option(USE_HEAP_HOST "Heap uses host memory allocated with malloc/free" OFF) option(USE_ALLOC_DLMALLOC "Enable dlmalloc device memory allocator" ON) option(USE_ALLOC_POW2BINS "Enable legacy Pow2Bins device memory allocator" OFF) option(USE_FUNC_CALL "Force compiler to use function calls on library API" OFF) option(USE_SHARED_CTX "Request support for shared ctx between WG" OFF) option(USE_SINGLE_NODE "Enable single node support only." OFF) -option(USE_HOST_SIDE_HDP_FLUSH "Use a polling thread to flush the HDP cache on the host." OFF) +option(USE_HDP_FLUSH "Force flush the HDP cache." OFF) +option(USE_HDP_FLUSH_HOST_SIDE "Use a polling thread to flush the HDP cache on the host." OFF) option(BUILD_FUNCTIONAL_TESTS "Build the functional tests" OFF) option(BUILD_EXAMPLES "Build the examples" ON) diff --git a/projects/rocshmem/cmake/rocshmem_config.h.in b/projects/rocshmem/cmake/rocshmem_config.h.in index 5e40b5997d..36c5aeae24 100644 --- a/projects/rocshmem/cmake/rocshmem_config.h.in +++ b/projects/rocshmem/cmake/rocshmem_config.h.in @@ -29,12 +29,15 @@ #cmakedefine USE_THREADS #cmakedefine USE_SHARED_CTX #cmakedefine USE_WF_COAL -#cmakedefine USE_COHERENT_HEAP -#cmakedefine USE_MANAGED_HEAP -#cmakedefine USE_HOST_HEAP -#cmakedefine USE_HIP_HOST_HEAP +#cmakedefine USE_HEAP_DEVICE_FINEGRAIN +#cmakedefine USE_HEAP_DEVICE_UNCACHED +#cmakedefine USE_HEAP_DEVICE_COARSEGRAIN +#cmakedefine USE_HEAP_MANAGED +#cmakedefine USE_HEAP_HOST_HIP +#cmakedefine USE_HEAP_HOST #cmakedefine USE_ALLOC_DLMALLOC #cmakedefine USE_ALLOC_POW2BINS #cmakedefine USE_FUNC_CALL #cmakedefine USE_SINGLE_NODE -#cmakedefine USE_HOST_SIDE_HDP_FLUSH +#cmakedefine USE_HDP_FLUSH +#cmakedefine USE_HDP_FLUSH_HOST_SIDE diff --git a/projects/rocshmem/scripts/build_configs/ipc_single b/projects/rocshmem/scripts/build_configs/ipc_single index df9674f5f5..83a82d4756 100755 --- a/projects/rocshmem/scripts/build_configs/ipc_single +++ b/projects/rocshmem/scripts/build_configs/ipc_single @@ -45,11 +45,11 @@ cmake \ -DPROFILE=OFF \ -DUSE_RO=OFF \ -DUSE_IPC=ON \ - -DUSE_COHERENT_HEAP=ON \ -DUSE_THREADS=OFF \ -DUSE_WF_COAL=OFF \ -DUSE_SINGLE_NODE=ON \ - -DUSE_HOST_SIDE_HDP_FLUSH=OFF \ + -DUSE_HDP_FLUSH=OFF \ + -DUSE_HDP_FLUSH_HOST_SIDE=OFF \ -DBUILD_LOCAL_GPU_TARGET_ONLY=OFF \ -DBUILD_FUNCTIONAL_TESTS=ON \ -DBUILD_UNIT_TESTS=ON \ diff --git a/projects/rocshmem/scripts/build_configs/ipc_tests_only b/projects/rocshmem/scripts/build_configs/ipc_tests_only index c78fa78692..7caacc5941 100755 --- a/projects/rocshmem/scripts/build_configs/ipc_tests_only +++ b/projects/rocshmem/scripts/build_configs/ipc_tests_only @@ -48,11 +48,11 @@ cmake \ -DPROFILE=OFF \ -DUSE_RO=OFF \ -DUSE_IPC=ON \ - -DUSE_COHERENT_HEAP=ON \ -DUSE_THREADS=OFF \ -DUSE_WF_COAL=OFF \ -DUSE_SINGLE_NODE=ON \ - -DUSE_HOST_SIDE_HDP_FLUSH=OFF \ + -DUSE_HDP_FLUSH=OFF \ + -DUSE_HDP_FLUSH_HOST_SIDE=OFF \ -DBUILD_LOCAL_GPU_TARGET_ONLY=OFF \ -DBUILD_TESTS_ONLY=ON \ -DBUILD_FUNCTIONAL_TESTS=ON \ diff --git a/projects/rocshmem/scripts/build_configs/ro_ipc b/projects/rocshmem/scripts/build_configs/ro_ipc index e0fffed9fd..b39438e665 100755 --- a/projects/rocshmem/scripts/build_configs/ro_ipc +++ b/projects/rocshmem/scripts/build_configs/ro_ipc @@ -44,11 +44,10 @@ cmake \ -DDEBUG=OFF \ -DPROFILE=OFF \ -DUSE_IPC=ON \ - -DUSE_COHERENT_HEAP=ON \ -DUSE_THREADS=OFF \ -DUSE_WF_COAL=OFF \ - -DUSE_HOST_SIDE_HDP_FLUSH=OFF\ - -DUSE_MANAGED_HEAP=OFF \ + -DUSE_HDP_FLUSH=OFF \ + -DUSE_HDP_FLUSH_HOST_SIDE=OFF \ -DUSE_RO=ON \ -DBUILD_FUNCTIONAL_TESTS=ON \ -DBUILD_UNIT_TESTS=ON \ diff --git a/projects/rocshmem/scripts/build_configs/ro_net b/projects/rocshmem/scripts/build_configs/ro_net index 5ee99d12af..abdcffcdd4 100755 --- a/projects/rocshmem/scripts/build_configs/ro_net +++ b/projects/rocshmem/scripts/build_configs/ro_net @@ -44,11 +44,10 @@ cmake \ -DDEBUG=OFF \ -DPROFILE=OFF \ -DUSE_IPC=OFF \ - -DUSE_COHERENT_HEAP=ON \ -DUSE_THREADS=OFF \ -DUSE_WF_COAL=OFF \ - -DUSE_HOST_SIDE_HDP_FLUSH=OFF\ - -DUSE_MANAGED_HEAP=OFF \ + -DUSE_HDP_FLUSH=OFF \ + -DUSE_HDP_FLUSH_HOST_SIDE=OFF \ -DUSE_RO=ON \ -DBUILD_FUNCTIONAL_TESTS=ON \ -DBUILD_UNIT_TESTS=ON \ diff --git a/projects/rocshmem/scripts/build_configs/ro_net_debug b/projects/rocshmem/scripts/build_configs/ro_net_debug index 758aff5715..a7c42ba234 100755 --- a/projects/rocshmem/scripts/build_configs/ro_net_debug +++ b/projects/rocshmem/scripts/build_configs/ro_net_debug @@ -42,11 +42,10 @@ cmake \ -DDEBUG=OFF \ -DPROFILE=OFF \ -DUSE_IPC=OFF \ - -DUSE_COHERENT_HEAP=ON \ -DUSE_THREADS=OFF \ -DUSE_WF_COAL=OFF \ - -DUSE_HOST_SIDE_HDP_FLUSH=OFF\ - -DUSE_MANAGED_HEAP=OFF \ + -DUSE_HDP_FLUSH=OFF \ + -DUSE_HDP_FLUSH_HOST_SIDE=OFF \ -DUSE_RO=ON \ -DBUILD_FUNCTIONAL_TESTS=ON \ -DBUILD_UNIT_TESTS=ON \ diff --git a/projects/rocshmem/src/assembly.hpp b/projects/rocshmem/src/assembly.hpp index 791a8e93d9..aa48099ac1 100644 --- a/projects/rocshmem/src/assembly.hpp +++ b/projects/rocshmem/src/assembly.hpp @@ -165,23 +165,8 @@ NOWARN(-Wdeprecated-volatile, ) // clang-format on -__device__ __forceinline__ void __roc_inv() { -#if defined USE_COHERENT_HEAP -#if defined(__gfx906__) -#endif -#if defined(__gfx908__) -#endif -#if defined(__gfx90a__) -// asm volatile("buffer_wbinvl1;"); -#endif -#if defined(__gfx942__) -// asm volatile("buffer_inv sc0 sc1;"); -#endif -#endif -} - __device__ __forceinline__ void __roc_flush() { -#if defined USE_COHERENT_HEAP +#if not defined USE_HDP_FLUSH #if defined(__gfx906__) #endif #if defined(__gfx908__) diff --git a/projects/rocshmem/src/hdp_policy.hpp b/projects/rocshmem/src/hdp_policy.hpp index 7491dbf18a..1f37e46f47 100644 --- a/projects/rocshmem/src/hdp_policy.hpp +++ b/projects/rocshmem/src/hdp_policy.hpp @@ -182,16 +182,16 @@ class NoHdpPolicy { /* * Select which one of our HDP policies to use at compile time. */ -#ifdef USE_COHERENT_HEAP -typedef NoHdpPolicy HdpPolicy; -#else +#if defined USE_HDP_FLUSH // Only when we are using the IB conduit, we have to use a polling thread to // flush the HDP cache on the GPU's behalf. -#ifdef USE_HOST_SIDE_HDP_FLUSH +#if defined USE_HDP_FLUSH_HOST_SIDE typedef HdpHostSideFlushRocmPolicy HdpPolicy; #else typedef HdpDeviceSideFlushRocmPolicy HdpPolicy; #endif +#else +typedef NoHdpPolicy HdpPolicy; #endif } // namespace rocshmem diff --git a/projects/rocshmem/src/host/host.cpp b/projects/rocshmem/src/host/host.cpp index 45a15c1de4..edcd8953c1 100644 --- a/projects/rocshmem/src/host/host.cpp +++ b/projects/rocshmem/src/host/host.cpp @@ -118,16 +118,16 @@ __host__ HostInterface::HostInterface(HdpPolicy* hdp_policy, new HostContextWindowInfo(host_comm_world_, heap); } -#if !defined(USE_COHERENT_HEAP) && !defined(USE_SINGLE_NODE) +#if defined(USE_HDP_FLUSH) && !defined(USE_SINGLE_NODE) // The single node implementation needs a different path since // the HDP flush pointers are allocated on the symmetric heap // and we need to wait for other initialization to happen before // calling `get_hdp_flush_ptr`. create_hdp_window(); -#endif // defined(USE_COHERENT_HEAP) && !defined(USE_SINGLE_NODE) +#endif // defined(USE_HDP_FLUSH) && !defined(USE_SINGLE_NODE) } -#ifndef USE_COHERENT_HEAP +#if defined USE_HDP_FLUSH __host__ void HostInterface::create_hdp_window() { MPI_Win_create(hdp_policy_->get_hdp_flush_ptr(), sizeof(unsigned int), /* size of window */ @@ -142,14 +142,14 @@ __host__ void HostInterface::create_hdp_window() { */ MPI_Win_lock_all(MPI_MODE_NOCHECK, hdp_win); } -#endif // USE_COHERENT_HEAP +#endif // USE_HDP_FLUSH __host__ HostInterface::~HostInterface() { -#ifndef USE_COHERENT_HEAP +#if defined USE_HDP_FLUSH MPI_Win_unlock_all(hdp_win); MPI_Win_free(&hdp_win); -#endif // USE_COHERENT_HEAP +#endif // USE_HDP_FLUSH /* Detroy the pool of contexts */ for (int ctx_i = 0; ctx_i < max_num_ctxs_; ctx_i++) { diff --git a/projects/rocshmem/src/host/host.hpp b/projects/rocshmem/src/host/host.hpp index 0255eddfd1..cbd4ebb1ab 100644 --- a/projects/rocshmem/src/host/host.hpp +++ b/projects/rocshmem/src/host/host.hpp @@ -248,16 +248,16 @@ class HostInterface { template __host__ int test(T *ivars, int cmp, T val, WindowInfo* window_info); -#ifndef USE_COHERENT_HEAP +#if defined USE_HDP_FLUSH __host__ void create_hdp_window(); -#endif // USE_COHERENT_HEAP +#endif // USE_HDP_FLUSH private: /************************************************************************** **************************** INTERNAL METHODS **************************** *************************************************************************/ __host__ void flush_remote_hdps() { -#ifndef USE_COHERENT_HEAP +#if defined USE_HDP_FLUSH unsigned flush_val{HdpPolicy::HDP_FLUSH_VAL}; for (size_t i{0}; i < num_pes_; i++) { if (i == my_pe_) { @@ -266,15 +266,15 @@ class HostInterface { MPI_Put(&flush_val, 1, MPI_UNSIGNED, i, 0, 1, MPI_UNSIGNED, hdp_win); } MPI_Win_flush_all(hdp_win); -#endif // USE_COHERENT_HEAP +#endif // USE_HDP_FLUSH } __host__ void flush_remote_hdp(int pe) { -#ifndef USE_COHERENT_HEAP +#if defined USE_HDP_FLUSH unsigned flush_val{HdpPolicy::HDP_FLUSH_VAL}; MPI_Put(&flush_val, 1, MPI_UNSIGNED, pe, 0, 1, MPI_UNSIGNED, hdp_win); MPI_Win_flush(pe, hdp_win); -#endif // USE_COHERENT_HEAP +#endif // USE_HDP_FLUSH } __host__ void initiate_put(void* dest, const void* source, size_t nelems, @@ -333,12 +333,12 @@ class HostInterface { */ int num_pes_{0}; -#ifndef USE_COHERENT_HEAP +#if defined USE_HDP_FLUSH /** * @brief MPI window for hdp flushing */ MPI_Win hdp_win; -#endif // USE_COHERENT_HEAP +#endif // USE_HDP_FLUSH /** * @brief Max number of contexts for the application diff --git a/projects/rocshmem/src/memory/heap_type.hpp b/projects/rocshmem/src/memory/heap_type.hpp index 325922205f..0935f60851 100644 --- a/projects/rocshmem/src/memory/heap_type.hpp +++ b/projects/rocshmem/src/memory/heap_type.hpp @@ -40,16 +40,25 @@ namespace rocshmem { -#if defined USE_MANAGED_HEAP -using HEAP_T = HeapMemory; -#elif defined USE_COHERENT_HEAP +// Compilation error 'HEAP_T redefined' indicates that user had more than one +// USE_HEAP_* ON when configuring. Use ccmake to select only one. +#if defined USE_HEAP_DEVICE_COARSEGRAIN using HEAP_T = HeapMemory; -#elif defined USE_HOST_HEAP -using HEAP_T = HeapMemory; -#elif defined USE_HIP_HOST_HEAP +#endif +#if defined USE_HEAP_DEVICE_FINEGRAIN +using HEAP_T = HeapMemory; +#endif +#if defined USE_HEAP_DEVICE_UNCACHED +using HEAP_T = HeapMemory; +#endif +#if defined USE_HEAP_MANAGED +using HEAP_T = HeapMemory; +#endif +#if defined USE_HEAP_HOST_HIP using HEAP_T = HeapMemory; -#else -using HEAP_T = HeapMemory; +#endif +#if defined USE_HEAP_HOST +using HEAP_T = HeapMemory; #endif } // namespace rocshmem diff --git a/projects/rocshmem/src/memory/hip_allocator.hpp b/projects/rocshmem/src/memory/hip_allocator.hpp index 7d62729b2b..0fe18edf6f 100644 --- a/projects/rocshmem/src/memory/hip_allocator.hpp +++ b/projects/rocshmem/src/memory/hip_allocator.hpp @@ -36,12 +36,15 @@ #include #include +#include "rocshmem_config.h" // NOLINT(build/include_subdir) #include "memory_allocator.hpp" // `hipDeviceMallocUncached` was introduced at ROCm 5.5 #if (HIP_VERSION_MAJOR > 5) || \ (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR >= 5) #define HIP_SUPPORTS_MALLOC_UNCACHED +#elif defined USE_HEAP_DEVICE_UNCACHED +#error "USE_HEAP_DEVICE_UNCACHED unsupported in this HIP version" #endif namespace rocshmem { @@ -57,13 +60,14 @@ class HIPAllocatorFinegrained : public MemoryAllocator { hipDeviceMallocFinegrained) {} }; -#ifdef HIP_SUPPORTS_MALLOC_UNCACHED +#if defined HIP_SUPPORTS_MALLOC_UNCACHED class HIPAllocatorUncached : public MemoryAllocator { public: HIPAllocatorUncached() : MemoryAllocator(hipExtMallocWithFlags, hipFree, hipDeviceMallocUncached) {} }; + // The default fine-grained coherence allocator is the uncached allocator using HIPDefaultFinegrainedAllocator = HIPAllocatorUncached; #else