diff --git a/projects/rocshmem/src/containers/forward_list.hpp b/projects/rocshmem/src/containers/forward_list.hpp index 81d29eb374..acd0d70343 100644 --- a/projects/rocshmem/src/containers/forward_list.hpp +++ b/projects/rocshmem/src/containers/forward_list.hpp @@ -486,9 +486,19 @@ std::string to_string(const ForwardList& list); template class ForwardListProxy { - using ProxyT = DeviceProxy, 1>; + using ProxyT = DeviceProxy>; public: + ForwardList(size_t num_elems = 1) : proxy_{num_elems} {} + + ForwardList(const ForwardList& other) = delete; + + ForwardList& operator=(const ForwardList& other) = delete; + + ForwardList(ForwardList&& other) = default; + + ForwardList& operator=(ForwardList&& other) = default; + __host__ __device__ ForwardList* get() { return proxy_.get(); } private: diff --git a/projects/rocshmem/src/containers/free_list.hpp b/projects/rocshmem/src/containers/free_list.hpp index 73f50b2dcd..bf4f1fc7a3 100644 --- a/projects/rocshmem/src/containers/free_list.hpp +++ b/projects/rocshmem/src/containers/free_list.hpp @@ -262,7 +262,17 @@ class FreeListProxy { public: __host__ __device__ FreeListT* get() { return proxy_.get(); } - FreeListProxy() { new (proxy_.get()) FreeListT(); } + FreeListProxy(size_t num_elems = 1) : proxy_{num_elems} { + new (proxy_.get()) FreeListT(); + } + + FreeListProxy(const FreeListProxy& other) = delete; + + FreeListProxy& operator=(const FreeListProxy& other) = delete; + + FreeListProxy(FreeListProxy&& other) = default; + + FreeListProxy& operator=(FreeListProxy&& other) = default; ~FreeListProxy() { auto free_list = proxy_.get(); diff --git a/projects/rocshmem/src/device_proxy.hpp b/projects/rocshmem/src/device_proxy.hpp index 807dd1aeed..243c82c931 100644 --- a/projects/rocshmem/src/device_proxy.hpp +++ b/projects/rocshmem/src/device_proxy.hpp @@ -30,21 +30,28 @@ namespace rocshmem { -template +template class DeviceProxy { public: - DeviceProxy() { + DeviceProxy() = default; + + DeviceProxy(size_t num_elems) : num_elems_ {num_elems} { + /** + * @brief The allocation size of the internal memory + * + */ + size_t size_bytes = sizeof(T) * num_elems_; /* * Allocate memory and verify that the allocation worked. */ T* temp{nullptr}; - allocator_.allocate(reinterpret_cast(&temp), SIZE_BYTES_); + allocator_.allocate(reinterpret_cast(&temp), size_bytes); assert(temp); /* * Default memory provided by the allocation to recognizable bytes. */ - memset(static_cast(temp), 0xBC, SIZE_BYTES_); + memset(static_cast(temp), 0xBC, size_bytes); /* * Pass the memory into a unique ptr for tracking. @@ -58,6 +65,14 @@ class DeviceProxy { ptr_ = up_.get(); } + DeviceProxy(const DeviceProxy& other) = delete; + + DeviceProxy& operator=(const DeviceProxy& other) = delete; + + DeviceProxy(DeviceProxy&& other) = default; + + DeviceProxy& operator=(DeviceProxy&& other) = default; + /** * @brief Return internal storage tracked by the Proxy. * @@ -98,9 +113,9 @@ class DeviceProxy { T* ptr_{nullptr}; /** - * @brief The allocation size for the internal memory + * @brief Number of elements of type T to be allocated */ - static constexpr size_t SIZE_BYTES_{sizeof(T) * SIZE_IN}; + size_t num_elems_{}; }; } // namespace rocshmem diff --git a/projects/rocshmem/src/hdp_proxy.hpp b/projects/rocshmem/src/hdp_proxy.hpp index f3c1da2f22..607a5db0f8 100644 --- a/projects/rocshmem/src/hdp_proxy.hpp +++ b/projects/rocshmem/src/hdp_proxy.hpp @@ -36,10 +36,18 @@ class HdpProxy { /* * Placement new the memory which is allocated by proxy_ */ - HdpProxy() { + HdpProxy(size_t num_elems = 1) : proxy_{num_elems} { new (proxy_.get()) HdpPolicy(); } + HdpProxy(const HdpProxy& other) = delete; + + HdpProxy& operator=(const HdpProxy& other) = delete; + + HdpProxy(HdpProxy&& other) = default; + + HdpProxy& operator=(HdpProxy&& other) = default; + /* * Since placement new is called in the constructor, then * delete must be called manually. diff --git a/projects/rocshmem/src/ipc/backend_ipc.cpp b/projects/rocshmem/src/ipc/backend_ipc.cpp index abf24f94f5..9cd99f9329 100644 --- a/projects/rocshmem/src/ipc/backend_ipc.cpp +++ b/projects/rocshmem/src/ipc/backend_ipc.cpp @@ -221,7 +221,7 @@ void IPCBackend::create_new_team([[maybe_unused]] Team *parent_team, /** * Allocate device-side memory for team_world and - * construct a GPU_IB team in it + * construct a IPC team in it */ GPUIBTeam *new_team_obj; CHECK_HIP(hipMalloc(&new_team_obj, sizeof(IPCTeam))); diff --git a/projects/rocshmem/src/ipc/ipc_context_proxy.hpp b/projects/rocshmem/src/ipc/ipc_context_proxy.hpp index 7370761f5b..da8f587e8d 100644 --- a/projects/rocshmem/src/ipc/ipc_context_proxy.hpp +++ b/projects/rocshmem/src/ipc/ipc_context_proxy.hpp @@ -41,8 +41,9 @@ class IPCDefaultContextProxy { /* * Placement new the memory which is allocated by proxy_ */ - explicit IPCDefaultContextProxy(IPCBackend* backend, TeamInfo *tinfo) - : constructed_{true} { + explicit IPCDefaultContextProxy(IPCBackend* backend, TeamInfo *tinfo, + size_t num_elems = 1) + : constructed_{true}, proxy_{num_elems} { auto ctx{proxy_.get()}; new (ctx) IPCContext(reinterpret_cast(backend)); ctx->tinfo = tinfo; diff --git a/projects/rocshmem/src/memory/notifier.hpp b/projects/rocshmem/src/memory/notifier.hpp index 946a9c291e..84003265a6 100644 --- a/projects/rocshmem/src/memory/notifier.hpp +++ b/projects/rocshmem/src/memory/notifier.hpp @@ -101,10 +101,18 @@ class NotifierProxy { using ProxyT = DeviceProxy>; public: - NotifierProxy() { + NotifierProxy(size_t num_elems = 1) : proxy_{num_elems} { new (proxy_.get()) Notifier(); } + NotifierProxy(const NotifierProxy& other) = delete; + + NotifierProxy& operator=(const NotifierProxy& other) = delete; + + NotifierProxy(NotifierProxy&& other) = default; + + NotifierProxy& operator=(NotifierProxy&& other) = default; + ~NotifierProxy() { proxy_.get()->~Notifier(); } diff --git a/projects/rocshmem/src/memory/slab_heap.hpp b/projects/rocshmem/src/memory/slab_heap.hpp index 171332bee3..23bd4e099f 100644 --- a/projects/rocshmem/src/memory/slab_heap.hpp +++ b/projects/rocshmem/src/memory/slab_heap.hpp @@ -160,13 +160,23 @@ class SlabHeap { template class SlabHeapProxy { - using ProxyT = DeviceProxy; + using ProxyT = DeviceProxy; public: /* * Placement new the memory which is allocated by proxy_ */ - SlabHeapProxy() { new (proxy_.get()) SlabHeap(); } + SlabHeapProxy(size_t num_elems = 1) : proxy_{num_elems} { + new (proxy_.get()) SlabHeap(); + } + + SlabHeapProxy(const SlabHeapProxy& other) = delete; + + SlabHeapProxy& operator=(const SlabHeapProxy& other) = delete; + + SlabHeapProxy(SlabHeapProxy&& other) = default; + + SlabHeapProxy& operator=(SlabHeapProxy&& other) = default; /* * Since placement new is called in the constructor, then diff --git a/projects/rocshmem/src/reverse_offload/backend_proxy.hpp b/projects/rocshmem/src/reverse_offload/backend_proxy.hpp index 5dd28c1a66..b1d80dc413 100644 --- a/projects/rocshmem/src/reverse_offload/backend_proxy.hpp +++ b/projects/rocshmem/src/reverse_offload/backend_proxy.hpp @@ -51,7 +51,9 @@ class BackendProxy { /* * Placement new the memory which is allocated by proxy_ */ - BackendProxy() { new (proxy_.get()) BackendRegister(); } + BackendProxy(size_t num_elems = 1) : proxy_{num_elems} { + new (proxy_.get()) BackendRegister(); + } /* * Since placement new is called in the constructor, then diff --git a/projects/rocshmem/src/reverse_offload/backend_ro.cpp b/projects/rocshmem/src/reverse_offload/backend_ro.cpp index f7ce31c72d..32675077d3 100644 --- a/projects/rocshmem/src/reverse_offload/backend_ro.cpp +++ b/projects/rocshmem/src/reverse_offload/backend_ro.cpp @@ -45,7 +45,7 @@ namespace rocshmem { extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT; ROBackend::ROBackend(MPI_Comm comm) - : profiler_proxy_(MAX_NUM_BLOCKS), Backend() { + : Backend() { type = BackendType::RO_BACKEND; if (auto maximum_num_contexts_str = getenv("ROCSHMEM_MAX_NUM_CONTEXTS")) { @@ -54,6 +54,18 @@ ROBackend::ROBackend(MPI_Comm comm) } poll_block_count_ = maximum_num_contexts_; + profiler_proxy_ = ProfilerProxyT(maximum_num_contexts_); + + int device_id; + hipDeviceProp_t device_props; + + CHECK_HIP(hipGetDevice(&device_id)); + CHECK_HIP(hipGetDeviceProperties(&device_props, device_id)); + + max_wg_size_ = device_props.maxThreadsPerBlock; + + queue_ = Queue(maximum_num_contexts_, max_wg_size_, queue_size_); + transport_ = new MPITransport(comm, &queue_); num_pes = transport_->getNumPes(); my_pe = transport_->getMyPe(); @@ -68,16 +80,18 @@ ROBackend::ROBackend(MPI_Comm comm) bp->heap_ptr = &heap; - ro_window_proxy_ = new WindowProxyT(&heap, transport_->get_world_comm()); + ro_window_proxy_ = new WindowProxyT(&heap, transport_->get_world_comm(), + num_windows_); + bp->heap_window_info = ro_window_proxy_->get(); initIPC(); - init_g_ret(&heap, transport_->get_world_comm(), MAX_NUM_BLOCKS, &bp->g_ret); + init_g_ret(&heap, transport_->get_world_comm(), maximum_num_contexts_, &bp->g_ret); - allocate_atomic_region(&bp->atomic_ret, MAX_NUM_BLOCKS); + allocate_atomic_region(&bp->atomic_ret, maximum_num_contexts_); - transport_->initTransport(MAX_NUM_BLOCKS, &backend_proxy); + transport_->initTransport(maximum_num_contexts_, &backend_proxy); host_interface = transport_->host_interface; @@ -99,7 +113,8 @@ ROBackend::ROBackend(MPI_Comm comm) default_context_proxy_ = DefaultContextProxyT(this, tinfo); block_handle_proxy_ = BlockHandleProxyT(bp->g_ret, bp->atomic_ret, &queue_, - &ipcImpl, hdp_proxy_.get()); + &ipcImpl, hdp_proxy_.get(), + maximum_num_contexts_); setup_ctxs(); worker_thread = std::thread(&ROBackend::ro_net_poll, this); diff --git a/projects/rocshmem/src/reverse_offload/backend_ro.hpp b/projects/rocshmem/src/reverse_offload/backend_ro.hpp index 4f41a4f7d3..2000c1792b 100644 --- a/projects/rocshmem/src/reverse_offload/backend_ro.hpp +++ b/projects/rocshmem/src/reverse_offload/backend_ro.hpp @@ -255,6 +255,21 @@ class ROBackend : public Backend { * @brief Holds maximum number of contexts used in library */ size_t maximum_num_contexts_{1024}; + + /** + * @brief Holds maximum threads per work-group + */ + int max_wg_size_{}; + + /** + * @brief Holds the queue size for each context + */ + size_t queue_size_{512}; + + /** + * @brief Number of MPI windows used for device contexts in RO Backend + */ + size_t num_windows_{32}; }; } // namespace rocshmem diff --git a/projects/rocshmem/src/reverse_offload/block_handle.hpp b/projects/rocshmem/src/reverse_offload/block_handle.hpp index d2235f8639..711db9d04a 100644 --- a/projects/rocshmem/src/reverse_offload/block_handle.hpp +++ b/projects/rocshmem/src/reverse_offload/block_handle.hpp @@ -33,7 +33,7 @@ namespace rocshmem { struct BlockHandle { ROStats profiler{}; queue_element_t *queue{nullptr}; - uint64_t queue_size{QUEUE_SIZE}; + uint64_t queue_size{}; volatile uint64_t read_index{}; volatile uint64_t write_index{}; volatile uint64_t *host_read_index{}; @@ -53,7 +53,10 @@ class DefaultBlockHandleProxy { DefaultBlockHandleProxy() = default; DefaultBlockHandleProxy(char *g_ret, atomic_ret_t *atomic_ret, Queue *queue, - IpcImpl *ipc_policy, HdpPolicy *hdp_policy) { + IpcImpl *ipc_policy, HdpPolicy *hdp_policy, + size_t num_elems = 1) + : proxy_{num_elems} { + // TODO(bpotter): create a default queue for this queue descriptor auto queue_descriptor{queue->descriptor(0)}; auto block_handle{proxy_.get()}; @@ -73,6 +76,14 @@ class DefaultBlockHandleProxy { block_handle->lock = 0; } + DefaultBlockHandleProxy(const DefaultBlockHandleProxy& other) = delete; + + DefaultBlockHandleProxy& operator=(const DefaultBlockHandleProxy& other) = delete; + + DefaultBlockHandleProxy(DefaultBlockHandleProxy&& other) = default; + + DefaultBlockHandleProxy& operator=(DefaultBlockHandleProxy&& other) = default; + __host__ __device__ BlockHandle *get() { return proxy_.get(); } private: @@ -83,15 +94,17 @@ using DefaultBlockHandleProxyT = DefaultBlockHandleProxy; template class BlockHandleProxy { - static constexpr size_t MAX_NUM_BLOCKS{65536}; - using ProxyT = DeviceProxy; + using ProxyT = DeviceProxy; public: BlockHandleProxy() = default; BlockHandleProxy(char *g_ret, atomic_ret_t *atomic_ret, Queue *queue, - IpcImpl *ipc_policy, HdpPolicy *hdp_policy) { - for (size_t i{0}; i < MAX_NUM_BLOCKS; i++) { + IpcImpl *ipc_policy, HdpPolicy *hdp_policy, + size_t max_blocks) + : proxy_{max_blocks} { + + for (size_t i{0}; i < max_blocks; i++) { auto queue_descriptor{queue->descriptor(i)}; auto block_handle{&proxy_.get()[i]}; block_handle->profiler.resetStats(); @@ -111,10 +124,20 @@ class BlockHandleProxy { } } + BlockHandleProxy(const BlockHandleProxy& other) = delete; + + BlockHandleProxy& operator=(const BlockHandleProxy& other) = delete; + + BlockHandleProxy(BlockHandleProxy&& other) = default; + + BlockHandleProxy& operator=(BlockHandleProxy&& other) = default; + __host__ __device__ BlockHandle *get() { return proxy_.get(); } private: ProxyT proxy_{}; + + size_t num_blocks_{}; }; using BlockHandleProxyT = BlockHandleProxy; diff --git a/projects/rocshmem/src/reverse_offload/context_proxy.hpp b/projects/rocshmem/src/reverse_offload/context_proxy.hpp index a5b589a3b4..f0a8fdcb44 100644 --- a/projects/rocshmem/src/reverse_offload/context_proxy.hpp +++ b/projects/rocshmem/src/reverse_offload/context_proxy.hpp @@ -42,8 +42,9 @@ class DefaultContextProxy { /* * Placement new the memory which is allocated by proxy_ */ - explicit DefaultContextProxy(ROBackend* backend, TeamInfo *tinfo) - : constructed_{true} { + explicit DefaultContextProxy(ROBackend* backend, TeamInfo *tinfo, + size_t num_elems = 1) + : constructed_{true}, proxy_{num_elems} { auto ctx{proxy_.get()}; new (ctx) ROContext(reinterpret_cast(backend), -1); rocshmem_ctx_t local{ctx, tinfo}; @@ -87,54 +88,6 @@ class DefaultContextProxy { using DefaultContextProxyT = DefaultContextProxy; -template -class BlockContextProxy { - static constexpr size_t MAX_NUM_BLOCKS{65536}; - using ProxyT = DeviceProxy; - - public: - BlockContextProxy() = default; - - explicit BlockContextProxy(ROBackend* backend) : constructed_{true} { - auto* ctx{proxy_.get()}; - for (size_t i{0}; i < MAX_NUM_BLOCKS; i++) { - auto ctx_p{&ctx[i]}; - new (ctx_p) ROContext(reinterpret_cast(backend), i); - } - } - - ~BlockContextProxy() { - if (constructed_) { - auto* ctx{proxy_.get()}; - for (size_t i{0}; i < MAX_NUM_BLOCKS; i++) { - auto ctx_p{&ctx[i]}; - ctx_p->~ROContext(); - } - } - } - - BlockContextProxy(const BlockContextProxy& other) = delete; - - BlockContextProxy& operator=(const BlockContextProxy& other) = delete; - - BlockContextProxy(BlockContextProxy&& other) = default; - - BlockContextProxy& operator=(BlockContextProxy&& other) = default; - - __host__ __device__ Context* get() { return proxy_.get(); } - - private: - /* - * @brief Memory managed by the lifetime of this object - */ - ProxyT proxy_{}; - - /* - * @brief denotes if an objects was constructed in proxy - */ - bool constructed_{false}; -}; - } // namespace rocshmem #endif // LIBRARY_SRC_REVERSE_OFFLOAD_CONTEXT_PROXY_HPP_ diff --git a/projects/rocshmem/src/reverse_offload/context_ro_device.cpp b/projects/rocshmem/src/reverse_offload/context_ro_device.cpp index e9e42c4afe..5c7b82ee89 100644 --- a/projects/rocshmem/src/reverse_offload/context_ro_device.cpp +++ b/projects/rocshmem/src/reverse_offload/context_ro_device.cpp @@ -50,7 +50,7 @@ __host__ ROContext::ROContext(Backend *b, size_t block_id) auto block_base{backend->block_handle_proxy_.get()}; block_handle = &block_base[block_id]; } - ro_net_win_id = block_id % backend->ro_window_proxy_->MAX_NUM_WINDOWS; + ro_net_win_id = block_id % backend->ro_window_proxy_->get_num_MPI_windows(); ipcImpl_.ipc_bases = b->ipcImpl.ipc_bases; ipcImpl_.shm_size = b->ipcImpl.shm_size; diff --git a/projects/rocshmem/src/reverse_offload/profiler.hpp b/projects/rocshmem/src/reverse_offload/profiler.hpp index a180f560c8..9dddf95c3d 100644 --- a/projects/rocshmem/src/reverse_offload/profiler.hpp +++ b/projects/rocshmem/src/reverse_offload/profiler.hpp @@ -51,13 +51,13 @@ typedef NullStats ROStats; template class ProfilerProxy { - static constexpr size_t MAX_NUM_BLOCKS{65536}; - - using ProxyT = DeviceProxy; + using ProxyT = DeviceProxy; public: - explicit ProfilerProxy(size_t num_blocks) : num_elem_{num_blocks} { - assert(num_blocks <= MAX_NUM_BLOCKS); + ProfilerProxy() = default; + + explicit ProfilerProxy(size_t num_blocks) + : num_elem_{num_blocks}, proxy_{num_blocks} { auto *stat{proxy_.get()}; assert(stat); @@ -68,6 +68,14 @@ class ProfilerProxy { } } + ProfilerProxy(const ProfilerProxy& other) = delete; + + ProfilerProxy& operator=(const ProfilerProxy& other) = delete; + + ProfilerProxy(ProfilerProxy&& other) = default; + + ProfilerProxy& operator=(ProfilerProxy&& other) = default; + ~ProfilerProxy() { auto *stat{proxy_.get()}; assert(stat); diff --git a/projects/rocshmem/src/reverse_offload/queue.cpp b/projects/rocshmem/src/reverse_offload/queue.cpp index d5eccffc5d..b6e37b48c1 100644 --- a/projects/rocshmem/src/reverse_offload/queue.cpp +++ b/projects/rocshmem/src/reverse_offload/queue.cpp @@ -33,8 +33,22 @@ Queue::Queue() { } } +Queue::Queue(size_t max_queues, size_t max_wg_size, size_t queue_size) + : max_queues_{max_queues}, + max_wg_size_{max_wg_size}, + queue_size_{queue_size}, + queue_proxy_{max_queues, queue_size}, + queue_desc_proxy_{max_queues, max_wg_size} { + + gpu_queue = true; + char *value{nullptr}; + if ((value = getenv("RO_NET_CPU_QUEUE")) != nullptr) { + gpu_queue = false; + } +} + uint64_t Queue::get_read_index(uint64_t queue_index) { - return descriptor(queue_index)->read_index % QUEUE_SIZE; + return descriptor(queue_index)->read_index % queue_size_; } void Queue::increment_read_index(uint64_t queue_index) { @@ -92,7 +106,7 @@ void Queue::notify(int blockId, int threadId) { } uint64_t Queue::size() { - return QUEUE_SIZE; + return queue_size_; } __host__ __device__ queue_desc_t* Queue::descriptor(uint64_t index) { diff --git a/projects/rocshmem/src/reverse_offload/queue.hpp b/projects/rocshmem/src/reverse_offload/queue.hpp index d057c1d206..3ea41a39c7 100644 --- a/projects/rocshmem/src/reverse_offload/queue.hpp +++ b/projects/rocshmem/src/reverse_offload/queue.hpp @@ -35,6 +35,8 @@ class Queue { public: Queue(); + Queue(size_t max_queues, size_t max_threads_per_block, size_t queue_size); + bool process(uint64_t queue_index, MPITransport* transport); uint64_t get_read_index(uint64_t queue_index); @@ -67,6 +69,12 @@ class Queue { HdpProxy hdp_proxy_{}; bool gpu_queue{false}; + + size_t max_queues_{}; + + size_t max_wg_size_{}; + + size_t queue_size_{}; }; } // namespace rocshmem diff --git a/projects/rocshmem/src/reverse_offload/queue_desc_proxy.hpp b/projects/rocshmem/src/reverse_offload/queue_desc_proxy.hpp index b2fc11054d..f5aa66c461 100644 --- a/projects/rocshmem/src/reverse_offload/queue_desc_proxy.hpp +++ b/projects/rocshmem/src/reverse_offload/queue_desc_proxy.hpp @@ -58,32 +58,48 @@ typedef struct queue_desc { template class QueueDescProxy { - static constexpr size_t MAX_NUM_BLOCKS{65536}; - static constexpr size_t MAX_THREADS_PER_BLOCK{1024}; - static constexpr size_t MAX_THREADS{MAX_NUM_BLOCKS * MAX_THREADS_PER_BLOCK}; - using ProxyT = DeviceProxy; - using ProxyStatusT = DeviceProxy; + using ProxyT = DeviceProxy; + using ProxyStatusT = DeviceProxy; public: - QueueDescProxy() { + QueueDescProxy() = default; + + QueueDescProxy(size_t max_queues, size_t max_threads_per_queue) + : max_queues_{max_queues}, max_threads_per_queue_{max_threads_per_queue}, + max_threads_{max_queues * max_threads_per_queue}, proxy_{max_queues}, + proxy_status_{max_queues * max_threads_per_queue} { auto *status{proxy_status_.get()}; - size_t status_bytes{sizeof(char) * MAX_THREADS}; + size_t status_bytes{sizeof(char) * max_threads_}; memset(status, 0, status_bytes); auto *queue_descs{proxy_.get()}; - for (size_t i{0}; i < MAX_NUM_BLOCKS; i++) { + for (size_t i{0}; i < max_queues_; i++) { queue_descs[i].read_index = 0; queue_descs[i].write_index = 0; - queue_descs[i].status = status + i * MAX_THREADS_PER_BLOCK; + queue_descs[i].status = status + i * max_threads_per_queue_; } } + QueueDescProxy(const QueueDescProxy& other) = delete; + + QueueDescProxy& operator=(const QueueDescProxy& other) = delete; + + QueueDescProxy(QueueDescProxy&& other) = default; + + QueueDescProxy& operator=(QueueDescProxy&& other) = default; + __host__ __device__ queue_desc_t *get() { return proxy_.get(); } private: ProxyT proxy_{}; ProxyStatusT proxy_status_{}; + + size_t max_queues_{}; + + size_t max_threads_per_queue_{}; + + size_t max_threads_{}; }; using QueueDescProxyT = QueueDescProxy; diff --git a/projects/rocshmem/src/reverse_offload/queue_proxy.hpp b/projects/rocshmem/src/reverse_offload/queue_proxy.hpp index 9cde0cb0cb..ce3bda6fa0 100644 --- a/projects/rocshmem/src/reverse_offload/queue_proxy.hpp +++ b/projects/rocshmem/src/reverse_offload/queue_proxy.hpp @@ -35,8 +35,6 @@ namespace rocshmem { -constexpr size_t QUEUE_SIZE{512}; - struct cacheline_t { volatile char valid; volatile char padding[63]; @@ -82,7 +80,17 @@ class QueueElementProxy { using ProxyT = DeviceProxy; public: - QueueElementProxy() { new (proxy_.get()) queue_element_t(); } + QueueElementProxy(size_t num_elems = 1) : proxy_{num_elems} { + new (proxy_.get()) queue_element_t(); + } + + QueueElementProxy(const QueueElementProxy& other) = delete; + + QueueElementProxy& operator=(const QueueElementProxy& other) = delete; + + QueueElementProxy(QueueElementProxy&& other) = default; + + QueueElementProxy& operator=(QueueElementProxy&& other) = default; ~QueueElementProxy() { proxy_.get()->~queue_element_t(); } @@ -96,11 +104,8 @@ using QueueElementProxyT = QueueElementProxy; template class QueueProxy { - static constexpr size_t MAX_NUM_BLOCKS{65536}; - static constexpr size_t TOTAL_QUEUE_ELEMENTS{QUEUE_SIZE * MAX_NUM_BLOCKS}; - using ProxyT = DeviceProxy; - using ProxyPerBlockT = - DeviceProxy; + using ProxyT = DeviceProxy; + using ProxyPerBlockT = DeviceProxy; public: /** @@ -109,23 +114,44 @@ class QueueProxy { * The circular queues are indexed using the device block-id so that each * each block has its own queue. */ - QueueProxy() { + QueueProxy() = default; + + QueueProxy(size_t max_queues, size_t queue_size) + : max_queues_{max_queues}, queue_size_{queue_size}, + total_queue_elements_{queue_size * max_queues}, + queue_proxy_{max_queues}, + per_block_queue_proxy_{queue_size * max_queues} { + auto **queue_array{queue_proxy_.get()}; auto *per_block_queue{per_block_queue_proxy_.get()}; - for (size_t i{0}; i < MAX_NUM_BLOCKS; i++) { - queue_array[i] = per_block_queue + i * QUEUE_SIZE; + for (size_t i{0}; i < max_queues_; i++) { + queue_array[i] = per_block_queue + i * queue_size; } size_t total_queue_element_bytes{sizeof(queue_element_t) * - TOTAL_QUEUE_ELEMENTS}; + total_queue_elements_}; memset(per_block_queue, 0, total_queue_element_bytes); } + QueueProxy(const QueueProxy& other) = delete; + + QueueProxy& operator=(const QueueProxy& other) = delete; + + QueueProxy(QueueProxy&& other) = default; + + QueueProxy& operator=(QueueProxy&& other) = default; + __host__ __device__ queue_element_t **get() { return queue_proxy_.get(); } private: ProxyT queue_proxy_{}; ProxyPerBlockT per_block_queue_proxy_{}; + + size_t max_queues_{}; + + size_t queue_size_{}; + + size_t total_queue_elements_{}; }; using QueueProxyT = QueueProxy; diff --git a/projects/rocshmem/src/reverse_offload/ro_team_proxy.hpp b/projects/rocshmem/src/reverse_offload/ro_team_proxy.hpp index 8637a0d56f..e3b1afd280 100644 --- a/projects/rocshmem/src/reverse_offload/ro_team_proxy.hpp +++ b/projects/rocshmem/src/reverse_offload/ro_team_proxy.hpp @@ -39,14 +39,24 @@ class ROTeamProxy { /* * Placement new the memory which is allocated by proxy_ */ - ROTeamProxy(Backend* backend, MPI_Comm comm, int pe, int npes) - : my_pe_(pe), team_size_(npes) { + ROTeamProxy(Backend* backend, MPI_Comm comm, int pe, int npes, + size_t num_elems = 1) + : my_pe_(pe), team_size_(npes), proxy_{num_elems} { + MPI_Comm_dup(comm, &team_world_comm_); new (proxy_.get()) ROTeam(backend, wrt_parent_.get(), wrt_world_.get(), team_size_, my_pe_, team_world_comm_); } + ROTeamProxy(const ROTeamProxy& other) = delete; + + ROTeamProxy& operator=(const ROTeamProxy& other) = delete; + + ROTeamProxy(ROTeamProxy&& other) = default; + + ROTeamProxy& operator=(ROTeamProxy&& other) = default; + /* * Since placement new is called in the constructor, then * delete must be called manually. diff --git a/projects/rocshmem/src/reverse_offload/team_info_proxy.hpp b/projects/rocshmem/src/reverse_offload/team_info_proxy.hpp index b68b2a1772..3844a05946 100644 --- a/projects/rocshmem/src/reverse_offload/team_info_proxy.hpp +++ b/projects/rocshmem/src/reverse_offload/team_info_proxy.hpp @@ -36,10 +36,19 @@ class TeamInfoProxy { /* * Placement new the memory which is allocated by proxy_ */ - TeamInfoProxy(Team* parent_team, int pe_start, int stride, int size) { + TeamInfoProxy(Team* parent_team, int pe_start, int stride, int size, + size_t num_elems = 1) : proxy_{num_elems} { new (proxy_.get()) TeamInfo(parent_team, pe_start, stride, size); } + TeamInfoProxy(const TeamInfoProxy& other) = delete; + + TeamInfoProxy& operator=(const TeamInfoProxy& other) = delete; + + TeamInfoProxy(TeamInfoProxy&& other) = default; + + TeamInfoProxy& operator=(TeamInfoProxy&& other) = default; + /* * Since placement new is called in the constructor, then * delete must be called manually. diff --git a/projects/rocshmem/src/reverse_offload/window_proxy.hpp b/projects/rocshmem/src/reverse_offload/window_proxy.hpp index cdef6a5555..d7f7ec782e 100644 --- a/projects/rocshmem/src/reverse_offload/window_proxy.hpp +++ b/projects/rocshmem/src/reverse_offload/window_proxy.hpp @@ -31,25 +31,32 @@ namespace rocshmem { template class WindowProxy { - public: - static constexpr size_t MAX_NUM_WINDOWS{32}; - private: - using ProxyT = DeviceProxy; + using ProxyT = DeviceProxy; public: /* * Placement new the memory which is allocated by proxy_ */ - WindowProxy(SymmetricHeap *heap, MPI_Comm comm) { + WindowProxy(SymmetricHeap *heap, MPI_Comm comm, size_t num_windows) + : num_windows_{num_windows}, proxy_{num_windows} { + auto *window_info{proxy_.get()}; - for (size_t i{0}; i < MAX_NUM_WINDOWS; i++) { + for (size_t i{0}; i < num_windows_; i++) { window_info[i] = new WindowInfo(comm, heap->get_local_heap_base(), heap->get_size()); } } + WindowProxy(const WindowProxy& other) = delete; + + WindowProxy& operator=(const WindowProxy& other) = delete; + + WindowProxy(WindowProxy&& other) = default; + + WindowProxy& operator=(WindowProxy&& other) = default; + /* * Since placement new is called in the constructor, then * delete must be called manually. @@ -57,7 +64,7 @@ class WindowProxy { ~WindowProxy() { auto *window_info{proxy_.get()}; - for (size_t i{0}; i < MAX_NUM_WINDOWS; i++) { + for (size_t i{0}; i < num_windows_; i++) { delete window_info[i]; } } @@ -67,11 +74,17 @@ class WindowProxy { */ __host__ __device__ WindowInfo **get() { return proxy_.get(); } + __host__ size_t get_num_MPI_windows() { return num_windows_; } private: /* * @brief Memory managed by the lifetime of this object */ ProxyT proxy_{}; + + /** + * @brief Number of MPI windows used for device contexts in RO Backend + */ + size_t num_windows_{32}; }; using WindowProxyT = WindowProxy; diff --git a/projects/rocshmem/src/sync/abql_block_mutex.hpp b/projects/rocshmem/src/sync/abql_block_mutex.hpp index ec65aa4833..ea16b2ede9 100644 --- a/projects/rocshmem/src/sync/abql_block_mutex.hpp +++ b/projects/rocshmem/src/sync/abql_block_mutex.hpp @@ -111,9 +111,19 @@ class ABQLBlockMutex { template class ABQLBlockMutexProxy { - using ProxyT = DeviceProxy; + using ProxyT = DeviceProxy; public: + ABQLBlockMutexProxy(size_t num_elems = 1) : proxy_{num_elems} {} + + ABQLBlockMutexProxy(const ABQLBlockMutexProxy& other) = delete; + + ABQLBlockMutexProxy& operator=(const ABQLBlockMutexProxy& other) = delete; + + ABQLBlockMutexProxy(ABQLBlockMutexProxy&& other) = default; + + ABQLBlockMutexProxy& operator=(ABQLBlockMutexProxy&& other) = default; + __host__ __device__ ABQLBlockMutex* get() { return proxy_.get(); } private: