Files
rocm-systems/src/reverse_offload/queue_proxy.hpp
T
Avinash Kethineedi df4ad2c04d Refactor RO backend data structures (#49)
- Remove hdp and ipc pointers from BlockHandle, align RO stats with RO contexts

- Add run commands for `rocshmem_g` and `rocshmem_p` API tests in driver.sh

- Allocate rocshmem API return buffers based on number of device contexts.

- Associate status flag address with blocking calls and remove threadId dependency
   - Associated the status flag address with each blocking call request to notify the GPU thread.
   - Removed dependency on threadId for determining the appropriate status flag index.

- Move status flag buffer allocation to backend.

- Initialize allocated memory to zero
2025-03-14 10:49:44 -05:00

162 строки
5.0 KiB
C++

/******************************************************************************
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_REVERSE_OFFLOAD_QUEUE_PROXY_HPP_
#define LIBRARY_SRC_REVERSE_OFFLOAD_QUEUE_PROXY_HPP_
#include <mpi.h>
#include "../atomic_return.hpp"
#include "../device_proxy.hpp"
#include "../hdp_policy.hpp"
#include "../ipc_policy.hpp"
#include "commands_types.hpp"
#include "profiler.hpp"
#include "../sync/abql_block_mutex.hpp"
namespace rocshmem {
/**
 * A 64-byte, 64-byte-aligned record holding a single flag byte.
 *
 * The flag occupies the first byte and the remaining 63 bytes are filler,
 * so one instance spans exactly one 64-byte unit.
 */
struct alignas(64) cacheline_t {
  /** Flag byte; volatile so each access is performed as written. */
  volatile char valid;
  /** Filler that pads the record out to 64 bytes. */
  volatile char padding[63];
};
/**
 * One command record exchanged between the GPU (producer) and the CPU
 * (consumer). The record is aligned to 64 bytes.
 */
struct alignas(64) queue_element {
  /**
   * Ready flag polled by the CPU. The GPU sets it only once the rest of the
   * queue element has been completely filled out. It is padded away from
   * the payload so that, on an APU, the CPU reading the valid bit does not
   * thrash with the GPU that is still filling out the packet.
   */
  cacheline_t notify_cpu;

  /**
   * Every field from here on is written by the GPU and read by the CPU.
   */
  ro_net_cmds type{};
  int PE = -1;
  void *src = nullptr;
  void *dst = nullptr;
  int ro_net_win_id = -1;
  int logPE_stride = -1;
  int PE_size = -1;
  long *pSync = nullptr;
  int op = -1;
  int datatype = -1;
  int PE_root = -1;
  // NOTE(review): presumably the status flag address tied to a blocking
  // request -- confirm against the code that fills in and signals it.
  volatile char *status = nullptr;
  MPI_Comm team_comm{};

  // Overlay 1: a byte count or an atomic operand, depending on the command.
  union {
    size_t size;
    unsigned long long atomic_value;
  } ol1;

  // Overlay 2: a work pointer or an atomic condition value, depending on
  // the command.
  union {
    void *pWrk;
    unsigned long long atomic_cond;
  } ol2;
};

using queue_element_t = queue_element;
template <typename ALLOCATOR>
class QueueElementProxy {
using ProxyT = DeviceProxy<ALLOCATOR, queue_element_t>;
public:
QueueElementProxy(size_t num_elems = 1) : proxy_{num_elems} {
new (proxy_.get()) queue_element_t();
}
QueueElementProxy(const QueueElementProxy& other) = delete;
QueueElementProxy& operator=(const QueueElementProxy& other) = delete;
QueueElementProxy(QueueElementProxy&& other) = default;
QueueElementProxy& operator=(QueueElementProxy&& other) = default;
~QueueElementProxy() { proxy_.get()->~queue_element_t(); }
__host__ __device__ queue_element_t* get() { return proxy_.get(); }
private:
ProxyT proxy_{};
};
using QueueElementProxyT = QueueElementProxy<PosixAligned64Allocator>;
/**
 * @brief Owns an array of per-queue base pointers together with the single
 * contiguous run of queue elements those pointers refer to.
 *
 * The type is move-only: copying is deleted and moving is defaulted.
 */
template <typename ALLOCATOR>
class QueueProxy {
  using ProxyT = DeviceProxy<ALLOCATOR, queue_element_t *>;
  using ProxyPerBlockT = DeviceProxy<ALLOCATOR, queue_element_t>;

 public:
  /**
   * @brief Leaves every member at its in-class default.
   */
  QueueProxy() = default;

  /**
   * @brief Initializes a c-style array of circular queues.
   *
   * The circular queues are indexed using the device block-id so that each
   * block has its own queue.
   *
   * @param max_queues Number of queues (entries in the pointer array).
   * @param queue_size Number of elements in each queue.
   */
  QueueProxy(size_t max_queues, size_t queue_size)
      // Listed in member declaration order, which is the order the
      // initializers actually run in.
      : queue_proxy_{max_queues},
        per_block_queue_proxy_{queue_size * max_queues},
        max_queues_{max_queues},
        queue_size_{queue_size},
        total_queue_elements_{queue_size * max_queues} {
    auto **queue_array{queue_proxy_.get()};
    auto *per_block_queue{per_block_queue_proxy_.get()};

    // Queue i starts queue_size elements after queue i - 1 inside the one
    // contiguous element buffer.
    for (size_t i{0}; i < max_queues_; i++) {
      queue_array[i] = per_block_queue + i * queue_size;
    }

    // Zero-fill the whole element buffer. After this every field reads as
    // 0, not as the in-class defaults declared in queue_element_t.
    size_t total_queue_element_bytes{sizeof(queue_element_t) *
                                     total_queue_elements_};
    memset(per_block_queue, 0, total_queue_element_bytes);
  }

  QueueProxy(const QueueProxy& other) = delete;

  QueueProxy& operator=(const QueueProxy& other) = delete;

  QueueProxy(QueueProxy&& other) = default;

  QueueProxy& operator=(QueueProxy&& other) = default;

  /**
   * @brief Returns the array of per-queue base pointers.
   */
  __host__ __device__ queue_element_t **get() { return queue_proxy_.get(); }

 private:
  ProxyT queue_proxy_{};
  ProxyPerBlockT per_block_queue_proxy_{};
  size_t max_queues_{};
  size_t queue_size_{};
  size_t total_queue_elements_{};
};

using QueueProxyT = QueueProxy<HIPHostAllocator>;
} // namespace rocshmem
#endif // LIBRARY_SRC_REVERSE_OFFLOAD_QUEUE_PROXY_HPP_