2024-07-01 09:57:08 -05:00
|
|
|
/******************************************************************************
|
2025-04-15 15:37:53 -05:00
|
|
|
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
|
*
|
|
|
|
|
* SPDX-License-Identifier: MIT
|
2024-07-01 09:57:08 -05:00
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
|
|
|
* of this software and associated documentation files (the "Software"), to
|
|
|
|
|
* deal in the Software without restriction, including without limitation the
|
|
|
|
|
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
|
|
|
|
* sell copies of the Software, and to permit persons to whom the Software is
|
|
|
|
|
* furnished to do so, subject to the following conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice shall be included in
|
|
|
|
|
* all copies or substantial portions of the Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
2025-04-15 15:37:53 -05:00
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
2024-07-01 09:57:08 -05:00
|
|
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
|
*****************************************************************************/
|
|
|
|
|
|
|
|
|
|
#include "backend_ro.hpp"
|
|
|
|
|
|
|
|
|
|
#include <immintrin.h>
|
|
|
|
|
#include <smmintrin.h>
|
|
|
|
|
#include <unistd.h>
|
|
|
|
|
|
2025-06-16 20:21:06 -04:00
|
|
|
#include <cassert>
|
2024-07-01 09:57:08 -05:00
|
|
|
#include <cstdio>
|
|
|
|
|
#include <cstdlib>
|
|
|
|
|
#include <memory>
|
|
|
|
|
#include <thread> // NOLINT
|
2025-12-08 11:46:16 -05:00
|
|
|
#include <dlfcn.h>
|
2024-07-01 09:57:08 -05:00
|
|
|
|
2024-11-25 14:12:15 -06:00
|
|
|
#include "rocshmem/rocshmem.hpp"
|
2025-09-05 12:21:18 -04:00
|
|
|
#include "atomic_return.hpp"
|
|
|
|
|
#include "backend_type.hpp"
|
|
|
|
|
#include "context_incl.hpp"
|
2025-10-06 10:05:57 -07:00
|
|
|
#include "envvar.hpp"
|
2024-07-01 09:57:08 -05:00
|
|
|
#include "mpi_transport.hpp"
|
|
|
|
|
#include "ro_net_team.hpp"
|
2025-09-05 12:21:18 -04:00
|
|
|
#include "util.hpp"
|
2024-07-01 09:57:08 -05:00
|
|
|
|
|
|
|
|
namespace rocshmem {
|
|
|
|
|
|
2024-11-25 14:12:15 -06:00
|
|
|
extern rocshmem_ctx_t ROCSHMEM_HOST_CTX_DEFAULT;
|
2024-07-01 09:57:08 -05:00
|
|
|
|
|
|
|
|
ROBackend::ROBackend(MPI_Comm comm)
|
2025-04-14 09:18:57 -05:00
|
|
|
: Backend(comm) {
|
2024-07-01 09:57:08 -05:00
|
|
|
type = BackendType::RO_BACKEND;
|
|
|
|
|
|
2025-10-06 10:05:57 -07:00
|
|
|
poll_block_count_ = envvar::max_num_contexts;
|
2024-07-01 09:57:08 -05:00
|
|
|
|
2025-10-06 10:05:57 -07:00
|
|
|
profiler_proxy_ = ProfilerProxyT(envvar::max_num_contexts);
|
2025-02-24 15:11:46 -06:00
|
|
|
|
|
|
|
|
int device_id;
|
|
|
|
|
CHECK_HIP(hipGetDevice(&device_id));
|
|
|
|
|
|
2025-10-22 16:04:58 -05:00
|
|
|
max_wg_size_ = get_threads_per_block(device_id);
|
2025-02-24 15:11:46 -06:00
|
|
|
|
2025-10-22 16:04:58 -05:00
|
|
|
wf_size_ = get_wf_size(device_id);
|
2025-03-25 18:51:54 -05:00
|
|
|
|
|
|
|
|
setup_default_ctx_buffers();
|
|
|
|
|
|
2025-10-06 10:05:57 -07:00
|
|
|
size_t num_buff_elems = envvar::max_num_contexts * max_wg_size_;
|
2025-03-14 10:49:44 -05:00
|
|
|
|
|
|
|
|
g_ret_buffer_ = RetBufferProxyT(num_buff_elems);
|
|
|
|
|
|
|
|
|
|
atomic_ret_buffer_ = RetBufferProxyT(num_buff_elems);
|
|
|
|
|
|
|
|
|
|
status_ = StatusProxyT(num_buff_elems);
|
|
|
|
|
|
2025-10-06 10:05:57 -07:00
|
|
|
queue_ = Queue(envvar::max_num_contexts, queue_size_);
|
2025-02-24 15:11:46 -06:00
|
|
|
|
2025-04-14 09:18:57 -05:00
|
|
|
transport_ = new MPITransport(backend_comm, &queue_);
|
2024-07-01 09:57:08 -05:00
|
|
|
num_pes = transport_->getNumPes();
|
|
|
|
|
my_pe = transport_->getMyPe();
|
|
|
|
|
|
|
|
|
|
auto *bp{backend_proxy.get()};
|
|
|
|
|
|
|
|
|
|
bp->hdp_policy = hdp_proxy_.get();
|
|
|
|
|
|
|
|
|
|
bp->profiler = profiler_proxy_.get();
|
|
|
|
|
|
|
|
|
|
bp->worker_thread_exit = false;
|
|
|
|
|
|
|
|
|
|
bp->heap_ptr = &heap;
|
|
|
|
|
|
2025-02-24 15:11:46 -06:00
|
|
|
ro_window_proxy_ = new WindowProxyT(&heap, transport_->get_world_comm(),
|
|
|
|
|
num_windows_);
|
|
|
|
|
|
2024-07-01 09:57:08 -05:00
|
|
|
bp->heap_window_info = ro_window_proxy_->get();
|
|
|
|
|
|
|
|
|
|
initIPC();
|
|
|
|
|
|
2025-10-06 10:05:57 -07:00
|
|
|
transport_->initTransport(envvar::max_num_contexts, &backend_proxy);
|
2024-07-01 09:57:08 -05:00
|
|
|
|
|
|
|
|
host_interface = transport_->host_interface;
|
|
|
|
|
|
|
|
|
|
default_host_ctx = std::make_unique<ROHostContext>(this, 0);
|
|
|
|
|
|
2024-11-25 14:12:15 -06:00
|
|
|
ROCSHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx.get();
|
2024-07-01 09:57:08 -05:00
|
|
|
|
|
|
|
|
team_world_proxy_ = new ROTeamProxy<HIPAllocator>(
|
|
|
|
|
this, transport_->get_world_comm(), my_pe, num_pes);
|
|
|
|
|
team_tracker.set_team_world(team_world_proxy_->get());
|
|
|
|
|
|
2024-11-25 14:12:15 -06:00
|
|
|
ROCSHMEM_TEAM_WORLD =
|
|
|
|
|
reinterpret_cast<rocshmem_team_t>(team_world_proxy_->get());
|
2024-07-01 09:57:08 -05:00
|
|
|
|
|
|
|
|
default_block_handle_proxy_ = DefaultBlockHandleProxyT(
|
2025-03-14 10:49:44 -05:00
|
|
|
g_ret_buffer_.get(),
|
|
|
|
|
atomic_ret_buffer_.get(), &queue_,
|
2025-03-25 18:51:54 -05:00
|
|
|
status_.get(), default_ctx_status_.get(),
|
|
|
|
|
default_ctx_g_ret_buffer_.get(),
|
|
|
|
|
default_ctx_atomic_ret_buffer_.get());
|
2024-10-04 17:56:15 +00:00
|
|
|
|
|
|
|
|
TeamInfo *tinfo = team_tracker.get_team_world()->tinfo_wrt_world;
|
2025-03-14 10:49:44 -05:00
|
|
|
|
2024-10-04 17:56:15 +00:00
|
|
|
default_context_proxy_ = DefaultContextProxyT(this, tinfo);
|
2024-07-01 09:57:08 -05:00
|
|
|
|
2025-03-14 10:49:44 -05:00
|
|
|
block_handle_proxy_ = BlockHandleProxyT(g_ret_buffer_.get(),
|
|
|
|
|
atomic_ret_buffer_.get(), &queue_,
|
2025-10-06 10:05:57 -07:00
|
|
|
max_wg_size_, status_.get(), envvar::max_num_contexts);
|
2024-07-01 09:57:08 -05:00
|
|
|
setup_ctxs();
|
|
|
|
|
|
|
|
|
|
worker_thread = std::thread(&ROBackend::ro_net_poll, this);
|
|
|
|
|
|
|
|
|
|
*done_init = 1;
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-08 11:46:16 -05:00
|
|
|
/* Currently we only check whether we can dlopen an MPI library.
|
|
|
|
|
*/
|
|
|
|
|
int ROBackend::backend_can_run() {
|
|
|
|
|
auto handle = dlopen("libmpi.so", RTLD_LAZY);
|
|
|
|
|
if (!handle) {
|
|
|
|
|
printf("Could not open libmpi.so. Returning\n");
|
|
|
|
|
return ROCSHMEM_ERROR;
|
|
|
|
|
}
|
|
|
|
|
//TODO dlsym MPI_Get_library_version and verify compat when HAVE_EXTERNAL_MPI is undef
|
|
|
|
|
dlclose(handle);
|
|
|
|
|
return ROCSHMEM_SUCCESS;
|
|
|
|
|
}
|
|
|
|
|
|
2024-07-01 09:57:08 -05:00
|
|
|
void ROBackend::setup_ctxs() {
|
2025-10-06 10:05:57 -07:00
|
|
|
CHECK_HIP(hipMalloc(&ctx_array, sizeof(ROContext) * envvar::max_num_contexts));
|
|
|
|
|
for (size_t i = 0; i < envvar::max_num_contexts; i++) {
|
2024-07-01 09:57:08 -05:00
|
|
|
new (&ctx_array[i]) ROContext(this, i);
|
|
|
|
|
ctx_free_list.get()->push_back(ctx_array + i);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-03-25 18:51:54 -05:00
|
|
|
void ROBackend::setup_default_ctx_buffers() {
|
2025-10-06 10:05:57 -07:00
|
|
|
size_t num_buff_elems = envvar::max_wavefront_buffers * wf_size_;
|
2025-03-25 18:51:54 -05:00
|
|
|
|
|
|
|
|
g_ret_buffer_default_ctx_ = RetBufferProxyT(num_buff_elems);
|
|
|
|
|
|
|
|
|
|
atomic_ret_buffer_default_ctx_ = RetBufferProxyT(num_buff_elems);
|
|
|
|
|
|
|
|
|
|
status_default_ctx_ = StatusProxyT(num_buff_elems);
|
|
|
|
|
|
2025-10-06 10:05:57 -07:00
|
|
|
default_ctx_status_.get()->allocate_queue(envvar::max_wavefront_buffers);
|
|
|
|
|
default_ctx_g_ret_buffer_.get()->allocate_queue(envvar::max_wavefront_buffers);
|
|
|
|
|
default_ctx_atomic_ret_buffer_.get()->allocate_queue(envvar::max_wavefront_buffers);
|
2025-03-25 18:51:54 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
char* status = status_default_ctx_.get();
|
|
|
|
|
uint64_t* g_ret_buf = g_ret_buffer_default_ctx_.get();
|
|
|
|
|
uint64_t* atomic_ret_buf = atomic_ret_buffer_default_ctx_.get();
|
|
|
|
|
|
2025-10-06 10:05:57 -07:00
|
|
|
for (size_t i = 0; i < envvar::max_wavefront_buffers; i++) {
|
|
|
|
|
size_t offset = i * wf_size_;
|
2025-03-25 18:51:54 -05:00
|
|
|
default_ctx_status_.get()->push(status + offset);
|
|
|
|
|
default_ctx_g_ret_buffer_.get()->push(g_ret_buf + offset);
|
|
|
|
|
default_ctx_atomic_ret_buffer_.get()->push(atomic_ret_buf + offset);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-07-01 09:57:08 -05:00
|
|
|
ROBackend::~ROBackend() {
|
|
|
|
|
ro_net_free_runtime();
|
|
|
|
|
CHECK_HIP(hipFree(ctx_array));
|
|
|
|
|
}
|
|
|
|
|
|
2024-11-25 14:12:15 -06:00
|
|
|
__device__ bool ROBackend::create_ctx(int64_t options, rocshmem_ctx_t *ctx) {
|
2024-07-01 09:57:08 -05:00
|
|
|
ROContext *ctx_;
|
|
|
|
|
|
|
|
|
|
auto pop_result = ctx_free_list.get()->pop_front();
|
|
|
|
|
if (!pop_result.success) {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
ctx_ = pop_result.value;
|
|
|
|
|
|
|
|
|
|
ctx->ctx_opaque = ctx_;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2024-11-25 14:12:15 -06:00
|
|
|
__device__ void ROBackend::destroy_ctx(rocshmem_ctx_t *ctx) {
|
2024-07-01 09:57:08 -05:00
|
|
|
ctx_free_list.get()->push_back(static_cast<ROContext *>(ctx->ctx_opaque));
|
|
|
|
|
}
|
|
|
|
|
|
2024-11-25 14:12:15 -06:00
|
|
|
void ROBackend::team_destroy(rocshmem_team_t team) {
|
2024-07-01 09:57:08 -05:00
|
|
|
ROTeam *team_obj{get_internal_ro_team(team)};
|
|
|
|
|
|
|
|
|
|
team_obj->~ROTeam();
|
|
|
|
|
// CHECK_HIP(hipFree(team_obj));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void ROBackend::create_new_team(Team *parent_team,
|
|
|
|
|
TeamInfo *team_info_wrt_parent,
|
|
|
|
|
TeamInfo *team_info_wrt_world, int num_pes,
|
|
|
|
|
int my_pe_in_new_team, MPI_Comm team_comm,
|
2024-11-25 14:12:15 -06:00
|
|
|
rocshmem_team_t *new_team) {
|
2024-07-01 09:57:08 -05:00
|
|
|
transport_->createNewTeam(this, parent_team, team_info_wrt_parent,
|
|
|
|
|
team_info_wrt_world, num_pes, my_pe_in_new_team,
|
|
|
|
|
team_comm, new_team);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void ROBackend::ctx_create(int64_t options, void **ctx) {
|
|
|
|
|
ROHostContext *new_ctx{nullptr};
|
|
|
|
|
new_ctx = new ROHostContext(this, options);
|
|
|
|
|
*ctx = new_ctx;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ROHostContext *get_internal_ro_net_ctx(Context *ctx) {
|
|
|
|
|
return reinterpret_cast<ROHostContext *>(ctx);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void ROBackend::ctx_destroy(Context *ctx) {
|
|
|
|
|
ROHostContext *ro_net_host_ctx{get_internal_ro_net_ctx(ctx)};
|
|
|
|
|
delete ro_net_host_ctx;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void ROBackend::reset_backend_stats() {
|
|
|
|
|
auto *bp{backend_proxy.get()};
|
|
|
|
|
|
2025-10-06 10:05:57 -07:00
|
|
|
for (size_t i = 0; i < envvar::max_num_contexts; i++) {
|
2024-07-01 09:57:08 -05:00
|
|
|
bp->profiler[i].resetStats();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void ROBackend::dump_backend_stats() {
|
|
|
|
|
uint64_t total{0};
|
|
|
|
|
for (int i = 0; i < NUM_STATS; i++) {
|
|
|
|
|
total += globalStats.getStat(i);
|
|
|
|
|
}
|
|
|
|
|
|
2025-04-10 14:47:24 -05:00
|
|
|
int device_id;
|
|
|
|
|
hipDeviceProp_t device_props;
|
|
|
|
|
CHECK_HIP(hipGetDevice(&device_id));
|
|
|
|
|
int wallClockMhz;
|
|
|
|
|
CHECK_HIP(hipDeviceGetAttribute(&wallClockMhz, hipDeviceAttributeWallClockRate, device_id));
|
|
|
|
|
int gpu_frequency_mhz{wallClockMhz};
|
2024-07-01 09:57:08 -05:00
|
|
|
|
|
|
|
|
uint64_t us_wait_slot{0};
|
|
|
|
|
uint64_t us_pack{0};
|
|
|
|
|
uint64_t us_fence1{0};
|
|
|
|
|
uint64_t us_fence2{0};
|
|
|
|
|
uint64_t us_wait_host{0};
|
|
|
|
|
|
|
|
|
|
auto *bp{backend_proxy.get()};
|
|
|
|
|
|
2025-10-06 10:05:57 -07:00
|
|
|
for (size_t i = 0; i < envvar::max_num_contexts; i++) {
|
2024-07-01 09:57:08 -05:00
|
|
|
// Average latency as perceived from a thread
|
|
|
|
|
const ROStats &prof{bp->profiler[i]};
|
|
|
|
|
us_wait_slot += prof.getStat(WAITING_ON_SLOT) / gpu_frequency_mhz;
|
|
|
|
|
us_pack += prof.getStat(PACK_QUEUE) / gpu_frequency_mhz;
|
|
|
|
|
us_fence1 += prof.getStat(THREAD_FENCE_1) / gpu_frequency_mhz;
|
|
|
|
|
us_fence2 += prof.getStat(THREAD_FENCE_2) / gpu_frequency_mhz;
|
|
|
|
|
us_wait_host += prof.getStat(WAITING_ON_HOST) / gpu_frequency_mhz;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
constexpr int FIELD_WIDTH{20};
|
|
|
|
|
constexpr int FLOAT_PRECISION{2};
|
|
|
|
|
|
|
|
|
|
printf("%*s%*s%*s%*s%*s\n", FIELD_WIDTH + 1, "Wait On Slot (us)",
|
|
|
|
|
FIELD_WIDTH + 1, "Pack Queue (us)", FIELD_WIDTH + 1, "Fence 1 (us)",
|
|
|
|
|
FIELD_WIDTH + 1, "Fence 2 (us)", FIELD_WIDTH + 1, "Wait Host (us)");
|
|
|
|
|
|
|
|
|
|
printf("%*.*f %*.*f %*.*f %*.*f %*.*f\n\n", FIELD_WIDTH, FLOAT_PRECISION,
|
|
|
|
|
static_cast<double>(us_wait_slot) / total, FIELD_WIDTH,
|
|
|
|
|
FLOAT_PRECISION, static_cast<double>(us_pack) / total, FIELD_WIDTH,
|
|
|
|
|
FLOAT_PRECISION, static_cast<double>(us_fence1) / total, FIELD_WIDTH,
|
|
|
|
|
FLOAT_PRECISION, static_cast<double>(us_fence2) / total, FIELD_WIDTH,
|
|
|
|
|
FLOAT_PRECISION, static_cast<double>(us_wait_host) / total);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void ROBackend::ro_net_free_runtime() {
|
|
|
|
|
/*
|
|
|
|
|
* Validate that a handle was passed that is not a nullptr.
|
|
|
|
|
*/
|
|
|
|
|
auto *bp{backend_proxy.get()};
|
|
|
|
|
assert(bp);
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Set this flag to denote that the runtime is being torn down.
|
|
|
|
|
*/
|
|
|
|
|
bp->worker_thread_exit = true;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Tear down the worker threads.
|
|
|
|
|
*/
|
|
|
|
|
worker_thread.join();
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Tear down the transport object.
|
|
|
|
|
*/
|
|
|
|
|
while (!transport_->readyForFinalize()) {
|
|
|
|
|
}
|
|
|
|
|
transport_->finalizeTransport();
|
|
|
|
|
|
|
|
|
|
ro_window_proxy_->~WindowProxyT();
|
|
|
|
|
team_world_proxy_->~ROTeamProxy<HIPAllocator>();
|
|
|
|
|
transport_->~MPITransport();
|
|
|
|
|
/*
|
|
|
|
|
* Free the profiler statistics structure.
|
|
|
|
|
*/
|
|
|
|
|
// CHECK_HIP(hipFree(bp->profiler));
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Tear down team_world
|
|
|
|
|
*/
|
|
|
|
|
auto *team_world{team_tracker.get_team_world()};
|
|
|
|
|
team_world->~Team();
|
|
|
|
|
// CHECK_HIP(hipFree(team_world));
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Free the gpu_handle.
|
|
|
|
|
*/
|
|
|
|
|
// CHECK_HIP(hipHostFree(bp));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void ROBackend::ro_net_poll() {
|
|
|
|
|
auto *bp{backend_proxy.get()};
|
|
|
|
|
while (!bp->worker_thread_exit) {
|
|
|
|
|
for (size_t i{0}; i < poll_block_count_; i++) {
|
|
|
|
|
int16_t request_count{0};
|
|
|
|
|
const int16_t max_count{64};
|
|
|
|
|
bool processed_req{true};
|
|
|
|
|
while (processed_req && (request_count < max_count)) {
|
|
|
|
|
processed_req = queue_.process(i, transport_);
|
|
|
|
|
request_count++;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void ROBackend::initIPC() {
|
|
|
|
|
const auto &heap_bases{heap.get_heap_bases()};
|
|
|
|
|
|
|
|
|
|
ipcImpl.ipcHostInit(transport_->getMyPe(), heap_bases,
|
|
|
|
|
transport_->get_world_comm());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void ROBackend::global_exit(int status) { transport_->global_exit(status); }
|
|
|
|
|
|
|
|
|
|
} // namespace rocshmem
|