From 282fc7fe71f016e88b2ba6e0b30716a1cf9c1603 Mon Sep 17 00:00:00 2001 From: Yiltan Date: Thu, 11 Sep 2025 18:11:52 -0400 Subject: [PATCH] Fix broken atomics from PR #233 (#251) * The QueuePair object was out of scope at the end of the for loop. So the deconstructor was called. * Although correct for C++ to do this, it ignores that we copied the QueuePair object into device memory and have an instance there. * Early deconstruction resulted in calling ibv_dereg_mr on the atomics memory region. So when the GPU kernel tried to use the memory region it wasn't registered which resulted in a protection domain error. * The solution was to allocate our QueuePair obj with the new operator which leaves memory management to us, then we can manually call the deconstructor. [ROCm/rocshmem commit: e856fbb0ebe519f8743e483a0c7a39548ab74e93] --- projects/rocshmem/src/gda/backend_gda.cpp | 29 +++++++++++++++++++---- projects/rocshmem/src/gda/backend_gda.hpp | 1 + 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/projects/rocshmem/src/gda/backend_gda.cpp b/projects/rocshmem/src/gda/backend_gda.cpp index 792b9ba2c6..6d2aee55a5 100644 --- a/projects/rocshmem/src/gda/backend_gda.cpp +++ b/projects/rocshmem/src/gda/backend_gda.cpp @@ -715,16 +715,35 @@ void GDABackend::cleanup_heap_memory_rkey() { } void GDABackend::setup_gpu_qps() { - CHECK_HIP(hipMalloc(&gpu_qps, sizeof(QueuePair) * (maximum_num_contexts_ + 1) * num_pes)); - for (int i = 0; i < (maximum_num_contexts_ + 1) * num_pes; i++) { - QueuePair qp(pd_orig); - CHECK_HIP(hipMemcpy(&gpu_qps[i], &qp, sizeof(QueuePair), hipMemcpyDefault)); + size_t qp_objs_count; + size_t qp_objs_mem_size; + + qp_objs_count = (maximum_num_contexts_ + 1) * num_pes; + qp_objs_mem_size = sizeof(QueuePair) * qp_objs_count; + + CHECK_HIP(hipMalloc(&gpu_qps, qp_objs_mem_size)); + + host_qps = (QueuePair*) malloc(qp_objs_mem_size); + CHECK_NNULL(host_qps, "malloc (host_qps)"); + + for (int i = 0; i < qp_objs_count; i++) { + new (&host_qps[i]) QueuePair(pd_orig); + CHECK_HIP(hipMemcpy(&gpu_qps[i], &host_qps[i], sizeof(QueuePair), hipMemcpyDefault)); initialize_gpu_qp(&gpu_qps[i], i); } } void GDABackend::cleanup_gpu_qps() { - //TODO need to destruct qp[i]? + size_t qp_objs_count; + + qp_objs_count = (maximum_num_contexts_ + 1) * num_pes; + + for (int i = 0; i < qp_objs_count; i++) { + host_qps[i].~QueuePair(); + } + + free(host_qps); + CHECK_HIP(hipFree(gpu_qps)); gpu_qps = nullptr; } diff --git a/projects/rocshmem/src/gda/backend_gda.hpp b/projects/rocshmem/src/gda/backend_gda.hpp index f075901be8..f94f7bbe3b 100644 --- a/projects/rocshmem/src/gda/backend_gda.hpp +++ b/projects/rocshmem/src/gda/backend_gda.hpp @@ -119,6 +119,7 @@ class GDABackend : public Backend { uint32_t sq_size = 1024; uint32_t inline_threshold = 8; + QueuePair *host_qps = nullptr; QueuePair *gpu_qps = nullptr; std::vector qps; std::vector cqs;