From 282fc7fe71f016e88b2ba6e0b30716a1cf9c1603 Mon Sep 17 00:00:00 2001
From: Yiltan <ytemucin@amd.com>
Date: Thu, 11 Sep 2025 18:11:52 -0400
Subject: [PATCH] Fix broken atomics from PR #233 (#251)

* The QueuePair object was out of scope at the end of the for loop.
  So the deconstructor was called.
* Although correct for C++ to do this, it ignores that we copied the QueuePair object into
  device memory and have an instance there.
* Early deconstruction resulted in calling ibv_dereg_mr on the atomics memory region.
  So when the GPU kernel tried to use the memory region it wasn't
  registered which resulted in a protection domain error.
* The solution was to allocate our QueuePair obj with the new operator which leaves memory
  management to us, then we can manually call the deconstructor.

[ROCm/rocshmem commit: e856fbb0ebe519f8743e483a0c7a39548ab74e93]
---
 projects/rocshmem/src/gda/backend_gda.cpp | 29 +++++++++++++++++++----
 projects/rocshmem/src/gda/backend_gda.hpp |  1 +
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/projects/rocshmem/src/gda/backend_gda.cpp b/projects/rocshmem/src/gda/backend_gda.cpp
index 792b9ba2c6..6d2aee55a5 100644
--- a/projects/rocshmem/src/gda/backend_gda.cpp
+++ b/projects/rocshmem/src/gda/backend_gda.cpp
@@ -715,16 +715,35 @@ void GDABackend::cleanup_heap_memory_rkey() {
 }
 
 void GDABackend::setup_gpu_qps() {
-  CHECK_HIP(hipMalloc(&gpu_qps, sizeof(QueuePair) * (maximum_num_contexts_ + 1) * num_pes));
-  for (int i = 0; i < (maximum_num_contexts_ + 1) * num_pes; i++) {
-    QueuePair qp(pd_orig);
-    CHECK_HIP(hipMemcpy(&gpu_qps[i], &qp, sizeof(QueuePair), hipMemcpyDefault));
+  size_t qp_objs_count;
+  size_t qp_objs_mem_size;
+
+  qp_objs_count    = (maximum_num_contexts_ + 1) * num_pes;
+  qp_objs_mem_size = sizeof(QueuePair) * qp_objs_count;
+
+  CHECK_HIP(hipMalloc(&gpu_qps, qp_objs_mem_size));
+
+  host_qps = (QueuePair*) malloc(qp_objs_mem_size);
+  CHECK_NNULL(host_qps, "malloc (host_qps)");
+
+  for (int i = 0; i < qp_objs_count; i++) {
+    new (&host_qps[i]) QueuePair(pd_orig);
+    CHECK_HIP(hipMemcpy(&gpu_qps[i], &host_qps[i], sizeof(QueuePair), hipMemcpyDefault));
     initialize_gpu_qp(&gpu_qps[i], i);
   }
 }
 
 void GDABackend::cleanup_gpu_qps() {
-  //TODO need to destruct qp[i]?
+  size_t qp_objs_count;
+
+  qp_objs_count = (maximum_num_contexts_ + 1) * num_pes;
+
+  for (int i = 0; i < qp_objs_count; i++) {
+    host_qps[i].~QueuePair();
+  }
+
+  free(host_qps);
+
   CHECK_HIP(hipFree(gpu_qps));
   gpu_qps = nullptr;
 }
diff --git a/projects/rocshmem/src/gda/backend_gda.hpp b/projects/rocshmem/src/gda/backend_gda.hpp
index f075901be8..f94f7bbe3b 100644
--- a/projects/rocshmem/src/gda/backend_gda.hpp
+++ b/projects/rocshmem/src/gda/backend_gda.hpp
@@ -119,6 +119,7 @@ class GDABackend : public Backend {
 
   uint32_t sq_size = 1024;
   uint32_t inline_threshold = 8;
+  QueuePair *host_qps = nullptr;
   QueuePair *gpu_qps = nullptr;
   std::vector<ibv_qp*> qps;
   std::vector<ibv_cq*> cqs;