diff --git a/scripts/functional_tests/driver.sh b/scripts/functional_tests/driver.sh
index b1ff5c7cd0..9eaaa9abe7 100755
--- a/scripts/functional_tests/driver.sh
+++ b/scripts/functional_tests/driver.sh
@@ -488,11 +488,10 @@ TestGDA() {
 #  ExecTest  "g"                2       8            1         32
 #  ExecTest  "g"                2       16           128       4
 
-#Implemented but known incorrect
-#  ExecTest  "p"                2       1            1         128
-#  ExecTest  "p"                2       1            1024      2
-#  ExecTest  "p"                2       8            1         32
-#  ExecTest  "p"                2       16           128       4
+  ExecTest  "p"                2       1            1         128
+  ExecTest  "p"                2       1            1024      2
+  ExecTest  "p"                2       8            1         32
+  ExecTest  "p"                2       16           128       4
 
   ################################ Non-Blocking ################################
 
@@ -608,9 +607,9 @@ TestGDA() {
   ##############################################################################
   ExecTest  "init"             2       1            1
 
-#  ExecTest  "pingpong"         2       1            1
-#  ExecTest  "pingpong"         2       8            1
-#  ExecTest  "pingpong"         2       32           1
+  ExecTest  "pingpong"         2       1            1
+  ExecTest  "pingpong"         2       8            1
+  ExecTest  "pingpong"         2       32           1
 
   # This test requires more contexts than workgroups
   export ROCSHMEM_MAX_NUM_CONTEXTS=1024
diff --git a/src/gda/backend_gda.cpp b/src/gda/backend_gda.cpp
index 242e8005c3..8bff6aaf1c 100644
--- a/src/gda/backend_gda.cpp
+++ b/src/gda/backend_gda.cpp
@@ -1043,6 +1043,7 @@ void GDABackend::initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
   gpu_qp->rkey = htobe32(heap_rkey[conn_num % num_pes]);
   gpu_qp->lkey = htobe32(heap_mr->lkey);
   gpu_qp->qp_num = qps[conn_num]->qp_num;
+  gpu_qp->inline_threshold = inline_threshold;
   // The 2 in qp_out.bf.size * 2 below facilitates the switching between blue flame registers
   void* gpu_ptr{nullptr};
   rocm_memory_lock_to_fine_grain(qp_out.bf.reg, qp_out.bf.size * 2, &gpu_ptr, hip_dev_id);
@@ -1056,7 +1057,7 @@ void GDABackend::create_qps(int sq_length) {
   memset(&attr, 0, sizeof(struct ibv_qp_init_attr_ex));
   attr.cap.max_send_wr     = sq_length;
   attr.cap.max_send_sge    = 1;
-  attr.cap.max_inline_data = 0;
+  attr.cap.max_inline_data = inline_threshold;
 #ifdef GDA_IONIC
   attr.cap.max_recv_sge    = 1; // TODO allow zero sges in the driver
 #endif
diff --git a/src/gda/backend_gda.hpp b/src/gda/backend_gda.hpp
index db055c1a3c..c4492bbe1c 100644
--- a/src/gda/backend_gda.hpp
+++ b/src/gda/backend_gda.hpp
@@ -64,6 +64,7 @@ class GDABackend : public Backend {
   struct ibv_mr *heap_mr = nullptr;
 
   uint32_t sq_size = 1024;
+  uint32_t inline_threshold = 8;
   QueuePair *gpu_qps = nullptr;
   std::vector<ibv_qp*> qps;
   std::vector<ibv_cq*> cqs;
diff --git a/src/gda/bnxt/backend_gda_bnxt.cpp b/src/gda/bnxt/backend_gda_bnxt.cpp
index 10eeb0dd3b..4743ccb3fb 100644
--- a/src/gda/bnxt/backend_gda_bnxt.cpp
+++ b/src/gda/bnxt/backend_gda_bnxt.cpp
@@ -85,6 +85,9 @@ void GDABackend::initialize_gpu_qp(QueuePair* gpu_qp, int conn_num) {
   /* Export Memory Keys */
   gpu_qp->lkey = heap_mr->lkey;
   gpu_qp->rkey = heap_rkey[conn_num % num_pes];
+
+  /* Export Inline Threshold */
+  gpu_qp->inline_threshold = inline_threshold;
 }
 
 void GDABackend::create_cqs(int cqe) {
@@ -143,7 +146,7 @@ void GDABackend::create_qps(int sq_length) {
     ib_qp_attr.cap.max_recv_wr     = 0;
     ib_qp_attr.cap.max_send_sge    = 1;
     ib_qp_attr.cap.max_recv_sge    = 0;
-    ib_qp_attr.cap.max_inline_data = 0;
+    ib_qp_attr.cap.max_inline_data = inline_threshold;
     ib_qp_attr.qp_type             = IBV_QPT_RC;
     ib_qp_attr.sq_sig_all          = 0;
 
diff --git a/src/gda/bnxt/queue_pair_bnxt.cpp b/src/gda/bnxt/queue_pair_bnxt.cpp
index a141e3340e..2eea3e45af 100644
--- a/src/gda/bnxt/queue_pair_bnxt.cpp
+++ b/src/gda/bnxt/queue_pair_bnxt.cpp
@@ -247,17 +247,26 @@ __device__ void QueuePair::post_wqe_rma(int pe, int32_t length, uintptr_t *laddr
       uint32_t wqe_size;
       uint32_t wqe_type;
       uint32_t hdr_flags;
-      uint32_t rma_slots = 3; // (Three slots: hdr, rdma)
+      uint32_t inline_msg;
+
+      uint32_t rma_slots  = 3; // (Three slots: hdr, rdma, sge)
+
+      inline_msg = length <= inline_threshold &&
+                   opcode == GDA_OP_RDMA_WRITE;
 
       hdr_ptr  = (struct bnxt_re_bsqe*) bnxt_re_get_hwqe(&sq, 0);
       rdma_ptr = (struct bnxt_re_rdma*) bnxt_re_get_hwqe(&sq, 1);
       sge_ptr  = (struct bnxt_re_sge*)  bnxt_re_get_hwqe(&sq, 2);
 
       /* Populate Header Segment */
-      wqe_size  = BNXT_RE_HDR_WS_MASK    & rma_slots;
+      wqe_type  = BNXT_RE_HDR_WT_MASK & opcode;
+      wqe_size  = BNXT_RE_HDR_WS_MASK & rma_slots;
       hdr_flags = ((uint32_t) BNXT_RE_HDR_FLAGS_MASK)
                 & ((uint32_t) BNXT_RE_WR_FLAGS_SIGNALED);
-      wqe_type  = BNXT_RE_HDR_WT_MASK    & opcode;
+
+      if (inline_msg) {
+        hdr_flags |= ((uint32_t) BNXT_RE_WR_FLAGS_INLINE);
+      }
 
       hdr.rsv_ws_fl_wt  = (wqe_size  << BNXT_RE_HDR_WS_SHIFT)
                         | (hdr_flags << BNXT_RE_HDR_FLAGS_SHIFT)
@@ -269,15 +278,22 @@ __device__ void QueuePair::post_wqe_rma(int pe, int32_t length, uintptr_t *laddr
       rdma.rva  = (uint64_t) raddr;
       rdma.rkey = rkey;
 
-      /* Populate SG Segment */
-      sge.pa     = (uint64_t) laddr;
-      sge.lkey   = lkey;
-      sge.length = length;
+      if (!inline_msg) {
+        /* Populate SG Segment */
+        sge.pa     = (uint64_t) laddr;
+        sge.lkey   = lkey;
+        sge.length = length;
+      }
 
       /* Write WQE to SQ */
       memcpy(hdr_ptr,  &hdr,  sizeof(struct bnxt_re_bsqe));
       memcpy(rdma_ptr, &rdma, sizeof(struct bnxt_re_rdma));
-      memcpy(sge_ptr,  &sge,  sizeof(struct bnxt_re_sge));
+
+      if (inline_msg) {
+        memcpy(sge_ptr,  laddr,  length);
+      } else {
+        memcpy(sge_ptr,  &sge,  sizeof(struct bnxt_re_sge));
+      }
 
       /* Populate MSN Table */
       bnxt_re_fill_psns_for_msntbl(&sq, length);
diff --git a/src/gda/context_gda_tmpl_device.hpp b/src/gda/context_gda_tmpl_device.hpp
index 265afb232d..e28c5e0aa8 100644
--- a/src/gda/context_gda_tmpl_device.hpp
+++ b/src/gda/context_gda_tmpl_device.hpp
@@ -42,10 +42,7 @@ namespace rocshmem {
  *****************************************************************************/
 template <typename T>
 __device__ void GDAContext::p(T *dest, T value, int pe) {
-  printf("rocshmem::gda:p not implemented\n");
-  abort();
-  //TODO the following is incorrect because value is not ibv registered memory
-  //putmem_nbi(dest, &value, sizeof(T), pe);
+  putmem_nbi(dest, &value, sizeof(T), pe);
 }
 
 template <typename T>
diff --git a/src/gda/queue_pair.cpp b/src/gda/queue_pair.cpp
index 397df577b9..e4ec342854 100644
--- a/src/gda/queue_pair.cpp
+++ b/src/gda/queue_pair.cpp
@@ -446,7 +446,13 @@ __device__ void QueuePair::post_wqe_rma(int pe, int32_t size, uintptr_t *laddr,
   SegmentBuilder seg_build(my_sq_index, sq_buf);
   seg_build.update_ctrl_seg(my_sq_counter, opcode, 0, qp_num, MLX5_WQE_CTRL_CQ_UPDATE, 3, 0, 0);
   seg_build.update_raddr_seg(raddr, rkey);
-  seg_build.update_data_seg(laddr, size, lkey);
+
+  if (size <= inline_threshold && opcode == GDA_OP_RDMA_WRITE) {
+    seg_build.update_inl_data_seg(laddr, size);
+  } else {
+    seg_build.update_data_seg(laddr, size, lkey);
+  }
+
   __atomic_signal_fence(__ATOMIC_SEQ_CST);
 
   if (is_leader) {
diff --git a/src/gda/queue_pair.hpp b/src/gda/queue_pair.hpp
index df7e83ba5f..4356116341 100644
--- a/src/gda/queue_pair.hpp
+++ b/src/gda/queue_pair.hpp
@@ -234,8 +234,6 @@ class QueuePair {
   uint32_t sq_prod{0};
   uint32_t sq_msn{0};
 
-  uint32_t inline_threshold{0};
-
 #elif defined(GDA_BNXT)
   uint64_t *dbr;
   struct bnxt_device_cq cq;
@@ -305,6 +303,8 @@ class QueuePair {
 
 #endif // GDA_IONIC
 
+  uint32_t inline_threshold{0};
+
   uint32_t qp_num{0};
   uint32_t rkey{0};
   uint32_t lkey{0};
diff --git a/src/gda/segment_builder.cpp b/src/gda/segment_builder.cpp
index ca3033d103..f9f6c9046a 100644
--- a/src/gda/segment_builder.cpp
+++ b/src/gda/segment_builder.cpp
@@ -108,6 +108,14 @@ __device__ void SegmentBuilder::update_data_seg(uintptr_t *address, uint32_t len
   segp++;
 }
 
+__device__ void SegmentBuilder::update_inl_data_seg(uintptr_t *laddr, int32_t size) {
+  // size is masked with 0x3FF because only the first 10 bits of byte_count are valid
+  swap_endian_store(&segp->inl_data_seg.byte_count, ((size & 0x3FF) | MLX5_INLINE_SEG));
+  // + 1 because we start packing the segment with data after the byte_count parameter
+  memcpy(&segp->inl_data_seg + 1, laddr, size);
+  segp++;
+}
+
 __device__ void SegmentBuilder::update_atomic_seg(uint64_t atomic_data, uint64_t atomic_cmp) {
   segp->atomic_seg = {0};
   swap_endian_store(reinterpret_cast<uint64_t*>(&segp->atomic_seg.swap_add), atomic_data);
diff --git a/src/gda/segment_builder.hpp b/src/gda/segment_builder.hpp
index 130a9604f3..c5ce93aca6 100644
--- a/src/gda/segment_builder.hpp
+++ b/src/gda/segment_builder.hpp
@@ -32,58 +32,32 @@
 namespace rocshmem {
 
 class SegmentBuilder {
- public:
-  __device__ SegmentBuilder(uint64_t wqe_idx, void *base);
+  public:
+    __device__ SegmentBuilder(uint64_t wqe_idx, void *base);
 
-  /*
-   * struct mlx5_wqe_ctrl_seg {
-   *   __be32 opmod_idx_opcode;
-   *   __be32 qpn_ds;
-   *   uint8_t signature;
-   *   __be16 dci_stream_channel_id;
-   *   uint8_t fm_ce_se;
-   *   __be32 imm;
-   * } __attribute__((__packed__)) __attribute__((__aligned__(4)));
-   */
-  __device__ void update_ctrl_seg(uint16_t pi, uint8_t opcode, uint8_t opmod, uint32_t qp_num, uint8_t fm_ce_se, uint8_t ds, uint8_t signature, uint32_t imm);
+    __device__ void update_ctrl_seg(uint16_t pi, uint8_t opcode, uint8_t opmod, uint32_t qp_num,
+                                    uint8_t fm_ce_se, uint8_t ds, uint8_t signature, uint32_t imm);
 
-  /*
-   * struct mlx5_wqe_raddr_seg {
-   *   __be64 raddr;
-   *   __be32 rkey;
-   *   __be32 reserved;
-   * };
-   */
-  __device__ void update_raddr_seg(uint64_t *raddr, uint32_t rkey);
+    __device__ void update_raddr_seg(uint64_t *raddr, uint32_t rkey);
 
-  /*
-   * struct mlx5_wqe_data_seg {
-   * __be32 byte_count;
-   * __be32 lkey;
-   * __be64 addr;
-   * };
-   */
-  __device__ void update_data_seg(uint64_t *laddr, uint32_t size, uint32_t lkey);
+    __device__ void update_data_seg(uint64_t *laddr, uint32_t size, uint32_t lkey);
 
-  /*
-   * struct mlx5_wqe_atomic_seg {
-   *   __be64 swap_add;
-   *   __be64 compare;
-   * };
-   */
-  __device__ void update_atomic_seg(uint64_t atomic_data, uint64_t atomic_cmp);
+    __device__ void update_inl_data_seg(uintptr_t *laddr, int32_t size);
 
- private:
-  const int SEGMENTS_PER_WQE = 4;
+    __device__ void update_atomic_seg(uint64_t atomic_data, uint64_t atomic_cmp);
 
-  union mlx5_segment {
-    mlx5_wqe_ctrl_seg ctrl_seg;
-    mlx5_wqe_raddr_seg raddr_seg;
-    mlx5_wqe_data_seg data_seg;
-    mlx5_wqe_atomic_seg atomic_seg;
-  };
+  private:
+    const int SEGMENTS_PER_WQE = 4;
 
-  mlx5_segment *segp;
+    union mlx5_segment {
+      mlx5_wqe_ctrl_seg ctrl_seg;
+      mlx5_wqe_raddr_seg raddr_seg;
+      mlx5_wqe_data_seg data_seg;
+      mlx5_wqe_inl_data_seg inl_data_seg;
+      mlx5_wqe_atomic_seg atomic_seg;
+    };
+
+    mlx5_segment *segp;
 };
 
 }  // namespace rocshmem