From 58f96af7ec51d55eca240a30dc22acbed8003322 Mon Sep 17 00:00:00 2001 From: Yiltan Date: Mon, 8 Sep 2025 10:57:39 -0400 Subject: [PATCH] Implemented workgroup puts (#238) --- scripts/functional_tests/driver.sh | 12 ++++++------ src/gda/context_gda_device.cpp | 9 +++++---- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/scripts/functional_tests/driver.sh b/scripts/functional_tests/driver.sh index 269092ef72..c5fdb595f1 100755 --- a/scripts/functional_tests/driver.sh +++ b/scripts/functional_tests/driver.sh @@ -450,9 +450,9 @@ TestGDA() { ExecTest "put" 2 32 256 512 ExecTest "put" 2 64 1024 8 -# ExecTest "wgput" 2 1 64 1048576 -# ExecTest "wgput" 2 2 64 1048576 -# ExecTest "wgput" 2 16 64 8 + ExecTest "wgput" 2 1 64 1048576 + ExecTest "wgput" 2 2 64 1048576 + ExecTest "wgput" 2 16 64 8 ExecTest "waveput" 2 1 64 1048576 ExecTest "waveput" 2 2 64 1048576 @@ -503,9 +503,9 @@ TestGDA() { ExecTest "putnbi" 2 32 256 512 ExecTest "putnbi" 2 64 1024 8 -# ExecTest "wgputnbi" 2 1 64 1048576 -# ExecTest "wgputnbi" 2 2 64 1048576 -# ExecTest "wgputnbi" 2 16 64 8 + ExecTest "wgputnbi" 2 1 64 1048576 + ExecTest "wgputnbi" 2 2 64 1048576 + ExecTest "wgputnbi" 2 16 64 8 ExecTest "waveputnbi" 2 1 64 1048576 ExecTest "waveputnbi" 2 2 64 1048576 diff --git a/src/gda/context_gda_device.cpp b/src/gda/context_gda_device.cpp index 52eef85623..3a7266ae1c 100644 --- a/src/gda/context_gda_device.cpp +++ b/src/gda/context_gda_device.cpp @@ -130,9 +130,10 @@ __device__ void *GDAContext::shmem_ptr(const void *dest, int pe) { __device__ void GDAContext::putmem_wg(void *dest, const void *source, size_t nelems, int pe) { + uint64_t L_offset = reinterpret_cast<char *>(dest) - base_heap[my_pe]; if (is_thread_zero_in_block()) { - printf("rocshmem::gda:putmem_wg not implemented\n"); - abort(); + qps[pe].put_nbi(base_heap[pe] + L_offset, source, nelems, pe); + qps[pe].quiet(); } } @@ -146,9 +147,9 @@ __device__ void GDAContext::getmem_wg(void *dest, const void *source, __device__ void 
GDAContext::putmem_nbi_wg(void *dest, const void *source, size_t nelems, int pe) { + uint64_t L_offset = reinterpret_cast<char *>(dest) - base_heap[my_pe]; if (is_thread_zero_in_block()) { - printf("rocshmem::gda:putmem_nbi_wg not implemented\n"); - abort(); + qps[pe].put_nbi(base_heap[pe] + L_offset, source, nelems, pe); } }