diff --git a/projects/rocshmem/src/ipc/context_ipc_device.cpp b/projects/rocshmem/src/ipc/context_ipc_device.cpp index be3b11c0e4..1bb39b260f 100644 --- a/projects/rocshmem/src/ipc/context_ipc_device.cpp +++ b/projects/rocshmem/src/ipc/context_ipc_device.cpp @@ -161,7 +161,7 @@ __device__ void IPCContext::getmem_nbi_wave(void *dest, const void *source, __device__ void IPCContext::internal_putmem(void *dest, const void *source, size_t nelems, int pe) { uint64_t L_offset = reinterpret_cast(dest) - wrk_sync_pool_bases_[my_pe]; - memcpy(wrk_sync_pool_bases_[pe] + L_offset, const_cast(source), nelems); + memcpy_lane(wrk_sync_pool_bases_[pe] + L_offset, const_cast(source), nelems); ipcImpl_.ipcFence(); } @@ -169,7 +169,7 @@ __device__ void IPCContext::internal_getmem(void *dest, const void *source, size_t nelems, int pe) { const char *src_typed = reinterpret_cast(source); uint64_t L_offset = const_cast(src_typed) - wrk_sync_pool_bases_[my_pe]; - memcpy(dest, wrk_sync_pool_bases_[pe] + L_offset, nelems); + memcpy_lane(dest, wrk_sync_pool_bases_[pe] + L_offset, nelems); ipcImpl_.ipcFence(); } diff --git a/projects/rocshmem/src/ipc_policy.cpp b/projects/rocshmem/src/ipc_policy.cpp index 313607b1e1..63ce178db1 100644 --- a/projects/rocshmem/src/ipc_policy.cpp +++ b/projects/rocshmem/src/ipc_policy.cpp @@ -219,7 +219,7 @@ __host__ void IpcOnImpl::ipcHostStop() { } __device__ void IpcOnImpl::ipcCopy(void *dst, void *src, size_t size) { - memcpy(dst, src, size); + memcpy_lane(dst, src, size); } __device__ void IpcOnImpl::ipcCopy_wave(void *dst, void *src, size_t size) { diff --git a/projects/rocshmem/src/util.hpp b/projects/rocshmem/src/util.hpp index a221cf3a4f..d76ab0f3ef 100644 --- a/projects/rocshmem/src/util.hpp +++ b/projects/rocshmem/src/util.hpp @@ -385,7 +385,7 @@ __device__ void gpu_dprintf(const char* fmt, const Args&... args) { #define LOAD(VAR) __atomic_load_n((VAR), __ATOMIC_SEQ_CST) #define STORE(DST, SRC) __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST) -__device__ __forceinline__ void memcpy(void* dst, void* src, size_t size) { +__device__ __forceinline__ void memcpy_lane(void* dst, void* src, size_t size) { uint8_t* dst_bytes{static_cast(dst)}; uint8_t* src_bytes{static_cast(src)};