Add IPC and/or/xor/swap amos, reenable functional tests (#184)

Αυτή η υποβολή περιλαμβάνεται σε:
Aurelien Bouteiller
2025-09-22 13:02:02 -04:00
υποβλήθηκε από GitHub
γονέας f5aefd15f3
υποβολή 96336da78f
4 αρχεία άλλαξαν με 83 προσθήκες και 66 διαγραφές
@@ -85,49 +85,59 @@ __device__ void IPCContext::amo_set(void *dest, T value, int pe) {
}
template <typename T>
__device__ T IPCContext::amo_swap(void *dst, T value, int pe) {
printf("IPC amo_swap not implemented\n");
abort();
return 0;
__device__ T IPCContext::amo_swap(void *dest, T value, int pe) {
uint64_t L_offset =
reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
return ipcImpl_.ipcAMOSwap(
reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset), value);
}
template <typename T>
__device__ T IPCContext::amo_fetch_and(void *dst, T value, int pe) {
printf("IPC amo_fetch_and not implemented\n");
abort();
return 0;
__device__ T IPCContext::amo_fetch_and(void *dest, T value, int pe) {
uint64_t L_offset =
reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
return ipcImpl_.ipcAMOFetchAnd(
reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset), value);
}
template <typename T>
__device__ void IPCContext::amo_and(void *dst, T value, int pe) {
printf("IPC amo_and not implemented\n");
abort();
__device__ void IPCContext::amo_and(void *dest, T value, int pe) {
uint64_t L_offset =
reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
ipcImpl_.ipcAMOAnd(
reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset), value);
}
template <typename T>
__device__ T IPCContext::amo_fetch_or(void *dst, T value, int pe) {
printf("IPC amo_fetch_or not implemented\n");
abort();
return 0;
__device__ T IPCContext::amo_fetch_or(void *dest, T value, int pe) {
uint64_t L_offset =
reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
return ipcImpl_.ipcAMOFetchOr(
reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset), value);
}
template <typename T>
__device__ void IPCContext::amo_or(void *dst, T value, int pe) {
printf("IPC amo_or not implemented\n");
abort();
__device__ void IPCContext::amo_or(void *dest, T value, int pe) {
uint64_t L_offset =
reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
ipcImpl_.ipcAMOOr(
reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset), value);
}
template <typename T>
__device__ T IPCContext::amo_fetch_xor(void *dst, T value, int pe) {
printf("IPC amo_fetch_xor not implemented\n");
abort();
return 0;
__device__ T IPCContext::amo_fetch_xor(void *dest, T value, int pe) {
uint64_t L_offset =
reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
return ipcImpl_.ipcAMOFetchXor(
reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset), value);
}
template <typename T>
__device__ void IPCContext::amo_xor(void *dst, T value, int pe) {
printf("IPC amo_xor not implemented\n");
abort();
__device__ void IPCContext::amo_xor(void *dest, T value, int pe) {
uint64_t L_offset =
reinterpret_cast<char *>(dest) - ipcImpl_.ipc_bases[my_pe];
ipcImpl_.ipcAMOXor(
reinterpret_cast<T *>(ipcImpl_.ipc_bases[pe] + L_offset), value);
}
template <typename T>
+48 -13
Προβολή Αρχείου
@@ -84,12 +84,25 @@ class IpcOnImpl {
__device__ void ipcFence() { __threadfence_system(); }
template <typename T>
__device__ void ipcAMOAdd(T *val, T value) {
__hip_atomic_fetch_add(val, value, __ATOMIC_SEQ_CST,
__HIP_MEMORY_SCOPE_SYSTEM);
}
template <typename T>
__device__ T ipcAMOFetchAdd(T *val, T value) {
return __hip_atomic_fetch_add(val, value, __ATOMIC_SEQ_CST,
__HIP_MEMORY_SCOPE_SYSTEM);
}
template <typename T>
__device__ void ipcAMOCas(T *val, T cond, T value) {
__hip_atomic_compare_exchange_strong(val, &cond, value, __ATOMIC_SEQ_CST,
__ATOMIC_SEQ_CST,
__HIP_MEMORY_SCOPE_SYSTEM);
}
template <typename T>
__device__ T ipcAMOFetchCas(T *val, T cond, T value) {
__hip_atomic_compare_exchange_strong(val, &cond, value, __ATOMIC_SEQ_CST,
@@ -98,24 +111,46 @@ class IpcOnImpl {
return cond;
}
template <typename T>
__device__ void ipcAMOAdd(T *val, T value) {
__hip_atomic_fetch_add(val, value, __ATOMIC_SEQ_CST,
__HIP_MEMORY_SCOPE_SYSTEM);
}
template <typename T>
__device__ void ipcAMOCas(T *val, T cond, T value) {
__hip_atomic_compare_exchange_strong(val, &cond, value, __ATOMIC_SEQ_CST,
__ATOMIC_SEQ_CST,
__HIP_MEMORY_SCOPE_SYSTEM);
}
template <typename T>
__device__ void ipcAMOSet(T *val, T value) {
__hip_atomic_store(val, value, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
}
template <typename T>
__device__ T ipcAMOSwap(T *val, T value) {
return __hip_atomic_exchange(val, value, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
}
template <typename T>
__device__ void ipcAMOAnd(T *val, T value) {
__hip_atomic_fetch_and(val, value, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
}
template <typename T>
__device__ T ipcAMOFetchAnd(T *val, T value) {
return __hip_atomic_fetch_and(val, value, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
}
template <typename T>
__device__ void ipcAMOOr(T *val, T value) {
__hip_atomic_fetch_or(val, value, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
}
template <typename T>
__device__ T ipcAMOFetchOr(T *val, T value) {
return __hip_atomic_fetch_or(val, value, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
}
template <typename T>
__device__ void ipcAMOXor(T *val, T value) {
__hip_atomic_fetch_xor(val, value, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
}
template <typename T>
__device__ T ipcAMOFetchXor(T *val, T value) {
return __hip_atomic_fetch_xor(val, value, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__ void zero_byte_read(int pe) {
int local_pe = pe % shm_size;
uint32_t *pe_ipc_base = reinterpret_cast<uint32_t *>(ipc_bases[local_pe]);
@@ -75,19 +75,9 @@ void AMOBitwiseTester<T>::launchKernel(dim3 gridsize, dim3 blocksize, int loop,
num_timed_msgs = loop;
}
#if defined(USE_IPC) and not defined(USE_RO)
#define DISABLE_IPC_TEST 1
#else
#define DISABLE_IPC_TEST 0
#endif
template <typename T>
void AMOBitwiseTester<T>::verifyResults(size_t size) {
T ret;
if(DISABLE_IPC_TEST) {
printf("AMO binary ops not implemented for IPC: values were not verified\n");
return;
}
if (args.myid == 0) {
T expected_val = 0;
@@ -141,10 +131,6 @@ void AMOBitwiseTester<T>::verifyResults(size_t size) {
TestType type, ShmemContextType ctx_type) { \
__shared__ rocshmem_ctx_t ctx; \
int wg_id = get_flat_grid_id(); \
if(DISABLE_IPC_TEST) { \
printf("AMO binary ops not implemented for IPC: test was not run\n"); \
return; \
} \
rocshmem_wg_init(); \
rocshmem_wg_ctx_create(ctx_type, &ctx); \
if (hipThreadIdx_x == 0) { \
@@ -75,12 +75,6 @@ void AMOExtendedTester<T>::launchKernel(dim3 gridsize, dim3 blocksize, int loop,
num_timed_msgs = loop;
}
#if defined(USE_IPC) and not defined(USE_RO)
#define DISABLE_IPC_TEST 1
#else
#define DISABLE_IPC_TEST 0
#endif
template <typename T>
void AMOExtendedTester<T>::verifyResults(size_t size) {
T ret;
@@ -95,10 +89,6 @@ void AMOExtendedTester<T>::verifyResults(size_t size) {
expected_val = 44;
break;
case AMO_SwapTestType:
if(DISABLE_IPC_TEST) {
printf("AMO Swap not implemented for IPC: values were not verified\n");
return;
}
expected_val = num_msgs / 2;
break;
default:
@@ -146,10 +136,6 @@ void AMOExtendedTester<T>::verifyResults(size_t size) {
rocshmem_ctx_##TNAME##_atomic_set(ctx, (T *)r_buf, 44, 1); \
break; \
case AMO_SwapTestType: \
if(DISABLE_IPC_TEST) { \
printf("AMO Swap not implemented for IPC: test was not run\n"); \
break; \
} \
ret = rocshmem_ctx_##TNAME##_atomic_swap(ctx, (T *)r_buf, \
ret + 1, 1); \
break; \