From e31b4d42e57b246de2d63bed72de2df8cd2def8e Mon Sep 17 00:00:00 2001 From: Avinash Kethineedi Date: Mon, 6 Oct 2025 11:50:50 -0400 Subject: [PATCH] Update atomic functional tests (#262) * feat: implement function to return number of blocks in grid. * test: update atomics functional tests - Standard atomic tests: `atomic_add`, `atomic_inc`, `fetch_atomic_add`, `fetch_atomic_inc`, and `fetch_compare_and_swap` - Bitwise atomic tests: `atomic_and`, `atomic_or`, `atomic_xor`, fetch_atomic_and`, `fetch_atomic_or`, and `fetch_atomic_xor` - Extended atomic tests: `atomic_fetch`, `atomic_set`, and `atomic_swap` * Added two different address modes for atomics. * Added all supported data types for atomics tests. [ROCm/rocshmem commit: 0a4f8a83b989fa38830cd382c388807d34808dbd] --- projects/rocshmem/src/util.hpp | 7 + .../functional_tests/amo_bitwise_tester.cpp | 315 ++++++++++++------ .../functional_tests/amo_bitwise_tester.hpp | 17 +- .../functional_tests/amo_extended_tester.cpp | 247 ++++++++++---- .../functional_tests/amo_extended_tester.hpp | 17 +- .../functional_tests/amo_standard_tester.cpp | 304 ++++++++++++----- .../functional_tests/amo_standard_tester.hpp | 17 +- .../functional_tests/tester_arguments.cpp | 8 + .../functional_tests/tester_arguments.hpp | 9 + 9 files changed, 676 insertions(+), 265 deletions(-) diff --git a/projects/rocshmem/src/util.hpp b/projects/rocshmem/src/util.hpp index d06e82c857..a791c875a0 100644 --- a/projects/rocshmem/src/util.hpp +++ b/projects/rocshmem/src/util.hpp @@ -191,6 +191,13 @@ __device__ __forceinline__ int get_flat_block_id() { hipThreadIdx_z * hipBlockDim_x * hipBlockDim_y; } +/* + * Returns the number of blocks in the caller's flattened grid. + */ +__device__ __forceinline__ int get_grid_num_blocks() { + return hipGridDim_x * hipGridDim_y * hipGridDim_z; +} + /* * Returns the flattened block index that the calling thread is a member of in * in the grid. Callers from the same block will have the same index. diff --git a/projects/rocshmem/tests/functional_tests/amo_bitwise_tester.cpp b/projects/rocshmem/tests/functional_tests/amo_bitwise_tester.cpp index daee50a913..25c8c6dfe5 100644 --- a/projects/rocshmem/tests/functional_tests/amo_bitwise_tester.cpp +++ b/projects/rocshmem/tests/functional_tests/amo_bitwise_tester.cpp @@ -32,33 +32,54 @@ using namespace rocshmem; /* Declare the global kernel template with a generic implementation */ template __global__ void AMOBitwiseTest(int loop, int skip, long long int *start_time, - long long int *end_time, char *r_buf, - T *s_buf, T *ret_val, TestType type, + long long int *end_time, T *dest, T *ret_val, + AddrMode addr_mode, TestType type, ShmemContextType ctx_type) { return; } +template +__device__ inline T* compute_target_ptr(T* base_ptr, AddrMode addr_mode, + int wg_idx, int itr, int n_wgs) { + // PerBlock: element = wg_idx, with n_wgs elements per loop + // PerGrid : single element shared by the whole grid per loop + if (addr_mode == AddrMode::PerBlock) { + size_t offset = wg_idx + itr * n_wgs; + return base_ptr + offset; + } else { // PerGrid + return base_ptr + itr; + } +} + /****************************************************************************** * HOST TESTER CLASS METHODS *****************************************************************************/ template AMOBitwiseTester::AMOBitwiseTester(TesterArguments args) : Tester(args) { - CHECK_HIP(hipMalloc((void **)&_ret_val, args.max_msg_size * args.num_wgs)); - _r_buf = (char *)rocshmem_malloc(args.max_msg_size); - _s_buf = (T *)rocshmem_malloc(args.max_msg_size * args.num_wgs); + n_out = (args.addr_mode == AddrMode::PerBlock) ? args.num_wgs : 1; + n_in = args.num_wgs * args.wg_size; + n_loops = args.loop + args.skip; + + // One return per *thread* per loop + CHECK_HIP(hipMalloc((void **)&ret_val, args.max_msg_size * n_in * n_loops)); + + dest = (T *)rocshmem_malloc(args.max_msg_size * n_out * n_loops); + if (dest == nullptr) { + std::cerr << "Error allocating memory from symmetric heap" << std::endl; + std::cerr << "dest: " << (void*)dest << std::endl; + } } template AMOBitwiseTester::~AMOBitwiseTester() { - rocshmem_free(_r_buf); - CHECK_HIP(hipFree(_ret_val)); + CHECK_HIP(hipFree(ret_val)); + rocshmem_free(dest); } template void AMOBitwiseTester::resetBuffers(size_t size) { - memset(_r_buf, 0, args.max_msg_size); - memset(_ret_val, 0, args.max_msg_size * args.num_wgs); - memset(_s_buf, 0, args.max_msg_size * args.num_wgs); + memset(ret_val, 0, args.max_msg_size * n_in * n_loops); + memset(dest, 0, args.max_msg_size * n_out * n_loops); } template @@ -67,59 +88,158 @@ void AMOBitwiseTester::launchKernel(dim3 gridsize, dim3 blocksize, int loop, size_t shared_bytes = 0; hipLaunchKernelGGL(AMOBitwiseTest, gridsize, blocksize, shared_bytes, stream, - loop, args.skip, start_time, end_time, _r_buf, _s_buf, - _ret_val, _type, _shmem_context); + args.loop, args.skip, start_time, end_time, dest, + ret_val, args.addr_mode, _type, _shmem_context); - _gridSize = gridsize; - num_msgs = (loop + args.skip) * gridsize.x; - num_timed_msgs = loop; + num_msgs = n_loops * gridsize.x * blocksize.x; + num_timed_msgs = args.loop * gridsize.x * blocksize.x; +} + +template +void fail_eq(const G& got, const G& exp) { + std::cerr << "data validation error\n" + << "got " << got << ", expected " << exp << std::endl; + std::exit(-1); +} + +// Map (loop, elem_idx) -> dest[] index for current address mode. +template +int AMOBitwiseTester::destIndex(int l, int elem_idx) const { + return (args.addr_mode == AddrMode::PerBlock) + ? l * static_cast(args.num_wgs) + elem_idx + : l; // PerGrid has a single element per loop +} + +// Number of output elements to check per loop for current address mode. +template +int AMOBitwiseTester::numElems() const { + return (args.addr_mode == AddrMode::PerBlock) + ? static_cast(args.num_wgs) + : 1; // PerGrid +} + +// Return pointer to the start of the ret_val “chunk” for (loop, elem_idx) +// plus the chunk length for this address mode. +template +std::pair AMOBitwiseTester::retChunk(int l, int elem_idx) const { + if (args.addr_mode == AddrMode::PerBlock) { + // One chunk per element (workgroup): wg_size returns + T* p = ret_val + l * n_in + elem_idx * args.wg_size; + int sz = static_cast(args.wg_size); + return {p, sz}; + } + // PerGrid: one big chunk per loop (all threads) + T* p = ret_val + l * n_in; + int sz = static_cast(n_in); + return {p, sz}; +} + +template +void AMOBitwiseTester::verifyDestValues() { + const int loops = static_cast(n_loops); + const int n_elems = numElems(); + + auto check_equal_all = [&](T expected) { + for (int l = 0; l < loops; ++l) { + for (int elem = 0; elem < n_elems; ++elem) { + const int idx = destIndex(l, elem); + if (dest[idx] != expected) fail_eq(dest[idx], expected); + } + } + }; + + // Use all-ones mask for type T + const T MASK = static_cast(~T{0}); + + switch (_type) { + case AMO_AndTestType: + case AMO_FetchAndTestType: { + // Start at 0; 0 & MASK == 0 regardless of writer count. + check_equal_all(static_cast(0)); + break; + } + case AMO_OrTestType: + case AMO_FetchOrTestType: { + // final value is MASK. + check_equal_all(MASK); + break; + } + case AMO_XorTestType: + case AMO_FetchXorTestType: { + // PerBlock: K = wg_size; PerGrid: K = num_wgs * wg_size + const int K = (args.addr_mode == AddrMode::PerBlock) + ? static_cast(args.wg_size) + : static_cast(args.num_wgs * args.wg_size); + const T expected = (K & 1) ? MASK : static_cast(0); + check_equal_all(expected); + break; + } + default: + break; + } +} + +template +void AMOBitwiseTester::verifyReturnValues() { + // Only “fetch-*” types produce return values to validate. + if (_type == AMO_AndTestType || _type == AMO_OrTestType || + _type == AMO_XorTestType) return; + + const int loops = static_cast(n_loops); + const int n_elems = numElems(); + const T MASK = static_cast(~T{0}); + + for (int l = 0; l < loops; ++l) { + for (int elem = 0; elem < n_elems; ++elem) { + auto [p, cnt] = retChunk(l, elem); + + // Count distribution of observed old values in this chunk + int zeros = 0, masks = 0; + for (int i = 0; i < cnt; ++i) { + zeros += (p[i] == static_cast(0)); + masks += (p[i] == MASK); + } + if (zeros + masks != cnt) { + fail_eq(zeros + masks, cnt); // unexpected values present + } + + switch (_type) { + case AMO_FetchAndTestType: + // Old value is 0 (dest stays 0) + if (!(zeros == cnt && masks == 0)) fail_eq(zeros, cnt); + break; + + case AMO_FetchOrTestType: + // Exactly one 0 (the first OR), rest MASK + if (!(zeros == 1 && masks == cnt - 1)) fail_eq(zeros, 1); + break; + + case AMO_FetchXorTestType: { + // returns multiset = { ceil(K/2) zeros, floor(K/2) MASKs } + const int exp_zeros = (cnt + 1) / 2; // ceil(cnt/2) + const int exp_masks = cnt / 2; // floor(cnt/2) + if (!(zeros == exp_zeros && masks == exp_masks)) { + fail_eq(zeros, exp_zeros); + } + // cross-check + if ((cnt & 1) && zeros != masks + 1) fail_eq(zeros, masks + 1); + if (!(cnt & 1) && zeros != masks) fail_eq(zeros, masks); + break; + } + default: + break; + } + } + } } template void AMOBitwiseTester::verifyResults(size_t size) { - T ret; - if (args.myid == 0) { - T expected_val = 0; - - switch (_type) { - case AMO_FetchAndTestType: - expected_val = 0; - break; - case AMO_AndTestType: - expected_val = 0; - break; - case AMO_FetchOrTestType: - expected_val = 0xFFFF; - break; - case AMO_OrTestType: - expected_val = 0xFFFF; - break; - case AMO_FetchXorTestType: - expected_val = 0xFFFF; - break; - case AMO_XorTestType: - expected_val = (num_msgs % 2) ? 0xFFFF : 0; - break; - default: - break; - } - - int fetch_op = - (_type == AMO_FetchAndTestType || _type == AMO_FetchOrTestType || - _type == AMO_FetchXorTestType) - ? 1 - : 0; - - if (fetch_op == 1) { - ret = *std::max_element(_ret_val, _ret_val + args.num_wgs); - } else { - ret = *std::max_element(_s_buf, _s_buf + args.num_wgs); - } - if (ret != expected_val) { - std::cerr << "data validation error\n"; - std::cerr << "got " << ret << ", expected " << expected_val << std::endl; - exit(-1); - } + // PE 0 checks returns; target PE checks dest. + if (args.myid) { + verifyDestValues(); + } else { + verifyReturnValues(); } } @@ -127,50 +247,51 @@ void AMOBitwiseTester::verifyResults(size_t size) { template <> \ __global__ void AMOBitwiseTest( \ int loop, int skip, long long int *start_time, \ - long long int *end_time, char *r_buf, T *s_buf, T *ret_val, \ - TestType type, ShmemContextType ctx_type) { \ + long long int *end_time, T *dest, T *ret_val, \ + AddrMode addr_mode, TestType type, ShmemContextType ctx_type) { \ __shared__ rocshmem_ctx_t ctx; \ - int wg_id = get_flat_grid_id(); \ + int wg_id = get_flat_grid_id(); \ + int global_id = get_flat_id(); \ + int n_threads = get_flat_grid_size(); \ + int n_wgs = get_grid_num_blocks(); \ rocshmem_wg_init(); \ rocshmem_wg_ctx_create(ctx_type, &ctx); \ - if (hipThreadIdx_x == 0) { \ + for (int i = 0; i < loop + skip; i++) { \ + T *ptr = compute_target_ptr(dest, addr_mode, wg_id, i, n_wgs); \ T ret = 0; \ - T cond = 0; \ - for (int i = 0; i < loop + skip; i++) { \ - if (i == skip) { \ - start_time[wg_id] = wall_clock64(); \ - } \ - switch (type) { \ - case AMO_FetchAndTestType: \ - ret = rocshmem_ctx_##TNAME##_atomic_fetch_and(ctx, (T *)r_buf, \ - 0xFFFF, 1); \ - break; \ - case AMO_AndTestType: \ - rocshmem_ctx_##TNAME##_atomic_and(ctx, (T *)r_buf, 0xFFFF, 1); \ - break; \ - case AMO_FetchOrTestType: \ - ret = rocshmem_ctx_##TNAME##_atomic_fetch_or(ctx, (T *)r_buf, \ - 0xFFFF, 1); \ - break; \ - case AMO_OrTestType: \ - rocshmem_ctx_##TNAME##_atomic_or(ctx, (T *)r_buf, 0xFFFF, 1); \ - break; \ - case AMO_FetchXorTestType: \ - ret = rocshmem_ctx_##TNAME##_atomic_fetch_xor(ctx, (T *)r_buf, \ - 0xFFFF, 1); \ - break; \ - case AMO_XorTestType: \ - rocshmem_ctx_##TNAME##_atomic_xor(ctx, (T *)r_buf, 0xFFFF, 1); \ - break; \ - default: \ - break; \ - } \ + if (i == skip) { \ + start_time[wg_id] = wall_clock64(); \ } \ - rocshmem_ctx_quiet(ctx); \ - end_time[wg_id] = wall_clock64(); \ - ret_val[wg_id] = ret; \ - rocshmem_ctx_getmem(ctx, &s_buf[wg_id], r_buf, sizeof(T), 1); \ + switch (type) { \ + case AMO_FetchAndTestType: \ + ret = rocshmem_ctx_##TNAME##_atomic_fetch_and(ctx, ptr, \ + (T)~(T)0, 1); \ + break; \ + case AMO_AndTestType: \ + rocshmem_ctx_##TNAME##_atomic_and(ctx, ptr, (T)~(T)0, 1); \ + break; \ + case AMO_FetchOrTestType: \ + ret = rocshmem_ctx_##TNAME##_atomic_fetch_or(ctx, ptr, \ + (T)~(T)0, 1); \ + break; \ + case AMO_OrTestType: \ + rocshmem_ctx_##TNAME##_atomic_or(ctx, ptr, (T)~(T)0, 1); \ + break; \ + case AMO_FetchXorTestType: \ + ret = rocshmem_ctx_##TNAME##_atomic_fetch_xor(ctx, ptr, \ + (T)~(T)0, 1); \ + break; \ + case AMO_XorTestType: \ + rocshmem_ctx_##TNAME##_atomic_xor(ctx, ptr, (T)~(T)0, 1); \ + break; \ + default: \ + break; \ + } \ + ret_val[global_id + i * n_threads] = ret; \ } \ + rocshmem_ctx_quiet(ctx); \ + end_time[wg_id] = wall_clock64(); \ + __syncthreads(); \ rocshmem_wg_ctx_destroy(&ctx); \ rocshmem_wg_finalize(); \ } \ @@ -179,3 +300,5 @@ void AMOBitwiseTester::verifyResults(size_t size) { AMO_BITWISE_DEF_GEN(unsigned int, uint) AMO_BITWISE_DEF_GEN(unsigned long, ulong) AMO_BITWISE_DEF_GEN(unsigned long long, ulonglong) +AMO_BITWISE_DEF_GEN(int32_t, int32) +AMO_BITWISE_DEF_GEN(int64_t, int64) diff --git a/projects/rocshmem/tests/functional_tests/amo_bitwise_tester.hpp b/projects/rocshmem/tests/functional_tests/amo_bitwise_tester.hpp index ed627d60ce..dba795bea2 100644 --- a/projects/rocshmem/tests/functional_tests/amo_bitwise_tester.hpp +++ b/projects/rocshmem/tests/functional_tests/amo_bitwise_tester.hpp @@ -44,10 +44,19 @@ class AMOBitwiseTester : public Tester { virtual void verifyResults(size_t size) override; - dim3 _gridSize{}; - char *_r_buf; - T *_ret_val; - T *_s_buf; + void verifyDestValues(); + void verifyReturnValues(); + + int destIndex(int l, int elem_idx) const; + int numElems() const; + std::pair retChunk(int l, int elem_idx) const; + + T* dest{nullptr}; // symmetric target buffer [loop][elem] + T* ret_val{nullptr}; // device returns [loop][thread] + + size_t n_in{0}; // num_wgs * wg_size + size_t n_out{0}; // elements per loop: PerBlock->num_wgs, PerGrid->1 + size_t n_loops{0}; // loop + skip }; #endif diff --git a/projects/rocshmem/tests/functional_tests/amo_extended_tester.cpp b/projects/rocshmem/tests/functional_tests/amo_extended_tester.cpp index 8058c0cc8e..b4bc76f754 100644 --- a/projects/rocshmem/tests/functional_tests/amo_extended_tester.cpp +++ b/projects/rocshmem/tests/functional_tests/amo_extended_tester.cpp @@ -32,33 +32,54 @@ using namespace rocshmem; /* Declare the global kernel template with a generic implementation */ template __global__ void AMOExtendedTest(int loop, int skip, long long int *start_time, - long long int *end_time, char *r_buf, - T *s_buf, T *ret_val, TestType type, + long long int *end_time, T *dest, T *ret_val, + AddrMode addr_mode, TestType type, ShmemContextType ctx_type) { return; } +template +__device__ inline T* compute_target_ptr(T* base_ptr, AddrMode addr_mode, + int wg_idx, int itr, int n_wgs) { + // PerBlock: element = wg_idx, with n_wgs elements per loop + // PerGrid : single element shared by the whole grid per loop + if (addr_mode == AddrMode::PerBlock) { + size_t offset = wg_idx + itr * n_wgs; + return base_ptr + offset; + } else { // PerGrid + return base_ptr + itr; + } +} + /****************************************************************************** * HOST TESTER CLASS METHODS *****************************************************************************/ template AMOExtendedTester::AMOExtendedTester(TesterArguments args) : Tester(args) { - CHECK_HIP(hipMalloc((void **)&_ret_val, args.max_msg_size * args.num_wgs)); - _r_buf = (char *)rocshmem_malloc(args.max_msg_size); - _s_buf = (T *)rocshmem_malloc(args.max_msg_size * args.num_wgs); + n_out = (args.addr_mode == AddrMode::PerBlock) ? args.num_wgs : 1; + n_in = args.num_wgs * args.wg_size; + n_loops = args.loop + args.skip; + + // One return per *thread* per loop + CHECK_HIP(hipMalloc((void **)&ret_val, args.max_msg_size * n_in * n_loops)); + + dest = (T *)rocshmem_malloc(args.max_msg_size * n_out * n_loops); + if (dest == nullptr) { + std::cerr << "Error allocating memory from symmetric heap" << std::endl; + std::cerr << "dest: " << (void*)dest << std::endl; + } } template AMOExtendedTester::~AMOExtendedTester() { - rocshmem_free(_r_buf); - CHECK_HIP(hipFree(_ret_val)); + CHECK_HIP(hipFree(ret_val)); + rocshmem_free(dest); } template void AMOExtendedTester::resetBuffers(size_t size) { - memset(_r_buf, 0, args.max_msg_size); - memset(_ret_val, 0, args.max_msg_size * args.num_wgs); - memset(_s_buf, 0, args.max_msg_size * args.num_wgs); + memset(ret_val, 0, args.max_msg_size * n_in * n_loops); + memset(dest, 0, args.max_msg_size * n_out * n_loops); } template @@ -67,87 +88,173 @@ void AMOExtendedTester::launchKernel(dim3 gridsize, dim3 blocksize, int loop, size_t shared_bytes = 0; hipLaunchKernelGGL(AMOExtendedTest, gridsize, blocksize, shared_bytes, stream, - loop, args.skip, start_time, end_time, _r_buf, _s_buf, - _ret_val, _type, _shmem_context); + args.loop, args.skip, start_time, end_time, dest, + ret_val, args.addr_mode, _type, _shmem_context); - _gridSize = gridsize; - num_msgs = (loop + args.skip) * gridsize.x; - num_timed_msgs = loop; + num_msgs = n_loops * gridsize.x * blocksize.x; + num_timed_msgs = args.loop * gridsize.x * blocksize.x; +} + +template +void fail_eq(const G& got, const G& exp) { + std::cerr << "data validation error\n" + << "got " << got << ", expected " << exp << std::endl; + std::exit(-1); +} + +// Map (loop, elem_idx) -> dest[] index for current address mode. +template +int AMOExtendedTester::destIndex(int l, int elem_idx) const { + return (args.addr_mode == AddrMode::PerBlock) + ? l * static_cast(args.num_wgs) + elem_idx + : l; // PerGrid has a single element per loop +} + +// Number of output elements to check per loop for current address mode. +template +int AMOExtendedTester::numElems() const { + return (args.addr_mode == AddrMode::PerBlock) + ? static_cast(args.num_wgs) + : 1; // PerGrid +} + +// Return pointer to the start of the ret_val “chunk” for (loop, elem_idx) +// plus the chunk length for this address mode. +template +std::pair AMOExtendedTester::retChunk(int l, int elem_idx) const { + if (args.addr_mode == AddrMode::PerBlock) { + // One chunk per element (workgroup): wg_size returns + T* p = ret_val + l * n_in + elem_idx * args.wg_size; + int sz = static_cast(args.wg_size); + return {p, sz}; + } + // PerGrid: one big chunk per loop (all threads) + T* p = ret_val + l * n_in; + int sz = static_cast(n_in); + return {p, sz}; } template -void AMOExtendedTester::verifyResults(size_t size) { - T ret; - if (args.myid == 0) { - T expected_val = 0; +void AMOExtendedTester::verifyDestValues() { + const int loops = static_cast(n_loops); + const int n_elems = numElems(); - switch (_type) { - case AMO_FetchTestType: - expected_val = 0; - break; - case AMO_SetTestType: - expected_val = 44; - break; - case AMO_SwapTestType: - expected_val = num_msgs / 2; - break; - default: - break; + auto check_equal_all = [&](T expected) { + for (int l = 0; l < loops; ++l) { + for (int elem = 0; elem < n_elems; ++elem) { + const int idx = destIndex(l, elem); + if (dest[idx] != expected) fail_eq(dest[idx], expected); + } } + }; - int fetch_op = - (_type == AMO_FetchTestType || _type == AMO_SwapTestType) ? 1 : 0; + auto check_nonzero_all = [&]() { + for (int l = 0; l < loops; ++l) { + for (int elem = 0; elem < n_elems; ++elem) { + const int idx = destIndex(l, elem); + if (dest[idx] == T{0}) fail_eq(dest[idx], T{1}); + } + } + }; - if (fetch_op == 1) { - ret = *std::max_element(_ret_val, _ret_val + args.num_wgs); - } else { - ret = *std::max_element(_s_buf, _s_buf + args.num_wgs); - } - if (ret != expected_val) { - std::cerr << "data validation error\n"; - std::cerr << "got " << ret << ", expected " << expected_val << std::endl; - exit(-1); + switch (_type) { + case AMO_FetchTestType: + // fetch does not modify dest -> stays 0 + check_equal_all(T{0}); + break; + + case AMO_SetTestType: + // set writes a constant (17) + check_equal_all(static_cast(17)); + break; + + case AMO_SwapTestType: + // swap writes non-zero values -> final must be non-zero + check_nonzero_all(); + break; + + default: + break; + } +} + +template +void AMOExtendedTester::verifyReturnValues() { + // Only fetch/swap produce return values to validate + if (_type == AMO_SetTestType) return; + + const int loops = static_cast(n_loops); + const int n_elems = numElems(); + + for (int l = 0; l < loops; ++l) { + for (int elem = 0; elem < n_elems; ++elem) { + auto [p, cnt] = retChunk(l, elem); + + int zeros = 0; + for (int i = 0; i < cnt; ++i) { + zeros += (p[i] == T{0}); + } + + if (_type == AMO_FetchTestType) { + // fetch returns the current value (initially 0), dest unchanged + if (zeros != cnt) fail_eq(zeros, cnt); + } else { // AMO_SwapTestType + // For a single element per (loop,elem), exactly one atomic_swap + // observes old==0 (the first arriving swap). The rest see non-zero. + if (zeros != 1) fail_eq(zeros, 1); + } } } } +template +void AMOExtendedTester::verifyResults(size_t /*size*/) { + // PE 0 checks returns; target PE checks dest. + if (args.myid) { + verifyDestValues(); + } else { + verifyReturnValues(); + } +} + #define AMO_EXTENDED_DEF_GEN(T, TNAME) \ template <> \ __global__ void AMOExtendedTest( \ int loop, int skip, long long int *start_time, \ - long long int *end_time, char *r_buf, T *s_buf, T *ret_val, \ - TestType type, ShmemContextType ctx_type) { \ + long long int *end_time, T *dest, T *ret_val, \ + AddrMode addr_mode, TestType type, ShmemContextType ctx_type) { \ __shared__ rocshmem_ctx_t ctx; \ - int wg_id = get_flat_grid_id(); \ + int wg_id = get_flat_grid_id(); \ + int global_id = get_flat_id(); \ + int t_id = get_flat_block_id(); \ + int n_threads = get_flat_grid_size(); \ + int n_wgs = get_grid_num_blocks(); \ rocshmem_wg_init(); \ rocshmem_wg_ctx_create(ctx_type, &ctx); \ - if (hipThreadIdx_x == 0) { \ - T ret = 0; \ - T cond = 0; \ - for (int i = 0; i < loop + skip; i++) { \ - if (i == skip) { \ - start_time[wg_id] = wall_clock64(); \ - } \ - switch (type) { \ - case AMO_FetchTestType: \ - ret = rocshmem_ctx_##TNAME##_atomic_fetch(ctx, (T *)r_buf, 1); \ - break; \ - case AMO_SetTestType: \ - rocshmem_ctx_##TNAME##_atomic_set(ctx, (T *)r_buf, 44, 1); \ - break; \ - case AMO_SwapTestType: \ - ret = rocshmem_ctx_##TNAME##_atomic_swap(ctx, (T *)r_buf, \ - ret + 1, 1); \ - break; \ - default: \ - break; \ - } \ + for (int i = 0; i < loop + skip; i++) { \ + T *ptr = compute_target_ptr(dest, addr_mode, wg_id, i, n_wgs); \ + T ret = 0; \ + if (i == skip) { \ + start_time[wg_id] = wall_clock64(); \ } \ - rocshmem_ctx_quiet(ctx); \ - end_time[wg_id] = wall_clock64(); \ - ret_val[wg_id] = ret; \ - rocshmem_ctx_getmem(ctx, &s_buf[wg_id], r_buf, sizeof(T), 1); \ + switch (type) { \ + case AMO_FetchTestType: \ + ret = rocshmem_ctx_##TNAME##_atomic_fetch(ctx, ptr, 1); \ + break; \ + case AMO_SetTestType: \ + rocshmem_ctx_##TNAME##_atomic_set(ctx, ptr, (T)17, 1); \ + break; \ + case AMO_SwapTestType: \ + ret = rocshmem_ctx_##TNAME##_atomic_swap(ctx, ptr, (T)(t_id + 1),1);\ + break; \ + default: \ + break; \ + } \ + ret_val[global_id + i * n_threads] = ret; \ } \ + rocshmem_ctx_quiet(ctx); \ + end_time[wg_id] = wall_clock64(); \ + __syncthreads(); \ rocshmem_wg_ctx_destroy(&ctx); \ rocshmem_wg_finalize(); \ } \ diff --git a/projects/rocshmem/tests/functional_tests/amo_extended_tester.hpp b/projects/rocshmem/tests/functional_tests/amo_extended_tester.hpp index 3e76610307..0c4a5701c1 100644 --- a/projects/rocshmem/tests/functional_tests/amo_extended_tester.hpp +++ b/projects/rocshmem/tests/functional_tests/amo_extended_tester.hpp @@ -44,10 +44,19 @@ class AMOExtendedTester : public Tester { virtual void verifyResults(size_t size) override; - dim3 _gridSize{}; - char *_r_buf; - T *_ret_val; - T *_s_buf; + void verifyDestValues(); + void verifyReturnValues(); + + int destIndex(int l, int elem_idx) const; + int numElems() const; + std::pair retChunk(int l, int elem_idx) const; + + T* dest{nullptr}; // symmetric target buffer [loop][elem] + T* ret_val{nullptr}; // device returns [loop][thread] + + size_t n_in{0}; // num_wgs * wg_size + size_t n_out{0}; // elements per loop: PerBlock->num_wgs, PerGrid->1 + size_t n_loops{0}; // loop + skip }; #endif diff --git a/projects/rocshmem/tests/functional_tests/amo_standard_tester.cpp b/projects/rocshmem/tests/functional_tests/amo_standard_tester.cpp index c05f1d780e..7b81d36f1a 100644 --- a/projects/rocshmem/tests/functional_tests/amo_standard_tester.cpp +++ b/projects/rocshmem/tests/functional_tests/amo_standard_tester.cpp @@ -33,33 +33,54 @@ using namespace rocshmem; /* Declare the global kernel template with a generic implementation */ template __global__ void AMOStandardTest(int loop, int skip, long long int *start_time, - long long int *end_time, char *r_buf, - T *s_buf, T *ret_val, TestType type, - ShmemContextType ctx_type) { + long long int *end_time, T *dest, + T *ret_val, AddrMode addr_mode, + TestType type, ShmemContextType ctx_type) { return; } +template +__device__ inline T* compute_target_ptr(T* base_ptr, AddrMode addr_mode, + int wg_idx, int itr, int n_wgs) { + // PerBlock: element = wg_idx, with n_wgs elements per loop + // PerGrid : single element shared by the whole grid per loop + if (addr_mode == AddrMode::PerBlock) { + size_t offset = wg_idx + itr * n_wgs; + return base_ptr + offset; + } else { // PerGrid + return base_ptr + itr; + } +} + /****************************************************************************** * HOST TESTER CLASS METHODS *****************************************************************************/ template AMOStandardTester::AMOStandardTester(TesterArguments args) : Tester(args) { - CHECK_HIP(hipMalloc((void **)&_ret_val, args.max_msg_size * args.num_wgs)); - _r_buf = (char *)rocshmem_malloc(args.max_msg_size); - _s_buf = (T *)rocshmem_malloc(args.max_msg_size * args.num_wgs); + n_out = (args.addr_mode == AddrMode::PerBlock) ? args.num_wgs : 1; + n_in = args.num_wgs * args.wg_size; + n_loops = args.loop + args.skip; + + // One return per *thread* per loop + CHECK_HIP(hipMalloc((void **)&ret_val, args.max_msg_size * n_in * n_loops)); + + dest = (T *)rocshmem_malloc(args.max_msg_size * n_out * n_loops); + if (dest == nullptr) { + std::cerr << "Error allocating memory from symmetric heap" << std::endl; + std::cerr << "dest: " << dest << std::endl; + } } template AMOStandardTester::~AMOStandardTester() { - rocshmem_free(_r_buf); - CHECK_HIP(hipFree(_ret_val)); + CHECK_HIP(hipFree(ret_val)); + rocshmem_free(dest); } template void AMOStandardTester::resetBuffers(size_t size) { - memset(_r_buf, 0, args.max_msg_size); - memset(_ret_val, 0, args.max_msg_size * args.num_wgs); - memset(_s_buf, 0, args.max_msg_size * args.num_wgs); + memset(ret_val, 0, args.max_msg_size * n_in * n_loops); + memset(dest, 0, args.max_msg_size * n_out * n_loops); } template @@ -68,52 +89,162 @@ void AMOStandardTester::launchKernel(dim3 gridsize, dim3 blocksize, int loop, size_t shared_bytes = 0; hipLaunchKernelGGL(AMOStandardTest, gridsize, blocksize, shared_bytes, stream, - loop, args.skip, start_time, end_time, _r_buf, _s_buf, - _ret_val, _type, _shmem_context); + args.loop, args.skip, start_time, end_time, dest, ret_val, + args.addr_mode, _type, _shmem_context); - _gridSize = gridsize; - num_msgs = (loop + args.skip) * gridsize.x; - num_timed_msgs = loop; + num_msgs = n_loops * gridsize.x * blocksize.x; + num_timed_msgs = args.loop * gridsize.x * blocksize.x; +} + + +template +void fail_eq(const G& got, const G& exp) { + std::cerr << "data validation error\n" + << "got " << got << ", expected " << exp << std::endl; + std::exit(-1); +} + +template +void fail_nonzero(const G& got) { + std::cerr << "data validation error\n" + << "got " << got << ", expected non-zero" << std::endl; + std::exit(-1); +} + +// Map (loop, elem_idx) -> dest[] index for current address mode. +template +int AMOStandardTester::destIndex(int l, int elem_idx) const { + return (args.addr_mode == AddrMode::PerBlock) + ? l * args.num_wgs + elem_idx + : l; // PerGrid has a single element per loop +} + +// Number of output elements to check per loop for current address mode. +template +int AMOStandardTester::numElems() const { + return (args.addr_mode == AddrMode::PerBlock) + ? static_cast(args.num_wgs) + : 1; // PerGrid +} + +// Return pointer to the start of the ret_val “chunk” for (loop, elem_idx) +// plus the chunk length for this address mode. +template +std::pair AMOStandardTester::retChunk(int l, int elem_idx) const { + if (args.addr_mode == AddrMode::PerBlock) { + // One chunk per element (workgroup): wg_size returns + T* p = ret_val + l * n_in + elem_idx * args.wg_size; + int sz = static_cast(args.wg_size); + return {p, sz}; + } + // PerGrid: one big chunk per loop + T* p = ret_val + l * n_in; + int sz = static_cast(n_in); + return {p, sz}; +} + +template +void AMOStandardTester::verifyDestValues() { + const int loops = static_cast(n_loops); + const int n_elems = numElems(); + + auto check_equal_all = [&](T expected) { + for (int l = 0; l < loops; ++l) { + for (int elem = 0; elem < n_elems; ++elem) { + const int idx = destIndex(l, elem); + if (dest[idx] != expected) fail_eq(dest[idx], expected); + } + } + }; + + auto check_nonzero_all = [&]() { + for (int l = 0; l < loops; ++l) { + for (int elem = 0; elem < n_elems; ++elem) { + const int idx = destIndex(l, elem); + if (dest[idx] == T{0}) fail_nonzero(dest[idx]); + } + } + }; + + switch (_type) { + case AMO_AddTestType: + case AMO_FAddTestType: { + const T expected = (args.addr_mode == AddrMode::PerBlock) + ? static_cast(args.wg_size * 2) + : static_cast(args.wg_size * args.num_wgs * 2); + check_equal_all(expected); + break; + } + case AMO_IncTestType: + case AMO_FIncTestType: { + const T expected = (args.addr_mode == AddrMode::PerBlock) + ? static_cast(args.wg_size) + : static_cast(args.wg_size * args.num_wgs); + check_equal_all(expected); + break; + } + case AMO_FCswapTestType: + check_nonzero_all(); + break; + default: + break; + } +} + +template +void AMOStandardTester::verifyReturnValues() { + // Only “fetch-*” types produce return values to validate. + if (_type == AMO_AddTestType || _type == AMO_IncTestType) return; + + const int loops = static_cast(n_loops); + const int n_elems = numElems(); + + auto check_sorted_sequence = [&](auto value_of_i) { + for (int l = 0; l < loops; ++l) { + for (int elem = 0; elem < n_elems; ++elem) { + auto [p, cnt] = retChunk(l, elem); + std::sort(p, p + cnt); + for (int i = 0; i < cnt; ++i) { + const T expected = static_cast(value_of_i(i)); + if (p[i] != expected) fail_eq(p[i], expected); + } + } + } + }; + + auto check_single_success_zero = [&]() { + for (int l = 0; l < loops; ++l) { + for (int elem = 0; elem < n_elems; ++elem) { + auto [p, cnt] = retChunk(l, elem); + unsigned success = 0; + for (int i = 0; i < cnt; ++i) if (!p[i]) ++success; + if (success != 1u) fail_eq(success, 1u); + } + } + }; + + switch (_type) { + case AMO_FAddTestType: + check_sorted_sequence([](int i) { return i * 2; }); + break; + case AMO_FIncTestType: + check_sorted_sequence([](int i) { return i; }); + break; + case AMO_FCswapTestType: + check_single_success_zero(); + break; + default: + break; + } } template void AMOStandardTester::verifyResults(size_t size) { - T ret; - if (args.myid == 0) { - T expected_val = 0; - - switch (_type) { - case AMO_FAddTestType: - expected_val = 2 * (num_msgs - 1); - break; - case AMO_FIncTestType: - expected_val = num_msgs - 1; - break; - case AMO_AddTestType: - expected_val = 2 * num_msgs; - break; - case AMO_IncTestType: - expected_val = num_msgs; - break; - case AMO_FCswapTestType: - expected_val = (num_msgs - 2) / _gridSize.x; - break; - default: - break; - } - - int fetch_op = (_type == AMO_FAddTestType || _type == AMO_FIncTestType || _type == AMO_FCswapTestType) ? 1: 0; - - if (fetch_op == 1) { - ret = *std::max_element(_ret_val, _ret_val + args.num_wgs); - } else { - ret = *std::max_element(_s_buf, _s_buf + args.num_wgs); - } - if (ret != expected_val) { - std::cerr << "data validation error\n"; - std::cerr << "got " << ret << ", expected " << expected_val << std::endl; - exit(-1); - } + // PE 0 checks returns; target PE checks dest. + if (args.myid) { + verifyDestValues(); + } else { + verifyReturnValues(); } } @@ -121,48 +252,47 @@ void AMOStandardTester::verifyResults(size_t size) { template <> \ __global__ void AMOStandardTest( \ int loop, int skip, long long int *start_time, \ - long long int *end_time, char *r_buf, T *s_buf, T *ret_val, \ - TestType type, ShmemContextType ctx_type) { \ + long long int *end_time, T *dest, T *ret_val, \ + AddrMode addr_mode, TestType type, ShmemContextType ctx_type) { \ __shared__ rocshmem_ctx_t ctx; \ - int wg_id = get_flat_grid_id(); \ + int wg_id = get_flat_grid_id(); \ + int global_id = get_flat_id(); \ + int t_id = get_flat_block_id(); \ + int n_threads = get_flat_grid_size(); \ + int n_wgs = get_grid_num_blocks(); \ rocshmem_wg_init(); \ rocshmem_wg_ctx_create(ctx_type, &ctx); \ - if (hipThreadIdx_x == 0) { \ + for (int i = 0; i < loop + skip; i++) { \ + T *ptr = compute_target_ptr(dest, addr_mode, wg_id, i, n_wgs); \ T ret = 0; \ - T cond = 0; \ - for (int i = 0; i < loop + skip; i++) { \ if (i == skip) { \ - start_time[wg_id] = wall_clock64(); \ - } \ - switch (type) { \ - case AMO_FAddTestType: \ - ret = rocshmem_ctx_##TNAME##_atomic_fetch_add(ctx, (T *)r_buf, 2, \ - 1); \ - break; \ - case AMO_FIncTestType: \ - ret = \ - rocshmem_ctx_##TNAME##_atomic_fetch_inc(ctx, (T *)r_buf, 1); \ - break; \ - case AMO_FCswapTestType: \ - ret = rocshmem_ctx_##TNAME##_atomic_compare_swap(ctx, (T *)r_buf, \ - cond, (T)i, 1); \ - cond = i; \ - break; \ - case AMO_AddTestType: \ - rocshmem_ctx_##TNAME##_atomic_add(ctx, (T *)r_buf, 2, 1); \ - break; \ - case AMO_IncTestType: \ - rocshmem_ctx_##TNAME##_atomic_inc(ctx, (T *)r_buf, 1); \ - break; \ - default: \ - break; \ - } \ + start_time[wg_id] = wall_clock64(); \ } \ - rocshmem_ctx_quiet(ctx); \ - end_time[wg_id] = wall_clock64(); \ - ret_val[wg_id] = ret; \ - rocshmem_ctx_getmem(ctx, &s_buf[wg_id], r_buf, sizeof(T), 1); \ + switch (type) { \ + case AMO_FAddTestType: \ + ret = rocshmem_ctx_##TNAME##_atomic_fetch_add(ctx, (T *)ptr, 2, 1); \ + break; \ + case AMO_FIncTestType: \ + ret = rocshmem_ctx_##TNAME##_atomic_fetch_inc(ctx, (T *)ptr, 1); \ + break; \ + case AMO_FCswapTestType: \ + ret = rocshmem_ctx_##TNAME##_atomic_compare_swap(ctx, (T *)ptr, 0, \ + (T)(t_id + 1), 1); \ + break; \ + case AMO_AddTestType: \ + rocshmem_ctx_##TNAME##_atomic_add(ctx, (T *)ptr, 2, 1); \ + break; \ + case AMO_IncTestType: \ + rocshmem_ctx_##TNAME##_atomic_inc(ctx, (T *)ptr, 1); \ + break; \ + default: \ + break; \ + } \ + ret_val[global_id + i * n_threads] = ret; \ } \ + rocshmem_ctx_quiet(ctx); \ + end_time[wg_id] = wall_clock64(); \ + __syncthreads(); \ rocshmem_wg_ctx_destroy(&ctx); \ rocshmem_wg_finalize(); \ } \ diff --git a/projects/rocshmem/tests/functional_tests/amo_standard_tester.hpp b/projects/rocshmem/tests/functional_tests/amo_standard_tester.hpp index cec63882fc..c3ff4ebf3c 100644 --- a/projects/rocshmem/tests/functional_tests/amo_standard_tester.hpp +++ b/projects/rocshmem/tests/functional_tests/amo_standard_tester.hpp @@ -44,10 +44,19 @@ class AMOStandardTester : public Tester { virtual void verifyResults(size_t size) override; - dim3 _gridSize{}; - char *_r_buf; - T *_ret_val; - T *_s_buf; + void verifyDestValues(); + void verifyReturnValues(); + + int destIndex(int l, int elem_idx) const; + int numElems() const; + std::pair retChunk(int l, int elem_idx) const; + + T* dest{nullptr}; // symmetric target buffer [loop][elem] + T* ret_val{nullptr}; // device returns [loop][thread] + + size_t n_in{0}; // num_wgs * wg_size + size_t n_out{0}; // elements per loop: PerBlock->num_wgs, PerGrid->1 + size_t n_loops{0}; // loop + skip }; #endif diff --git a/projects/rocshmem/tests/functional_tests/tester_arguments.cpp b/projects/rocshmem/tests/functional_tests/tester_arguments.cpp index 835d59fd5a..585ade1409 100644 --- a/projects/rocshmem/tests/functional_tests/tester_arguments.cpp +++ b/projects/rocshmem/tests/functional_tests/tester_arguments.cpp @@ -62,6 +62,13 @@ TesterArguments::TesterArguments(int argc, char *argv[]) { } else if (arg == "-x") { i++; shmem_context = atoi(argv[i]); + } else if (arg == "-m") { + int atomics_addr_mode = atoi(argv[i]); + if(atomics_addr_mode >= static_cast(AddrMode::PerGrid) && + atomics_addr_mode <= static_cast(AddrMode::PerBlock)) { + addr_mode = static_cast(atomics_addr_mode); + } + i++; } else { show_usage(argv[0]); exit(-1); @@ -141,6 +148,7 @@ void TesterArguments::show_usage(std::string executable_name) { std::cout << "\t-o \n"; std::cout << "\t-ta \n"; std::cout << "\t-x \n"; + std::cout << "\t-m Atomics Address mode\n"; } void TesterArguments::get_rocshmem_arguments() { diff --git a/projects/rocshmem/tests/functional_tests/tester_arguments.hpp b/projects/rocshmem/tests/functional_tests/tester_arguments.hpp index 217e54d50f..f8a313fe2a 100644 --- a/projects/rocshmem/tests/functional_tests/tester_arguments.hpp +++ b/projects/rocshmem/tests/functional_tests/tester_arguments.hpp @@ -39,6 +39,14 @@ enum TeamSplitType { ROCSHMEM_TEST_TEAM_ODDEVEN, // odd-even splitting }; +/*----------------------------------------- + * Atomics Addressing modes (contention model) + *-----------------------------------------*/ +enum class AddrMode : int { + PerGrid, // all WGs -> same address + PerBlock, // each WG -> its own address (default) +}; + class TesterArguments { public: TesterArguments(int argc, char *argv[]); @@ -69,6 +77,7 @@ public: unsigned coal_coef = 64; unsigned op_type = 0; unsigned shmem_context = rocshmem::ROCSHMEM_CTX_WG_PRIVATE; + AddrMode addr_mode = AddrMode::PerBlock; /** * Arguments obtained from rocshmem