From fa5ba557a5ea7edbb15932a6ca40e584dc55fb4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirza=20Halil=C4=8Devi=C4=87?= <109971222+mirza-halilcevic@users.noreply.github.com> Date: Thu, 28 Dec 2023 17:54:17 +0100 Subject: [PATCH] EXSWHTEC-300 - Extend tests for atomic bitwise operations #288 Change-Id: I499a1c57400f13ebdf056d093228501f4eb2cb31 --- catch/unit/atomics/CMakeLists.txt | 3 + catch/unit/atomics/__hip_atomic_fetch_and.cc | 187 +++++++++++++++++++ catch/unit/atomics/__hip_atomic_fetch_or.cc | 187 +++++++++++++++++++ catch/unit/atomics/__hip_atomic_fetch_xor.cc | 187 +++++++++++++++++++ catch/unit/atomics/bitwise_common.hh | 113 ++++++++--- 5 files changed, 654 insertions(+), 23 deletions(-) create mode 100644 catch/unit/atomics/__hip_atomic_fetch_and.cc create mode 100644 catch/unit/atomics/__hip_atomic_fetch_or.cc create mode 100644 catch/unit/atomics/__hip_atomic_fetch_xor.cc diff --git a/catch/unit/atomics/CMakeLists.txt b/catch/unit/atomics/CMakeLists.txt index f18abbf3e5..bfe6e6bf59 100644 --- a/catch/unit/atomics/CMakeLists.txt +++ b/catch/unit/atomics/CMakeLists.txt @@ -35,6 +35,9 @@ set(TEST_SRC unsafeAtomicMax.cc atomicExch.cc atomicExch_system.cc + __hip_atomic_fetch_and.cc + __hip_atomic_fetch_or.cc + __hip_atomic_fetch_xor.cc ) if(HIP_PLATFORM MATCHES "nvidia") diff --git a/catch/unit/atomics/__hip_atomic_fetch_and.cc b/catch/unit/atomics/__hip_atomic_fetch_and.cc new file mode 100644 index 0000000000..51fd37bf59 --- /dev/null +++ b/catch/unit/atomics/__hip_atomic_fetch_and.cc @@ -0,0 +1,187 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "bitwise_common.hh" + +#include + +/** + * @addtogroup __hip_atomic_fetch_and __hip_atomic_fetch_and + * @{ + * @ingroup AtomicsTest + */ + +/** + * Test Description + * ------------------------ + * - Performs a builtin atomic AND with memory scope WAVEFRONT from multiple threads on the same + * address. + * - Uses only one device and launches one kernel. + * Test source + * ------------------------ + * - unit/atomics/__hip_atomic_fetch_and.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Wavefront_SameAddress", "", int, + unsigned int, unsigned long, unsigned long long) { + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Same address " << current) { + Bitwise::SingleDeviceSingleKernelTest(1, sizeof(TestType)); + } + } +} + +/** + * Test Description + * ------------------------ + * - Performs a builtin atomic AND with memory scope WAVEFRONT from multiple threads on adjacent + * addresses. + * - Uses only one device and launches one kernel. + * Test source + * ------------------------ + * - unit/atomics/__hip_atomic_fetch_and.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Wavefront_Adjacent_Addresses", "", int, + unsigned int, unsigned long, unsigned long long) { + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Adjacent address " << current) { + Bitwise::SingleDeviceSingleKernelTest(warp_size, + sizeof(TestType)); + } + } +} + +/** + * Test Description + * ------------------------ + * - Performs a builtin atomic AND with memory scope WAVEFRONT from multiple threads on scattered + * addresses. + * - Uses only one device and launches one kernel. + * Test source + * ------------------------ + * - unit/atomics/__hip_atomic_fetch_and.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Wavefront_Scattered_Addresses", "", int, + unsigned int, unsigned long, unsigned long long) { + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + const auto cache_line_size = 128u; + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Scattered address " << current) { + Bitwise::SingleDeviceSingleKernelTest(warp_size, + cache_line_size); + } + } +} + +/** + * Test Description + * ------------------------ + * - Performs a builtin atomic AND with memory scope WORKGROUP from multiple threads on the same + * address. + * - Uses only one device and launches one kernel. + * Test source + * ------------------------ + * - unit/atomics/__hip_atomic_fetch_and.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Workgroup_SameAddress", "", int, + unsigned int, unsigned long, unsigned long long) { + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Same address " << current) { + Bitwise::SingleDeviceSingleKernelTest(1, sizeof(TestType)); + } + } +} + +/** + * Test Description + * ------------------------ + * - Performs a builtin atomic AND with memory scope WORKGROUP from multiple threads on adjacent + * addresses. + * - Uses only one device and launches one kernel. + * Test source + * ------------------------ + * - unit/atomics/__hip_atomic_fetch_and.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Workgroup_Adjacent_Addresses", "", int, + unsigned int, unsigned long, unsigned long long) { + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Adjacent address " << current) { + Bitwise::SingleDeviceSingleKernelTest(warp_size, + sizeof(TestType)); + } + } +} + +/** + * Test Description + * ------------------------ + * - Performs a builtin atomic AND with memory scope WORKGROUP from multiple threads on scattered + * addresses. + * - Uses only one device and launches one kernel. + * Test source + * ------------------------ + * - unit/atomics/__hip_atomic_fetch_and.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_and_Positive_Workgroup_Scattered_Addresses", "", int, + unsigned int, unsigned long, unsigned long long) { + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + const auto cache_line_size = 128u; + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Scattered address " << current) { + Bitwise::SingleDeviceSingleKernelTest(warp_size, + cache_line_size); + } + } +} diff --git a/catch/unit/atomics/__hip_atomic_fetch_or.cc b/catch/unit/atomics/__hip_atomic_fetch_or.cc new file mode 100644 index 0000000000..000df50f80 --- /dev/null +++ b/catch/unit/atomics/__hip_atomic_fetch_or.cc @@ -0,0 +1,187 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "bitwise_common.hh" + +#include + +/** + * @addtogroup __hip_atomic_fetch_or __hip_atomic_fetch_or + * @{ + * @ingroup AtomicsTest + */ + +/** + * Test Description + * ------------------------ + * - Performs a builtin atomic OR with memory scope WAVEFRONT from multiple threads on the same + * address. + * - Uses only one device and launches one kernel. + * Test source + * ------------------------ + * - unit/atomics/__hip_atomic_fetch_or.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Wavefront_SameAddress", "", int, + unsigned int, unsigned long, unsigned long long) { + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Same address " << current) { + Bitwise::SingleDeviceSingleKernelTest(1, sizeof(TestType)); + } + } +} + +/** + * Test Description + * ------------------------ + * - Performs a builtin atomic OR with memory scope WAVEFRONT from multiple threads on adjacent + * addresses. + * - Uses only one device and launches one kernel. + * Test source + * ------------------------ + * - unit/atomics/__hip_atomic_fetch_or.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Wavefront_Adjacent_Addresses", "", int, + unsigned int, unsigned long, unsigned long long) { + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Adjacent address " << current) { + Bitwise::SingleDeviceSingleKernelTest(warp_size, + sizeof(TestType)); + } + } +} + +/** + * Test Description + * ------------------------ + * - Performs a builtin atomic OR with memory scope WAVEFRONT from multiple threads on scattered + * addresses. + * - Uses only one device and launches one kernel. + * Test source + * ------------------------ + * - unit/atomics/__hip_atomic_fetch_or.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Wavefront_Scattered_Addresses", "", int, + unsigned int, unsigned long, unsigned long long) { + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + const auto cache_line_size = 128u; + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Scattered address " << current) { + Bitwise::SingleDeviceSingleKernelTest(warp_size, + cache_line_size); + } + } +} + +/** + * Test Description + * ------------------------ + * - Performs a builtin atomic OR with memory scope WORKGROUP from multiple threads on the same + * address. + * - Uses only one device and launches one kernel. + * Test source + * ------------------------ + * - unit/atomics/__hip_atomic_fetch_or.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Workgroup_SameAddress", "", int, + unsigned int, unsigned long, unsigned long long) { + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Same address " << current) { + Bitwise::SingleDeviceSingleKernelTest(1, sizeof(TestType)); + } + } +} + +/** + * Test Description + * ------------------------ + * - Performs a builtin atomic OR with memory scope WORKGROUP from multiple threads on adjacent + * addresses. + * - Uses only one device and launches one kernel. + * Test source + * ------------------------ + * - unit/atomics/__hip_atomic_fetch_or.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Workgroup_Adjacent_Addresses", "", int, + unsigned int, unsigned long, unsigned long long) { + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Adjacent address " << current) { + Bitwise::SingleDeviceSingleKernelTest(warp_size, + sizeof(TestType)); + } + } +} + +/** + * Test Description + * ------------------------ + * - Performs a builtin atomic OR with memory scope WORKGROUP from multiple threads on scattered + * addresses. + * - Uses only one device and launches one kernel. + * Test source + * ------------------------ + * - unit/atomics/__hip_atomic_fetch_or.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_or_Positive_Workgroup_Scattered_Addresses", "", int, + unsigned int, unsigned long, unsigned long long) { + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + const auto cache_line_size = 128u; + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Scattered address " << current) { + Bitwise::SingleDeviceSingleKernelTest(warp_size, + cache_line_size); + } + } +} diff --git a/catch/unit/atomics/__hip_atomic_fetch_xor.cc b/catch/unit/atomics/__hip_atomic_fetch_xor.cc new file mode 100644 index 0000000000..0f3f3f3743 --- /dev/null +++ b/catch/unit/atomics/__hip_atomic_fetch_xor.cc @@ -0,0 +1,187 @@ +/* +Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "bitwise_common.hh" + +#include + +/** + * @addtogroup __hip_atomic_fetch_xor __hip_atomic_fetch_xor + * @{ + * @ingroup AtomicsTest + */ + +/** + * Test Description + * ------------------------ + * - Performs a builtin atomic XOR with memory scope WAVEFRONT from multiple threads on the same + * address. + * - Uses only one device and launches one kernel. + * Test source + * ------------------------ + * - unit/atomics/__hip_atomic_fetch_xor.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Wavefront_SameAddress", "", int, + unsigned int, unsigned long, unsigned long long) { + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Same address " << current) { + Bitwise::SingleDeviceSingleKernelTest(1, sizeof(TestType)); + } + } +} + +/** + * Test Description + * ------------------------ + * - Performs a builtin atomic XOR with memory scope WAVEFRONT from multiple threads on adjacent + * addresses. + * - Uses only one device and launches one kernel. + * Test source + * ------------------------ + * - unit/atomics/__hip_atomic_fetch_xor.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Wavefront_Adjacent_Addresses", "", int, + unsigned int, unsigned long, unsigned long long) { + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Adjacent address " << current) { + Bitwise::SingleDeviceSingleKernelTest(warp_size, + sizeof(TestType)); + } + } +} + +/** + * Test Description + * ------------------------ + * - Performs a builtin atomic XOR with memory scope WAVEFRONT from multiple threads on scattered + * addresses. + * - Uses only one device and launches one kernel. + * Test source + * ------------------------ + * - unit/atomics/__hip_atomic_fetch_xor.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Wavefront_Scattered_Addresses", "", int, + unsigned int, unsigned long, unsigned long long) { + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + const auto cache_line_size = 128u; + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Scattered address " << current) { + Bitwise::SingleDeviceSingleKernelTest(warp_size, + cache_line_size); + } + } +} + +/** + * Test Description + * ------------------------ + * - Performs a builtin atomic XOR with memory scope WORKGROUP from multiple threads on the same + * address. + * - Uses only one device and launches one kernel. + * Test source + * ------------------------ + * - unit/atomics/__hip_atomic_fetch_xor.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Workgroup_SameAddress", "", int, + unsigned int, unsigned long, unsigned long long) { + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Same address " << current) { + Bitwise::SingleDeviceSingleKernelTest(1, sizeof(TestType)); + } + } +} + +/** + * Test Description + * ------------------------ + * - Performs a builtin atomic XOR with memory scope WORKGROUP from multiple threads on adjacent + * addresses. + * - Uses only one device and launches one kernel. + * Test source + * ------------------------ + * - unit/atomics/__hip_atomic_fetch_xor.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Workgroup_Adjacent_Addresses", "", int, + unsigned int, unsigned long, unsigned long long) { + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Adjacent address " << current) { + Bitwise::SingleDeviceSingleKernelTest(warp_size, + sizeof(TestType)); + } + } +} + +/** + * Test Description + * ------------------------ + * - Performs a builtin atomic XOR with memory scope WORKGROUP from multiple threads on scattered + * addresses. + * - Uses only one device and launches one kernel. + * Test source + * ------------------------ + * - unit/atomics/__hip_atomic_fetch_xor.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.2 + */ +TEMPLATE_TEST_CASE("Unit___hip_atomic_fetch_xor_Positive_Workgroup_Scattered_Addresses", "", int, + unsigned int, unsigned long, unsigned long long) { + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + const auto cache_line_size = 128u; + + for (auto current = 0; current < cmd_options.iterations; ++current) { + DYNAMIC_SECTION("Scattered address " << current) { + Bitwise::SingleDeviceSingleKernelTest(warp_size, + cache_line_size); + } + } +} diff --git a/catch/unit/atomics/bitwise_common.hh b/catch/unit/atomics/bitwise_common.hh index 9e71c99cb0..887d25d4f9 100644 --- a/catch/unit/atomics/bitwise_common.hh +++ b/catch/unit/atomics/bitwise_common.hh @@ -22,10 +22,10 @@ THE SOFTWARE. #pragma once +#include #include #include #include -#include namespace cg = cooperative_groups; @@ -37,6 +37,9 @@ enum class AtomicOperation { kOrSystem, kXor, kXorSystem, + kBuiltinAnd, + kBuiltinOr, + kBuiltinXor }; constexpr auto kMask = 0xAAAA; @@ -52,7 +55,7 @@ __host__ __device__ TestType GetTestValue() { return kTestValue; } -template +template __device__ TestType PerformAtomicOperation(TestType* const mem) { const auto mask = kMask; @@ -68,10 +71,17 @@ __device__ TestType PerformAtomicOperation(TestType* const mem) { return atomicXor(mem, mask); } else if constexpr (operation == AtomicOperation::kXorSystem) { return atomicXor_system(mem, mask); + } else if constexpr (operation == AtomicOperation::kBuiltinAnd) { + return __hip_atomic_fetch_and(mem, mask, __ATOMIC_RELAXED, memory_scope); + } else if constexpr (operation == AtomicOperation::kBuiltinOr) { + return __hip_atomic_fetch_or(mem, mask, __ATOMIC_RELAXED, memory_scope); + } else if constexpr (operation == AtomicOperation::kBuiltinXor) { + return __hip_atomic_fetch_xor(mem, mask, __ATOMIC_RELAXED, memory_scope); } } -template +template __global__ void TestKernel(TestType* const global_mem, TestType* const old_vals) { __shared__ TestType shared_mem; @@ -84,7 +94,7 @@ __global__ void TestKernel(TestType* const global_mem, TestType* const old_vals) __syncthreads(); } - old_vals[tid] = PerformAtomicOperation(mem); + old_vals[tid] = PerformAtomicOperation(mem); if constexpr (use_shared_mem) { __syncthreads(); @@ -99,7 +109,16 @@ __host__ __device__ TestType* PitchedOffset(TestType* const ptr, const unsigned return reinterpret_cast(byte_ptr + idx * pitch); } -template +__device__ void GenerateMemoryTraffic(uint8_t* const begin_addr, uint8_t* const end_addr) { + for (volatile uint8_t* addr = begin_addr; addr != end_addr; ++addr) { + uint8_t val = *addr; + val ^= 0xAB; + *addr = val; + } +} + +template __global__ void TestKernel(TestType* const global_mem, TestType* const old_vals, const unsigned int width, const unsigned pitch) { extern __shared__ uint8_t shared_mem[]; @@ -116,8 +135,18 @@ __global__ void TestKernel(TestType* const global_mem, TestType* const old_vals, __syncthreads(); } - old_vals[tid] = - PerformAtomicOperation(PitchedOffset(mem, pitch, tid % width)); + const auto n = cooperative_groups::this_grid().size() - width; + + TestType* atomic_addr = PitchedOffset(mem, pitch, tid % width); + + if (tid < n) { + old_vals[tid] = PerformAtomicOperation( + PitchedOffset(mem, pitch, tid % width)); + } else { + uint8_t* const begin_addr = reinterpret_cast(atomic_addr + 1); + uint8_t* const end_addr = reinterpret_cast(atomic_addr) + pitch; + GenerateMemoryTraffic(begin_addr, end_addr); + } if constexpr (use_shared_mem) { __syncthreads(); @@ -157,13 +186,16 @@ std::tuple, std::vector> TestKernelHostRef(const auto& res = res_vals[tid % p.width]; old_vals.push_back(res); - if constexpr (operation == AtomicOperation::kAnd || operation == AtomicOperation::kAndSystem) { + if constexpr (operation == AtomicOperation::kAnd || operation == AtomicOperation::kAndSystem || + operation == AtomicOperation::kBuiltinAnd) { res = res & mask; } else if constexpr (operation == AtomicOperation::kOr || - operation == AtomicOperation::kOrSystem) { + operation == AtomicOperation::kOrSystem || + operation == AtomicOperation::kBuiltinOr) { res = res | mask; } else if constexpr (operation == AtomicOperation::kXor || - operation == AtomicOperation::kXorSystem) { + operation == AtomicOperation::kXorSystem || + operation == AtomicOperation::kBuiltinXor) { res = res ^ mask; } } @@ -188,19 +220,21 @@ void Verify(const TestParams& p, std::vector& res_vals, std::vector +template void LaunchKernel(const TestParams& p, hipStream_t stream, TestType* const mem_ptr, TestType* const old_vals) { const auto shared_mem_size = use_shared_mem ? p.width * p.pitch : 0u; if (p.width == 1 && p.pitch == sizeof(TestType)) - TestKernel + TestKernel <<>>(mem_ptr, old_vals); else - TestKernel + TestKernel <<>>(mem_ptr, old_vals, p.width, p.pitch); } -template +template void TestCore(const TestParams& p) { const auto old_vals_alloc_size = p.kernel_count * p.ThreadCount() * sizeof(TestType); std::vector> old_vals_devs; @@ -232,7 +266,8 @@ void TestCore(const TestParams& p) { for (auto j = 0u; j < p.kernel_count; ++j) { const auto& stream = streams[i * p.kernel_count + j].stream(); const auto old_vals = old_vals_devs[i].ptr() + j * p.ThreadCount(); - LaunchKernel(p, stream, mem_dev.ptr(), old_vals); + LaunchKernel(p, stream, mem_dev.ptr(), + old_vals); } } @@ -247,17 +282,48 @@ void TestCore(const TestParams& p) { Verify(p, res_vals, old_vals); } -template +inline dim3 GenerateThreadDimensions() { return GENERATE(dim3(16), dim3(1024)); } + +inline dim3 GenerateBlockDimensions() { + int sm_count = 0; + HIP_CHECK(hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount, 0)); + return GENERATE_COPY(dim3(sm_count), dim3(sm_count + sm_count / 2)); +} + +template void SingleDeviceSingleKernelTest(const unsigned int width, const unsigned int pitch) { TestParams params; params.num_devices = 1; params.kernel_count = 1; - params.threads = GENERATE(dim3(1023)); + if constexpr ((operation == AtomicOperation::kBuiltinAnd || + operation == AtomicOperation::kBuiltinOr || + operation == AtomicOperation::kBuiltinXor) && + memory_scope == __HIP_MEMORY_SCOPE_SINGLETHREAD) { + params.threads = 1; + } else if constexpr ((operation == AtomicOperation::kBuiltinAnd || + operation == AtomicOperation::kBuiltinOr || + operation == AtomicOperation::kBuiltinXor) && + memory_scope == __HIP_MEMORY_SCOPE_WAVEFRONT) { + int warp_size = 0; + HIP_CHECK(hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, 0)); + params.threads = dim3(warp_size); + } else { + params.threads = GenerateThreadDimensions(); + } params.width = width; params.pitch = pitch; SECTION("Global memory") { - params.blocks = GENERATE(dim3(3)); + if constexpr ((operation == AtomicOperation::kBuiltinAnd || + operation == AtomicOperation::kBuiltinOr || + operation == AtomicOperation::kBuiltinXor) && + (memory_scope == __HIP_MEMORY_SCOPE_SINGLETHREAD || + memory_scope == __HIP_MEMORY_SCOPE_WAVEFRONT || + memory_scope == __HIP_MEMORY_SCOPE_WORKGROUP)) { + params.blocks = dim3(1); + } else { + params.blocks = GenerateBlockDimensions(); + } using LA = LinearAllocs; for (const auto alloc_type : {LA::hipMalloc, LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) { @@ -288,8 +354,8 @@ void SingleDeviceMultipleKernelTest(const unsigned int kernel_count, const unsig TestParams params; params.num_devices = 1; params.kernel_count = kernel_count; - params.blocks = GENERATE(dim3(3)); - params.threads = GENERATE(dim3(1023)); + params.blocks = GenerateBlockDimensions(); + params.threads = GenerateThreadDimensions(); params.width = width; params.pitch = pitch; @@ -329,8 +395,8 @@ void MultipleDeviceMultipleKernelTest(const unsigned int num_devices, TestParams params; params.num_devices = num_devices; params.kernel_count = kernel_count; - params.blocks = GENERATE(dim3(3)); - params.threads = GENERATE(dim3(1023)); + params.blocks = GenerateBlockDimensions(); + params.threads = GenerateThreadDimensions(); params.width = width; params.pitch = pitch; @@ -338,8 +404,9 @@ void MultipleDeviceMultipleKernelTest(const unsigned int num_devices, for (const auto alloc_type : {LA::hipHostMalloc, LA::hipMallocManaged, LA::mallocAndRegister}) { params.alloc_type = alloc_type; DYNAMIC_SECTION("Allocation type: " << to_string(alloc_type)) { - TestCore(params); + TestCore(params); } } } + } // namespace Bitwise